23 files changed, 1284 insertions, 1153 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir
index 68302f5..5f98dae 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir
@@ -290,11 +290,8 @@ body: |
     ; CHECK-LABEL: name: s3_from_s35
     ; CHECK: liveins: $w0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[DEF]](s64)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
-    ; CHECK-NEXT: %ext:_(s32) = G_AND [[TRUNC]], [[C]]
-    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: $w0 = COPY [[C]](s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
     %val:_(s35) = G_IMPLICIT_DEF
     %extract:_(s3) = G_EXTRACT %val, 0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
index 03c28ef..b28298c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
@@ -159,13 +159,16 @@ body: |
     ; CHECK-LABEL: name: test_freeze_v3s8
     ; CHECK: liveins: $q0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[DEF]]
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[BUILD_VECTOR]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s8>), [[UV1:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>)
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[UV]]
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>)
     ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV]](s8)
-    ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV1]](s8)
-    ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV2]](s8)
+    ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV2]](s8)
+    ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV3]](s8)
+    ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV4]](s8)
     ; CHECK-NEXT: %res:_(<4 x s32>) = G_BUILD_VECTOR %ext0(s32), %ext1(s32), %ext2(s32), %undef(s32)
     ; CHECK-NEXT: $q0 = COPY %res(<4 x s32>)
     %x:_(<3 x s8>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
index 858a5a2..1cf066d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
@@ -248,21 +248,19 @@ body:             |
   ; CHECK-NEXT:   [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
   ; CHECK-NEXT:   [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
   ; CHECK-NEXT:   [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16)
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF2]](<4 x s8>)
-  ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
-  ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+  ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+  ; CHECK-NEXT:   [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<16 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, undef, undef, undef, undef)
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[SHUF]](<16 x s8>)
   ; CHECK-NEXT:   [[UITOFP:%[0-9]+]]:_(<4 x s32>) = G_UITOFP [[BITCAST]](<4 x s32>)
-  ; CHECK-NEXT:   [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UITOFP]](<4 x s32>)
-  ; CHECK-NEXT:   G_STORE [[UV10]](s32), [[COPY]](p0) :: (store (s32), align 16)
+  ; CHECK-NEXT:   [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UITOFP]](<4 x s32>)
+  ; CHECK-NEXT:   G_STORE [[UV6]](s32), [[COPY]](p0) :: (store (s32), align 16)
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64)
-  ; CHECK-NEXT:   G_STORE [[UV11]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4)
+  ; CHECK-NEXT:   G_STORE [[UV7]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4)
   ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C4]](s64)
-  ; CHECK-NEXT:   G_STORE [[UV12]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 8, align 8)
+  ; CHECK-NEXT:   G_STORE [[UV8]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 8, align 8)
   ; CHECK-NEXT:   G_BR %bb.1
   bb.1:
     liveins: $w1, $w2, $w3, $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 2c326902..eb30581 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -545,15 +545,18 @@ body:             |
     ; CHECK-LABEL: name: store_6xs64
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64)
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>))
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>))
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD %ptr, [[C]](s64)
-    ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into unknown-address + 16)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into unknown-address + 16)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), [[PTR_ADD1]](p0) :: (store (<2 x s64>) into unknown-address + 32)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD1]](p0) :: (store (<2 x s64>) into unknown-address + 32)
     ; CHECK-NEXT: RET_ReallyLR
     %val:_(<6 x s64>) = G_IMPLICIT_DEF
     %ptr:_(p0) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir
index b8bdef0..737c66c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir
@@ -220,10 +220,8 @@ body:             |
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[UADDE]](s32), [[SEXT_INREG2]]
     ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UADDE]](s32)
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF1]](s32)
     ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV4]](s8), [[UV5]](s8), [[UV6]](s8), [[DEF]](s8)
-    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV8]](s8), [[UV9]](s8), [[UV10]](s8), [[UV8]](s8)
+    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
     ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32)
     ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[MV2]], 24
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 23
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
index 52a28ad..1c5ae0d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
@@ -289,35 +289,35 @@ body:             |
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4100
     ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %w0(s32), [[C]]
     ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ICMP2]], 1
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
-    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[DEF1]], [[TRUNC]](s16), [[C1]](s64)
     ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>)
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16)
     ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16)
     ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
     ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
-    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16)
-    ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[UV5]](s16)
-    ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[UV6]](s16)
-    ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[UV7]](s16)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
+    ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16)
+    ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16)
+    ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16)
+    ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
     ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<8 x s8>), [[BUILD_VECTOR1]], shufflemask(0, 0, 0, 0, undef, undef, undef, undef)
     ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[SHUF]](<8 x s8>)
-    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<4 x s16>), [[UV9:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
     ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR2]](<8 x s8>)
-    ; CHECK-NEXT: [[UV10:%[0-9]+]]:_(<4 x s16>), [[UV11:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>)
-    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[UV8]], [[UV10]]
+    ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s16>), [[UV7:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>)
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[UV4]], [[UV6]]
     ; CHECK-NEXT: [[TRUNC9:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
     ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[SHUF]](<8 x s8>)
-    ; CHECK-NEXT: [[UV12:%[0-9]+]]:_(<4 x s16>), [[UV13:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>)
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC9]], [[UV12]]
+    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<4 x s16>), [[UV9:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC9]], [[UV8]]
     ; CHECK-NEXT: [[TRUNC10:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
     ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC10]], [[XOR]]
     ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND]], [[AND1]]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index fdd0ebb..352f4e7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -288,10 +288,9 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[UV]](s32)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[UV]](s32)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[DEF]](s32)
     ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]], shufflemask(0, 1, 5, 6)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[SHUF]](<4 x s32>), [[C]](s64)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir
index 2311be6..abfaea0 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir
@@ -220,10 +220,8 @@ body:             |
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[USUBE]](s32), [[SEXT_INREG2]]
     ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[USUBE]](s32)
     ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF1]](s32)
     ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV4]](s8), [[UV5]](s8), [[UV6]](s8), [[DEF]](s8)
-    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV8]](s8), [[UV9]](s8), [[UV10]](s8), [[UV8]](s8)
+    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
     ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32)
     ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[MV2]], 24
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 23
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
index 2609eb0..9726cc5 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
@@ -37,10 +37,9 @@ body: |
   bb.0:
 
     ; CHECK-LABEL: name: test_implicit_def_v4s32
-    ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
-    ; CHECK-NEXT: $x0 = COPY [[UV]](<2 x s32>)
-    ; CHECK-NEXT: $x1 = COPY [[UV1]](<2 x s32>)
+    ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: $x0 = COPY [[DEF]](<2 x s32>)
+    ; CHECK-NEXT: $x1 = COPY [[DEF]](<2 x s32>)
     %0:_(<4 x s32>) = G_IMPLICIT_DEF
     %1:_(<2 x s32> ), %2:_(<2 x s32>) = G_UNMERGE_VALUES %0
     $x0 = COPY %1
@@ -67,10 +66,9 @@ body: |
   bb.0:
 
     ; CHECK-LABEL: name: test_implicit_def_v2s32
-    ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>)
-    ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
-    ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
+    ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
+    ; CHECK-NEXT: $w1 = COPY [[DEF]](s32)
     %0:_(<2 x s32>) = G_IMPLICIT_DEF
     %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0
     $w0 = COPY %1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index 41f7ab8..480fcbd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) {
 ; GISEL-LABEL: test_shl_i512_const_32:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    ldp x8, x9, [x1]
-; GISEL-NEXT:    ldp x11, x12, [x1, #16]
-; GISEL-NEXT:    ldp x14, x15, [x1, #32]
-; GISEL-NEXT:    lsr x10, x8, #32
-; GISEL-NEXT:    lsr x13, x9, #32
-; GISEL-NEXT:    lsl x8, x8, #32
-; GISEL-NEXT:    orr x9, x10, x9, lsl #32
-; GISEL-NEXT:    lsr x10, x11, #32
-; GISEL-NEXT:    orr x11, x13, x11, lsl #32
-; GISEL-NEXT:    ldp x13, x16, [x1, #48]
-; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    lsr x8, x12, #32
-; GISEL-NEXT:    orr x10, x10, x12, lsl #32
-; GISEL-NEXT:    lsr x12, x14, #32
-; GISEL-NEXT:    lsr x9, x15, #32
-; GISEL-NEXT:    orr x8, x8, x14, lsl #32
-; GISEL-NEXT:    stp x11, x10, [x0, #16]
-; GISEL-NEXT:    orr x11, x12, x15, lsl #32
-; GISEL-NEXT:    lsr x12, x13, #32
-; GISEL-NEXT:    orr x9, x9, x13, lsl #32
-; GISEL-NEXT:    stp x8, x11, [x0, #32]
-; GISEL-NEXT:    orr x8, x12, x16, lsl #32
-; GISEL-NEXT:    stp x9, x8, [x0, #48]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x13, x14, [x1, #32]
+; GISEL-NEXT:    lsl x12, x8, #32
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    ldp x15, x16, [x1, #48]
+; GISEL-NEXT:    stp x12, x8, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #32
+; GISEL-NEXT:    stp x9, x10, [x0, #16]
+; GISEL-NEXT:    extr x9, x14, x13, #32
+; GISEL-NEXT:    extr x10, x15, x14, #32
+; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    extr x8, x16, x15, #32
+; GISEL-NEXT:    stp x10, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_32:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x14, [x1, #24]
-; GISEL-NEXT:    ldr x16, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #32
-; GISEL-NEXT:    lsl x13, x9, #32
-; GISEL-NEXT:    lsl x15, x10, #32
-; GISEL-NEXT:    orr x11, x12, x11, lsr #32
-; GISEL-NEXT:    orr x8, x13, x8, lsr #32
-; GISEL-NEXT:    lsl x13, x14, #32
-; GISEL-NEXT:    orr x9, x15, x9, lsr #32
-; GISEL-NEXT:    ldp x12, x15, [x1, #40]
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    orr x10, x13, x10, lsr #32
-; GISEL-NEXT:    lsl x8, x16, #32
-; GISEL-NEXT:    lsl x11, x12, #32
-; GISEL-NEXT:    lsl x13, x15, #32
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x8, x8, x15, lsr #32
-; GISEL-NEXT:    lsr x10, x16, #32
-; GISEL-NEXT:    orr x11, x11, x14, lsr #32
-; GISEL-NEXT:    orr x9, x13, x12, lsr #32
-; GISEL-NEXT:    stp x8, x10, [x0, #48]
-; GISEL-NEXT:    stp x11, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #32
+; GISEL-NEXT:    extr x9, x13, x12, #32
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #32
+; GISEL-NEXT:    extr x8, x15, x14, #32
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    lsr x9, x15, #32
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_32:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x13, [x1, #24]
-; GISEL-NEXT:    ldr x17, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #32
-; GISEL-NEXT:    lsl x15, x9, #32
-; GISEL-NEXT:    lsl x16, x10, #32
-; GISEL-NEXT:    orr x11, x12, x11, lsr #32
-; GISEL-NEXT:    ldp x14, x12, [x1, #40]
-; GISEL-NEXT:    orr x8, x15, x8, lsr #32
-; GISEL-NEXT:    lsl x15, x13, #32
-; GISEL-NEXT:    orr x9, x16, x9, lsr #32
-; GISEL-NEXT:    asr x16, x17, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x14, #32
-; GISEL-NEXT:    orr x10, x15, x10, lsr #32
-; GISEL-NEXT:    lsl x15, x12, #32
-; GISEL-NEXT:    orr x8, x11, x13, lsr #32
-; GISEL-NEXT:    lsl x11, x17, #32
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x15, x14, lsr #32
-; GISEL-NEXT:    lsl x13, x16, #32
-; GISEL-NEXT:    orr x10, x11, x12, lsr #32
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    orr x8, x13, x17, asr #32
-; GISEL-NEXT:    stp x10, x8, [x0, #48]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    ldp x14, x15, [x1, #32]
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    asr x8, x13, #63
+; GISEL-NEXT:    extr x11, x14, x11, #32
+; GISEL-NEXT:    extr x9, x15, x14, #32
+; GISEL-NEXT:    lsl x8, x8, #32
+; GISEL-NEXT:    stp x10, x11, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x15, #32
+; GISEL-NEXT:    extr x11, x13, x12, #32
+; GISEL-NEXT:    orr x8, x8, x13, asr #32
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x11, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) {
 ; GISEL-NEXT:    ldr x15, [x1, #48]
 ; GISEL-NEXT:    ldp x10, x11, [x1, #16]
 ; GISEL-NEXT:    ldp x12, x13, [x1, #32]
-; GISEL-NEXT:    lsr x14, x8, #32
-; GISEL-NEXT:    lsr x16, x9, #32
-; GISEL-NEXT:    lsl x8, x8, #32
-; GISEL-NEXT:    orr x9, x14, x9, lsl #32
-; GISEL-NEXT:    lsr x14, x10, #32
-; GISEL-NEXT:    orr x10, x16, x10, lsl #32
-; GISEL-NEXT:    stp xzr, x8, [x0]
-; GISEL-NEXT:    lsr x8, x11, #32
-; GISEL-NEXT:    orr x11, x14, x11, lsl #32
-; GISEL-NEXT:    lsr x14, x12, #32
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    lsr x9, x13, #32
-; GISEL-NEXT:    orr x8, x8, x12, lsl #32
-; GISEL-NEXT:    orr x10, x14, x13, lsl #32
-; GISEL-NEXT:    orr x9, x9, x15, lsl #32
-; GISEL-NEXT:    stp x11, x8, [x0, #32]
-; GISEL-NEXT:    stp x10, x9, [x0, #48]
+; GISEL-NEXT:    lsl x14, x8, #32
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    stp xzr, x14, [x0]
+; GISEL-NEXT:    stp x8, x9, [x0, #16]
+; GISEL-NEXT:    extr x8, x12, x11, #32
+; GISEL-NEXT:    extr x9, x13, x12, #32
+; GISEL-NEXT:    stp x10, x8, [x0, #32]
+; GISEL-NEXT:    extr x10, x15, x13, #32
+; GISEL-NEXT:    stp x9, x10, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_96:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #32
-; GISEL-NEXT:    lsl x13, x9, #32
-; GISEL-NEXT:    orr x10, x12, x10, lsr #32
-; GISEL-NEXT:    lsl x12, x11, #32
-; GISEL-NEXT:    orr x8, x13, x8, lsr #32
-; GISEL-NEXT:    lsl x13, x14, #32
-; GISEL-NEXT:    orr x9, x12, x9, lsr #32
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x15, #32
-; GISEL-NEXT:    orr x11, x13, x11, lsr #32
-; GISEL-NEXT:    lsl x12, x16, #32
-; GISEL-NEXT:    orr x8, x10, x14, lsr #32
-; GISEL-NEXT:    lsr x10, x16, #32
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x12, x15, lsr #32
-; GISEL-NEXT:    stp x10, xzr, [x0, #48]
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #32
+; GISEL-NEXT:    extr x9, x13, x12, #32
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #32
+; GISEL-NEXT:    lsr x8, x14, #32
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, xzr, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_96:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x11, [x1, #8]
-; GISEL-NEXT:    ldp x10, x13, [x1, #32]
-; GISEL-NEXT:    lsl x12, x8, #32
-; GISEL-NEXT:    lsl x14, x9, #32
-; GISEL-NEXT:    lsl x15, x10, #32
-; GISEL-NEXT:    orr x11, x12, x11, lsr #32
-; GISEL-NEXT:    ldp x12, x16, [x1, #48]
-; GISEL-NEXT:    orr x8, x14, x8, lsr #32
-; GISEL-NEXT:    lsl x14, x13, #32
-; GISEL-NEXT:    orr x9, x15, x9, lsr #32
-; GISEL-NEXT:    asr x15, x16, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x12, #32
-; GISEL-NEXT:    orr x10, x14, x10, lsr #32
-; GISEL-NEXT:    lsl x14, x16, #32
-; GISEL-NEXT:    orr x8, x11, x13, lsr #32
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x13, [x1, #40]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x14, x12, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #32
+; GISEL-NEXT:    extr x9, x10, x9, #32
+; GISEL-NEXT:    extr x10, x11, x10, #32
+; GISEL-NEXT:    asr x15, x12, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #32
+; GISEL-NEXT:    extr x9, x14, x13, #32
 ; GISEL-NEXT:    lsl x11, x15, #32
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x14, x12, lsr #32
-; GISEL-NEXT:    orr x10, x11, x16, asr #32
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    stp x10, x15, [x0, #48]
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x14, #32
+; GISEL-NEXT:    orr x8, x11, x12, asr #32
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, x15, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) {
 ; GISEL-LABEL: test_shl_i512_const_1:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    ldp x8, x9, [x1]
-; GISEL-NEXT:    ldp x11, x12, [x1, #16]
-; GISEL-NEXT:    ldp x14, x15, [x1, #32]
-; GISEL-NEXT:    lsr x10, x8, #63
-; GISEL-NEXT:    lsr x13, x9, #63
-; GISEL-NEXT:    lsl x8, x8, #1
-; GISEL-NEXT:    orr x9, x10, x9, lsl #1
-; GISEL-NEXT:    lsr x10, x11, #63
-; GISEL-NEXT:    orr x11, x13, x11, lsl #1
-; GISEL-NEXT:    ldp x13, x16, [x1, #48]
-; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    lsr x8, x12, #63
-; GISEL-NEXT:    orr x10, x10, x12, lsl #1
-; GISEL-NEXT:    lsr x12, x14, #63
-; GISEL-NEXT:    lsr x9, x15, #63
-; GISEL-NEXT:    orr x8, x8, x14, lsl #1
-; GISEL-NEXT:    stp x11, x10, [x0, #16]
-; GISEL-NEXT:    orr x11, x12, x15, lsl #1
-; GISEL-NEXT:    lsr x12, x13, #63
-; GISEL-NEXT:    orr x9, x9, x13, lsl #1
-; GISEL-NEXT:    stp x8, x11, [x0, #32]
-; GISEL-NEXT:    orr x8, x12, x16, lsl #1
-; GISEL-NEXT:    stp x9, x8, [x0, #48]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x13, x14, [x1, #32]
+; GISEL-NEXT:    lsl x12, x8, #1
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    ldp x15, x16, [x1, #48]
+; GISEL-NEXT:    stp x12, x8, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #63
+; GISEL-NEXT:    stp x9, x10, [x0, #16]
+; GISEL-NEXT:    extr x9, x14, x13, #63
+; GISEL-NEXT:    extr x10, x15, x14, #63
+; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    extr x8, x16, x15, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_1:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x14, [x1, #24]
-; GISEL-NEXT:    ldr x16, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #63
-; GISEL-NEXT:    lsl x13, x9, #63
-; GISEL-NEXT:    lsl x15, x10, #63
-; GISEL-NEXT:    orr x11, x12, x11, lsr #1
-; GISEL-NEXT:    orr x8, x13, x8, lsr #1
-; GISEL-NEXT:    lsl x13, x14, #63
-; GISEL-NEXT:    orr x9, x15, x9, lsr #1
-; GISEL-NEXT:    ldp x12, x15, [x1, #40]
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    orr x10, x13, x10, lsr #1
-; GISEL-NEXT:    lsl x8, x16, #63
-; GISEL-NEXT:    lsl x11, x12, #63
-; GISEL-NEXT:    lsl x13, x15, #63
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x8, x8, x15, lsr #1
-; GISEL-NEXT:    lsr x10, x16, #1
-; GISEL-NEXT:    orr x11, x11, x14, lsr #1
-; GISEL-NEXT:    orr x9, x13, x12, lsr #1
-; GISEL-NEXT:    stp x8, x10, [x0, #48]
-; GISEL-NEXT:    stp x11, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #1
+; GISEL-NEXT:    extr x9, x13, x12, #1
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #1
+; GISEL-NEXT:    extr x8, x15, x14, #1
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    lsr x9, x15, #1
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_1:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x13, [x1, #24]
-; GISEL-NEXT:    ldr x17, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #63
-; GISEL-NEXT:    lsl x15, x9, #63
-; GISEL-NEXT:    lsl x16, x10, #63
-; GISEL-NEXT:    orr x11, x12, x11, lsr #1
-; GISEL-NEXT:    ldp x14, x12, [x1, #40]
-; GISEL-NEXT:    orr x8, x15, x8, lsr #1
-; GISEL-NEXT:    lsl x15, x13, #63
-; GISEL-NEXT:    orr x9, x16, x9, lsr #1
-; GISEL-NEXT:    asr x16, x17, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x14, #63
-; GISEL-NEXT:    orr x10, x15, x10, lsr #1
-; GISEL-NEXT:    lsl x15, x12, #63
-; GISEL-NEXT:    orr x8, x11, x13, lsr #1
-; GISEL-NEXT:    lsl x11, x17, #63
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x15, x14, lsr #1
-; GISEL-NEXT:    lsl x13, x16, #63
-; GISEL-NEXT:    orr x10, x11, x12, lsr #1
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    orr x8, x13, x17, asr #1
-; GISEL-NEXT:    stp x10, x8, [x0, #48]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    ldp x14, x15, [x1, #32]
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    asr x8, x13, #63
+; GISEL-NEXT:    extr x11, x14, x11, #1
+; GISEL-NEXT:    extr x9, x15, x14, #1
+; GISEL-NEXT:    lsl x8, x8, #63
+; GISEL-NEXT:    stp x10, x11, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x15, #1
+; GISEL-NEXT:    extr x11, x13, x12, #1
+; GISEL-NEXT:    orr x8, x8, x13, asr #1
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x11, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5571,28 +5507,21 @@ define void @test_shl_i512_const_15(ptr %result, ptr %input) {
 ; GISEL-LABEL: test_shl_i512_const_15:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    ldp x8, x9, [x1]
-; GISEL-NEXT:    ldp x11, x12, [x1, #16]
-; GISEL-NEXT:    ldp x14, x15, [x1, #32]
-; GISEL-NEXT:    lsr x10, x8, #49
-; GISEL-NEXT:    lsr x13, x9, #49
-; GISEL-NEXT:    lsl x8, x8, #15
-; GISEL-NEXT:    orr x9, x10, x9, lsl #15
-; GISEL-NEXT:    lsr x10, x11, #49
-; GISEL-NEXT:    orr x11, x13, x11, lsl #15
-; GISEL-NEXT:    ldp x13, x16, [x1, #48]
-; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    lsr x8, x12, #49
-; GISEL-NEXT:    orr x10, x10, x12, lsl #15
-; GISEL-NEXT:    lsr x12, x14, #49
-; GISEL-NEXT:    lsr x9, x15, #49
-; GISEL-NEXT:    orr x8, x8, x14, lsl #15
-; GISEL-NEXT:    stp x11, x10, [x0, #16]
-; GISEL-NEXT:    orr x11, x12, x15, lsl #15
-; GISEL-NEXT:    lsr x12, x13, #49
-; GISEL-NEXT:    orr x9, x9, x13, lsl #15
-; GISEL-NEXT:    stp x8, x11, [x0, #32]
-; GISEL-NEXT:    orr x8, x12, x16, lsl #15
-; GISEL-NEXT:    stp x9, x8, [x0, #48]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x13, x14, [x1, #32]
+; GISEL-NEXT:    lsl x12, x8, #15
+; GISEL-NEXT:    extr x8, x9, x8, #49
+; GISEL-NEXT:    extr x9, x10, x9, #49
+; GISEL-NEXT:    extr x10, x11, x10, #49
+; GISEL-NEXT:    ldp x15, x16, [x1, #48]
+; GISEL-NEXT:    stp x12, x8, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #49
+; GISEL-NEXT:    stp x9, x10, [x0, #16]
+; GISEL-NEXT:    extr x9, x14, x13, #49
+; GISEL-NEXT:    extr x10, x15, x14, #49
+; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    extr x8, x16, x15, #49
+; GISEL-NEXT:    stp x10, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_15:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x14, [x1, #24]
-; GISEL-NEXT:    ldr x16, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #49
-; GISEL-NEXT:    lsl x13, x9, #49
-; GISEL-NEXT:    lsl x15, x10, #49
-; GISEL-NEXT:    orr x11, x12, x11, lsr #15
-; GISEL-NEXT:    orr x8, x13, x8, lsr #15
-; GISEL-NEXT:    lsl x13, x14, #49
-; GISEL-NEXT:    orr x9, x15, x9, lsr #15
-; GISEL-NEXT:    ldp x12, x15, [x1, #40]
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    orr x10, x13, x10, lsr #15
-; GISEL-NEXT:    lsl x8, x16, #49
-; GISEL-NEXT:    lsl x11, x12, #49
-; GISEL-NEXT:    lsl x13, x15, #49
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x8, x8, x15, lsr #15
-; GISEL-NEXT:    lsr x10, x16, #15
-; GISEL-NEXT:    orr x11, x11, x14, lsr #15
-; GISEL-NEXT:    orr x9, x13, x12, lsr #15
-; GISEL-NEXT:    stp x8, x10, [x0, #48]
-; GISEL-NEXT:    stp x11, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #15
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #15
+; GISEL-NEXT:    extr x10, x11, x10, #15
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #15
+; GISEL-NEXT:    extr x9, x13, x12, #15
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #15
+; GISEL-NEXT:    extr x8, x15, x14, #15
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    lsr x9, x15, #15
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_15:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x13, [x1, #24]
-; GISEL-NEXT:    ldr x17, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #49
-; GISEL-NEXT:    lsl x15, x9, #49
-; GISEL-NEXT:    lsl x16, x10, #49
-; GISEL-NEXT:    orr x11, x12, x11, lsr #15
-; GISEL-NEXT:    ldp x14, x12, [x1, #40]
-; GISEL-NEXT:    orr x8, x15, x8, lsr #15
-; GISEL-NEXT:    lsl x15, x13, #49
-; GISEL-NEXT:    orr x9, x16, x9, lsr #15
-; GISEL-NEXT:    asr x16, x17, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x14, #49
-; GISEL-NEXT:    orr x10, x15, x10, lsr #15
-; GISEL-NEXT:    lsl x15, x12, #49
-; GISEL-NEXT:    orr x8, x11, x13, lsr #15
-; GISEL-NEXT:    lsl x11, x17, #49
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x15, x14, lsr #15
-; GISEL-NEXT:    lsl x13, x16, #49
-; GISEL-NEXT:    orr x10, x11, x12, lsr #15
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    orr x8, x13, x17, asr #15
-; GISEL-NEXT:    stp x10, x8, [x0, #48]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #15
+; GISEL-NEXT:    ldp x14, x15, [x1, #32]
+; GISEL-NEXT:    extr x9, x10, x9, #15
+; GISEL-NEXT:    extr x10, x11, x10, #15
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    asr x8, x13, #63
+; GISEL-NEXT:    extr x11, x14, x11, #15
+; GISEL-NEXT:    extr x9, x15, x14, #15
+; GISEL-NEXT:    lsl x8, x8, #49
+; GISEL-NEXT:    stp x10, x11, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x15, #15
+; GISEL-NEXT:    extr x11, x13, x12, #15
+; GISEL-NEXT:    orr x8, x8, x13, asr #15
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x11, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) {
 ; GISEL-LABEL: test_shl_i512_const_63:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    ldp x8, x9, [x1]
-; GISEL-NEXT:    ldp x11, x12, [x1, #16]
-; GISEL-NEXT:    ldp x14, x15, [x1, #32]
-; GISEL-NEXT:    lsr x10, x8, #1
-; GISEL-NEXT:    lsr x13, x9, #1
-; GISEL-NEXT:    lsl x8, x8, #63
-; GISEL-NEXT:    orr x9, x10, x9, lsl #63
-; GISEL-NEXT:    lsr x10, x11, #1
-; GISEL-NEXT:    orr x11, x13, x11, lsl #63
-; GISEL-NEXT:    ldp x13, x16, [x1, #48]
-; GISEL-NEXT:    stp x8, x9, [x0]
-; GISEL-NEXT:    lsr x8, x12, #1
-; GISEL-NEXT:    orr x10, x10, x12, lsl #63
-; GISEL-NEXT:    lsr x12, x14, #1
-; GISEL-NEXT:    lsr x9, x15, #1
-; GISEL-NEXT:    orr x8, x8, x14, lsl #63
-; GISEL-NEXT:    stp x11, x10, [x0, #16]
-; GISEL-NEXT:    orr x11, x12, x15, lsl #63
-; GISEL-NEXT:    lsr x12, x13, #1
-; GISEL-NEXT:    orr x9, x9, x13, lsl #63
-; GISEL-NEXT:    stp x8, x11, [x0, #32]
-; GISEL-NEXT:    orr x8, x12, x16, lsl #63
-; GISEL-NEXT:    stp x9, x8, [x0, #48]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x13, x14, [x1, #32]
+; GISEL-NEXT:    lsl x12, x8, #63
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    ldp x15, x16, [x1, #48]
+; GISEL-NEXT:    stp x12, x8, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #1
+; GISEL-NEXT:    stp x9, x10, [x0, #16]
+; GISEL-NEXT:    extr x9, x14, x13, #1
+; GISEL-NEXT:    extr x10, x15, x14, #1
+; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    extr x8, x16, x15, #1
+; GISEL-NEXT:    stp x10, x8, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_63:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x11, [x1]
-; GISEL-NEXT:    ldp x10, x14, [x1, #24]
-; GISEL-NEXT:    ldr x16, [x1, #56]
-; GISEL-NEXT:    lsl x12, x8, #1
-; GISEL-NEXT:    lsl x13, x9, #1
-; GISEL-NEXT:    lsl x15, x10, #1
-; GISEL-NEXT:    orr x11, x12, x11, lsr #63
-; GISEL-NEXT:    orr x8, x13, x8, lsr #63
-; GISEL-NEXT:    lsl x13, x14, #1
-; GISEL-NEXT:    orr x9, x15, x9, lsr #63
-; GISEL-NEXT:    ldp x12, x15, [x1, #40]
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    orr x10, x13, x10, lsr #63
-; GISEL-NEXT:    lsl x8, x16, #1
-; GISEL-NEXT:    lsl x11, x12, #1
-; GISEL-NEXT:    lsl x13, x15, #1
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x8, x8, x15, lsr #63
-; GISEL-NEXT:    lsr x10, x16, #63
-; GISEL-NEXT:    orr x11, x11, x14, lsr #63
-; GISEL-NEXT:    orr x9, x13, x12, lsr #63
-; GISEL-NEXT:    stp x8, x10, [x0, #48]
-; GISEL-NEXT:    stp x11, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    extr x9, x13, x12, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #63
+; GISEL-NEXT:    extr x8, x15, x14, #63
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    lsr x9, x15, #63
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_63:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #8]
-; GISEL-NEXT:    ldr x10, [x1]
-; GISEL-NEXT:    ldp x11, x13, [x1, #24]
-; GISEL-NEXT:    ldr x17, [x1, #56]
-; GISEL-NEXT:    lsl x15, x9, #1
-; GISEL-NEXT:    lsl x12, x8, #1
-; GISEL-NEXT:    lsl x16, x11, #1
-; GISEL-NEXT:    orr x8, x15, x8, lsr #63
-; GISEL-NEXT:    lsl x15, x13, #1
-; GISEL-NEXT:    orr x10, x12, x10, lsr #63
-; GISEL-NEXT:    ldp x14, x12, [x1, #40]
-; GISEL-NEXT:    orr x9, x16, x9, lsr #63
-; GISEL-NEXT:    orr x11, x15, x11, lsr #63
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x8, x17, #1
-; GISEL-NEXT:    lsl x16, x14, #1
-; GISEL-NEXT:    lsl x10, x12, #1
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    asr x9, x17, #63
-; GISEL-NEXT:    orr x8, x8, x12, lsr #63
-; GISEL-NEXT:    orr x13, x16, x13, lsr #63
-; GISEL-NEXT:    orr x10, x10, x14, lsr #63
-; GISEL-NEXT:    orr x9, x9, x9, lsl #1
-; GISEL-NEXT:    stp x13, x10, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1]
+; GISEL-NEXT:    ldp x10, x11, [x1, #16]
+; GISEL-NEXT:    ldp x12, x13, [x1, #32]
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    ldp x14, x15, [x1, #48]
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    extr x9, x13, x12, #63
+; GISEL-NEXT:    extr x11, x14, x13, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    asr x10, x15, #63
+; GISEL-NEXT:    extr x8, x15, x14, #63
+; GISEL-NEXT:    stp x9, x11, [x0, #32]
+; GISEL-NEXT:    orr x9, x10, x10, lsl #1
 ; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
@@ -5906,23 +5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) {
 ; GISEL-NEXT:    ldr x15, [x1, #48]
 ; GISEL-NEXT:    ldp x10, x11, [x1, #16]
 ; GISEL-NEXT:    ldp x12, x13, [x1, #32]
-; GISEL-NEXT:    lsr x14, x8, #63
-; GISEL-NEXT:    lsr x16, x9, #63
-; GISEL-NEXT:    lsl x8, x8, #1
-; GISEL-NEXT:    orr x9, x14, x9, lsl #1
-; GISEL-NEXT:    lsr x14, x10, #63
-; GISEL-NEXT:    orr x10, x16, x10, lsl #1
-; GISEL-NEXT:    stp xzr, x8, [x0]
-; GISEL-NEXT:    lsr x8, x11, #63
-; GISEL-NEXT:    orr x11, x14, x11, lsl #1
-; GISEL-NEXT:    lsr x14, x12, #63
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    lsr x9, x13, #63
-; GISEL-NEXT:    orr x8, x8, x12, lsl #1
-; GISEL-NEXT:    orr x10, x14, x13, lsl #1
-; GISEL-NEXT:    orr x9, x9, x15, lsl #1
-; GISEL-NEXT:    stp x11, x8, [x0, #32]
-; GISEL-NEXT:    stp x10, x9, [x0, #48]
+; GISEL-NEXT:    lsl x14, x8, #1
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp xzr, x14, [x0]
+; GISEL-NEXT:    stp x8, x9, [x0, #16]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    extr x9, x13, x12, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #32]
+; GISEL-NEXT:    extr x10, x15, x13, #63
+; GISEL-NEXT:    stp x9, x10, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_65:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #63
-; GISEL-NEXT:    lsl x13, x9, #63
-; GISEL-NEXT:    orr x10, x12, x10, lsr #1
-; GISEL-NEXT:    lsl x12, x11, #63
-; GISEL-NEXT:    orr x8, x13, x8, lsr #1
-; GISEL-NEXT:    lsl x13, x14, #63
-; GISEL-NEXT:    orr x9, x12, x9, lsr #1
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x15, #63
-; GISEL-NEXT:    orr x11, x13, x11, lsr #1
-; GISEL-NEXT:    lsl x12, x16, #63
-; GISEL-NEXT:    orr x8, x10, x14, lsr #1
-; GISEL-NEXT:    lsr x10, x16, #1
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x12, x15, lsr #1
-; GISEL-NEXT:    stp x10, xzr, [x0, #48]
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #1
+; GISEL-NEXT:    extr x9, x13, x12, #1
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #1
+; GISEL-NEXT:    lsr x8, x14, #1
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, xzr, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_65:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x11, [x1, #8]
-; GISEL-NEXT:    ldp x10, x13, [x1, #32]
-; GISEL-NEXT:    lsl x12, x8, #63
-; GISEL-NEXT:    lsl x14, x9, #63
-; GISEL-NEXT:    lsl x15, x10, #63
-; GISEL-NEXT:    orr x11, x12, x11, lsr #1
-; GISEL-NEXT:    ldp x12, x16, [x1, #48]
-; GISEL-NEXT:    orr x8, x14, x8, lsr #1
-; GISEL-NEXT:    lsl x14, x13, #63
-; GISEL-NEXT:    orr x9, x15, x9, lsr #1
-; GISEL-NEXT:    asr x15, x16, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x12, #63
-; GISEL-NEXT:    orr x10, x14, x10, lsr #1
-; GISEL-NEXT:    lsl x14, x16, #63
-; GISEL-NEXT:    orr x8, x11, x13, lsr #1
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x13, [x1, #40]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x14, x12, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    asr x15, x12, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #1
+; GISEL-NEXT:    extr x9, x14, x13, #1
 ; GISEL-NEXT:    lsl x11, x15, #63
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x14, x12, lsr #1
-; GISEL-NEXT:    orr x10, x11, x16, asr #1
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    stp x10, x15, [x0, #48]
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x14, #1
+; GISEL-NEXT:    orr x8, x11, x12, asr #1
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, x15, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) {
 ; GISEL-NEXT:    ldr x15, [x1, #48]
 ; GISEL-NEXT:    ldp x10, x11, [x1, #16]
 ; GISEL-NEXT:    ldp x12, x13, [x1, #32]
-; GISEL-NEXT:    lsr x14, x8, #28
-; GISEL-NEXT:    lsr x16, x9, #28
-; GISEL-NEXT:    lsl x8, x8, #36
-; GISEL-NEXT:    orr x9, x14, x9, lsl #36
-; GISEL-NEXT:    lsr x14, x10, #28
-; GISEL-NEXT:    orr x10, x16, x10, lsl #36
-; GISEL-NEXT:    stp xzr, x8, [x0]
-; GISEL-NEXT:    lsr x8, x11, #28
-; GISEL-NEXT:    orr x11, x14, x11, lsl #36
-; GISEL-NEXT:    lsr x14, x12, #28
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    lsr x9, x13, #28
-; GISEL-NEXT:    orr x8, x8, x12, lsl #36
-; GISEL-NEXT:    orr x10, x14, x13, lsl #36
-; GISEL-NEXT:    orr x9, x9, x15, lsl #36
-; GISEL-NEXT:    stp x11, x8, [x0, #32]
-; GISEL-NEXT:    stp x10, x9, [x0, #48]
+; GISEL-NEXT:    lsl x14, x8, #36
+; GISEL-NEXT:    extr x8, x9, x8, #28
+; GISEL-NEXT:    extr x9, x10, x9, #28
+; GISEL-NEXT:    extr x10, x11, x10, #28
+; GISEL-NEXT:    stp xzr, x14, [x0]
+; GISEL-NEXT:    stp x8, x9, [x0, #16]
+; GISEL-NEXT:    extr x8, x12, x11, #28
+; GISEL-NEXT:    extr x9, x13, x12, #28
+; GISEL-NEXT:    stp x10, x8, [x0, #32]
+; GISEL-NEXT:    extr x10, x15, x13, #28
+; GISEL-NEXT:    stp x9, x10, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_100:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #28
-; GISEL-NEXT:    lsl x13, x9, #28
-; GISEL-NEXT:    orr x10, x12, x10, lsr #36
-; GISEL-NEXT:    lsl x12, x11, #28
-; GISEL-NEXT:    orr x8, x13, x8, lsr #36
-; GISEL-NEXT:    lsl x13, x14, #28
-; GISEL-NEXT:    orr x9, x12, x9, lsr #36
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x15, #28
-; GISEL-NEXT:    orr x11, x13, x11, lsr #36
-; GISEL-NEXT:    lsl x12, x16, #28
-; GISEL-NEXT:    orr x8, x10, x14, lsr #36
-; GISEL-NEXT:    lsr x10, x16, #36
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x12, x15, lsr #36
-; GISEL-NEXT:    stp x10, xzr, [x0, #48]
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #36
+; GISEL-NEXT:    extr x9, x10, x9, #36
+; GISEL-NEXT:    extr x10, x11, x10, #36
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #36
+; GISEL-NEXT:    extr x9, x13, x12, #36
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #36
+; GISEL-NEXT:    lsr x8, x14, #36
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, xzr, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_100:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x11, [x1, #8]
-; GISEL-NEXT:    ldp x10, x13, [x1, #32]
-; GISEL-NEXT:    lsl x12, x8, #28
-; GISEL-NEXT:    lsl x14, x9, #28
-; GISEL-NEXT:    lsl x15, x10, #28
-; GISEL-NEXT:    orr x11, x12, x11, lsr #36
-; GISEL-NEXT:    ldp x12, x16, [x1, #48]
-; GISEL-NEXT:    orr x8, x14, x8, lsr #36
-; GISEL-NEXT:    lsl x14, x13, #28
-; GISEL-NEXT:    orr x9, x15, x9, lsr #36
-; GISEL-NEXT:    asr x15, x16, #63
-; GISEL-NEXT:    stp x11, x8, [x0]
-; GISEL-NEXT:    lsl x11, x12, #28
-; GISEL-NEXT:    orr x10, x14, x10, lsr #36
-; GISEL-NEXT:    lsl x14, x16, #28
-; GISEL-NEXT:    orr x8, x11, x13, lsr #36
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x13, [x1, #40]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x14, x12, [x1, #48]
+; GISEL-NEXT:    extr x8, x9, x8, #36
+; GISEL-NEXT:    extr x9, x10, x9, #36
+; GISEL-NEXT:    extr x10, x11, x10, #36
+; GISEL-NEXT:    asr x15, x12, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x13, x11, #36
+; GISEL-NEXT:    extr x9, x14, x13, #36
 ; GISEL-NEXT:    lsl x11, x15, #28
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    orr x9, x14, x12, lsr #36
-; GISEL-NEXT:    orr x10, x11, x16, asr #36
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
-; GISEL-NEXT:    stp x10, x15, [x0, #48]
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x12, x14, #36
+; GISEL-NEXT:    orr x8, x11, x12, asr #36
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, x15, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) {
 ; GISEL-NEXT:    ldr x15, [x1, #48]
 ; GISEL-NEXT:    ldp x10, x11, [x1, #16]
 ; GISEL-NEXT:    ldp x12, x13, [x1, #32]
-; GISEL-NEXT:    lsr x14, x8, #1
-; GISEL-NEXT:    lsr x16, x9, #1
-; GISEL-NEXT:    lsl x8, x8, #63
-; GISEL-NEXT:    orr x9, x14, x9, lsl #63
-; GISEL-NEXT:    lsr x14, x10, #1
-; GISEL-NEXT:    orr x10, x16, x10, lsl #63
-; GISEL-NEXT:    stp xzr, x8, [x0]
-; GISEL-NEXT:    lsr x8, x11, #1
-; GISEL-NEXT:    orr x11, x14, x11, lsl #63
-; GISEL-NEXT:    lsr x14, x12, #1
-; GISEL-NEXT:    stp x9, x10, [x0, #16]
-; GISEL-NEXT:    lsr x9, x13, #1
-; GISEL-NEXT:    orr x8, x8, x12, lsl #63
-; GISEL-NEXT:    orr x10, x14, x13, lsl #63
-; GISEL-NEXT:    orr x9, x9, x15, lsl #63
-; GISEL-NEXT:    stp x11, x8, [x0, #32]
-; GISEL-NEXT:    stp x10, x9, [x0, #48]
+; GISEL-NEXT:    lsl x14, x8, #63
+; GISEL-NEXT:    extr x8, x9, x8, #1
+; GISEL-NEXT:    extr x9, x10, x9, #1
+; GISEL-NEXT:    extr x10, x11, x10, #1
+; GISEL-NEXT:    stp xzr, x14, [x0]
+; GISEL-NEXT:    stp x8, x9, [x0, #16]
+; GISEL-NEXT:    extr x8, x12, x11, #1
+; GISEL-NEXT:    extr x9, x13, x12, #1
+; GISEL-NEXT:    stp x10, x8, [x0, #32]
+; GISEL-NEXT:    extr x10, x15, x13, #1
+; GISEL-NEXT:    stp x9, x10, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6266,27 +6114,21 @@ define void @test_lshr_i512_const_127(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_lshr_i512_const_127:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #1
-; GISEL-NEXT:    lsl x13, x9, #1
-; GISEL-NEXT:    orr x10, x12, x10, lsr #63
-; GISEL-NEXT:    lsl x12, x11, #1
-; GISEL-NEXT:    orr x8, x13, x8, lsr #63
-; GISEL-NEXT:    lsl x13, x14, #1
-; GISEL-NEXT:    orr x9, x12, x9, lsr #63
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x15, #1
-; GISEL-NEXT:    orr x11, x13, x11, lsr #63
-; GISEL-NEXT:    lsl x12, x16, #1
-; GISEL-NEXT:    orr x8, x10, x14, lsr #63
-; GISEL-NEXT:    lsr x10, x16, #63
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x12, x15, lsr #63
-; GISEL-NEXT:    stp x10, xzr, [x0, #48]
-; GISEL-NEXT:    stp x8, x9, [x0, #32]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    extr x9, x13, x12, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #63
+; GISEL-NEXT:    lsr x8, x14, #63
+; GISEL-NEXT:    stp x9, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, xzr, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
@@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) {
 ;
 ; GISEL-LABEL: test_ashr_i512_const_127:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    ldp x8, x9, [x1, #16]
-; GISEL-NEXT:    ldr x10, [x1, #8]
-; GISEL-NEXT:    ldp x11, x14, [x1, #32]
-; GISEL-NEXT:    ldp x15, x16, [x1, #48]
-; GISEL-NEXT:    lsl x12, x8, #1
-; GISEL-NEXT:    lsl x13, x9, #1
-; GISEL-NEXT:    orr x10, x12, x10, lsr #63
-; GISEL-NEXT:    lsl x12, x11, #1
-; GISEL-NEXT:    orr x8, x13, x8, lsr #63
-; GISEL-NEXT:    lsl x13, x14, #1
-; GISEL-NEXT:    orr x9, x12, x9, lsr #63
-; GISEL-NEXT:    lsl x12, x15, #1
-; GISEL-NEXT:    stp x10, x8, [x0]
-; GISEL-NEXT:    lsl x10, x16, #1
-; GISEL-NEXT:    orr x11, x13, x11, lsr #63
-; GISEL-NEXT:    asr x8, x16, #63
-; GISEL-NEXT:    orr x12, x12, x14, lsr #63
-; GISEL-NEXT:    stp x9, x11, [x0, #16]
-; GISEL-NEXT:    orr x9, x10, x15, lsr #63
-; GISEL-NEXT:    orr x10, x8, x8, lsl #1
-; GISEL-NEXT:    stp x12, x9, [x0, #32]
-; GISEL-NEXT:    stp x10, x8, [x0, #48]
+; GISEL-NEXT:    ldp x8, x9, [x1, #8]
+; GISEL-NEXT:    ldr x14, [x1, #56]
+; GISEL-NEXT:    ldp x10, x11, [x1, #24]
+; GISEL-NEXT:    ldp x12, x13, [x1, #40]
+; GISEL-NEXT:    extr x8, x9, x8, #63
+; GISEL-NEXT:    extr x9, x10, x9, #63
+; GISEL-NEXT:    extr x10, x11, x10, #63
+; GISEL-NEXT:    stp x8, x9, [x0]
+; GISEL-NEXT:    extr x8, x12, x11, #63
+; GISEL-NEXT:    asr x9, x14, #63
+; GISEL-NEXT:    extr x11, x13, x12, #63
+; GISEL-NEXT:    stp x10, x8, [x0, #16]
+; GISEL-NEXT:    extr x10, x14, x13, #63
+; GISEL-NEXT:    orr x8, x9, x9, lsl #1
+; GISEL-NEXT:    stp x11, x10, [x0, #32]
+; GISEL-NEXT:    stp x8, x9, [x0, #48]
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i512, ptr %input, align 64
diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll
index 12e8bf2..03f3cf1 100644
--- a/llvm/test/CodeGen/AArch64/adc.ll
+++ b/llvm/test/CodeGen/AArch64/adc.ll
@@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) {
 ;
 ; CHECK-GI-LABEL: test_shifted:
 ; CHECK-GI:       ; %bb.0:
-; CHECK-GI-NEXT:    lsr x8, x2, #19
+; CHECK-GI-NEXT:    extr x8, x3, x2, #19
 ; CHECK-GI-NEXT:    adds x0, x0, x2, lsl #45
-; CHECK-GI-NEXT:    orr x8, x8, x3, lsl #45
 ; CHECK-GI-NEXT:    adc x1, x1, x8
 ; CHECK-GI-NEXT:    ret
   %rhs = shl i128 %b, 45
@@ -108,8 +107,7 @@ define i128 @test_extended(i128 %a, i16 %b) {
 ; CHECK-GI-NEXT:    sxth x8, w2
 ; CHECK-GI-NEXT:    adds x0, x0, w2, sxth #3
 ; CHECK-GI-NEXT:    asr x9, x8, #63
-; CHECK-GI-NEXT:    lsr x8, x8, #61
-; CHECK-GI-NEXT:    orr x8, x8, x9, lsl #3
+; CHECK-GI-NEXT:    extr x8, x9, x8, #61
 ; CHECK-GI-NEXT:    adc x1, x1, x8
 ; CHECK-GI-NEXT:    ret
   %ext = sext i16 %b to i128
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index 076cbf7..a505b42 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1408,6 +1408,88 @@ define <4 x i16> @ext_via_i19(<4 x i16> %a) {
   ret <4 x i16> %t6
 }
 
+define <8 x i8> @srhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) { ; trunc(urhadd(sext a, sext b)) on 8 x i8: the expected assembly is one narrow signed srhadd.8b
+; CHECK-LABEL: srhadd_v8i8_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd.8b v0, v0, v1
+; CHECK-NEXT:    ret
+  %s0s = sext <8 x i8> %s0 to <8 x i16> ; both inputs are sign-extended to twice the element width
+  %s1s = sext <8 x i8> %s1 to <8 x i16>
+  %s = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s) ; note: unsigned intrinsic on sign-extended inputs, while the expected assembly is the signed form
+  %s2 = trunc <8 x i16> %s to <8 x i8> ; only the low 8 bits of each lane are returned
+  ret <8 x i8> %s2
+}
+
+define <4 x i16> @srhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) { ; trunc(urhadd(sext a, sext b)) on 4 x i16: the expected assembly is one narrow signed srhadd.4h
+; CHECK-LABEL: srhadd_v4i16_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    srhadd.4h v0, v0, v1
+; CHECK-NEXT:    ret
+  %s0s = sext <4 x i16> %s0 to <4 x i32> ; both inputs are sign-extended to twice the element width
+  %s1s = sext <4 x i16> %s1 to <4 x i32>
+  %s = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s) ; note: unsigned intrinsic on sign-extended inputs, while the expected assembly is the signed form
+  %s2 = trunc <4 x i32> %s to <4 x i16> ; only the low 16 bits of each lane are returned
+  ret <4 x i16> %s2
+}
+
+define <2 x i32> @srhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) { ; same sext/urhadd/trunc pattern on 2 x i32: the expected assembly is an expanded sequence on .2d lanes, not a single srhadd
+; CHECK-LABEL: srhadd_v2i32_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshll.2d v0, v0, #0
+; CHECK-NEXT:    sshll.2d v1, v1, #0
+; CHECK-NEXT:    eor.16b v2, v0, v1
+; CHECK-NEXT:    orr.16b v0, v0, v1
+; CHECK-NEXT:    ushr.2d v1, v2, #1
+; CHECK-NEXT:    sub.2d v0, v0, v1
+; CHECK-NEXT:    xtn.2s v0, v0
+; CHECK-NEXT:    ret
+  %s0s = sext <2 x i32> %s0 to <2 x i64> ; both inputs are sign-extended to 64-bit lanes
+  %s1s = sext <2 x i32> %s1 to <2 x i64>
+  %s = call <2 x i64> @llvm.aarch64.neon.urhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s) ; NOTE(review): why this width is not narrowed to srhadd.2s like the 8b/4h tests is not visible in this file - confirm it is intended
+  %s2 = trunc <2 x i64> %s to <2 x i32> ; only the low 32 bits of each lane are returned (xtn in the expected assembly)
+  ret <2 x i32> %s2
+}
+
+define <8 x i8> @urhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) { ; trunc(srhadd(zext a, zext b)) on 8 x i8: the expected assembly is one narrow unsigned urhadd.8b
+; CHECK-LABEL: urhadd_v8i8_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd.8b v0, v0, v1
+; CHECK-NEXT:    ret
+  %s0s = zext <8 x i8> %s0 to <8 x i16> ; both inputs are zero-extended to twice the element width
+  %s1s = zext <8 x i8> %s1 to <8 x i16>
+  %s = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s) ; note: signed intrinsic on zero-extended inputs, while the expected assembly is the unsigned form
+  %s2 = trunc <8 x i16> %s to <8 x i8> ; only the low 8 bits of each lane are returned
+  ret <8 x i8> %s2
+}
+
+define <4 x i16> @urhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) { ; trunc(srhadd(zext a, zext b)) on 4 x i16: the expected assembly is one narrow unsigned urhadd.4h
+; CHECK-LABEL: urhadd_v4i16_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd.4h v0, v0, v1
+; CHECK-NEXT:    ret
+  %s0s = zext <4 x i16> %s0 to <4 x i32> ; both inputs are zero-extended to twice the element width
+  %s1s = zext <4 x i16> %s1 to <4 x i32>
+  %s = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s) ; note: signed intrinsic on zero-extended inputs, while the expected assembly is the unsigned form
+  %s2 = trunc <4 x i32> %s to <4 x i16> ; only the low 16 bits of each lane are returned
+  ret <4 x i16> %s2
+}
+
+define <2 x i32> @urhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) { ; same zext/srhadd/trunc pattern on 2 x i32: the expected assembly is uaddl, add of a splatted 1, then shrn #1, not a single urhadd
+; CHECK-LABEL: urhadd_v2i32_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    uaddl.2d v0, v0, v1
+; CHECK-NEXT:    dup.2d v1, x8
+; CHECK-NEXT:    add.2d v0, v0, v1
+; CHECK-NEXT:    shrn.2s v0, v0, #1
+; CHECK-NEXT:    ret
+  %s0s = zext <2 x i32> %s0 to <2 x i64> ; both inputs are zero-extended to 64-bit lanes
+  %s1s = zext <2 x i32> %s1 to <2 x i64>
+  %s = call <2 x i64> @llvm.aarch64.neon.srhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s) ; NOTE(review): why this width is not narrowed to urhadd.2s like the 8b/4h tests is not visible in this file - confirm it is intended
+  %s2 = trunc <2 x i64> %s to <2 x i32> ; only the low 32 bits of each lane are returned (shrn in the expected assembly)
+  ret <2 x i32> %s2
+}
+
 declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
 declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>)
 declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 670574f2..6df6d76 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -2,16 +2,21 @@
 ; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for dup_v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for duplane0_v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for loaddup_v2i8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for loaddup_str_v2i8
-
 define <2 x i8> @dup_v2i8(i8 %a) {
-; CHECK-LABEL: dup_v2i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    dup v0.2s, w0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: dup_v2i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    dup v0.2s, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: dup_v2i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    dup v0.8b, w0
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[1]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %b = insertelement <2 x i8> poison, i8 %a, i64 0
   %c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer
@@ -19,22 +24,45 @@ entry:
 }
 
 define <2 x i8> @duplane0_v2i8(<2 x i8> %b) {
-; CHECK-LABEL: duplane0_v2i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v0.2s, v0.s[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: duplane0_v2i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: duplane0_v2i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[1]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer
   ret <2 x i8> %c
 }
 
 define <2 x i8> @loaddup_v2i8(ptr %p) {
-; CHECK-LABEL: loaddup_v2i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr b0, [x0]
-; CHECK-NEXT:    dup v0.2s, v0.s[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: loaddup_v2i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldr b0, [x0]
+; CHECK-SD-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: loaddup_v2i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ld1r { v0.8b }, [x0]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[1]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %a = load i8, ptr %p
   %b = insertelement <2 x i8> poison, i8 %a, i64 0
@@ -43,12 +71,24 @@ entry:
 }
 
 define <2 x i8> @loaddup_str_v2i8(ptr %p) {
-; CHECK-LABEL: loaddup_str_v2i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    strb wzr, [x0]
-; CHECK-NEXT:    dup v0.2s, w8
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: loaddup_str_v2i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldrb w8, [x0]
+; CHECK-SD-NEXT:    strb wzr, [x0]
+; CHECK-SD-NEXT:    dup v0.2s, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: loaddup_str_v2i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    strb wzr, [x0]
+; CHECK-GI-NEXT:    dup v0.8b, v0.b[0]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[1]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %a = load i8, ptr %p
   %b = insertelement <2 x i8> poison, i8 %a, i64 0
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index 765f6b7..7f07ef4 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -510,41 +510,40 @@ define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) {
 ;
 ; CHECK-GI-LABEL: fshl_i128:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #64 // =0x40
 ; CHECK-GI-NEXT:    and x9, x4, #0x7f
-; CHECK-GI-NEXT:    mov w10, #64 // =0x40
-; CHECK-GI-NEXT:    lsl x14, x3, #63
-; CHECK-GI-NEXT:    sub x12, x10, x9
+; CHECK-GI-NEXT:    mov w10, #127 // =0x7f
+; CHECK-GI-NEXT:    sub x12, x8, x9
 ; CHECK-GI-NEXT:    lsl x13, x1, x9
-; CHECK-GI-NEXT:    mov w8, #127 // =0x7f
+; CHECK-GI-NEXT:    bic x10, x10, x4
 ; CHECK-GI-NEXT:    lsr x12, x0, x12
-; CHECK-GI-NEXT:    bic x8, x8, x4
-; CHECK-GI-NEXT:    sub x15, x9, #64
+; CHECK-GI-NEXT:    sub x14, x9, #64
+; CHECK-GI-NEXT:    lsl x15, x0, x9
+; CHECK-GI-NEXT:    extr x16, x3, x2, #1
 ; CHECK-GI-NEXT:    cmp x9, #64
-; CHECK-GI-NEXT:    lsl x9, x0, x9
-; CHECK-GI-NEXT:    lsl x15, x0, x15
-; CHECK-GI-NEXT:    orr x12, x12, x13
-; CHECK-GI-NEXT:    orr x13, x14, x2, lsr #1
-; CHECK-GI-NEXT:    lsr x14, x3, #1
-; CHECK-GI-NEXT:    sub x10, x10, x8
-; CHECK-GI-NEXT:    sub x16, x8, #64
-; CHECK-GI-NEXT:    csel x9, x9, xzr, lo
-; CHECK-GI-NEXT:    lsr x17, x13, x8
-; CHECK-GI-NEXT:    lsl x10, x14, x10
-; CHECK-GI-NEXT:    csel x12, x12, x15, lo
+; CHECK-GI-NEXT:    sub x8, x8, x10
+; CHECK-GI-NEXT:    orr x9, x12, x13
+; CHECK-GI-NEXT:    lsr x12, x3, #1
+; CHECK-GI-NEXT:    lsl x13, x0, x14
+; CHECK-GI-NEXT:    csel x14, x15, xzr, lo
+; CHECK-GI-NEXT:    sub x15, x10, #64
+; CHECK-GI-NEXT:    lsr x17, x16, x10
+; CHECK-GI-NEXT:    lsl x8, x12, x8
+; CHECK-GI-NEXT:    csel x9, x9, x13, lo
 ; CHECK-GI-NEXT:    tst x4, #0x7f
-; CHECK-GI-NEXT:    lsr x15, x14, x16
+; CHECK-GI-NEXT:    lsr x13, x12, x15
 ; CHECK-GI-NEXT:    mvn x11, x4
-; CHECK-GI-NEXT:    csel x12, x1, x12, eq
-; CHECK-GI-NEXT:    orr x10, x17, x10
-; CHECK-GI-NEXT:    cmp x8, #64
-; CHECK-GI-NEXT:    lsr x14, x14, x8
-; CHECK-GI-NEXT:    csel x10, x10, x15, lo
+; CHECK-GI-NEXT:    csel x9, x1, x9, eq
+; CHECK-GI-NEXT:    orr x8, x17, x8
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    lsr x12, x12, x10
+; CHECK-GI-NEXT:    csel x8, x8, x13, lo
 ; CHECK-GI-NEXT:    tst x11, #0x7f
-; CHECK-GI-NEXT:    csel x10, x13, x10, eq
-; CHECK-GI-NEXT:    cmp x8, #64
-; CHECK-GI-NEXT:    csel x8, x14, xzr, lo
-; CHECK-GI-NEXT:    orr x0, x9, x10
-; CHECK-GI-NEXT:    orr x1, x12, x8
+; CHECK-GI-NEXT:    csel x8, x16, x8, eq
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    csel x10, x12, xzr, lo
+; CHECK-GI-NEXT:    orr x0, x14, x8
+; CHECK-GI-NEXT:    orr x1, x9, x10
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c)
@@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) {
 ;
 ; CHECK-GI-LABEL: fshr_i128:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x0, #63
-; CHECK-GI-NEXT:    mov w9, #127 // =0x7f
-; CHECK-GI-NEXT:    mov w10, #64 // =0x40
-; CHECK-GI-NEXT:    bic x9, x9, x4
-; CHECK-GI-NEXT:    lsl x11, x0, #1
-; CHECK-GI-NEXT:    and x12, x4, #0x7f
-; CHECK-GI-NEXT:    orr x8, x8, x1, lsl #1
-; CHECK-GI-NEXT:    sub x14, x10, x9
-; CHECK-GI-NEXT:    sub x17, x9, #64
-; CHECK-GI-NEXT:    lsl x15, x11, x9
-; CHECK-GI-NEXT:    lsr x14, x11, x14
-; CHECK-GI-NEXT:    cmp x9, #64
-; CHECK-GI-NEXT:    lsl x16, x8, x9
-; CHECK-GI-NEXT:    sub x9, x10, x12
-; CHECK-GI-NEXT:    lsl x10, x11, x17
-; CHECK-GI-NEXT:    mvn x13, x4
-; CHECK-GI-NEXT:    csel x11, x15, xzr, lo
-; CHECK-GI-NEXT:    sub x15, x12, #64
-; CHECK-GI-NEXT:    orr x14, x14, x16
-; CHECK-GI-NEXT:    lsr x16, x2, x12
-; CHECK-GI-NEXT:    lsl x9, x3, x9
-; CHECK-GI-NEXT:    csel x10, x14, x10, lo
-; CHECK-GI-NEXT:    tst x13, #0x7f
-; CHECK-GI-NEXT:    lsr x13, x3, x15
-; CHECK-GI-NEXT:    csel x8, x8, x10, eq
-; CHECK-GI-NEXT:    orr x9, x16, x9
-; CHECK-GI-NEXT:    cmp x12, #64
-; CHECK-GI-NEXT:    lsr x10, x3, x12
-; CHECK-GI-NEXT:    csel x9, x9, x13, lo
+; CHECK-GI-NEXT:    mov w8, #127 // =0x7f
+; CHECK-GI-NEXT:    lsl x9, x0, #1
+; CHECK-GI-NEXT:    extr x10, x1, x0, #63
+; CHECK-GI-NEXT:    bic x8, x8, x4
+; CHECK-GI-NEXT:    mov w11, #64 // =0x40
+; CHECK-GI-NEXT:    and x14, x4, #0x7f
+; CHECK-GI-NEXT:    sub x12, x11, x8
+; CHECK-GI-NEXT:    lsl x13, x10, x8
+; CHECK-GI-NEXT:    lsl x16, x9, x8
+; CHECK-GI-NEXT:    lsr x12, x9, x12
+; CHECK-GI-NEXT:    sub x17, x8, #64
+; CHECK-GI-NEXT:    cmp x8, #64
+; CHECK-GI-NEXT:    lsl x8, x9, x17
+; CHECK-GI-NEXT:    sub x11, x11, x14
+; CHECK-GI-NEXT:    mvn x15, x4
+; CHECK-GI-NEXT:    orr x12, x12, x13
+; CHECK-GI-NEXT:    csel x9, x16, xzr, lo
+; CHECK-GI-NEXT:    sub x13, x14, #64
+; CHECK-GI-NEXT:    lsr x16, x2, x14
+; CHECK-GI-NEXT:    lsl x11, x3, x11
+; CHECK-GI-NEXT:    csel x8, x12, x8, lo
+; CHECK-GI-NEXT:    tst x15, #0x7f
+; CHECK-GI-NEXT:    lsr x12, x3, x13
+; CHECK-GI-NEXT:    csel x8, x10, x8, eq
+; CHECK-GI-NEXT:    orr x10, x16, x11
+; CHECK-GI-NEXT:    cmp x14, #64
+; CHECK-GI-NEXT:    lsr x11, x3, x14
+; CHECK-GI-NEXT:    csel x10, x10, x12, lo
 ; CHECK-GI-NEXT:    tst x4, #0x7f
-; CHECK-GI-NEXT:    csel x9, x2, x9, eq
-; CHECK-GI-NEXT:    cmp x12, #64
-; CHECK-GI-NEXT:    csel x10, x10, xzr, lo
-; CHECK-GI-NEXT:    orr x0, x11, x9
-; CHECK-GI-NEXT:    orr x1, x8, x10
+; CHECK-GI-NEXT:    csel x10, x2, x10, eq
+; CHECK-GI-NEXT:    cmp x14, #64
+; CHECK-GI-NEXT:    csel x11, x11, xzr, lo
+; CHECK-GI-NEXT:    orr x0, x9, x10
+; CHECK-GI-NEXT:    orr x1, x8, x11
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c)
@@ -720,10 +718,9 @@ define i128 @rotl_i128_c(i128 %a) {
 ;
 ; CHECK-GI-LABEL: rotl_i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x0, #61
-; CHECK-GI-NEXT:    lsr x9, x1, #61
-; CHECK-GI-NEXT:    orr x1, x8, x1, lsl #3
-; CHECK-GI-NEXT:    orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT:    extr x8, x1, x0, #61
+; CHECK-GI-NEXT:    extr x0, x0, x1, #61
+; CHECK-GI-NEXT:    mov x1, x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3)
@@ -731,20 +728,12 @@ entry:
 }
 
 define i128 @rotr_i128_c(i128 %a) {
-; CHECK-SD-LABEL: rotr_i128_c:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    extr x8, x1, x0, #3
-; CHECK-SD-NEXT:    extr x1, x0, x1, #3
-; CHECK-SD-NEXT:    mov x0, x8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: rotr_i128_c:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl x8, x1, #61
-; CHECK-GI-NEXT:    lsl x9, x0, #61
-; CHECK-GI-NEXT:    orr x0, x8, x0, lsr #3
-; CHECK-GI-NEXT:    orr x1, x9, x1, lsr #3
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: rotr_i128_c:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    extr x8, x1, x0, #3
+; CHECK-NEXT:    extr x1, x0, x1, #3
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3)
   ret i128 %d
@@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) {
 ;
 ; CHECK-GI-LABEL: fshl_i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x0, #61
-; CHECK-GI-NEXT:    lsr x9, x3, #61
-; CHECK-GI-NEXT:    orr x1, x8, x1, lsl #3
-; CHECK-GI-NEXT:    orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT:    extr x1, x1, x0, #61
+; CHECK-GI-NEXT:    extr x0, x0, x3, #61
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3)
@@ -879,21 +866,12 @@ entry:
 }
 
 define i128 @fshr_i128_c(i128 %a, i128 %b) {
-; CHECK-SD-LABEL: fshr_i128_c:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    extr x8, x3, x2, #3
-; CHECK-SD-NEXT:    extr x1, x0, x3, #3
-; CHECK-SD-NEXT:    mov x0, x8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fshr_i128_c:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl x8, x3, #61
-; CHECK-GI-NEXT:    lsr x9, x3, #3
-; CHECK-GI-NEXT:    orr x8, x8, x2, lsr #3
-; CHECK-GI-NEXT:    orr x1, x9, x0, lsl #61
-; CHECK-GI-NEXT:    mov x0, x8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fshr_i128_c:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    extr x8, x3, x2, #3
+; CHECK-NEXT:    extr x1, x0, x3, #3
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ret
 entry:
   %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3)
   ret i128 %d
@@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GI-NEXT:    .cfi_offset w19, -16
 ; CHECK-GI-NEXT:    ldr x11, [sp, #16]
-; CHECK-GI-NEXT:    mov w10, #64 // =0x40
+; CHECK-GI-NEXT:    mov w9, #64 // =0x40
 ; CHECK-GI-NEXT:    ldr x12, [sp, #32]
 ; CHECK-GI-NEXT:    mov w13, #127 // =0x7f
-; CHECK-GI-NEXT:    and x9, x11, #0x7f
+; CHECK-GI-NEXT:    and x8, x11, #0x7f
 ; CHECK-GI-NEXT:    and x14, x12, #0x7f
-; CHECK-GI-NEXT:    mvn x15, x11
-; CHECK-GI-NEXT:    sub x8, x10, x9
-; CHECK-GI-NEXT:    sub x16, x9, #64
-; CHECK-GI-NEXT:    lsl x19, x1, x9
-; CHECK-GI-NEXT:    lsr x18, x0, x8
-; CHECK-GI-NEXT:    lsl x17, x0, x9
-; CHECK-GI-NEXT:    lsl x16, x0, x16
-; CHECK-GI-NEXT:    cmp x9, #64
-; CHECK-GI-NEXT:    bic x0, x13, x11
-; CHECK-GI-NEXT:    mvn x8, x12
-; CHECK-GI-NEXT:    orr x18, x18, x19
-; CHECK-GI-NEXT:    csel x9, x17, xzr, lo
+; CHECK-GI-NEXT:    mvn x18, x11
+; CHECK-GI-NEXT:    sub x10, x9, x8
+; CHECK-GI-NEXT:    sub x15, x8, #64
+; CHECK-GI-NEXT:    lsl x17, x1, x8
+; CHECK-GI-NEXT:    lsr x16, x0, x10
+; CHECK-GI-NEXT:    lsl x15, x0, x15
+; CHECK-GI-NEXT:    cmp x8, #64
+; CHECK-GI-NEXT:    lsl x19, x0, x8
+; CHECK-GI-NEXT:    lsl x0, x3, x14
+; CHECK-GI-NEXT:    mvn x10, x12
+; CHECK-GI-NEXT:    orr x16, x16, x17
 ; CHECK-GI-NEXT:    sub x17, x14, #64
-; CHECK-GI-NEXT:    csel x16, x18, x16, lo
+; CHECK-GI-NEXT:    csel x15, x16, x15, lo
+; CHECK-GI-NEXT:    sub x16, x9, x14
+; CHECK-GI-NEXT:    csel x8, x19, xzr, lo
+; CHECK-GI-NEXT:    lsr x16, x2, x16
 ; CHECK-GI-NEXT:    tst x11, #0x7f
-; CHECK-GI-NEXT:    sub x11, x10, x14
-; CHECK-GI-NEXT:    lsr x11, x2, x11
-; CHECK-GI-NEXT:    lsl x18, x3, x14
-; CHECK-GI-NEXT:    csel x16, x1, x16, eq
-; CHECK-GI-NEXT:    lsl x1, x2, x14
+; CHECK-GI-NEXT:    lsl x19, x2, x14
 ; CHECK-GI-NEXT:    lsl x17, x2, x17
+; CHECK-GI-NEXT:    csel x15, x1, x15, eq
 ; CHECK-GI-NEXT:    cmp x14, #64
-; CHECK-GI-NEXT:    lsl x14, x5, #63
-; CHECK-GI-NEXT:    orr x11, x11, x18
-; CHECK-GI-NEXT:    bic x13, x13, x12
-; CHECK-GI-NEXT:    csel x18, x1, xzr, lo
-; CHECK-GI-NEXT:    csel x11, x11, x17, lo
+; CHECK-GI-NEXT:    orr x16, x16, x0
+; CHECK-GI-NEXT:    bic x11, x13, x11
+; CHECK-GI-NEXT:    csel x14, x19, xzr, lo
+; CHECK-GI-NEXT:    csel x16, x16, x17, lo
 ; CHECK-GI-NEXT:    tst x12, #0x7f
-; CHECK-GI-NEXT:    lsr x12, x5, #1
-; CHECK-GI-NEXT:    orr x14, x14, x4, lsr #1
-; CHECK-GI-NEXT:    lsl x17, x7, #63
-; CHECK-GI-NEXT:    sub x1, x10, x0
-; CHECK-GI-NEXT:    csel x11, x3, x11, eq
-; CHECK-GI-NEXT:    sub x2, x0, #64
-; CHECK-GI-NEXT:    lsr x3, x14, x0
-; CHECK-GI-NEXT:    lsl x1, x12, x1
-; CHECK-GI-NEXT:    lsr x4, x7, #1
-; CHECK-GI-NEXT:    orr x17, x17, x6, lsr #1
-; CHECK-GI-NEXT:    lsr x2, x12, x2
-; CHECK-GI-NEXT:    cmp x0, #64
-; CHECK-GI-NEXT:    orr x1, x3, x1
-; CHECK-GI-NEXT:    sub x10, x10, x13
-; CHECK-GI-NEXT:    lsr x12, x12, x0
-; CHECK-GI-NEXT:    csel x1, x1, x2, lo
-; CHECK-GI-NEXT:    tst x15, #0x7f
-; CHECK-GI-NEXT:    sub x15, x13, #64
-; CHECK-GI-NEXT:    lsr x2, x17, x13
-; CHECK-GI-NEXT:    lsl x10, x4, x10
-; CHECK-GI-NEXT:    csel x14, x14, x1, eq
-; CHECK-GI-NEXT:    cmp x0, #64
-; CHECK-GI-NEXT:    lsr x15, x4, x15
-; CHECK-GI-NEXT:    lsr x0, x4, x13
-; CHECK-GI-NEXT:    csel x12, x12, xzr, lo
-; CHECK-GI-NEXT:    orr x10, x2, x10
-; CHECK-GI-NEXT:    cmp x13, #64
-; CHECK-GI-NEXT:    csel x10, x10, x15, lo
-; CHECK-GI-NEXT:    tst x8, #0x7f
-; CHECK-GI-NEXT:    orr x1, x16, x12
-; CHECK-GI-NEXT:    csel x8, x17, x10, eq
-; CHECK-GI-NEXT:    cmp x13, #64
-; CHECK-GI-NEXT:    csel x10, x0, xzr, lo
-; CHECK-GI-NEXT:    orr x0, x9, x14
-; CHECK-GI-NEXT:    orr x2, x18, x8
-; CHECK-GI-NEXT:    orr x3, x11, x10
+; CHECK-GI-NEXT:    lsr x17, x5, #1
+; CHECK-GI-NEXT:    extr x0, x5, x4, #1
+; CHECK-GI-NEXT:    bic x12, x13, x12
+; CHECK-GI-NEXT:    csel x13, x3, x16, eq
+; CHECK-GI-NEXT:    sub x16, x9, x11
+; CHECK-GI-NEXT:    sub x1, x11, #64
+; CHECK-GI-NEXT:    lsr x3, x7, #1
+; CHECK-GI-NEXT:    lsr x2, x0, x11
+; CHECK-GI-NEXT:    lsl x16, x17, x16
+; CHECK-GI-NEXT:    extr x4, x7, x6, #1
+; CHECK-GI-NEXT:    lsr x1, x17, x1
+; CHECK-GI-NEXT:    cmp x11, #64
+; CHECK-GI-NEXT:    sub x9, x9, x12
+; CHECK-GI-NEXT:    orr x16, x2, x16
+; CHECK-GI-NEXT:    lsr x17, x17, x11
+; CHECK-GI-NEXT:    lsl x9, x3, x9
+; CHECK-GI-NEXT:    csel x16, x16, x1, lo
+; CHECK-GI-NEXT:    tst x18, #0x7f
+; CHECK-GI-NEXT:    sub x18, x12, #64
+; CHECK-GI-NEXT:    lsr x1, x4, x12
+; CHECK-GI-NEXT:    csel x16, x0, x16, eq
+; CHECK-GI-NEXT:    cmp x11, #64
+; CHECK-GI-NEXT:    lsr x11, x3, x18
+; CHECK-GI-NEXT:    csel x17, x17, xzr, lo
+; CHECK-GI-NEXT:    cmp x12, #64
+; CHECK-GI-NEXT:    orr x9, x1, x9
+; CHECK-GI-NEXT:    lsr x18, x3, x12
+; CHECK-GI-NEXT:    orr x0, x8, x16
+; CHECK-GI-NEXT:    csel x9, x9, x11, lo
+; CHECK-GI-NEXT:    tst x10, #0x7f
+; CHECK-GI-NEXT:    orr x1, x15, x17
+; CHECK-GI-NEXT:    csel x9, x4, x9, eq
+; CHECK-GI-NEXT:    cmp x12, #64
+; CHECK-GI-NEXT:    csel x10, x18, xzr, lo
+; CHECK-GI-NEXT:    orr x2, x14, x9
+; CHECK-GI-NEXT:    orr x3, x13, x10
 ; CHECK-GI-NEXT:    ldr x19, [sp], #16 // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
 ; CHECK-GI-LABEL: fshr_v2i128:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr x9, [sp]
-; CHECK-GI-NEXT:    lsl x12, x1, #1
-; CHECK-GI-NEXT:    mov w11, #127 // =0x7f
-; CHECK-GI-NEXT:    mov w14, #64 // =0x40
-; CHECK-GI-NEXT:    lsl x15, x0, #1
+; CHECK-GI-NEXT:    mov w10, #127 // =0x7f
+; CHECK-GI-NEXT:    mov w12, #64 // =0x40
+; CHECK-GI-NEXT:    lsl x13, x0, #1
+; CHECK-GI-NEXT:    extr x14, x1, x0, #63
 ; CHECK-GI-NEXT:    ldr x8, [sp, #16]
-; CHECK-GI-NEXT:    bic x13, x11, x9
-; CHECK-GI-NEXT:    orr x12, x12, x0, lsr #63
-; CHECK-GI-NEXT:    lsl x1, x3, #1
-; CHECK-GI-NEXT:    sub x17, x14, x13
-; CHECK-GI-NEXT:    sub x18, x13, #64
-; CHECK-GI-NEXT:    lsl x3, x15, x13
-; CHECK-GI-NEXT:    lsr x17, x15, x17
-; CHECK-GI-NEXT:    lsl x0, x12, x13
-; CHECK-GI-NEXT:    lsl x15, x15, x18
-; CHECK-GI-NEXT:    bic x11, x11, x8
+; CHECK-GI-NEXT:    bic x11, x10, x9
+; CHECK-GI-NEXT:    mvn x16, x9
+; CHECK-GI-NEXT:    and x15, x9, #0x7f
+; CHECK-GI-NEXT:    sub x17, x12, x11
+; CHECK-GI-NEXT:    sub x18, x11, #64
+; CHECK-GI-NEXT:    lsl x0, x14, x11
+; CHECK-GI-NEXT:    lsr x17, x13, x17
+; CHECK-GI-NEXT:    lsl x1, x13, x11
+; CHECK-GI-NEXT:    lsl x13, x13, x18
+; CHECK-GI-NEXT:    bic x10, x10, x8
 ; CHECK-GI-NEXT:    lsl x18, x2, #1
-; CHECK-GI-NEXT:    cmp x13, #64
+; CHECK-GI-NEXT:    cmp x11, #64
 ; CHECK-GI-NEXT:    orr x17, x17, x0
-; CHECK-GI-NEXT:    orr x13, x1, x2, lsr #63
-; CHECK-GI-NEXT:    mvn x16, x9
-; CHECK-GI-NEXT:    csel x15, x17, x15, lo
-; CHECK-GI-NEXT:    sub x17, x14, x11
-; CHECK-GI-NEXT:    csel x0, x3, xzr, lo
+; CHECK-GI-NEXT:    extr x11, x3, x2, #63
+; CHECK-GI-NEXT:    csel x0, x1, xzr, lo
+; CHECK-GI-NEXT:    csel x13, x17, x13, lo
+; CHECK-GI-NEXT:    sub x17, x12, x10
 ; CHECK-GI-NEXT:    tst x16, #0x7f
-; CHECK-GI-NEXT:    sub x16, x11, #64
+; CHECK-GI-NEXT:    sub x16, x10, #64
 ; CHECK-GI-NEXT:    lsr x17, x18, x17
-; CHECK-GI-NEXT:    lsl x2, x13, x11
-; CHECK-GI-NEXT:    lsl x1, x18, x11
-; CHECK-GI-NEXT:    csel x12, x12, x15, eq
-; CHECK-GI-NEXT:    lsl x15, x18, x16
-; CHECK-GI-NEXT:    and x10, x9, #0x7f
-; CHECK-GI-NEXT:    cmp x11, #64
-; CHECK-GI-NEXT:    mvn x11, x8
+; CHECK-GI-NEXT:    lsl x2, x11, x10
+; CHECK-GI-NEXT:    lsl x1, x18, x10
+; CHECK-GI-NEXT:    csel x13, x14, x13, eq
+; CHECK-GI-NEXT:    lsl x14, x18, x16
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    mvn x10, x8
 ; CHECK-GI-NEXT:    orr x16, x17, x2
 ; CHECK-GI-NEXT:    csel x17, x1, xzr, lo
-; CHECK-GI-NEXT:    csel x15, x16, x15, lo
-; CHECK-GI-NEXT:    tst x11, #0x7f
-; CHECK-GI-NEXT:    sub x11, x14, x10
-; CHECK-GI-NEXT:    sub x16, x10, #64
-; CHECK-GI-NEXT:    lsr x18, x4, x10
-; CHECK-GI-NEXT:    lsl x11, x5, x11
-; CHECK-GI-NEXT:    csel x13, x13, x15, eq
-; CHECK-GI-NEXT:    lsr x15, x5, x16
+; CHECK-GI-NEXT:    csel x14, x16, x14, lo
+; CHECK-GI-NEXT:    tst x10, #0x7f
+; CHECK-GI-NEXT:    sub x10, x12, x15
+; CHECK-GI-NEXT:    sub x16, x15, #64
+; CHECK-GI-NEXT:    lsr x18, x4, x15
+; CHECK-GI-NEXT:    lsl x10, x5, x10
+; CHECK-GI-NEXT:    csel x11, x11, x14, eq
+; CHECK-GI-NEXT:    lsr x14, x5, x16
 ; CHECK-GI-NEXT:    and x1, x8, #0x7f
-; CHECK-GI-NEXT:    orr x11, x18, x11
-; CHECK-GI-NEXT:    cmp x10, #64
-; CHECK-GI-NEXT:    lsr x16, x5, x10
-; CHECK-GI-NEXT:    csel x11, x11, x15, lo
+; CHECK-GI-NEXT:    cmp x15, #64
+; CHECK-GI-NEXT:    lsr x16, x5, x15
+; CHECK-GI-NEXT:    orr x10, x18, x10
+; CHECK-GI-NEXT:    csel x10, x10, x14, lo
 ; CHECK-GI-NEXT:    tst x9, #0x7f
-; CHECK-GI-NEXT:    sub x9, x14, x1
-; CHECK-GI-NEXT:    sub x14, x1, #64
-; CHECK-GI-NEXT:    lsr x15, x6, x1
+; CHECK-GI-NEXT:    sub x9, x12, x1
+; CHECK-GI-NEXT:    sub x12, x1, #64
+; CHECK-GI-NEXT:    lsr x14, x6, x1
 ; CHECK-GI-NEXT:    lsl x9, x7, x9
-; CHECK-GI-NEXT:    csel x11, x4, x11, eq
-; CHECK-GI-NEXT:    cmp x10, #64
-; CHECK-GI-NEXT:    lsr x10, x7, x14
-; CHECK-GI-NEXT:    csel x14, x16, xzr, lo
-; CHECK-GI-NEXT:    orr x9, x15, x9
+; CHECK-GI-NEXT:    csel x10, x4, x10, eq
+; CHECK-GI-NEXT:    cmp x15, #64
+; CHECK-GI-NEXT:    lsr x12, x7, x12
+; CHECK-GI-NEXT:    csel x15, x16, xzr, lo
+; CHECK-GI-NEXT:    orr x9, x14, x9
 ; CHECK-GI-NEXT:    cmp x1, #64
-; CHECK-GI-NEXT:    lsr x15, x7, x1
-; CHECK-GI-NEXT:    csel x9, x9, x10, lo
+; CHECK-GI-NEXT:    lsr x14, x7, x1
+; CHECK-GI-NEXT:    csel x9, x9, x12, lo
 ; CHECK-GI-NEXT:    tst x8, #0x7f
 ; CHECK-GI-NEXT:    csel x8, x6, x9, eq
 ; CHECK-GI-NEXT:    cmp x1, #64
-; CHECK-GI-NEXT:    orr x0, x0, x11
-; CHECK-GI-NEXT:    csel x9, x15, xzr, lo
-; CHECK-GI-NEXT:    orr x1, x12, x14
+; CHECK-GI-NEXT:    orr x0, x0, x10
+; CHECK-GI-NEXT:    csel x9, x14, xzr, lo
+; CHECK-GI-NEXT:    orr x1, x13, x15
 ; CHECK-GI-NEXT:    orr x2, x17, x8
-; CHECK-GI-NEXT:    orr x3, x13, x9
+; CHECK-GI-NEXT:    orr x3, x11, x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c)
@@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) {
 ;
 ; CHECK-GI-LABEL: rotl_v2i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x1, #61
-; CHECK-GI-NEXT:    lsl x9, x1, #3
-; CHECK-GI-NEXT:    lsl x10, x3, #3
-; CHECK-GI-NEXT:    lsr x11, x3, #61
-; CHECK-GI-NEXT:    orr x8, x8, x0, lsl #3
-; CHECK-GI-NEXT:    orr x1, x9, x0, lsr #61
-; CHECK-GI-NEXT:    orr x3, x10, x2, lsr #61
-; CHECK-GI-NEXT:    orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT:    extr x8, x0, x1, #61
+; CHECK-GI-NEXT:    extr x9, x3, x2, #61
+; CHECK-GI-NEXT:    extr x1, x1, x0, #61
+; CHECK-GI-NEXT:    extr x2, x2, x3, #61
 ; CHECK-GI-NEXT:    mov x0, x8
+; CHECK-GI-NEXT:    mov x3, x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
@@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) {
 ;
 ; CHECK-GI-LABEL: rotr_v2i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl x8, x1, #61
-; CHECK-GI-NEXT:    lsl x9, x3, #61
-; CHECK-GI-NEXT:    lsl x10, x0, #61
-; CHECK-GI-NEXT:    lsl x11, x2, #61
-; CHECK-GI-NEXT:    orr x0, x8, x0, lsr #3
-; CHECK-GI-NEXT:    orr x2, x9, x2, lsr #3
-; CHECK-GI-NEXT:    orr x1, x10, x1, lsr #3
-; CHECK-GI-NEXT:    orr x3, x11, x3, lsr #3
+; CHECK-GI-NEXT:    extr x8, x1, x0, #3
+; CHECK-GI-NEXT:    extr x9, x3, x2, #3
+; CHECK-GI-NEXT:    extr x1, x0, x1, #3
+; CHECK-GI-NEXT:    extr x3, x2, x3, #3
+; CHECK-GI-NEXT:    mov x0, x8
+; CHECK-GI-NEXT:    mov x2, x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
@@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) {
 ;
 ; CHECK-GI-LABEL: fshl_v2i128_c:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsr x8, x5, #61
-; CHECK-GI-NEXT:    lsl x9, x1, #3
-; CHECK-GI-NEXT:    lsl x10, x3, #3
-; CHECK-GI-NEXT:    lsr x11, x7, #61
-; CHECK-GI-NEXT:    orr x8, x8, x0, lsl #3
-; CHECK-GI-NEXT:    orr x1, x9, x0, lsr #61
-; CHECK-GI-NEXT:    orr x3, x10, x2, lsr #61
-; CHECK-GI-NEXT:    orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT:    extr x8, x0, x5, #61
+; CHECK-GI-NEXT:    extr x1, x1, x0, #61
+; CHECK-GI-NEXT:    extr x3, x3, x2, #61
+; CHECK-GI-NEXT:    extr x2, x2, x7, #61
 ; CHECK-GI-NEXT:    mov x0, x8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4480,29 +4445,15 @@ entry:
 }
 
 define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) {
-; CHECK-SD-LABEL: fshr_v2i128_c:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    extr x8, x5, x4, #3
-; CHECK-SD-NEXT:    extr x9, x7, x6, #3
-; CHECK-SD-NEXT:    extr x1, x0, x5, #3
-; CHECK-SD-NEXT:    extr x3, x2, x7, #3
-; CHECK-SD-NEXT:    mov x0, x8
-; CHECK-SD-NEXT:    mov x2, x9
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fshr_v2i128_c:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    lsl x8, x5, #61
-; CHECK-GI-NEXT:    lsl x9, x7, #61
-; CHECK-GI-NEXT:    lsr x10, x5, #3
-; CHECK-GI-NEXT:    lsr x11, x7, #3
-; CHECK-GI-NEXT:    orr x8, x8, x4, lsr #3
-; CHECK-GI-NEXT:    orr x9, x9, x6, lsr #3
-; CHECK-GI-NEXT:    orr x1, x10, x0, lsl #61
-; CHECK-GI-NEXT:    orr x3, x11, x2, lsl #61
-; CHECK-GI-NEXT:    mov x0, x8
-; CHECK-GI-NEXT:    mov x2, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fshr_v2i128_c:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    extr x8, x5, x4, #3
+; CHECK-NEXT:    extr x9, x7, x6, #3
+; CHECK-NEXT:    extr x1, x0, x5, #3
+; CHECK-NEXT:    extr x3, x2, x7, #3
+; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    mov x2, x9
+; CHECK-NEXT:    ret
 entry:
   %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>)
   ret <2 x i128> %d
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index f9fd2ad..90fb102 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ;
 ; CHECK-GI-LABEL: fshl_i128:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #64 // =0x40
 ; CHECK-GI-NEXT:    and x9, x4, #0x7f
-; CHECK-GI-NEXT:    mov w10, #64 // =0x40
-; CHECK-GI-NEXT:    lsl x14, x3, #63
-; CHECK-GI-NEXT:    sub x12, x10, x9
+; CHECK-GI-NEXT:    mov w10, #127 // =0x7f
+; CHECK-GI-NEXT:    sub x12, x8, x9
 ; CHECK-GI-NEXT:    lsl x13, x1, x9
-; CHECK-GI-NEXT:    mov w8, #127 // =0x7f
+; CHECK-GI-NEXT:    bic x10, x10, x4
 ; CHECK-GI-NEXT:    lsr x12, x0, x12
-; CHECK-GI-NEXT:    bic x8, x8, x4
-; CHECK-GI-NEXT:    sub x15, x9, #64
+; CHECK-GI-NEXT:    sub x14, x9, #64
+; CHECK-GI-NEXT:    lsl x15, x0, x9
+; CHECK-GI-NEXT:    extr x16, x3, x2, #1
 ; CHECK-GI-NEXT:    cmp x9, #64
-; CHECK-GI-NEXT:    lsl x9, x0, x9
-; CHECK-GI-NEXT:    lsl x15, x0, x15
-; CHECK-GI-NEXT:    orr x12, x12, x13
-; CHECK-GI-NEXT:    orr x13, x14, x2, lsr #1
-; CHECK-GI-NEXT:    lsr x14, x3, #1
-; CHECK-GI-NEXT:    sub x10, x10, x8
-; CHECK-GI-NEXT:    sub x16, x8, #64
-; CHECK-GI-NEXT:    csel x9, x9, xzr, lo
-; CHECK-GI-NEXT:    lsr x17, x13, x8
-; CHECK-GI-NEXT:    lsl x10, x14, x10
-; CHECK-GI-NEXT:    csel x12, x12, x15, lo
+; CHECK-GI-NEXT:    sub x8, x8, x10
+; CHECK-GI-NEXT:    orr x9, x12, x13
+; CHECK-GI-NEXT:    lsr x12, x3, #1
+; CHECK-GI-NEXT:    lsl x13, x0, x14
+; CHECK-GI-NEXT:    csel x14, x15, xzr, lo
+; CHECK-GI-NEXT:    sub x15, x10, #64
+; CHECK-GI-NEXT:    lsr x17, x16, x10
+; CHECK-GI-NEXT:    lsl x8, x12, x8
+; CHECK-GI-NEXT:    csel x9, x9, x13, lo
 ; CHECK-GI-NEXT:    tst x4, #0x7f
-; CHECK-GI-NEXT:    lsr x15, x14, x16
+; CHECK-GI-NEXT:    lsr x13, x12, x15
 ; CHECK-GI-NEXT:    mvn x11, x4
-; CHECK-GI-NEXT:    csel x12, x1, x12, eq
-; CHECK-GI-NEXT:    orr x10, x17, x10
-; CHECK-GI-NEXT:    cmp x8, #64
-; CHECK-GI-NEXT:    lsr x14, x14, x8
-; CHECK-GI-NEXT:    csel x10, x10, x15, lo
+; CHECK-GI-NEXT:    csel x9, x1, x9, eq
+; CHECK-GI-NEXT:    orr x8, x17, x8
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    lsr x12, x12, x10
+; CHECK-GI-NEXT:    csel x8, x8, x13, lo
 ; CHECK-GI-NEXT:    tst x11, #0x7f
-; CHECK-GI-NEXT:    csel x10, x13, x10, eq
-; CHECK-GI-NEXT:    cmp x8, #64
-; CHECK-GI-NEXT:    csel x8, x14, xzr, lo
-; CHECK-GI-NEXT:    orr x0, x9, x10
-; CHECK-GI-NEXT:    orr x1, x12, x8
+; CHECK-GI-NEXT:    csel x8, x16, x8, eq
+; CHECK-GI-NEXT:    cmp x10, #64
+; CHECK-GI-NEXT:    csel x10, x12, xzr, lo
+; CHECK-GI-NEXT:    orr x0, x14, x8
+; CHECK-GI-NEXT:    orr x1, x9, x10
 ; CHECK-GI-NEXT:    ret
   %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
   ret i128 %f
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1cb92e4..87b1108 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -559,20 +559,18 @@ define i128 @ui128_7(i128 %a, i128 %b) {
 ; CHECK-GI-NEXT:    add x8, x8, x10
 ; CHECK-GI-NEXT:    subs x10, x0, x9
 ; CHECK-GI-NEXT:    sbc x11, x1, x8
-; CHECK-GI-NEXT:    lsl x12, x11, #63
+; CHECK-GI-NEXT:    extr x10, x11, x10, #1
 ; CHECK-GI-NEXT:    lsr x11, x11, #1
-; CHECK-GI-NEXT:    orr x10, x12, x10, lsr #1
 ; CHECK-GI-NEXT:    adds x9, x10, x9
+; CHECK-GI-NEXT:    mov w10, #7 // =0x7
 ; CHECK-GI-NEXT:    adc x8, x11, x8
-; CHECK-GI-NEXT:    lsl x10, x8, #62
+; CHECK-GI-NEXT:    extr x9, x8, x9, #2
 ; CHECK-GI-NEXT:    lsr x8, x8, #2
-; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #2
-; CHECK-GI-NEXT:    mov w10, #7 // =0x7
-; CHECK-GI-NEXT:    lsl x12, x8, #3
 ; CHECK-GI-NEXT:    umulh x10, x9, x10
 ; CHECK-GI-NEXT:    lsl x11, x9, #3
-; CHECK-GI-NEXT:    sub x8, x12, x8
+; CHECK-GI-NEXT:    lsl x12, x8, #3
 ; CHECK-GI-NEXT:    sub x9, x11, x9
+; CHECK-GI-NEXT:    sub x8, x12, x8
 ; CHECK-GI-NEXT:    subs x0, x0, x9
 ; CHECK-GI-NEXT:    add x8, x8, x10
 ; CHECK-GI-NEXT:    sbc x1, x1, x8
@@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) {
 ; CHECK-GI-NEXT:    add x10, x11, x12
 ; CHECK-GI-NEXT:    add x8, x8, x14
 ; CHECK-GI-NEXT:    add x8, x8, x10
-; CHECK-GI-NEXT:    lsl x10, x8, #60
-; CHECK-GI-NEXT:    lsr x8, x8, #4
-; CHECK-GI-NEXT:    orr x9, x10, x9, lsr #4
 ; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    extr x9, x8, x9, #4
+; CHECK-GI-NEXT:    lsr x8, x8, #4
 ; CHECK-GI-NEXT:    umulh x11, x9, x10
 ; CHECK-GI-NEXT:    mul x9, x9, x10
 ; CHECK-GI-NEXT:    madd x8, x8, x10, x11
@@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
 ; CHECK-GI-NEXT:    sbc x14, x1, x12
 ; CHECK-GI-NEXT:    add x8, x8, x13
 ; CHECK-GI-NEXT:    subs x13, x2, x10
-; CHECK-GI-NEXT:    lsl x15, x14, #63
-; CHECK-GI-NEXT:    sbc x16, x3, x8
+; CHECK-GI-NEXT:    extr x9, x14, x9, #1
+; CHECK-GI-NEXT:    sbc x15, x3, x8
 ; CHECK-GI-NEXT:    lsr x14, x14, #1
-; CHECK-GI-NEXT:    orr x9, x15, x9, lsr #1
-; CHECK-GI-NEXT:    lsl x15, x16, #63
-; CHECK-GI-NEXT:    orr x13, x15, x13, lsr #1
+; CHECK-GI-NEXT:    extr x13, x15, x13, #1
 ; CHECK-GI-NEXT:    adds x9, x9, x11
-; CHECK-GI-NEXT:    lsr x11, x16, #1
+; CHECK-GI-NEXT:    lsr x11, x15, #1
 ; CHECK-GI-NEXT:    adc x12, x14, x12
 ; CHECK-GI-NEXT:    adds x10, x13, x10
-; CHECK-GI-NEXT:    lsl x13, x12, #62
-; CHECK-GI-NEXT:    lsr x12, x12, #2
-; CHECK-GI-NEXT:    adc x8, x11, x8
-; CHECK-GI-NEXT:    lsl x11, x8, #62
-; CHECK-GI-NEXT:    orr x9, x13, x9, lsr #2
+; CHECK-GI-NEXT:    extr x9, x12, x9, #2
 ; CHECK-GI-NEXT:    mov w13, #7 // =0x7
+; CHECK-GI-NEXT:    adc x8, x11, x8
+; CHECK-GI-NEXT:    lsr x11, x12, #2
+; CHECK-GI-NEXT:    extr x10, x8, x10, #2
+; CHECK-GI-NEXT:    umulh x12, x9, x13
 ; CHECK-GI-NEXT:    lsr x8, x8, #2
-; CHECK-GI-NEXT:    lsl x14, x12, #3
-; CHECK-GI-NEXT:    orr x10, x11, x10, lsr #2
-; CHECK-GI-NEXT:    umulh x11, x9, x13
+; CHECK-GI-NEXT:    lsl x14, x11, #3
 ; CHECK-GI-NEXT:    lsl x15, x9, #3
-; CHECK-GI-NEXT:    sub x12, x14, x12
-; CHECK-GI-NEXT:    lsl x16, x8, #3
 ; CHECK-GI-NEXT:    umulh x13, x10, x13
+; CHECK-GI-NEXT:    lsl x16, x8, #3
+; CHECK-GI-NEXT:    sub x11, x14, x11
 ; CHECK-GI-NEXT:    lsl x14, x10, #3
 ; CHECK-GI-NEXT:    sub x9, x15, x9
 ; CHECK-GI-NEXT:    sub x8, x16, x8
 ; CHECK-GI-NEXT:    subs x0, x0, x9
+; CHECK-GI-NEXT:    add x11, x11, x12
 ; CHECK-GI-NEXT:    sub x10, x14, x10
-; CHECK-GI-NEXT:    add x11, x12, x11
 ; CHECK-GI-NEXT:    sbc x1, x1, x11
 ; CHECK-GI-NEXT:    subs x2, x2, x10
 ; CHECK-GI-NEXT:    add x8, x8, x13
@@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov x10, #23593 // =0x5c29
 ; CHECK-GI-NEXT:    mov x8, #62914 // =0xf5c2
-; CHECK-GI-NEXT:    sub x18, x0, x0
+; CHECK-GI-NEXT:    and x5, xzr, #0x1
 ; CHECK-GI-NEXT:    movk x10, #49807, lsl #16
 ; CHECK-GI-NEXT:    movk x8, #23592, lsl #16
+; CHECK-GI-NEXT:    umulh x18, x0, xzr
 ; CHECK-GI-NEXT:    movk x10, #10485, lsl #32
 ; CHECK-GI-NEXT:    movk x8, #49807, lsl #32
 ; CHECK-GI-NEXT:    movk x10, #36700, lsl #48
@@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
 ; CHECK-GI-NEXT:    umulh x15, x1, x10
 ; CHECK-GI-NEXT:    cset w12, hs
 ; CHECK-GI-NEXT:    cmn x11, x13
-; CHECK-GI-NEXT:    and x11, x12, #0x1
-; CHECK-GI-NEXT:    umulh x16, x0, x8
-; CHECK-GI-NEXT:    cset w12, hs
+; CHECK-GI-NEXT:    sub x13, x0, x0
 ; CHECK-GI-NEXT:    and x12, x12, #0x1
-; CHECK-GI-NEXT:    add x14, x14, x18
-; CHECK-GI-NEXT:    add x11, x11, x12
-; CHECK-GI-NEXT:    and x12, xzr, #0x1
+; CHECK-GI-NEXT:    umulh x16, x0, x8
+; CHECK-GI-NEXT:    cset w11, hs
+; CHECK-GI-NEXT:    add x13, x14, x13
+; CHECK-GI-NEXT:    and x11, x11, #0x1
+; CHECK-GI-NEXT:    and x14, xzr, #0x1
 ; CHECK-GI-NEXT:    umulh x9, xzr, x10
-; CHECK-GI-NEXT:    adds x14, x14, x15
-; CHECK-GI-NEXT:    and x15, xzr, #0x1
+; CHECK-GI-NEXT:    add x11, x12, x11
+; CHECK-GI-NEXT:    add x12, x5, x14
+; CHECK-GI-NEXT:    adds x13, x13, x15
 ; CHECK-GI-NEXT:    umulh x17, x1, x8
-; CHECK-GI-NEXT:    cset w4, hs
-; CHECK-GI-NEXT:    add x15, x12, x15
-; CHECK-GI-NEXT:    adds x12, x14, x16
-; CHECK-GI-NEXT:    and x4, x4, #0x1
-; CHECK-GI-NEXT:    mul x18, x3, x10
 ; CHECK-GI-NEXT:    cset w14, hs
-; CHECK-GI-NEXT:    adds x12, x12, x11
-; CHECK-GI-NEXT:    add x11, x15, x4
 ; CHECK-GI-NEXT:    and x14, x14, #0x1
-; CHECK-GI-NEXT:    cset w15, hs
-; CHECK-GI-NEXT:    mul x5, x2, x8
-; CHECK-GI-NEXT:    add x11, x11, x14
-; CHECK-GI-NEXT:    and x14, x15, #0x1
-; CHECK-GI-NEXT:    add x17, x9, x17
-; CHECK-GI-NEXT:    add x14, x11, x14
-; CHECK-GI-NEXT:    mov w11, #100 // =0x64
-; CHECK-GI-NEXT:    umulh x13, x0, xzr
-; CHECK-GI-NEXT:    umulh x16, x2, x10
-; CHECK-GI-NEXT:    adds x18, x18, x5
-; CHECK-GI-NEXT:    mul x15, x3, x8
-; CHECK-GI-NEXT:    add x13, x17, x13
-; CHECK-GI-NEXT:    cset w17, hs
-; CHECK-GI-NEXT:    umulh x10, x3, x10
-; CHECK-GI-NEXT:    add x13, x13, x14
-; CHECK-GI-NEXT:    and x17, x17, #0x1
-; CHECK-GI-NEXT:    cmn x18, x16
-; CHECK-GI-NEXT:    sub x18, x2, x2
-; CHECK-GI-NEXT:    umulh x16, x2, x8
+; CHECK-GI-NEXT:    adds x13, x13, x16
+; CHECK-GI-NEXT:    mul x4, x3, x10
+; CHECK-GI-NEXT:    add x12, x12, x14
 ; CHECK-GI-NEXT:    cset w14, hs
-; CHECK-GI-NEXT:    and x14, x14, #0x1
-; CHECK-GI-NEXT:    add x15, x15, x18
+; CHECK-GI-NEXT:    adds x11, x13, x11
+; CHECK-GI-NEXT:    and x13, x14, #0x1
+; CHECK-GI-NEXT:    mul x15, x2, x8
+; CHECK-GI-NEXT:    cset w14, hs
+; CHECK-GI-NEXT:    add x12, x12, x13
+; CHECK-GI-NEXT:    and x13, x14, #0x1
+; CHECK-GI-NEXT:    add x14, x9, x17
+; CHECK-GI-NEXT:    sub x17, x2, x2
+; CHECK-GI-NEXT:    umulh x16, x2, x10
+; CHECK-GI-NEXT:    add x12, x12, x13
+; CHECK-GI-NEXT:    add x13, x14, x18
+; CHECK-GI-NEXT:    add x12, x13, x12
 ; CHECK-GI-NEXT:    and x18, xzr, #0x1
-; CHECK-GI-NEXT:    add x14, x17, x14
+; CHECK-GI-NEXT:    mul x5, x3, x8
+; CHECK-GI-NEXT:    extr x11, x12, x11, #4
+; CHECK-GI-NEXT:    adds x13, x4, x15
+; CHECK-GI-NEXT:    umulh x14, x3, x10
+; CHECK-GI-NEXT:    cset w15, hs
+; CHECK-GI-NEXT:    mov w10, #100 // =0x64
+; CHECK-GI-NEXT:    cmn x13, x16
+; CHECK-GI-NEXT:    and x15, x15, #0x1
+; CHECK-GI-NEXT:    umulh x13, x2, x8
+; CHECK-GI-NEXT:    cset w16, hs
+; CHECK-GI-NEXT:    add x17, x5, x17
+; CHECK-GI-NEXT:    and x16, x16, #0x1
 ; CHECK-GI-NEXT:    umulh x8, x3, x8
+; CHECK-GI-NEXT:    add x15, x15, x16
+; CHECK-GI-NEXT:    adds x14, x17, x14
 ; CHECK-GI-NEXT:    and x17, xzr, #0x1
-; CHECK-GI-NEXT:    adds x10, x15, x10
-; CHECK-GI-NEXT:    add x15, x17, x18
+; CHECK-GI-NEXT:    add x16, x18, x17
 ; CHECK-GI-NEXT:    cset w17, hs
-; CHECK-GI-NEXT:    umulh x18, x2, xzr
+; CHECK-GI-NEXT:    adds x13, x14, x13
+; CHECK-GI-NEXT:    umulh x14, x2, xzr
 ; CHECK-GI-NEXT:    and x17, x17, #0x1
-; CHECK-GI-NEXT:    adds x10, x10, x16
-; CHECK-GI-NEXT:    lsl x16, x13, #60
-; CHECK-GI-NEXT:    add x15, x15, x17
-; CHECK-GI-NEXT:    cset w17, hs
-; CHECK-GI-NEXT:    adds x10, x10, x14
-; CHECK-GI-NEXT:    and x14, x17, #0x1
+; CHECK-GI-NEXT:    cset w18, hs
+; CHECK-GI-NEXT:    adds x13, x13, x15
+; CHECK-GI-NEXT:    add x15, x16, x17
+; CHECK-GI-NEXT:    and x16, x18, #0x1
 ; CHECK-GI-NEXT:    cset w17, hs
 ; CHECK-GI-NEXT:    add x8, x9, x8
-; CHECK-GI-NEXT:    add x14, x15, x14
-; CHECK-GI-NEXT:    and x15, x17, #0x1
-; CHECK-GI-NEXT:    orr x12, x16, x12, lsr #4
-; CHECK-GI-NEXT:    add x9, x14, x15
-; CHECK-GI-NEXT:    add x8, x8, x18
-; CHECK-GI-NEXT:    add x8, x8, x9
-; CHECK-GI-NEXT:    lsr x9, x13, #4
-; CHECK-GI-NEXT:    umulh x14, x12, x11
-; CHECK-GI-NEXT:    lsl x13, x8, #60
+; CHECK-GI-NEXT:    add x15, x15, x16
+; CHECK-GI-NEXT:    and x16, x17, #0x1
+; CHECK-GI-NEXT:    lsr x9, x12, #4
+; CHECK-GI-NEXT:    add x15, x15, x16
+; CHECK-GI-NEXT:    umulh x17, x11, x10
+; CHECK-GI-NEXT:    add x8, x8, x14
+; CHECK-GI-NEXT:    add x8, x8, x15
+; CHECK-GI-NEXT:    mul x11, x11, x10
+; CHECK-GI-NEXT:    extr x12, x8, x13, #4
 ; CHECK-GI-NEXT:    lsr x8, x8, #4
-; CHECK-GI-NEXT:    mul x12, x12, x11
-; CHECK-GI-NEXT:    orr x10, x13, x10, lsr #4
-; CHECK-GI-NEXT:    madd x9, x9, x11, x14
-; CHECK-GI-NEXT:    umulh x13, x10, x11
-; CHECK-GI-NEXT:    subs x0, x0, x12
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    madd x9, x9, x10, x17
+; CHECK-GI-NEXT:    umulh x13, x12, x10
+; CHECK-GI-NEXT:    subs x0, x0, x11
+; CHECK-GI-NEXT:    mul x12, x12, x10
 ; CHECK-GI-NEXT:    sbc x1, x1, x9
-; CHECK-GI-NEXT:    madd x8, x8, x11, x13
-; CHECK-GI-NEXT:    subs x2, x2, x10
+; CHECK-GI-NEXT:    madd x8, x8, x10, x13
+; CHECK-GI-NEXT:    subs x2, x2, x12
 ; CHECK-GI-NEXT:    sbc x3, x3, x8
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index e4f9efa..0504959 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -351,7 +351,6 @@ define i64  @test_many_callee_arguments(
   ret i64 %ret
 }
 
-; FIXME: The new lowering should avoid saves/restores in the probing loop.
 define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{
 ; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
 ; CHECK:       // %bb.0:
@@ -389,16 +388,14 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state_size
 ; CHECK-NEWLOWERING-NEXT:    mov x8, sp
 ; CHECK-NEWLOWERING-NEXT:    sub x19, x8, x0
-; CHECK-NEWLOWERING-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16, lsl #12 // =65536
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16, lsl #12 // =65536
 ; CHECK-NEWLOWERING-NEXT:    cmp sp, x19
 ; CHECK-NEWLOWERING-NEXT:    b.le .LBB7_3
 ; CHECK-NEWLOWERING-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    str xzr, [sp]
-; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
 ; CHECK-NEWLOWERING-NEXT:    b .LBB7_1
 ; CHECK-NEWLOWERING-NEXT:  .LBB7_3:
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x19
diff --git a/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll b/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll
index 63c6533..a5b7612 100644
--- a/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll
@@ -64,6 +64,6 @@ define i64 @test_sme_calling_convention_x2() nounwind {
   ret i64 %pstate.sm
 }
 
-declare void @__arm_tpidr2_save()
-declare i64 @__arm_get_current_vg()
-declare {i64, i64} @__arm_sme_state()
+declare aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save()
+declare aarch64_sme_preservemost_from_x1 i64 @__arm_get_current_vg()
+declare aarch64_sme_preservemost_from_x2 {i64, i64} @__arm_sme_state()
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index 18ea07e..c753e9c 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -228,65 +228,34 @@ exit:
   ret void
 }
 
-; FIXME: The codegen for this case could be improved (by tuning weights).
-; Here the ZA save has been hoisted out of the conditional, but would be better
-; to sink it.
 define void @cond_private_za_call(i1 %cond) "aarch64_inout_za" nounwind {
-; CHECK-LABEL: cond_private_za_call:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEXT:    tbz w0, #0, .LBB3_4
-; CHECK-NEXT:  // %bb.1: // %private_za_call
-; CHECK-NEXT:    sub x8, x29, #16
-; CHECK-NEXT:    msr TPIDR2_EL0, x8
-; CHECK-NEXT:    bl private_za_call
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB3_3
-; CHECK-NEXT:  // %bb.2: // %private_za_call
-; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB3_3: // %private_za_call
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:  .LBB3_4: // %exit
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    b shared_za_call
-;
-; CHECK-NEWLOWERING-LABEL: cond_private_za_call:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    mov x29, sp
-; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT:    mov x9, sp
-; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT:    mov sp, x9
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    tbz w0, #0, .LBB3_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %private_za_call
-; CHECK-NEWLOWERING-NEXT:    bl private_za_call
-; CHECK-NEWLOWERING-NEXT:  .LBB3_2: // %exit
-; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB3_4
-; CHECK-NEWLOWERING-NEXT:  // %bb.3: // %exit
-; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:  .LBB3_4: // %exit
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT:    mov sp, x29
-; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    b shared_za_call
+; CHECK-COMMON-LABEL: cond_private_za_call:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    mov x29, sp
+; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    mov x9, sp
+; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x9
+; CHECK-COMMON-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    tbz w0, #0, .LBB3_4
+; CHECK-COMMON-NEXT:  // %bb.1: // %private_za_call
+; CHECK-COMMON-NEXT:    sub x8, x29, #16
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-COMMON-NEXT:    bl private_za_call
+; CHECK-COMMON-NEXT:    smstart za
+; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT:    sub x0, x29, #16
+; CHECK-COMMON-NEXT:    cbnz x8, .LBB3_3
+; CHECK-COMMON-NEXT:  // %bb.2: // %private_za_call
+; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
+; CHECK-COMMON-NEXT:  .LBB3_3: // %private_za_call
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT:  .LBB3_4: // %exit
+; CHECK-COMMON-NEXT:    mov sp, x29
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    b shared_za_call
   br i1 %cond, label %private_za_call, label %exit
 
 private_za_call:
@@ -910,7 +879,7 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
 ; CHECK-NEWLOWERING-LABEL: loop_with_external_entry:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    mov x29, sp
 ; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
 ; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
@@ -923,23 +892,27 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
 ; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %init
 ; CHECK-NEWLOWERING-NEXT:    bl shared_za_call
 ; CHECK-NEWLOWERING-NEXT:  .LBB11_2: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT:    sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-NEWLOWERING-NEXT:    sub x20, x29, #16
+; CHECK-NEWLOWERING-NEXT:    b .LBB11_4
 ; CHECK-NEWLOWERING-NEXT:  .LBB11_3: // %loop
+; CHECK-NEWLOWERING-NEXT:    // in Loop: Header=BB11_4 Depth=1
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT:    tbz w19, #0, .LBB11_6
+; CHECK-NEWLOWERING-NEXT:  .LBB11_4: // %loop
 ; CHECK-NEWLOWERING-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x20
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_call
-; CHECK-NEWLOWERING-NEXT:    tbnz w19, #0, .LBB11_3
-; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %exit
 ; CHECK-NEWLOWERING-NEXT:    smstart za
 ; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB11_6
-; CHECK-NEWLOWERING-NEXT:  // %bb.5: // %exit
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB11_3
+; CHECK-NEWLOWERING-NEXT:  // %bb.5: // %loop
+; CHECK-NEWLOWERING-NEXT:    // in Loop: Header=BB11_4 Depth=1
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT:    b .LBB11_3
 ; CHECK-NEWLOWERING-NEXT:  .LBB11_6: // %exit
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
-; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index 3f35cb5..dcdc56c 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -63,25 +63,17 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe
 ; CHECK-NEXT:    ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr]
 ; CHECK-NEXT:    bl __cxa_throw
 ; CHECK-NEXT:  .Ltmp1: // EH_LABEL
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB0_4
-; CHECK-NEXT:  // %bb.3: // %throw_exception
-; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB0_4: // %throw_exception
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:  // %bb.5: // %throw_fail
-; CHECK-NEXT:  .LBB0_6: // %unwind_dtors
+; CHECK-NEXT:  // %bb.3: // %throw_fail
+; CHECK-NEXT:  .LBB0_4: // %unwind_dtors
 ; CHECK-NEXT:  .Ltmp2: // EH_LABEL
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB0_8
-; CHECK-NEXT:  // %bb.7: // %unwind_dtors
+; CHECK-NEXT:    cbnz x8, .LBB0_6
+; CHECK-NEXT:  // %bb.5: // %unwind_dtors
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB0_8: // %unwind_dtors
+; CHECK-NEXT:  .LBB0_6: // %unwind_dtors
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    bl shared_za_call
 ; CHECK-NEXT:    sub x8, x29, #16
diff --git a/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll
new file mode 100644
index 0000000..0306b27
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll
@@ -0,0 +1,296 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s
+
+; This test case was generated by lowering mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir to LLVM IR.
+; The actual contents of the function are not that important. The main interesting quality here is that many blocks
+; don't directly use ZA. The only blocks that require ZA are the MOPA (and load/stores) in the inner loop, and the
+; `printMemrefF32()` call in the exit block.
+;
+; If ZA states are not propagated in the MachineSMEABIPass, block %48 (which is within the outer loop) will
+; have an edge to block %226 (the exit block), which requires ZA in the "saved" state, and an edge to block %51
+; (which has no preference on ZA state). This means block %48 will also end up in the locally saved state.
+; This is not really what we want, as it means we will save/restore ZA in the outer loop. We can fix this by
+; propagating the "active" state from the inner loop through basic blocks with no preference, to ensure the outer
+; loop is in the "active" state too.
+;
+; If done correctly, the only ZA save/restore should be in the exit block (with all other blocks in the active state).
+
+define void @matmul(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, ptr %14, ptr %15, i64 %16, i64 %17, i64 %18, i64 %19, i64 %20) #0 {
+; Check for a ZA zero in the entry block, then no uses of TPIDR2_EL0 (for ZA saves/restores)
+; until the exit block (which contains the call to printMemrefF32).
+;
+; CHECK-LABEL: matmul:
+; CHECK:      zero {za}
+; CHECK-NOT:  TPIDR2_EL0
+; CHECK:      msr TPIDR2_EL0, x{{.*}}
+; CHECK-NOT:  .LBB{{.*}}
+; CHECK:      bl printMemrefF32
+  %22 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %14, 0
+  %23 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %22, ptr %15, 1
+  %24 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %23, i64 %16, 2
+  %25 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %24, i64 %17, 3, 0
+  %26 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %25, i64 %19, 4, 0
+  %27 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %26, i64 %18, 3, 1
+  %28 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %27, i64 %20, 4, 1
+  %29 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %7, 0
+  %30 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %29, ptr %8, 1
+  %31 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %30, i64 %9, 2
+  %32 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %31, i64 %10, 3, 0
+  %33 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %32, i64 %12, 4, 0
+  %34 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %33, i64 %11, 3, 1
+  %35 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %34, i64 %13, 4, 1
+  %36 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %0, 0
+  %37 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %36, ptr %1, 1
+  %38 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %37, i64 %2, 2
+  %39 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %38, i64 %3, 3, 0
+  %40 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %39, i64 %5, 4, 0
+  %41 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %40, i64 %4, 3, 1
+  %42 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %41, i64 %6, 4, 1
+  %43 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0
+  %44 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1
+  %45 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 1
+  %46 = call i64 @llvm.vscale.i64()
+  %47 = mul i64 %46, 4
+  br label %48
+
+48:                                               ; preds = %224, %21
+  %49 = phi i64 [ %225, %224 ], [ 0, %21 ]
+  %50 = icmp slt i64 %49, %43
+  br i1 %50, label %51, label %226
+
+51:                                               ; preds = %48
+  %52 = sub i64 %43, %49
+  %53 = call i64 @llvm.smin.i64(i64 %47, i64 %52)
+  %54 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+  %55 = trunc i64 %53 to i32
+  %56 = insertelement <vscale x 4 x i32> poison, i32 %55, i32 0
+  %57 = shufflevector <vscale x 4 x i32> %56, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %58 = icmp slt <vscale x 4 x i32> %54, %57
+  br label %59
+
+59:                                               ; preds = %222, %51
+  %60 = phi i64 [ %223, %222 ], [ 0, %51 ]
+  %61 = icmp slt i64 %60, %45
+  br i1 %61, label %62, label %224
+
+62:                                               ; preds = %59
+  %63 = sub i64 %45, %60
+  %64 = call i64 @llvm.smin.i64(i64 %47, i64 %63)
+  %65 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 0
+  %66 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 1
+  %67 = insertvalue { ptr, ptr, i64 } poison, ptr %65, 0
+  %68 = insertvalue { ptr, ptr, i64 } %67, ptr %66, 1
+  %69 = insertvalue { ptr, ptr, i64 } %68, i64 0, 2
+  %70 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 2
+  %71 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 0
+  %72 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 1
+  %73 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 0
+  %74 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 1
+  %75 = mul nsw i64 %49, %73
+  %76 = add i64 %70, %75
+  %77 = mul nsw i64 %60, %74
+  %78 = add i64 %76, %77
+  %79 = extractvalue { ptr, ptr, i64 } %69, 0
+  %80 = extractvalue { ptr, ptr, i64 } %69, 1
+  %81 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %79, 0
+  %82 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %81, ptr %80, 1
+  %83 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %82, i64 %78, 2
+  %84 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %83, i64 %53, 3, 0
+  %85 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %84, i64 %73, 4, 0
+  %86 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %85, i64 %64, 3, 1
+  %87 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %86, i64 %74, 4, 1
+  %88 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+  %89 = trunc i64 %64 to i32
+  %90 = insertelement <vscale x 4 x i32> poison, i32 %89, i32 0
+  %91 = shufflevector <vscale x 4 x i32> %90, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %92 = icmp slt <vscale x 4 x i32> %88, %91
+  br label %93
+
+93:                                               ; preds = %220, %62
+  %94 = phi i64 [ %221, %220 ], [ 0, %62 ]
+  %95 = icmp slt i64 %94, %44
+  br i1 %95, label %96, label %222
+
+96:                                               ; preds = %93
+  %97 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 0
+  %98 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 1
+  %99 = insertvalue { ptr, ptr, i64 } poison, ptr %97, 0
+  %100 = insertvalue { ptr, ptr, i64 } %99, ptr %98, 1
+  %101 = insertvalue { ptr, ptr, i64 } %100, i64 0, 2
+  %102 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 2
+  %103 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0
+  %104 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1
+  %105 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 0
+  %106 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 1
+  %107 = mul nsw i64 %49, %105
+  %108 = add i64 %102, %107
+  %109 = mul nsw i64 %94, %106
+  %110 = add i64 %108, %109
+  %111 = extractvalue { ptr, ptr, i64 } %101, 0
+  %112 = extractvalue { ptr, ptr, i64 } %101, 1
+  %113 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %111, 0
+  %114 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %113, ptr %112, 1
+  %115 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %114, i64 %110, 2
+  %116 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %115, i64 %53, 3, 0
+  %117 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %116, i64 %105, 4, 0
+  br label %118
+
+118:                                              ; preds = %133, %96
+  %119 = phi i64 [ %135, %133 ], [ 0, %96 ]
+  %120 = phi <vscale x 4 x float> [ %134, %133 ], [ poison, %96 ]
+  %121 = icmp slt i64 %119, %47
+  br i1 %121, label %122, label %136
+
+122:                                              ; preds = %118
+  %123 = extractelement <vscale x 4 x i1> %58, i64 %119
+  br i1 %123, label %124, label %133
+
+124:                                              ; preds = %122
+  %125 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 1
+  %126 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 2
+  %127 = getelementptr float, ptr %125, i64 %126
+  %128 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 4, 0
+  %129 = mul nuw nsw i64 %119, %128
+  %130 = getelementptr inbounds nuw float, ptr %127, i64 %129
+  %131 = load float, ptr %130, align 4
+  %132 = insertelement <vscale x 4 x float> %120, float %131, i64 %119
+  br label %133
+
+133:                                              ; preds = %124, %122
+  %134 = phi <vscale x 4 x float> [ %132, %124 ], [ %120, %122 ]
+  %135 = add i64 %119, 1
+  br label %118
+
+136:                                              ; preds = %118
+  %137 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 0
+  %138 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 1
+  %139 = insertvalue { ptr, ptr, i64 } poison, ptr %137, 0
+  %140 = insertvalue { ptr, ptr, i64 } %139, ptr %138, 1
+  %141 = insertvalue { ptr, ptr, i64 } %140, i64 0, 2
+  %142 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 2
+  %143 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 0
+  %144 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 1
+  %145 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 0
+  %146 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 1
+  %147 = mul nsw i64 %94, %145
+  %148 = add i64 %142, %147
+  %149 = mul nsw i64 %60, %146
+  %150 = add i64 %148, %149
+  %151 = extractvalue { ptr, ptr, i64 } %141, 0
+  %152 = extractvalue { ptr, ptr, i64 } %141, 1
+  %153 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %151, 0
+  %154 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %153, ptr %152, 1
+  %155 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %154, i64 %150, 2
+  %156 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %155, i64 %64, 3, 0
+  %157 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %156, i64 %146, 4, 0
+  br label %158
+
+158:                                              ; preds = %173, %136
+  %159 = phi i64 [ %175, %173 ], [ 0, %136 ]
+  %160 = phi <vscale x 4 x float> [ %174, %173 ], [ poison, %136 ]
+  %161 = icmp slt i64 %159, %47
+  br i1 %161, label %162, label %176
+
+162:                                              ; preds = %158
+  %163 = extractelement <vscale x 4 x i1> %92, i64 %159
+  br i1 %163, label %164, label %173
+
+164:                                              ; preds = %162
+  %165 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 1
+  %166 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 2
+  %167 = getelementptr float, ptr %165, i64 %166
+  %168 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 4, 0
+  %169 = mul nuw nsw i64 %159, %168
+  %170 = getelementptr inbounds nuw float, ptr %167, i64 %169
+  %171 = load float, ptr %170, align 4
+  %172 = insertelement <vscale x 4 x float> %160, float %171, i64 %159
+  br label %173
+
+173:                                              ; preds = %164, %162
+  %174 = phi <vscale x 4 x float> [ %172, %164 ], [ %160, %162 ]
+  %175 = add i64 %159, 1
+  br label %158
+
+176:                                              ; preds = %158
+  %177 = trunc i64 %64 to i32
+  br label %178
+
+178:                                              ; preds = %181, %176
+  %179 = phi i64 [ %202, %181 ], [ 0, %176 ]
+  %180 = icmp slt i64 %179, %47
+  br i1 %180, label %181, label %203
+
+181:                                              ; preds = %178
+  %182 = icmp ult i64 %179, %53
+  %183 = sext i1 %182 to i32
+  %184 = and i32 %183, %177
+  %185 = sext i32 %184 to i64
+  %186 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+  %187 = trunc i64 %185 to i32
+  %188 = insertelement <vscale x 4 x i32> poison, i32 %187, i32 0
+  %189 = shufflevector <vscale x 4 x i32> %188, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %190 = icmp slt <vscale x 4 x i32> %186, %189
+  %191 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 1
+  %192 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2
+  %193 = getelementptr float, ptr %191, i64 %192
+  %194 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0
+  %195 = mul i64 %179, %194
+  %196 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1
+  %197 = mul i64 0, %196
+  %198 = add i64 %195, %197
+  %199 = getelementptr float, ptr %193, i64 %198
+  %200 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %199, i32 4, <vscale x 4 x i1> %190, <vscale x 4 x float> poison)
+  %201 = trunc i64 %179 to i32
+  call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 %201, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %200)
+  %202 = add i64 %179, 1
+  br label %178
+
+203:                                              ; preds = %178
+  call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> %58, <vscale x 4 x i1> %92, <vscale x 4 x float> %120, <vscale x 4 x float> %160)
+  %204 = call i64 @llvm.smin.i64(i64 %53, i64 %47)
+  br label %205
+
+205:                                              ; preds = %208, %203
+  %206 = phi i64 [ %219, %208 ], [ 0, %203 ]
+  %207 = icmp slt i64 %206, %204
+  br i1 %207, label %208, label %220
+
+208:                                              ; preds = %205
+  %209 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 1
+  %210 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2
+  %211 = getelementptr float, ptr %209, i64 %210
+  %212 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0
+  %213 = mul i64 %206, %212
+  %214 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1
+  %215 = mul i64 0, %214
+  %216 = add i64 %213, %215
+  %217 = getelementptr float, ptr %211, i64 %216
+  %218 = trunc i64 %206 to i32
+  call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %92, ptr %217, i32 0, i32 %218)
+  %219 = add i64 %206, 1
+  br label %205
+
+220:                                              ; preds = %205
+  %221 = add i64 %94, 1
+  br label %93
+
+222:                                              ; preds = %93
+  %223 = add i64 %60, %47
+  br label %59
+
+224:                                              ; preds = %59
+  %225 = add i64 %49, %47
+  br label %48
+
+226:                                              ; preds = %48
+  %227 = alloca { ptr, ptr, i64, [2 x i64], [2 x i64] }, i64 1, align 8
+  store { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, ptr %227, align 8
+  %228 = insertvalue { i64, ptr } { i64 2, ptr poison }, ptr %227, 1
+  %229 = extractvalue { i64, ptr } %228, 0
+  %230 = extractvalue { i64, ptr } %228, 1
+  call void @printMemrefF32(i64 %229, ptr %230)
+  ret void
+}
+
+declare void @printMemrefF32(i64, ptr) ; no body in this module; called above with i64 2 and a pointer to the alloca holding struct %28
+
+attributes #0 = { "aarch64_new_za" "aarch64_pstate_sm_body" } ; SME string attributes: new ZA state + locally-streaming body (per LLVM AArch64 SME docs)
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 066ee3b..afd56d1 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -12,77 +12,41 @@ entry:
 }
 
 define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" {
-; CHECK-LABEL: multi_bb_stpidr2_save_required:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEXT:    cbz w0, .LBB1_2
-; CHECK-NEXT:  // %bb.1: // %use_b
-; CHECK-NEXT:    fmov s1, #4.00000000
-; CHECK-NEXT:    fadd s0, s0, s1
-; CHECK-NEXT:    b .LBB1_5
-; CHECK-NEXT:  .LBB1_2: // %use_c
-; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    sub x8, x29, #16
-; CHECK-NEXT:    msr TPIDR2_EL0, x8
-; CHECK-NEXT:    bl cosf
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB1_4
-; CHECK-NEXT:  // %bb.3: // %use_c
-; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB1_4: // %use_c
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:  .LBB1_5: // %exit
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    mov x29, sp
-; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w30, -8
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT:    mov x9, sp
-; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT:    mov sp, x9
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB1_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %use_b
-; CHECK-NEWLOWERING-NEXT:    fmov s1, #4.00000000
-; CHECK-NEWLOWERING-NEXT:    fadd s0, s0, s1
-; CHECK-NEWLOWERING-NEXT:    b .LBB1_3
-; CHECK-NEWLOWERING-NEXT:  .LBB1_2: // %use_c
-; CHECK-NEWLOWERING-NEXT:    fmov s0, s1
-; CHECK-NEWLOWERING-NEXT:    bl cosf
-; CHECK-NEWLOWERING-NEXT:  .LBB1_3: // %exit
-; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB1_5
-; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %exit
-; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:  .LBB1_5: // %exit
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT:    mov sp, x29
-; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-COMMON-LABEL: multi_bb_stpidr2_save_required:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    mov x29, sp
+; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-COMMON-NEXT:    .cfi_offset w30, -8
+; CHECK-COMMON-NEXT:    .cfi_offset w29, -16
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    mov x9, sp
+; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x9
+; CHECK-COMMON-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    cbz w0, .LBB1_2
+; CHECK-COMMON-NEXT:  // %bb.1: // %use_b
+; CHECK-COMMON-NEXT:    fmov s1, #4.00000000
+; CHECK-COMMON-NEXT:    fadd s0, s0, s1
+; CHECK-COMMON-NEXT:    b .LBB1_5
+; CHECK-COMMON-NEXT:  .LBB1_2: // %use_c
+; CHECK-COMMON-NEXT:    fmov s0, s1
+; CHECK-COMMON-NEXT:    sub x8, x29, #16
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-COMMON-NEXT:    bl cosf
+; CHECK-COMMON-NEXT:    smstart za
+; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT:    sub x0, x29, #16
+; CHECK-COMMON-NEXT:    cbnz x8, .LBB1_4
+; CHECK-COMMON-NEXT:  // %bb.3: // %use_c
+; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
+; CHECK-COMMON-NEXT:  .LBB1_4: // %use_c
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT:  .LBB1_5: // %exit
+; CHECK-COMMON-NEXT:    mov sp, x29
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ret
   %cmp = icmp ne i32 %a, 0
   br i1 %cmp, label %use_b, label %use_c
 
@@ -155,7 +119,9 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
 ; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
 ; CHECK-NEWLOWERING-NEXT:    mov x9, sp
+; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
 ; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEWLOWERING-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16, lsl #12 // =65536
 ; CHECK-NEWLOWERING-NEXT:    cmp sp, x9
@@ -166,9 +132,7 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
 ; CHECK-NEWLOWERING-NEXT:  .LBB2_3:
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x9
 ; CHECK-NEWLOWERING-NEXT:    ldr xzr, [sp]
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB2_5
 ; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %use_b
 ; CHECK-NEWLOWERING-NEXT:    fmov s1, #4.00000000
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 2583a93..5b81f5d 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -426,3 +426,21 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin
   call void %callee()
   ret void
 }
+
+define void @disable_tailcallopt(ptr %callee) "aarch64_inout_zt0" nounwind {
+; CHECK-COMMON-LABEL: disable_tailcallopt:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    sub sp, sp, #80
+; CHECK-COMMON-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    mov x19, sp
+; CHECK-COMMON-NEXT:    str zt0, [x19]
+; CHECK-COMMON-NEXT:    smstop za
+; CHECK-COMMON-NEXT:    blr x0
+; CHECK-COMMON-NEXT:    smstart za
+; CHECK-COMMON-NEXT:    ldr zt0, [x19]
+; CHECK-COMMON-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    add sp, sp, #80
+; CHECK-COMMON-NEXT:    ret
+  tail call void %callee() ; marked 'tail', but the CHECK lines above expect a plain 'blr x0' + 'ret' with ZT0 stored before and reloaded after the call
+  ret void
+}