aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AArch64
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AArch64')
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir7
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir15
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir14
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir11
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir32
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir7
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir14
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll888
-rw-r--r--llvm/test/CodeGen/AArch64/adc.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-vhadd.ll82
-rw-r--r--llvm/test/CodeGen/AArch64/dup.ll90
-rw-r--r--llvm/test/CodeGen/AArch64/fsh.ll473
-rw-r--r--llvm/test/CodeGen/AArch64/funnel-shift.ll55
-rw-r--r--llvm/test/CodeGen/AArch64/rem-by-const.ll173
-rw-r--r--llvm/test/CodeGen/AArch64/sme-agnostic-za.ll7
-rw-r--r--llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sme-za-control-flow.ll107
-rw-r--r--llvm/test/CodeGen/AArch64/sme-za-exceptions.ll18
-rw-r--r--llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll296
-rw-r--r--llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll110
-rw-r--r--llvm/test/CodeGen/AArch64/sme-zt0-state.ll18
23 files changed, 1284 insertions, 1153 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir
index 68302f5..5f98dae 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir
@@ -290,11 +290,8 @@ body: |
; CHECK-LABEL: name: s3_from_s35
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[DEF]](s64)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
- ; CHECK-NEXT: %ext:_(s32) = G_AND [[TRUNC]], [[C]]
- ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%val:_(s35) = G_IMPLICIT_DEF
%extract:_(s3) = G_EXTRACT %val, 0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
index 03c28ef..b28298c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
@@ -159,13 +159,16 @@ body: |
; CHECK-LABEL: name: test_freeze_v3s8
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[DEF]]
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[BUILD_VECTOR]](<8 x s16>)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s8>), [[UV1:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>)
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[UV]]
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>)
; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV]](s8)
- ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV1]](s8)
- ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV2]](s8)
+ ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV2]](s8)
+ ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV3]](s8)
+ ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV4]](s8)
; CHECK-NEXT: %res:_(<4 x s32>) = G_BUILD_VECTOR %ext0(s32), %ext1(s32), %ext2(s32), %undef(s32)
; CHECK-NEXT: $q0 = COPY %res(<4 x s32>)
%x:_(<3 x s8>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
index 858a5a2..1cf066d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
@@ -248,21 +248,19 @@ body: |
; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16)
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF2]](<4 x s8>)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<16 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, undef, undef, undef, undef)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[SHUF]](<16 x s8>)
; CHECK-NEXT: [[UITOFP:%[0-9]+]]:_(<4 x s32>) = G_UITOFP [[BITCAST]](<4 x s32>)
- ; CHECK-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UITOFP]](<4 x s32>)
- ; CHECK-NEXT: G_STORE [[UV10]](s32), [[COPY]](p0) :: (store (s32), align 16)
+ ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UITOFP]](<4 x s32>)
+ ; CHECK-NEXT: G_STORE [[UV6]](s32), [[COPY]](p0) :: (store (s32), align 16)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64)
- ; CHECK-NEXT: G_STORE [[UV11]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4)
+ ; CHECK-NEXT: G_STORE [[UV7]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C4]](s64)
- ; CHECK-NEXT: G_STORE [[UV12]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 8, align 8)
+ ; CHECK-NEXT: G_STORE [[UV8]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 8, align 8)
; CHECK-NEXT: G_BR %bb.1
bb.1:
liveins: $w1, $w2, $w3, $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 2c326902..eb30581 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -545,15 +545,18 @@ body: |
; CHECK-LABEL: name: store_6xs64
; CHECK: liveins: $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64)
; CHECK-NEXT: %ptr:_(p0) = COPY $x0
- ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>))
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>))
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD %ptr, [[C]](s64)
- ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into unknown-address + 16)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into unknown-address + 16)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD %ptr, [[C1]](s64)
- ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), [[PTR_ADD1]](p0) :: (store (<2 x s64>) into unknown-address + 32)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD1]](p0) :: (store (<2 x s64>) into unknown-address + 32)
; CHECK-NEXT: RET_ReallyLR
%val:_(<6 x s64>) = G_IMPLICIT_DEF
%ptr:_(p0) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir
index b8bdef0..737c66c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir
@@ -220,10 +220,8 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[UADDE]](s32), [[SEXT_INREG2]]
; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UADDE]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF1]](s32)
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV4]](s8), [[UV5]](s8), [[UV6]](s8), [[DEF]](s8)
- ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV8]](s8), [[UV9]](s8), [[UV10]](s8), [[UV8]](s8)
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32)
; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[MV2]], 24
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 23
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
index 52a28ad..1c5ae0d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir
@@ -289,35 +289,35 @@ body: |
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4100
; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %w0(s32), [[C]]
; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ICMP2]], 1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
- ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64)
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[DEF1]], [[TRUNC]](s16), [[C1]](s64)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>)
; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16)
; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16)
; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
- ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16)
- ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[UV5]](s16)
- ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[UV6]](s16)
- ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[UV7]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
+ ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16)
+ ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16)
+ ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16)
+ ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<8 x s8>), [[BUILD_VECTOR1]], shufflemask(0, 0, 0, 0, undef, undef, undef, undef)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[SHUF]](<8 x s8>)
- ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<4 x s16>), [[UV9:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+ ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR2]](<8 x s8>)
- ; CHECK-NEXT: [[UV10:%[0-9]+]]:_(<4 x s16>), [[UV11:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>)
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[UV8]], [[UV10]]
+ ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s16>), [[UV7:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[UV4]], [[UV6]]
; CHECK-NEXT: [[TRUNC9:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>)
; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[SHUF]](<8 x s8>)
- ; CHECK-NEXT: [[UV12:%[0-9]+]]:_(<4 x s16>), [[UV13:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>)
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC9]], [[UV12]]
+ ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<4 x s16>), [[UV9:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>)
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC9]], [[UV8]]
; CHECK-NEXT: [[TRUNC10:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>)
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC10]], [[XOR]]
; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND]], [[AND1]]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index fdd0ebb..352f4e7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -288,10 +288,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[UV]](s32)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[UV]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[DEF]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[DEF]](s32)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]], shufflemask(0, 1, 5, 6)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[SHUF]](<4 x s32>), [[C]](s64)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir
index 2311be6..abfaea0 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir
@@ -220,10 +220,8 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[USUBE]](s32), [[SEXT_INREG2]]
; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[USUBE]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF1]](s32)
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV4]](s8), [[UV5]](s8), [[UV6]](s8), [[DEF]](s8)
- ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV8]](s8), [[UV9]](s8), [[UV10]](s8), [[UV8]](s8)
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8)
; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32)
; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[MV2]], 24
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 23
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
index 2609eb0..9726cc5 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
@@ -37,10 +37,9 @@ body: |
bb.0:
; CHECK-LABEL: name: test_implicit_def_v4s32
- ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[DEF]](<4 x s32>)
- ; CHECK-NEXT: $x0 = COPY [[UV]](<2 x s32>)
- ; CHECK-NEXT: $x1 = COPY [[UV1]](<2 x s32>)
+ ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: $x0 = COPY [[DEF]](<2 x s32>)
+ ; CHECK-NEXT: $x1 = COPY [[DEF]](<2 x s32>)
%0:_(<4 x s32>) = G_IMPLICIT_DEF
%1:_(<2 x s32> ), %2:_(<2 x s32>) = G_UNMERGE_VALUES %0
$x0 = COPY %1
@@ -67,10 +66,9 @@ body: |
bb.0:
; CHECK-LABEL: name: test_implicit_def_v2s32
- ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>)
- ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
- ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
+ ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[DEF]](s32)
%0:_(<2 x s32>) = G_IMPLICIT_DEF
%1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0
$w0 = COPY %1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index 41f7ab8..480fcbd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) {
; GISEL-LABEL: test_shl_i512_const_32:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: ldp x8, x9, [x1]
-; GISEL-NEXT: ldp x11, x12, [x1, #16]
-; GISEL-NEXT: ldp x14, x15, [x1, #32]
-; GISEL-NEXT: lsr x10, x8, #32
-; GISEL-NEXT: lsr x13, x9, #32
-; GISEL-NEXT: lsl x8, x8, #32
-; GISEL-NEXT: orr x9, x10, x9, lsl #32
-; GISEL-NEXT: lsr x10, x11, #32
-; GISEL-NEXT: orr x11, x13, x11, lsl #32
-; GISEL-NEXT: ldp x13, x16, [x1, #48]
-; GISEL-NEXT: stp x8, x9, [x0]
-; GISEL-NEXT: lsr x8, x12, #32
-; GISEL-NEXT: orr x10, x10, x12, lsl #32
-; GISEL-NEXT: lsr x12, x14, #32
-; GISEL-NEXT: lsr x9, x15, #32
-; GISEL-NEXT: orr x8, x8, x14, lsl #32
-; GISEL-NEXT: stp x11, x10, [x0, #16]
-; GISEL-NEXT: orr x11, x12, x15, lsl #32
-; GISEL-NEXT: lsr x12, x13, #32
-; GISEL-NEXT: orr x9, x9, x13, lsl #32
-; GISEL-NEXT: stp x8, x11, [x0, #32]
-; GISEL-NEXT: orr x8, x12, x16, lsl #32
-; GISEL-NEXT: stp x9, x8, [x0, #48]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x13, x14, [x1, #32]
+; GISEL-NEXT: lsl x12, x8, #32
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: ldp x15, x16, [x1, #48]
+; GISEL-NEXT: stp x12, x8, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #32
+; GISEL-NEXT: stp x9, x10, [x0, #16]
+; GISEL-NEXT: extr x9, x14, x13, #32
+; GISEL-NEXT: extr x10, x15, x14, #32
+; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: extr x8, x16, x15, #32
+; GISEL-NEXT: stp x10, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_32:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x14, [x1, #24]
-; GISEL-NEXT: ldr x16, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #32
-; GISEL-NEXT: lsl x13, x9, #32
-; GISEL-NEXT: lsl x15, x10, #32
-; GISEL-NEXT: orr x11, x12, x11, lsr #32
-; GISEL-NEXT: orr x8, x13, x8, lsr #32
-; GISEL-NEXT: lsl x13, x14, #32
-; GISEL-NEXT: orr x9, x15, x9, lsr #32
-; GISEL-NEXT: ldp x12, x15, [x1, #40]
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: orr x10, x13, x10, lsr #32
-; GISEL-NEXT: lsl x8, x16, #32
-; GISEL-NEXT: lsl x11, x12, #32
-; GISEL-NEXT: lsl x13, x15, #32
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x8, x8, x15, lsr #32
-; GISEL-NEXT: lsr x10, x16, #32
-; GISEL-NEXT: orr x11, x11, x14, lsr #32
-; GISEL-NEXT: orr x9, x13, x12, lsr #32
-; GISEL-NEXT: stp x8, x10, [x0, #48]
-; GISEL-NEXT: stp x11, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #32
+; GISEL-NEXT: extr x9, x13, x12, #32
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #32
+; GISEL-NEXT: extr x8, x15, x14, #32
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: lsr x9, x15, #32
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_32:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x13, [x1, #24]
-; GISEL-NEXT: ldr x17, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #32
-; GISEL-NEXT: lsl x15, x9, #32
-; GISEL-NEXT: lsl x16, x10, #32
-; GISEL-NEXT: orr x11, x12, x11, lsr #32
-; GISEL-NEXT: ldp x14, x12, [x1, #40]
-; GISEL-NEXT: orr x8, x15, x8, lsr #32
-; GISEL-NEXT: lsl x15, x13, #32
-; GISEL-NEXT: orr x9, x16, x9, lsr #32
-; GISEL-NEXT: asr x16, x17, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x14, #32
-; GISEL-NEXT: orr x10, x15, x10, lsr #32
-; GISEL-NEXT: lsl x15, x12, #32
-; GISEL-NEXT: orr x8, x11, x13, lsr #32
-; GISEL-NEXT: lsl x11, x17, #32
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x15, x14, lsr #32
-; GISEL-NEXT: lsl x13, x16, #32
-; GISEL-NEXT: orr x10, x11, x12, lsr #32
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: orr x8, x13, x17, asr #32
-; GISEL-NEXT: stp x10, x8, [x0, #48]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: ldp x14, x15, [x1, #32]
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: asr x8, x13, #63
+; GISEL-NEXT: extr x11, x14, x11, #32
+; GISEL-NEXT: extr x9, x15, x14, #32
+; GISEL-NEXT: lsl x8, x8, #32
+; GISEL-NEXT: stp x10, x11, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x15, #32
+; GISEL-NEXT: extr x11, x13, x12, #32
+; GISEL-NEXT: orr x8, x8, x13, asr #32
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x11, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) {
; GISEL-NEXT: ldr x15, [x1, #48]
; GISEL-NEXT: ldp x10, x11, [x1, #16]
; GISEL-NEXT: ldp x12, x13, [x1, #32]
-; GISEL-NEXT: lsr x14, x8, #32
-; GISEL-NEXT: lsr x16, x9, #32
-; GISEL-NEXT: lsl x8, x8, #32
-; GISEL-NEXT: orr x9, x14, x9, lsl #32
-; GISEL-NEXT: lsr x14, x10, #32
-; GISEL-NEXT: orr x10, x16, x10, lsl #32
-; GISEL-NEXT: stp xzr, x8, [x0]
-; GISEL-NEXT: lsr x8, x11, #32
-; GISEL-NEXT: orr x11, x14, x11, lsl #32
-; GISEL-NEXT: lsr x14, x12, #32
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: lsr x9, x13, #32
-; GISEL-NEXT: orr x8, x8, x12, lsl #32
-; GISEL-NEXT: orr x10, x14, x13, lsl #32
-; GISEL-NEXT: orr x9, x9, x15, lsl #32
-; GISEL-NEXT: stp x11, x8, [x0, #32]
-; GISEL-NEXT: stp x10, x9, [x0, #48]
+; GISEL-NEXT: lsl x14, x8, #32
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: stp xzr, x14, [x0]
+; GISEL-NEXT: stp x8, x9, [x0, #16]
+; GISEL-NEXT: extr x8, x12, x11, #32
+; GISEL-NEXT: extr x9, x13, x12, #32
+; GISEL-NEXT: stp x10, x8, [x0, #32]
+; GISEL-NEXT: extr x10, x15, x13, #32
+; GISEL-NEXT: stp x9, x10, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_96:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #32
-; GISEL-NEXT: lsl x13, x9, #32
-; GISEL-NEXT: orr x10, x12, x10, lsr #32
-; GISEL-NEXT: lsl x12, x11, #32
-; GISEL-NEXT: orr x8, x13, x8, lsr #32
-; GISEL-NEXT: lsl x13, x14, #32
-; GISEL-NEXT: orr x9, x12, x9, lsr #32
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x15, #32
-; GISEL-NEXT: orr x11, x13, x11, lsr #32
-; GISEL-NEXT: lsl x12, x16, #32
-; GISEL-NEXT: orr x8, x10, x14, lsr #32
-; GISEL-NEXT: lsr x10, x16, #32
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x12, x15, lsr #32
-; GISEL-NEXT: stp x10, xzr, [x0, #48]
-; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #32
+; GISEL-NEXT: extr x9, x13, x12, #32
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #32
+; GISEL-NEXT: lsr x8, x14, #32
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, xzr, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_96:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x11, [x1, #8]
-; GISEL-NEXT: ldp x10, x13, [x1, #32]
-; GISEL-NEXT: lsl x12, x8, #32
-; GISEL-NEXT: lsl x14, x9, #32
-; GISEL-NEXT: lsl x15, x10, #32
-; GISEL-NEXT: orr x11, x12, x11, lsr #32
-; GISEL-NEXT: ldp x12, x16, [x1, #48]
-; GISEL-NEXT: orr x8, x14, x8, lsr #32
-; GISEL-NEXT: lsl x14, x13, #32
-; GISEL-NEXT: orr x9, x15, x9, lsr #32
-; GISEL-NEXT: asr x15, x16, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x12, #32
-; GISEL-NEXT: orr x10, x14, x10, lsr #32
-; GISEL-NEXT: lsl x14, x16, #32
-; GISEL-NEXT: orr x8, x11, x13, lsr #32
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x13, [x1, #40]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x14, x12, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: asr x15, x12, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #32
+; GISEL-NEXT: extr x9, x14, x13, #32
; GISEL-NEXT: lsl x11, x15, #32
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x14, x12, lsr #32
-; GISEL-NEXT: orr x10, x11, x16, asr #32
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: stp x10, x15, [x0, #48]
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x14, #32
+; GISEL-NEXT: orr x8, x11, x12, asr #32
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, x15, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) {
; GISEL-LABEL: test_shl_i512_const_1:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: ldp x8, x9, [x1]
-; GISEL-NEXT: ldp x11, x12, [x1, #16]
-; GISEL-NEXT: ldp x14, x15, [x1, #32]
-; GISEL-NEXT: lsr x10, x8, #63
-; GISEL-NEXT: lsr x13, x9, #63
-; GISEL-NEXT: lsl x8, x8, #1
-; GISEL-NEXT: orr x9, x10, x9, lsl #1
-; GISEL-NEXT: lsr x10, x11, #63
-; GISEL-NEXT: orr x11, x13, x11, lsl #1
-; GISEL-NEXT: ldp x13, x16, [x1, #48]
-; GISEL-NEXT: stp x8, x9, [x0]
-; GISEL-NEXT: lsr x8, x12, #63
-; GISEL-NEXT: orr x10, x10, x12, lsl #1
-; GISEL-NEXT: lsr x12, x14, #63
-; GISEL-NEXT: lsr x9, x15, #63
-; GISEL-NEXT: orr x8, x8, x14, lsl #1
-; GISEL-NEXT: stp x11, x10, [x0, #16]
-; GISEL-NEXT: orr x11, x12, x15, lsl #1
-; GISEL-NEXT: lsr x12, x13, #63
-; GISEL-NEXT: orr x9, x9, x13, lsl #1
-; GISEL-NEXT: stp x8, x11, [x0, #32]
-; GISEL-NEXT: orr x8, x12, x16, lsl #1
-; GISEL-NEXT: stp x9, x8, [x0, #48]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x13, x14, [x1, #32]
+; GISEL-NEXT: lsl x12, x8, #1
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: ldp x15, x16, [x1, #48]
+; GISEL-NEXT: stp x12, x8, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #63
+; GISEL-NEXT: stp x9, x10, [x0, #16]
+; GISEL-NEXT: extr x9, x14, x13, #63
+; GISEL-NEXT: extr x10, x15, x14, #63
+; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: extr x8, x16, x15, #63
+; GISEL-NEXT: stp x10, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_1:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x14, [x1, #24]
-; GISEL-NEXT: ldr x16, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #63
-; GISEL-NEXT: lsl x13, x9, #63
-; GISEL-NEXT: lsl x15, x10, #63
-; GISEL-NEXT: orr x11, x12, x11, lsr #1
-; GISEL-NEXT: orr x8, x13, x8, lsr #1
-; GISEL-NEXT: lsl x13, x14, #63
-; GISEL-NEXT: orr x9, x15, x9, lsr #1
-; GISEL-NEXT: ldp x12, x15, [x1, #40]
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: orr x10, x13, x10, lsr #1
-; GISEL-NEXT: lsl x8, x16, #63
-; GISEL-NEXT: lsl x11, x12, #63
-; GISEL-NEXT: lsl x13, x15, #63
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x8, x8, x15, lsr #1
-; GISEL-NEXT: lsr x10, x16, #1
-; GISEL-NEXT: orr x11, x11, x14, lsr #1
-; GISEL-NEXT: orr x9, x13, x12, lsr #1
-; GISEL-NEXT: stp x8, x10, [x0, #48]
-; GISEL-NEXT: stp x11, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #1
+; GISEL-NEXT: extr x9, x13, x12, #1
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #1
+; GISEL-NEXT: extr x8, x15, x14, #1
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: lsr x9, x15, #1
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_1:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x13, [x1, #24]
-; GISEL-NEXT: ldr x17, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #63
-; GISEL-NEXT: lsl x15, x9, #63
-; GISEL-NEXT: lsl x16, x10, #63
-; GISEL-NEXT: orr x11, x12, x11, lsr #1
-; GISEL-NEXT: ldp x14, x12, [x1, #40]
-; GISEL-NEXT: orr x8, x15, x8, lsr #1
-; GISEL-NEXT: lsl x15, x13, #63
-; GISEL-NEXT: orr x9, x16, x9, lsr #1
-; GISEL-NEXT: asr x16, x17, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x14, #63
-; GISEL-NEXT: orr x10, x15, x10, lsr #1
-; GISEL-NEXT: lsl x15, x12, #63
-; GISEL-NEXT: orr x8, x11, x13, lsr #1
-; GISEL-NEXT: lsl x11, x17, #63
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x15, x14, lsr #1
-; GISEL-NEXT: lsl x13, x16, #63
-; GISEL-NEXT: orr x10, x11, x12, lsr #1
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: orr x8, x13, x17, asr #1
-; GISEL-NEXT: stp x10, x8, [x0, #48]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: ldp x14, x15, [x1, #32]
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: asr x8, x13, #63
+; GISEL-NEXT: extr x11, x14, x11, #1
+; GISEL-NEXT: extr x9, x15, x14, #1
+; GISEL-NEXT: lsl x8, x8, #63
+; GISEL-NEXT: stp x10, x11, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x15, #1
+; GISEL-NEXT: extr x11, x13, x12, #1
+; GISEL-NEXT: orr x8, x8, x13, asr #1
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x11, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5571,28 +5507,21 @@ define void @test_shl_i512_const_15(ptr %result, ptr %input) {
; GISEL-LABEL: test_shl_i512_const_15:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: ldp x8, x9, [x1]
-; GISEL-NEXT: ldp x11, x12, [x1, #16]
-; GISEL-NEXT: ldp x14, x15, [x1, #32]
-; GISEL-NEXT: lsr x10, x8, #49
-; GISEL-NEXT: lsr x13, x9, #49
-; GISEL-NEXT: lsl x8, x8, #15
-; GISEL-NEXT: orr x9, x10, x9, lsl #15
-; GISEL-NEXT: lsr x10, x11, #49
-; GISEL-NEXT: orr x11, x13, x11, lsl #15
-; GISEL-NEXT: ldp x13, x16, [x1, #48]
-; GISEL-NEXT: stp x8, x9, [x0]
-; GISEL-NEXT: lsr x8, x12, #49
-; GISEL-NEXT: orr x10, x10, x12, lsl #15
-; GISEL-NEXT: lsr x12, x14, #49
-; GISEL-NEXT: lsr x9, x15, #49
-; GISEL-NEXT: orr x8, x8, x14, lsl #15
-; GISEL-NEXT: stp x11, x10, [x0, #16]
-; GISEL-NEXT: orr x11, x12, x15, lsl #15
-; GISEL-NEXT: lsr x12, x13, #49
-; GISEL-NEXT: orr x9, x9, x13, lsl #15
-; GISEL-NEXT: stp x8, x11, [x0, #32]
-; GISEL-NEXT: orr x8, x12, x16, lsl #15
-; GISEL-NEXT: stp x9, x8, [x0, #48]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x13, x14, [x1, #32]
+; GISEL-NEXT: lsl x12, x8, #15
+; GISEL-NEXT: extr x8, x9, x8, #49
+; GISEL-NEXT: extr x9, x10, x9, #49
+; GISEL-NEXT: extr x10, x11, x10, #49
+; GISEL-NEXT: ldp x15, x16, [x1, #48]
+; GISEL-NEXT: stp x12, x8, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #49
+; GISEL-NEXT: stp x9, x10, [x0, #16]
+; GISEL-NEXT: extr x9, x14, x13, #49
+; GISEL-NEXT: extr x10, x15, x14, #49
+; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: extr x8, x16, x15, #49
+; GISEL-NEXT: stp x10, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_15:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x14, [x1, #24]
-; GISEL-NEXT: ldr x16, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #49
-; GISEL-NEXT: lsl x13, x9, #49
-; GISEL-NEXT: lsl x15, x10, #49
-; GISEL-NEXT: orr x11, x12, x11, lsr #15
-; GISEL-NEXT: orr x8, x13, x8, lsr #15
-; GISEL-NEXT: lsl x13, x14, #49
-; GISEL-NEXT: orr x9, x15, x9, lsr #15
-; GISEL-NEXT: ldp x12, x15, [x1, #40]
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: orr x10, x13, x10, lsr #15
-; GISEL-NEXT: lsl x8, x16, #49
-; GISEL-NEXT: lsl x11, x12, #49
-; GISEL-NEXT: lsl x13, x15, #49
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x8, x8, x15, lsr #15
-; GISEL-NEXT: lsr x10, x16, #15
-; GISEL-NEXT: orr x11, x11, x14, lsr #15
-; GISEL-NEXT: orr x9, x13, x12, lsr #15
-; GISEL-NEXT: stp x8, x10, [x0, #48]
-; GISEL-NEXT: stp x11, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #15
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #15
+; GISEL-NEXT: extr x10, x11, x10, #15
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #15
+; GISEL-NEXT: extr x9, x13, x12, #15
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #15
+; GISEL-NEXT: extr x8, x15, x14, #15
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: lsr x9, x15, #15
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_15:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x13, [x1, #24]
-; GISEL-NEXT: ldr x17, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #49
-; GISEL-NEXT: lsl x15, x9, #49
-; GISEL-NEXT: lsl x16, x10, #49
-; GISEL-NEXT: orr x11, x12, x11, lsr #15
-; GISEL-NEXT: ldp x14, x12, [x1, #40]
-; GISEL-NEXT: orr x8, x15, x8, lsr #15
-; GISEL-NEXT: lsl x15, x13, #49
-; GISEL-NEXT: orr x9, x16, x9, lsr #15
-; GISEL-NEXT: asr x16, x17, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x14, #49
-; GISEL-NEXT: orr x10, x15, x10, lsr #15
-; GISEL-NEXT: lsl x15, x12, #49
-; GISEL-NEXT: orr x8, x11, x13, lsr #15
-; GISEL-NEXT: lsl x11, x17, #49
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x15, x14, lsr #15
-; GISEL-NEXT: lsl x13, x16, #49
-; GISEL-NEXT: orr x10, x11, x12, lsr #15
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: orr x8, x13, x17, asr #15
-; GISEL-NEXT: stp x10, x8, [x0, #48]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #15
+; GISEL-NEXT: ldp x14, x15, [x1, #32]
+; GISEL-NEXT: extr x9, x10, x9, #15
+; GISEL-NEXT: extr x10, x11, x10, #15
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: asr x8, x13, #63
+; GISEL-NEXT: extr x11, x14, x11, #15
+; GISEL-NEXT: extr x9, x15, x14, #15
+; GISEL-NEXT: lsl x8, x8, #49
+; GISEL-NEXT: stp x10, x11, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x15, #15
+; GISEL-NEXT: extr x11, x13, x12, #15
+; GISEL-NEXT: orr x8, x8, x13, asr #15
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x11, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) {
; GISEL-LABEL: test_shl_i512_const_63:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: ldp x8, x9, [x1]
-; GISEL-NEXT: ldp x11, x12, [x1, #16]
-; GISEL-NEXT: ldp x14, x15, [x1, #32]
-; GISEL-NEXT: lsr x10, x8, #1
-; GISEL-NEXT: lsr x13, x9, #1
-; GISEL-NEXT: lsl x8, x8, #63
-; GISEL-NEXT: orr x9, x10, x9, lsl #63
-; GISEL-NEXT: lsr x10, x11, #1
-; GISEL-NEXT: orr x11, x13, x11, lsl #63
-; GISEL-NEXT: ldp x13, x16, [x1, #48]
-; GISEL-NEXT: stp x8, x9, [x0]
-; GISEL-NEXT: lsr x8, x12, #1
-; GISEL-NEXT: orr x10, x10, x12, lsl #63
-; GISEL-NEXT: lsr x12, x14, #1
-; GISEL-NEXT: lsr x9, x15, #1
-; GISEL-NEXT: orr x8, x8, x14, lsl #63
-; GISEL-NEXT: stp x11, x10, [x0, #16]
-; GISEL-NEXT: orr x11, x12, x15, lsl #63
-; GISEL-NEXT: lsr x12, x13, #1
-; GISEL-NEXT: orr x9, x9, x13, lsl #63
-; GISEL-NEXT: stp x8, x11, [x0, #32]
-; GISEL-NEXT: orr x8, x12, x16, lsl #63
-; GISEL-NEXT: stp x9, x8, [x0, #48]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x13, x14, [x1, #32]
+; GISEL-NEXT: lsl x12, x8, #63
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: ldp x15, x16, [x1, #48]
+; GISEL-NEXT: stp x12, x8, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #1
+; GISEL-NEXT: stp x9, x10, [x0, #16]
+; GISEL-NEXT: extr x9, x14, x13, #1
+; GISEL-NEXT: extr x10, x15, x14, #1
+; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: extr x8, x16, x15, #1
+; GISEL-NEXT: stp x10, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_63:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x14, [x1, #24]
-; GISEL-NEXT: ldr x16, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #1
-; GISEL-NEXT: lsl x13, x9, #1
-; GISEL-NEXT: lsl x15, x10, #1
-; GISEL-NEXT: orr x11, x12, x11, lsr #63
-; GISEL-NEXT: orr x8, x13, x8, lsr #63
-; GISEL-NEXT: lsl x13, x14, #1
-; GISEL-NEXT: orr x9, x15, x9, lsr #63
-; GISEL-NEXT: ldp x12, x15, [x1, #40]
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: orr x10, x13, x10, lsr #63
-; GISEL-NEXT: lsl x8, x16, #1
-; GISEL-NEXT: lsl x11, x12, #1
-; GISEL-NEXT: lsl x13, x15, #1
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x8, x8, x15, lsr #63
-; GISEL-NEXT: lsr x10, x16, #63
-; GISEL-NEXT: orr x11, x11, x14, lsr #63
-; GISEL-NEXT: orr x9, x13, x12, lsr #63
-; GISEL-NEXT: stp x8, x10, [x0, #48]
-; GISEL-NEXT: stp x11, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: extr x9, x13, x12, #63
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #63
+; GISEL-NEXT: extr x8, x15, x14, #63
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: lsr x9, x15, #63
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_63:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x10, [x1]
-; GISEL-NEXT: ldp x11, x13, [x1, #24]
-; GISEL-NEXT: ldr x17, [x1, #56]
-; GISEL-NEXT: lsl x15, x9, #1
-; GISEL-NEXT: lsl x12, x8, #1
-; GISEL-NEXT: lsl x16, x11, #1
-; GISEL-NEXT: orr x8, x15, x8, lsr #63
-; GISEL-NEXT: lsl x15, x13, #1
-; GISEL-NEXT: orr x10, x12, x10, lsr #63
-; GISEL-NEXT: ldp x14, x12, [x1, #40]
-; GISEL-NEXT: orr x9, x16, x9, lsr #63
-; GISEL-NEXT: orr x11, x15, x11, lsr #63
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x8, x17, #1
-; GISEL-NEXT: lsl x16, x14, #1
-; GISEL-NEXT: lsl x10, x12, #1
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: asr x9, x17, #63
-; GISEL-NEXT: orr x8, x8, x12, lsr #63
-; GISEL-NEXT: orr x13, x16, x13, lsr #63
-; GISEL-NEXT: orr x10, x10, x14, lsr #63
-; GISEL-NEXT: orr x9, x9, x9, lsl #1
-; GISEL-NEXT: stp x13, x10, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: extr x9, x13, x12, #63
+; GISEL-NEXT: extr x11, x14, x13, #63
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: asr x10, x15, #63
+; GISEL-NEXT: extr x8, x15, x14, #63
+; GISEL-NEXT: stp x9, x11, [x0, #32]
+; GISEL-NEXT: orr x9, x10, x10, lsl #1
; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
@@ -5906,23 +5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) {
; GISEL-NEXT: ldr x15, [x1, #48]
; GISEL-NEXT: ldp x10, x11, [x1, #16]
; GISEL-NEXT: ldp x12, x13, [x1, #32]
-; GISEL-NEXT: lsr x14, x8, #63
-; GISEL-NEXT: lsr x16, x9, #63
-; GISEL-NEXT: lsl x8, x8, #1
-; GISEL-NEXT: orr x9, x14, x9, lsl #1
-; GISEL-NEXT: lsr x14, x10, #63
-; GISEL-NEXT: orr x10, x16, x10, lsl #1
-; GISEL-NEXT: stp xzr, x8, [x0]
-; GISEL-NEXT: lsr x8, x11, #63
-; GISEL-NEXT: orr x11, x14, x11, lsl #1
-; GISEL-NEXT: lsr x14, x12, #63
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: lsr x9, x13, #63
-; GISEL-NEXT: orr x8, x8, x12, lsl #1
-; GISEL-NEXT: orr x10, x14, x13, lsl #1
-; GISEL-NEXT: orr x9, x9, x15, lsl #1
-; GISEL-NEXT: stp x11, x8, [x0, #32]
-; GISEL-NEXT: stp x10, x9, [x0, #48]
+; GISEL-NEXT: lsl x14, x8, #1
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp xzr, x14, [x0]
+; GISEL-NEXT: stp x8, x9, [x0, #16]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: extr x9, x13, x12, #63
+; GISEL-NEXT: stp x10, x8, [x0, #32]
+; GISEL-NEXT: extr x10, x15, x13, #63
+; GISEL-NEXT: stp x9, x10, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_65:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #63
-; GISEL-NEXT: lsl x13, x9, #63
-; GISEL-NEXT: orr x10, x12, x10, lsr #1
-; GISEL-NEXT: lsl x12, x11, #63
-; GISEL-NEXT: orr x8, x13, x8, lsr #1
-; GISEL-NEXT: lsl x13, x14, #63
-; GISEL-NEXT: orr x9, x12, x9, lsr #1
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x15, #63
-; GISEL-NEXT: orr x11, x13, x11, lsr #1
-; GISEL-NEXT: lsl x12, x16, #63
-; GISEL-NEXT: orr x8, x10, x14, lsr #1
-; GISEL-NEXT: lsr x10, x16, #1
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x12, x15, lsr #1
-; GISEL-NEXT: stp x10, xzr, [x0, #48]
-; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #1
+; GISEL-NEXT: extr x9, x13, x12, #1
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #1
+; GISEL-NEXT: lsr x8, x14, #1
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, xzr, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_65:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x11, [x1, #8]
-; GISEL-NEXT: ldp x10, x13, [x1, #32]
-; GISEL-NEXT: lsl x12, x8, #63
-; GISEL-NEXT: lsl x14, x9, #63
-; GISEL-NEXT: lsl x15, x10, #63
-; GISEL-NEXT: orr x11, x12, x11, lsr #1
-; GISEL-NEXT: ldp x12, x16, [x1, #48]
-; GISEL-NEXT: orr x8, x14, x8, lsr #1
-; GISEL-NEXT: lsl x14, x13, #63
-; GISEL-NEXT: orr x9, x15, x9, lsr #1
-; GISEL-NEXT: asr x15, x16, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x12, #63
-; GISEL-NEXT: orr x10, x14, x10, lsr #1
-; GISEL-NEXT: lsl x14, x16, #63
-; GISEL-NEXT: orr x8, x11, x13, lsr #1
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x13, [x1, #40]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x14, x12, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: asr x15, x12, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #1
+; GISEL-NEXT: extr x9, x14, x13, #1
; GISEL-NEXT: lsl x11, x15, #63
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x14, x12, lsr #1
-; GISEL-NEXT: orr x10, x11, x16, asr #1
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: stp x10, x15, [x0, #48]
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x14, #1
+; GISEL-NEXT: orr x8, x11, x12, asr #1
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, x15, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) {
; GISEL-NEXT: ldr x15, [x1, #48]
; GISEL-NEXT: ldp x10, x11, [x1, #16]
; GISEL-NEXT: ldp x12, x13, [x1, #32]
-; GISEL-NEXT: lsr x14, x8, #28
-; GISEL-NEXT: lsr x16, x9, #28
-; GISEL-NEXT: lsl x8, x8, #36
-; GISEL-NEXT: orr x9, x14, x9, lsl #36
-; GISEL-NEXT: lsr x14, x10, #28
-; GISEL-NEXT: orr x10, x16, x10, lsl #36
-; GISEL-NEXT: stp xzr, x8, [x0]
-; GISEL-NEXT: lsr x8, x11, #28
-; GISEL-NEXT: orr x11, x14, x11, lsl #36
-; GISEL-NEXT: lsr x14, x12, #28
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: lsr x9, x13, #28
-; GISEL-NEXT: orr x8, x8, x12, lsl #36
-; GISEL-NEXT: orr x10, x14, x13, lsl #36
-; GISEL-NEXT: orr x9, x9, x15, lsl #36
-; GISEL-NEXT: stp x11, x8, [x0, #32]
-; GISEL-NEXT: stp x10, x9, [x0, #48]
+; GISEL-NEXT: lsl x14, x8, #36
+; GISEL-NEXT: extr x8, x9, x8, #28
+; GISEL-NEXT: extr x9, x10, x9, #28
+; GISEL-NEXT: extr x10, x11, x10, #28
+; GISEL-NEXT: stp xzr, x14, [x0]
+; GISEL-NEXT: stp x8, x9, [x0, #16]
+; GISEL-NEXT: extr x8, x12, x11, #28
+; GISEL-NEXT: extr x9, x13, x12, #28
+; GISEL-NEXT: stp x10, x8, [x0, #32]
+; GISEL-NEXT: extr x10, x15, x13, #28
+; GISEL-NEXT: stp x9, x10, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_100:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #28
-; GISEL-NEXT: lsl x13, x9, #28
-; GISEL-NEXT: orr x10, x12, x10, lsr #36
-; GISEL-NEXT: lsl x12, x11, #28
-; GISEL-NEXT: orr x8, x13, x8, lsr #36
-; GISEL-NEXT: lsl x13, x14, #28
-; GISEL-NEXT: orr x9, x12, x9, lsr #36
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x15, #28
-; GISEL-NEXT: orr x11, x13, x11, lsr #36
-; GISEL-NEXT: lsl x12, x16, #28
-; GISEL-NEXT: orr x8, x10, x14, lsr #36
-; GISEL-NEXT: lsr x10, x16, #36
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x12, x15, lsr #36
-; GISEL-NEXT: stp x10, xzr, [x0, #48]
-; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #36
+; GISEL-NEXT: extr x9, x10, x9, #36
+; GISEL-NEXT: extr x10, x11, x10, #36
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #36
+; GISEL-NEXT: extr x9, x13, x12, #36
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #36
+; GISEL-NEXT: lsr x8, x14, #36
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, xzr, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_100:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x11, [x1, #8]
-; GISEL-NEXT: ldp x10, x13, [x1, #32]
-; GISEL-NEXT: lsl x12, x8, #28
-; GISEL-NEXT: lsl x14, x9, #28
-; GISEL-NEXT: lsl x15, x10, #28
-; GISEL-NEXT: orr x11, x12, x11, lsr #36
-; GISEL-NEXT: ldp x12, x16, [x1, #48]
-; GISEL-NEXT: orr x8, x14, x8, lsr #36
-; GISEL-NEXT: lsl x14, x13, #28
-; GISEL-NEXT: orr x9, x15, x9, lsr #36
-; GISEL-NEXT: asr x15, x16, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x12, #28
-; GISEL-NEXT: orr x10, x14, x10, lsr #36
-; GISEL-NEXT: lsl x14, x16, #28
-; GISEL-NEXT: orr x8, x11, x13, lsr #36
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x13, [x1, #40]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x14, x12, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #36
+; GISEL-NEXT: extr x9, x10, x9, #36
+; GISEL-NEXT: extr x10, x11, x10, #36
+; GISEL-NEXT: asr x15, x12, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #36
+; GISEL-NEXT: extr x9, x14, x13, #36
; GISEL-NEXT: lsl x11, x15, #28
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x14, x12, lsr #36
-; GISEL-NEXT: orr x10, x11, x16, asr #36
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: stp x10, x15, [x0, #48]
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x14, #36
+; GISEL-NEXT: orr x8, x11, x12, asr #36
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, x15, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) {
; GISEL-NEXT: ldr x15, [x1, #48]
; GISEL-NEXT: ldp x10, x11, [x1, #16]
; GISEL-NEXT: ldp x12, x13, [x1, #32]
-; GISEL-NEXT: lsr x14, x8, #1
-; GISEL-NEXT: lsr x16, x9, #1
-; GISEL-NEXT: lsl x8, x8, #63
-; GISEL-NEXT: orr x9, x14, x9, lsl #63
-; GISEL-NEXT: lsr x14, x10, #1
-; GISEL-NEXT: orr x10, x16, x10, lsl #63
-; GISEL-NEXT: stp xzr, x8, [x0]
-; GISEL-NEXT: lsr x8, x11, #1
-; GISEL-NEXT: orr x11, x14, x11, lsl #63
-; GISEL-NEXT: lsr x14, x12, #1
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: lsr x9, x13, #1
-; GISEL-NEXT: orr x8, x8, x12, lsl #63
-; GISEL-NEXT: orr x10, x14, x13, lsl #63
-; GISEL-NEXT: orr x9, x9, x15, lsl #63
-; GISEL-NEXT: stp x11, x8, [x0, #32]
-; GISEL-NEXT: stp x10, x9, [x0, #48]
+; GISEL-NEXT: lsl x14, x8, #63
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: stp xzr, x14, [x0]
+; GISEL-NEXT: stp x8, x9, [x0, #16]
+; GISEL-NEXT: extr x8, x12, x11, #1
+; GISEL-NEXT: extr x9, x13, x12, #1
+; GISEL-NEXT: stp x10, x8, [x0, #32]
+; GISEL-NEXT: extr x10, x15, x13, #1
+; GISEL-NEXT: stp x9, x10, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6266,27 +6114,21 @@ define void @test_lshr_i512_const_127(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_127:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #1
-; GISEL-NEXT: lsl x13, x9, #1
-; GISEL-NEXT: orr x10, x12, x10, lsr #63
-; GISEL-NEXT: lsl x12, x11, #1
-; GISEL-NEXT: orr x8, x13, x8, lsr #63
-; GISEL-NEXT: lsl x13, x14, #1
-; GISEL-NEXT: orr x9, x12, x9, lsr #63
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x15, #1
-; GISEL-NEXT: orr x11, x13, x11, lsr #63
-; GISEL-NEXT: lsl x12, x16, #1
-; GISEL-NEXT: orr x8, x10, x14, lsr #63
-; GISEL-NEXT: lsr x10, x16, #63
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x12, x15, lsr #63
-; GISEL-NEXT: stp x10, xzr, [x0, #48]
-; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: extr x9, x13, x12, #63
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #63
+; GISEL-NEXT: lsr x8, x14, #63
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, xzr, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_127:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #1
-; GISEL-NEXT: lsl x13, x9, #1
-; GISEL-NEXT: orr x10, x12, x10, lsr #63
-; GISEL-NEXT: lsl x12, x11, #1
-; GISEL-NEXT: orr x8, x13, x8, lsr #63
-; GISEL-NEXT: lsl x13, x14, #1
-; GISEL-NEXT: orr x9, x12, x9, lsr #63
-; GISEL-NEXT: lsl x12, x15, #1
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x16, #1
-; GISEL-NEXT: orr x11, x13, x11, lsr #63
-; GISEL-NEXT: asr x8, x16, #63
-; GISEL-NEXT: orr x12, x12, x14, lsr #63
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x10, x15, lsr #63
-; GISEL-NEXT: orr x10, x8, x8, lsl #1
-; GISEL-NEXT: stp x12, x9, [x0, #32]
-; GISEL-NEXT: stp x10, x8, [x0, #48]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: asr x9, x14, #63
+; GISEL-NEXT: extr x11, x13, x12, #63
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #63
+; GISEL-NEXT: orr x8, x9, x9, lsl #1
+; GISEL-NEXT: stp x11, x10, [x0, #32]
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll
index 12e8bf2..03f3cf1 100644
--- a/llvm/test/CodeGen/AArch64/adc.ll
+++ b/llvm/test/CodeGen/AArch64/adc.ll
@@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) {
;
; CHECK-GI-LABEL: test_shifted:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: lsr x8, x2, #19
+; CHECK-GI-NEXT: extr x8, x3, x2, #19
; CHECK-GI-NEXT: adds x0, x0, x2, lsl #45
-; CHECK-GI-NEXT: orr x8, x8, x3, lsl #45
; CHECK-GI-NEXT: adc x1, x1, x8
; CHECK-GI-NEXT: ret
%rhs = shl i128 %b, 45
@@ -108,8 +107,7 @@ define i128 @test_extended(i128 %a, i16 %b) {
; CHECK-GI-NEXT: sxth x8, w2
; CHECK-GI-NEXT: adds x0, x0, w2, sxth #3
; CHECK-GI-NEXT: asr x9, x8, #63
-; CHECK-GI-NEXT: lsr x8, x8, #61
-; CHECK-GI-NEXT: orr x8, x8, x9, lsl #3
+; CHECK-GI-NEXT: extr x8, x9, x8, #61
; CHECK-GI-NEXT: adc x1, x1, x8
; CHECK-GI-NEXT: ret
%ext = sext i16 %b to i128
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index 076cbf7..a505b42 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1408,6 +1408,88 @@ define <4 x i16> @ext_via_i19(<4 x i16> %a) {
ret <4 x i16> %t6
}
+define <8 x i8> @srhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-LABEL: srhadd_v8i8_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: srhadd.8b v0, v0, v1
+; CHECK-NEXT: ret
+ %s0s = sext <8 x i8> %s0 to <8 x i16>
+ %s1s = sext <8 x i8> %s1 to <8 x i16>
+ %s = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s)
+ %s2 = trunc <8 x i16> %s to <8 x i8>
+ ret <8 x i8> %s2
+}
+
+define <4 x i16> @srhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) {
+; CHECK-LABEL: srhadd_v4i16_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: srhadd.4h v0, v0, v1
+; CHECK-NEXT: ret
+ %s0s = sext <4 x i16> %s0 to <4 x i32>
+ %s1s = sext <4 x i16> %s1 to <4 x i32>
+ %s = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s)
+ %s2 = trunc <4 x i32> %s to <4 x i16>
+ ret <4 x i16> %s2
+}
+
+define <2 x i32> @srhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) {
+; CHECK-LABEL: srhadd_v2i32_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: sshll.2d v1, v1, #0
+; CHECK-NEXT: eor.16b v2, v0, v1
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: ushr.2d v1, v2, #1
+; CHECK-NEXT: sub.2d v0, v0, v1
+; CHECK-NEXT: xtn.2s v0, v0
+; CHECK-NEXT: ret
+ %s0s = sext <2 x i32> %s0 to <2 x i64>
+ %s1s = sext <2 x i32> %s1 to <2 x i64>
+ %s = call <2 x i64> @llvm.aarch64.neon.urhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s)
+ %s2 = trunc <2 x i64> %s to <2 x i32>
+ ret <2 x i32> %s2
+}
+
+define <8 x i8> @urhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-LABEL: urhadd_v8i8_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd.8b v0, v0, v1
+; CHECK-NEXT: ret
+ %s0s = zext <8 x i8> %s0 to <8 x i16>
+ %s1s = zext <8 x i8> %s1 to <8 x i16>
+ %s = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s)
+ %s2 = trunc <8 x i16> %s to <8 x i8>
+ ret <8 x i8> %s2
+}
+
+define <4 x i16> @urhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) {
+; CHECK-LABEL: urhadd_v4i16_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: urhadd.4h v0, v0, v1
+; CHECK-NEXT: ret
+ %s0s = zext <4 x i16> %s0 to <4 x i32>
+ %s1s = zext <4 x i16> %s1 to <4 x i32>
+ %s = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s)
+ %s2 = trunc <4 x i32> %s to <4 x i16>
+ ret <4 x i16> %s2
+}
+
+define <2 x i32> @urhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) {
+; CHECK-LABEL: urhadd_v2i32_trunc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: uaddl.2d v0, v0, v1
+; CHECK-NEXT: dup.2d v1, x8
+; CHECK-NEXT: add.2d v0, v0, v1
+; CHECK-NEXT: shrn.2s v0, v0, #1
+; CHECK-NEXT: ret
+ %s0s = zext <2 x i32> %s0 to <2 x i64>
+ %s1s = zext <2 x i32> %s1 to <2 x i64>
+ %s = call <2 x i64> @llvm.aarch64.neon.srhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s)
+ %s2 = trunc <2 x i64> %s to <2 x i32>
+ ret <2 x i32> %s2
+}
+
declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 670574f2..6df6d76 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -2,16 +2,21 @@
; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for dup_v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_str_v2i8
-
define <2 x i8> @dup_v2i8(i8 %a) {
-; CHECK-LABEL: dup_v2i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: dup v0.2s, w0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: dup_v2i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: dup v0.2s, w0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: dup_v2i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: dup v0.8b, w0
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%b = insertelement <2 x i8> poison, i8 %a, i64 0
%c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer
@@ -19,22 +24,45 @@ entry:
}
define <2 x i8> @duplane0_v2i8(<2 x i8> %b) {
-; CHECK-LABEL: duplane0_v2i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.2s, v0.s[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: duplane0_v2i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: duplane0_v2i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: mov v0.b[1], w8
+; CHECK-GI-NEXT: dup v0.8b, v0.b[0]
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer
ret <2 x i8> %c
}
define <2 x i8> @loaddup_v2i8(ptr %p) {
-; CHECK-LABEL: loaddup_v2i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0]
-; CHECK-NEXT: dup v0.2s, v0.s[0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: loaddup_v2i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: loaddup_v2i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%a = load i8, ptr %p
%b = insertelement <2 x i8> poison, i8 %a, i64 0
@@ -43,12 +71,24 @@ entry:
}
define <2 x i8> @loaddup_str_v2i8(ptr %p) {
-; CHECK-LABEL: loaddup_str_v2i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: strb wzr, [x0]
-; CHECK-NEXT: dup v0.2s, w8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: loaddup_str_v2i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldrb w8, [x0]
+; CHECK-SD-NEXT: strb wzr, [x0]
+; CHECK-SD-NEXT: dup v0.2s, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: loaddup_str_v2i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr b0, [x0]
+; CHECK-GI-NEXT: strb wzr, [x0]
+; CHECK-GI-NEXT: dup v0.8b, v0.b[0]
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
entry:
%a = load i8, ptr %p
%b = insertelement <2 x i8> poison, i8 %a, i64 0
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index 765f6b7..7f07ef4 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -510,41 +510,40 @@ define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) {
;
; CHECK-GI-LABEL: fshl_i128:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #64 // =0x40
; CHECK-GI-NEXT: and x9, x4, #0x7f
-; CHECK-GI-NEXT: mov w10, #64 // =0x40
-; CHECK-GI-NEXT: lsl x14, x3, #63
-; CHECK-GI-NEXT: sub x12, x10, x9
+; CHECK-GI-NEXT: mov w10, #127 // =0x7f
+; CHECK-GI-NEXT: sub x12, x8, x9
; CHECK-GI-NEXT: lsl x13, x1, x9
-; CHECK-GI-NEXT: mov w8, #127 // =0x7f
+; CHECK-GI-NEXT: bic x10, x10, x4
; CHECK-GI-NEXT: lsr x12, x0, x12
-; CHECK-GI-NEXT: bic x8, x8, x4
-; CHECK-GI-NEXT: sub x15, x9, #64
+; CHECK-GI-NEXT: sub x14, x9, #64
+; CHECK-GI-NEXT: lsl x15, x0, x9
+; CHECK-GI-NEXT: extr x16, x3, x2, #1
; CHECK-GI-NEXT: cmp x9, #64
-; CHECK-GI-NEXT: lsl x9, x0, x9
-; CHECK-GI-NEXT: lsl x15, x0, x15
-; CHECK-GI-NEXT: orr x12, x12, x13
-; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1
-; CHECK-GI-NEXT: lsr x14, x3, #1
-; CHECK-GI-NEXT: sub x10, x10, x8
-; CHECK-GI-NEXT: sub x16, x8, #64
-; CHECK-GI-NEXT: csel x9, x9, xzr, lo
-; CHECK-GI-NEXT: lsr x17, x13, x8
-; CHECK-GI-NEXT: lsl x10, x14, x10
-; CHECK-GI-NEXT: csel x12, x12, x15, lo
+; CHECK-GI-NEXT: sub x8, x8, x10
+; CHECK-GI-NEXT: orr x9, x12, x13
+; CHECK-GI-NEXT: lsr x12, x3, #1
+; CHECK-GI-NEXT: lsl x13, x0, x14
+; CHECK-GI-NEXT: csel x14, x15, xzr, lo
+; CHECK-GI-NEXT: sub x15, x10, #64
+; CHECK-GI-NEXT: lsr x17, x16, x10
+; CHECK-GI-NEXT: lsl x8, x12, x8
+; CHECK-GI-NEXT: csel x9, x9, x13, lo
; CHECK-GI-NEXT: tst x4, #0x7f
-; CHECK-GI-NEXT: lsr x15, x14, x16
+; CHECK-GI-NEXT: lsr x13, x12, x15
; CHECK-GI-NEXT: mvn x11, x4
-; CHECK-GI-NEXT: csel x12, x1, x12, eq
-; CHECK-GI-NEXT: orr x10, x17, x10
-; CHECK-GI-NEXT: cmp x8, #64
-; CHECK-GI-NEXT: lsr x14, x14, x8
-; CHECK-GI-NEXT: csel x10, x10, x15, lo
+; CHECK-GI-NEXT: csel x9, x1, x9, eq
+; CHECK-GI-NEXT: orr x8, x17, x8
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: lsr x12, x12, x10
+; CHECK-GI-NEXT: csel x8, x8, x13, lo
; CHECK-GI-NEXT: tst x11, #0x7f
-; CHECK-GI-NEXT: csel x10, x13, x10, eq
-; CHECK-GI-NEXT: cmp x8, #64
-; CHECK-GI-NEXT: csel x8, x14, xzr, lo
-; CHECK-GI-NEXT: orr x0, x9, x10
-; CHECK-GI-NEXT: orr x1, x12, x8
+; CHECK-GI-NEXT: csel x8, x16, x8, eq
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: csel x10, x12, xzr, lo
+; CHECK-GI-NEXT: orr x0, x14, x8
+; CHECK-GI-NEXT: orr x1, x9, x10
; CHECK-GI-NEXT: ret
entry:
%d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c)
@@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) {
;
; CHECK-GI-LABEL: fshr_i128:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x0, #63
-; CHECK-GI-NEXT: mov w9, #127 // =0x7f
-; CHECK-GI-NEXT: mov w10, #64 // =0x40
-; CHECK-GI-NEXT: bic x9, x9, x4
-; CHECK-GI-NEXT: lsl x11, x0, #1
-; CHECK-GI-NEXT: and x12, x4, #0x7f
-; CHECK-GI-NEXT: orr x8, x8, x1, lsl #1
-; CHECK-GI-NEXT: sub x14, x10, x9
-; CHECK-GI-NEXT: sub x17, x9, #64
-; CHECK-GI-NEXT: lsl x15, x11, x9
-; CHECK-GI-NEXT: lsr x14, x11, x14
-; CHECK-GI-NEXT: cmp x9, #64
-; CHECK-GI-NEXT: lsl x16, x8, x9
-; CHECK-GI-NEXT: sub x9, x10, x12
-; CHECK-GI-NEXT: lsl x10, x11, x17
-; CHECK-GI-NEXT: mvn x13, x4
-; CHECK-GI-NEXT: csel x11, x15, xzr, lo
-; CHECK-GI-NEXT: sub x15, x12, #64
-; CHECK-GI-NEXT: orr x14, x14, x16
-; CHECK-GI-NEXT: lsr x16, x2, x12
-; CHECK-GI-NEXT: lsl x9, x3, x9
-; CHECK-GI-NEXT: csel x10, x14, x10, lo
-; CHECK-GI-NEXT: tst x13, #0x7f
-; CHECK-GI-NEXT: lsr x13, x3, x15
-; CHECK-GI-NEXT: csel x8, x8, x10, eq
-; CHECK-GI-NEXT: orr x9, x16, x9
-; CHECK-GI-NEXT: cmp x12, #64
-; CHECK-GI-NEXT: lsr x10, x3, x12
-; CHECK-GI-NEXT: csel x9, x9, x13, lo
+; CHECK-GI-NEXT: mov w8, #127 // =0x7f
+; CHECK-GI-NEXT: lsl x9, x0, #1
+; CHECK-GI-NEXT: extr x10, x1, x0, #63
+; CHECK-GI-NEXT: bic x8, x8, x4
+; CHECK-GI-NEXT: mov w11, #64 // =0x40
+; CHECK-GI-NEXT: and x14, x4, #0x7f
+; CHECK-GI-NEXT: sub x12, x11, x8
+; CHECK-GI-NEXT: lsl x13, x10, x8
+; CHECK-GI-NEXT: lsl x16, x9, x8
+; CHECK-GI-NEXT: lsr x12, x9, x12
+; CHECK-GI-NEXT: sub x17, x8, #64
+; CHECK-GI-NEXT: cmp x8, #64
+; CHECK-GI-NEXT: lsl x8, x9, x17
+; CHECK-GI-NEXT: sub x11, x11, x14
+; CHECK-GI-NEXT: mvn x15, x4
+; CHECK-GI-NEXT: orr x12, x12, x13
+; CHECK-GI-NEXT: csel x9, x16, xzr, lo
+; CHECK-GI-NEXT: sub x13, x14, #64
+; CHECK-GI-NEXT: lsr x16, x2, x14
+; CHECK-GI-NEXT: lsl x11, x3, x11
+; CHECK-GI-NEXT: csel x8, x12, x8, lo
+; CHECK-GI-NEXT: tst x15, #0x7f
+; CHECK-GI-NEXT: lsr x12, x3, x13
+; CHECK-GI-NEXT: csel x8, x10, x8, eq
+; CHECK-GI-NEXT: orr x10, x16, x11
+; CHECK-GI-NEXT: cmp x14, #64
+; CHECK-GI-NEXT: lsr x11, x3, x14
+; CHECK-GI-NEXT: csel x10, x10, x12, lo
; CHECK-GI-NEXT: tst x4, #0x7f
-; CHECK-GI-NEXT: csel x9, x2, x9, eq
-; CHECK-GI-NEXT: cmp x12, #64
-; CHECK-GI-NEXT: csel x10, x10, xzr, lo
-; CHECK-GI-NEXT: orr x0, x11, x9
-; CHECK-GI-NEXT: orr x1, x8, x10
+; CHECK-GI-NEXT: csel x10, x2, x10, eq
+; CHECK-GI-NEXT: cmp x14, #64
+; CHECK-GI-NEXT: csel x11, x11, xzr, lo
+; CHECK-GI-NEXT: orr x0, x9, x10
+; CHECK-GI-NEXT: orr x1, x8, x11
; CHECK-GI-NEXT: ret
entry:
%d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c)
@@ -720,10 +718,9 @@ define i128 @rotl_i128_c(i128 %a) {
;
; CHECK-GI-LABEL: rotl_i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x0, #61
-; CHECK-GI-NEXT: lsr x9, x1, #61
-; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3
-; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT: extr x8, x1, x0, #61
+; CHECK-GI-NEXT: extr x0, x0, x1, #61
+; CHECK-GI-NEXT: mov x1, x8
; CHECK-GI-NEXT: ret
entry:
%d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3)
@@ -731,20 +728,12 @@ entry:
}
define i128 @rotr_i128_c(i128 %a) {
-; CHECK-SD-LABEL: rotr_i128_c:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: extr x8, x1, x0, #3
-; CHECK-SD-NEXT: extr x1, x0, x1, #3
-; CHECK-SD-NEXT: mov x0, x8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: rotr_i128_c:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl x8, x1, #61
-; CHECK-GI-NEXT: lsl x9, x0, #61
-; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3
-; CHECK-GI-NEXT: orr x1, x9, x1, lsr #3
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: rotr_i128_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr x8, x1, x0, #3
+; CHECK-NEXT: extr x1, x0, x1, #3
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
entry:
%d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3)
ret i128 %d
@@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) {
;
; CHECK-GI-LABEL: fshl_i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x0, #61
-; CHECK-GI-NEXT: lsr x9, x3, #61
-; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3
-; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT: extr x1, x1, x0, #61
+; CHECK-GI-NEXT: extr x0, x0, x3, #61
; CHECK-GI-NEXT: ret
entry:
%d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3)
@@ -879,21 +866,12 @@ entry:
}
define i128 @fshr_i128_c(i128 %a, i128 %b) {
-; CHECK-SD-LABEL: fshr_i128_c:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: extr x8, x3, x2, #3
-; CHECK-SD-NEXT: extr x1, x0, x3, #3
-; CHECK-SD-NEXT: mov x0, x8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fshr_i128_c:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl x8, x3, #61
-; CHECK-GI-NEXT: lsr x9, x3, #3
-; CHECK-GI-NEXT: orr x8, x8, x2, lsr #3
-; CHECK-GI-NEXT: orr x1, x9, x0, lsl #61
-; CHECK-GI-NEXT: mov x0, x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fshr_i128_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr x8, x3, x2, #3
+; CHECK-NEXT: extr x1, x0, x3, #3
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
entry:
%d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3)
ret i128 %d
@@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w19, -16
; CHECK-GI-NEXT: ldr x11, [sp, #16]
-; CHECK-GI-NEXT: mov w10, #64 // =0x40
+; CHECK-GI-NEXT: mov w9, #64 // =0x40
; CHECK-GI-NEXT: ldr x12, [sp, #32]
; CHECK-GI-NEXT: mov w13, #127 // =0x7f
-; CHECK-GI-NEXT: and x9, x11, #0x7f
+; CHECK-GI-NEXT: and x8, x11, #0x7f
; CHECK-GI-NEXT: and x14, x12, #0x7f
-; CHECK-GI-NEXT: mvn x15, x11
-; CHECK-GI-NEXT: sub x8, x10, x9
-; CHECK-GI-NEXT: sub x16, x9, #64
-; CHECK-GI-NEXT: lsl x19, x1, x9
-; CHECK-GI-NEXT: lsr x18, x0, x8
-; CHECK-GI-NEXT: lsl x17, x0, x9
-; CHECK-GI-NEXT: lsl x16, x0, x16
-; CHECK-GI-NEXT: cmp x9, #64
-; CHECK-GI-NEXT: bic x0, x13, x11
-; CHECK-GI-NEXT: mvn x8, x12
-; CHECK-GI-NEXT: orr x18, x18, x19
-; CHECK-GI-NEXT: csel x9, x17, xzr, lo
+; CHECK-GI-NEXT: mvn x18, x11
+; CHECK-GI-NEXT: sub x10, x9, x8
+; CHECK-GI-NEXT: sub x15, x8, #64
+; CHECK-GI-NEXT: lsl x17, x1, x8
+; CHECK-GI-NEXT: lsr x16, x0, x10
+; CHECK-GI-NEXT: lsl x15, x0, x15
+; CHECK-GI-NEXT: cmp x8, #64
+; CHECK-GI-NEXT: lsl x19, x0, x8
+; CHECK-GI-NEXT: lsl x0, x3, x14
+; CHECK-GI-NEXT: mvn x10, x12
+; CHECK-GI-NEXT: orr x16, x16, x17
; CHECK-GI-NEXT: sub x17, x14, #64
-; CHECK-GI-NEXT: csel x16, x18, x16, lo
+; CHECK-GI-NEXT: csel x15, x16, x15, lo
+; CHECK-GI-NEXT: sub x16, x9, x14
+; CHECK-GI-NEXT: csel x8, x19, xzr, lo
+; CHECK-GI-NEXT: lsr x16, x2, x16
; CHECK-GI-NEXT: tst x11, #0x7f
-; CHECK-GI-NEXT: sub x11, x10, x14
-; CHECK-GI-NEXT: lsr x11, x2, x11
-; CHECK-GI-NEXT: lsl x18, x3, x14
-; CHECK-GI-NEXT: csel x16, x1, x16, eq
-; CHECK-GI-NEXT: lsl x1, x2, x14
+; CHECK-GI-NEXT: lsl x19, x2, x14
; CHECK-GI-NEXT: lsl x17, x2, x17
+; CHECK-GI-NEXT: csel x15, x1, x15, eq
; CHECK-GI-NEXT: cmp x14, #64
-; CHECK-GI-NEXT: lsl x14, x5, #63
-; CHECK-GI-NEXT: orr x11, x11, x18
-; CHECK-GI-NEXT: bic x13, x13, x12
-; CHECK-GI-NEXT: csel x18, x1, xzr, lo
-; CHECK-GI-NEXT: csel x11, x11, x17, lo
+; CHECK-GI-NEXT: orr x16, x16, x0
+; CHECK-GI-NEXT: bic x11, x13, x11
+; CHECK-GI-NEXT: csel x14, x19, xzr, lo
+; CHECK-GI-NEXT: csel x16, x16, x17, lo
; CHECK-GI-NEXT: tst x12, #0x7f
-; CHECK-GI-NEXT: lsr x12, x5, #1
-; CHECK-GI-NEXT: orr x14, x14, x4, lsr #1
-; CHECK-GI-NEXT: lsl x17, x7, #63
-; CHECK-GI-NEXT: sub x1, x10, x0
-; CHECK-GI-NEXT: csel x11, x3, x11, eq
-; CHECK-GI-NEXT: sub x2, x0, #64
-; CHECK-GI-NEXT: lsr x3, x14, x0
-; CHECK-GI-NEXT: lsl x1, x12, x1
-; CHECK-GI-NEXT: lsr x4, x7, #1
-; CHECK-GI-NEXT: orr x17, x17, x6, lsr #1
-; CHECK-GI-NEXT: lsr x2, x12, x2
-; CHECK-GI-NEXT: cmp x0, #64
-; CHECK-GI-NEXT: orr x1, x3, x1
-; CHECK-GI-NEXT: sub x10, x10, x13
-; CHECK-GI-NEXT: lsr x12, x12, x0
-; CHECK-GI-NEXT: csel x1, x1, x2, lo
-; CHECK-GI-NEXT: tst x15, #0x7f
-; CHECK-GI-NEXT: sub x15, x13, #64
-; CHECK-GI-NEXT: lsr x2, x17, x13
-; CHECK-GI-NEXT: lsl x10, x4, x10
-; CHECK-GI-NEXT: csel x14, x14, x1, eq
-; CHECK-GI-NEXT: cmp x0, #64
-; CHECK-GI-NEXT: lsr x15, x4, x15
-; CHECK-GI-NEXT: lsr x0, x4, x13
-; CHECK-GI-NEXT: csel x12, x12, xzr, lo
-; CHECK-GI-NEXT: orr x10, x2, x10
-; CHECK-GI-NEXT: cmp x13, #64
-; CHECK-GI-NEXT: csel x10, x10, x15, lo
-; CHECK-GI-NEXT: tst x8, #0x7f
-; CHECK-GI-NEXT: orr x1, x16, x12
-; CHECK-GI-NEXT: csel x8, x17, x10, eq
-; CHECK-GI-NEXT: cmp x13, #64
-; CHECK-GI-NEXT: csel x10, x0, xzr, lo
-; CHECK-GI-NEXT: orr x0, x9, x14
-; CHECK-GI-NEXT: orr x2, x18, x8
-; CHECK-GI-NEXT: orr x3, x11, x10
+; CHECK-GI-NEXT: lsr x17, x5, #1
+; CHECK-GI-NEXT: extr x0, x5, x4, #1
+; CHECK-GI-NEXT: bic x12, x13, x12
+; CHECK-GI-NEXT: csel x13, x3, x16, eq
+; CHECK-GI-NEXT: sub x16, x9, x11
+; CHECK-GI-NEXT: sub x1, x11, #64
+; CHECK-GI-NEXT: lsr x3, x7, #1
+; CHECK-GI-NEXT: lsr x2, x0, x11
+; CHECK-GI-NEXT: lsl x16, x17, x16
+; CHECK-GI-NEXT: extr x4, x7, x6, #1
+; CHECK-GI-NEXT: lsr x1, x17, x1
+; CHECK-GI-NEXT: cmp x11, #64
+; CHECK-GI-NEXT: sub x9, x9, x12
+; CHECK-GI-NEXT: orr x16, x2, x16
+; CHECK-GI-NEXT: lsr x17, x17, x11
+; CHECK-GI-NEXT: lsl x9, x3, x9
+; CHECK-GI-NEXT: csel x16, x16, x1, lo
+; CHECK-GI-NEXT: tst x18, #0x7f
+; CHECK-GI-NEXT: sub x18, x12, #64
+; CHECK-GI-NEXT: lsr x1, x4, x12
+; CHECK-GI-NEXT: csel x16, x0, x16, eq
+; CHECK-GI-NEXT: cmp x11, #64
+; CHECK-GI-NEXT: lsr x11, x3, x18
+; CHECK-GI-NEXT: csel x17, x17, xzr, lo
+; CHECK-GI-NEXT: cmp x12, #64
+; CHECK-GI-NEXT: orr x9, x1, x9
+; CHECK-GI-NEXT: lsr x18, x3, x12
+; CHECK-GI-NEXT: orr x0, x8, x16
+; CHECK-GI-NEXT: csel x9, x9, x11, lo
+; CHECK-GI-NEXT: tst x10, #0x7f
+; CHECK-GI-NEXT: orr x1, x15, x17
+; CHECK-GI-NEXT: csel x9, x4, x9, eq
+; CHECK-GI-NEXT: cmp x12, #64
+; CHECK-GI-NEXT: csel x10, x18, xzr, lo
+; CHECK-GI-NEXT: orr x2, x14, x9
+; CHECK-GI-NEXT: orr x3, x13, x10
; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
@@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
; CHECK-GI-LABEL: fshr_v2i128:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr x9, [sp]
-; CHECK-GI-NEXT: lsl x12, x1, #1
-; CHECK-GI-NEXT: mov w11, #127 // =0x7f
-; CHECK-GI-NEXT: mov w14, #64 // =0x40
-; CHECK-GI-NEXT: lsl x15, x0, #1
+; CHECK-GI-NEXT: mov w10, #127 // =0x7f
+; CHECK-GI-NEXT: mov w12, #64 // =0x40
+; CHECK-GI-NEXT: lsl x13, x0, #1
+; CHECK-GI-NEXT: extr x14, x1, x0, #63
; CHECK-GI-NEXT: ldr x8, [sp, #16]
-; CHECK-GI-NEXT: bic x13, x11, x9
-; CHECK-GI-NEXT: orr x12, x12, x0, lsr #63
-; CHECK-GI-NEXT: lsl x1, x3, #1
-; CHECK-GI-NEXT: sub x17, x14, x13
-; CHECK-GI-NEXT: sub x18, x13, #64
-; CHECK-GI-NEXT: lsl x3, x15, x13
-; CHECK-GI-NEXT: lsr x17, x15, x17
-; CHECK-GI-NEXT: lsl x0, x12, x13
-; CHECK-GI-NEXT: lsl x15, x15, x18
-; CHECK-GI-NEXT: bic x11, x11, x8
+; CHECK-GI-NEXT: bic x11, x10, x9
+; CHECK-GI-NEXT: mvn x16, x9
+; CHECK-GI-NEXT: and x15, x9, #0x7f
+; CHECK-GI-NEXT: sub x17, x12, x11
+; CHECK-GI-NEXT: sub x18, x11, #64
+; CHECK-GI-NEXT: lsl x0, x14, x11
+; CHECK-GI-NEXT: lsr x17, x13, x17
+; CHECK-GI-NEXT: lsl x1, x13, x11
+; CHECK-GI-NEXT: lsl x13, x13, x18
+; CHECK-GI-NEXT: bic x10, x10, x8
; CHECK-GI-NEXT: lsl x18, x2, #1
-; CHECK-GI-NEXT: cmp x13, #64
+; CHECK-GI-NEXT: cmp x11, #64
; CHECK-GI-NEXT: orr x17, x17, x0
-; CHECK-GI-NEXT: orr x13, x1, x2, lsr #63
-; CHECK-GI-NEXT: mvn x16, x9
-; CHECK-GI-NEXT: csel x15, x17, x15, lo
-; CHECK-GI-NEXT: sub x17, x14, x11
-; CHECK-GI-NEXT: csel x0, x3, xzr, lo
+; CHECK-GI-NEXT: extr x11, x3, x2, #63
+; CHECK-GI-NEXT: csel x0, x1, xzr, lo
+; CHECK-GI-NEXT: csel x13, x17, x13, lo
+; CHECK-GI-NEXT: sub x17, x12, x10
; CHECK-GI-NEXT: tst x16, #0x7f
-; CHECK-GI-NEXT: sub x16, x11, #64
+; CHECK-GI-NEXT: sub x16, x10, #64
; CHECK-GI-NEXT: lsr x17, x18, x17
-; CHECK-GI-NEXT: lsl x2, x13, x11
-; CHECK-GI-NEXT: lsl x1, x18, x11
-; CHECK-GI-NEXT: csel x12, x12, x15, eq
-; CHECK-GI-NEXT: lsl x15, x18, x16
-; CHECK-GI-NEXT: and x10, x9, #0x7f
-; CHECK-GI-NEXT: cmp x11, #64
-; CHECK-GI-NEXT: mvn x11, x8
+; CHECK-GI-NEXT: lsl x2, x11, x10
+; CHECK-GI-NEXT: lsl x1, x18, x10
+; CHECK-GI-NEXT: csel x13, x14, x13, eq
+; CHECK-GI-NEXT: lsl x14, x18, x16
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: mvn x10, x8
; CHECK-GI-NEXT: orr x16, x17, x2
; CHECK-GI-NEXT: csel x17, x1, xzr, lo
-; CHECK-GI-NEXT: csel x15, x16, x15, lo
-; CHECK-GI-NEXT: tst x11, #0x7f
-; CHECK-GI-NEXT: sub x11, x14, x10
-; CHECK-GI-NEXT: sub x16, x10, #64
-; CHECK-GI-NEXT: lsr x18, x4, x10
-; CHECK-GI-NEXT: lsl x11, x5, x11
-; CHECK-GI-NEXT: csel x13, x13, x15, eq
-; CHECK-GI-NEXT: lsr x15, x5, x16
+; CHECK-GI-NEXT: csel x14, x16, x14, lo
+; CHECK-GI-NEXT: tst x10, #0x7f
+; CHECK-GI-NEXT: sub x10, x12, x15
+; CHECK-GI-NEXT: sub x16, x15, #64
+; CHECK-GI-NEXT: lsr x18, x4, x15
+; CHECK-GI-NEXT: lsl x10, x5, x10
+; CHECK-GI-NEXT: csel x11, x11, x14, eq
+; CHECK-GI-NEXT: lsr x14, x5, x16
; CHECK-GI-NEXT: and x1, x8, #0x7f
-; CHECK-GI-NEXT: orr x11, x18, x11
-; CHECK-GI-NEXT: cmp x10, #64
-; CHECK-GI-NEXT: lsr x16, x5, x10
-; CHECK-GI-NEXT: csel x11, x11, x15, lo
+; CHECK-GI-NEXT: cmp x15, #64
+; CHECK-GI-NEXT: lsr x16, x5, x15
+; CHECK-GI-NEXT: orr x10, x18, x10
+; CHECK-GI-NEXT: csel x10, x10, x14, lo
; CHECK-GI-NEXT: tst x9, #0x7f
-; CHECK-GI-NEXT: sub x9, x14, x1
-; CHECK-GI-NEXT: sub x14, x1, #64
-; CHECK-GI-NEXT: lsr x15, x6, x1
+; CHECK-GI-NEXT: sub x9, x12, x1
+; CHECK-GI-NEXT: sub x12, x1, #64
+; CHECK-GI-NEXT: lsr x14, x6, x1
; CHECK-GI-NEXT: lsl x9, x7, x9
-; CHECK-GI-NEXT: csel x11, x4, x11, eq
-; CHECK-GI-NEXT: cmp x10, #64
-; CHECK-GI-NEXT: lsr x10, x7, x14
-; CHECK-GI-NEXT: csel x14, x16, xzr, lo
-; CHECK-GI-NEXT: orr x9, x15, x9
+; CHECK-GI-NEXT: csel x10, x4, x10, eq
+; CHECK-GI-NEXT: cmp x15, #64
+; CHECK-GI-NEXT: lsr x12, x7, x12
+; CHECK-GI-NEXT: csel x15, x16, xzr, lo
+; CHECK-GI-NEXT: orr x9, x14, x9
; CHECK-GI-NEXT: cmp x1, #64
-; CHECK-GI-NEXT: lsr x15, x7, x1
-; CHECK-GI-NEXT: csel x9, x9, x10, lo
+; CHECK-GI-NEXT: lsr x14, x7, x1
+; CHECK-GI-NEXT: csel x9, x9, x12, lo
; CHECK-GI-NEXT: tst x8, #0x7f
; CHECK-GI-NEXT: csel x8, x6, x9, eq
; CHECK-GI-NEXT: cmp x1, #64
-; CHECK-GI-NEXT: orr x0, x0, x11
-; CHECK-GI-NEXT: csel x9, x15, xzr, lo
-; CHECK-GI-NEXT: orr x1, x12, x14
+; CHECK-GI-NEXT: orr x0, x0, x10
+; CHECK-GI-NEXT: csel x9, x14, xzr, lo
+; CHECK-GI-NEXT: orr x1, x13, x15
; CHECK-GI-NEXT: orr x2, x17, x8
-; CHECK-GI-NEXT: orr x3, x13, x9
+; CHECK-GI-NEXT: orr x3, x11, x9
; CHECK-GI-NEXT: ret
entry:
%d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c)
@@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) {
;
; CHECK-GI-LABEL: rotl_v2i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x1, #61
-; CHECK-GI-NEXT: lsl x9, x1, #3
-; CHECK-GI-NEXT: lsl x10, x3, #3
-; CHECK-GI-NEXT: lsr x11, x3, #61
-; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3
-; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61
-; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61
-; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT: extr x8, x0, x1, #61
+; CHECK-GI-NEXT: extr x9, x3, x2, #61
+; CHECK-GI-NEXT: extr x1, x1, x0, #61
+; CHECK-GI-NEXT: extr x2, x2, x3, #61
; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: mov x3, x9
; CHECK-GI-NEXT: ret
entry:
%d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
@@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) {
;
; CHECK-GI-LABEL: rotr_v2i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl x8, x1, #61
-; CHECK-GI-NEXT: lsl x9, x3, #61
-; CHECK-GI-NEXT: lsl x10, x0, #61
-; CHECK-GI-NEXT: lsl x11, x2, #61
-; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3
-; CHECK-GI-NEXT: orr x2, x9, x2, lsr #3
-; CHECK-GI-NEXT: orr x1, x10, x1, lsr #3
-; CHECK-GI-NEXT: orr x3, x11, x3, lsr #3
+; CHECK-GI-NEXT: extr x8, x1, x0, #3
+; CHECK-GI-NEXT: extr x9, x3, x2, #3
+; CHECK-GI-NEXT: extr x1, x0, x1, #3
+; CHECK-GI-NEXT: extr x3, x2, x3, #3
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: mov x2, x9
; CHECK-GI-NEXT: ret
entry:
%d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
@@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) {
;
; CHECK-GI-LABEL: fshl_v2i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x5, #61
-; CHECK-GI-NEXT: lsl x9, x1, #3
-; CHECK-GI-NEXT: lsl x10, x3, #3
-; CHECK-GI-NEXT: lsr x11, x7, #61
-; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3
-; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61
-; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61
-; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT: extr x8, x0, x5, #61
+; CHECK-GI-NEXT: extr x1, x1, x0, #61
+; CHECK-GI-NEXT: extr x3, x3, x2, #61
+; CHECK-GI-NEXT: extr x2, x2, x7, #61
; CHECK-GI-NEXT: mov x0, x8
; CHECK-GI-NEXT: ret
entry:
@@ -4480,29 +4445,15 @@ entry:
}
define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) {
-; CHECK-SD-LABEL: fshr_v2i128_c:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: extr x8, x5, x4, #3
-; CHECK-SD-NEXT: extr x9, x7, x6, #3
-; CHECK-SD-NEXT: extr x1, x0, x5, #3
-; CHECK-SD-NEXT: extr x3, x2, x7, #3
-; CHECK-SD-NEXT: mov x0, x8
-; CHECK-SD-NEXT: mov x2, x9
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fshr_v2i128_c:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl x8, x5, #61
-; CHECK-GI-NEXT: lsl x9, x7, #61
-; CHECK-GI-NEXT: lsr x10, x5, #3
-; CHECK-GI-NEXT: lsr x11, x7, #3
-; CHECK-GI-NEXT: orr x8, x8, x4, lsr #3
-; CHECK-GI-NEXT: orr x9, x9, x6, lsr #3
-; CHECK-GI-NEXT: orr x1, x10, x0, lsl #61
-; CHECK-GI-NEXT: orr x3, x11, x2, lsl #61
-; CHECK-GI-NEXT: mov x0, x8
-; CHECK-GI-NEXT: mov x2, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fshr_v2i128_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr x8, x5, x4, #3
+; CHECK-NEXT: extr x9, x7, x6, #3
+; CHECK-NEXT: extr x1, x0, x5, #3
+; CHECK-NEXT: extr x3, x2, x7, #3
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x2, x9
+; CHECK-NEXT: ret
entry:
%d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>)
ret <2 x i128> %d
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index f9fd2ad..90fb102 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
;
; CHECK-GI-LABEL: fshl_i128:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #64 // =0x40
; CHECK-GI-NEXT: and x9, x4, #0x7f
-; CHECK-GI-NEXT: mov w10, #64 // =0x40
-; CHECK-GI-NEXT: lsl x14, x3, #63
-; CHECK-GI-NEXT: sub x12, x10, x9
+; CHECK-GI-NEXT: mov w10, #127 // =0x7f
+; CHECK-GI-NEXT: sub x12, x8, x9
; CHECK-GI-NEXT: lsl x13, x1, x9
-; CHECK-GI-NEXT: mov w8, #127 // =0x7f
+; CHECK-GI-NEXT: bic x10, x10, x4
; CHECK-GI-NEXT: lsr x12, x0, x12
-; CHECK-GI-NEXT: bic x8, x8, x4
-; CHECK-GI-NEXT: sub x15, x9, #64
+; CHECK-GI-NEXT: sub x14, x9, #64
+; CHECK-GI-NEXT: lsl x15, x0, x9
+; CHECK-GI-NEXT: extr x16, x3, x2, #1
; CHECK-GI-NEXT: cmp x9, #64
-; CHECK-GI-NEXT: lsl x9, x0, x9
-; CHECK-GI-NEXT: lsl x15, x0, x15
-; CHECK-GI-NEXT: orr x12, x12, x13
-; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1
-; CHECK-GI-NEXT: lsr x14, x3, #1
-; CHECK-GI-NEXT: sub x10, x10, x8
-; CHECK-GI-NEXT: sub x16, x8, #64
-; CHECK-GI-NEXT: csel x9, x9, xzr, lo
-; CHECK-GI-NEXT: lsr x17, x13, x8
-; CHECK-GI-NEXT: lsl x10, x14, x10
-; CHECK-GI-NEXT: csel x12, x12, x15, lo
+; CHECK-GI-NEXT: sub x8, x8, x10
+; CHECK-GI-NEXT: orr x9, x12, x13
+; CHECK-GI-NEXT: lsr x12, x3, #1
+; CHECK-GI-NEXT: lsl x13, x0, x14
+; CHECK-GI-NEXT: csel x14, x15, xzr, lo
+; CHECK-GI-NEXT: sub x15, x10, #64
+; CHECK-GI-NEXT: lsr x17, x16, x10
+; CHECK-GI-NEXT: lsl x8, x12, x8
+; CHECK-GI-NEXT: csel x9, x9, x13, lo
; CHECK-GI-NEXT: tst x4, #0x7f
-; CHECK-GI-NEXT: lsr x15, x14, x16
+; CHECK-GI-NEXT: lsr x13, x12, x15
; CHECK-GI-NEXT: mvn x11, x4
-; CHECK-GI-NEXT: csel x12, x1, x12, eq
-; CHECK-GI-NEXT: orr x10, x17, x10
-; CHECK-GI-NEXT: cmp x8, #64
-; CHECK-GI-NEXT: lsr x14, x14, x8
-; CHECK-GI-NEXT: csel x10, x10, x15, lo
+; CHECK-GI-NEXT: csel x9, x1, x9, eq
+; CHECK-GI-NEXT: orr x8, x17, x8
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: lsr x12, x12, x10
+; CHECK-GI-NEXT: csel x8, x8, x13, lo
; CHECK-GI-NEXT: tst x11, #0x7f
-; CHECK-GI-NEXT: csel x10, x13, x10, eq
-; CHECK-GI-NEXT: cmp x8, #64
-; CHECK-GI-NEXT: csel x8, x14, xzr, lo
-; CHECK-GI-NEXT: orr x0, x9, x10
-; CHECK-GI-NEXT: orr x1, x12, x8
+; CHECK-GI-NEXT: csel x8, x16, x8, eq
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: csel x10, x12, xzr, lo
+; CHECK-GI-NEXT: orr x0, x14, x8
+; CHECK-GI-NEXT: orr x1, x9, x10
; CHECK-GI-NEXT: ret
%f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
ret i128 %f
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1cb92e4..87b1108 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -559,20 +559,18 @@ define i128 @ui128_7(i128 %a, i128 %b) {
; CHECK-GI-NEXT: add x8, x8, x10
; CHECK-GI-NEXT: subs x10, x0, x9
; CHECK-GI-NEXT: sbc x11, x1, x8
-; CHECK-GI-NEXT: lsl x12, x11, #63
+; CHECK-GI-NEXT: extr x10, x11, x10, #1
; CHECK-GI-NEXT: lsr x11, x11, #1
-; CHECK-GI-NEXT: orr x10, x12, x10, lsr #1
; CHECK-GI-NEXT: adds x9, x10, x9
+; CHECK-GI-NEXT: mov w10, #7 // =0x7
; CHECK-GI-NEXT: adc x8, x11, x8
-; CHECK-GI-NEXT: lsl x10, x8, #62
+; CHECK-GI-NEXT: extr x9, x8, x9, #2
; CHECK-GI-NEXT: lsr x8, x8, #2
-; CHECK-GI-NEXT: orr x9, x10, x9, lsr #2
-; CHECK-GI-NEXT: mov w10, #7 // =0x7
-; CHECK-GI-NEXT: lsl x12, x8, #3
; CHECK-GI-NEXT: umulh x10, x9, x10
; CHECK-GI-NEXT: lsl x11, x9, #3
-; CHECK-GI-NEXT: sub x8, x12, x8
+; CHECK-GI-NEXT: lsl x12, x8, #3
; CHECK-GI-NEXT: sub x9, x11, x9
+; CHECK-GI-NEXT: sub x8, x12, x8
; CHECK-GI-NEXT: subs x0, x0, x9
; CHECK-GI-NEXT: add x8, x8, x10
; CHECK-GI-NEXT: sbc x1, x1, x8
@@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) {
; CHECK-GI-NEXT: add x10, x11, x12
; CHECK-GI-NEXT: add x8, x8, x14
; CHECK-GI-NEXT: add x8, x8, x10
-; CHECK-GI-NEXT: lsl x10, x8, #60
-; CHECK-GI-NEXT: lsr x8, x8, #4
-; CHECK-GI-NEXT: orr x9, x10, x9, lsr #4
; CHECK-GI-NEXT: mov w10, #100 // =0x64
+; CHECK-GI-NEXT: extr x9, x8, x9, #4
+; CHECK-GI-NEXT: lsr x8, x8, #4
; CHECK-GI-NEXT: umulh x11, x9, x10
; CHECK-GI-NEXT: mul x9, x9, x10
; CHECK-GI-NEXT: madd x8, x8, x10, x11
@@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-GI-NEXT: sbc x14, x1, x12
; CHECK-GI-NEXT: add x8, x8, x13
; CHECK-GI-NEXT: subs x13, x2, x10
-; CHECK-GI-NEXT: lsl x15, x14, #63
-; CHECK-GI-NEXT: sbc x16, x3, x8
+; CHECK-GI-NEXT: extr x9, x14, x9, #1
+; CHECK-GI-NEXT: sbc x15, x3, x8
; CHECK-GI-NEXT: lsr x14, x14, #1
-; CHECK-GI-NEXT: orr x9, x15, x9, lsr #1
-; CHECK-GI-NEXT: lsl x15, x16, #63
-; CHECK-GI-NEXT: orr x13, x15, x13, lsr #1
+; CHECK-GI-NEXT: extr x13, x15, x13, #1
; CHECK-GI-NEXT: adds x9, x9, x11
-; CHECK-GI-NEXT: lsr x11, x16, #1
+; CHECK-GI-NEXT: lsr x11, x15, #1
; CHECK-GI-NEXT: adc x12, x14, x12
; CHECK-GI-NEXT: adds x10, x13, x10
-; CHECK-GI-NEXT: lsl x13, x12, #62
-; CHECK-GI-NEXT: lsr x12, x12, #2
-; CHECK-GI-NEXT: adc x8, x11, x8
-; CHECK-GI-NEXT: lsl x11, x8, #62
-; CHECK-GI-NEXT: orr x9, x13, x9, lsr #2
+; CHECK-GI-NEXT: extr x9, x12, x9, #2
; CHECK-GI-NEXT: mov w13, #7 // =0x7
+; CHECK-GI-NEXT: adc x8, x11, x8
+; CHECK-GI-NEXT: lsr x11, x12, #2
+; CHECK-GI-NEXT: extr x10, x8, x10, #2
+; CHECK-GI-NEXT: umulh x12, x9, x13
; CHECK-GI-NEXT: lsr x8, x8, #2
-; CHECK-GI-NEXT: lsl x14, x12, #3
-; CHECK-GI-NEXT: orr x10, x11, x10, lsr #2
-; CHECK-GI-NEXT: umulh x11, x9, x13
+; CHECK-GI-NEXT: lsl x14, x11, #3
; CHECK-GI-NEXT: lsl x15, x9, #3
-; CHECK-GI-NEXT: sub x12, x14, x12
-; CHECK-GI-NEXT: lsl x16, x8, #3
; CHECK-GI-NEXT: umulh x13, x10, x13
+; CHECK-GI-NEXT: lsl x16, x8, #3
+; CHECK-GI-NEXT: sub x11, x14, x11
; CHECK-GI-NEXT: lsl x14, x10, #3
; CHECK-GI-NEXT: sub x9, x15, x9
; CHECK-GI-NEXT: sub x8, x16, x8
; CHECK-GI-NEXT: subs x0, x0, x9
+; CHECK-GI-NEXT: add x11, x11, x12
; CHECK-GI-NEXT: sub x10, x14, x10
-; CHECK-GI-NEXT: add x11, x12, x11
; CHECK-GI-NEXT: sbc x1, x1, x11
; CHECK-GI-NEXT: subs x2, x2, x10
; CHECK-GI-NEXT: add x8, x8, x13
@@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov x10, #23593 // =0x5c29
; CHECK-GI-NEXT: mov x8, #62914 // =0xf5c2
-; CHECK-GI-NEXT: sub x18, x0, x0
+; CHECK-GI-NEXT: and x5, xzr, #0x1
; CHECK-GI-NEXT: movk x10, #49807, lsl #16
; CHECK-GI-NEXT: movk x8, #23592, lsl #16
+; CHECK-GI-NEXT: umulh x18, x0, xzr
; CHECK-GI-NEXT: movk x10, #10485, lsl #32
; CHECK-GI-NEXT: movk x8, #49807, lsl #32
; CHECK-GI-NEXT: movk x10, #36700, lsl #48
@@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
; CHECK-GI-NEXT: umulh x15, x1, x10
; CHECK-GI-NEXT: cset w12, hs
; CHECK-GI-NEXT: cmn x11, x13
-; CHECK-GI-NEXT: and x11, x12, #0x1
-; CHECK-GI-NEXT: umulh x16, x0, x8
-; CHECK-GI-NEXT: cset w12, hs
+; CHECK-GI-NEXT: sub x13, x0, x0
; CHECK-GI-NEXT: and x12, x12, #0x1
-; CHECK-GI-NEXT: add x14, x14, x18
-; CHECK-GI-NEXT: add x11, x11, x12
-; CHECK-GI-NEXT: and x12, xzr, #0x1
+; CHECK-GI-NEXT: umulh x16, x0, x8
+; CHECK-GI-NEXT: cset w11, hs
+; CHECK-GI-NEXT: add x13, x14, x13
+; CHECK-GI-NEXT: and x11, x11, #0x1
+; CHECK-GI-NEXT: and x14, xzr, #0x1
; CHECK-GI-NEXT: umulh x9, xzr, x10
-; CHECK-GI-NEXT: adds x14, x14, x15
-; CHECK-GI-NEXT: and x15, xzr, #0x1
+; CHECK-GI-NEXT: add x11, x12, x11
+; CHECK-GI-NEXT: add x12, x5, x14
+; CHECK-GI-NEXT: adds x13, x13, x15
; CHECK-GI-NEXT: umulh x17, x1, x8
-; CHECK-GI-NEXT: cset w4, hs
-; CHECK-GI-NEXT: add x15, x12, x15
-; CHECK-GI-NEXT: adds x12, x14, x16
-; CHECK-GI-NEXT: and x4, x4, #0x1
-; CHECK-GI-NEXT: mul x18, x3, x10
; CHECK-GI-NEXT: cset w14, hs
-; CHECK-GI-NEXT: adds x12, x12, x11
-; CHECK-GI-NEXT: add x11, x15, x4
; CHECK-GI-NEXT: and x14, x14, #0x1
-; CHECK-GI-NEXT: cset w15, hs
-; CHECK-GI-NEXT: mul x5, x2, x8
-; CHECK-GI-NEXT: add x11, x11, x14
-; CHECK-GI-NEXT: and x14, x15, #0x1
-; CHECK-GI-NEXT: add x17, x9, x17
-; CHECK-GI-NEXT: add x14, x11, x14
-; CHECK-GI-NEXT: mov w11, #100 // =0x64
-; CHECK-GI-NEXT: umulh x13, x0, xzr
-; CHECK-GI-NEXT: umulh x16, x2, x10
-; CHECK-GI-NEXT: adds x18, x18, x5
-; CHECK-GI-NEXT: mul x15, x3, x8
-; CHECK-GI-NEXT: add x13, x17, x13
-; CHECK-GI-NEXT: cset w17, hs
-; CHECK-GI-NEXT: umulh x10, x3, x10
-; CHECK-GI-NEXT: add x13, x13, x14
-; CHECK-GI-NEXT: and x17, x17, #0x1
-; CHECK-GI-NEXT: cmn x18, x16
-; CHECK-GI-NEXT: sub x18, x2, x2
-; CHECK-GI-NEXT: umulh x16, x2, x8
+; CHECK-GI-NEXT: adds x13, x13, x16
+; CHECK-GI-NEXT: mul x4, x3, x10
+; CHECK-GI-NEXT: add x12, x12, x14
; CHECK-GI-NEXT: cset w14, hs
-; CHECK-GI-NEXT: and x14, x14, #0x1
-; CHECK-GI-NEXT: add x15, x15, x18
+; CHECK-GI-NEXT: adds x11, x13, x11
+; CHECK-GI-NEXT: and x13, x14, #0x1
+; CHECK-GI-NEXT: mul x15, x2, x8
+; CHECK-GI-NEXT: cset w14, hs
+; CHECK-GI-NEXT: add x12, x12, x13
+; CHECK-GI-NEXT: and x13, x14, #0x1
+; CHECK-GI-NEXT: add x14, x9, x17
+; CHECK-GI-NEXT: sub x17, x2, x2
+; CHECK-GI-NEXT: umulh x16, x2, x10
+; CHECK-GI-NEXT: add x12, x12, x13
+; CHECK-GI-NEXT: add x13, x14, x18
+; CHECK-GI-NEXT: add x12, x13, x12
; CHECK-GI-NEXT: and x18, xzr, #0x1
-; CHECK-GI-NEXT: add x14, x17, x14
+; CHECK-GI-NEXT: mul x5, x3, x8
+; CHECK-GI-NEXT: extr x11, x12, x11, #4
+; CHECK-GI-NEXT: adds x13, x4, x15
+; CHECK-GI-NEXT: umulh x14, x3, x10
+; CHECK-GI-NEXT: cset w15, hs
+; CHECK-GI-NEXT: mov w10, #100 // =0x64
+; CHECK-GI-NEXT: cmn x13, x16
+; CHECK-GI-NEXT: and x15, x15, #0x1
+; CHECK-GI-NEXT: umulh x13, x2, x8
+; CHECK-GI-NEXT: cset w16, hs
+; CHECK-GI-NEXT: add x17, x5, x17
+; CHECK-GI-NEXT: and x16, x16, #0x1
; CHECK-GI-NEXT: umulh x8, x3, x8
+; CHECK-GI-NEXT: add x15, x15, x16
+; CHECK-GI-NEXT: adds x14, x17, x14
; CHECK-GI-NEXT: and x17, xzr, #0x1
-; CHECK-GI-NEXT: adds x10, x15, x10
-; CHECK-GI-NEXT: add x15, x17, x18
+; CHECK-GI-NEXT: add x16, x18, x17
; CHECK-GI-NEXT: cset w17, hs
-; CHECK-GI-NEXT: umulh x18, x2, xzr
+; CHECK-GI-NEXT: adds x13, x14, x13
+; CHECK-GI-NEXT: umulh x14, x2, xzr
; CHECK-GI-NEXT: and x17, x17, #0x1
-; CHECK-GI-NEXT: adds x10, x10, x16
-; CHECK-GI-NEXT: lsl x16, x13, #60
-; CHECK-GI-NEXT: add x15, x15, x17
-; CHECK-GI-NEXT: cset w17, hs
-; CHECK-GI-NEXT: adds x10, x10, x14
-; CHECK-GI-NEXT: and x14, x17, #0x1
+; CHECK-GI-NEXT: cset w18, hs
+; CHECK-GI-NEXT: adds x13, x13, x15
+; CHECK-GI-NEXT: add x15, x16, x17
+; CHECK-GI-NEXT: and x16, x18, #0x1
; CHECK-GI-NEXT: cset w17, hs
; CHECK-GI-NEXT: add x8, x9, x8
-; CHECK-GI-NEXT: add x14, x15, x14
-; CHECK-GI-NEXT: and x15, x17, #0x1
-; CHECK-GI-NEXT: orr x12, x16, x12, lsr #4
-; CHECK-GI-NEXT: add x9, x14, x15
-; CHECK-GI-NEXT: add x8, x8, x18
-; CHECK-GI-NEXT: add x8, x8, x9
-; CHECK-GI-NEXT: lsr x9, x13, #4
-; CHECK-GI-NEXT: umulh x14, x12, x11
-; CHECK-GI-NEXT: lsl x13, x8, #60
+; CHECK-GI-NEXT: add x15, x15, x16
+; CHECK-GI-NEXT: and x16, x17, #0x1
+; CHECK-GI-NEXT: lsr x9, x12, #4
+; CHECK-GI-NEXT: add x15, x15, x16
+; CHECK-GI-NEXT: umulh x17, x11, x10
+; CHECK-GI-NEXT: add x8, x8, x14
+; CHECK-GI-NEXT: add x8, x8, x15
+; CHECK-GI-NEXT: mul x11, x11, x10
+; CHECK-GI-NEXT: extr x12, x8, x13, #4
; CHECK-GI-NEXT: lsr x8, x8, #4
-; CHECK-GI-NEXT: mul x12, x12, x11
-; CHECK-GI-NEXT: orr x10, x13, x10, lsr #4
-; CHECK-GI-NEXT: madd x9, x9, x11, x14
-; CHECK-GI-NEXT: umulh x13, x10, x11
-; CHECK-GI-NEXT: subs x0, x0, x12
-; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: madd x9, x9, x10, x17
+; CHECK-GI-NEXT: umulh x13, x12, x10
+; CHECK-GI-NEXT: subs x0, x0, x11
+; CHECK-GI-NEXT: mul x12, x12, x10
; CHECK-GI-NEXT: sbc x1, x1, x9
-; CHECK-GI-NEXT: madd x8, x8, x11, x13
-; CHECK-GI-NEXT: subs x2, x2, x10
+; CHECK-GI-NEXT: madd x8, x8, x10, x13
+; CHECK-GI-NEXT: subs x2, x2, x12
; CHECK-GI-NEXT: sbc x3, x3, x8
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index e4f9efa..0504959 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -351,7 +351,6 @@ define i64 @test_many_callee_arguments(
ret i64 %ret
}
-; FIXME: The new lowering should avoid saves/restores in the probing loop.
define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{
; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
; CHECK: // %bb.0:
@@ -389,16 +388,14 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: mov x8, sp
; CHECK-NEWLOWERING-NEXT: sub x19, x8, x0
-; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEWLOWERING-NEXT: cmp sp, x19
; CHECK-NEWLOWERING-NEXT: b.le .LBB7_3
; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
; CHECK-NEWLOWERING-NEXT: b .LBB7_1
; CHECK-NEWLOWERING-NEXT: .LBB7_3:
; CHECK-NEWLOWERING-NEXT: mov sp, x19
diff --git a/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll b/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll
index 63c6533..a5b7612 100644
--- a/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll
@@ -64,6 +64,6 @@ define i64 @test_sme_calling_convention_x2() nounwind {
ret i64 %pstate.sm
}
-declare void @__arm_tpidr2_save()
-declare i64 @__arm_get_current_vg()
-declare {i64, i64} @__arm_sme_state()
+declare aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save()
+declare aarch64_sme_preservemost_from_x1 i64 @__arm_get_current_vg()
+declare aarch64_sme_preservemost_from_x2 {i64, i64} @__arm_sme_state()
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index 18ea07e..c753e9c 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -228,65 +228,34 @@ exit:
ret void
}
-; FIXME: The codegen for this case could be improved (by tuning weights).
-; Here the ZA save has been hoisted out of the conditional, but would be better
-; to sink it.
define void @cond_private_za_call(i1 %cond) "aarch64_inout_za" nounwind {
-; CHECK-LABEL: cond_private_za_call:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEXT: tbz w0, #0, .LBB3_4
-; CHECK-NEXT: // %bb.1: // %private_za_call
-; CHECK-NEXT: sub x8, x29, #16
-; CHECK-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEXT: bl private_za_call
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB3_3
-; CHECK-NEXT: // %bb.2: // %private_za_call
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB3_3: // %private_za_call
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: .LBB3_4: // %exit
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: b shared_za_call
-;
-; CHECK-NEWLOWERING-LABEL: cond_private_za_call:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB3_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %private_za_call
-; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: .LBB3_2: // %exit
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB3_4
-; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB3_4: // %exit
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: b shared_za_call
+; CHECK-COMMON-LABEL: cond_private_za_call:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: mov x29, sp
+; CHECK-COMMON-NEXT: sub sp, sp, #16
+; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: mov x9, sp
+; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x9
+; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-COMMON-NEXT: tbz w0, #0, .LBB3_4
+; CHECK-COMMON-NEXT: // %bb.1: // %private_za_call
+; CHECK-COMMON-NEXT: sub x8, x29, #16
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
+; CHECK-COMMON-NEXT: bl private_za_call
+; CHECK-COMMON-NEXT: smstart za
+; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT: sub x0, x29, #16
+; CHECK-COMMON-NEXT: cbnz x8, .LBB3_3
+; CHECK-COMMON-NEXT: // %bb.2: // %private_za_call
+; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore
+; CHECK-COMMON-NEXT: .LBB3_3: // %private_za_call
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT: .LBB3_4: // %exit
+; CHECK-COMMON-NEXT: mov sp, x29
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: b shared_za_call
br i1 %cond, label %private_za_call, label %exit
private_za_call:
@@ -910,7 +879,7 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
; CHECK-NEWLOWERING-LABEL: loop_with_external_entry:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
@@ -923,23 +892,27 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %init
; CHECK-NEWLOWERING-NEXT: bl shared_za_call
; CHECK-NEWLOWERING-NEXT: .LBB11_2: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
+; CHECK-NEWLOWERING-NEXT: b .LBB11_4
; CHECK-NEWLOWERING-NEXT: .LBB11_3: // %loop
+; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB11_6
+; CHECK-NEWLOWERING-NEXT: .LBB11_4: // %loop
; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB11_3
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_6
-; CHECK-NEWLOWERING-NEXT: // %bb.5: // %exit
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_3
+; CHECK-NEWLOWERING-NEXT: // %bb.5: // %loop
+; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT: b .LBB11_3
; CHECK-NEWLOWERING-NEXT: .LBB11_6: // %exit
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index 3f35cb5..dcdc56c 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -63,25 +63,17 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe
; CHECK-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr]
; CHECK-NEXT: bl __cxa_throw
; CHECK-NEXT: .Ltmp1: // EH_LABEL
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB0_4
-; CHECK-NEXT: // %bb.3: // %throw_exception
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB0_4: // %throw_exception
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: // %bb.5: // %throw_fail
-; CHECK-NEXT: .LBB0_6: // %unwind_dtors
+; CHECK-NEXT: // %bb.3: // %throw_fail
+; CHECK-NEXT: .LBB0_4: // %unwind_dtors
; CHECK-NEXT: .Ltmp2: // EH_LABEL
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB0_8
-; CHECK-NEXT: // %bb.7: // %unwind_dtors
+; CHECK-NEXT: cbnz x8, .LBB0_6
+; CHECK-NEXT: // %bb.5: // %unwind_dtors
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB0_8: // %unwind_dtors
+; CHECK-NEXT: .LBB0_6: // %unwind_dtors
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: bl shared_za_call
; CHECK-NEXT: sub x8, x29, #16
diff --git a/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll
new file mode 100644
index 0000000..0306b27
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll
@@ -0,0 +1,296 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s
+
+; This test case was generated by lowering mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir to LLVM IR.
+; The actual contents of the function are not that important. The main interesting quality here is that many blocks
+; don't directly use ZA. The only blocks that require ZA are the MOPA (and load/stores) in the inner loop, and the
+; `printMemrefF32()` call in the exit block.
+;
+; If ZA states are not propagated in the MachineSMEABIPass, block %48 (which is within the outer loop) will
+; have an edge to block %226 (the exit block), which requires ZA in the "saved" state, and an edge to block %51
+; (which has no preference on ZA state). This means block %48 will also end up in the locally saved state.
+; This is not really what we want, as it means we will save/restore ZA in the outer loop. We can fix this by
+; propagating the "active" state from the inner loop through basic blocks with no preference, to ensure the outer
+; loop is in the "active" state too.
+;
+; If done correctly, the only ZA save/restore should be in the exit block (with all other blocks in the active state).
+
+define void @matmul(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, ptr %14, ptr %15, i64 %16, i64 %17, i64 %18, i64 %19, i64 %20) #0 {
+; Check for a ZA zero in the entry block, then no uses of TPIDR2_EL0 (for ZA saves/restore)
+; until the exit block (which contains the call to printMemrefF32).
+;
+; CHECK-LABEL: matmul:
+; CHECK: zero {za}
+; CHECK-NOT: TPIDR2_EL0
+; CHECK: msr TPIDR2_EL0, x{{.*}}
+; CHECK-NOT: .LBB{{.*}}
+; CHECK: bl printMemrefF32
+ %22 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %14, 0
+ %23 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %22, ptr %15, 1
+ %24 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %23, i64 %16, 2
+ %25 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %24, i64 %17, 3, 0
+ %26 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %25, i64 %19, 4, 0
+ %27 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %26, i64 %18, 3, 1
+ %28 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %27, i64 %20, 4, 1
+ %29 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %7, 0
+ %30 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %29, ptr %8, 1
+ %31 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %30, i64 %9, 2
+ %32 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %31, i64 %10, 3, 0
+ %33 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %32, i64 %12, 4, 0
+ %34 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %33, i64 %11, 3, 1
+ %35 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %34, i64 %13, 4, 1
+ %36 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %0, 0
+ %37 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %36, ptr %1, 1
+ %38 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %37, i64 %2, 2
+ %39 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %38, i64 %3, 3, 0
+ %40 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %39, i64 %5, 4, 0
+ %41 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %40, i64 %4, 3, 1
+ %42 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %41, i64 %6, 4, 1
+ %43 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0
+ %44 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1
+ %45 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 1
+ %46 = call i64 @llvm.vscale.i64()
+ %47 = mul i64 %46, 4
+ br label %48
+
+48: ; preds = %224, %21
+ %49 = phi i64 [ %225, %224 ], [ 0, %21 ]
+ %50 = icmp slt i64 %49, %43
+ br i1 %50, label %51, label %226
+
+51: ; preds = %48
+ %52 = sub i64 %43, %49
+ %53 = call i64 @llvm.smin.i64(i64 %47, i64 %52)
+ %54 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+ %55 = trunc i64 %53 to i32
+ %56 = insertelement <vscale x 4 x i32> poison, i32 %55, i32 0
+ %57 = shufflevector <vscale x 4 x i32> %56, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %58 = icmp slt <vscale x 4 x i32> %54, %57
+ br label %59
+
+59: ; preds = %222, %51
+ %60 = phi i64 [ %223, %222 ], [ 0, %51 ]
+ %61 = icmp slt i64 %60, %45
+ br i1 %61, label %62, label %224
+
+62: ; preds = %59
+ %63 = sub i64 %45, %60
+ %64 = call i64 @llvm.smin.i64(i64 %47, i64 %63)
+ %65 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 0
+ %66 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 1
+ %67 = insertvalue { ptr, ptr, i64 } poison, ptr %65, 0
+ %68 = insertvalue { ptr, ptr, i64 } %67, ptr %66, 1
+ %69 = insertvalue { ptr, ptr, i64 } %68, i64 0, 2
+ %70 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 2
+ %71 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 0
+ %72 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 1
+ %73 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 0
+ %74 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 1
+ %75 = mul nsw i64 %49, %73
+ %76 = add i64 %70, %75
+ %77 = mul nsw i64 %60, %74
+ %78 = add i64 %76, %77
+ %79 = extractvalue { ptr, ptr, i64 } %69, 0
+ %80 = extractvalue { ptr, ptr, i64 } %69, 1
+ %81 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %79, 0
+ %82 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %81, ptr %80, 1
+ %83 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %82, i64 %78, 2
+ %84 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %83, i64 %53, 3, 0
+ %85 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %84, i64 %73, 4, 0
+ %86 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %85, i64 %64, 3, 1
+ %87 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %86, i64 %74, 4, 1
+ %88 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+ %89 = trunc i64 %64 to i32
+ %90 = insertelement <vscale x 4 x i32> poison, i32 %89, i32 0
+ %91 = shufflevector <vscale x 4 x i32> %90, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %92 = icmp slt <vscale x 4 x i32> %88, %91
+ br label %93
+
+93: ; preds = %220, %62
+ %94 = phi i64 [ %221, %220 ], [ 0, %62 ]
+ %95 = icmp slt i64 %94, %44
+ br i1 %95, label %96, label %222
+
+96: ; preds = %93
+ %97 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 0
+ %98 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 1
+ %99 = insertvalue { ptr, ptr, i64 } poison, ptr %97, 0
+ %100 = insertvalue { ptr, ptr, i64 } %99, ptr %98, 1
+ %101 = insertvalue { ptr, ptr, i64 } %100, i64 0, 2
+ %102 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 2
+ %103 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0
+ %104 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1
+ %105 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 0
+ %106 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 1
+ %107 = mul nsw i64 %49, %105
+ %108 = add i64 %102, %107
+ %109 = mul nsw i64 %94, %106
+ %110 = add i64 %108, %109
+ %111 = extractvalue { ptr, ptr, i64 } %101, 0
+ %112 = extractvalue { ptr, ptr, i64 } %101, 1
+ %113 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %111, 0
+ %114 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %113, ptr %112, 1
+ %115 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %114, i64 %110, 2
+ %116 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %115, i64 %53, 3, 0
+ %117 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %116, i64 %105, 4, 0
+ br label %118
+
+118: ; preds = %133, %96
+ %119 = phi i64 [ %135, %133 ], [ 0, %96 ]
+ %120 = phi <vscale x 4 x float> [ %134, %133 ], [ poison, %96 ]
+ %121 = icmp slt i64 %119, %47
+ br i1 %121, label %122, label %136
+
+122: ; preds = %118
+ %123 = extractelement <vscale x 4 x i1> %58, i64 %119
+ br i1 %123, label %124, label %133
+
+124: ; preds = %122
+ %125 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 1
+ %126 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 2
+ %127 = getelementptr float, ptr %125, i64 %126
+ %128 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 4, 0
+ %129 = mul nuw nsw i64 %119, %128
+ %130 = getelementptr inbounds nuw float, ptr %127, i64 %129
+ %131 = load float, ptr %130, align 4
+ %132 = insertelement <vscale x 4 x float> %120, float %131, i64 %119
+ br label %133
+
+133: ; preds = %124, %122
+ %134 = phi <vscale x 4 x float> [ %132, %124 ], [ %120, %122 ]
+ %135 = add i64 %119, 1
+ br label %118
+
+136: ; preds = %118
+ %137 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 0
+ %138 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 1
+ %139 = insertvalue { ptr, ptr, i64 } poison, ptr %137, 0
+ %140 = insertvalue { ptr, ptr, i64 } %139, ptr %138, 1
+ %141 = insertvalue { ptr, ptr, i64 } %140, i64 0, 2
+ %142 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 2
+ %143 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 0
+ %144 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 1
+ %145 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 0
+ %146 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 1
+ %147 = mul nsw i64 %94, %145
+ %148 = add i64 %142, %147
+ %149 = mul nsw i64 %60, %146
+ %150 = add i64 %148, %149
+ %151 = extractvalue { ptr, ptr, i64 } %141, 0
+ %152 = extractvalue { ptr, ptr, i64 } %141, 1
+ %153 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %151, 0
+ %154 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %153, ptr %152, 1
+ %155 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %154, i64 %150, 2
+ %156 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %155, i64 %64, 3, 0
+ %157 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %156, i64 %146, 4, 0
+ br label %158
+
+158: ; preds = %173, %136
+ %159 = phi i64 [ %175, %173 ], [ 0, %136 ]
+ %160 = phi <vscale x 4 x float> [ %174, %173 ], [ poison, %136 ]
+ %161 = icmp slt i64 %159, %47
+ br i1 %161, label %162, label %176
+
+162: ; preds = %158
+ %163 = extractelement <vscale x 4 x i1> %92, i64 %159
+ br i1 %163, label %164, label %173
+
+164: ; preds = %162
+ %165 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 1
+ %166 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 2
+ %167 = getelementptr float, ptr %165, i64 %166
+ %168 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 4, 0
+ %169 = mul nuw nsw i64 %159, %168
+ %170 = getelementptr inbounds nuw float, ptr %167, i64 %169
+ %171 = load float, ptr %170, align 4
+ %172 = insertelement <vscale x 4 x float> %160, float %171, i64 %159
+ br label %173
+
+173: ; preds = %164, %162
+ %174 = phi <vscale x 4 x float> [ %172, %164 ], [ %160, %162 ]
+ %175 = add i64 %159, 1
+ br label %158
+
+176: ; preds = %158
+ %177 = trunc i64 %64 to i32
+ br label %178
+
+178: ; preds = %181, %176
+ %179 = phi i64 [ %202, %181 ], [ 0, %176 ]
+ %180 = icmp slt i64 %179, %47
+ br i1 %180, label %181, label %203
+
+181: ; preds = %178
+ %182 = icmp ult i64 %179, %53
+ %183 = sext i1 %182 to i32
+ %184 = and i32 %183, %177
+ %185 = sext i32 %184 to i64
+ %186 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+ %187 = trunc i64 %185 to i32
+ %188 = insertelement <vscale x 4 x i32> poison, i32 %187, i32 0
+ %189 = shufflevector <vscale x 4 x i32> %188, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %190 = icmp slt <vscale x 4 x i32> %186, %189
+ %191 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 1
+ %192 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2
+ %193 = getelementptr float, ptr %191, i64 %192
+ %194 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0
+ %195 = mul i64 %179, %194
+ %196 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1
+ %197 = mul i64 0, %196
+ %198 = add i64 %195, %197
+ %199 = getelementptr float, ptr %193, i64 %198
+ %200 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %199, i32 4, <vscale x 4 x i1> %190, <vscale x 4 x float> poison)
+ %201 = trunc i64 %179 to i32
+ call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 %201, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %200)
+ %202 = add i64 %179, 1
+ br label %178
+
+203: ; preds = %178
+ call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> %58, <vscale x 4 x i1> %92, <vscale x 4 x float> %120, <vscale x 4 x float> %160)
+ %204 = call i64 @llvm.smin.i64(i64 %53, i64 %47)
+ br label %205
+
+205: ; preds = %208, %203
+ %206 = phi i64 [ %219, %208 ], [ 0, %203 ]
+ %207 = icmp slt i64 %206, %204
+ br i1 %207, label %208, label %220
+
+208: ; preds = %205
+ %209 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 1
+ %210 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2
+ %211 = getelementptr float, ptr %209, i64 %210
+ %212 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0
+ %213 = mul i64 %206, %212
+ %214 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1
+ %215 = mul i64 0, %214
+ %216 = add i64 %213, %215
+ %217 = getelementptr float, ptr %211, i64 %216
+ %218 = trunc i64 %206 to i32
+ call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %92, ptr %217, i32 0, i32 %218)
+ %219 = add i64 %206, 1
+ br label %205
+
+220: ; preds = %205
+ %221 = add i64 %94, 1
+ br label %93
+
+222: ; preds = %93
+ %223 = add i64 %60, %47
+ br label %59
+
+224: ; preds = %59
+ %225 = add i64 %49, %47
+ br label %48
+
+226: ; preds = %48
+ %227 = alloca { ptr, ptr, i64, [2 x i64], [2 x i64] }, i64 1, align 8
+ store { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, ptr %227, align 8
+ %228 = insertvalue { i64, ptr } { i64 2, ptr poison }, ptr %227, 1
+ %229 = extractvalue { i64, ptr } %228, 0
+ %230 = extractvalue { i64, ptr } %228, 1
+ call void @printMemrefF32(i64 %229, ptr %230)
+ ret void
+}
+
+declare void @printMemrefF32(i64, ptr)
+
+attributes #0 = { "aarch64_new_za" "aarch64_pstate_sm_body" }
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 066ee3b..afd56d1 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -12,77 +12,41 @@ entry:
}
define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" {
-; CHECK-LABEL: multi_bb_stpidr2_save_required:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEXT: cbz w0, .LBB1_2
-; CHECK-NEXT: // %bb.1: // %use_b
-; CHECK-NEXT: fmov s1, #4.00000000
-; CHECK-NEXT: fadd s0, s0, s1
-; CHECK-NEXT: b .LBB1_5
-; CHECK-NEXT: .LBB1_2: // %use_c
-; CHECK-NEXT: fmov s0, s1
-; CHECK-NEXT: sub x8, x29, #16
-; CHECK-NEXT: msr TPIDR2_EL0, x8
-; CHECK-NEXT: bl cosf
-; CHECK-NEXT: smstart za
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB1_4
-; CHECK-NEXT: // %bb.3: // %use_c
-; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB1_4: // %use_c
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: .LBB1_5: // %exit
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -8
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mov x9, sp
-; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB1_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
-; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
-; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
-; CHECK-NEWLOWERING-NEXT: b .LBB1_3
-; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %use_c
-; CHECK-NEWLOWERING-NEXT: fmov s0, s1
-; CHECK-NEWLOWERING-NEXT: bl cosf
-; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %exit
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_5
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
+; CHECK-COMMON-LABEL: multi_bb_stpidr2_save_required:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: mov x29, sp
+; CHECK-COMMON-NEXT: sub sp, sp, #16
+; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16
+; CHECK-COMMON-NEXT: .cfi_offset w30, -8
+; CHECK-COMMON-NEXT: .cfi_offset w29, -16
+; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: mov x9, sp
+; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x9
+; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-COMMON-NEXT: cbz w0, .LBB1_2
+; CHECK-COMMON-NEXT: // %bb.1: // %use_b
+; CHECK-COMMON-NEXT: fmov s1, #4.00000000
+; CHECK-COMMON-NEXT: fadd s0, s0, s1
+; CHECK-COMMON-NEXT: b .LBB1_5
+; CHECK-COMMON-NEXT: .LBB1_2: // %use_c
+; CHECK-COMMON-NEXT: fmov s0, s1
+; CHECK-COMMON-NEXT: sub x8, x29, #16
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
+; CHECK-COMMON-NEXT: bl cosf
+; CHECK-COMMON-NEXT: smstart za
+; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT: sub x0, x29, #16
+; CHECK-COMMON-NEXT: cbnz x8, .LBB1_4
+; CHECK-COMMON-NEXT: // %bb.3: // %use_c
+; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore
+; CHECK-COMMON-NEXT: .LBB1_4: // %use_c
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT: .LBB1_5: // %exit
+; CHECK-COMMON-NEXT: mov sp, x29
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ret
%cmp = icmp ne i32 %a, 0
br i1 %cmp, label %use_b, label %use_c
@@ -155,7 +119,9 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
; CHECK-NEWLOWERING-NEXT: mov x9, sp
+; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEWLOWERING-NEXT: cmp sp, x9
@@ -166,9 +132,7 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEWLOWERING-NEXT: .LBB2_3:
; CHECK-NEWLOWERING-NEXT: mov sp, x9
; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 2583a93..5b81f5d 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -426,3 +426,21 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin
call void %callee()
ret void
}
+
+define void @disable_tailcallopt(ptr %callee) "aarch64_inout_zt0" nounwind {
+; CHECK-COMMON-LABEL: disable_tailcallopt:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: sub sp, sp, #80
+; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: mov x19, sp
+; CHECK-COMMON-NEXT: str zt0, [x19]
+; CHECK-COMMON-NEXT: smstop za
+; CHECK-COMMON-NEXT: blr x0
+; CHECK-COMMON-NEXT: smstart za
+; CHECK-COMMON-NEXT: ldr zt0, [x19]
+; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: add sp, sp, #80
+; CHECK-COMMON-NEXT: ret
+ tail call void %callee()
+ ret void
+}