diff options
Diffstat (limited to 'llvm/test/CodeGen/AArch64')
23 files changed, 1284 insertions, 1153 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir index 68302f5..5f98dae 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-extracts.mir @@ -290,11 +290,8 @@ body: | ; CHECK-LABEL: name: s3_from_s35 ; CHECK: liveins: $w0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[DEF]](s64) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 - ; CHECK-NEXT: %ext:_(s32) = G_AND [[TRUNC]], [[C]] - ; CHECK-NEXT: $w0 = COPY %ext(s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %val:_(s35) = G_IMPLICIT_DEF %extract:_(s3) = G_EXTRACT %val, 0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir index 03c28ef..b28298c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir @@ -159,13 +159,16 @@ body: | ; CHECK-LABEL: name: test_freeze_v3s8 ; CHECK: liveins: $q0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[DEF]] - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[BUILD_VECTOR]](<8 x s16>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s8>), [[UV1:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>) + ; CHECK-NEXT: 
[[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[UV]] + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>) ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV]](s8) - ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV1]](s8) - ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV2]](s8) + ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV2]](s8) + ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV3]](s8) + ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV4]](s8) ; CHECK-NEXT: %res:_(<4 x s32>) = G_BUILD_VECTOR %ext0(s32), %ext1(s32), %ext2(s32), %undef(s32) ; CHECK-NEXT: $q0 = COPY %res(<4 x s32>) %x:_(<3 x s8>) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir index 858a5a2..1cf066d 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir @@ -248,21 +248,19 @@ body: | ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16) ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16) ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF2]](<4 x s8>) - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), 
[[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<16 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, undef, undef, undef, undef) ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[SHUF]](<16 x s8>) ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:_(<4 x s32>) = G_UITOFP [[BITCAST]](<4 x s32>) - ; CHECK-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UITOFP]](<4 x s32>) - ; CHECK-NEXT: G_STORE [[UV10]](s32), [[COPY]](p0) :: (store (s32), align 16) + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UITOFP]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[UV6]](s32), [[COPY]](p0) :: (store (s32), align 16) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64) - ; CHECK-NEXT: G_STORE [[UV11]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4) + ; CHECK-NEXT: G_STORE [[UV7]](s32), [[PTR_ADD]](p0) :: (store (s32) into unknown-address + 4) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C4]](s64) - ; CHECK-NEXT: G_STORE [[UV12]](s32), [[PTR_ADD1]](p0) :: (store 
(s32) into unknown-address + 8, align 8) + ; CHECK-NEXT: G_STORE [[UV8]](s32), [[PTR_ADD1]](p0) :: (store (s32) into unknown-address + 8, align 8) ; CHECK-NEXT: G_BR %bb.1 bb.1: liveins: $w1, $w2, $w3, $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir index 2c326902..eb30581 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir @@ -545,15 +545,18 @@ body: | ; CHECK-LABEL: name: store_6xs64 ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64) ; CHECK-NEXT: %ptr:_(p0) = COPY $x0 - ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>)) + ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>)) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD %ptr, [[C]](s64) - ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into unknown-address + 16) + ; CHECK-NEXT: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into unknown-address + 16) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD %ptr, [[C1]](s64) - ; CHECK-NEXT: G_STORE [[DEF]](<2 x s64>), [[PTR_ADD1]](p0) :: (store (<2 x s64>) into unknown-address + 32) + ; CHECK-NEXT: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[PTR_ADD1]](p0) :: (store (<2 x s64>) into unknown-address + 32) ; CHECK-NEXT: RET_ReallyLR %val:_(<6 x 
s64>) = G_IMPLICIT_DEF %ptr:_(p0) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir index b8bdef0..737c66c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-saddsat.mir @@ -220,10 +220,8 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[UADDE]](s32), [[SEXT_INREG2]] ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UADDE]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF1]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV4]](s8), [[UV5]](s8), [[UV6]](s8), [[DEF]](s8) - ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV8]](s8), [[UV9]](s8), [[UV10]](s8), [[UV8]](s8) + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8) ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[MV2]], 24 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 23 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir index 52a28ad..1c5ae0d 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir @@ -289,35 +289,35 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4100 ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %w0(s32), [[C]] ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ICMP2]], 1 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = 
G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32) - ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s16), [[C1]](s64) + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[DEF1]], [[TRUNC]](s16), [[C1]](s64) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16) ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16) ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16) ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16) - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) - ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16) - ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[UV5]](s16) - ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[UV6]](s16) - ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[UV7]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), [[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8) + ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16) + ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = 
G_TRUNC [[DEF]](s16) + ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16) + ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8) ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<8 x s8>), [[BUILD_VECTOR1]], shufflemask(0, 0, 0, 0, undef, undef, undef, undef) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 1 ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[SHUF]](<8 x s8>) - ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<4 x s16>), [[UV9:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR2]](<8 x s8>) - ; CHECK-NEXT: [[UV10:%[0-9]+]]:_(<4 x s16>), [[UV11:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[UV8]], [[UV10]] + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s16>), [[UV7:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT1]](<8 x s16>) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[UV4]], [[UV6]] ; CHECK-NEXT: [[TRUNC9:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[SHUF]](<8 x s8>) - ; CHECK-NEXT: [[UV12:%[0-9]+]]:_(<4 x s16>), [[UV13:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>) - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND 
[[TRUNC9]], [[UV12]] + ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<4 x s16>), [[UV9:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT2]](<8 x s16>) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC9]], [[UV8]] ; CHECK-NEXT: [[TRUNC10:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>) ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC10]], [[XOR]] ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND]], [[AND1]] diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir index fdd0ebb..352f4e7 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir @@ -288,10 +288,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[UV]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[UV]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[DEF]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[DEF]](s32) ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]], shufflemask(0, 1, 5, 6) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[SHUF]](<4 x s32>), [[C]](s64) 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir index 2311be6..abfaea0 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ssubsat.mir @@ -220,10 +220,8 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[USUBE]](s32), [[SEXT_INREG2]] ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[USUBE]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF1]](s32) ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV4]](s8), [[UV5]](s8), [[UV6]](s8), [[DEF]](s8) - ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[UV8]](s8), [[UV9]](s8), [[UV10]](s8), [[UV8]](s8) + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8) ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[MV2]], 24 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 23 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir index 2609eb0..9726cc5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir @@ -37,10 +37,9 @@ body: | bb.0: ; CHECK-LABEL: name: test_implicit_def_v4s32 - ; CHECK: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) - ; CHECK-NEXT: $x0 = COPY [[UV]](<2 x s32>) - ; CHECK-NEXT: $x1 = COPY [[UV1]](<2 x s32>) + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = 
G_IMPLICIT_DEF + ; CHECK-NEXT: $x0 = COPY [[DEF]](<2 x s32>) + ; CHECK-NEXT: $x1 = COPY [[DEF]](<2 x s32>) %0:_(<4 x s32>) = G_IMPLICIT_DEF %1:_(<2 x s32> ), %2:_(<2 x s32>) = G_UNMERGE_VALUES %0 $x0 = COPY %1 @@ -67,10 +66,9 @@ body: | bb.0: ; CHECK-LABEL: name: test_implicit_def_v2s32 - ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) - ; CHECK-NEXT: $w0 = COPY [[UV]](s32) - ; CHECK-NEXT: $w1 = COPY [[UV1]](s32) + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: $w0 = COPY [[DEF]](s32) + ; CHECK-NEXT: $w1 = COPY [[DEF]](s32) %0:_(<2 x s32>) = G_IMPLICIT_DEF %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0 $w0 = COPY %1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll index 41f7ab8..480fcbd 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_32: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #32 -; GISEL-NEXT: lsr x13, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x10, x9, lsl #32 -; GISEL-NEXT: lsr x10, x11, #32 -; GISEL-NEXT: orr x11, x13, x11, lsl #32 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #32 -; GISEL-NEXT: orr x10, x10, x12, lsl #32 -; GISEL-NEXT: lsr x12, x14, #32 -; GISEL-NEXT: lsr x9, x15, #32 -; GISEL-NEXT: orr x8, x8, x14, lsl #32 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #32 -; GISEL-NEXT: lsr x12, x13, #32 -; GISEL-NEXT: orr x9, x9, x13, lsl #32 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: 
orr x8, x12, x16, lsl #32 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #32 +; GISEL-NEXT: extr x10, x15, x14, #32 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #32 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #32 -; GISEL-NEXT: lsl x8, x16, #32 -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: lsl x13, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: orr x11, x11, x14, lsr #32 -; GISEL-NEXT: orr x9, x13, x12, lsr #32 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: 
stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: extr x8, x15, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #32 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x15, x9, #32 -; GISEL-NEXT: lsl x16, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #32 -; GISEL-NEXT: lsl x15, x13, #32 -; GISEL-NEXT: orr x9, x16, x9, lsr #32 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #32 -; GISEL-NEXT: orr x10, x15, x10, lsr #32 -; GISEL-NEXT: lsl x15, x12, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 -; GISEL-NEXT: lsl x11, x17, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #32 -; GISEL-NEXT: lsl x13, x16, #32 -; GISEL-NEXT: orr x10, x11, x12, lsr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #32 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #32 +; GISEL-NEXT: extr x9, x15, x14, #32 +; GISEL-NEXT: lsl x8, x8, #32 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #32 
+; GISEL-NEXT: extr x11, x13, x12, #32 +; GISEL-NEXT: orr x8, x8, x13, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #32 -; GISEL-NEXT: lsr x16, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x14, x9, lsl #32 -; GISEL-NEXT: lsr x14, x10, #32 -; GISEL-NEXT: orr x10, x16, x10, lsl #32 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #32 -; GISEL-NEXT: orr x11, x14, x11, lsl #32 -; GISEL-NEXT: lsr x14, x12, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #32 -; GISEL-NEXT: orr x8, x8, x12, lsl #32 -; GISEL-NEXT: orr x10, x14, x13, lsl #32 -; GISEL-NEXT: orr x9, x9, x15, lsl #32 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #32 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: orr x10, x12, x10, lsr #32 -; GISEL-NEXT: lsl x12, x11, #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 
-; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x12, x9, lsr #32 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #32 -; GISEL-NEXT: orr x11, x13, x11, lsr #32 -; GISEL-NEXT: lsl x12, x16, #32 -; GISEL-NEXT: orr x8, x10, x14, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #32 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: lsr x8, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x14, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #32 -; GISEL-NEXT: lsl x14, x13, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: orr x10, x14, x10, lsr #32 -; GISEL-NEXT: lsl x14, x16, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; 
GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: extr x9, x14, x13, #32 ; GISEL-NEXT: lsl x11, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #32 -; GISEL-NEXT: orr x10, x11, x16, asr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #32 +; GISEL-NEXT: orr x8, x11, x12, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_1: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #63 -; GISEL-NEXT: lsr x13, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x10, x9, lsl #1 -; GISEL-NEXT: lsr x10, x11, #63 -; GISEL-NEXT: orr x11, x13, x11, lsl #1 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #63 -; GISEL-NEXT: orr x10, x10, x12, lsl #1 -; GISEL-NEXT: lsr x12, x14, #63 -; GISEL-NEXT: lsr x9, x15, #63 -; GISEL-NEXT: orr x8, x8, x14, lsl #1 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #1 -; GISEL-NEXT: lsr x12, x13, #63 -; GISEL-NEXT: orr x9, x9, x13, lsl #1 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #1 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: 
stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #63 +; GISEL-NEXT: extr x10, x15, x14, #63 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #63 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #1 -; GISEL-NEXT: lsl x8, x16, #63 -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: lsl x13, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: orr x11, x11, x14, lsr #1 -; GISEL-NEXT: orr x9, x13, x12, lsr #1 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: extr x8, x15, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #1 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load 
i512, ptr %input, align 64 @@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x15, x9, #63 -; GISEL-NEXT: lsl x16, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #1 -; GISEL-NEXT: lsl x15, x13, #63 -; GISEL-NEXT: orr x9, x16, x9, lsr #1 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #63 -; GISEL-NEXT: orr x10, x15, x10, lsr #1 -; GISEL-NEXT: lsl x15, x12, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 -; GISEL-NEXT: lsl x11, x17, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #1 -; GISEL-NEXT: lsl x13, x16, #63 -; GISEL-NEXT: orr x10, x11, x12, lsr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #1 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #1 +; GISEL-NEXT: extr x9, x15, x14, #1 +; GISEL-NEXT: lsl x8, x8, #63 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #1 +; GISEL-NEXT: extr x11, x13, x12, #1 +; GISEL-NEXT: orr x8, x8, x13, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5571,28 +5507,21 @@ define void @test_shl_i512_const_15(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_15: ; GISEL: ; %bb.0: ; 
%entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #49 -; GISEL-NEXT: lsr x13, x9, #49 -; GISEL-NEXT: lsl x8, x8, #15 -; GISEL-NEXT: orr x9, x10, x9, lsl #15 -; GISEL-NEXT: lsr x10, x11, #49 -; GISEL-NEXT: orr x11, x13, x11, lsl #15 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #49 -; GISEL-NEXT: orr x10, x10, x12, lsl #15 -; GISEL-NEXT: lsr x12, x14, #49 -; GISEL-NEXT: lsr x9, x15, #49 -; GISEL-NEXT: orr x8, x8, x14, lsl #15 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #15 -; GISEL-NEXT: lsr x12, x13, #49 -; GISEL-NEXT: orr x9, x9, x13, lsl #15 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #15 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #15 +; GISEL-NEXT: extr x8, x9, x8, #49 +; GISEL-NEXT: extr x9, x10, x9, #49 +; GISEL-NEXT: extr x10, x11, x10, #49 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #49 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #49 +; GISEL-NEXT: extr x10, x15, x14, #49 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #49 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x13, x9, #49 -; GISEL-NEXT: lsl x15, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: orr x8, x13, x8, lsr #15 -; GISEL-NEXT: lsl x13, x14, #49 -; GISEL-NEXT: 
orr x9, x15, x9, lsr #15 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #15 -; GISEL-NEXT: lsl x8, x16, #49 -; GISEL-NEXT: lsl x11, x12, #49 -; GISEL-NEXT: lsl x13, x15, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #15 -; GISEL-NEXT: lsr x10, x16, #15 -; GISEL-NEXT: orr x11, x11, x14, lsr #15 -; GISEL-NEXT: orr x9, x13, x12, lsr #15 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #15 +; GISEL-NEXT: extr x9, x13, x12, #15 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #15 +; GISEL-NEXT: extr x8, x15, x14, #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #15 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x15, x9, #49 -; GISEL-NEXT: lsl x16, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #15 -; GISEL-NEXT: lsl x15, x13, #49 -; GISEL-NEXT: orr x9, x16, x9, lsr #15 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #49 -; GISEL-NEXT: orr x10, x15, x10, lsr #15 -; GISEL-NEXT: lsl x15, x12, #49 -; GISEL-NEXT: orr x8, x11, x13, lsr #15 -; GISEL-NEXT: lsl x11, x17, 
#49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #15 -; GISEL-NEXT: lsl x13, x16, #49 -; GISEL-NEXT: orr x10, x11, x12, lsr #15 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #15 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #15 +; GISEL-NEXT: extr x9, x15, x14, #15 +; GISEL-NEXT: lsl x8, x8, #49 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #15 +; GISEL-NEXT: extr x11, x13, x12, #15 +; GISEL-NEXT: orr x8, x8, x13, asr #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_63: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #1 -; GISEL-NEXT: lsr x13, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x10, x9, lsl #63 -; GISEL-NEXT: lsr x10, x11, #1 -; GISEL-NEXT: orr x11, x13, x11, lsl #63 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #1 -; GISEL-NEXT: orr x10, x10, x12, lsl #63 -; GISEL-NEXT: lsr x12, x14, #1 -; GISEL-NEXT: lsr x9, x15, #1 -; GISEL-NEXT: orr x8, x8, x14, lsl #63 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #63 -; GISEL-NEXT: lsr x12, x13, #1 -; GISEL-NEXT: orr x9, x9, x13, lsl #63 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #63 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; 
GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #1 +; GISEL-NEXT: extr x10, x15, x14, #1 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #1 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: lsl x15, x10, #1 -; GISEL-NEXT: orr x11, x12, x11, lsr #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x15, x9, lsr #63 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #63 -; GISEL-NEXT: lsl x8, x16, #1 -; GISEL-NEXT: lsl x11, x12, #1 -; GISEL-NEXT: lsl x13, x15, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: orr x11, x11, x14, lsr #63 -; GISEL-NEXT: orr x9, x13, x12, lsr #63 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, 
x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #63 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x10, [x1] -; GISEL-NEXT: ldp x11, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x15, x9, #1 -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x16, x11, #1 -; GISEL-NEXT: orr x8, x15, x8, lsr #63 -; GISEL-NEXT: lsl x15, x13, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x9, x16, x9, lsr #63 -; GISEL-NEXT: orr x11, x15, x11, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x8, x17, #1 -; GISEL-NEXT: lsl x16, x14, #1 -; GISEL-NEXT: lsl x10, x12, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: asr x9, x17, #63 -; GISEL-NEXT: orr x8, x8, x12, lsr #63 -; GISEL-NEXT: orr x13, x16, x13, lsr #63 -; GISEL-NEXT: orr x10, x10, x14, lsr #63 -; GISEL-NEXT: orr x9, x9, x9, lsl #1 -; GISEL-NEXT: stp x13, x10, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: extr x11, x14, x13, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: asr x10, x15, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x11, [x0, #32] +; GISEL-NEXT: orr x9, x10, x10, lsl #1 ; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: @@ -5906,23 
+5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #63 -; GISEL-NEXT: lsr x16, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x14, x9, lsl #1 -; GISEL-NEXT: lsr x14, x10, #63 -; GISEL-NEXT: orr x10, x16, x10, lsl #1 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #63 -; GISEL-NEXT: orr x11, x14, x11, lsl #1 -; GISEL-NEXT: lsr x14, x12, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #63 -; GISEL-NEXT: orr x8, x8, x12, lsl #1 -; GISEL-NEXT: orr x10, x14, x13, lsl #1 -; GISEL-NEXT: orr x9, x9, x15, lsl #1 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #63 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: orr x10, x12, x10, lsr #1 -; GISEL-NEXT: lsl x12, x11, #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x12, x9, lsr #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #63 -; GISEL-NEXT: orr x11, x13, x11, lsr #1 -; GISEL-NEXT: lsl x12, x16, #63 -; GISEL-NEXT: orr x8, x10, x14, 
lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #1 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: lsr x8, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x14, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #1 -; GISEL-NEXT: lsl x14, x13, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: orr x10, x14, x10, lsr #1 -; GISEL-NEXT: lsl x14, x16, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: extr x9, x14, x13, #1 ; GISEL-NEXT: lsl x11, x15, 
#63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #1 -; GISEL-NEXT: orr x10, x11, x16, asr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #1 +; GISEL-NEXT: orr x8, x11, x12, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #28 -; GISEL-NEXT: lsr x16, x9, #28 -; GISEL-NEXT: lsl x8, x8, #36 -; GISEL-NEXT: orr x9, x14, x9, lsl #36 -; GISEL-NEXT: lsr x14, x10, #28 -; GISEL-NEXT: orr x10, x16, x10, lsl #36 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #28 -; GISEL-NEXT: orr x11, x14, x11, lsl #36 -; GISEL-NEXT: lsr x14, x12, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #28 -; GISEL-NEXT: orr x8, x8, x12, lsl #36 -; GISEL-NEXT: orr x10, x14, x13, lsl #36 -; GISEL-NEXT: orr x9, x9, x15, lsl #36 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #36 +; GISEL-NEXT: extr x8, x9, x8, #28 +; GISEL-NEXT: extr x9, x10, x9, #28 +; GISEL-NEXT: extr x10, x11, x10, #28 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #28 +; GISEL-NEXT: extr x9, x13, x12, #28 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #28 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp 
x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x13, x9, #28 -; GISEL-NEXT: orr x10, x12, x10, lsr #36 -; GISEL-NEXT: lsl x12, x11, #28 -; GISEL-NEXT: orr x8, x13, x8, lsr #36 -; GISEL-NEXT: lsl x13, x14, #28 -; GISEL-NEXT: orr x9, x12, x9, lsr #36 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #28 -; GISEL-NEXT: orr x11, x13, x11, lsr #36 -; GISEL-NEXT: lsl x12, x16, #28 -; GISEL-NEXT: orr x8, x10, x14, lsr #36 -; GISEL-NEXT: lsr x10, x16, #36 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #36 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #36 +; GISEL-NEXT: extr x9, x13, x12, #36 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #36 +; GISEL-NEXT: lsr x8, x14, #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x14, x9, #28 -; GISEL-NEXT: lsl x15, x10, #28 -; GISEL-NEXT: orr x11, x12, x11, lsr #36 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #36 -; GISEL-NEXT: lsl x14, x13, #28 -; GISEL-NEXT: orr x9, x15, x9, lsr #36 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #28 -; GISEL-NEXT: orr x10, x14, x10, lsr 
#36 -; GISEL-NEXT: lsl x14, x16, #28 -; GISEL-NEXT: orr x8, x11, x13, lsr #36 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #36 +; GISEL-NEXT: extr x9, x14, x13, #36 ; GISEL-NEXT: lsl x11, x15, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #36 -; GISEL-NEXT: orr x10, x11, x16, asr #36 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #36 +; GISEL-NEXT: orr x8, x11, x12, asr #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #1 -; GISEL-NEXT: lsr x16, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x14, x9, lsl #63 -; GISEL-NEXT: lsr x14, x10, #1 -; GISEL-NEXT: orr x10, x16, x10, lsl #63 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #1 -; GISEL-NEXT: orr x11, x14, x11, lsl #63 -; GISEL-NEXT: lsr x14, x12, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #1 -; GISEL-NEXT: orr x8, x8, x12, lsl #63 -; GISEL-NEXT: orr x10, x14, x13, lsl #63 -; GISEL-NEXT: orr x9, x9, x15, lsl #63 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; 
GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #1 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6266,27 +6114,21 @@ define void @test_lshr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: lsl x12, x16, #1 -; GISEL-NEXT: orr x8, x10, x14, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #63 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: lsr x8, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; 
GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: lsl x12, x15, #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x16, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: asr x8, x16, #63 -; GISEL-NEXT: orr x12, x12, x14, lsr #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x10, x15, lsr #63 -; GISEL-NEXT: orr x10, x8, x8, lsl #1 -; GISEL-NEXT: stp x12, x9, [x0, #32] -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: asr x9, x14, #63 +; GISEL-NEXT: extr x11, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: orr x8, x9, x9, lsl #1 +; GISEL-NEXT: stp x11, x10, [x0, #32] +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll index 12e8bf2..03f3cf1 100644 --- a/llvm/test/CodeGen/AArch64/adc.ll +++ b/llvm/test/CodeGen/AArch64/adc.ll @@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: test_shifted: ; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: lsr x8, x2, #19 +; CHECK-GI-NEXT: extr x8, x3, x2, #19 ; CHECK-GI-NEXT: adds x0, x0, x2, lsl #45 -; CHECK-GI-NEXT: orr x8, x8, x3, lsl #45 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %rhs = shl i128 %b, 45 @@ -108,8 +107,7 @@ define i128 
@test_extended(i128 %a, i16 %b) { ; CHECK-GI-NEXT: sxth x8, w2 ; CHECK-GI-NEXT: adds x0, x0, w2, sxth #3 ; CHECK-GI-NEXT: asr x9, x8, #63 -; CHECK-GI-NEXT: lsr x8, x8, #61 -; CHECK-GI-NEXT: orr x8, x8, x9, lsl #3 +; CHECK-GI-NEXT: extr x8, x9, x8, #61 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %ext = sext i16 %b to i128 diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll index 076cbf7..a505b42 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -1408,6 +1408,88 @@ define <4 x i16> @ext_via_i19(<4 x i16> %a) { ret <4 x i16> %t6 } +define <8 x i8> @srhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) { +; CHECK-LABEL: srhadd_v8i8_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: srhadd.8b v0, v0, v1 +; CHECK-NEXT: ret + %s0s = sext <8 x i8> %s0 to <8 x i16> + %s1s = sext <8 x i8> %s1 to <8 x i16> + %s = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s) + %s2 = trunc <8 x i16> %s to <8 x i8> + ret <8 x i8> %s2 +} + +define <4 x i16> @srhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) { +; CHECK-LABEL: srhadd_v4i16_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: srhadd.4h v0, v0, v1 +; CHECK-NEXT: ret + %s0s = sext <4 x i16> %s0 to <4 x i32> + %s1s = sext <4 x i16> %s1 to <4 x i32> + %s = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s) + %s2 = trunc <4 x i32> %s to <4 x i16> + ret <4 x i16> %s2 +} + +define <2 x i32> @srhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) { +; CHECK-LABEL: srhadd_v2i32_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll.2d v0, v0, #0 +; CHECK-NEXT: sshll.2d v1, v1, #0 +; CHECK-NEXT: eor.16b v2, v0, v1 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: ushr.2d v1, v2, #1 +; CHECK-NEXT: sub.2d v0, v0, v1 +; CHECK-NEXT: xtn.2s v0, v0 +; CHECK-NEXT: ret + %s0s = sext <2 x i32> %s0 to <2 x i64> + %s1s = sext <2 x i32> %s1 to <2 x i64> + %s = call <2 x i64> @llvm.aarch64.neon.urhadd.v2i64(<2 x i64> %s0s, <2 x 
i64> %s1s) + %s2 = trunc <2 x i64> %s to <2 x i32> + ret <2 x i32> %s2 +} + +define <8 x i8> @urhadd_v8i8_trunc(<8 x i8> %s0, <8 x i8> %s1) { +; CHECK-LABEL: urhadd_v8i8_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: urhadd.8b v0, v0, v1 +; CHECK-NEXT: ret + %s0s = zext <8 x i8> %s0 to <8 x i16> + %s1s = zext <8 x i8> %s1 to <8 x i16> + %s = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0s, <8 x i16> %s1s) + %s2 = trunc <8 x i16> %s to <8 x i8> + ret <8 x i8> %s2 +} + +define <4 x i16> @urhadd_v4i16_trunc(<4 x i16> %s0, <4 x i16> %s1) { +; CHECK-LABEL: urhadd_v4i16_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: urhadd.4h v0, v0, v1 +; CHECK-NEXT: ret + %s0s = zext <4 x i16> %s0 to <4 x i32> + %s1s = zext <4 x i16> %s1 to <4 x i32> + %s = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %s0s, <4 x i32> %s1s) + %s2 = trunc <4 x i32> %s to <4 x i16> + ret <4 x i16> %s2 +} + +define <2 x i32> @urhadd_v2i32_trunc(<2 x i32> %s0, <2 x i32> %s1) { +; CHECK-LABEL: urhadd_v2i32_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: uaddl.2d v0, v0, v1 +; CHECK-NEXT: dup.2d v1, x8 +; CHECK-NEXT: add.2d v0, v0, v1 +; CHECK-NEXT: shrn.2s v0, v0, #1 +; CHECK-NEXT: ret + %s0s = zext <2 x i32> %s0 to <2 x i64> + %s1s = zext <2 x i32> %s1 to <2 x i64> + %s = call <2 x i64> @llvm.aarch64.neon.srhadd.v2i64(<2 x i64> %s0s, <2 x i64> %s1s) + %s2 = trunc <2 x i64> %s to <2 x i32> + ret <2 x i32> %s2 +} + declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll index 670574f2..6df6d76 100644 --- a/llvm/test/CodeGen/AArch64/dup.ll +++ b/llvm/test/CodeGen/AArch64/dup.ll @@ -2,16 +2,21 @@ ; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc 
-mtriple=aarch64-none-none-eabi -verify-machineinstrs -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; CHECK-GI: warning: Instruction selection used fallback path for dup_v2i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for duplane0_v2i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_v2i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for loaddup_str_v2i8 - define <2 x i8> @dup_v2i8(i8 %a) { -; CHECK-LABEL: dup_v2i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v0.2s, w0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: dup_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: dup v0.2s, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: dup_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: dup v0.8b, w0 +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %b = insertelement <2 x i8> poison, i8 %a, i64 0 %c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer @@ -19,22 +24,45 @@ entry: } define <2 x i8> @duplane0_v2i8(<2 x i8> %b) { -; CHECK-LABEL: duplane0_v2i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.2s, v0.s[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: duplane0_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: dup v0.2s, v0.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: duplane0_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov w8, v0.s[1] +; CHECK-GI-NEXT: mov v0.b[1], w8 +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: // kill: def $d0 
killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %c = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer ret <2 x i8> %c } define <2 x i8> @loaddup_v2i8(ptr %p) { -; CHECK-LABEL: loaddup_v2i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0] -; CHECK-NEXT: dup v0.2s, v0.s[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: loaddup_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr b0, [x0] +; CHECK-SD-NEXT: dup v0.2s, v0.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ld1r { v0.8b }, [x0] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %a = load i8, ptr %p %b = insertelement <2 x i8> poison, i8 %a, i64 0 @@ -43,12 +71,24 @@ entry: } define <2 x i8> @loaddup_str_v2i8(ptr %p) { -; CHECK-LABEL: loaddup_str_v2i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: strb wzr, [x0] -; CHECK-NEXT: dup v0.2s, w8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: loaddup_str_v2i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldrb w8, [x0] +; CHECK-SD-NEXT: strb wzr, [x0] +; CHECK-SD-NEXT: dup v0.2s, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: loaddup_str_v2i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: strb wzr, [x0] +; CHECK-GI-NEXT: dup v0.8b, v0.b[0] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[1], w9 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret entry: %a = load i8, ptr %p %b = insertelement <2 x i8> poison, i8 %a, i64 0 diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 765f6b7..7f07ef4 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -510,41 +510,40 @@ define i128 
@fshl_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq 
-; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c) @@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshr_i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #63 -; CHECK-GI-NEXT: mov w9, #127 // =0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: bic x9, x9, x4 -; CHECK-GI-NEXT: lsl x11, x0, #1 -; CHECK-GI-NEXT: and x12, x4, #0x7f -; CHECK-GI-NEXT: orr x8, x8, x1, lsl #1 -; CHECK-GI-NEXT: sub x14, x10, x9 -; CHECK-GI-NEXT: sub x17, x9, #64 -; CHECK-GI-NEXT: lsl x15, x11, x9 -; CHECK-GI-NEXT: lsr x14, x11, x14 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x16, x8, x9 -; CHECK-GI-NEXT: sub x9, x10, x12 -; CHECK-GI-NEXT: lsl x10, x11, x17 -; CHECK-GI-NEXT: mvn x13, x4 -; CHECK-GI-NEXT: csel x11, x15, xzr, lo -; CHECK-GI-NEXT: sub x15, x12, #64 -; CHECK-GI-NEXT: orr x14, x14, x16 -; CHECK-GI-NEXT: lsr x16, x2, x12 -; CHECK-GI-NEXT: lsl x9, x3, x9 -; CHECK-GI-NEXT: csel x10, x14, x10, lo -; CHECK-GI-NEXT: tst x13, #0x7f -; CHECK-GI-NEXT: lsr x13, x3, x15 -; CHECK-GI-NEXT: csel x8, x8, x10, eq -; CHECK-GI-NEXT: orr x9, x16, x9 -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: lsr x10, x3, x12 -; CHECK-GI-NEXT: csel x9, x9, x13, lo +; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: lsl x9, x0, #1 +; CHECK-GI-NEXT: extr x10, x1, x0, #63 +; CHECK-GI-NEXT: bic x8, x8, x4 +; CHECK-GI-NEXT: mov w11, #64 // =0x40 +; CHECK-GI-NEXT: and x14, x4, #0x7f +; CHECK-GI-NEXT: sub x12, x11, x8 +; CHECK-GI-NEXT: lsl x13, x10, x8 +; CHECK-GI-NEXT: lsl x16, x9, x8 +; CHECK-GI-NEXT: lsr x12, x9, x12 +; CHECK-GI-NEXT: sub x17, x8, #64 +; CHECK-GI-NEXT: cmp x8, #64 +; 
CHECK-GI-NEXT: lsl x8, x9, x17 +; CHECK-GI-NEXT: sub x11, x11, x14 +; CHECK-GI-NEXT: mvn x15, x4 +; CHECK-GI-NEXT: orr x12, x12, x13 +; CHECK-GI-NEXT: csel x9, x16, xzr, lo +; CHECK-GI-NEXT: sub x13, x14, #64 +; CHECK-GI-NEXT: lsr x16, x2, x14 +; CHECK-GI-NEXT: lsl x11, x3, x11 +; CHECK-GI-NEXT: csel x8, x12, x8, lo +; CHECK-GI-NEXT: tst x15, #0x7f +; CHECK-GI-NEXT: lsr x12, x3, x13 +; CHECK-GI-NEXT: csel x8, x10, x8, eq +; CHECK-GI-NEXT: orr x10, x16, x11 +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: lsr x11, x3, x14 +; CHECK-GI-NEXT: csel x10, x10, x12, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: csel x9, x2, x9, eq -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: csel x10, x10, xzr, lo -; CHECK-GI-NEXT: orr x0, x11, x9 -; CHECK-GI-NEXT: orr x1, x8, x10 +; CHECK-GI-NEXT: csel x10, x2, x10, eq +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: csel x11, x11, xzr, lo +; CHECK-GI-NEXT: orr x0, x9, x10 +; CHECK-GI-NEXT: orr x1, x8, x11 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c) @@ -720,10 +718,9 @@ define i128 @rotl_i128_c(i128 %a) { ; ; CHECK-GI-LABEL: rotl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x1, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x1, #61 +; CHECK-GI-NEXT: mov x1, x8 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3) @@ -731,20 +728,12 @@ entry: } define i128 @rotr_i128_c(i128 %a) { -; CHECK-SD-LABEL: rotr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x1, x0, #3 -; CHECK-SD-NEXT: extr x1, x0, x1, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: rotr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x0, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x1, lsr #3 -; CHECK-GI-NEXT: ret +; 
CHECK-LABEL: rotr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x1, x0, #3 +; CHECK-NEXT: extr x1, x0, x1, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3) ret i128 %d @@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: fshl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x3, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x3, #61 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3) @@ -879,21 +866,12 @@ entry: } define i128 @fshr_i128_c(i128 %a, i128 %b) { -; CHECK-SD-LABEL: fshr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x3, x2, #3 -; CHECK-SD-NEXT: extr x1, x0, x3, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x3, #61 -; CHECK-GI-NEXT: lsr x9, x3, #3 -; CHECK-GI-NEXT: orr x8, x8, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x3, x2, #3 +; CHECK-NEXT: extr x1, x0, x3, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3) ret i128 %d @@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w19, -16 ; CHECK-GI-NEXT: ldr x11, [sp, #16] -; CHECK-GI-NEXT: mov w10, #64 // =0x40 +; CHECK-GI-NEXT: mov w9, #64 // =0x40 ; CHECK-GI-NEXT: ldr x12, [sp, #32] ; CHECK-GI-NEXT: mov w13, #127 // =0x7f -; CHECK-GI-NEXT: and x9, x11, #0x7f +; CHECK-GI-NEXT: and x8, x11, #0x7f ; CHECK-GI-NEXT: and x14, x12, #0x7f -; CHECK-GI-NEXT: mvn x15, x11 -; CHECK-GI-NEXT: sub 
x8, x10, x9 -; CHECK-GI-NEXT: sub x16, x9, #64 -; CHECK-GI-NEXT: lsl x19, x1, x9 -; CHECK-GI-NEXT: lsr x18, x0, x8 -; CHECK-GI-NEXT: lsl x17, x0, x9 -; CHECK-GI-NEXT: lsl x16, x0, x16 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: bic x0, x13, x11 -; CHECK-GI-NEXT: mvn x8, x12 -; CHECK-GI-NEXT: orr x18, x18, x19 -; CHECK-GI-NEXT: csel x9, x17, xzr, lo +; CHECK-GI-NEXT: mvn x18, x11 +; CHECK-GI-NEXT: sub x10, x9, x8 +; CHECK-GI-NEXT: sub x15, x8, #64 +; CHECK-GI-NEXT: lsl x17, x1, x8 +; CHECK-GI-NEXT: lsr x16, x0, x10 +; CHECK-GI-NEXT: lsl x15, x0, x15 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x19, x0, x8 +; CHECK-GI-NEXT: lsl x0, x3, x14 +; CHECK-GI-NEXT: mvn x10, x12 +; CHECK-GI-NEXT: orr x16, x16, x17 ; CHECK-GI-NEXT: sub x17, x14, #64 -; CHECK-GI-NEXT: csel x16, x18, x16, lo +; CHECK-GI-NEXT: csel x15, x16, x15, lo +; CHECK-GI-NEXT: sub x16, x9, x14 +; CHECK-GI-NEXT: csel x8, x19, xzr, lo +; CHECK-GI-NEXT: lsr x16, x2, x16 ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x10, x14 -; CHECK-GI-NEXT: lsr x11, x2, x11 -; CHECK-GI-NEXT: lsl x18, x3, x14 -; CHECK-GI-NEXT: csel x16, x1, x16, eq -; CHECK-GI-NEXT: lsl x1, x2, x14 +; CHECK-GI-NEXT: lsl x19, x2, x14 ; CHECK-GI-NEXT: lsl x17, x2, x17 +; CHECK-GI-NEXT: csel x15, x1, x15, eq ; CHECK-GI-NEXT: cmp x14, #64 -; CHECK-GI-NEXT: lsl x14, x5, #63 -; CHECK-GI-NEXT: orr x11, x11, x18 -; CHECK-GI-NEXT: bic x13, x13, x12 -; CHECK-GI-NEXT: csel x18, x1, xzr, lo -; CHECK-GI-NEXT: csel x11, x11, x17, lo +; CHECK-GI-NEXT: orr x16, x16, x0 +; CHECK-GI-NEXT: bic x11, x13, x11 +; CHECK-GI-NEXT: csel x14, x19, xzr, lo +; CHECK-GI-NEXT: csel x16, x16, x17, lo ; CHECK-GI-NEXT: tst x12, #0x7f -; CHECK-GI-NEXT: lsr x12, x5, #1 -; CHECK-GI-NEXT: orr x14, x14, x4, lsr #1 -; CHECK-GI-NEXT: lsl x17, x7, #63 -; CHECK-GI-NEXT: sub x1, x10, x0 -; CHECK-GI-NEXT: csel x11, x3, x11, eq -; CHECK-GI-NEXT: sub x2, x0, #64 -; CHECK-GI-NEXT: lsr x3, x14, x0 -; CHECK-GI-NEXT: lsl x1, x12, x1 -; CHECK-GI-NEXT: lsr x4, x7, #1 -; 
CHECK-GI-NEXT: orr x17, x17, x6, lsr #1 -; CHECK-GI-NEXT: lsr x2, x12, x2 -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: orr x1, x3, x1 -; CHECK-GI-NEXT: sub x10, x10, x13 -; CHECK-GI-NEXT: lsr x12, x12, x0 -; CHECK-GI-NEXT: csel x1, x1, x2, lo -; CHECK-GI-NEXT: tst x15, #0x7f -; CHECK-GI-NEXT: sub x15, x13, #64 -; CHECK-GI-NEXT: lsr x2, x17, x13 -; CHECK-GI-NEXT: lsl x10, x4, x10 -; CHECK-GI-NEXT: csel x14, x14, x1, eq -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: lsr x15, x4, x15 -; CHECK-GI-NEXT: lsr x0, x4, x13 -; CHECK-GI-NEXT: csel x12, x12, xzr, lo -; CHECK-GI-NEXT: orr x10, x2, x10 -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x10, x15, lo -; CHECK-GI-NEXT: tst x8, #0x7f -; CHECK-GI-NEXT: orr x1, x16, x12 -; CHECK-GI-NEXT: csel x8, x17, x10, eq -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x0, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x14 -; CHECK-GI-NEXT: orr x2, x18, x8 -; CHECK-GI-NEXT: orr x3, x11, x10 +; CHECK-GI-NEXT: lsr x17, x5, #1 +; CHECK-GI-NEXT: extr x0, x5, x4, #1 +; CHECK-GI-NEXT: bic x12, x13, x12 +; CHECK-GI-NEXT: csel x13, x3, x16, eq +; CHECK-GI-NEXT: sub x16, x9, x11 +; CHECK-GI-NEXT: sub x1, x11, #64 +; CHECK-GI-NEXT: lsr x3, x7, #1 +; CHECK-GI-NEXT: lsr x2, x0, x11 +; CHECK-GI-NEXT: lsl x16, x17, x16 +; CHECK-GI-NEXT: extr x4, x7, x6, #1 +; CHECK-GI-NEXT: lsr x1, x17, x1 +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: sub x9, x9, x12 +; CHECK-GI-NEXT: orr x16, x2, x16 +; CHECK-GI-NEXT: lsr x17, x17, x11 +; CHECK-GI-NEXT: lsl x9, x3, x9 +; CHECK-GI-NEXT: csel x16, x16, x1, lo +; CHECK-GI-NEXT: tst x18, #0x7f +; CHECK-GI-NEXT: sub x18, x12, #64 +; CHECK-GI-NEXT: lsr x1, x4, x12 +; CHECK-GI-NEXT: csel x16, x0, x16, eq +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: lsr x11, x3, x18 +; CHECK-GI-NEXT: csel x17, x17, xzr, lo +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: orr x9, x1, x9 +; CHECK-GI-NEXT: lsr x18, x3, x12 +; CHECK-GI-NEXT: orr x0, x8, x16 +; CHECK-GI-NEXT: csel x9, x9, x11, lo +; CHECK-GI-NEXT: tst 
x10, #0x7f +; CHECK-GI-NEXT: orr x1, x15, x17 +; CHECK-GI-NEXT: csel x9, x4, x9, eq +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: csel x10, x18, xzr, lo +; CHECK-GI-NEXT: orr x2, x14, x9 +; CHECK-GI-NEXT: orr x3, x13, x10 ; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret entry: @@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-LABEL: fshr_v2i128: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr x9, [sp] -; CHECK-GI-NEXT: lsl x12, x1, #1 -; CHECK-GI-NEXT: mov w11, #127 // =0x7f -; CHECK-GI-NEXT: mov w14, #64 // =0x40 -; CHECK-GI-NEXT: lsl x15, x0, #1 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: mov w12, #64 // =0x40 +; CHECK-GI-NEXT: lsl x13, x0, #1 +; CHECK-GI-NEXT: extr x14, x1, x0, #63 ; CHECK-GI-NEXT: ldr x8, [sp, #16] -; CHECK-GI-NEXT: bic x13, x11, x9 -; CHECK-GI-NEXT: orr x12, x12, x0, lsr #63 -; CHECK-GI-NEXT: lsl x1, x3, #1 -; CHECK-GI-NEXT: sub x17, x14, x13 -; CHECK-GI-NEXT: sub x18, x13, #64 -; CHECK-GI-NEXT: lsl x3, x15, x13 -; CHECK-GI-NEXT: lsr x17, x15, x17 -; CHECK-GI-NEXT: lsl x0, x12, x13 -; CHECK-GI-NEXT: lsl x15, x15, x18 -; CHECK-GI-NEXT: bic x11, x11, x8 +; CHECK-GI-NEXT: bic x11, x10, x9 +; CHECK-GI-NEXT: mvn x16, x9 +; CHECK-GI-NEXT: and x15, x9, #0x7f +; CHECK-GI-NEXT: sub x17, x12, x11 +; CHECK-GI-NEXT: sub x18, x11, #64 +; CHECK-GI-NEXT: lsl x0, x14, x11 +; CHECK-GI-NEXT: lsr x17, x13, x17 +; CHECK-GI-NEXT: lsl x1, x13, x11 +; CHECK-GI-NEXT: lsl x13, x13, x18 +; CHECK-GI-NEXT: bic x10, x10, x8 ; CHECK-GI-NEXT: lsl x18, x2, #1 -; CHECK-GI-NEXT: cmp x13, #64 +; CHECK-GI-NEXT: cmp x11, #64 ; CHECK-GI-NEXT: orr x17, x17, x0 -; CHECK-GI-NEXT: orr x13, x1, x2, lsr #63 -; CHECK-GI-NEXT: mvn x16, x9 -; CHECK-GI-NEXT: csel x15, x17, x15, lo -; CHECK-GI-NEXT: sub x17, x14, x11 -; CHECK-GI-NEXT: csel x0, x3, xzr, lo +; CHECK-GI-NEXT: extr x11, x3, x2, #63 +; CHECK-GI-NEXT: csel x0, x1, xzr, lo +; CHECK-GI-NEXT: csel x13, x17, x13, lo +; 
CHECK-GI-NEXT: sub x17, x12, x10 ; CHECK-GI-NEXT: tst x16, #0x7f -; CHECK-GI-NEXT: sub x16, x11, #64 +; CHECK-GI-NEXT: sub x16, x10, #64 ; CHECK-GI-NEXT: lsr x17, x18, x17 -; CHECK-GI-NEXT: lsl x2, x13, x11 -; CHECK-GI-NEXT: lsl x1, x18, x11 -; CHECK-GI-NEXT: csel x12, x12, x15, eq -; CHECK-GI-NEXT: lsl x15, x18, x16 -; CHECK-GI-NEXT: and x10, x9, #0x7f -; CHECK-GI-NEXT: cmp x11, #64 -; CHECK-GI-NEXT: mvn x11, x8 +; CHECK-GI-NEXT: lsl x2, x11, x10 +; CHECK-GI-NEXT: lsl x1, x18, x10 +; CHECK-GI-NEXT: csel x13, x14, x13, eq +; CHECK-GI-NEXT: lsl x14, x18, x16 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: mvn x10, x8 ; CHECK-GI-NEXT: orr x16, x17, x2 ; CHECK-GI-NEXT: csel x17, x1, xzr, lo -; CHECK-GI-NEXT: csel x15, x16, x15, lo -; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x14, x10 -; CHECK-GI-NEXT: sub x16, x10, #64 -; CHECK-GI-NEXT: lsr x18, x4, x10 -; CHECK-GI-NEXT: lsl x11, x5, x11 -; CHECK-GI-NEXT: csel x13, x13, x15, eq -; CHECK-GI-NEXT: lsr x15, x5, x16 +; CHECK-GI-NEXT: csel x14, x16, x14, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: sub x10, x12, x15 +; CHECK-GI-NEXT: sub x16, x15, #64 +; CHECK-GI-NEXT: lsr x18, x4, x15 +; CHECK-GI-NEXT: lsl x10, x5, x10 +; CHECK-GI-NEXT: csel x11, x11, x14, eq +; CHECK-GI-NEXT: lsr x14, x5, x16 ; CHECK-GI-NEXT: and x1, x8, #0x7f -; CHECK-GI-NEXT: orr x11, x18, x11 -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x16, x5, x10 -; CHECK-GI-NEXT: csel x11, x11, x15, lo +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x16, x5, x15 +; CHECK-GI-NEXT: orr x10, x18, x10 +; CHECK-GI-NEXT: csel x10, x10, x14, lo ; CHECK-GI-NEXT: tst x9, #0x7f -; CHECK-GI-NEXT: sub x9, x14, x1 -; CHECK-GI-NEXT: sub x14, x1, #64 -; CHECK-GI-NEXT: lsr x15, x6, x1 +; CHECK-GI-NEXT: sub x9, x12, x1 +; CHECK-GI-NEXT: sub x12, x1, #64 +; CHECK-GI-NEXT: lsr x14, x6, x1 ; CHECK-GI-NEXT: lsl x9, x7, x9 -; CHECK-GI-NEXT: csel x11, x4, x11, eq -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x10, x7, x14 -; CHECK-GI-NEXT: 
csel x14, x16, xzr, lo -; CHECK-GI-NEXT: orr x9, x15, x9 +; CHECK-GI-NEXT: csel x10, x4, x10, eq +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x12, x7, x12 +; CHECK-GI-NEXT: csel x15, x16, xzr, lo +; CHECK-GI-NEXT: orr x9, x14, x9 ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: lsr x15, x7, x1 -; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: lsr x14, x7, x1 +; CHECK-GI-NEXT: csel x9, x9, x12, lo ; CHECK-GI-NEXT: tst x8, #0x7f ; CHECK-GI-NEXT: csel x8, x6, x9, eq ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: orr x0, x0, x11 -; CHECK-GI-NEXT: csel x9, x15, xzr, lo -; CHECK-GI-NEXT: orr x1, x12, x14 +; CHECK-GI-NEXT: orr x0, x0, x10 +; CHECK-GI-NEXT: csel x9, x14, xzr, lo +; CHECK-GI-NEXT: orr x1, x13, x15 ; CHECK-GI-NEXT: orr x2, x17, x8 -; CHECK-GI-NEXT: orr x3, x13, x9 +; CHECK-GI-NEXT: orr x3, x11, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) @@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotl_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x3, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x1, #61 +; CHECK-GI-NEXT: extr x9, x3, x2, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x2, x2, x3, #61 ; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x3, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>) @@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotr_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x3, #61 -; CHECK-GI-NEXT: lsl x10, x0, #61 -; CHECK-GI-NEXT: lsl x11, x2, #61 -; 
CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x2, x9, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x1, lsr #3 -; CHECK-GI-NEXT: orr x3, x11, x3, lsr #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #3 +; CHECK-GI-NEXT: extr x9, x3, x2, #3 +; CHECK-GI-NEXT: extr x1, x0, x1, #3 +; CHECK-GI-NEXT: extr x3, x2, x3, #3 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x2, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>) @@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) { ; ; CHECK-GI-LABEL: fshl_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x7, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x5, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x3, x3, x2, #61 +; CHECK-GI-NEXT: extr x2, x2, x7, #61 ; CHECK-GI-NEXT: mov x0, x8 ; CHECK-GI-NEXT: ret entry: @@ -4480,29 +4445,15 @@ entry: } define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) { -; CHECK-SD-LABEL: fshr_v2i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x5, x4, #3 -; CHECK-SD-NEXT: extr x9, x7, x6, #3 -; CHECK-SD-NEXT: extr x1, x0, x5, #3 -; CHECK-SD-NEXT: extr x3, x2, x7, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: mov x2, x9 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_v2i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x7, #61 -; CHECK-GI-NEXT: lsr x10, x5, #3 -; CHECK-GI-NEXT: lsr x11, x7, #3 -; CHECK-GI-NEXT: orr x8, x8, x4, lsr #3 -; CHECK-GI-NEXT: orr x9, x9, x6, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x0, lsl #61 -; CHECK-GI-NEXT: orr x3, x11, x2, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: mov x2, x9 -; 
CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_v2i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x5, x4, #3 +; CHECK-NEXT: extr x9, x7, x6, #3 +; CHECK-NEXT: extr x1, x0, x5, #3 +; CHECK-NEXT: extr x3, x2, x7, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, x9 +; CHECK-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>) ret <2 x i128> %d diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll index f9fd2ad..90fb102 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; 
CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 1cb92e4..87b1108 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -559,20 +559,18 @@ define i128 @ui128_7(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: subs x10, x0, x9 ; CHECK-GI-NEXT: sbc x11, x1, x8 -; CHECK-GI-NEXT: lsl x12, x11, #63 +; CHECK-GI-NEXT: extr x10, x11, x10, #1 ; CHECK-GI-NEXT: lsr x11, x11, #1 -; CHECK-GI-NEXT: orr x10, x12, x10, lsr #1 ; CHECK-GI-NEXT: adds x9, x10, x9 +; CHECK-GI-NEXT: mov w10, #7 // =0x7 ; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x10, x8, #62 +; CHECK-GI-NEXT: extr x9, x8, x9, #2 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #2 -; CHECK-GI-NEXT: mov w10, #7 // =0x7 -; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: umulh x10, x9, x10 ; CHECK-GI-NEXT: 
lsl x11, x9, #3 -; CHECK-GI-NEXT: sub x8, x12, x8 +; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: sub x9, x11, x9 +; CHECK-GI-NEXT: sub x8, x12, x8 ; CHECK-GI-NEXT: subs x0, x0, x9 ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: sbc x1, x1, x8 @@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x10, x11, x12 ; CHECK-GI-NEXT: add x8, x8, x14 ; CHECK-GI-NEXT: add x8, x8, x10 -; CHECK-GI-NEXT: lsl x10, x8, #60 -; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #4 ; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: extr x9, x8, x9, #4 +; CHECK-GI-NEXT: lsr x8, x8, #4 ; CHECK-GI-NEXT: umulh x11, x9, x10 ; CHECK-GI-NEXT: mul x9, x9, x10 ; CHECK-GI-NEXT: madd x8, x8, x10, x11 @@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: sbc x14, x1, x12 ; CHECK-GI-NEXT: add x8, x8, x13 ; CHECK-GI-NEXT: subs x13, x2, x10 -; CHECK-GI-NEXT: lsl x15, x14, #63 -; CHECK-GI-NEXT: sbc x16, x3, x8 +; CHECK-GI-NEXT: extr x9, x14, x9, #1 +; CHECK-GI-NEXT: sbc x15, x3, x8 ; CHECK-GI-NEXT: lsr x14, x14, #1 -; CHECK-GI-NEXT: orr x9, x15, x9, lsr #1 -; CHECK-GI-NEXT: lsl x15, x16, #63 -; CHECK-GI-NEXT: orr x13, x15, x13, lsr #1 +; CHECK-GI-NEXT: extr x13, x15, x13, #1 ; CHECK-GI-NEXT: adds x9, x9, x11 -; CHECK-GI-NEXT: lsr x11, x16, #1 +; CHECK-GI-NEXT: lsr x11, x15, #1 ; CHECK-GI-NEXT: adc x12, x14, x12 ; CHECK-GI-NEXT: adds x10, x13, x10 -; CHECK-GI-NEXT: lsl x13, x12, #62 -; CHECK-GI-NEXT: lsr x12, x12, #2 -; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x11, x8, #62 -; CHECK-GI-NEXT: orr x9, x13, x9, lsr #2 +; CHECK-GI-NEXT: extr x9, x12, x9, #2 ; CHECK-GI-NEXT: mov w13, #7 // =0x7 +; CHECK-GI-NEXT: adc x8, x11, x8 +; CHECK-GI-NEXT: lsr x11, x12, #2 +; CHECK-GI-NEXT: extr x10, x8, x10, #2 +; CHECK-GI-NEXT: umulh x12, x9, x13 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: lsl x14, x12, #3 -; CHECK-GI-NEXT: orr x10, x11, x10, lsr #2 -; CHECK-GI-NEXT: umulh x11, x9, x13 +; 
CHECK-GI-NEXT: lsl x14, x11, #3 ; CHECK-GI-NEXT: lsl x15, x9, #3 -; CHECK-GI-NEXT: sub x12, x14, x12 -; CHECK-GI-NEXT: lsl x16, x8, #3 ; CHECK-GI-NEXT: umulh x13, x10, x13 +; CHECK-GI-NEXT: lsl x16, x8, #3 +; CHECK-GI-NEXT: sub x11, x14, x11 ; CHECK-GI-NEXT: lsl x14, x10, #3 ; CHECK-GI-NEXT: sub x9, x15, x9 ; CHECK-GI-NEXT: sub x8, x16, x8 ; CHECK-GI-NEXT: subs x0, x0, x9 +; CHECK-GI-NEXT: add x11, x11, x12 ; CHECK-GI-NEXT: sub x10, x14, x10 -; CHECK-GI-NEXT: add x11, x12, x11 ; CHECK-GI-NEXT: sbc x1, x1, x11 ; CHECK-GI-NEXT: subs x2, x2, x10 ; CHECK-GI-NEXT: add x8, x8, x13 @@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov x10, #23593 // =0x5c29 ; CHECK-GI-NEXT: mov x8, #62914 // =0xf5c2 -; CHECK-GI-NEXT: sub x18, x0, x0 +; CHECK-GI-NEXT: and x5, xzr, #0x1 ; CHECK-GI-NEXT: movk x10, #49807, lsl #16 ; CHECK-GI-NEXT: movk x8, #23592, lsl #16 +; CHECK-GI-NEXT: umulh x18, x0, xzr ; CHECK-GI-NEXT: movk x10, #10485, lsl #32 ; CHECK-GI-NEXT: movk x8, #49807, lsl #32 ; CHECK-GI-NEXT: movk x10, #36700, lsl #48 @@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: umulh x15, x1, x10 ; CHECK-GI-NEXT: cset w12, hs ; CHECK-GI-NEXT: cmn x11, x13 -; CHECK-GI-NEXT: and x11, x12, #0x1 -; CHECK-GI-NEXT: umulh x16, x0, x8 -; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: sub x13, x0, x0 ; CHECK-GI-NEXT: and x12, x12, #0x1 -; CHECK-GI-NEXT: add x14, x14, x18 -; CHECK-GI-NEXT: add x11, x11, x12 -; CHECK-GI-NEXT: and x12, xzr, #0x1 +; CHECK-GI-NEXT: umulh x16, x0, x8 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: add x13, x14, x13 +; CHECK-GI-NEXT: and x11, x11, #0x1 +; CHECK-GI-NEXT: and x14, xzr, #0x1 ; CHECK-GI-NEXT: umulh x9, xzr, x10 -; CHECK-GI-NEXT: adds x14, x14, x15 -; CHECK-GI-NEXT: and x15, xzr, #0x1 +; CHECK-GI-NEXT: add x11, x12, x11 +; CHECK-GI-NEXT: add x12, x5, x14 +; CHECK-GI-NEXT: adds x13, x13, x15 ; CHECK-GI-NEXT: umulh x17, x1, 
x8 -; CHECK-GI-NEXT: cset w4, hs -; CHECK-GI-NEXT: add x15, x12, x15 -; CHECK-GI-NEXT: adds x12, x14, x16 -; CHECK-GI-NEXT: and x4, x4, #0x1 -; CHECK-GI-NEXT: mul x18, x3, x10 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: adds x12, x12, x11 -; CHECK-GI-NEXT: add x11, x15, x4 ; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: cset w15, hs -; CHECK-GI-NEXT: mul x5, x2, x8 -; CHECK-GI-NEXT: add x11, x11, x14 -; CHECK-GI-NEXT: and x14, x15, #0x1 -; CHECK-GI-NEXT: add x17, x9, x17 -; CHECK-GI-NEXT: add x14, x11, x14 -; CHECK-GI-NEXT: mov w11, #100 // =0x64 -; CHECK-GI-NEXT: umulh x13, x0, xzr -; CHECK-GI-NEXT: umulh x16, x2, x10 -; CHECK-GI-NEXT: adds x18, x18, x5 -; CHECK-GI-NEXT: mul x15, x3, x8 -; CHECK-GI-NEXT: add x13, x17, x13 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x10, x3, x10 -; CHECK-GI-NEXT: add x13, x13, x14 -; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: cmn x18, x16 -; CHECK-GI-NEXT: sub x18, x2, x2 -; CHECK-GI-NEXT: umulh x16, x2, x8 +; CHECK-GI-NEXT: adds x13, x13, x16 +; CHECK-GI-NEXT: mul x4, x3, x10 +; CHECK-GI-NEXT: add x12, x12, x14 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: add x15, x15, x18 +; CHECK-GI-NEXT: adds x11, x13, x11 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: mul x15, x2, x8 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: add x14, x9, x17 +; CHECK-GI-NEXT: sub x17, x2, x2 +; CHECK-GI-NEXT: umulh x16, x2, x10 +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: add x13, x14, x18 +; CHECK-GI-NEXT: add x12, x13, x12 ; CHECK-GI-NEXT: and x18, xzr, #0x1 -; CHECK-GI-NEXT: add x14, x17, x14 +; CHECK-GI-NEXT: mul x5, x3, x8 +; CHECK-GI-NEXT: extr x11, x12, x11, #4 +; CHECK-GI-NEXT: adds x13, x4, x15 +; CHECK-GI-NEXT: umulh x14, x3, x10 +; CHECK-GI-NEXT: cset w15, hs +; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: cmn x13, x16 +; CHECK-GI-NEXT: and x15, x15, #0x1 +; CHECK-GI-NEXT: 
umulh x13, x2, x8 +; CHECK-GI-NEXT: cset w16, hs +; CHECK-GI-NEXT: add x17, x5, x17 +; CHECK-GI-NEXT: and x16, x16, #0x1 ; CHECK-GI-NEXT: umulh x8, x3, x8 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: adds x14, x17, x14 ; CHECK-GI-NEXT: and x17, xzr, #0x1 -; CHECK-GI-NEXT: adds x10, x15, x10 -; CHECK-GI-NEXT: add x15, x17, x18 +; CHECK-GI-NEXT: add x16, x18, x17 ; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x18, x2, xzr +; CHECK-GI-NEXT: adds x13, x14, x13 +; CHECK-GI-NEXT: umulh x14, x2, xzr ; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: adds x10, x10, x16 -; CHECK-GI-NEXT: lsl x16, x13, #60 -; CHECK-GI-NEXT: add x15, x15, x17 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: adds x10, x10, x14 -; CHECK-GI-NEXT: and x14, x17, #0x1 +; CHECK-GI-NEXT: cset w18, hs +; CHECK-GI-NEXT: adds x13, x13, x15 +; CHECK-GI-NEXT: add x15, x16, x17 +; CHECK-GI-NEXT: and x16, x18, #0x1 ; CHECK-GI-NEXT: cset w17, hs ; CHECK-GI-NEXT: add x8, x9, x8 -; CHECK-GI-NEXT: add x14, x15, x14 -; CHECK-GI-NEXT: and x15, x17, #0x1 -; CHECK-GI-NEXT: orr x12, x16, x12, lsr #4 -; CHECK-GI-NEXT: add x9, x14, x15 -; CHECK-GI-NEXT: add x8, x8, x18 -; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: lsr x9, x13, #4 -; CHECK-GI-NEXT: umulh x14, x12, x11 -; CHECK-GI-NEXT: lsl x13, x8, #60 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: and x16, x17, #0x1 +; CHECK-GI-NEXT: lsr x9, x12, #4 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: umulh x17, x11, x10 +; CHECK-GI-NEXT: add x8, x8, x14 +; CHECK-GI-NEXT: add x8, x8, x15 +; CHECK-GI-NEXT: mul x11, x11, x10 +; CHECK-GI-NEXT: extr x12, x8, x13, #4 ; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: mul x12, x12, x11 -; CHECK-GI-NEXT: orr x10, x13, x10, lsr #4 -; CHECK-GI-NEXT: madd x9, x9, x11, x14 -; CHECK-GI-NEXT: umulh x13, x10, x11 -; CHECK-GI-NEXT: subs x0, x0, x12 -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: madd x9, x9, x10, x17 +; CHECK-GI-NEXT: umulh x13, x12, x10 +; CHECK-GI-NEXT: subs x0, x0, x11 +; 
CHECK-GI-NEXT: mul x12, x12, x10 ; CHECK-GI-NEXT: sbc x1, x1, x9 -; CHECK-GI-NEXT: madd x8, x8, x11, x13 -; CHECK-GI-NEXT: subs x2, x2, x10 +; CHECK-GI-NEXT: madd x8, x8, x10, x13 +; CHECK-GI-NEXT: subs x2, x2, x12 ; CHECK-GI-NEXT: sbc x3, x3, x8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index e4f9efa..0504959 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -351,7 +351,6 @@ define i64 @test_many_callee_arguments( ret i64 %ret } -; FIXME: The new lowering should avoid saves/restores in the probing loop. define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{ ; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes: ; CHECK: // %bb.0: @@ -389,16 +388,14 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s ; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size ; CHECK-NEWLOWERING-NEXT: mov x8, sp ; CHECK-NEWLOWERING-NEXT: sub x19, x8, x0 -; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536 ; CHECK-NEWLOWERING-NEXT: mov x0, x19 ; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536 ; CHECK-NEWLOWERING-NEXT: cmp sp, x19 ; CHECK-NEWLOWERING-NEXT: b.le .LBB7_3 ; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 -; CHECK-NEWLOWERING-NEXT: mov x0, x19 ; CHECK-NEWLOWERING-NEXT: str xzr, [sp] -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore ; CHECK-NEWLOWERING-NEXT: b .LBB7_1 ; CHECK-NEWLOWERING-NEXT: .LBB7_3: ; CHECK-NEWLOWERING-NEXT: mov sp, x19 diff --git a/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll 
b/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll index 63c6533..a5b7612 100644 --- a/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll +++ b/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll @@ -64,6 +64,6 @@ define i64 @test_sme_calling_convention_x2() nounwind { ret i64 %pstate.sm } -declare void @__arm_tpidr2_save() -declare i64 @__arm_get_current_vg() -declare {i64, i64} @__arm_sme_state() +declare aarch64_sme_preservemost_from_x0 void @__arm_tpidr2_save() +declare aarch64_sme_preservemost_from_x1 i64 @__arm_get_current_vg() +declare aarch64_sme_preservemost_from_x2 {i64, i64} @__arm_sme_state() diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll index 18ea07e..c753e9c 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll @@ -228,65 +228,34 @@ exit: ret void } -; FIXME: The codegen for this case could be improved (by tuning weights). -; Here the ZA save has been hoisted out of the conditional, but would be better -; to sink it. define void @cond_private_za_call(i1 %cond) "aarch64_inout_za" nounwind { -; CHECK-LABEL: cond_private_za_call: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: tbz w0, #0, .LBB3_4 -; CHECK-NEXT: // %bb.1: // %private_za_call -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB3_3 -; CHECK-NEXT: // %bb.2: // %private_za_call -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB3_3: // %private_za_call -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB3_4: // %exit -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: b shared_za_call -; -; CHECK-NEWLOWERING-LABEL: cond_private_za_call: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB3_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %private_za_call -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: .LBB3_2: // %exit -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB3_4 -; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB3_4: // %exit -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, 
x29 -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: b shared_za_call +; CHECK-COMMON-LABEL: cond_private_za_call: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: tbz w0, #0, .LBB3_4 +; CHECK-COMMON-NEXT: // %bb.1: // %private_za_call +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB3_3 +; CHECK-COMMON-NEXT: // %bb.2: // %private_za_call +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB3_3: // %private_za_call +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: .LBB3_4: // %exit +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: b shared_za_call br i1 %cond, label %private_za_call, label %exit private_za_call: @@ -910,7 +879,7 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin ; CHECK-NEWLOWERING-LABEL: loop_with_external_entry: ; CHECK-NEWLOWERING: // %bb.0: // %entry ; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEWLOWERING-NEXT: mov x29, sp ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 ; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 @@ -923,23 +892,27 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin ; CHECK-NEWLOWERING-NEXT: // %bb.1: // %init ; CHECK-NEWLOWERING-NEXT: bl shared_za_call ; CHECK-NEWLOWERING-NEXT: .LBB11_2: // %loop.preheader -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16 +; CHECK-NEWLOWERING-NEXT: b .LBB11_4 ; CHECK-NEWLOWERING-NEXT: .LBB11_3: // %loop +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEWLOWERING-NEXT: tbz w19, #0, .LBB11_6 +; CHECK-NEWLOWERING-NEXT: .LBB11_4: // %loop ; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB11_3 -; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit ; CHECK-NEWLOWERING-NEXT: smstart za ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_6 -; CHECK-NEWLOWERING-NEXT: // %bb.5: // %exit +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB11_3 +; CHECK-NEWLOWERING-NEXT: // %bb.5: // %loop +; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB11_4 Depth=1 ; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore +; CHECK-NEWLOWERING-NEXT: b .LBB11_3 ; CHECK-NEWLOWERING-NEXT: .LBB11_6: // %exit -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte 
Folded Reload ; CHECK-NEWLOWERING-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index 3f35cb5..dcdc56c 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -63,25 +63,17 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe ; CHECK-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr] ; CHECK-NEXT: bl __cxa_throw ; CHECK-NEXT: .Ltmp1: // EH_LABEL -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_4 -; CHECK-NEXT: // %bb.3: // %throw_exception -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB0_4: // %throw_exception -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: // %bb.5: // %throw_fail -; CHECK-NEXT: .LBB0_6: // %unwind_dtors +; CHECK-NEXT: // %bb.3: // %throw_fail +; CHECK-NEXT: .LBB0_4: // %unwind_dtors ; CHECK-NEXT: .Ltmp2: // EH_LABEL ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_8 -; CHECK-NEXT: // %bb.7: // %unwind_dtors +; CHECK-NEXT: cbnz x8, .LBB0_6 +; CHECK-NEXT: // %bb.5: // %unwind_dtors ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB0_8: // %unwind_dtors +; CHECK-NEXT: .LBB0_6: // %unwind_dtors ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: sub x8, x29, #16 diff --git a/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll new file mode 100644 index 0000000..0306b27 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll @@ -0,0 +1,296 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s + +; This test case was generated by lowering mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir to LLVM IR. 
+; The actual contents of the function are not that important. The main interesting quality here is that many blocks +; don't directly use ZA. The only blocks that require ZA are the MOPA (and loads/stores) in the inner loop, and the +; `printMemrefF32()` call in the exit block. +; +; If ZA states are not propagated in the MachineSMEABIPass, block %48 (which is within the outer loop) will +; have an edge to block %226 (the exit block), which requires ZA in the "saved" state, and an edge to block %51 +; (which has no preference on ZA state). This means block %48 will also end up in the locally saved state. +; This is not really what we want, as it means we will save/restore ZA in the outer loop. We can fix this by +; propagating the "active" state from the inner loop through basic blocks with no preference, to ensure the outer +; loop is in the "active" state too. +; +; If done correctly, the only ZA save/restore should be in the exit block (with all other blocks in the active state). + +define void @matmul(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, ptr %14, ptr %15, i64 %16, i64 %17, i64 %18, i64 %19, i64 %20) #0 { +; Check for a ZA zero in the entry block, then no uses of TPIDR2_EL0 (for ZA saves/restores) +; until the exit block (which contains the call to printMemrefF32).
+; +; CHECK-LABEL: matmul: +; CHECK: zero {za} +; CHECK-NOT: TPIDR2_EL0 +; CHECK: msr TPIDR2_EL0, x{{.*}} +; CHECK-NOT: .LBB{{.*}} +; CHECK: bl printMemrefF32 + %22 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %14, 0 + %23 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %22, ptr %15, 1 + %24 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %23, i64 %16, 2 + %25 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %24, i64 %17, 3, 0 + %26 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %25, i64 %19, 4, 0 + %27 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %26, i64 %18, 3, 1 + %28 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %27, i64 %20, 4, 1 + %29 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %7, 0 + %30 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %29, ptr %8, 1 + %31 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %30, i64 %9, 2 + %32 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %31, i64 %10, 3, 0 + %33 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %32, i64 %12, 4, 0 + %34 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %33, i64 %11, 3, 1 + %35 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %34, i64 %13, 4, 1 + %36 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %0, 0 + %37 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %36, ptr %1, 1 + %38 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %37, i64 %2, 2 + %39 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %38, i64 %3, 3, 0 + %40 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %39, i64 %5, 4, 0 + %41 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %40, i64 %4, 3, 1 + %42 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %41, i64 %6, 4, 1 + %43 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0 + %44 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1 + %45 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 
3, 1 + %46 = call i64 @llvm.vscale.i64() + %47 = mul i64 %46, 4 + br label %48 + +48: ; preds = %224, %21 + %49 = phi i64 [ %225, %224 ], [ 0, %21 ] + %50 = icmp slt i64 %49, %43 + br i1 %50, label %51, label %226 + +51: ; preds = %48 + %52 = sub i64 %43, %49 + %53 = call i64 @llvm.smin.i64(i64 %47, i64 %52) + %54 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() + %55 = trunc i64 %53 to i32 + %56 = insertelement <vscale x 4 x i32> poison, i32 %55, i32 0 + %57 = shufflevector <vscale x 4 x i32> %56, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer + %58 = icmp slt <vscale x 4 x i32> %54, %57 + br label %59 + +59: ; preds = %222, %51 + %60 = phi i64 [ %223, %222 ], [ 0, %51 ] + %61 = icmp slt i64 %60, %45 + br i1 %61, label %62, label %224 + +62: ; preds = %59 + %63 = sub i64 %45, %60 + %64 = call i64 @llvm.smin.i64(i64 %47, i64 %63) + %65 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 0 + %66 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 1 + %67 = insertvalue { ptr, ptr, i64 } poison, ptr %65, 0 + %68 = insertvalue { ptr, ptr, i64 } %67, ptr %66, 1 + %69 = insertvalue { ptr, ptr, i64 } %68, i64 0, 2 + %70 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 2 + %71 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 0 + %72 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 1 + %73 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 0 + %74 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 1 + %75 = mul nsw i64 %49, %73 + %76 = add i64 %70, %75 + %77 = mul nsw i64 %60, %74 + %78 = add i64 %76, %77 + %79 = extractvalue { ptr, ptr, i64 } %69, 0 + %80 = extractvalue { ptr, ptr, i64 } %69, 1 + %81 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %79, 0 + %82 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %81, ptr %80, 1 + %83 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %82, i64 %78, 2 + %84 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x 
i64] } %83, i64 %53, 3, 0 + %85 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %84, i64 %73, 4, 0 + %86 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %85, i64 %64, 3, 1 + %87 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %86, i64 %74, 4, 1 + %88 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() + %89 = trunc i64 %64 to i32 + %90 = insertelement <vscale x 4 x i32> poison, i32 %89, i32 0 + %91 = shufflevector <vscale x 4 x i32> %90, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer + %92 = icmp slt <vscale x 4 x i32> %88, %91 + br label %93 + +93: ; preds = %220, %62 + %94 = phi i64 [ %221, %220 ], [ 0, %62 ] + %95 = icmp slt i64 %94, %44 + br i1 %95, label %96, label %222 + +96: ; preds = %93 + %97 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 0 + %98 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 1 + %99 = insertvalue { ptr, ptr, i64 } poison, ptr %97, 0 + %100 = insertvalue { ptr, ptr, i64 } %99, ptr %98, 1 + %101 = insertvalue { ptr, ptr, i64 } %100, i64 0, 2 + %102 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 2 + %103 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0 + %104 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1 + %105 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 0 + %106 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 1 + %107 = mul nsw i64 %49, %105 + %108 = add i64 %102, %107 + %109 = mul nsw i64 %94, %106 + %110 = add i64 %108, %109 + %111 = extractvalue { ptr, ptr, i64 } %101, 0 + %112 = extractvalue { ptr, ptr, i64 } %101, 1 + %113 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %111, 0 + %114 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %113, ptr %112, 1 + %115 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %114, i64 %110, 2 + %116 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %115, i64 %53, 3, 0 + %117 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %116, 
i64 %105, 4, 0 + br label %118 + +118: ; preds = %133, %96 + %119 = phi i64 [ %135, %133 ], [ 0, %96 ] + %120 = phi <vscale x 4 x float> [ %134, %133 ], [ poison, %96 ] + %121 = icmp slt i64 %119, %47 + br i1 %121, label %122, label %136 + +122: ; preds = %118 + %123 = extractelement <vscale x 4 x i1> %58, i64 %119 + br i1 %123, label %124, label %133 + +124: ; preds = %122 + %125 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 1 + %126 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 2 + %127 = getelementptr float, ptr %125, i64 %126 + %128 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 4, 0 + %129 = mul nuw nsw i64 %119, %128 + %130 = getelementptr inbounds nuw float, ptr %127, i64 %129 + %131 = load float, ptr %130, align 4 + %132 = insertelement <vscale x 4 x float> %120, float %131, i64 %119 + br label %133 + +133: ; preds = %124, %122 + %134 = phi <vscale x 4 x float> [ %132, %124 ], [ %120, %122 ] + %135 = add i64 %119, 1 + br label %118 + +136: ; preds = %118 + %137 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 0 + %138 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 1 + %139 = insertvalue { ptr, ptr, i64 } poison, ptr %137, 0 + %140 = insertvalue { ptr, ptr, i64 } %139, ptr %138, 1 + %141 = insertvalue { ptr, ptr, i64 } %140, i64 0, 2 + %142 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 2 + %143 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 0 + %144 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 1 + %145 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 0 + %146 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 1 + %147 = mul nsw i64 %94, %145 + %148 = add i64 %142, %147 + %149 = mul nsw i64 %60, %146 + %150 = add i64 %148, %149 + %151 = extractvalue { ptr, ptr, i64 } %141, 0 + %152 = extractvalue { ptr, ptr, i64 } %141, 1 + %153 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %151, 0 + %154 = 
insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %153, ptr %152, 1 + %155 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %154, i64 %150, 2 + %156 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %155, i64 %64, 3, 0 + %157 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %156, i64 %146, 4, 0 + br label %158 + +158: ; preds = %173, %136 + %159 = phi i64 [ %175, %173 ], [ 0, %136 ] + %160 = phi <vscale x 4 x float> [ %174, %173 ], [ poison, %136 ] + %161 = icmp slt i64 %159, %47 + br i1 %161, label %162, label %176 + +162: ; preds = %158 + %163 = extractelement <vscale x 4 x i1> %92, i64 %159 + br i1 %163, label %164, label %173 + +164: ; preds = %162 + %165 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 1 + %166 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 2 + %167 = getelementptr float, ptr %165, i64 %166 + %168 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 4, 0 + %169 = mul nuw nsw i64 %159, %168 + %170 = getelementptr inbounds nuw float, ptr %167, i64 %169 + %171 = load float, ptr %170, align 4 + %172 = insertelement <vscale x 4 x float> %160, float %171, i64 %159 + br label %173 + +173: ; preds = %164, %162 + %174 = phi <vscale x 4 x float> [ %172, %164 ], [ %160, %162 ] + %175 = add i64 %159, 1 + br label %158 + +176: ; preds = %158 + %177 = trunc i64 %64 to i32 + br label %178 + +178: ; preds = %181, %176 + %179 = phi i64 [ %202, %181 ], [ 0, %176 ] + %180 = icmp slt i64 %179, %47 + br i1 %180, label %181, label %203 + +181: ; preds = %178 + %182 = icmp ult i64 %179, %53 + %183 = sext i1 %182 to i32 + %184 = and i32 %183, %177 + %185 = sext i32 %184 to i64 + %186 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() + %187 = trunc i64 %185 to i32 + %188 = insertelement <vscale x 4 x i32> poison, i32 %187, i32 0 + %189 = shufflevector <vscale x 4 x i32> %188, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer + %190 = icmp slt <vscale x 4 x i32> %186, %189 + %191 = extractvalue { ptr, 
ptr, i64, [2 x i64], [2 x i64] } %87, 1 + %192 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2 + %193 = getelementptr float, ptr %191, i64 %192 + %194 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0 + %195 = mul i64 %179, %194 + %196 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1 + %197 = mul i64 0, %196 + %198 = add i64 %195, %197 + %199 = getelementptr float, ptr %193, i64 %198 + %200 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %199, i32 4, <vscale x 4 x i1> %190, <vscale x 4 x float> poison) + %201 = trunc i64 %179 to i32 + call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 %201, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %200) + %202 = add i64 %179, 1 + br label %178 + +203: ; preds = %178 + call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> %58, <vscale x 4 x i1> %92, <vscale x 4 x float> %120, <vscale x 4 x float> %160) + %204 = call i64 @llvm.smin.i64(i64 %53, i64 %47) + br label %205 + +205: ; preds = %208, %203 + %206 = phi i64 [ %219, %208 ], [ 0, %203 ] + %207 = icmp slt i64 %206, %204 + br i1 %207, label %208, label %220 + +208: ; preds = %205 + %209 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 1 + %210 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2 + %211 = getelementptr float, ptr %209, i64 %210 + %212 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0 + %213 = mul i64 %206, %212 + %214 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1 + %215 = mul i64 0, %214 + %216 = add i64 %213, %215 + %217 = getelementptr float, ptr %211, i64 %216 + %218 = trunc i64 %206 to i32 + call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %92, ptr %217, i32 0, i32 %218) + %219 = add i64 %206, 1 + br label %205 + +220: ; preds = %205 + %221 = add i64 %94, 1 + br label %93 + +222: ; preds = %93 + %223 = add i64 %60, %47 + br label %59 + +224: ; preds = %59 + %225 = add i64 %49, %47 + br label %48 + +226: ; 
preds = %48 + %227 = alloca { ptr, ptr, i64, [2 x i64], [2 x i64] }, i64 1, align 8 + store { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, ptr %227, align 8 + %228 = insertvalue { i64, ptr } { i64 2, ptr poison }, ptr %227, 1 + %229 = extractvalue { i64, ptr } %228, 0 + %230 = extractvalue { i64, ptr } %228, 1 + call void @printMemrefF32(i64 %229, ptr %230) + ret void +} + +declare void @printMemrefF32(i64, ptr) + +attributes #0 = { "aarch64_new_za" "aarch64_pstate_sm_body" } diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index 066ee3b..afd56d1 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -12,77 +12,41 @@ entry: } define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" { -; CHECK-LABEL: multi_bb_stpidr2_save_required: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: cbz w0, .LBB1_2 -; CHECK-NEXT: // %bb.1: // %use_b -; CHECK-NEXT: fmov s1, #4.00000000 -; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: b .LBB1_5 -; CHECK-NEXT: .LBB1_2: // %use_c -; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: bl cosf -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB1_4 -; CHECK-NEXT: // %bb.3: // %use_c -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB1_4: // %use_c -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB1_5: // %exit -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 
16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w30, -8 -; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB1_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b -; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000 -; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1 -; CHECK-NEWLOWERING-NEXT: b .LBB1_3 -; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %use_c -; CHECK-NEWLOWERING-NEXT: fmov s0, s1 -; CHECK-NEWLOWERING-NEXT: bl cosf -; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %exit -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_5 -; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: multi_bb_stpidr2_save_required: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16 +; CHECK-COMMON-NEXT: .cfi_offset w30, -8 +; CHECK-COMMON-NEXT: .cfi_offset w29, -16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: cbz w0, .LBB1_2 +; CHECK-COMMON-NEXT: // %bb.1: // %use_b +; CHECK-COMMON-NEXT: fmov s1, #4.00000000 +; CHECK-COMMON-NEXT: fadd s0, s0, s1 +; CHECK-COMMON-NEXT: b .LBB1_5 +; CHECK-COMMON-NEXT: .LBB1_2: // %use_c +; CHECK-COMMON-NEXT: fmov s0, s1 +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl cosf +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB1_4 +; CHECK-COMMON-NEXT: // %bb.3: // %use_c +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB1_4: // %use_c +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: .LBB1_5: // %exit +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret %cmp = icmp ne i32 %a, 0 br i1 %cmp, label %use_b, label %use_c @@ -155,7 +119,9 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 ; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 ; CHECK-NEWLOWERING-NEXT: mov x9, sp +; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 ; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536 ; CHECK-NEWLOWERING-NEXT: cmp sp, x9 @@ -166,9 +132,7 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; 
CHECK-NEWLOWERING-NEXT: .LBB2_3: ; CHECK-NEWLOWERING-NEXT: mov sp, x9 ; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp] -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 ; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5 ; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b ; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000 diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 2583a93..5b81f5d 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -426,3 +426,21 @@ define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwin call void %callee() ret void } + +define void @disable_tailcallopt(ptr %callee) "aarch64_inout_zt0" nounwind { +; CHECK-COMMON-LABEL: disable_tailcallopt: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #80 +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x19, sp +; CHECK-COMMON-NEXT: str zt0, [x19] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: blr x0 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x19] +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ret + tail call void %callee() + ret void +} |
