diff options
author | David Green <david.green@arm.com> | 2025-04-01 16:24:54 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-04-01 16:24:54 +0100 |
commit | 4cb41d136cd4e2caef724e35b337f888036f8645 (patch) | |
tree | 64553b1028a670bfba8e4412a23b16009a69992f | |
parent | ac55688482637ce625edaa8a25ad6eced8992a22 (diff) | |
download | llvm-4cb41d136cd4e2caef724e35b337f888036f8645.zip llvm-4cb41d136cd4e2caef724e35b337f888036f8645.tar.gz llvm-4cb41d136cd4e2caef724e35b337f888036f8645.tar.bz2 |
[AArch64] Prefer zip over ushll for anyext. (#133433)
Many CPUs have higher throughput for ZIP instructions than for USHLL. This
adds some TableGen patterns that prefer ZIP in anyext patterns.
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.td | 17 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/andorxor.ll | 12 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll | 3 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/bitcast.ll | 3 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/extbinopload.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/extract-subvec-combine.ll | 19 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/neon-bitcast.ll | 5 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/zext.ll | 2 |
10 files changed, 45 insertions, 26 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 6c61e3a..f291589 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6751,6 +6751,23 @@ def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))), (UZP2v4i32 V128:$Vn, V128:$Vm)>; +// extract_subvec(anyext) can use zip. Check for one use on the anyext, otherwise +// the extract_subvector can be free. +let HasOneUse = 1 in +def anyext_oneuse: PatFrag<(ops node:$src0), (anyext $src0)>; +def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 0))), + (ZIP1v8i8 V64:$Vn, V64:$Vn)>; +def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 0))), + (ZIP1v4i16 V64:$Vn, V64:$Vn)>; +def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 0))), + (ZIP1v2i32 V64:$Vn, V64:$Vn)>; +def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 4))), + (ZIP2v8i8 V64:$Vn, V64:$Vn)>; +def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 2))), + (ZIP2v4i16 V64:$Vn, V64:$Vn)>; +def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 1))), + (ZIP2v2i32 V64:$Vn, V64:$Vn)>; + //---------------------------------------------------------------------------- // AdvSIMD TBL/TBX instructions //---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 24f2549..0384848 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -433,8 +433,8 @@ define void @and_v4i8(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ldr s1, [x1] -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b 
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x0] @@ -482,8 +482,8 @@ define void @or_v4i8(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ldr s1, [x1] -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x0] @@ -531,8 +531,8 @@ define void @xor_v4i8(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ldr s1, [x1] -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x0] diff --git a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll index 864ddc2..90fa294 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll @@ -6,8 +6,7 @@ define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) { ; CHECK-LABEL: bitcast_v2i16_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-NEXT: ret %y = bitcast <2 x half> %x to <2 x i16> ret <2 x i16> %y diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index d9199ce2..d54cc4a 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -125,8 +125,7 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){ ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: add w8, w0, w1 ; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: 
ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: bitcast_i32_v2i16: diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index 72f4d58..82114d60 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -649,7 +649,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: str s1, [x4] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-NEXT: ldr s0, [x2] ; CHECK-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-NEXT: umov w9, v2.h[0] @@ -659,7 +659,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: mov v0.b[9], w10 ; CHECK-NEXT: umov w10, v2.h[3] ; CHECK-NEXT: ldr s2, [x1] -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: zip1 v2.8b, v2.8b, v2.8b ; CHECK-NEXT: mov v0.b[10], w9 ; CHECK-NEXT: add x9, x1, #4 ; CHECK-NEXT: mov v1.d[1], v2.d[0] diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll index 75d55773..368103b 100644 --- a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll +++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll @@ -104,12 +104,19 @@ define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind { ; Negative test, combine should not fire if sign extension is for a different width. 
define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind { -; CHECK-LABEL: sext_extract_zext_idx0_negtest: -; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v0.2s, v0.2s, #17 -; CHECK-NEXT: sshr v0.2s, v0.2s, #17 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sext_extract_zext_idx0_negtest: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: shl v0.2s, v0.2s, #17 +; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #17 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sext_extract_zext_idx0_negtest: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #17 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #17 +; CHECK-GI-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0) %sext_inreg_step0 = shl <2 x i32> %extract, <i32 17, i32 17> diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll index d06612e..07772b7 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll @@ -518,15 +518,14 @@ define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) { ; CHECK-LE-LABEL: bitcast_i32_to_v2i16: ; CHECK-LE: // %bb.0: ; CHECK-LE-NEXT: fmov s0, w0 -; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: bitcast_i32_to_v2i16: ; CHECK-BE: // %bb.0: ; CHECK-BE-NEXT: fmov s0, w0 ; CHECK-BE-NEXT: rev32 v0.4h, v0.4h -; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %ret = bitcast i32 %word to <2 x i16> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll index 8fac0e1..bda7ff9 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -88,9 +88,7 @@ define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: zip2 v0.4h, v0.4h, v0.4h ; CHECK-NEXT: ret %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2) ret <2 x i16> %ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index b52cbfe..45b7a27 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -303,7 +303,7 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) { ; BE-NEXT: add x8, x0, #2 ; BE-NEXT: ldr s0, [sp, #12] ; BE-NEXT: rev32 v0.8b, v0.8b -; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: ld1 { v0.b }[4], [x8] ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index e40b9cb..962486a 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -447,7 +447,7 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) { ; CHECK-SD-NEXT: mov w8, #1023 // =0x3ff ; CHECK-SD-NEXT: dup v2.2d, x8 ; CHECK-SD-NEXT: mov v0.s[1], w1 -; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-SD-NEXT: zip1 v3.2s, v1.2s, v1.2s ; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b |