diff options
author | Florian Hahn <flo@fhahn.com> | 2022-09-08 15:45:32 +0100 |
---|---|---|
committer | Florian Hahn <flo@fhahn.com> | 2022-09-08 15:45:32 +0100 |
commit | 39fcb4a2684ab1ded9ef62d618c1f26c5db30fd2 (patch) | |
tree | 673dd71dd620b3f33ff0b7615a3d3f46455386ca /llvm | |
parent | f87993915768772d113bfd524347ce4341b843cf (diff) | |
download | llvm-39fcb4a2684ab1ded9ef62d618c1f26c5db30fd2.zip llvm-39fcb4a2684ab1ded9ef62d618c1f26c5db30fd2.tar.gz llvm-39fcb4a2684ab1ded9ef62d618c1f26c5db30fd2.tar.bz2 |
[AArch64] Add tests for lowering trunc to i8 using tbl.
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/test/CodeGen/AArch64/trunc-to-tbl.ll | 96 |
1 files changed, 96 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
new file mode 100644
index 0000000..672a0f1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+; It's profitable to use a single tbl.4 instruction to lower the truncate in a loop; the assertions below expect three uzp1 instead of tbl.
+define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
+; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB0_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8, lsl #6
+; CHECK-NEXT: ldp q1, q0, [x9, #32]
+; CHECK-NEXT: ldp q3, q2, [x9]
+; CHECK-NEXT: uzp1.8h v0, v1, v0
+; CHECK-NEXT: uzp1.8h v1, v3, v2
+; CHECK-NEXT: uzp1.16b v0, v1, v0
+; CHECK-NEXT: str q0, [x1, x8, lsl #4]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back-edge
+  %gep.A = getelementptr inbounds <16 x i32>, ptr %A, i64 %iv ; address of the %iv-th <16 x i32> vector in %A
+  %l.A = load <16 x i32>, ptr %gep.A
+  %trunc = trunc <16 x i32> %l.A to <16 x i8> ; the truncate under test
+  %gep.dst = getelementptr inbounds <16 x i8>, ptr %dst, i64 %iv ; address of the %iv-th <16 x i8> vector in %dst
+  store <16 x i8> %trunc, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; NOTE(review): back-edge is taken only when %iv.next == 1000, so this exits after one iteration; confirm the successor order is intended
+
+exit:
+  ret void
+}
+
+; Not profitable to use tbl outside of a loop, as materializing the masks
+; requires more instructions.
+define void @trunc_v16i32_to_v16i8_no_loop(ptr %A, ptr %dst) {
+; CHECK-LABEL: trunc_v16i32_to_v16i8_no_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ldp q1, q0, [x0, #32]
+; CHECK-NEXT: ldp q3, q2, [x0]
+; CHECK-NEXT: uzp1.8h v0, v1, v0
+; CHECK-NEXT: uzp1.8h v1, v3, v2
+; CHECK-NEXT: uzp1.16b v0, v1, v0
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
+entry:
+  %l.A = load <16 x i32>, ptr %A
+  %trunc = trunc <16 x i32> %l.A to <16 x i8> ; the truncate under test, in straight-line code (single basic block)
+  store <16 x i8> %trunc, ptr %dst
+  ret void
+}
+
+; It's profitable to use a single tbl.2 instruction to lower the truncate in a loop; the assertions below expect uzp1 + xtn instead of tbl.
+define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
+; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB2_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8, lsl #5
+; CHECK-NEXT: ldp q1, q0, [x9]
+; CHECK-NEXT: uzp1.8h v0, v1, v0
+; CHECK-NEXT: xtn.8b v0, v0
+; CHECK-NEXT: str d0, [x1, x8, lsl #3]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB2_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; induction variable: 0 on entry, %iv.next on the back-edge
+  %gep.A = getelementptr inbounds <8 x i32>, ptr %A, i64 %iv ; address of the %iv-th <8 x i32> vector in %A
+  %l.A = load <8 x i32>, ptr %gep.A
+  %trunc = trunc <8 x i32> %l.A to <8 x i8> ; the truncate under test
+  %gep.dst = getelementptr inbounds <8 x i8>, ptr %dst, i64 %iv ; address of the %iv-th <8 x i8> vector in %dst
+  store <8 x i8> %trunc, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit ; NOTE(review): back-edge is taken only when %iv.next == 1000, so this exits after one iteration; confirm the successor order is intended
+
+exit:
+  ret void
+} |