diff options
Diffstat (limited to 'llvm/test/CodeGen/AArch64')
| -rw-r--r-- | llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 489 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/aarch64-smull.ll | 14 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir | 2 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll | 19 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll | 178 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/dup.ll | 12 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/load-zext-bitcast.ll | 45 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/sme-za-exceptions.ll | 2 |
8 files changed, 675 insertions, 86 deletions
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 0933e67..b54f262 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -749,12 +749,429 @@ for.body: ; preds = %for.body.preheader1 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none) %A, i8 noundef %B, i32 noundef %n) { +; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: cbz w2, .LBB6_3 +; CHECK-SD-NEXT: // %bb.1: // %iter.check +; CHECK-SD-NEXT: str x25, [sp, #-64]! // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w23, -40 +; CHECK-SD-NEXT: .cfi_offset w24, -48 +; CHECK-SD-NEXT: .cfi_offset w25, -64 +; CHECK-SD-NEXT: sxtb x9, w1 +; CHECK-SD-NEXT: cmp w2, #3 +; CHECK-SD-NEXT: mov w10, w2 +; CHECK-SD-NEXT: b.hi .LBB6_4 +; CHECK-SD-NEXT: // %bb.2: +; CHECK-SD-NEXT: mov x11, xzr +; CHECK-SD-NEXT: mov x8, xzr +; CHECK-SD-NEXT: b .LBB6_13 +; CHECK-SD-NEXT: .LBB6_3: +; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check +; CHECK-SD-NEXT: dup v0.2d, x9 +; CHECK-SD-NEXT: cmp w2, #16 +; CHECK-SD-NEXT: b.hs .LBB6_6 +; CHECK-SD-NEXT: // %bb.5: +; CHECK-SD-NEXT: mov x11, xzr +; CHECK-SD-NEXT: mov x8, xzr +; CHECK-SD-NEXT: b .LBB6_10 +; CHECK-SD-NEXT: .LBB6_6: // %vector.ph +; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 +; 
CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: and x12, x10, #0xc +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SD-NEXT: and x11, x10, #0xfffffff0 +; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v7.2d, #0000000000000000 +; CHECK-SD-NEXT: mov x15, x0 +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 +; CHECK-SD-NEXT: and x16, x10, #0xfffffff0 +; CHECK-SD-NEXT: movi v6.2d, #0000000000000000 +; CHECK-SD-NEXT: fmov x13, d0 +; CHECK-SD-NEXT: fmov x14, d0 +; CHECK-SD-NEXT: .LBB6_7: // %vector.body +; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SD-NEXT: ldr q17, [x15], #16 +; CHECK-SD-NEXT: subs x16, x16, #16 +; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0 +; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0 +; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0 +; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0 +; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0 +; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0 +; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0 +; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0 +; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0 +; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0 +; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0 +; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0 +; CHECK-SD-NEXT: fmov x17, d21 +; CHECK-SD-NEXT: mov x2, v21.d[1] +; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0 +; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0 +; CHECK-SD-NEXT: fmov x18, d22 +; CHECK-SD-NEXT: fmov x1, d17 +; CHECK-SD-NEXT: fmov x3, d23 +; CHECK-SD-NEXT: fmov x21, d20 +; CHECK-SD-NEXT: fmov x22, d18 +; CHECK-SD-NEXT: fmov x19, d21 +; CHECK-SD-NEXT: mul x17, x13, x17 +; CHECK-SD-NEXT: mov x4, v22.d[1] +; CHECK-SD-NEXT: fmov x24, d19 +; CHECK-SD-NEXT: mov x5, v23.d[1] +; CHECK-SD-NEXT: mov x6, v21.d[1] +; CHECK-SD-NEXT: mov x7, v20.d[1] +; CHECK-SD-NEXT: mov x20, v18.d[1] +; CHECK-SD-NEXT: mov x23, v19.d[1] +; CHECK-SD-NEXT: mov x25, v17.d[1] +; CHECK-SD-NEXT: mul x18, x14, x18 +; CHECK-SD-NEXT: mul 
x1, x13, x1 +; CHECK-SD-NEXT: fmov d17, x17 +; CHECK-SD-NEXT: mul x3, x13, x3 +; CHECK-SD-NEXT: fmov d18, x18 +; CHECK-SD-NEXT: mul x19, x13, x19 +; CHECK-SD-NEXT: fmov d19, x1 +; CHECK-SD-NEXT: mul x21, x13, x21 +; CHECK-SD-NEXT: fmov d20, x3 +; CHECK-SD-NEXT: mul x22, x13, x22 +; CHECK-SD-NEXT: fmov d21, x19 +; CHECK-SD-NEXT: mul x24, x13, x24 +; CHECK-SD-NEXT: fmov d24, x21 +; CHECK-SD-NEXT: mul x2, x8, x2 +; CHECK-SD-NEXT: fmov d22, x22 +; CHECK-SD-NEXT: mul x4, x8, x4 +; CHECK-SD-NEXT: fmov d23, x24 +; CHECK-SD-NEXT: mul x5, x8, x5 +; CHECK-SD-NEXT: mov v17.d[1], x2 +; CHECK-SD-NEXT: mul x6, x8, x6 +; CHECK-SD-NEXT: mov v18.d[1], x4 +; CHECK-SD-NEXT: mul x7, x8, x7 +; CHECK-SD-NEXT: mov v20.d[1], x5 +; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d +; CHECK-SD-NEXT: mul x20, x8, x20 +; CHECK-SD-NEXT: mov v21.d[1], x6 +; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d +; CHECK-SD-NEXT: mul x23, x8, x23 +; CHECK-SD-NEXT: mov v24.d[1], x7 +; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d +; CHECK-SD-NEXT: mul x17, x8, x25 +; CHECK-SD-NEXT: mov v22.d[1], x20 +; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d +; CHECK-SD-NEXT: mov v23.d[1], x23 +; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d +; CHECK-SD-NEXT: mov v19.d[1], x17 +; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d +; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d +; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d +; CHECK-SD-NEXT: b.ne .LBB6_7 +; CHECK-SD-NEXT: // %bb.8: // %middle.block +; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d +; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d +; CHECK-SD-NEXT: cmp x11, x10 +; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d +; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d +; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-SD-NEXT: addp d1, v1.2d +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: b.eq .LBB6_15 +; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check +; CHECK-SD-NEXT: cbz x12, .LBB6_13 +; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph +; CHECK-SD-NEXT: movi 
v1.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT: mov x13, x11 +; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff +; CHECK-SD-NEXT: fmov x14, d0 +; CHECK-SD-NEXT: and x11, x10, #0xfffffffc +; CHECK-SD-NEXT: fmov x15, d0 +; CHECK-SD-NEXT: sub x12, x13, x11 +; CHECK-SD-NEXT: add x13, x0, x13 +; CHECK-SD-NEXT: mov v1.d[0], x8 +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body +; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SD-NEXT: ldr s0, [x13], #4 +; CHECK-SD-NEXT: adds x12, x12, #4 +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0 +; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b +; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-SD-NEXT: fmov x16, d4 +; CHECK-SD-NEXT: fmov x18, d0 +; CHECK-SD-NEXT: mov x17, v4.d[1] +; CHECK-SD-NEXT: mov x1, v0.d[1] +; CHECK-SD-NEXT: mul x16, x14, x16 +; CHECK-SD-NEXT: mul x18, x15, x18 +; CHECK-SD-NEXT: mul x17, x8, x17 +; CHECK-SD-NEXT: fmov d0, x16 +; CHECK-SD-NEXT: mul x1, x8, x1 +; CHECK-SD-NEXT: fmov d4, x18 +; CHECK-SD-NEXT: mov v0.d[1], x17 +; CHECK-SD-NEXT: mov v4.d[1], x1 +; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d +; CHECK-SD-NEXT: b.ne .LBB6_11 +; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block +; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d +; CHECK-SD-NEXT: cmp x11, x10 +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: fmov x8, d0 +; CHECK-SD-NEXT: b.eq .LBB6_15 +; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader +; CHECK-SD-NEXT: sub x10, x10, x11 +; CHECK-SD-NEXT: add x11, x0, x11 +; CHECK-SD-NEXT: .LBB6_14: // %for.body +; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SD-NEXT: ldrb w12, [x11], #1 +; CHECK-SD-NEXT: subs x10, x10, #1 +; CHECK-SD-NEXT: smaddl x8, w12, w9, x8 +; CHECK-SD-NEXT: b.ne .LBB6_14 +; CHECK-SD-NEXT: .LBB6_15: +; CHECK-SD-NEXT: ldp x20, 
x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload +; CHECK-SD-NEXT: mov x0, x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-GI-NEXT: cbz w2, .LBB6_7 +; CHECK-GI-NEXT: // %bb.1: // %iter.check +; CHECK-GI-NEXT: movi d0, #0000000000000000 +; CHECK-GI-NEXT: sxtb x9, w1 +; CHECK-GI-NEXT: mov x11, xzr +; CHECK-GI-NEXT: cmp w2, #4 +; CHECK-GI-NEXT: mov w10, w2 +; CHECK-GI-NEXT: b.lo .LBB6_12 +; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check +; CHECK-GI-NEXT: movi d0, #0000000000000000 +; CHECK-GI-NEXT: dup v1.2d, x9 +; CHECK-GI-NEXT: mov x11, xzr +; CHECK-GI-NEXT: cmp w2, #16 +; CHECK-GI-NEXT: b.lo .LBB6_9 +; CHECK-GI-NEXT: // %bb.3: // %vector.ph +; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 +; CHECK-GI-NEXT: xtn v2.2s, v1.2d +; CHECK-GI-NEXT: and x8, x10, #0xc +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 +; CHECK-GI-NEXT: movi v4.2d, #0000000000000000 +; CHECK-GI-NEXT: and x11, x10, #0xfffffff0 +; CHECK-GI-NEXT: movi v5.2d, #0000000000000000 +; CHECK-GI-NEXT: movi v6.2d, #0000000000000000 +; CHECK-GI-NEXT: mov x12, x0 +; CHECK-GI-NEXT: movi v7.2d, #0000000000000000 +; CHECK-GI-NEXT: movi v16.2d, #0000000000000000 +; CHECK-GI-NEXT: and x13, x10, #0xfffffff0 +; CHECK-GI-NEXT: movi v17.2d, #0000000000000000 +; CHECK-GI-NEXT: .LBB6_4: // %vector.body +; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-GI-NEXT: ldr q18, [x12], #16 +; CHECK-GI-NEXT: subs x13, x13, #16 +; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0 +; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0 +; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0 +; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0 +; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0 +; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0 +; CHECK-GI-NEXT: mov 
d22, v20.d[1] +; CHECK-GI-NEXT: mov d23, v19.d[1] +; CHECK-GI-NEXT: mov d24, v21.d[1] +; CHECK-GI-NEXT: mov d25, v18.d[1] +; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s +; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s +; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s +; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s +; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s +; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s +; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s +; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s +; CHECK-GI-NEXT: b.ne .LBB6_4 +; CHECK-GI-NEXT: // %bb.5: // %middle.block +; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d +; CHECK-GI-NEXT: cmp x11, x10 +; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d +; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d +; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: addp d0, v0.2d +; CHECK-GI-NEXT: b.ne .LBB6_8 +; CHECK-GI-NEXT: // %bb.6: +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB6_7: +; CHECK-GI-NEXT: mov x8, xzr +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check +; CHECK-GI-NEXT: cbz x8, .LBB6_12 +; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph +; CHECK-GI-NEXT: mov v0.d[1], xzr +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: mov x12, x11 +; CHECK-GI-NEXT: xtn v1.2s, v1.2d +; CHECK-GI-NEXT: and x11, x10, #0xfffffffc +; CHECK-GI-NEXT: sub x8, x12, x11 +; CHECK-GI-NEXT: add x12, x0, x12 +; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body +; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-GI-NEXT: ldr w13, [x12], #4 +; CHECK-GI-NEXT: adds x8, x8, #4 +; CHECK-GI-NEXT: fmov s3, w13 +; CHECK-GI-NEXT: uxtb w13, w13 +; CHECK-GI-NEXT: mov b4, v3.b[2] +; CHECK-GI-NEXT: mov b5, v3.b[1] +; CHECK-GI-NEXT: mov b6, v3.b[3] +; CHECK-GI-NEXT: fmov s3, w13 +; CHECK-GI-NEXT: fmov w14, s4 +; CHECK-GI-NEXT: fmov w15, 
s5 +; CHECK-GI-NEXT: fmov w16, s6 +; CHECK-GI-NEXT: uxtb w14, w14 +; CHECK-GI-NEXT: uxtb w15, w15 +; CHECK-GI-NEXT: uxtb w16, w16 +; CHECK-GI-NEXT: fmov s4, w14 +; CHECK-GI-NEXT: mov v3.s[1], w15 +; CHECK-GI-NEXT: mov v4.s[1], w16 +; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s +; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s +; CHECK-GI-NEXT: b.ne .LBB6_10 +; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block +; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: cmp x11, x10 +; CHECK-GI-NEXT: addp d0, v0.2d +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: b.eq .LBB6_14 +; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader +; CHECK-GI-NEXT: sub x10, x10, x11 +; CHECK-GI-NEXT: add x11, x0, x11 +; CHECK-GI-NEXT: .LBB6_13: // %for.body +; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-GI-NEXT: ldrb w8, [x11], #1 +; CHECK-GI-NEXT: fmov x12, d0 +; CHECK-GI-NEXT: subs x10, x10, #1 +; CHECK-GI-NEXT: madd x8, x8, x9, x12 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: b.ne .LBB6_13 +; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: ret +entry: + %cmp5.not = icmp eq i32 %n, 0 + br i1 %cmp5.not, label %for.cond.cleanup, label %iter.check + +iter.check: ; preds = %entry + %conv1 = sext i8 %B to i64 + %wide.trip.count = zext i32 %n to i64 + %min.iters.check = icmp ult i32 %n, 4 + br i1 %min.iters.check, label %for.body.preheader, label %vector.main.loop.iter.check + +vector.main.loop.iter.check: ; preds = %iter.check + %min.iters.check9 = icmp ult i32 %n, 16 + br i1 %min.iters.check9, label %vec.epilog.ph, label %vector.ph + +vector.ph: ; preds = %vector.main.loop.iter.check + %n.mod.vf = and i64 %wide.trip.count, 12 + %n.vec = and i64 %wide.trip.count, 4294967280 + %broadcast.splatinsert = insertelement <16 x i64> poison, i64 %conv1, i64 0 + %broadcast.splat = shufflevector <16 x i64> %broadcast.splatinsert, <16 x i64> poison, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, 
%vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <16 x i64> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] + %0 = getelementptr inbounds nuw i8, ptr %A, i64 %index + %wide.load = load <16 x i8>, ptr %0, align 1 + %1 = zext <16 x i8> %wide.load to <16 x i64> + %2 = mul nsw <16 x i64> %broadcast.splat, %1 + %3 = add <16 x i64> %2, %vec.phi + %index.next = add nuw i64 %index, 16 + %4 = icmp eq i64 %index.next, %n.vec + br i1 %4, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %5 = tail call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %3) + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %vec.epilog.iter.check + +vec.epilog.iter.check: ; preds = %middle.block + %min.epilog.iters.check = icmp eq i64 %n.mod.vf, 0 + br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph + +vec.epilog.ph: ; preds = %vector.main.loop.iter.check, %vec.epilog.iter.check + %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ] + %bc.merge.rdx = phi i64 [ %5, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ] + %n.vec11 = and i64 %wide.trip.count, 4294967292 + %6 = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 %bc.merge.rdx, i64 0 + %broadcast.splatinsert12 = insertelement <4 x i64> poison, i64 %conv1, i64 0 + %broadcast.splat13 = shufflevector <4 x i64> %broadcast.splatinsert12, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vec.epilog.vector.body + +vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph + %index14 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next17, %vec.epilog.vector.body ] + %vec.phi15 = phi <4 x i64> [ %6, %vec.epilog.ph ], [ %10, %vec.epilog.vector.body ] + %7 = getelementptr inbounds nuw i8, ptr %A, i64 %index14 + %wide.load16 = load <4 x i8>, ptr %7, align 1 + %8 = zext <4 x i8> %wide.load16 
to <4 x i64> + %9 = mul nsw <4 x i64> %broadcast.splat13, %8 + %10 = add <4 x i64> %9, %vec.phi15 + %index.next17 = add nuw i64 %index14, 4 + %11 = icmp eq i64 %index.next17, %n.vec11 + br i1 %11, label %vec.epilog.middle.block, label %vec.epilog.vector.body + +vec.epilog.middle.block: ; preds = %vec.epilog.vector.body + %12 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %10) + %cmp.n18 = icmp eq i64 %n.vec11, %wide.trip.count + br i1 %cmp.n18, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %iter.check, %vec.epilog.iter.check, %vec.epilog.middle.block + %indvars.iv.ph = phi i64 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec11, %vec.epilog.middle.block ] + %s.06.ph = phi i64 [ 0, %iter.check ], [ %5, %vec.epilog.iter.check ], [ %12, %vec.epilog.middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry + %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %middle.block ], [ %12, %vec.epilog.middle.block ], [ %add, %for.body ] + ret i64 %s.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ] + %s.06 = phi i64 [ %add, %for.body ], [ %s.06.ph, %for.body.preheader ] + %arrayidx = getelementptr inbounds nuw i8, ptr %A, i64 %indvars.iv + %13 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %13 to i64 + %mul = mul nsw i64 %conv, %conv1 + %add = add nsw i64 %mul, %s.06 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-SD-LABEL: sink_v2z64_1: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mov x8, xzr ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: .LBB6_1: // %loop +; CHECK-SD-NEXT: .LBB7_1: // %loop ; CHECK-SD-NEXT: 
// =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldr d1, [x0] ; CHECK-SD-NEXT: subs x2, x2, #8 @@ -762,7 +1179,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-SD-NEXT: umull v1.2d, v1.2s, v0.s[1] ; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #15 ; CHECK-SD-NEXT: str d1, [x0], #32 -; CHECK-SD-NEXT: b.ne .LBB6_1 +; CHECK-SD-NEXT: b.ne .LBB7_1 ; CHECK-SD-NEXT: // %bb.2: // %exit ; CHECK-SD-NEXT: ret ; @@ -772,7 +1189,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-GI-NEXT: mov x8, xzr ; CHECK-GI-NEXT: dup v0.2d, v0.d[1] ; CHECK-GI-NEXT: xtn v0.2s, v0.2d -; CHECK-GI-NEXT: .LBB6_1: // %loop +; CHECK-GI-NEXT: .LBB7_1: // %loop ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: subs x2, x2, #8 @@ -780,7 +1197,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s ; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15 ; CHECK-GI-NEXT: str d1, [x0], #32 -; CHECK-GI-NEXT: b.ne .LBB6_1 +; CHECK-GI-NEXT: b.ne .LBB7_1 ; CHECK-GI-NEXT: // %bb.2: // %exit ; CHECK-GI-NEXT: ret entry: @@ -813,7 +1230,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mov x8, xzr ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: .LBB7_1: // %loop +; CHECK-SD-NEXT: .LBB8_1: // %loop ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldr q1, [x0] ; CHECK-SD-NEXT: subs x2, x2, #8 @@ -823,7 +1240,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #15 ; CHECK-SD-NEXT: shrn2 v2.4s, v1.2d, #15 ; CHECK-SD-NEXT: str q2, [x0], #32 -; CHECK-SD-NEXT: b.ne .LBB7_1 +; CHECK-SD-NEXT: b.ne .LBB8_1 ; CHECK-SD-NEXT: // %bb.2: // %exit ; CHECK-SD-NEXT: ret ; @@ -833,7 +1250,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-GI-NEXT: mov x8, xzr ; CHECK-GI-NEXT: dup v0.2d, v0.d[1] 
; CHECK-GI-NEXT: xtn v0.2s, v0.2d -; CHECK-GI-NEXT: .LBB7_1: // %loop +; CHECK-GI-NEXT: .LBB8_1: // %loop ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: ldr q1, [x0] ; CHECK-GI-NEXT: subs x2, x2, #8 @@ -844,7 +1261,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) { ; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15 ; CHECK-GI-NEXT: shrn2 v1.4s, v2.2d, #15 ; CHECK-GI-NEXT: str q1, [x0], #32 -; CHECK-GI-NEXT: b.ne .LBB7_1 +; CHECK-GI-NEXT: b.ne .LBB8_1 ; CHECK-GI-NEXT: // %bb.2: // %exit ; CHECK-GI-NEXT: ret entry: @@ -877,7 +1294,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: dup v0.8b, v0.b[0] ; CHECK-SD-NEXT: mov x8, xzr -; CHECK-SD-NEXT: .LBB8_1: // %loop +; CHECK-SD-NEXT: .LBB9_1: // %loop ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldr d1, [x0] ; CHECK-SD-NEXT: subs x2, x2, #8 @@ -886,7 +1303,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0 ; CHECK-SD-NEXT: xtn v1.8b, v1.8h ; CHECK-SD-NEXT: str d1, [x0], #32 -; CHECK-SD-NEXT: b.ne .LBB8_1 +; CHECK-SD-NEXT: b.ne .LBB9_1 ; CHECK-SD-NEXT: // %bb.2: // %exit ; CHECK-SD-NEXT: ret ; @@ -896,7 +1313,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-GI-NEXT: mov x8, xzr ; CHECK-GI-NEXT: dup v0.8h, v0.h[0] ; CHECK-GI-NEXT: xtn v0.8b, v0.8h -; CHECK-GI-NEXT: .LBB8_1: // %loop +; CHECK-GI-NEXT: .LBB9_1: // %loop ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: ldr d1, [x0] ; CHECK-GI-NEXT: subs x2, x2, #8 @@ -905,7 +1322,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-GI-NEXT: cmlt v1.8h, v1.8h, #0 ; CHECK-GI-NEXT: xtn v1.8b, v1.8h ; CHECK-GI-NEXT: str d1, [x0], #32 -; CHECK-GI-NEXT: b.ne .LBB8_1 +; CHECK-GI-NEXT: b.ne .LBB9_1 ; CHECK-GI-NEXT: // %bb.2: // %exit ; CHECK-GI-NEXT: ret entry: @@ -938,7 +1355,7 @@ define void @sink_v16s16_8(ptr %p, ptr 
%d, i64 %n, <16 x i8> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: dup v0.16b, v0.b[10] ; CHECK-SD-NEXT: mov x8, xzr -; CHECK-SD-NEXT: .LBB9_1: // %loop +; CHECK-SD-NEXT: .LBB10_1: // %loop ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldr q1, [x0] ; CHECK-SD-NEXT: subs x2, x2, #8 @@ -949,7 +1366,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-SD-NEXT: cmlt v2.8h, v2.8h, #0 ; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b ; CHECK-SD-NEXT: str q1, [x0], #32 -; CHECK-SD-NEXT: b.ne .LBB9_1 +; CHECK-SD-NEXT: b.ne .LBB10_1 ; CHECK-SD-NEXT: // %bb.2: // %exit ; CHECK-SD-NEXT: ret ; @@ -959,7 +1376,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-GI-NEXT: mov x8, xzr ; CHECK-GI-NEXT: dup v0.8h, v0.h[2] ; CHECK-GI-NEXT: xtn v0.8b, v0.8h -; CHECK-GI-NEXT: .LBB9_1: // %loop +; CHECK-GI-NEXT: .LBB10_1: // %loop ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: ldr q1, [x0] ; CHECK-GI-NEXT: subs x2, x2, #8 @@ -971,7 +1388,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) { ; CHECK-GI-NEXT: cmlt v2.8h, v2.8h, #0 ; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: str q1, [x0], #32 -; CHECK-GI-NEXT: b.ne .LBB9_1 +; CHECK-GI-NEXT: b.ne .LBB10_1 ; CHECK-GI-NEXT: // %bb.2: // %exit ; CHECK-GI-NEXT: ret entry: @@ -1005,7 +1422,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea ; CHECK-SD-NEXT: dup v0.4h, w3 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff8 -; CHECK-SD-NEXT: .LBB10_1: // %vector.body +; CHECK-SD-NEXT: .LBB11_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-SD-NEXT: subs x8, x8, #8 @@ -1015,7 +1432,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea ; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h ; 
CHECK-SD-NEXT: stp q1, q2, [x9] -; CHECK-SD-NEXT: b.ne .LBB10_1 +; CHECK-SD-NEXT: b.ne .LBB11_1 ; CHECK-SD-NEXT: // %bb.2: // %for.end12 ; CHECK-SD-NEXT: ret ; @@ -1026,7 +1443,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff8 ; CHECK-GI-NEXT: xtn v0.4h, v0.4s -; CHECK-GI-NEXT: .LBB10_1: // %vector.body +; CHECK-GI-NEXT: .LBB11_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-GI-NEXT: subs x8, x8, #8 @@ -1036,7 +1453,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea ; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: stp q1, q2, [x9] -; CHECK-GI-NEXT: b.ne .LBB10_1 +; CHECK-GI-NEXT: b.ne .LBB11_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 ; CHECK-GI-NEXT: ret vector.header: @@ -1089,7 +1506,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt ; CHECK-SD-NEXT: dup v0.8h, w3 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff0 -; CHECK-SD-NEXT: .LBB11_1: // %vector.body +; CHECK-SD-NEXT: .LBB12_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-SD-NEXT: subs x8, x8, #16 @@ -1103,7 +1520,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt ; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h ; CHECK-SD-NEXT: stp q1, q3, [x9] ; CHECK-SD-NEXT: stp q2, q4, [x9, #32] -; CHECK-SD-NEXT: b.ne .LBB11_1 +; CHECK-SD-NEXT: b.ne .LBB12_1 ; CHECK-SD-NEXT: // %bb.2: // %for.end12 ; CHECK-SD-NEXT: ret ; @@ -1114,7 +1531,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff0 ; CHECK-GI-NEXT: xtn v0.4h, v0.4s -; CHECK-GI-NEXT: .LBB11_1: // %vector.body +; 
CHECK-GI-NEXT: .LBB12_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-GI-NEXT: subs x8, x8, #16 @@ -1130,7 +1547,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt ; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h ; CHECK-GI-NEXT: stp q1, q3, [x9] ; CHECK-GI-NEXT: stp q2, q4, [x9, #32]! -; CHECK-GI-NEXT: b.ne .LBB11_1 +; CHECK-GI-NEXT: b.ne .LBB12_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 ; CHECK-GI-NEXT: ret vector.header: @@ -1184,7 +1601,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff8 ; CHECK-SD-NEXT: fmov s0, w9 -; CHECK-SD-NEXT: .LBB12_1: // %vector.body +; CHECK-SD-NEXT: .LBB13_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-SD-NEXT: subs x8, x8, #8 @@ -1196,7 +1613,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado ; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0] ; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0] ; CHECK-SD-NEXT: stp q1, q2, [x9] -; CHECK-SD-NEXT: b.ne .LBB12_1 +; CHECK-SD-NEXT: b.ne .LBB13_1 ; CHECK-SD-NEXT: // %bb.2: // %for.end12 ; CHECK-SD-NEXT: ret ; @@ -1206,7 +1623,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado ; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff8 -; CHECK-GI-NEXT: .LBB12_1: // %vector.body +; CHECK-GI-NEXT: .LBB13_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-GI-NEXT: subs x8, x8, #8 @@ -1218,7 +1635,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: stp q1, q2, [x9] -; CHECK-GI-NEXT: b.ne .LBB12_1 +; 
CHECK-GI-NEXT: b.ne .LBB13_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 ; CHECK-GI-NEXT: ret vector.header: @@ -1272,7 +1689,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff0 ; CHECK-SD-NEXT: fmov s0, w9 -; CHECK-SD-NEXT: .LBB13_1: // %vector.body +; CHECK-SD-NEXT: .LBB14_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-SD-NEXT: subs x8, x8, #16 @@ -1290,7 +1707,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur ; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0] ; CHECK-SD-NEXT: stp q1, q3, [x9] ; CHECK-SD-NEXT: stp q2, q4, [x9, #32] -; CHECK-SD-NEXT: b.ne .LBB13_1 +; CHECK-SD-NEXT: b.ne .LBB14_1 ; CHECK-SD-NEXT: // %bb.2: // %for.end12 ; CHECK-SD-NEXT: ret ; @@ -1300,7 +1717,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur ; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff0 -; CHECK-GI-NEXT: .LBB13_1: // %vector.body +; CHECK-GI-NEXT: .LBB14_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-GI-NEXT: subs x8, x8, #16 @@ -1318,7 +1735,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: stp q3, q1, [x9] ; CHECK-GI-NEXT: stp q4, q2, [x9, #32]! 
-; CHECK-GI-NEXT: b.ne .LBB13_1 +; CHECK-GI-NEXT: b.ne .LBB14_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 ; CHECK-GI-NEXT: ret vector.header: @@ -1369,9 +1786,9 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc ; CHECK-SD-LABEL: cmplx_mul_combined_re_im: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: lsr x9, x0, #16 -; CHECK-SD-NEXT: adrp x8, .LCPI14_0 +; CHECK-SD-NEXT: adrp x8, .LCPI15_0 ; CHECK-SD-NEXT: dup v4.8h, w0 -; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0] +; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI15_0] ; CHECK-SD-NEXT: dup v2.8h, w9 ; CHECK-SD-NEXT: sqneg v1.8h, v2.8h ; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b @@ -1386,12 +1803,12 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc ; CHECK-GI-LABEL: cmplx_mul_combined_re_im: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: lsr w9, w0, #16 -; CHECK-GI-NEXT: adrp x8, .LCPI14_0 +; CHECK-GI-NEXT: adrp x8, .LCPI15_0 ; CHECK-GI-NEXT: rev32 v4.8h, v0.8h ; CHECK-GI-NEXT: dup v1.8h, w9 ; CHECK-GI-NEXT: fmov s3, w9 ; CHECK-GI-NEXT: sqneg v2.8h, v1.8h -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b ; CHECK-GI-NEXT: mov d2, v0.d[1] ; CHECK-GI-NEXT: dup v3.8h, w0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 6e5c666..0cd885e 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: ; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: ldrh w8, [x0] -; CHECK-NEON-NEXT: ldrh w9, [x0, #2] +; CHECK-NEON-NEXT: ldrh w8, [x0, #2] +; CHECK-NEON-NEXT: ldr h0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] -; CHECK-NEON-NEXT: fmov d0, x8 -; CHECK-NEON-NEXT: mov 
v0.d[1], x9 +; CHECK-NEON-NEXT: mov v0.d[1], x8 ; CHECK-NEON-NEXT: xtn v0.2s, v0.2d ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: ldrh w8, [x0] -; CHECK-SVE-NEXT: ldrh w9, [x0, #2] +; CHECK-SVE-NEXT: ldrh w8, [x0, #2] +; CHECK-SVE-NEXT: ldr h0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] -; CHECK-SVE-NEXT: fmov d0, x8 -; CHECK-SVE-NEXT: mov v0.d[1], x9 +; CHECK-SVE-NEXT: mov v0.d[1], x8 ; CHECK-SVE-NEXT: xtn v0.2s, v0.2d ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir index f34d3ed..6b2a31b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir +++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir @@ -35,7 +35,7 @@ body: | ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0 ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}} - ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0 ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ; ; CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0 diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll index dc64306..0f284aa 100644 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll @@ -1,41 +1,44 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR32-NOZCZ-GPR64 ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr32 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32 -; RUN: llc < %s 
-mtriple=aarch64-linux-gnu -mattr=+zcz-gpr64 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR64 -; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR32-ZCZ-GPR64 +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR32-NOZCZ-GPR64 ; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 ; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR32-NOZCZ-GPR64 ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 define i8 @ti8() { entry: ; ALL-LABEL: ti8: -; NOZCZ-GPR: mov w0, wzr +; NOZCZ-GPR32-NOZCZ-GPR64: mov w0, wzr ; ZCZ-GPR32: mov w0, #0 +; NOZCZ-GPR32-ZCZ-GPR64: mov x0, #0 ret i8 0 } define i16 @ti16() { entry: ; ALL-LABEL: ti16: -; NOZCZ-GPR: mov w0, wzr +; NOZCZ-GPR32-NOZCZ-GPR64: mov w0, wzr ; ZCZ-GPR32: mov w0, #0 +; NOZCZ-GPR32-ZCZ-GPR64: mov x0, #0 ret i16 0 } define i32 @ti32() { entry: ; ALL-LABEL: ti32: -; NOZCZ-GPR: mov w0, wzr +; NOZCZ-GPR32-NOZCZ-GPR64: mov w0, wzr ; ZCZ-GPR32: mov w0, #0 +; NOZCZ-GPR32-ZCZ-GPR64: mov x0, #0 ret i32 0 } define i64 @ti64() { entry: ; ALL-LABEL: ti64: -; NOZCZ-GPR: mov x0, xzr +; NOZCZ-GPR32-NOZCZ-GPR64: mov x0, xzr ; ZCZ-GPR64: mov x0, #0 ret i64 0 } diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll new file mode 100644 index 0000000..cf52934 --- /dev/null +++
b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s + +; Test optimization of DUP with extended narrow loads +; This should avoid GPR->SIMD transfers by loading directly into vector registers + +define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i8_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: ret + %load = load i8, ptr %p, align 1 + %ext = zext i8 %load to i16 + %vec = insertelement <4 x i16> poison, i16 %ext, i32 0 + %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer + ret <4 x i16> %dup +} + +define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i8_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: ret + %load = load i8, ptr %p, align 1 + %ext = zext i8 %load to i16 + %vec = insertelement <8 x i16> poison, i16 %ext, i32 0 + %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer + ret <8 x i16> %dup +} + +define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i8_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret + %load = load i8, ptr %p, align 1 + %ext = zext i8 %load to i32 + %vec = insertelement <2 x i32> poison, i32 %ext, i32 0 + %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer + ret <2 x i32> %dup +} + +define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i8_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret + %load = load i8, ptr %p, align 1 + %ext = zext i8 %load to i32 + %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 + %dup = shufflevector <4 
x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %dup +} + +define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, #4] +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret + %addr = getelementptr inbounds i8, ptr %p, i64 4 + %load = load i8, ptr %addr, align 1 + %ext = zext i8 %load to i32 + %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 + %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %dup +} + +define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) { +; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret + %addr = getelementptr inbounds i8, ptr %p, i64 %offset + %load = load i8, ptr %addr, align 1 + %ext = zext i8 %load to i32 + %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 + %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %dup +} + +define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i8_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret + %load = load i8, ptr %p, align 1 + %ext = zext i8 %load to i64 + %vec = insertelement <2 x i64> poison, i64 %ext, i32 0 + %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer + ret <2 x i64> %dup +} + +define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i16_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: ret + %load = load i16, ptr %p, align 1 + %ext = zext i16 %load to i32 + %vec = insertelement <2 x i32> poison, i32 %ext, i32 0 + %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer + ret <2 x i32> %dup +} + +define <4 x 
i32> @test_dup_zextload_i16_v4i32(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i16_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret + %load = load i16, ptr %p, align 1 + %ext = zext i16 %load to i32 + %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 + %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %dup +} + +define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, #8] +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret + %addr = getelementptr inbounds i16, ptr %p, i64 4 + %load = load i16, ptr %addr, align 1 + %ext = zext i16 %load to i32 + %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 + %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %dup +} + +define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) { +; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret + %addr = getelementptr inbounds i16, ptr %p, i64 %offset + %load = load i16, ptr %addr, align 1 + %ext = zext i16 %load to i32 + %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 + %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer + ret <4 x i32> %dup +} + +define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) { +; CHECK-LABEL: test_dup_zextload_i16_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret + %load = load i16, ptr %p, align 1 + %ext = zext i16 %load to i64 + %vec = insertelement <2 x i64> poison, i64 %ext, i32 0 + %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer + ret <2 x i64> %dup +} + +define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) { +; CHECK-LABEL: 
test_dup_zextload_i32_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret + %load = load i32, ptr %p, align 1 + %ext = zext i32 %load to i64 + %vec = insertelement <2 x i64> poison, i64 %ext, i32 0 + %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer + ret <2 x i64> %dup +} diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll index 079ff10..670574f2 100644 --- a/llvm/test/CodeGen/AArch64/dup.ll +++ b/llvm/test/CodeGen/AArch64/dup.ll @@ -32,8 +32,8 @@ entry: define <2 x i8> @loaddup_v2i8(ptr %p) { ; CHECK-LABEL: loaddup_v2i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: dup v0.2s, w8 +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: dup v0.2s, v0.s[0] ; CHECK-NEXT: ret entry: %a = load i8, ptr %p @@ -189,8 +189,8 @@ entry: define <4 x i8> @loaddup_v4i8(ptr %p) { ; CHECK-SD-LABEL: loaddup_v4i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: dup v0.4h, w8 +; CHECK-SD-NEXT: ldr b0, [x0] +; CHECK-SD-NEXT: dup v0.4h, v0.h[0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: loaddup_v4i8: @@ -444,8 +444,8 @@ entry: define <2 x i16> @loaddup_v2i16(ptr %p) { ; CHECK-SD-LABEL: loaddup_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: dup v0.2s, w8 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: dup v0.2s, v0.s[0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: loaddup_v2i16: diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll index 6177ae5..628506b 100644 --- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll @@ -84,8 +84,7 @@ entry: define double @load_u64_from_u32_off1(ptr %n){ ; CHECK-LABEL: load_u64_from_u32_off1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldur w8, [x0, #1] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldur s0, [x0, #1] ; CHECK-NEXT: ret entry: %p = 
getelementptr i8, ptr %n, i64 1 @@ -98,8 +97,7 @@ entry: define double @load_u64_from_u16_off1(ptr %n){ ; CHECK-LABEL: load_u64_from_u16_off1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldurh w8, [x0, #1] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldur h0, [x0, #1] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 1 @@ -125,8 +123,7 @@ entry: define float @load_u32_from_u16_off1(ptr %n){ ; CHECK-LABEL: load_u32_from_u16_off1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldurh w8, [x0, #1] -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldur h0, [x0, #1] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 1 @@ -168,8 +165,7 @@ entry: define double @load_u64_from_u32_off2(ptr %n){ ; CHECK-LABEL: load_u64_from_u32_off2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldur w8, [x0, #2] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldur s0, [x0, #2] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 2 @@ -250,8 +246,7 @@ entry: define double @load_u64_from_u32_off255(ptr %n){ ; CHECK-LABEL: load_u64_from_u32_off255: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldur w8, [x0, #255] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldur s0, [x0, #255] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 255 @@ -264,8 +259,7 @@ entry: define double @load_u64_from_u16_off255(ptr %n){ ; CHECK-LABEL: load_u64_from_u16_off255: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldurh w8, [x0, #255] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldur h0, [x0, #255] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 255 @@ -291,8 +285,7 @@ entry: define float @load_u32_from_u16_off255(ptr %n){ ; CHECK-LABEL: load_u32_from_u16_off255: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldurh w8, [x0, #255] -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldur h0, [x0, #255] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 255 @@ -494,8 +487,8 @@ entry: define double @load_u64_from_u32_offnp1(ptr %n){ ; CHECK-LABEL: load_u64_from_u32_offnp1: ; CHECK: // %bb.0: // %entry -; 
CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384 -; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: mov w8, #16384 // =0x4000 +; CHECK-NEXT: ldr s0, [x0, x8] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 16384 @@ -508,8 +501,8 @@ entry: define double @load_u64_from_u16_offnp1(ptr %n){ ; CHECK-LABEL: load_u64_from_u16_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192 -; CHECK-NEXT: ldr h0, [x8] +; CHECK-NEXT: mov w8, #8192 // =0x2000 +; CHECK-NEXT: ldr h0, [x0, x8] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 8192 @@ -522,8 +515,8 @@ entry: define double @load_u64_from_u8_offnp1(ptr %n){ ; CHECK-LABEL: load_u64_from_u8_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096 -; CHECK-NEXT: ldr b0, [x8] +; CHECK-NEXT: mov w8, #4096 // =0x1000 +; CHECK-NEXT: ldr b0, [x0, x8] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 4096 @@ -536,8 +529,8 @@ entry: define float @load_u32_from_u16_offnp1(ptr %n){ ; CHECK-LABEL: load_u32_from_u16_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192 -; CHECK-NEXT: ldr h0, [x8] +; CHECK-NEXT: mov w8, #8192 // =0x2000 +; CHECK-NEXT: ldr h0, [x0, x8] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 8192 @@ -550,8 +543,8 @@ entry: define float @load_u32_from_u8_offnp1(ptr %n){ ; CHECK-LABEL: load_u32_from_u8_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096 -; CHECK-NEXT: ldr b0, [x8] +; CHECK-NEXT: mov w8, #4096 // =0x1000 +; CHECK-NEXT: ldr b0, [x0, x8] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 4096 @@ -564,8 +557,8 @@ entry: define half @load_u16_from_u8_offnp1(ptr %n){ ; CHECK-LABEL: load_u16_from_u8_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096 -; CHECK-NEXT: ldr b0, [x8] +; CHECK-NEXT: mov w8, #4096 // =0x1000 +; CHECK-NEXT: ldr b0, [x0, x8] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-NEXT: 
ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index b8d6c88..3f35cb5 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -829,7 +829,7 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal ; CHECK-SDAG-NEXT: bl __arm_sme_restore ; CHECK-SDAG-NEXT: b .LBB5_1 entry: - invoke void @agnostic_za_call() + invoke void @agnostic_za_call() "aarch64_za_state_agnostic" to label %exit unwind label %catch catch: |
