From 35f2caf713489049cc1b31aa3fe0a054968f80e3 Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Wed, 29 May 2024 13:14:03 +0100 Subject: [AArch64][GlobalISel] Select TBL/TBX Intrinsics (#92914) --- .../AArch64/GISel/AArch64InstructionSelector.cpp | 45 + llvm/test/CodeGen/AArch64/arm64-tbl.ll | 1373 +++++++++++++++----- 2 files changed, 1069 insertions(+), 349 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 3b3c1fc..4a7c82b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -227,6 +227,8 @@ private: bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI); bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI); bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI); + void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs, + unsigned Opc1, unsigned Opc2, bool isExt); bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI); bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI); @@ -6537,6 +6539,25 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, I.eraseFromParent(); return true; } + case Intrinsic::aarch64_neon_tbl2: + SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false); + return true; + case Intrinsic::aarch64_neon_tbl3: + SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three, + false); + return true; + case Intrinsic::aarch64_neon_tbl4: + SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false); + return true; + case Intrinsic::aarch64_neon_tbx2: + SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true); + return true; + case Intrinsic::aarch64_neon_tbx3: + SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true); + return true; + case Intrinsic::aarch64_neon_tbx4: + SelectTable(I, MRI, 4, 
AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true); + return true; case Intrinsic::swift_async_context_addr: auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()}, {Register(AArch64::FP)}) @@ -6552,6 +6573,35 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, return false; } +/// Select the aarch64_neon_tbl{2,3,4}/tbx{2,3,4} intrinsic \p I as a TBL/TBX +/// instruction and erase \p I. \p Opc1 is used when the result is an 8 x s8 +/// vector, \p Opc2 otherwise. The \p NumVec table registers are combined into +/// one tuple operand. When \p isExt is set (the tbx callers), operand 2 is +/// passed as an extra leading source and the remaining operands shift by one. +void AArch64InstructionSelector::SelectTable(MachineInstr &I, + MachineRegisterInfo &MRI, + unsigned NumVec, unsigned Opc1, + unsigned Opc2, bool isExt) { + Register DstReg = I.getOperand(0).getReg(); + unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2; + + // Create the REG_SEQUENCE + SmallVector<Register, 4> Regs; + for (unsigned i = 0; i < NumVec; i++) + Regs.push_back(I.getOperand(i + 2 + isExt).getReg()); + Register RegSeq = createQTuple(Regs, MIB); + + Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg(); + MachineInstrBuilder Instr; + if (isExt) { + Register Reg = I.getOperand(2).getReg(); + Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg}); + } else + Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg}); + constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI); + I.eraseFromParent(); +} + InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const { auto MaybeImmed = getImmedFromMO(Root); diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll index 96b2af7..44b92e6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll +++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll @@ -1,28 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction
selection used fallback path for tbl2_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl3_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl3_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl4_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbl4_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_v8i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_first_mask2 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_nonconst_second_mask2 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_shuffle -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shuffled_tbl2_to_tbl4_mixed_tbl2_mask2 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx2_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx2_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx3_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx3_16b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx4_8b -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for tbx4_16b +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x 
i8> %B) nounwind { ; CHECK-LABEL: tbl1_8b: @@ -43,175 +21,378 @@ define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind { } define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) { -; CHECK-LABEL: tbl2_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl.8b v0, { v0, v1 }, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl2_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: tbl.8b v0, { v0, v1 }, v2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl2_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: tbl.8b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) ret <8 x i8> %tmp3 } define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { -; CHECK-LABEL: tbl2_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl2_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl2_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 
x i8> %C) ret <16 x i8> %tmp3 } define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) { -; CHECK-LABEL: tbl3_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: tbl.8b v0, { v0, v1, v2 }, v3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl3_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: tbl.8b v0, { v0, v1, v2 }, v3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl3_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: tbl.8b v0, { v0, v1, v2 }, v3 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) ret <8 x i8> %tmp3 } define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) { -; CHECK-LABEL: tbl3_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2 }, v3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl3_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: 
tbl.16b v0, { v0, v1, v2 }, v3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl3_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1, v2 }, v3 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) ret <16 x i8> %tmp3 } define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) { -; CHECK-LABEL: tbl4_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.8b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl4_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.8b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl4_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: tbl.8b v0, { v0, v1, v2, v3 }, v4 +; CHECK-GI-NEXT: ret %tmp3 = call 
<8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) ret <8 x i8> %tmp3 } define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) { -; CHECK-LABEL: tbl4_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbl4_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbl4_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) ret <16 x i8> %tmp3 } -; CHECK-LABEL: .LCPI8_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 8 // 0x8 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 255 // 0xff -; CHECK-NEXT: .byte 255 // 0xff -; CHECK-NEXT: .byte 255 // 0xff -; CHECK-NEXT: .byte 255 
// 0xff +; CHECK-SD-LABEL: .LCPI8_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 8 // 0x8 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff + +; CHECK-GI-LABEL: .LCPI8_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 13 // 0xd +; CHECK-GI-NEXT: .byte 14 // 0xe +; CHECK-GI-NEXT: .byte 15 // 0xf +; CHECK-GI-LABEL: .LCPI8_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_v8i8: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: tbl.8b v0, { v0, v1 }, v4 -; CHECK-NEXT: tbl.8b v1, { v2, v3 }, v4 -; CHECK-NEXT: mov.s v0[1], v1[1] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_v8i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI8_0 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: ldr d4, [x8, :lo12:.LCPI8_0] +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // 
kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: tbl.8b v0, { v0, v1 }, v4 +; CHECK-SD-NEXT: tbl.8b v1, { v2, v3 }, v4 +; CHECK-SD-NEXT: mov.s v0[1], v1[1] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_v8i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI8_1 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr d4, [x8, :lo12:.LCPI8_1] +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: adrp x8, .LCPI8_0 +; CHECK-GI-NEXT: tbl.8b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.8b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: mov.d v0[1], v1[0] +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>) %t2 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %c, <16 x i8> %d, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>) %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> ret <8 x i8> %s } -; CHECK-LABEL: .LCPI9_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 8 // 0x8 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 16 // 0x10 -; CHECK-NEXT: .byte 20 // 0x14 -; CHECK-NEXT: .byte 24 // 0x18 -; CHECK-NEXT: .byte 28 // 0x1c -; CHECK-NEXT: .byte 32 // 0x20 -; CHECK-NEXT: .byte 36 // 0x24 -; CHECK-NEXT: .byte 40 // 0x28 -; CHECK-NEXT: .byte 44 // 0x2c -; CHECK-NEXT: .byte 48 // 0x30 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 56 // 0x38 -; CHECK-NEXT: .byte 60 // 0x3c +; CHECK-SD-LABEL: .LCPI9_0: +; CHECK-SD-NEXT: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 8 // 0x8 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 //
0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 32 // 0x20 +; CHECK-SD-NEXT: .byte 36 // 0x24 +; CHECK-SD-NEXT: .byte 40 // 0x28 +; CHECK-SD-NEXT: .byte 44 // 0x2c +; CHECK-SD-NEXT: .byte 48 // 0x30 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 56 // 0x38 +; CHECK-SD-NEXT: .byte 60 // 0x3c + +;CHECK-GI-LABEL: .LCPI9_0: +;CHECK-GI: .byte 0 // 0x0 +;CHECK-GI-NEXT: .byte 1 // 0x1 +;CHECK-GI-NEXT: .byte 2 // 0x2 +;CHECK-GI-NEXT: .byte 3 // 0x3 +;CHECK-GI-NEXT: .byte 4 // 0x4 +;CHECK-GI-NEXT: .byte 5 // 0x5 +;CHECK-GI-NEXT: .byte 6 // 0x6 +;CHECK-GI-NEXT: .byte 7 // 0x7 +;CHECK-GI-NEXT: .byte 16 // 0x10 +;CHECK-GI-NEXT: .byte 17 // 0x11 +;CHECK-GI-NEXT: .byte 18 // 0x12 +;CHECK-GI-NEXT: .byte 19 // 0x13 +;CHECK-GI-NEXT: .byte 20 // 0x14 +;CHECK-GI-NEXT: .byte 21 // 0x15 +;CHECK-GI-NEXT: .byte 22 // 0x16 +;CHECK-GI-NEXT: .byte 23 // 0x17 +;CHECK-GI-LABEL: .LCPI9_1: +;CHECK-GI: .byte 0 // 0x0 +;CHECK-GI-NEXT: .byte 4 // 0x4 +;CHECK-GI-NEXT: .byte 8 // 0x8 +;CHECK-GI-NEXT: .byte 12 // 0xc +;CHECK-GI-NEXT: .byte 16 // 0x10 +;CHECK-GI-NEXT: .byte 20 // 0x14 +;CHECK-GI-NEXT: .byte 24 // 0x18 +;CHECK-GI-NEXT: .byte 28 // 0x1c +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff +;CHECK-GI-NEXT: .byte 255 // 0xff define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: 
// kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI9_0 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI9_0] +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI9_1 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI9_1] +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: adrp x8, .LCPI9_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI9_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>) %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> ret <16 x i8> %s } +; CHECK-GI-LABEL: .LCPI10_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +;
CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI10_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff + define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: mov w8, #32 // =0x20 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: mov.b v4[1], w0 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: mov.b v4[2], w0 -; CHECK-NEXT: mov.b v4[3], w0 -; CHECK-NEXT: mov.b v4[4], w0 -; CHECK-NEXT: mov.b v4[5], w0 -; CHECK-NEXT: mov.b v4[6], w0 -; CHECK-NEXT: mov.b v4[7], w0 -; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: mov w8, #36 // =0x24 -; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: mov w8, #40 // =0x28 -; CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov w8, #44 // =0x2c -; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov w8, #48 // =0x30 -; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov w8, #52 // =0x34 -; CHECK-NEXT: mov.b v4[13], w8 -; CHECK-NEXT: mov w8, #56 // 
=0x38 -; CHECK-NEXT: mov.b v4[14], w8 -; CHECK-NEXT: mov w8, #60 // =0x3c -; CHECK-NEXT: mov.b v4[15], w8 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s4, w0 +; CHECK-SD-NEXT: mov w8, #32 // =0x20 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: mov.b v4[1], w0 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: mov.b v4[2], w0 +; CHECK-SD-NEXT: mov.b v4[3], w0 +; CHECK-SD-NEXT: mov.b v4[4], w0 +; CHECK-SD-NEXT: mov.b v4[5], w0 +; CHECK-SD-NEXT: mov.b v4[6], w0 +; CHECK-SD-NEXT: mov.b v4[7], w0 +; CHECK-SD-NEXT: mov.b v4[8], w8 +; CHECK-SD-NEXT: mov w8, #36 // =0x24 +; CHECK-SD-NEXT: mov.b v4[9], w8 +; CHECK-SD-NEXT: mov w8, #40 // =0x28 +; CHECK-SD-NEXT: mov.b v4[10], w8 +; CHECK-SD-NEXT: mov w8, #44 // =0x2c +; CHECK-SD-NEXT: mov.b v4[11], w8 +; CHECK-SD-NEXT: mov w8, #48 // =0x30 +; CHECK-SD-NEXT: mov.b v4[12], w8 +; CHECK-SD-NEXT: mov w8, #52 // =0x34 +; CHECK-SD-NEXT: mov.b v4[13], w8 +; CHECK-SD-NEXT: mov w8, #56 // =0x38 +; CHECK-SD-NEXT: mov.b v4[14], w8 +; CHECK-SD-NEXT: mov w8, #60 // =0x3c +; CHECK-SD-NEXT: mov.b v4[15], w8 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s4, w0 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: mov.16b v5, v4 
+; CHECK-GI-NEXT: mov.b v5[1], v4[0] +; CHECK-GI-NEXT: mov.b v5[2], v4[0] +; CHECK-GI-NEXT: mov.b v5[3], v4[0] +; CHECK-GI-NEXT: mov.b v5[4], v4[0] +; CHECK-GI-NEXT: mov.b v5[5], v4[0] +; CHECK-GI-NEXT: mov.b v5[6], v4[0] +; CHECK-GI-NEXT: mov.b v5[7], v4[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI10_1 +; CHECK-GI-NEXT: mov.b v5[8], v4[0] +; CHECK-GI-NEXT: mov.b v5[9], v4[0] +; CHECK-GI-NEXT: mov.b v5[10], v4[0] +; CHECK-GI-NEXT: mov.b v5[11], v4[0] +; CHECK-GI-NEXT: mov.b v5[12], v4[0] +; CHECK-GI-NEXT: mov.b v5[13], v4[0] +; CHECK-GI-NEXT: mov.b v5[14], v4[0] +; CHECK-GI-NEXT: mov.b v5[15], v4[0] +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI10_1] +; CHECK-GI-NEXT: adrp x8, .LCPI10_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 @@ -234,40 +415,111 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x ret <16 x i8> %s } +; CHECK-GI-LABEL: .LCPI11_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 15 // 0xf +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 31 // 0x1f +; CHECK-GI-LABEL: .LCPI11_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 
+; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff + define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: mov.b v4[1], w8 -; CHECK-NEXT: mov.b v4[2], w8 -; CHECK-NEXT: mov.b v4[3], w8 -; CHECK-NEXT: mov.b v4[4], w8 -; CHECK-NEXT: mov.b v4[5], w8 -; CHECK-NEXT: mov.b v4[6], w8 -; CHECK-NEXT: mov w8, #32 // =0x20 -; CHECK-NEXT: mov.b v4[7], w0 -; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: mov w8, #36 // =0x24 -; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: mov w8, #40 // =0x28 -; CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov w8, #44 // =0x2c -; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov w8, #48 // =0x30 -; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov w8, #52 // =0x34 -; CHECK-NEXT: mov.b v4[13], w8 -; CHECK-NEXT: mov w8, #56 // =0x38 -; CHECK-NEXT: mov.b v4[14], w8 -; CHECK-NEXT: mov w8, #31 // =0x1f -; CHECK-NEXT: mov.b v4[15], w8 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: 
fmov s4, w8 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: mov.b v4[1], w8 +; CHECK-SD-NEXT: mov.b v4[2], w8 +; CHECK-SD-NEXT: mov.b v4[3], w8 +; CHECK-SD-NEXT: mov.b v4[4], w8 +; CHECK-SD-NEXT: mov.b v4[5], w8 +; CHECK-SD-NEXT: mov.b v4[6], w8 +; CHECK-SD-NEXT: mov w8, #32 // =0x20 +; CHECK-SD-NEXT: mov.b v4[7], w0 +; CHECK-SD-NEXT: mov.b v4[8], w8 +; CHECK-SD-NEXT: mov w8, #36 // =0x24 +; CHECK-SD-NEXT: mov.b v4[9], w8 +; CHECK-SD-NEXT: mov w8, #40 // =0x28 +; CHECK-SD-NEXT: mov.b v4[10], w8 +; CHECK-SD-NEXT: mov w8, #44 // =0x2c +; CHECK-SD-NEXT: mov.b v4[11], w8 +; CHECK-SD-NEXT: mov w8, #48 // =0x30 +; CHECK-SD-NEXT: mov.b v4[12], w8 +; CHECK-SD-NEXT: mov w8, #52 // =0x34 +; CHECK-SD-NEXT: mov.b v4[13], w8 +; CHECK-SD-NEXT: mov w8, #56 // =0x38 +; CHECK-SD-NEXT: mov.b v4[14], w8 +; CHECK-SD-NEXT: mov w8, #31 // =0x1f +; CHECK-SD-NEXT: mov.b v4[15], w8 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: fmov s6, w0 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: mov.16b v5, v4 +; CHECK-GI-NEXT: mov.b v5[1], v4[0] +; CHECK-GI-NEXT: mov.b v5[2], v4[0] +; CHECK-GI-NEXT: mov.b v5[3], v4[0] +; CHECK-GI-NEXT: mov.b v5[4], v4[0] +; CHECK-GI-NEXT: mov.b v5[5], v4[0] +; CHECK-GI-NEXT: mov.b v5[6], v4[0] +; CHECK-GI-NEXT: mov.b v5[7], v4[0] +; CHECK-GI-NEXT: fmov s4, w8 +; 
CHECK-GI-NEXT: adrp x8, .LCPI11_1 +; CHECK-GI-NEXT: mov.b v5[8], v4[0] +; CHECK-GI-NEXT: mov.b v5[9], v4[0] +; CHECK-GI-NEXT: mov.b v5[10], v4[0] +; CHECK-GI-NEXT: mov.b v5[11], v4[0] +; CHECK-GI-NEXT: mov.b v5[12], v6[0] +; CHECK-GI-NEXT: mov.b v5[13], v6[0] +; CHECK-GI-NEXT: mov.b v5[14], v4[0] +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI11_1] +; CHECK-GI-NEXT: adrp x8, .LCPI11_0 +; CHECK-GI-NEXT: mov.b v5[15], v6[0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v5 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0 %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1 %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2 @@ -290,29 +542,116 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x ret <16 x i8> %s } +; CHECK-SD-LABEL: .LCPI12_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 8 // 0x8 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff + +; CHECK-GI-LABEL: .LCPI12_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; 
CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI12_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff + define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask: -; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v4, #0xffffffffffffffff -; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI12_0] -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5 -; CHECK-NEXT: mov.b v4[0], w0 -; CHECK-NEXT: mov.b v4[1], w0 -; CHECK-NEXT: mov.b v4[2], w0 -; CHECK-NEXT: mov.b v4[3], w0 -; CHECK-NEXT: mov.b v4[4], w0 -; CHECK-NEXT: mov.b v4[5], w0 -; CHECK-NEXT: mov.b v4[6], w0 -; CHECK-NEXT: mov.b v4[7], w0 -; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4 -; CHECK-NEXT: mov.d v2[1], v0[0] -; CHECK-NEXT: mov.16b v0, v2 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi.2d v4, #0xffffffffffffffff +; CHECK-SD-NEXT: adrp x8, .LCPI12_0 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; 
CHECK-SD-NEXT: ldr q5, [x8, :lo12:.LCPI12_0] +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: tbl.16b v2, { v2, v3 }, v5 +; CHECK-SD-NEXT: mov.b v4[0], w0 +; CHECK-SD-NEXT: mov.b v4[1], w0 +; CHECK-SD-NEXT: mov.b v4[2], w0 +; CHECK-SD-NEXT: mov.b v4[3], w0 +; CHECK-SD-NEXT: mov.b v4[4], w0 +; CHECK-SD-NEXT: mov.b v4[5], w0 +; CHECK-SD-NEXT: mov.b v4[6], w0 +; CHECK-SD-NEXT: mov.b v4[7], w0 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-SD-NEXT: mov.d v2[1], v0[0] +; CHECK-SD-NEXT: mov.16b v0, v2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s4, w0 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: mov.16b v5, v4 +; CHECK-GI-NEXT: mov.b v5[1], v4[0] +; CHECK-GI-NEXT: mov.b v5[2], v4[0] +; CHECK-GI-NEXT: mov.b v5[3], v4[0] +; CHECK-GI-NEXT: mov.b v5[4], v4[0] +; CHECK-GI-NEXT: mov.b v5[5], v4[0] +; CHECK-GI-NEXT: mov.b v5[6], v4[0] +; CHECK-GI-NEXT: mov.b v5[7], v4[0] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI12_1 +; CHECK-GI-NEXT: mov.b v5[8], v4[0] +; CHECK-GI-NEXT: mov.b v5[9], v4[0] +; CHECK-GI-NEXT: mov.b v5[10], v4[0] +; CHECK-GI-NEXT: mov.b v5[11], v4[0] +; CHECK-GI-NEXT: mov.b v5[12], v4[0] +; CHECK-GI-NEXT: mov.b v5[13], v4[0] +; CHECK-GI-NEXT: mov.b v5[14], v4[0] +; CHECK-GI-NEXT: mov.b v5[15], v4[0] +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI12_1] +; CHECK-GI-NEXT: adrp x8, .LCPI12_0 +; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5 +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI12_0] +; CHECK-GI-NEXT: tbl.16b v0, { 
v2, v3 }, v0 +; CHECK-GI-NEXT: ret %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 @@ -335,29 +674,133 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x ret <16 x i8> %s } +; CHECK-SD-LABEL: .LCPI13_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 8 // 0x8 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-NEXT: .byte 255 // 0xff +; CHECK-SD-LABEL: .LCPI13_1: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 1 // 0x1 +; CHECK-SD-NEXT: .byte 2 // 0x2 +; CHECK-SD-NEXT: .byte 3 // 0x3 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 5 // 0x5 +; CHECK-SD-NEXT: .byte 6 // 0x6 +; CHECK-SD-NEXT: .byte 7 // 0x7 +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 17 // 0x11 +; CHECK-SD-NEXT: .byte 18 // 0x12 +; CHECK-SD-NEXT: .byte 19 // 0x13 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 21 // 0x15 +; CHECK-SD-NEXT: .byte 30 // 0x1e +; CHECK-SD-NEXT: .byte 31 // 0x1f + +; CHECK-GI-LABEL: .LCPI13_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 2 // 0x2 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 
30 // 0x1e +; CHECK-GI-NEXT: .byte 31 // 0x1f +; CHECK-GI-LABEL: .LCPI13_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff + define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2: -; CHECK: // %bb.0: -; CHECK-NEXT: dup.16b v4, w0 -; CHECK-NEXT: mov w8, #255 // =0xff -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: mov.b v4[8], w8 -; CHECK-NEXT: mov.b v4[9], w8 -; CHECK-NEXT: mov.b v4[10], w8 -; CHECK-NEXT: mov.b v4[11], w8 -; CHECK-NEXT: mov.b v4[12], w8 -; CHECK-NEXT: mov.b v4[13], w8 -; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: adrp x8, .LCPI13_1 -; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5 -; CHECK-NEXT: tbl.16b v3, { v0, v1 }, v4 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI13_1] -; CHECK-NEXT: tbl.16b v0, { v2, v3 }, v0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup.16b v4, w0 +; CHECK-SD-NEXT: mov w8, #255 // =0xff +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // 
kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: mov.b v4[8], w8 +; CHECK-SD-NEXT: mov.b v4[9], w8 +; CHECK-SD-NEXT: mov.b v4[10], w8 +; CHECK-SD-NEXT: mov.b v4[11], w8 +; CHECK-SD-NEXT: mov.b v4[12], w8 +; CHECK-SD-NEXT: mov.b v4[13], w8 +; CHECK-SD-NEXT: adrp x8, .LCPI13_0 +; CHECK-SD-NEXT: ldr q5, [x8, :lo12:.LCPI13_0] +; CHECK-SD-NEXT: adrp x8, .LCPI13_1 +; CHECK-SD-NEXT: tbl.16b v2, { v2, v3 }, v5 +; CHECK-SD-NEXT: tbl.16b v3, { v0, v1 }, v4 +; CHECK-SD-NEXT: ldr q0, [x8, :lo12:.LCPI13_1] +; CHECK-SD-NEXT: tbl.16b v0, { v2, v3 }, v0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s4, w0 +; CHECK-GI-NEXT: mov w8, #255 // =0xff +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: fmov s6, w8 +; CHECK-GI-NEXT: adrp x8, .LCPI13_1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: mov.16b v5, v4 +; CHECK-GI-NEXT: mov.b v5[1], v4[0] +; CHECK-GI-NEXT: mov.b v5[2], v4[0] +; CHECK-GI-NEXT: mov.b v5[3], v4[0] +; CHECK-GI-NEXT: mov.b v5[4], v4[0] +; CHECK-GI-NEXT: mov.b v5[5], v4[0] +; CHECK-GI-NEXT: mov.b v5[6], v4[0] +; CHECK-GI-NEXT: mov.b v5[7], v4[0] +; CHECK-GI-NEXT: mov.b v5[8], v6[0] +; CHECK-GI-NEXT: mov.b v5[9], v6[0] +; CHECK-GI-NEXT: mov.b v5[10], v6[0] +; CHECK-GI-NEXT: mov.b v5[11], v6[0] +; CHECK-GI-NEXT: mov.b v5[12], v6[0] +; CHECK-GI-NEXT: mov.b v5[13], v6[0] +; CHECK-GI-NEXT: mov.b v5[14], v4[0] +; CHECK-GI-NEXT: mov.b v5[15], v4[0] +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI13_1] +; CHECK-GI-NEXT: adrp x8, .LCPI13_0 +; CHECK-GI-NEXT: tbl.16b v2, { v2, v3 }, v4 +; CHECK-GI-NEXT: tbl.16b v3, { v0, v1 }, v5 +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI13_0] +; 
CHECK-GI-NEXT: tbl.16b v0, { v2, v3 }, v0 +; CHECK-GI-NEXT: ret %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 @@ -380,106 +823,293 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 ret <16 x i8> %s } +; CHECK-SD-LABEL: .LCPI14_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 32 // 0x20 +; CHECK-SD-NEXT: .byte 36 // 0x24 +; CHECK-SD-NEXT: .byte 40 // 0x28 +; CHECK-SD-NEXT: .byte 44 // 0x2c +; CHECK-SD-NEXT: .byte 48 // 0x30 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 56 // 0x38 +; CHECK-SD-NEXT: .byte 60 // 0x3c -; CHECK-LABEL: .LCPI14_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 16 // 0x10 -; CHECK-NEXT: .byte 20 // 0x14 -; CHECK-NEXT: .byte 24 // 0x18 -; CHECK-NEXT: .byte 28 // 0x1c -; CHECK-NEXT: .byte 32 // 0x20 -; CHECK-NEXT: .byte 36 // 0x24 -; CHECK-NEXT: .byte 40 // 0x28 -; CHECK-NEXT: .byte 44 // 0x2c -; CHECK-NEXT: .byte 48 // 0x30 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 56 // 0x38 -; CHECK-NEXT: .byte 60 // 0x3c +; CHECK-GI-LABEL: .LCPI14_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 
+; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI14_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI14_0 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI14_0] +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI14_1 +; CHECK-GI-NEXT: // kill: def $q0 
killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI14_1] +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: adrp x8, .LCPI14_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v4 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> ret <16 x i8> %s } -; CHECK-LABEL: .LCPI15_0: -; CHECK-NEXT: .byte 0 // 0x0 -; CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 16 // 0x10 -; CHECK-NEXT: .byte 20 // 0x14 -; CHECK-NEXT: .byte 24 // 0x18 -; CHECK-NEXT: .byte 28 // 0x1c -; CHECK-NEXT: .byte 32 // 0x20 -; CHECK-NEXT: .byte 36 // 0x24 -; CHECK-NEXT: .byte 40 // 0x28 -; CHECK-NEXT: .byte 44 // 0x2c -; CHECK-NEXT: .byte 48 // 0x30 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 56 // 0x38 -; CHECK-NEXT: .byte 60 // 0x3c +; CHECK-SD-LABEL: .LCPI15_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 32 // 0x20 +; CHECK-SD-NEXT: .byte 36 // 0x24 +; CHECK-SD-NEXT: .byte 40 // 0x28 +; CHECK-SD-NEXT: .byte 44 // 0x2c +; CHECK-SD-NEXT: .byte 48 // 0x30 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 56 // 0x38 +; CHECK-SD-NEXT: .byte 60 // 0x3c + +; CHECK-GI-LABEL: .LCPI15_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 
// 0x1 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI15_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-LABEL: .LCPI15_2: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: // kill: def $q2 killed 
$q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI15_0 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI15_0] +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI15_2 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI15_2] +; CHECK-GI-NEXT: adrp x8, .LCPI15_1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI15_1] +; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> ret <16 x i8> %s } -; CHECK-LABEL: .LCPI16_0: -; CHECK-NEXT: .byte 0 // 0x0 -; 
CHECK-NEXT: .byte 4 // 0x4 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 12 // 0xc -; CHECK-NEXT: .byte 16 // 0x10 -; CHECK-NEXT: .byte 20 // 0x14 -; CHECK-NEXT: .byte 24 // 0x18 -; CHECK-NEXT: .byte 28 // 0x1c -; CHECK-NEXT: .byte 32 // 0x20 -; CHECK-NEXT: .byte 36 // 0x24 -; CHECK-NEXT: .byte 40 // 0x28 -; CHECK-NEXT: .byte 44 // 0x2c -; CHECK-NEXT: .byte 48 // 0x30 -; CHECK-NEXT: .byte 52 // 0x34 -; CHECK-NEXT: .byte 56 // 0x38 -; CHECK-NEXT: .byte 60 // 0x3c +; CHECK-SD-LABEL: .LCPI16_0: +; CHECK-SD: .byte 0 // 0x0 +; CHECK-SD-NEXT: .byte 4 // 0x4 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 12 // 0xc +; CHECK-SD-NEXT: .byte 16 // 0x10 +; CHECK-SD-NEXT: .byte 20 // 0x14 +; CHECK-SD-NEXT: .byte 24 // 0x18 +; CHECK-SD-NEXT: .byte 28 // 0x1c +; CHECK-SD-NEXT: .byte 32 // 0x20 +; CHECK-SD-NEXT: .byte 36 // 0x24 +; CHECK-SD-NEXT: .byte 40 // 0x28 +; CHECK-SD-NEXT: .byte 44 // 0x2c +; CHECK-SD-NEXT: .byte 48 // 0x30 +; CHECK-SD-NEXT: .byte 52 // 0x34 +; CHECK-SD-NEXT: .byte 56 // 0x38 +; CHECK-SD-NEXT: .byte 60 // 0x3c + +; CHECK-GI-LABEL: .LCPI16_0: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 1 // 0x1 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 3 // 0x3 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 5 // 0x5 +; CHECK-GI-NEXT: .byte 6 // 0x6 +; CHECK-GI-NEXT: .byte 7 // 0x7 +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 17 // 0x11 +; CHECK-GI-NEXT: .byte 18 // 0x12 +; CHECK-GI-NEXT: .byte 19 // 0x13 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 21 // 0x15 +; CHECK-GI-NEXT: .byte 22 // 0x16 +; CHECK-GI-NEXT: .byte 23 // 0x17 +; CHECK-GI-LABEL: .LCPI16_1: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 255 // 0xff +; 
CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-LABEL: .LCPI16_2: +; CHECK-GI: .byte 0 // 0x0 +; CHECK-GI-NEXT: .byte 4 // 0x4 +; CHECK-GI-NEXT: .byte 8 // 0x8 +; CHECK-GI-NEXT: .byte 12 // 0xc +; CHECK-GI-NEXT: .byte 16 // 0x10 +; CHECK-GI-NEXT: .byte 20 // 0x14 +; CHECK-GI-NEXT: .byte 24 // 0x18 +; CHECK-GI-NEXT: .byte 28 // 0x1c +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff +; CHECK-GI-NEXT: .byte 255 // 0xff define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { -; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: adrp x8, .LCPI16_0 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ldr q4, [x8, :lo12:.LCPI16_0] +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: tbl.16b v0, { v0, v1, 
v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI16_2 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI16_2] +; CHECK-GI-NEXT: adrp x8, .LCPI16_1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 +; CHECK-GI-NEXT: ldr q5, [x8, :lo12:.LCPI16_1] +; CHECK-GI-NEXT: adrp x8, .LCPI16_0 +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v4 +; CHECK-GI-NEXT: tbl.16b v1, { v2, v3 }, v5 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI16_0] +; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2 +; CHECK-GI-NEXT: ret %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> @@ -514,73 +1144,121 @@ define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind { } define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) { -; CHECK-LABEL: tbx2_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 -; CHECK-NEXT: tbx.8b v0, { v1, v2 }, v3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx2_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 +; CHECK-SD-NEXT: tbx.8b v0, { v1, v2 }, v3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx2_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 +; CHECK-GI-NEXT: tbx.8b v0, { v1, v2 }, v3 +; 
CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) ret <8 x i8> %tmp3 } define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) { -; CHECK-LABEL: tbx2_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 -; CHECK-NEXT: tbx.16b v0, { v1, v2 }, v3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx2_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 +; CHECK-SD-NEXT: tbx.16b v0, { v1, v2 }, v3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx2_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2 +; CHECK-GI-NEXT: tbx.16b v0, { v1, v2 }, v3 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) ret <16 x i8> %tmp3 } define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) { -; CHECK-LABEL: tbx3_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: tbx.8b v0, { v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx3_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: tbx.8b v0, { v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx3_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 
killed $q1 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: tbx.8b v0, { v1, v2, v3 }, v4 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) ret <8 x i8> %tmp3 } define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) { -; CHECK-LABEL: tbx3_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 -; CHECK-NEXT: tbx.16b v0, { v1, v2, v3 }, v4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx3_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-SD-NEXT: tbx.16b v0, { v1, v2, v3 }, v4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx3_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3 +; CHECK-GI-NEXT: tbx.16b v0, { v1, v2, v3 }, v4 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) ret <16 x i8> %tmp3 } define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) { -; CHECK-LABEL: tbx4_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def 
$q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: tbx.8b v0, { v1, v2, v3, v4 }, v5 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx4_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: tbx.8b v0, { v1, v2, v3, v4 }, v5 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx4_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: tbx.8b v0, { v1, v2, v3, v4 }, v5 +; CHECK-GI-NEXT: ret %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) ret <8 x i8> %tmp3 } define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) { -; CHECK-LABEL: tbx4_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 -; CHECK-NEXT: tbx.16b v0, { v1, v2, v3, v4 }, v5 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: tbx4_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; 
CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-SD-NEXT: tbx.16b v0, { v1, v2, v3, v4 }, v5 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbx4_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4 +; CHECK-GI-NEXT: tbx.16b v0, { v1, v2, v3, v4 }, v5 +; CHECK-GI-NEXT: ret %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) ret <16 x i8> %tmp3 } @@ -594,6 +1272,3 @@ declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-GI: {{.*}} -; CHECK-SD: {{.*}} -- cgit v1.1