; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s

; Test optimization of DUP with extended narrow loads
; This should avoid GPR->SIMD transfers by loading directly into vector registers
;
; Every function below follows the same shape: a scalar load that is narrower
; than the vector element, a zext to the element type, an insertelement into
; lane 0 and a zero-mask shufflevector that splats lane 0 to all lanes. The
; expected assembly is a single FP/SIMD-register load followed by a lane DUP.

; i8 load, zero-extended to i16, splatted across <4 x i16>.
define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.4h, v0.h[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i16
  %vec = insertelement <4 x i16> poison, i16 %ext, i32 0
  %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
  ret <4 x i16> %dup
}

; i8 load, zero-extended to i16, splatted across <8 x i16>.
define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i16
  %vec = insertelement <8 x i16> poison, i16 %ext, i32 0
  %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
  ret <8 x i16> %dup
}

; i8 load, zero-extended to i32, splatted across <2 x i32>.
define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.2s, v0.s[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
  %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
  ret <2 x i32> %dup
}

; i8 load, zero-extended to i32, splatted across <4 x i32>.
define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; Same as the v4i32 case, but the i8 is loaded from a constant byte offset of 4,
; which is expected to fold into the load's immediate addressing mode.
define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0, #4]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %addr = getelementptr inbounds i8, ptr %p, i64 4
  %load = load i8, ptr %addr, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; Same as the v4i32 case, but the i8 is loaded from a variable byte offset,
; which is expected to fold into the load's register-offset addressing mode.
define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0, x1]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %addr = getelementptr inbounds i8, ptr %p, i64 %offset
  %load = load i8, ptr %addr, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; i8 load, zero-extended to i64, splatted across <2 x i64>.
define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.2d, v0.d[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i64
  %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
  %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %dup
}

; i16 load, zero-extended to i32, splatted across <2 x i32>.
define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    dup v0.2s, v0.s[0]
; CHECK-NEXT:    ret
  %load = load i16, ptr %p, align 1
  %ext = zext i16 %load to i32
  %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
  %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
  ret <2 x i32> %dup
}

; i16 load, zero-extended to i32, splatted across <4 x i32>.
define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %load = load i16, ptr %p, align 1
  %ext = zext i16 %load to i32
  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; Same as the i16 v4i32 case, but the load address is element index 4 of an
; i16 array (byte offset 8), expected to fold into the immediate addressing mode.
define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0, #8]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %addr = getelementptr inbounds i16, ptr %p, i64 4
  %load = load i16, ptr %addr, align 1
  %ext = zext i16 %load to i32
  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; Same as the i16 v4i32 case, but indexed by a variable i16 element offset,
; expected to fold into the scaled register-offset addressing mode (lsl #1).
define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %addr = getelementptr inbounds i16, ptr %p, i64 %offset
  %load = load i16, ptr %addr, align 1
  %ext = zext i16 %load to i32
  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; i16 load, zero-extended to i64, splatted across <2 x i64>.
define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    dup v0.2d, v0.d[0]
; CHECK-NEXT:    ret
  %load = load i16, ptr %p, align 1
  %ext = zext i16 %load to i64
  %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
  %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %dup
}

; i32 load, zero-extended to i64, splatted across <2 x i64>.
define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    dup v0.2d, v0.d[0]
; CHECK-NEXT:    ret
  %load = load i32, ptr %p, align 1
  %ext = zext i32 %load to i64
  %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
  %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %dup
}