diff options
Diffstat (limited to 'llvm/test/CodeGen')
22 files changed, 1747 insertions, 276 deletions
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll index 9193025..6177ae5 100644 --- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll @@ -112,8 +112,7 @@ entry: define double @load_u64_from_u8_off1(ptr %n){ ; CHECK-LABEL: load_u64_from_u8_off1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0, #1] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldr b0, [x0, #1] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 1 @@ -140,8 +139,7 @@ entry: define float @load_u32_from_u8_off1(ptr %n){ ; CHECK-LABEL: load_u32_from_u8_off1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0, #1] -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldr b0, [x0, #1] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 1 @@ -154,8 +152,7 @@ entry: define half @load_u16_from_u8_off1(ptr %n){ ; CHECK-LABEL: load_u16_from_u8_off1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0, #1] -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldr b0, [x0, #1] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-NEXT: ret entry: @@ -185,8 +182,7 @@ entry: define double @load_u64_from_u16_off2(ptr %n){ ; CHECK-LABEL: load_u64_from_u16_off2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrh w8, [x0, #2] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldr h0, [x0, #2] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 2 @@ -199,8 +195,7 @@ entry: define double @load_u64_from_u8_off2(ptr %n){ ; CHECK-LABEL: load_u64_from_u8_off2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldr b0, [x0, #2] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 2 @@ -226,7 +221,7 @@ entry: define float @load_u32_from_u8_off2(ptr %n){ ; CHECK-LABEL: load_u32_from_u8_off2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ldr b0, [x0, #2] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 2 @@ -239,7 +234,7 @@ entry: define half @load_u16_from_u8_off2(ptr %n){ ; CHECK-LABEL: load_u16_from_u8_off2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ldr b0, [x0, #2] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-NEXT: ret entry: @@ -283,8 +278,7 @@ entry: define double @load_u64_from_u8_off255(ptr %n){ ; CHECK-LABEL: load_u64_from_u8_off255: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0, #255] -; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ldr b0, [x0, #255] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 255 @@ -311,8 +305,7 @@ entry: define float @load_u32_from_u8_off255(ptr %n){ ; CHECK-LABEL: load_u32_from_u8_off255: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0, #255] -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldr b0, [x0, #255] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 255 @@ -325,8 +318,7 @@ entry: define half @load_u16_from_u8_off255(ptr %n){ ; CHECK-LABEL: load_u16_from_u8_off255: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldrb w8, [x0, #255] -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldr b0, [x0, #255] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-NEXT: ret entry: @@ -354,7 +346,7 @@ entry: define double @load_u64_from_u16_off256(ptr %n){ ; CHECK-LABEL: load_u64_from_u16_off256: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr h0, [x0, #128] +; CHECK-NEXT: ldr h0, [x0, #256] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 256 @@ -367,7 +359,7 @@ entry: define double @load_u64_from_u8_off256(ptr %n){ ; 
CHECK-LABEL: load_u64_from_u8_off256: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #64] +; CHECK-NEXT: ldr b0, [x0, #256] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 256 @@ -393,7 +385,7 @@ entry: define float @load_u32_from_u8_off256(ptr %n){ ; CHECK-LABEL: load_u32_from_u8_off256: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #128] +; CHECK-NEXT: ldr b0, [x0, #256] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 256 @@ -406,7 +398,7 @@ entry: define half @load_u16_from_u8_off256(ptr %n){ ; CHECK-LABEL: load_u16_from_u8_off256: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #128] +; CHECK-NEXT: ldr b0, [x0, #256] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-NEXT: ret entry: @@ -435,8 +427,7 @@ entry: define double @load_u64_from_u16_offn(ptr %n){ ; CHECK-LABEL: load_u64_from_u16_offn: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #8190 // =0x1ffe -; CHECK-NEXT: ldr h0, [x0, x8] +; CHECK-NEXT: ldr h0, [x0, #8190] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 8190 @@ -517,7 +508,8 @@ entry: define double @load_u64_from_u16_offnp1(ptr %n){ ; CHECK-LABEL: load_u64_from_u16_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr h0, [x0, #4096] +; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192 +; CHECK-NEXT: ldr h0, [x8] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 8192 @@ -530,7 +522,8 @@ entry: define double @load_u64_from_u8_offnp1(ptr %n){ ; CHECK-LABEL: load_u64_from_u8_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #1024] +; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096 +; CHECK-NEXT: ldr b0, [x8] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 4096 @@ -557,7 +550,8 @@ entry: define float @load_u32_from_u8_offnp1(ptr %n){ ; CHECK-LABEL: load_u32_from_u8_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #2048] +; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096 +; CHECK-NEXT: ldr b0, [x8] ; CHECK-NEXT: ret entry: %p = getelementptr i8, ptr %n, i64 4096 @@ -570,7 +564,8 @@ entry: define half @load_u16_from_u8_offnp1(ptr %n){ ; CHECK-LABEL: load_u16_from_u8_offnp1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #2048] +; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096 +; CHECK-NEXT: ldr b0, [x8] ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll index b3a7057..c551375 100644 --- a/llvm/test/CodeGen/AMDGPU/add-max.ll +++ b/llvm/test/CodeGen/AMDGPU/add-max.ll @@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret } define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) { -; SDAG-LABEL: add_max_u32_ssv: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: add_max_u32_ssv: -; GISEL: 
; %bb.0: -; GISEL-NEXT: s_add_co_i32 s0, s0, s1 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: v_max_u32_e32 v0, s0, v0 -; GISEL-NEXT: ; return to shader part epilog - %add = add i32 %a, %b +; GCN-LABEL: add_max_u32_ssv: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_max_u32 v0, s0, s1, v0 +; GCN-NEXT: ; return to shader part epilog + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret } define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) { -; GCN-LABEL: add_max_u32_sss: -; GCN: ; %bb.0: -; GCN-NEXT: s_add_co_i32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_max_u32 s0, s0, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b +; SDAG-LABEL: add_max_u32_sss: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: v_max_u32_e32 v0, s2, v0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: add_max_u32_sss: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_add_max_u32 v0, s0, s1, v0 +; GISEL-NEXT: ; return to shader part epilog + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 4) %ret = bitcast i32 %max to float ret float %ret @@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %add, i32 100) %ret = bitcast i32 %max to float ret float %ret } -define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) { -; SDAG-LABEL: add_max_u32_slv: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: add_max_u32_slv: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_addk_co_i32 s0, 0x64 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: v_max_u32_e32 v0, s0, v0 -; GISEL-NEXT: ; return to shader part epilog - %add = add i32 %a, 100 - %max = call i32 @llvm.umax.i32(i32 %add, i32 %b) +define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) { +; GCN-LABEL: add_max_u32_slv: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) + %max = call i32 @llvm.umax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret } @@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.smax.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: 
v_add_min_u32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.umin.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) { ; GCN: ; %bb.0: ; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add i32 %a, %b + %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b) %max = call i32 @llvm.smin.i32(i32 %add, i32 %c) %ret = bitcast i32 %max to float ret float %ret @@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret } define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) { -; SDAG-LABEL: add_max_v2u16_ssv: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: add_max_v2u16_ssv: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_lshr_b32 s2, s0, 16 -; GISEL-NEXT: s_lshr_b32 s3, s1, 16 -; GISEL-NEXT: s_add_co_i32 s0, s0, s1 -; GISEL-NEXT: s_add_co_i32 s2, s2, s3 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GISEL-NEXT: v_pk_max_u16 v0, s0, v0 -; GISEL-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b +; GCN-LABEL: add_max_v2u16_ssv: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, v0 +; GCN-NEXT: ; return to shader part epilog + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) { ; SDAG-LABEL: add_max_v2u16_sss: ; SDAG: ; %bb.0: -; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 +; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_pk_max_u16 v0, v0, s2 ; SDAG-NEXT: ; return to shader part epilog ; ; GISEL-LABEL: add_max_v2u16_sss: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_lshr_b32 s3, s0, 16 -; GISEL-NEXT: s_lshr_b32 s4, s1, 16 -; GISEL-NEXT: s_add_co_i32 s0, s0, s1 -; GISEL-NEXT: s_add_co_i32 s3, s3, s4 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GISEL-NEXT: s_and_b32 s3, s2, 0xffff -; GISEL-NEXT: s_lshr_b32 s1, s0, 16 -; GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GISEL-NEXT: s_lshr_b32 s2, s2, 16 -; GISEL-NEXT: s_max_u32 s0, s0, s3 -; GISEL-NEXT: s_max_u32 s1, s1, s2 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) -; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_pk_add_max_u16 v0, s0, s1, v0 ; GISEL-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) { ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 4, i16 0>) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) { ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 100, i16 101>) %ret = bitcast <2 x i16> %max to float ret float %ret } define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) { -; SDAG-LABEL: add_max_v2u16_slv: -; SDAG: ; %bb.0: -; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: add_max_v2u16_slv: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_lshr_b32 s1, s0, 16 -; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064 -; GISEL-NEXT: s_addk_co_i32 s1, 0x64 -; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GISEL-NEXT: v_pk_max_u16 v0, s0, v0 -; GISEL-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, <i16 100, i16 100> +; GCN-LABEL: add_max_v2u16_slv: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0 +; GCN-NEXT: ; return to shader part epilog + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> <i16 100, i16 100>) %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %b) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.smax.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b) %max = call <2 x i16> @llvm.umin.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret @@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> ; GCN: ; %bb.0: ; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog - %add = add <2 x i16> %a, %b + %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x 
i16> %b) %max = call <2 x i16> @llvm.smin.v216(<2 x i16> %add, <2 x i16> %c) %ret = bitcast <2 x i16> %max to float ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 711d57b..30ad46d9 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -39131,21 +39131,21 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_xor_b32_e32 v2, v0, v1 ; GFX1250-NEXT: v_cls_i32_e32 v3, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_add_nc_u32 v3, -1, v3 :: v_dual_bitop2_b32 v2, v0, v1 bitop3:0x14 ; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2 +; GFX1250-NEXT: v_min_u32_e32 v2, v3, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2 ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 ; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp i64 %x to bfloat @@ -39483,29 +39483,30 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX1250-NEXT: v_xor_b32_e32 v4, v2, v3 ; GFX1250-NEXT: v_cls_i32_e32 v6, v3 ; GFX1250-NEXT: v_cls_i32_e32 v7, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4 +; GFX1250-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_add_nc_u32 v7, -1, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5 -; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4 +; GFX1250-NEXT: v_min_u32_e32 v5, v7, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v4, v6, v4 ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1] -; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] ; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 ; GFX1250-NEXT: v_sub_nc_u32_e32 v3, 32, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <2 x i64> %x to <2 x bfloat> @@ -39968,41 +39969,42 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250TRUE16: ; %bb.0: ; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v2, v3 -; GFX1250TRUE16-NEXT: v_xor_b32_e32 v6, v4, v5 +; GFX1250TRUE16-NEXT: v_cls_i32_e32 v6, v5 +; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5 ; GFX1250TRUE16-NEXT: v_cls_i32_e32 v10, v3 -; GFX1250TRUE16-NEXT: v_cls_i32_e32 v9, v5 ; GFX1250TRUE16-NEXT: v_cls_i32_e32 v11, v1 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_ashrrev_i32 v6, 31, v6 -; GFX1250TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1 -; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6 -; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3] -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5] -; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14 +; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_bitop2_b32 v8, v2, v3 bitop3:0x14 +; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_ashrrev_i32 v8, 31, v8 +; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_add_nc_u32 v11, -1, v11 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9 +; GFX1250TRUE16-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5] +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250TRUE16-NEXT: v_min_u32_e32 v7, v10, v8 +; GFX1250TRUE16-NEXT: 
v_min_u32_e32 v8, v11, v9 +; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3] +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1] ; GFX1250TRUE16-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX1250TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250TRUE16-NEXT: v_add_min_u32 v8, v11, -1, v8 -; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 +; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1] -; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v8 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250TRUE16-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX1250TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v5, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v4 ; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX1250TRUE16-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -40017,44 +40019,47 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX1250FAKE16: ; %bb.0: ; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250FAKE16-NEXT: v_xor_b32_e32 v8, v4, v5 -; GFX1250FAKE16-NEXT: v_xor_b32_e32 v6, v2, v3 +; GFX1250FAKE16-NEXT: v_cls_i32_e32 v6, v5 +; GFX1250FAKE16-NEXT: v_xor_b32_e32 v7, v2, v3 ; GFX1250FAKE16-NEXT: v_cls_i32_e32 v10, v3 -; GFX1250FAKE16-NEXT: v_cls_i32_e32 v9, v5 ; GFX1250FAKE16-NEXT: v_cls_i32_e32 v11, v1 -; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v8, 31, v8 :: v_dual_bitop2_b32 v7, v0, v1 bitop3:0x14 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7 -; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_add_min_u32 v6, v10, -1, v6 -; GFX1250FAKE16-NEXT: v_add_min_u32 v7, v11, -1, v7 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v8, v4, v5 bitop3:0x14 +; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] -; GFX1250FAKE16-NEXT: 
v_lshlrev_b64_e32 v[0:1], v7, v[0:1] -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8 -; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_ashrrev_i32 v8, 31, v8 +; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v11, -1, v11 :: v_dual_ashrrev_i32 v9, 31, v9 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v8, 32, v8 +; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250FAKE16-NEXT: v_min_u32_e32 v7, v10, v7 +; GFX1250FAKE16-NEXT: v_min_u32_e32 v6, v6, v8 ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX1250FAKE16-NEXT: v_add_min_u32 v8, v9, -1, v8 +; GFX1250FAKE16-NEXT: v_min_u32_e32 v9, v11, v9 +; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3] ; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 -; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5] -; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v8, 32, v8 -; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5] +; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v9, v[0:1] +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2 ; GFX1250FAKE16-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6 +; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v9 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54 +; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3 ; GFX1250FAKE16-NEXT: v_ldexp_f32 v0, v0, v4 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v8 +; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v6 ; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 ; 
GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 @@ -40644,51 +40649,54 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX1250-NEXT: v_cls_i32_e32 v9, v7 ; GFX1250-NEXT: v_xor_b32_e32 v8, v6, v7 -; GFX1250-NEXT: v_cls_i32_e32 v12, v7 -; GFX1250-NEXT: v_cls_i32_e32 v13, v5 -; GFX1250-NEXT: v_cls_i32_e32 v14, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_ashrrev_i32 v8, 31, v8 -; GFX1250-NEXT: v_xor_b32_e32 v10, v2, v3 -; GFX1250-NEXT: v_cls_i32_e32 v15, v1 -; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14 -; GFX1250-NEXT: v_add_min_u32 v9, v13, -1, v9 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_min_u32 v8, v12, -1, v8 -; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5] -; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11 -; GFX1250-NEXT: v_add_min_u32 v10, v14, -1, v10 +; GFX1250-NEXT: v_cls_i32_e32 v10, v5 +; GFX1250-NEXT: v_xor_b32_e32 v14, v0, v1 +; GFX1250-NEXT: v_cls_i32_e32 v12, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_add_nc_u32 v9, -1, v9 :: v_dual_ashrrev_i32 v8, 31, v8 +; GFX1250-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_bitop2_b32 v11, v4, v5 bitop3:0x14 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_min_u32 v11, v15, -1, v11 -; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3] -; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 -; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: v_dual_add_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v13, v2, v3 bitop3:0x14 +; GFX1250-NEXT: v_ashrrev_i32_e32 v11, 31, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v8, v9, v8 +; GFX1250-NEXT: v_ashrrev_i32_e32 v9, 31, v13 +; GFX1250-NEXT: v_cls_i32_e32 v13, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_ashrrev_i32 v14, 31, v14 :: v_dual_add_nc_u32 v11, 32, v11 +; GFX1250-NEXT: v_dual_add_nc_u32 v12, -1, v12 :: v_dual_add_nc_u32 v9, 32, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_add_nc_u32 v13, -1, v13 :: v_dual_add_nc_u32 v14, 32, v14 +; GFX1250-NEXT: v_min_u32_e32 v10, v10, v11 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_min_u32_e32 v9, v12, v9 +; GFX1250-NEXT: v_min_u32_e32 v11, v13, v14 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v10, v[4:5] +; GFX1250-NEXT: 
v_lshlrev_b64_e32 v[2:3], v9, v[2:3] +; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v11, v[0:1] +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v10 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 -; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v9 +; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v10 ; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 +; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v6 +; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v8 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v6 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4 ; GFX1250-NEXT: v_sub_nc_u32_e32 v6, 32, v11 ; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX1250-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1250-NEXT: v_ldexp_f32 v1, v3, v1 ; GFX1250-NEXT: v_ldexp_f32 v3, v4, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_ldexp_f32 v2, v2, v5 diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index 9601a2e..2731148 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -166,6 +166,7 @@ ; CHECK-NEXT: ARM Execution Domain Fix ; CHECK-NEXT: BreakFalseDeps ; CHECK-NEXT: ARM pseudo instruction expansion pass +; CHECK-NEXT: Insert KCFI indirect call checks ; CHECK-NEXT: Thumb2 instruction size reduce pass ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction diff --git a/llvm/test/CodeGen/ARM/kcfi-arm.ll b/llvm/test/CodeGen/ARM/kcfi-arm.ll new file mode 100644 index 0000000..e3696cf --- /dev/null +++ b/llvm/test/CodeGen/ARM/kcfi-arm.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL +; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI + +; MIR checks for all functions (grouped here to prevent update_llc_test_checks.py from removing them) + +; MIR-LABEL: name: f1 +; MIR: body: + +; ISEL: BLX %0, csr_aapcs,{{.*}} cfi-type 12345678 + +; KCFI: BUNDLE{{.*}} { +; KCFI-NEXT: KCFI_CHECK_ARM $r0, 12345678 +; KCFI-NEXT: BLX killed $r0, csr_aapcs,{{.*}} +; KCFI-NEXT: } + +; MIR-LABEL: name: f2 +; MIR: body: + +; ISEL: TCRETURNri %0, 0, csr_aapcs, implicit $sp, cfi-type 12345678 + +; KCFI: BUNDLE{{.*}} { +; KCFI-NEXT: KCFI_CHECK_ARM $r0, 12345678 +; KCFI-NEXT: TAILJMPr 
killed $r0, csr_aapcs, implicit $sp, implicit $sp +; KCFI-NEXT: } + +; ASM: .long 12345678 +define void @f1(ptr noundef %x) !kcfi_type !1 { +; ASM-LABEL: f1: +; ASM: @ %bb.0: +; ASM-NEXT: .save {r11, lr} +; ASM-NEXT: push {r11, lr} +; ASM-NEXT: bic r12, r0, #1 +; ASM-NEXT: ldr r12, [r12, #-4] +; ASM-NEXT: eor r12, r12, #78 +; ASM-NEXT: eor r12, r12, #24832 +; ASM-NEXT: eor r12, r12, #12320768 +; ASM-NEXT: eors r12, r12, #0 +; ASM-NEXT: beq .Ltmp0 +; ASM-NEXT: udf #33760 +; ASM-NEXT: .Ltmp0: +; ASM-NEXT: blx r0 +; ASM-NEXT: pop {r11, pc} + + call void %x() [ "kcfi"(i32 12345678) ] + ret void +} + +; Test with tail call +define void @f2(ptr noundef %x) !kcfi_type !1 { +; ASM-LABEL: f2: +; ASM: @ %bb.0: +; ASM-NEXT: bic r12, r0, #1 +; ASM-NEXT: ldr r12, [r12, #-4] +; ASM-NEXT: eor r12, r12, #78 +; ASM-NEXT: eor r12, r12, #24832 +; ASM-NEXT: eor r12, r12, #12320768 +; ASM-NEXT: eors r12, r12, #0 +; ASM-NEXT: beq .Ltmp1 +; ASM-NEXT: udf #33760 +; ASM-NEXT: .Ltmp1: +; ASM-NEXT: bx r0 + + tail call void %x() [ "kcfi"(i32 12345678) ] + ret void +} + +; Test r3 spill/reload when target is r12 and r3 is a call argument. +; With 5+ arguments (target + 4 args), r0-r3 are all used for arguments, +; forcing r3 to be spilled when we need it as scratch register. +define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 { +; ASM-LABEL: f3_r3_spill: +; ASM: @ %bb.0: +; ASM-NEXT: .save {r11, lr} +; ASM-NEXT: push {r11, lr} +; ASM-NEXT: mov lr, r3 +; ASM-NEXT: ldr r3, [sp, #8] +; ASM-NEXT: mov r12, r0 +; ASM-NEXT: mov r0, r1 +; ASM-NEXT: mov r1, r2 +; ASM-NEXT: mov r2, lr +; ASM-NEXT: stmdb sp!, {r3} +; ASM-NEXT: bic r3, r12, #1 +; ASM-NEXT: ldr r3, [r3, #-4] +; ASM-NEXT: eor r3, r3, #78 +; ASM-NEXT: eor r3, r3, #24832 +; ASM-NEXT: eor r3, r3, #12320768 +; ASM-NEXT: eors r3, r3, #0 +; ASM-NEXT: ldm sp!, {r3} +; ASM-NEXT: beq .Ltmp2 +; ASM-NEXT: udf #33772 +; ASM-NEXT: .Ltmp2: +; ASM-NEXT: blx r12 +; ASM-NEXT: pop {r11, pc} +; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp]=%d +; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12 +; Compiler shuffles arguments into place, saving r3 (c) in lr, loading d from stack +; r3 is live as 4th argument, so push it before KCFI check +; Restore r3 immediately after comparison, before branch + call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ] + ret void +} + +; Test with 3 arguments - r3 not live, target in r12, so r3 used as scratch without spilling +define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 { +; ASM-LABEL: f4_r3_unused: +; ASM: @ %bb.0: +; ASM-NEXT: .save {r11, lr} +; ASM-NEXT: push {r11, lr} +; ASM-NEXT: mov r3, r0 +; ASM-NEXT: mov r0, r1 +; ASM-NEXT: mov r1, r2 +; ASM-NEXT: bic r12, r3, #1 +; ASM-NEXT: ldr r12, [r12, #-4] +; ASM-NEXT: eor r12, r12, #78 +; ASM-NEXT: eor r12, r12, #24832 +; ASM-NEXT: eor r12, r12, #12320768 +; ASM-NEXT: eors r12, r12, #0 +; ASM-NEXT: beq .Ltmp3 +; ASM-NEXT: udf #33763 +; ASM-NEXT: .Ltmp3: +; ASM-NEXT: blx r3 +; ASM-NEXT: pop {r11, pc} +; Only 3 arguments total, so r3 is not used as call argument +; Compiler puts target→r3, a→r0, b→r1 +; r3 is the target, so we use r12 as scratch (no spill needed) + call void %target(i32 %a, i32 %b) [ "kcfi"(i32 12345678) ] + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"kcfi", i32 1} +!1 = !{i32 12345678} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; ISEL: {{.*}} +; KCFI: {{.*}} +; MIR: {{.*}} diff --git a/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll b/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll new file mode 100644 index 0000000..8e71cae --- /dev/null +++ b/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll @@ -0,0 +1,81 @@ +; RUN: llc -mtriple=thumbv7-linux-gnueabi -filetype=obj < %s +; RUN: llc -mtriple=thumbv7-linux-gnueabi < %s | FileCheck %s + +; This test verifies that KCFI instrumentation doesn't cause "out of range +; pc-relative fixup value" errors when generating object files. +; +; The test creates a scenario with enough KCFI-instrumented indirect calls +; (~32 bytes each) that would push a cbz/cbnz instruction out of its ±126 byte +; range if the KCFI_CHECK pseudo-instruction size is not properly accounted for. +; +; Without the fix (KCFI_CHECK returns size 0): +; - Backend thinks KCFI checks take no space +; - Generates cbz to branch over the code +; - During assembly, cbz target is >126 bytes away +; - Assembly fails with "error: out of range pc-relative fixup value" +; +; With the fix (KCFI_CHECK returns size 32 for Thumb2): +; - Backend correctly accounts for KCFI check expansion +; - Avoids cbz or uses longer-range branch instructions +; - Assembly succeeds, object file is generated + +declare void @external_function(i32) + +; Test WITHOUT KCFI: should generate cbz since calls are small +; CHECK-LABEL: test_without_kcfi: +; CHECK: cbz +; CHECK-NOT: bic{{.*}}#1 +define i32 @test_without_kcfi(ptr %callback, i32 %x) { +entry: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %if_zero, label %if_nonzero + +if_nonzero: + ; Regular (non-KCFI) indirect calls - much smaller + call void %callback() + call void %callback() + call void %callback() + call void %callback() + call void %callback() + call void %callback() + + call void @external_function(i32 %x) + %add1 = add i32 %x, 1 + ret i32 %add1 + +if_zero: + call void @external_function(i32 0) + ret i32 0 +} + +; Test WITH KCFI: should NOT generate cbz due to large KCFI checks +; CHECK-LABEL: test_with_kcfi: +; CHECK-NOT: cbz +; CHECK: bic{{.*}}#1 +define i32 @test_with_kcfi(ptr %callback, i32 %x) !kcfi_type !1 { +entry: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %if_zero, label %if_nonzero + +if_nonzero: + ; Six KCFI-instrumented indirect calls (~192 bytes total, exceeds cbz range) + call void %callback() [ "kcfi"(i32 12345678) ] + call void %callback() [ "kcfi"(i32 12345678) ] + call void %callback() [ "kcfi"(i32 12345678) ] + call void %callback() [ "kcfi"(i32 12345678) ] + call void %callback() [ "kcfi"(i32 12345678) ] + call void %callback() [ "kcfi"(i32 12345678) ] + + ; Regular call to prevent optimization + call void @external_function(i32 %x) + %add1 = add i32 %x, 1 + ret i32 %add1 + +if_zero: + call void @external_function(i32 0) + ret i32 0 +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"kcfi", i32 1} +!1 = !{i32 12345678} diff --git a/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll b/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll new file mode 100644 index 0000000..f8e0838 --- /dev/null +++ b/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s + +; CHECK: .p2align 2 +; CHECK-NOT: nop +; CHECK: .long 12345678 +define void @f1(ptr noundef %x) !kcfi_type !1 { +; CHECK-LABEL: f1: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} 
+; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bic r12, r0, #1 +; CHECK-NEXT: ldr r12, [r12, #-4] +; CHECK-NEXT: eor r12, r12, #78 +; CHECK-NEXT: eor r12, r12, #24832 +; CHECK-NEXT: eor r12, r12, #12320768 +; CHECK-NEXT: eors r12, r12, #0 +; CHECK-NEXT: beq .Ltmp0 +; CHECK-NEXT: udf #33760 +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: blx r0 +; CHECK-NEXT: pop {r11, pc} + call void %x() [ "kcfi"(i32 12345678) ] + ret void +} + +; CHECK: .p2align 2 +; CHECK-NOT: .long +; CHECK-NOT: nop +define void @f2(ptr noundef %x) { +; CHECK-LABEL: f2: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bic r12, r0, #1 +; CHECK-NEXT: ldr r12, [r12, #-4] +; CHECK-NEXT: eor r12, r12, #78 +; CHECK-NEXT: eor r12, r12, #24832 +; CHECK-NEXT: eor r12, r12, #12320768 +; CHECK-NEXT: eors r12, r12, #0 +; CHECK-NEXT: beq .Ltmp1 +; CHECK-NEXT: udf #33760 +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: blx r0 +; CHECK-NEXT: pop {r11, pc} + call void %x() [ "kcfi"(i32 12345678) ] + ret void +} + +; CHECK: .p2align 2 +; CHECK: .long 12345678 +; CHECK-COUNT-11: nop +define void @f3(ptr noundef %x) #0 !kcfi_type !1 { +; CHECK-LABEL: f3: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bic r12, r0, #1 +; CHECK-NEXT: ldr r12, [r12, #-48] +; CHECK-NEXT: eor r12, r12, #78 +; CHECK-NEXT: eor r12, r12, #24832 +; CHECK-NEXT: eor r12, r12, #12320768 +; CHECK-NEXT: eors r12, r12, #0 +; CHECK-NEXT: beq .Ltmp3 +; CHECK-NEXT: udf #33760 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: blx r0 +; CHECK-NEXT: pop {r11, pc} + call void %x() [ "kcfi"(i32 12345678) ] + ret void +} + +; CHECK: .p2align 2 +; CHECK-COUNT-11: nop +define void @f4(ptr noundef %x) #0 { +; CHECK-LABEL: f4: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bic r12, r0, #1 +; CHECK-NEXT: ldr r12, [r12, #-48] +; CHECK-NEXT: eor r12, r12, #78 +; CHECK-NEXT: eor r12, r12, #24832 +; CHECK-NEXT: eor r12, r12, #12320768 +; CHECK-NEXT: eors r12, r12, #0 +; CHECK-NEXT: beq .Ltmp5 +; CHECK-NEXT: udf #33760 +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: blx r0 +; CHECK-NEXT: pop {r11, pc} + call void %x() [ "kcfi"(i32 12345678) ] + ret void +} + +attributes #0 = { "patchable-function-prefix"="11" } + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"kcfi", i32 1} +!1 = !{i32 12345678} diff --git a/llvm/test/CodeGen/ARM/kcfi-thumb.ll b/llvm/test/CodeGen/ARM/kcfi-thumb.ll new file mode 100644 index 0000000..7c02d830 --- /dev/null +++ b/llvm/test/CodeGen/ARM/kcfi-thumb.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=thumbv6m-none-eabi < %s | FileCheck %s + +; This test verifies that Thumb1 (ARMv6-M) generates correct code for backend KCFI. +; Thumb1 uses the backend KCFI implementation with Thumb1-specific instructions. + +; Test function without KCFI annotation +; CHECK-LABEL: .globl nosan +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: .type nosan,%function +; CHECK-NEXT: .code 16 +; CHECK-NEXT: .thumb_func +define dso_local void @nosan() nounwind { +; CHECK-LABEL: nosan: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + ret void +} + +; Test function with KCFI annotation - verifies type hash emission +;; The alignment is at least 4 to avoid unaligned type hash loads when this +;; instrumented function is indirectly called. 
+; CHECK-LABEL: .globl target_func +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .type target_func,%function +; CHECK-NEXT: .long 3170468932 +; CHECK-NEXT: .code 16 +; CHECK-NEXT: .thumb_func +define void @target_func() !kcfi_type !1 { +; CHECK-LABEL: target_func: +; CHECK: @ %bb.0: +; CHECK-NEXT: bx lr + ret void +} + +; Test indirect call with KCFI check using operand bundles +; CHECK-LABEL: .globl f1 +; CHECK: .p2align 2 +; CHECK-NEXT: .type f1,%function +; CHECK-NEXT: .long 3170468932 +; CHECK-NEXT: .code 16 +; CHECK-NEXT: .thumb_func +define void @f1(ptr noundef %x) !kcfi_type !1 { +; CHECK-LABEL: f1: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: bics r2, r3 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: movs r3, #188 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #249 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #132 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #68 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: beq .Ltmp0 +; CHECK-NEXT: bkpt #0 +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: blx r0 +; CHECK-NEXT: pop {r7, pc} + call void %x() [ "kcfi"(i32 -1124498364) ] + ret void +} + +; Test with tail call - backend KCFI supports tail calls +define void @f2(ptr noundef %x) !kcfi_type !1 { +; CHECK-LABEL: f2: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: bics r2, r3 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: movs r3, #188 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #249 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #132 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #68 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: beq .Ltmp1 +; CHECK-NEXT: bkpt #0 +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: blx r0 +; CHECK-NEXT: pop {r7, pc} + tail call void %x() [ "kcfi"(i32 -1124498364) ] + ret void +} + +; Test with R2 live (3 arguments) - compiler shuffles args, no spilling needed +define void @f3_r2_live(ptr noundef %x, i32 %a, i32 %b, i32 %c) !kcfi_type !1 { +; CHECK-LABEL: f3_r2_live: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: mov r2, r3 +; CHECK-NEXT: push {r2} +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: bics r2, r3 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: movs r3, #188 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #249 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #132 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #68 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: pop {r2} +; CHECK-NEXT: beq .Ltmp2 +; CHECK-NEXT: bkpt #0 +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: blx r4 +; CHECK-NEXT: pop {r4, pc} +; Compiler shuffles: target→r4, c→r2, a→r0, b→r1 +; R2 is live (3rd arg), so we push it, then uses R3 as temp, R2 as scratch + call void %x(i32 %a, i32 %b, i32 %c) [ "kcfi"(i32 -1124498364) ] + ret void +} + +; Test with both R2 and R3 live (4 arguments) - compiler moves to r5/r4, uses R3 temp and R12 scratch +define void @f4_r2_r3_live(ptr noundef %x, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 { +; CHECK-LABEL: f4_r2_r3_live: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: ldr r3, [sp, #16] +; CHECK-NEXT: mov r0, r1 +; 
CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: push {r3} +; CHECK-NEXT: push {r2} +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: bics r2, r3 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: movs r3, #188 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #249 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #132 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #68 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: pop {r2} +; CHECK-NEXT: pop {r3} +; CHECK-NEXT: beq .Ltmp3 +; CHECK-NEXT: bkpt #0 +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: blx r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +; Compiler shuffles: r3→r5, target→r4, d→r3 (from stack), a→r0, b→r1, c→r2 +; Then pushes r3 (d value), then r2, uses R3 as temp, R2 as scratch + call void %x(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 -1124498364) ] + ret void +} + +; Test where target ends up in R12, forcing R2 as scratch, with both R2 and R3 live +; This uses inline asm to force target into R12, with 4 call arguments to make R2/R3 live +define void @f5_r12_target_r2_r3_live(i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 { +; CHECK-LABEL: f5_r12_target_r2_r3_live: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: @APP +; CHECK-NEXT: @NO_APP +; CHECK-NEXT: push {r3} +; CHECK-NEXT: push {r2} +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: bics r2, r3 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: movs r3, #188 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #249 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #132 +; CHECK-NEXT: lsls r3, r3, #8 +; CHECK-NEXT: adds r3, #68 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: pop {r2} +; CHECK-NEXT: pop {r3} +; CHECK-NEXT: beq .Ltmp4 +; CHECK-NEXT: bkpt #0 +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: blx r12 +; CHECK-NEXT: pop {r7, pc} +; Use inline asm to get function pointer into R12 +; With 4 arguments (r0-r3), both R2 and R3 are live +; Target in R12 means R2 is scratch, R3 is temp, and both need spilling + %target = call ptr asm "", "={r12}"() + call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 -1124498364) ] + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"kcfi", i32 1} +!1 = !{i32 -1124498364} diff --git a/llvm/test/CodeGen/ARM/kcfi-thumb2.ll b/llvm/test/CodeGen/ARM/kcfi-thumb2.ll new file mode 100644 index 0000000..f319d98 --- /dev/null +++ b/llvm/test/CodeGen/ARM/kcfi-thumb2.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL +; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI + +; MIR checks for all functions (grouped here to prevent update_llc_test_checks.py from removing them) + +; MIR-LABEL: name: f1 +; MIR: body: + +; ISEL: tBLXr 14 /* CC::al */, $noreg, %0, csr_aapcs,{{.*}} cfi-type 12345678 + +; KCFI: BUNDLE{{.*}} { +; KCFI-NEXT: KCFI_CHECK_Thumb2 $r0, 12345678 +; KCFI-NEXT: tBLXr 14 /* CC::al */, $noreg, {{(killed )?}}$r0, csr_aapcs,{{.*}} +; KCFI-NEXT: } + +; MIR-LABEL: name: f2 +; MIR: body: + +; ISEL: TCRETURNri %0, 0, csr_aapcs, implicit $sp, cfi-type 12345678 + +; KCFI: BUNDLE{{.*}} { +; KCFI-NEXT: KCFI_CHECK_Thumb2 $r0, 12345678 +; 
KCFI-NEXT: tTAILJMPr {{(killed )?}}$r0, csr_aapcs, implicit $sp, implicit $sp +; KCFI-NEXT: } + +; Test function without KCFI annotation +; ASM-LABEL: .globl nosan +; ASM-NEXT: .p2align 1 +; ASM-NEXT: .type nosan,%function +; ASM-NEXT: .code 16 +; ASM-NEXT: .thumb_func +define dso_local void @nosan() nounwind { +; ASM-LABEL: nosan: +; ASM: @ %bb.0: +; ASM-NEXT: bx lr + ret void +} + +; Test function with KCFI annotation - verifies type hash emission +;; The alignment is at least 4 to avoid unaligned type hash loads when this +;; instrumented function is indirectly called. +; ASM-LABEL: .globl target_func +; ASM-NEXT: .p2align 2 +; ASM-NEXT: .type target_func,%function +; ASM-NEXT: .long 12345678 +; ASM-NEXT: .code 16 +; ASM-NEXT: .thumb_func +define void @target_func() !kcfi_type !1 { +; ASM-LABEL: target_func: +; ASM: @ %bb.0: +; ASM-NEXT: bx lr + ret void +} + +; Test indirect call with KCFI check +; ASM: .long 12345678 +define void @f1(ptr noundef %x) !kcfi_type !1 { +; ASM-LABEL: f1: +; ASM: @ %bb.0: +; ASM-NEXT: .save {r7, lr} +; ASM-NEXT: push {r7, lr} +; ASM-NEXT: bic r12, r0, #1 +; ASM-NEXT: ldr r12, [r12, #-4] +; ASM-NEXT: eor r12, r12, #78 +; ASM-NEXT: eor r12, r12, #24832 +; ASM-NEXT: eor r12, r12, #12320768 +; ASM-NEXT: eors r12, r12, #0 +; ASM-NEXT: beq.w .Ltmp0 +; ASM-NEXT: udf #128 +; ASM-NEXT: .Ltmp0: +; ASM-NEXT: blx r0 +; ASM-NEXT: pop {r7, pc} + + call void %x() [ "kcfi"(i32 12345678) ] + ret void +} + +; Test with tail call +define void @f2(ptr noundef %x) !kcfi_type !1 { +; ASM-LABEL: f2: +; ASM: @ %bb.0: +; ASM-NEXT: bic r12, r0, #1 +; ASM-NEXT: ldr r12, [r12, #-4] +; ASM-NEXT: eor r12, r12, #78 +; ASM-NEXT: eor r12, r12, #24832 +; ASM-NEXT: eor r12, r12, #12320768 +; ASM-NEXT: eors r12, r12, #0 +; ASM-NEXT: beq.w .Ltmp1 +; ASM-NEXT: udf #128 +; ASM-NEXT: .Ltmp1: +; ASM-NEXT: bx r0 + + tail call void %x() [ "kcfi"(i32 12345678) ] + ret void +} + +; Test r3 spill/reload when target is r12 and r3 is a call argument (Thumb2) +define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 { +; ASM-LABEL: f3_r3_spill: +; ASM: @ %bb.0: +; ASM-NEXT: .save {r7, lr} +; ASM-NEXT: push {r7, lr} +; ASM-NEXT: mov lr, r3 +; ASM-NEXT: ldr r3, [sp, #8] +; ASM-NEXT: mov r12, r0 +; ASM-NEXT: mov r0, r1 +; ASM-NEXT: mov r1, r2 +; ASM-NEXT: mov r2, lr +; ASM-NEXT: push {r3} +; ASM-NEXT: bic r3, r12, #1 +; ASM-NEXT: ldr r3, [r3, #-4] +; ASM-NEXT: eor r3, r3, #78 +; ASM-NEXT: eor r3, r3, #24832 +; ASM-NEXT: eor r3, r3, #12320768 +; ASM-NEXT: eors r3, r3, #0 +; ASM-NEXT: pop {r3} +; ASM-NEXT: beq.w .Ltmp2 +; ASM-NEXT: udf #140 +; ASM-NEXT: .Ltmp2: +; ASM-NEXT: blx r12 +; ASM-NEXT: pop {r7, pc} +; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp+8]=%d +; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12 +; r3 is live as 4th argument, so push it before KCFI check + call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ] + ret void +} + +; Test with 3 arguments - r3 not live, target in r12 or elsewhere, r12 used as scratch +define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 { +; ASM-LABEL: f4_r3_unused: +; ASM: @ %bb.0: +; ASM-NEXT: .save {r7, lr} +; ASM-NEXT: push {r7, lr} +; ASM-NEXT: mov r3, r0 +; ASM-NEXT: mov r0, r1 +; ASM-NEXT: mov r1, r2 +; ASM-NEXT: bic r12, r3, #1 +; ASM-NEXT: ldr r12, [r12, #-4] +; ASM-NEXT: eor r12, r12, #78 +; ASM-NEXT: eor r12, r12, #24832 +; ASM-NEXT: eor r12, r12, #12320768 +; ASM-NEXT: eors r12, r12, #0 +; ASM-NEXT: beq.w .Ltmp3 +; ASM-NEXT: udf #131 +; ASM-NEXT: .Ltmp3: +; ASM-NEXT: 
+
+; Test r3 spill/reload when the target is in r12 and r3 is a call argument (Thumb2)
+define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; ASM-LABEL: f3_r3_spill:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: mov lr, r3
+; ASM-NEXT: ldr r3, [sp, #8]
+; ASM-NEXT: mov r12, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: mov r2, lr
+; ASM-NEXT: push {r3}
+; ASM-NEXT: bic r3, r12, #1
+; ASM-NEXT: ldr r3, [r3, #-4]
+; ASM-NEXT: eor r3, r3, #78
+; ASM-NEXT: eor r3, r3, #24832
+; ASM-NEXT: eor r3, r3, #12320768
+; ASM-NEXT: eors r3, r3, #0
+; ASM-NEXT: pop {r3}
+; ASM-NEXT: beq.w .Ltmp2
+; ASM-NEXT: udf #140
+; ASM-NEXT: .Ltmp2:
+; ASM-NEXT: blx r12
+; ASM-NEXT: pop {r7, pc}
+; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp+8]=%d
+; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12
+; r3 is live as the 4th argument, so it is pushed around the KCFI check
+  call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ]
+  ret void
+}
+
+; Test with 3 arguments: r3 is not live, so the target can sit in a low register and r12 is used as scratch
+define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 {
+; ASM-LABEL: f4_r3_unused:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: mov r3, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: bic r12, r3, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp3
+; ASM-NEXT: udf #131
+; ASM-NEXT: .Ltmp3:
+; ASM-NEXT: blx r3
+; ASM-NEXT: pop {r7, pc}
+; Only 3 arguments total, so r3 is not used as a call argument.
+; The target lands in r3, with r12 as the scratch register (no spill needed).
+  call void %target(i32 %a, i32 %b) [ "kcfi"(i32 12345678) ]
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ISEL: {{.*}}
+; KCFI: {{.*}}
+; MIR: {{.*}}
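Both the Thumb1 and Thumb2 files exercise the same underlying scheme: the compiler stores the type hash in the word immediately before the callee's entry (the .long directives above), and every indirect call clears bit 0 of the pointer (the Thumb state bit), loads the word at entry-4, and traps on mismatch. A rough C rendering of what the emitted sequence computes, with hypothetical helper names (kcfi_trap stands in for udf/bkpt; this is a sketch, not the backend's actual code):

    #include <stdint.h>

    extern void kcfi_trap(void);  /* placeholder for udf #imm / bkpt #0 */

    static inline void kcfi_check(uintptr_t target, uint32_t expected) {
      uintptr_t entry = target & ~(uintptr_t)1;        /* bic: clear Thumb bit  */
      uint32_t hash = *(const uint32_t *)(entry - 4);  /* ldr ..., [entry, #-4] */
      if (hash != expected)                            /* eor chain / cmp       */
        kcfi_trap();
    }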
diff --git a/llvm/test/CodeGen/ARM/kcfi.ll b/llvm/test/CodeGen/ARM/kcfi.ll
deleted file mode 100644
index 9e16468..0000000
--- a/llvm/test/CodeGen/ARM/kcfi.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc -mtriple=thumbv6m-none-eabi < %s | FileCheck %s
-
-; CHECK-LABEL: .globl nosan
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: .type nosan,%function
-; CHECK-NEXT: .code 16
-; CHECK-NEXT: .thumb_func
-; CHECK-NEXT: nosan:
-define dso_local void @nosan() nounwind {
-  ret void
-}
-
-;; The alignment is at least 4 to avoid unaligned type hash loads when this
-;; instrumented function is indirectly called.
-; CHECK-LABEL: .globl f1
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: .type f1,%function
-; CHECK-NEXT: .long 3170468932
-; CHECK-NEXT: .code 16
-; CHECK-NEXT: .thumb_func
-; CHECK-NEXT: f1:
-define void @f1(ptr noundef %x) !kcfi_type !1 {
-  ret void
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 4, !"kcfi", i32 1}
-!1 = !{i32 -1124498364}
diff --git a/llvm/test/CodeGen/AVR/dynalloca.ll b/llvm/test/CodeGen/AVR/dynalloca.ll
index 3face71..b32910b 100644
--- a/llvm/test/CodeGen/AVR/dynalloca.ll
+++ b/llvm/test/CodeGen/AVR/dynalloca.ll
@@ -64,16 +64,16 @@ define void @dynalloca2(i16 %x) {
 ; CHECK-NEXT: out 63, r0
 ; CHECK-NEXT: out 61, {{.*}}
 ; Store values on the stack
-; CHECK: ldi r20, 0
-; CHECK: ldi r21, 0
-; CHECK: std Z+8, r21
-; CHECK: std Z+7, r20
-; CHECK: std Z+6, r21
-; CHECK: std Z+5, r20
-; CHECK: std Z+4, r21
-; CHECK: std Z+3, r20
-; CHECK: std Z+2, r21
-; CHECK: std Z+1, r20
+; CHECK: ldi [[REG1:r[0-9]+]], 0
+; CHECK: ldi [[REG2:r[0-9]+]], 0
+; CHECK: std Z+8, [[REG2]]
+; CHECK: std Z+7, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
+; CHECK: std Z+1, [[REG1]]
 ; CHECK: call
 ; Call frame restore
 ; CHECK-NEXT: in r30, 61
diff --git a/llvm/test/CodeGen/AVR/issue-163015.ll b/llvm/test/CodeGen/AVR/issue-163015.ll
new file mode 100644
index 0000000..6c4dc51
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/issue-163015.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=avr | FileCheck %s
+
+@ui1 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@ui2 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@failed = private unnamed_addr addrspace(1) constant [12 x i8] c"test failed\00"
+@stats2 = external protected global i16, align 1
+
+; CHECK-LABEL: main:
+define i32 @main() addrspace(1) {
+entry:
+  store i64 94, ptr @ui1, align 8
+  store i64 53, ptr @ui2, align 8
+  tail call addrspace(1) void @foo(i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 32, ptr @stats2)
+  %11 = load i64, ptr @ui1, align 8
+  %12 = load i64, ptr @ui2, align 8
+
+; COM: CHECK: call __udivdi3
+  %15 = udiv i64 %11, %12
+
+; look for the buggy pattern where r30/r31 are being clobbered, corrupting the stack pointer
+; CHECK-NOT: std Z+{{[1-9]+}}, r30
+; CHECK-NOT: std Z+{{[1-9]+}}, r31
+
+; CHECK: call expect
+  tail call addrspace(1) void @expect(i64 %15, i64 1, i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 33)
+
+; CHECK: ret
+  ret i32 0
+}
+
+declare protected void @expect(i64, i64, i16, i16, i8, i16) local_unnamed_addr addrspace(1) #0
+declare protected void @foo(i16, i16, i8, i16, i16) local_unnamed_addr addrspace(1) #0
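For context on the COM: line above: AVR has no hardware divider, so a 64-bit unsigned division can only be lowered as a call to the compiler-rt/libgcc helper __udivdi3. At the C level the reduction is simply the following (a sketch; the function name div64 is made up for illustration):

    #include <stdint.h>

    /* On AVR this division becomes "call __udivdi3". */
    uint64_t div64(uint64_t a, uint64_t b) {
      return a / b;
    }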
diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
index 8c0d82e..6f1bbd0 100644
--- a/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
@@ -2,7 +2,7 @@
 ; Check that we correctly ignore cbuffers that were nulled out by optimizations.
 
 %__cblayout_CB = type <{ float }>
-@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 4, 0)) poison
 @x = external local_unnamed_addr addrspace(2) global float, align 4
 
 ; CHECK-NOT: !hlsl.cbs =
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
index bd07ba1..eb4cf76 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -20,6 +20,9 @@
 ; CHECK: OpReadClockKHR [[v2uint]] [[uint_1]]
 ; CHECK: OpReadClockKHR [[v2uint]] [[uint_2]]
 ; CHECK: OpReadClockKHR [[v2uint]] [[uint_3]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_1]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_2]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_3]]
 
 define dso_local spir_kernel void @test_clocks(ptr addrspace(1) nocapture noundef writeonly align 8 %out64, ptr addrspace(1) nocapture noundef writeonly align 8 %outv2) {
 entry:
@@ -39,6 +42,9 @@ entry:
   %call9 = tail call spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv()
   %arrayidx10 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 16
   store <2 x i32> %call9, ptr addrspace(1) %arrayidx10, align 8
+  %call10 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 1)
+  %call11 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 2)
+  %call12 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 3)
   ret void
 }
@@ -59,3 +65,6 @@ declare spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv() local_unnamed_addr
 
 ; Function Attrs: convergent nounwind
 declare spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv() local_unnamed_addr
+
+; Function Attrs: nounwind
+declare spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32)
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 4f5cb5a..3e7b73a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -269,3 +269,708 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
   %ret = load atomic <1 x i64>, ptr %x acquire, align 8
   ret <1 x i64> %ret
 }
+
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_ptr:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movq (%rsp), %rax
+; CHECK-O3-NEXT: popq %rcx
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT: popq %rcx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT: popq %rcx
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_ptr:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq (%rsp), %rax
+; CHECK-O0-NEXT: popq %rcx
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT: popq %rcx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT: popq %rcx
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+  ret <1 x ptr> %ret
+}
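All of the libcall-based variants in this file share the shape visible above: the load is 8 bytes but only 4-byte aligned (or the vector is too wide to load lock-free), so SelectionDAG falls back to libatomic's generic entry point, __atomic_load(size, src, dest, order). That maps one-to-one onto the asm: movl $8, %edi is the size, %rsi the source, %rdx points at a stack temporary, and movl $2, %ecx is the ordering (2 == __ATOMIC_ACQUIRE). A C-level sketch of the same call (the wrapper name is invented; the __atomic_load prototype is the documented GCC/libatomic generic one):

    #include <stddef.h>
    #include <stdint.h>

    /* Provided by libatomic; declared here only for the sketch. */
    void __atomic_load(size_t size, void *src, void *dest, int memorder);

    uint64_t load_underaligned_u64(void *p) {
      uint64_t buf;                           /* the (%rsp) temporary  */
      __atomic_load(sizeof buf, p, &buf, 2);  /* 2 == __ATOMIC_ACQUIRE */
      return buf;
    }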
+
+define <1 x half> @atomic_vec1_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %cx
+; CHECK-AVX-O0-NEXT: # implicit-def: $eax
+; CHECK-AVX-O0-NEXT: movw %cx, %ax
+; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0
+; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <1 x half>, ptr %x acquire, align 2
+  ret <1 x half> %ret
+}
+
+define <1 x float> @atomic_vec1_float(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_float:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_float:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_float:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_float:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_float:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_float:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <1 x float>, ptr %x acquire, align 4
+  ret <1 x float> %ret
+}
+
+define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double_align:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double_align:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double_align:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double_align:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double_align:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double_align:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <1 x double>, ptr %x acquire, align 8
+  ret <1 x double> %ret
+}
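The contrast with the libcall cases is the point of these three tests: when the access is naturally aligned and no wider than 8 bytes (align 2 half, align 4 float, align 8 double), an x86-64 acquire load is just an ordinary load, so the compiler emits movzwl/movss/movsd directly with no libcall and no fence. Roughly the same thing at the C11 level (a sketch, not taken from the test):

    #include <stdatomic.h>

    /* A naturally aligned 8-byte acquire load is a plain load on
       x86-64, since TSO already gives loads acquire semantics. */
    double load_acquire(const _Atomic double *p) {
      return atomic_load_explicit(p, memory_order_acquire);
    }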
+
+define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_i64:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movq (%rsp), %rax
+; CHECK-O3-NEXT: popq %rcx
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT: popq %rcx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT: popq %rcx
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i64:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq (%rsp), %rax
+; CHECK-O0-NEXT: popq %rcx
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT: popq %rcx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT: popq %rcx
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <1 x i64>, ptr %x acquire, align 4
+  ret <1 x i64> %ret
+}
+
+define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: popq %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: popq %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: popq %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: popq %rax
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <1 x double>, ptr %x acquire, align 4
+  ret <1 x double> %ret
+}
+
+define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec2_i32:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: popq %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: popq %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_i32:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: popq %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: popq %rax
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <2 x i32>, ptr %x acquire, align 4
+  ret <2 x i32> %ret
+}
+
+define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_float:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $24, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $16, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: addq $24, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_float:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $24, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $16, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: addq $24, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_float:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: subq $24, %rsp
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $16, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovaps (%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: addq $24, %rsp
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_float:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $24, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $16, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: addq $24, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_float:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $24, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $16, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: addq $24, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_float:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $24, %rsp
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $16, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovaps (%rsp), %xmm0
+; CHECK-AVX-O0-NEXT: addq $24, %rsp
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <4 x float>, ptr %x acquire, align 4
+  ret <4 x float> %ret
+}
+
+define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec8_double:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $72, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $64, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT: addq $72, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec8_double:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $72, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $64, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT: addq $72, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec8_double:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $72, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $64, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movapd (%rsp), %xmm0
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT: addq $72, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec8_double:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $72, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $64, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movapd (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT: addq $72, %rsp
+; CHECK-SSE-O0-NEXT: retq
+  %ret = load atomic <8 x double>, ptr %x acquire, align 4
+  ret <8 x double> %ret
+}
+
+define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $40, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $32, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: addq $40, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $40, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $32, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: addq $40, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: subq $40, %rsp
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $32, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX-O3-NEXT: addq $40, %rsp
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $40, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $32, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: addq $40, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $40, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $32, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: addq $40, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $40, %rsp
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $32, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX-O0-NEXT: addq $40, %rsp
+; CHECK-AVX-O0-NEXT: retq
+  %ret = load atomic <16 x bfloat>, ptr %x acquire, align 4
+  ret <16 x bfloat> %ret
+}
+
+define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec32_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $72, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $64, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT: addq $72, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec32_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $72, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $64, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT: addq $72, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec32_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $72, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $64, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT: addq $72, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec32_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $72, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $64, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT: addq $72, %rsp
+; CHECK-SSE-O0-NEXT: retq
+  %ret = load atomic <32 x half>, ptr %x acquire, align 4
+  ret <32 x half> %ret
+}
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll b/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
index 4e76262..423e318 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
@@ -19,7 +19,7 @@ entry:
 ; CHECK: func:
 ; CHECK: .Lfunc_begin1:
 ; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
 ; BASIC-NEXT: .byte 0 # feature
 ; PGO-NEXT: .byte 3 # feature
 ; CHECK-NEXT: .quad .Lfunc_begin1 # function address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll b/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
index f610b04..e32e522 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
@@ -10,7 +10,7 @@ define dso_local i32 @_Z3barv() {
 ; CHECK-LABEL: _Z3barv:
 ; CHECK-NEXT: [[BAR_BEGIN:.Lfunc_begin[0-9]+]]:
 ; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3barv{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
 ; CHECK-NEXT: .byte 0 # feature
 ; CHECK-NEXT: .quad [[BAR_BEGIN]] # function address
@@ -23,7 +23,7 @@ define dso_local i32 @_Z3foov() {
 ; CHECK-LABEL: _Z3foov:
 ; CHECK-NEXT: [[FOO_BEGIN:.Lfunc_begin[0-9]+]]:
 ; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3foov{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
 ; CHECK-NEXT: .byte 32 # feature
 ; CHECK-NEXT: .quad [[FOO_BEGIN]] # function address
@@ -36,6 +36,6 @@ define linkonce_odr dso_local i32 @_Z4fooTIiET_v() comdat {
 ; CHECK-LABEL: _Z4fooTIiET_v:
 ; CHECK-NEXT: [[FOOCOMDAT_BEGIN:.Lfunc_begin[0-9]+]]:
 ; CHECK: .section .llvm_bb_addr_map,"oG",@llvm_bb_addr_map,.text._Z4fooTIiET_v,_Z4fooTIiET_v,comdat{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
 ; CHECK-NEXT: .byte 0 # feature
 ; CHECK-NEXT: .quad [[FOOCOMDAT_BEGIN]] # function address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll b/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
index ba76f3e..12b1297 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
@@ -69,7 +69,7 @@ declare i32 @__gxx_personality_v0(...)
 ; CHECK-LABEL: .Lfunc_end0:
 ; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
 ; BASIC-NEXT: .byte 32 # feature
 ; PGO-ALL-NEXT: .byte 39 # feature
 ; FEC-ONLY-NEXT:.byte 33 # feature
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
index 6157f1a..aeb6dc95 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
@@ -47,7 +47,7 @@ declare i32 @__gxx_personality_v0(...)
 ; CHECK-LABEL: .Lfunc_end0:
 ; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text.hot._Z3bazb
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
 ; CHECK-NEXT: .byte 40 # feature
 ; CHECK-NEXT: .byte 2 # number of basic block ranges
 ; CHECK-NEXT: .quad .Lfunc_begin0 # base address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll
new file mode 100644
index 0000000..a5678877
--- /dev/null
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll
@@ -0,0 +1,94 @@
+; Check that the basic block sections labels option works when used along with -emit-bb-hash.
+; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-address-map -emit-bb-hash | FileCheck %s --check-prefixes=CHECK,UNIQ
+
+define void @_Z3bazb(i1 zeroext, i1 zeroext) personality ptr @__gxx_personality_v0 {
+  br i1 %0, label %3, label %8
+
+3:
+  %4 = invoke i32 @_Z3barv()
+          to label %8 unwind label %6
+  br label %10
+
+6:
+  landingpad { ptr, i32 }
+          catch ptr null
+  br label %12
+
+8:
+  %9 = call i32 @_Z3foov()
+  br i1 %1, label %12, label %10
+
+10:
+  %11 = select i1 %1, ptr blockaddress(@_Z3bazb, %3), ptr blockaddress(@_Z3bazb, %12) ; <ptr> [#uses=1]
+  indirectbr ptr %11, [label %3, label %12]
+
+12:
+  ret void
+}
+
+declare i32 @_Z3barv() #1
+
+declare i32 @_Z3foov() #1
+
+declare i32 @__gxx_personality_v0(...)
+
+; UNIQ: .section .text._Z3bazb,"ax",@progbits{{$}}
+; NOUNIQ: .section .text,"ax",@progbits,unique,1
+; CHECK-LABEL: _Z3bazb:
+; CHECK-LABEL: .Lfunc_begin0:
+; CHECK-LABEL: .LBB_END0_0:
+; CHECK-LABEL: .LBB0_1:
+; CHECK-LABEL: .LBB0_1_CS0:
+; CHECK-LABEL: .LBB_END0_1:
+; CHECK-LABEL: .LBB0_2:
+; CHECK-LABEL: .LBB0_2_CS0:
+; CHECK-LABEL: .LBB_END0_2:
+; CHECK-LABEL: .LBB0_3:
+; CHECK-LABEL: .LBB_END0_3:
+; CHECK-LABEL: .Lfunc_end0:
+
+; UNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
+;; Verify that with -unique-section-names=false, the unique id of the text section gets assigned to the llvm_bb_addr_map section.
+; NOUNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text,unique,1
+; CHECK-NEXT: .byte 4 # version
+; CHECK-NEXT: .byte 96 # feature
+; CHECK-NEXT: .quad .Lfunc_begin0 # function address
+; CHECK-NEXT: .byte 6 # number of basic blocks
+; CHECK-NEXT: .byte 0 # BB id
+; CHECK-NEXT: .uleb128 .Lfunc_begin0-.Lfunc_begin0
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_0-.Lfunc_begin0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 1 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_1-.LBB_END0_0
+; CHECK-NEXT: .byte 1 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB0_1_CS0-.LBB0_1
+; CHECK-NEXT: .uleb128 .LBB_END0_1-.LBB0_1_CS0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 3 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_2-.LBB_END0_1
+; CHECK-NEXT: .byte 1 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB0_2_CS0-.LBB0_2
+; CHECK-NEXT: .uleb128 .LBB_END0_2-.LBB0_2_CS0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 4 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_3-.LBB_END0_2
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_3-.LBB0_3
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 5 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_4-.LBB_END0_3
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_4-.LBB0_4
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 2 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_5-.LBB_END0_4
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_5-.LBB0_5
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .quad {{-?[0-9]+}}
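For readers decoding the .byte lines above: the version is now 4, and the feature byte is a bit mask; 96 is 0x60, which matches the two optional payloads these entries carry (per-callsite offsets and the trailing per-block .quad hash). The flag names below are an assumption modeled on LLVM's BBAddrMap feature bits, with the values inferred from the feature bytes appearing across these tests (32, 39, 40, 47, 96), not quoted from the LLVM sources:

    #include <stdio.h>

    /* Assumed flag names; values inferred from the ".byte N # feature"
       lines in these tests. */
    enum {
      FUNC_ENTRY_COUNT = 1 << 0,
      BB_FREQ          = 1 << 1,
      BR_PROB          = 1 << 2,
      MULTI_BB_RANGE   = 1 << 3,
      CALLSITE_OFFSETS = 1 << 5,
      BB_HASH          = 1 << 6,
    };

    int main(void) {
      printf("%d\n", CALLSITE_OFFSETS | BB_HASH);         /* 96, this file  */
      printf("%d\n", MULTI_BB_RANGE | CALLSITE_OFFSETS);  /* 40, BASIC runs */
      printf("%d\n", FUNC_ENTRY_COUNT | BB_FREQ | BR_PROB |
                     MULTI_BB_RANGE | CALLSITE_OFFSETS);  /* 47, PGO runs   */
      return 0;
    }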
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
index 1e8cee4..d49b313 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
@@ -58,7 +58,7 @@ declare i32 @qux()
 ; CHECK-LABEL: .Lfunc_end0:
 ; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text.hot.foo
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
 ; BASIC-NEXT: .byte 40 # feature
 ; PGO-NEXT: .byte 47 # feature
 ; CHECK-NEXT: .byte 2 # number of basic block ranges
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map.ll b/llvm/test/CodeGen/X86/basic-block-address-map.ll
index 5c8f3a6..64cf2c7 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map.ll
@@ -52,7 +52,7 @@ declare i32 @__gxx_personality_v0(...)
 ; UNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
 ;; Verify that with -unique-section-names=false, the unique id of the text section gets assigned to the llvm_bb_addr_map section.
 ; NOUNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text,unique,1
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
 ; CHECK-NEXT: .byte 32 # feature
 ; CHECK-NEXT: .quad .Lfunc_begin0 # function address
 ; CHECK-NEXT: .byte 6 # number of basic blocks