Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--  llvm/test/CodeGen/AArch64/load-zext-bitcast.ll  51
-rw-r--r--  llvm/test/CodeGen/AMDGPU/add-max.ll  151
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll  218
-rw-r--r--  llvm/test/CodeGen/ARM/O3-pipeline.ll  1
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-arm.ll  138
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-cbz-range.ll  81
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll  99
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-thumb.ll  215
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi-thumb2.ll  163
-rw-r--r--  llvm/test/CodeGen/ARM/kcfi.ll  28
-rw-r--r--  llvm/test/CodeGen/AVR/dynalloca.ll  20
-rw-r--r--  llvm/test/CodeGen/AVR/issue-163015.ll  32
-rw-r--r--  llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll  2
-rw-r--r--  llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll  9
-rw-r--r--  llvm/test/CodeGen/X86/atomic-load-store.ll  705
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll  2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll  6
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll  2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll  2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll  94
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll  2
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-address-map.ll  2
22 files changed, 1747 insertions, 276 deletions
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 9193025..6177ae5 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -112,8 +112,7 @@ entry:
define double @load_u64_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -140,8 +139,7 @@ entry:
define float @load_u32_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -154,8 +152,7 @@ entry:
define half @load_u16_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -185,8 +182,7 @@ entry:
define double @load_u64_from_u16_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr h0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -199,8 +195,7 @@ entry:
define double @load_u64_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -226,7 +221,7 @@ entry:
define float @load_u32_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -239,7 +234,7 @@ entry:
define half @load_u16_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -283,8 +278,7 @@ entry:
define double @load_u64_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -311,8 +305,7 @@ entry:
define float @load_u32_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -325,8 +318,7 @@ entry:
define half @load_u16_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -354,7 +346,7 @@ entry:
define double @load_u64_from_u16_off256(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #128]
+; CHECK-NEXT: ldr h0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -367,7 +359,7 @@ entry:
define double @load_u64_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #64]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -393,7 +385,7 @@ entry:
define float @load_u32_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -406,7 +398,7 @@ entry:
define half @load_u16_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -435,8 +427,7 @@ entry:
define double @load_u64_from_u16_offn(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offn:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #8190 // =0x1ffe
-; CHECK-NEXT: ldr h0, [x0, x8]
+; CHECK-NEXT: ldr h0, [x0, #8190]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8190
@@ -517,7 +508,8 @@ entry:
define double @load_u64_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #4096]
+; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
+; CHECK-NEXT: ldr h0, [x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -530,7 +522,8 @@ entry:
define double @load_u64_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1024]
+; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
+; CHECK-NEXT: ldr b0, [x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -557,7 +550,8 @@ entry:
define float @load_u32_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
+; CHECK-NEXT: ldr b0, [x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -570,7 +564,8 @@ entry:
define half @load_u16_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: add x8, x0, #1, lsl #12 // =4096
+; CHECK-NEXT: ldr b0, [x8]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
index b3a7057..c551375 100644
--- a/llvm/test/CodeGen/AMDGPU/add-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
-; SDAG-LABEL: add_max_u32_ssv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_ssv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+; GCN-LABEL: add_max_u32_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_max_u32 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
-; GCN-LABEL: add_max_u32_sss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_add_co_i32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_max_u32 s0, s0, s2
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+; SDAG-LABEL: add_max_u32_sss:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_max_u32_e32 v0, s2, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_u32_sss:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_max_u32 v0, s0, s1, v0
+; GISEL-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 4)
%ret = bitcast i32 %max to float
ret float %ret
@@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 100)
%ret = bitcast i32 %max to float
ret float %ret
}
-define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
-; SDAG-LABEL: add_max_u32_slv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_slv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_addk_co_i32 s0, 0x64
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add i32 %a, 100
- %max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
+define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
@@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}
define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
-; SDAG-LABEL: add_max_v2u16_ssv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_ssv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s2, s0, 16
-; GISEL-NEXT: s_lshr_b32 s3, s1, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_add_co_i32 s2, s2, s3
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+; GCN-LABEL: add_max_v2u16_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b
define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
; SDAG-LABEL: add_max_v2u16_sss:
; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
+; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_v2u16_sss:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s3, s0, 16
-; GISEL-NEXT: s_lshr_b32 s4, s1, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_add_co_i32 s3, s3, s4
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
-; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
-; GISEL-NEXT: s_lshr_b32 s1, s0, 16
-; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT: s_lshr_b32 s2, s2, 16
-; GISEL-NEXT: s_max_u32 s0, s0, s3
-; GISEL-NEXT: s_max_u32 s1, s1, s2
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}
define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
-; SDAG-LABEL: add_max_v2u16_slv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_slv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s1, s0, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
-; GISEL-NEXT: s_addk_co_i32 s1, 0x64
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, <i16 100, i16 100>
+; GCN-LABEL: add_max_v2u16_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> <i16 100, i16 100>)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %b)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 711d57b..30ad46d9 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -39131,21 +39131,21 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX1250-NEXT: v_cls_i32_e32 v3, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_add_nc_u32 v3, -1, v3 :: v_dual_bitop2_b32 v2, v0, v1 bitop3:0x14
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2
+; GFX1250-NEXT: v_min_u32_e32 v2, v3, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = sitofp i64 %x to bfloat
@@ -39483,29 +39483,30 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX1250-NEXT: v_xor_b32_e32 v4, v2, v3
; GFX1250-NEXT: v_cls_i32_e32 v6, v3
; GFX1250-NEXT: v_cls_i32_e32 v7, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4
+; GFX1250-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_add_nc_u32 v7, -1, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5
-; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4
+; GFX1250-NEXT: v_min_u32_e32 v5, v7, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_min_u32_e32 v4, v6, v4
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1]
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
; GFX1250-NEXT: v_sub_nc_u32_e32 v3, 32, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = sitofp <2 x i64> %x to <2 x bfloat>
@@ -39968,41 +39969,42 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250TRUE16: ; %bb.0:
; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v2, v3
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v6, v4, v5
+; GFX1250TRUE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX1250TRUE16-NEXT: v_cls_i32_e32 v10, v3
-; GFX1250TRUE16-NEXT: v_cls_i32_e32 v9, v5
; GFX1250TRUE16-NEXT: v_cls_i32_e32 v11, v1
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_ashrrev_i32 v6, 31, v6
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1
-; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
-; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14
+; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_bitop2_b32 v8, v2, v3 bitop3:0x14
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_add_nc_u32 v11, -1, v11
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v7, v10, v8
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v8, v11, v9
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
; GFX1250TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX1250TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_add_min_u32 v8, v11, -1, v8
-; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
+; GFX1250TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[0:1], v8, v[0:1]
-; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v5, 32, v8
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v5, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
+; GFX1250TRUE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v4
; GFX1250TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX1250TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -40017,44 +40019,47 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16: ; %bb.0:
; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: v_xor_b32_e32 v8, v4, v5
-; GFX1250FAKE16-NEXT: v_xor_b32_e32 v6, v2, v3
+; GFX1250FAKE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX1250FAKE16-NEXT: v_xor_b32_e32 v7, v2, v3
; GFX1250FAKE16-NEXT: v_cls_i32_e32 v10, v3
-; GFX1250FAKE16-NEXT: v_cls_i32_e32 v9, v5
; GFX1250FAKE16-NEXT: v_cls_i32_e32 v11, v1
-; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v8, 31, v8 :: v_dual_bitop2_b32 v7, v0, v1 bitop3:0x14
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7
-; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_add_min_u32 v6, v10, -1, v6
-; GFX1250FAKE16-NEXT: v_add_min_u32 v7, v11, -1, v7
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_bitop2_b32 v8, v4, v5 bitop3:0x14
+; GFX1250FAKE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_bitop2_b32 v9, v0, v1 bitop3:0x14
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3]
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1]
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
-; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v11, -1, v11 :: v_dual_ashrrev_i32 v9, 31, v9
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250FAKE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v8, 32, v8
+; GFX1250FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v7, v10, v7
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v6, v6, v8
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX1250FAKE16-NEXT: v_add_min_u32 v8, v9, -1, v8
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v9, v11, v9
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
-; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5]
-; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v8, 32, v8
-; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[4:5], v6, v[4:5]
+; GFX1250FAKE16-NEXT: v_lshlrev_b64_e32 v[0:1], v9, v[0:1]
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
+; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v9 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3
; GFX1250FAKE16-NEXT: v_ldexp_f32 v0, v0, v4
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v8
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250FAKE16-NEXT: v_ldexp_f32 v1, v1, v6
; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
@@ -40644,51 +40649,54 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_xor_b32_e32 v9, v4, v5
+; GFX1250-NEXT: v_cls_i32_e32 v9, v7
; GFX1250-NEXT: v_xor_b32_e32 v8, v6, v7
-; GFX1250-NEXT: v_cls_i32_e32 v12, v7
-; GFX1250-NEXT: v_cls_i32_e32 v13, v5
-; GFX1250-NEXT: v_cls_i32_e32 v14, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_ashrrev_i32 v8, 31, v8
-; GFX1250-NEXT: v_xor_b32_e32 v10, v2, v3
-; GFX1250-NEXT: v_cls_i32_e32 v15, v1
-; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14
-; GFX1250-NEXT: v_add_min_u32 v9, v13, -1, v9
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_min_u32 v8, v12, -1, v8
-; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5]
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11
-; GFX1250-NEXT: v_add_min_u32 v10, v14, -1, v10
+; GFX1250-NEXT: v_cls_i32_e32 v10, v5
+; GFX1250-NEXT: v_xor_b32_e32 v14, v0, v1
+; GFX1250-NEXT: v_cls_i32_e32 v12, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_dual_add_nc_u32 v9, -1, v9 :: v_dual_ashrrev_i32 v8, 31, v8
+; GFX1250-NEXT: v_dual_add_nc_u32 v10, -1, v10 :: v_dual_bitop2_b32 v11, v4, v5 bitop3:0x14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32 v11, v15, -1, v11
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3]
-; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6
-; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX1250-NEXT: v_dual_add_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v13, v2, v3 bitop3:0x14
+; GFX1250-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_min_u32_e32 v8, v9, v8
+; GFX1250-NEXT: v_ashrrev_i32_e32 v9, 31, v13
+; GFX1250-NEXT: v_cls_i32_e32 v13, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_ashrrev_i32 v14, 31, v14 :: v_dual_add_nc_u32 v11, 32, v11
+; GFX1250-NEXT: v_dual_add_nc_u32 v12, -1, v12 :: v_dual_add_nc_u32 v9, 32, v9
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_add_nc_u32 v13, -1, v13 :: v_dual_add_nc_u32 v14, 32, v14
+; GFX1250-NEXT: v_min_u32_e32 v10, v10, v11
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_min_u32_e32 v9, v12, v9
+; GFX1250-NEXT: v_min_u32_e32 v11, v13, v14
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v10, v[4:5]
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v9, v[2:3]
+; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v11, v[0:1]
+; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_or_b32_e32 v6, v7, v6
; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v10 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
-; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v9
+; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v10
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4
+; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
+; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v6
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v8
+; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v6
+; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX1250-NEXT: v_sub_nc_u32_e32 v6, 32, v11
; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX1250-NEXT: v_ldexp_f32 v1, v1, v3
+; GFX1250-NEXT: v_ldexp_f32 v1, v3, v1
; GFX1250-NEXT: v_ldexp_f32 v3, v4, v7
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_ldexp_f32 v2, v2, v5
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 9601a2e..2731148 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -166,6 +166,7 @@
; CHECK-NEXT: ARM Execution Domain Fix
; CHECK-NEXT: BreakFalseDeps
; CHECK-NEXT: ARM pseudo instruction expansion pass
+; CHECK-NEXT: Insert KCFI indirect call checks
; CHECK-NEXT: Thumb2 instruction size reduce pass
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction
diff --git a/llvm/test/CodeGen/ARM/kcfi-arm.ll b/llvm/test/CodeGen/ARM/kcfi-arm.ll
new file mode 100644
index 0000000..e3696cf
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-arm.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI
+
+; MIR checks for all functions (grouped here to prevent update_llc_test_checks.py from removing them)
+
+; MIR-LABEL: name: f1
+; MIR: body:
+
+; ISEL: BLX %0, csr_aapcs,{{.*}} cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_ARM $r0, 12345678
+; KCFI-NEXT: BLX killed $r0, csr_aapcs,{{.*}}
+; KCFI-NEXT: }
+
+; MIR-LABEL: name: f2
+; MIR: body:
+
+; ISEL: TCRETURNri %0, 0, csr_aapcs, implicit $sp, cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_ARM $r0, 12345678
+; KCFI-NEXT: TAILJMPr killed $r0, csr_aapcs, implicit $sp, implicit $sp
+; KCFI-NEXT: }
+
+; ASM: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f1:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp0
+; ASM-NEXT: udf #33760
+; ASM-NEXT: .Ltmp0:
+; ASM-NEXT: blx r0
+; ASM-NEXT: pop {r11, pc}
+
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with tail call
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f2:
+; ASM: @ %bb.0:
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp1
+; ASM-NEXT: udf #33760
+; ASM-NEXT: .Ltmp1:
+; ASM-NEXT: bx r0
+
+ tail call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test r3 spill/reload when target is r12 and r3 is a call argument.
+; With 5+ arguments (target + 4 args), r0-r3 are all used for arguments,
+; forcing r3 to be spilled when we need it as a scratch register.
+define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; ASM-LABEL: f3_r3_spill:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: mov lr, r3
+; ASM-NEXT: ldr r3, [sp, #8]
+; ASM-NEXT: mov r12, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: mov r2, lr
+; ASM-NEXT: stmdb sp!, {r3}
+; ASM-NEXT: bic r3, r12, #1
+; ASM-NEXT: ldr r3, [r3, #-4]
+; ASM-NEXT: eor r3, r3, #78
+; ASM-NEXT: eor r3, r3, #24832
+; ASM-NEXT: eor r3, r3, #12320768
+; ASM-NEXT: eors r3, r3, #0
+; ASM-NEXT: ldm sp!, {r3}
+; ASM-NEXT: beq .Ltmp2
+; ASM-NEXT: udf #33772
+; ASM-NEXT: .Ltmp2:
+; ASM-NEXT: blx r12
+; ASM-NEXT: pop {r11, pc}
+; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp]=%d
+; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12
+; Compiler shuffles arguments into place, saving r3 (c) in lr, loading d from stack
+; r3 is live as 4th argument, so push it before KCFI check
+; Restore r3 immediately after comparison, before branch
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with 3 arguments - r3 is not live as an argument; the target ends up in r3, so r12 is used as scratch without spilling
+define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 {
+; ASM-LABEL: f4_r3_unused:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r11, lr}
+; ASM-NEXT: push {r11, lr}
+; ASM-NEXT: mov r3, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: bic r12, r3, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq .Ltmp3
+; ASM-NEXT: udf #33763
+; ASM-NEXT: .Ltmp3:
+; ASM-NEXT: blx r3
+; ASM-NEXT: pop {r11, pc}
+; Only 3 arguments total, so r3 is not used as call argument
+; Compiler puts target→r3, a→r0, b→r1
+; r3 is the target, so we use r12 as scratch (no spill needed)
+ call void %target(i32 %a, i32 %b) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ISEL: {{.*}}
+; KCFI: {{.*}}
+; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll b/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll
new file mode 100644
index 0000000..8e71cae
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-cbz-range.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -filetype=obj < %s
+; RUN: llc -mtriple=thumbv7-linux-gnueabi < %s | FileCheck %s
+
+; This test verifies that KCFI instrumentation doesn't cause "out of range
+; pc-relative fixup value" errors when generating object files.
+;
+; The test creates a scenario with enough KCFI-instrumented indirect calls
+; (~32 bytes each) that would push a cbz/cbnz instruction out of its 126-byte forward
+; range if the KCFI_CHECK pseudo-instruction size is not properly accounted for.
+;
+; Without the fix (KCFI_CHECK returns size 0):
+; - Backend thinks KCFI checks take no space
+; - Generates cbz to branch over the code
+; - During assembly, cbz target is >126 bytes away
+; - Assembly fails with "error: out of range pc-relative fixup value"
+;
+; With the fix (KCFI_CHECK returns size 32 for Thumb2):
+; - Backend correctly accounts for KCFI check expansion
+; - Avoids cbz or uses longer-range branch instructions
+; - Assembly succeeds, object file is generated
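+;
+; For reference, a single Thumb2 KCFI check expands to roughly the sequence
+; below (the same pattern kcfi-thumb2.ll checks for). This is an illustrative
+; sketch: the eor immediates and the udf operand depend on the type hash.
+;
+;   bic r12, r0, #1       @ clear the Thumb bit from the target address
+;   ldr r12, [r12, #-4]   @ load the type hash stored before the callee
+;   eor r12, r12, #...    @ xor against the expected hash, a byte at a time
+;   eor r12, r12, #...
+;   eor r12, r12, #...
+;   eors r12, r12, #0     @ final xor sets the flags
+;   beq .Ltmp0            @ hashes match: fall through to the call
+;   udf #...              @ mismatch: trap
+;   .Ltmp0: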
+
+declare void @external_function(i32)
+
+; Test WITHOUT KCFI: should generate cbz since calls are small
+; CHECK-LABEL: test_without_kcfi:
+; CHECK: cbz
+; CHECK-NOT: bic{{.*}}#1
+define i32 @test_without_kcfi(ptr %callback, i32 %x) {
+entry:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %if_zero, label %if_nonzero
+
+if_nonzero:
+ ; Regular (non-KCFI) indirect calls - much smaller
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+ call void %callback()
+
+ call void @external_function(i32 %x)
+ %add1 = add i32 %x, 1
+ ret i32 %add1
+
+if_zero:
+ call void @external_function(i32 0)
+ ret i32 0
+}
+
+; Test WITH KCFI: should NOT generate cbz due to large KCFI checks
+; CHECK-LABEL: test_with_kcfi:
+; CHECK-NOT: cbz
+; CHECK: bic{{.*}}#1
+define i32 @test_with_kcfi(ptr %callback, i32 %x) !kcfi_type !1 {
+entry:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %if_zero, label %if_nonzero
+
+if_nonzero:
+ ; Six KCFI-instrumented indirect calls (~192 bytes total, exceeds cbz range)
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+ call void %callback() [ "kcfi"(i32 12345678) ]
+
+ ; Regular call to prevent optimization
+ call void @external_function(i32 %x)
+ %add1 = add i32 %x, 1
+ ret i32 %add1
+
+if_zero:
+ call void @external_function(i32 0)
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
diff --git a/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll b/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll
new file mode 100644
index 0000000..f8e0838
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-patchable-function-prefix.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: .p2align 2
+; CHECK-NOT: nop
+; CHECK: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-4]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp0
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK-NOT: .long
+; CHECK-NOT: nop
+define void @f2(ptr noundef %x) {
+; CHECK-LABEL: f2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-4]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp1
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK: .long 12345678
+; CHECK-COUNT-11: nop
+define void @f3(ptr noundef %x) #0 !kcfi_type !1 {
+; CHECK-LABEL: f3:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-48]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp3
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; CHECK: .p2align 2
+; CHECK-COUNT-11: nop
+define void @f4(ptr noundef %x) #0 {
+; CHECK-LABEL: f4:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bic r12, r0, #1
+; CHECK-NEXT: ldr r12, [r12, #-48]
+; CHECK-NEXT: eor r12, r12, #78
+; CHECK-NEXT: eor r12, r12, #24832
+; CHECK-NEXT: eor r12, r12, #12320768
+; CHECK-NEXT: eors r12, r12, #0
+; CHECK-NEXT: beq .Ltmp5
+; CHECK-NEXT: udf #33760
+; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r11, pc}
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+attributes #0 = { "patchable-function-prefix"="11" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
diff --git a/llvm/test/CodeGen/ARM/kcfi-thumb.ll b/llvm/test/CodeGen/ARM/kcfi-thumb.ll
new file mode 100644
index 0000000..7c02d830
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-thumb.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=thumbv6m-none-eabi < %s | FileCheck %s
+
+; This test verifies that Thumb1 (ARMv6-M) generates correct code for backend KCFI.
+; Thumb1 uses the backend KCFI implementation with Thumb1-specific instructions.
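+;
+; Thumb1 has no EOR-with-immediate and no negative load offsets, so the check
+; rebuilds the expected hash piecewise with movs/lsls/adds and compares it
+; with cmp. A rough outline (the exact sequence is checked in f1 below):
+;
+;   movs r3, #1           @ Thumb-bit mask
+;   mov r2, <target>      @ copy of the target address
+;   bics r2, r3           @ clear the Thumb bit
+;   subs r2, #4
+;   ldr r2, [r2]          @ load the type hash stored before the callee
+;   movs/lsls/adds ...    @ materialize the expected hash into r3
+;   cmp r2, r3
+;   beq <continue>        @ bkpt #0 traps on mismatch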
+
+; Test function without KCFI annotation
+; CHECK-LABEL: .globl nosan
+; CHECK-NEXT: .p2align 1
+; CHECK-NEXT: .type nosan,%function
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define dso_local void @nosan() nounwind {
+; CHECK-LABEL: nosan:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: bx lr
+ ret void
+}
+
+; Test function with KCFI annotation - verifies type hash emission
+;; The alignment is at least 4 to avoid unaligned type hash loads when this
+;; instrumented function is indirectly called.
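+;; A caller's check computes (target & ~1) - 4 and loads a word from there,
+;; so the hash must be emitted immediately before the aligned entry;
+;; schematically:
+;;   .p2align 2
+;;   .long <kcfi type hash>
+;; target_func: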
+; CHECK-LABEL: .globl target_func
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .type target_func,%function
+; CHECK-NEXT: .long 3170468932
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define void @target_func() !kcfi_type !1 {
+; CHECK-LABEL: target_func:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: bx lr
+ ret void
+}
+
+; Test indirect call with KCFI check using operand bundles
+; CHECK-LABEL: .globl f1
+; CHECK: .p2align 2
+; CHECK-NEXT: .type f1,%function
+; CHECK-NEXT: .long 3170468932
+; CHECK-NEXT: .code 16
+; CHECK-NEXT: .thumb_func
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: beq .Ltmp0
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r7, pc}
+ call void %x() [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test with tail call - backend KCFI supports tail calls
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; CHECK-LABEL: f2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: beq .Ltmp1
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: blx r0
+; CHECK-NEXT: pop {r7, pc}
+ tail call void %x() [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test with R2 live (3 arguments) - compiler shuffles args; only R2 needs a push around the check
+define void @f3_r2_live(ptr noundef %x, i32 %a, i32 %b, i32 %c) !kcfi_type !1 {
+; CHECK-LABEL: f3_r2_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: mov r2, r3
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r4
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: beq .Ltmp2
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: blx r4
+; CHECK-NEXT: pop {r4, pc}
+; Compiler shuffles: target→r4, c→r2, a→r0, b→r1
+; R2 is live (3rd arg), so we push it, then use R3 as temp and R2 as scratch
+ call void %x(i32 %a, i32 %b, i32 %c) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test with both R2 and R3 live (4 arguments) - compiler shuffles through r5/r4, then spills and uses R3 as temp and R2 as scratch
+define void @f4_r2_r3_live(ptr noundef %x, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; CHECK-LABEL: f4_r2_r3_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: ldr r3, [sp, #16]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: push {r3}
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r4
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: pop {r3}
+; CHECK-NEXT: beq .Ltmp3
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: blx r4
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+; Compiler shuffles: r3→r5, target→r4, d→r3 (from stack), a→r0, b→r1, c→r2
+; Then pushes r3 (d value), then r2, uses R3 as temp, R2 as scratch
+ call void %x(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+; Test where target ends up in R12, forcing R2 as scratch, with both R2 and R3 live
+; This uses inline asm to force target into R12, with 4 call arguments to make R2/R3 live
+define void @f5_r12_target_r2_r3_live(i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; CHECK-LABEL: f5_r12_target_r2_r3_live:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: @APP
+; CHECK-NEXT: @NO_APP
+; CHECK-NEXT: push {r3}
+; CHECK-NEXT: push {r2}
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov r2, r12
+; CHECK-NEXT: bics r2, r3
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: ldr r2, [r2]
+; CHECK-NEXT: movs r3, #188
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #249
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #132
+; CHECK-NEXT: lsls r3, r3, #8
+; CHECK-NEXT: adds r3, #68
+; CHECK-NEXT: cmp r2, r3
+; CHECK-NEXT: pop {r2}
+; CHECK-NEXT: pop {r3}
+; CHECK-NEXT: beq .Ltmp4
+; CHECK-NEXT: bkpt #0
+; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: blx r12
+; CHECK-NEXT: pop {r7, pc}
+; Use inline asm to get function pointer into R12
+; With 4 arguments (r0-r3), both R2 and R3 are live
+; Target in R12 means R2 is scratch, R3 is temp, and both need spilling
+ %target = call ptr asm "", "={r12}"()
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 -1124498364) ]
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 -1124498364}
diff --git a/llvm/test/CodeGen/ARM/kcfi-thumb2.ll b/llvm/test/CodeGen/ARM/kcfi-thumb2.ll
new file mode 100644
index 0000000..f319d98
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/kcfi-thumb2.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck %s --check-prefixes=MIR,ISEL
+; RUN: llc -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs -stop-after=kcfi < %s | FileCheck %s --check-prefixes=MIR,KCFI
+
+; MIR checks for all functions (grouped here to prevent update_llc_test_checks.py from removing them)
+
+; MIR-LABEL: name: f1
+; MIR: body:
+
+; ISEL: tBLXr 14 /* CC::al */, $noreg, %0, csr_aapcs,{{.*}} cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_Thumb2 $r0, 12345678
+; KCFI-NEXT: tBLXr 14 /* CC::al */, $noreg, {{(killed )?}}$r0, csr_aapcs,{{.*}}
+; KCFI-NEXT: }
+
+; MIR-LABEL: name: f2
+; MIR: body:
+
+; ISEL: TCRETURNri %0, 0, csr_aapcs, implicit $sp, cfi-type 12345678
+
+; KCFI: BUNDLE{{.*}} {
+; KCFI-NEXT: KCFI_CHECK_Thumb2 $r0, 12345678
+; KCFI-NEXT: tTAILJMPr {{(killed )?}}$r0, csr_aapcs, implicit $sp, implicit $sp
+; KCFI-NEXT: }
+
+; Test function without KCFI annotation
+; ASM-LABEL: .globl nosan
+; ASM-NEXT: .p2align 1
+; ASM-NEXT: .type nosan,%function
+; ASM-NEXT: .code 16
+; ASM-NEXT: .thumb_func
+define dso_local void @nosan() nounwind {
+; ASM-LABEL: nosan:
+; ASM: @ %bb.0:
+; ASM-NEXT: bx lr
+ ret void
+}
+
+; Test function with KCFI annotation - verifies type hash emission
+;; The alignment is at least 4 to avoid unaligned type hash loads when this
+;; instrumented function is indirectly called.
+; ASM-LABEL: .globl target_func
+; ASM-NEXT: .p2align 2
+; ASM-NEXT: .type target_func,%function
+; ASM-NEXT: .long 12345678
+; ASM-NEXT: .code 16
+; ASM-NEXT: .thumb_func
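+;; The hash word is emitted immediately before the function entry, which is
+;; what the checks below read back via 'ldr r12, [r12, #-4]'.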
+define void @target_func() !kcfi_type !1 {
+; ASM-LABEL: target_func:
+; ASM: @ %bb.0:
+; ASM-NEXT: bx lr
+ ret void
+}
+
+; Test indirect call with KCFI check
+; ASM: .long 12345678
+define void @f1(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f1:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp0
+; ASM-NEXT: udf #128
+; ASM-NEXT: .Ltmp0:
+; ASM-NEXT: blx r0
+; ASM-NEXT: pop {r7, pc}
+
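+; The eor immediates above XOR the loaded hash against 12345678 (0x00BC614E)
+; one byte at a time: 78 = 0x4E, 24832 = 0x6100, 12320768 = 0xBC0000; the
+; final flag-setting eors against the top byte (#0) leaves Z set only on a
+; full match.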
+ call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with tail call
+define void @f2(ptr noundef %x) !kcfi_type !1 {
+; ASM-LABEL: f2:
+; ASM: @ %bb.0:
+; ASM-NEXT: bic r12, r0, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp1
+; ASM-NEXT: udf #128
+; ASM-NEXT: .Ltmp1:
+; ASM-NEXT: bx r0
+
+ tail call void %x() [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test r3 spill/reload when target is r12 and r3 is a call argument (Thumb2)
+define void @f3_r3_spill(ptr noundef %target, i32 %a, i32 %b, i32 %c, i32 %d) !kcfi_type !1 {
+; ASM-LABEL: f3_r3_spill:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: mov lr, r3
+; ASM-NEXT: ldr r3, [sp, #8]
+; ASM-NEXT: mov r12, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: mov r2, lr
+; ASM-NEXT: push {r3}
+; ASM-NEXT: bic r3, r12, #1
+; ASM-NEXT: ldr r3, [r3, #-4]
+; ASM-NEXT: eor r3, r3, #78
+; ASM-NEXT: eor r3, r3, #24832
+; ASM-NEXT: eor r3, r3, #12320768
+; ASM-NEXT: eors r3, r3, #0
+; ASM-NEXT: pop {r3}
+; ASM-NEXT: beq.w .Ltmp2
+; ASM-NEXT: udf #140
+; ASM-NEXT: .Ltmp2:
+; ASM-NEXT: blx r12
+; ASM-NEXT: pop {r7, pc}
+; Arguments: r0=%target, r1=%a, r2=%b, r3=%c, [sp+8]=%d
+; Call needs: r0=%a, r1=%b, r2=%c, r3=%d, target in r12
+; r3 is live as the 4th argument, so it is spilled around the KCFI check
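+; The udf immediate appears to encode the trap diagnostics as 0x80 | target
+; register: r12 gives #140 here, matching #128 for r0 in f1 and #131 for r3
+; in f4_r3_unused.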
+ call void %target(i32 %a, i32 %b, i32 %c, i32 %d) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+; Test with 3 arguments - r3 is not live as a call argument, so the target can stay in r3 and r12 is used as scratch
+define void @f4_r3_unused(ptr noundef %target, i32 %a, i32 %b) !kcfi_type !1 {
+; ASM-LABEL: f4_r3_unused:
+; ASM: @ %bb.0:
+; ASM-NEXT: .save {r7, lr}
+; ASM-NEXT: push {r7, lr}
+; ASM-NEXT: mov r3, r0
+; ASM-NEXT: mov r0, r1
+; ASM-NEXT: mov r1, r2
+; ASM-NEXT: bic r12, r3, #1
+; ASM-NEXT: ldr r12, [r12, #-4]
+; ASM-NEXT: eor r12, r12, #78
+; ASM-NEXT: eor r12, r12, #24832
+; ASM-NEXT: eor r12, r12, #12320768
+; ASM-NEXT: eors r12, r12, #0
+; ASM-NEXT: beq.w .Ltmp3
+; ASM-NEXT: udf #131
+; ASM-NEXT: .Ltmp3:
+; ASM-NEXT: blx r3
+; ASM-NEXT: pop {r7, pc}
+; Only 3 arguments total, so r3 is not used as a call argument
+; The target is moved to r3, and r12 serves as scratch (no spill needed)
+ call void %target(i32 %a, i32 %b) [ "kcfi"(i32 12345678) ]
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"kcfi", i32 1}
+!1 = !{i32 12345678}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ISEL: {{.*}}
+; KCFI: {{.*}}
+; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/kcfi.ll b/llvm/test/CodeGen/ARM/kcfi.ll
deleted file mode 100644
index 9e16468..0000000
--- a/llvm/test/CodeGen/ARM/kcfi.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc -mtriple=thumbv6m-none-eabi < %s | FileCheck %s
-
-; CHECK-LABEL: .globl nosan
-; CHECK-NEXT: .p2align 1
-; CHECK-NEXT: .type nosan,%function
-; CHECK-NEXT: .code 16
-; CHECK-NEXT: .thumb_func
-; CHECK-NEXT: nosan:
-define dso_local void @nosan() nounwind {
- ret void
-}
-
-;; The alignment is at least 4 to avoid unaligned type hash loads when this
-;; instrumented function is indirectly called.
-; CHECK-LABEL: .globl f1
-; CHECK-NEXT: .p2align 2
-; CHECK-NEXT: .type f1,%function
-; CHECK-NEXT: .long 3170468932
-; CHECK-NEXT: .code 16
-; CHECK-NEXT: .thumb_func
-; CHECK-NEXT: f1:
-define void @f1(ptr noundef %x) !kcfi_type !1 {
- ret void
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 4, !"kcfi", i32 1}
-!1 = !{i32 -1124498364}
diff --git a/llvm/test/CodeGen/AVR/dynalloca.ll b/llvm/test/CodeGen/AVR/dynalloca.ll
index 3face71..b32910b 100644
--- a/llvm/test/CodeGen/AVR/dynalloca.ll
+++ b/llvm/test/CodeGen/AVR/dynalloca.ll
@@ -64,16 +64,16 @@ define void @dynalloca2(i16 %x) {
; CHECK-NEXT: out 63, r0
; CHECK-NEXT: out 61, {{.*}}
; Store values on the stack
-; CHECK: ldi r20, 0
-; CHECK: ldi r21, 0
-; CHECK: std Z+8, r21
-; CHECK: std Z+7, r20
-; CHECK: std Z+6, r21
-; CHECK: std Z+5, r20
-; CHECK: std Z+4, r21
-; CHECK: std Z+3, r20
-; CHECK: std Z+2, r21
-; CHECK: std Z+1, r20
+; CHECK: ldi [[REG1:r[0-9]+]], 0
+; CHECK: ldi [[REG2:r[0-9]+]], 0
+; CHECK: std Z+8, [[REG2]]
+; CHECK: std Z+7, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
+; CHECK: std Z+1, [[REG1]]
; CHECK: call
; Call frame restore
; CHECK-NEXT: in r30, 61
diff --git a/llvm/test/CodeGen/AVR/issue-163015.ll b/llvm/test/CodeGen/AVR/issue-163015.ll
new file mode 100644
index 0000000..6c4dc51
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/issue-163015.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=avr | FileCheck %s
+
+@ui1 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@ui2 = protected local_unnamed_addr global i64 zeroinitializer, align 8
+@failed = private unnamed_addr addrspace(1) constant [12 x i8] c"test failed\00"
+@stats2 = external protected global i16, align 1
+
+; CHECK-LABEL: main:
+define i32 @main() addrspace(1) {
+entry:
+ store i64 94, ptr @ui1, align 8
+ store i64 53, ptr @ui2, align 8
+ tail call addrspace(1) void @foo(i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 32, ptr @stats2)
+ %11 = load i64, ptr @ui1, align 8
+ %12 = load i64, ptr @ui2, align 8
+
+; COM: CHECK: call __udivdi3
+ %15 = udiv i64 %11, %12
+
+; look for the buggy pattern where r30/r31 are being clobbered, corrupting the stack pointer
+; CHECK-NOT: std Z+{{[1-9]+}}, r30
+; CHECK-NOT: std Z+{{[1-9]+}}, r31
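+; (a store like "std Z+n, r30" would mean half of the Z base pointer had been
+; repurposed to hold data, leaving the remaining stores aimed at a corrupted
+; address)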
+
+; CHECK: call expect
+ tail call addrspace(1) void @expect(i64 %15, i64 1, i16 ptrtoint (ptr addrspace(1) @failed to i16), i16 11, i8 2, i16 33)
+
+; CHECK: ret
+ ret i32 0
+}
+
+declare protected void @expect(i64, i64, i16, i16, i8, i16) local_unnamed_addr addrspace(1) #0
+declare protected void @foo(i16, i16, i8, i16, i16) local_unnamed_addr addrspace(1) #0
diff --git a/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
index 8c0d82e..6f1bbd0 100644
--- a/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
+++ b/llvm/test/CodeGen/DirectX/CBufferAccess/unused.ll
@@ -2,7 +2,7 @@
; Check that we correctly ignore cbuffers that were nulled out by optimizations.
%__cblayout_CB = type <{ float }>
-@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 4, 0)) poison
@x = external local_unnamed_addr addrspace(2) global float, align 4
; CHECK-NOT: !hlsl.cbs =
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
index bd07ba1..eb4cf76 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -20,6 +20,9 @@
; CHECK: OpReadClockKHR [[v2uint]] [[uint_1]]
; CHECK: OpReadClockKHR [[v2uint]] [[uint_2]]
; CHECK: OpReadClockKHR [[v2uint]] [[uint_3]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_1]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_2]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_3]]
define dso_local spir_kernel void @test_clocks(ptr addrspace(1) nocapture noundef writeonly align 8 %out64, ptr addrspace(1) nocapture noundef writeonly align 8 %outv2) {
entry:
@@ -39,6 +42,9 @@ entry:
%call9 = tail call spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv()
%arrayidx10 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 16
store <2 x i32> %call9, ptr addrspace(1) %arrayidx10, align 8
+ %call10 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 1)
+ %call11 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 2)
+ %call12 = call spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32 3)
ret void
}
@@ -59,3 +65,6 @@ declare spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv() local_unnamed_add
; Function Attrs: convergent nounwind
declare spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv() local_unnamed_addr
+
+; Function Attrs: nounwind
+declare spir_func i64 @_Z27__spirv_ReadClockKHR_Rulongi(i32)
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 4f5cb5a..3e7b73a 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -269,3 +269,708 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
%ret = load atomic <1 x i64>, ptr %x acquire, align 8
ret <1 x i64> %ret
}
+
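+; Several of the loads below are under-aligned or too wide for a native
+; atomic load, so they lower to the generic libatomic call, roughly
+; __atomic_load(size, src, dst, order): %edi carries the size, %rsi the
+; source pointer, %rdx a stack buffer for the result, and %ecx = 2 is
+; memory_order_acquire.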
+define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_ptr:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movq (%rsp), %rax
+; CHECK-O3-NEXT: popq %rcx
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT: popq %rcx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT: popq %rcx
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_ptr:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq (%rsp), %rax
+; CHECK-O0-NEXT: popq %rcx
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_ptr:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT: popq %rcx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_ptr:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT: popq %rcx
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x ptr>, ptr %x acquire, align 4
+ ret <1 x ptr> %ret
+}
+
+define <1 x half> @atomic_vec1_half(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_half:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movw (%rdi), %cx
+; CHECK-O0-NEXT: # implicit-def: $eax
+; CHECK-O0-NEXT: movw %cx, %ax
+; CHECK-O0-NEXT: # implicit-def: $xmm0
+; CHECK-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movw (%rdi), %cx
+; CHECK-SSE-O0-NEXT: # implicit-def: $eax
+; CHECK-SSE-O0-NEXT: movw %cx, %ax
+; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0
+; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_half:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: movw (%rdi), %cx
+; CHECK-AVX-O0-NEXT: # implicit-def: $eax
+; CHECK-AVX-O0-NEXT: movw %cx, %ax
+; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0
+; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x half>, ptr %x acquire, align 2
+ ret <1 x half> %ret
+}
+
+define <1 x float> @atomic_vec1_float(ptr %x) {
+; CHECK-O3-LABEL: atomic_vec1_float:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_float:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_float:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_float:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_float:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_float:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x float>, ptr %x acquire, align 4
+ ret <1 x float> %ret
+}
+
+define <1 x double> @atomic_vec1_double_align(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double_align:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double_align:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double_align:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double_align:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double_align:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double_align:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x double>, ptr %x acquire, align 8
+ ret <1 x double> %ret
+}
+
+define <1 x i64> @atomic_vec1_i64(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_i64:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movq (%rsp), %rax
+; CHECK-O3-NEXT: popq %rcx
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O3-NEXT: popq %rcx
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O3-NEXT: popq %rcx
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_i64:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq (%rsp), %rax
+; CHECK-O0-NEXT: popq %rcx
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_i64:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq (%rsp), %rax
+; CHECK-SSE-O0-NEXT: popq %rcx
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_i64:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: movq (%rsp), %rax
+; CHECK-AVX-O0-NEXT: popq %rcx
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x i64>, ptr %x acquire, align 4
+ ret <1 x i64> %ret
+}
+
+define <1 x double> @atomic_vec1_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec1_double:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec1_double:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: popq %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec1_double:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: popq %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec1_double:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec1_double:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: popq %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec1_double:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: popq %rax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <1 x double>, ptr %x acquire, align 4
+ ret <1 x double> %ret
+}
+
+define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec2_i32:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: pushq %rax
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $8, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O3-NEXT: popq %rax
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: pushq %rax
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $8, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O3-NEXT: popq %rax
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: pushq %rax
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $8, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O3-NEXT: popq %rax
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec2_i32:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: pushq %rax
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $8, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-O0-NEXT: popq %rax
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec2_i32:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: pushq %rax
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $8, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-SSE-O0-NEXT: popq %rax
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec2_i32:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: pushq %rax
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $8, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-AVX-O0-NEXT: popq %rax
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <2 x i32>, ptr %x acquire, align 4
+ ret <2 x i32> %ret
+}
+
+define <4 x float> @atomic_vec4_float(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec4_float:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $24, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $16, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: addq $24, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec4_float:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $24, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $16, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: addq $24, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec4_float:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: subq $24, %rsp
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $16, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovaps (%rsp), %xmm0
+; CHECK-AVX-O3-NEXT: addq $24, %rsp
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec4_float:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $24, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $16, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: addq $24, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec4_float:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $24, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $16, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: addq $24, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec4_float:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $24, %rsp
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $16, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovaps (%rsp), %xmm0
+; CHECK-AVX-O0-NEXT: addq $24, %rsp
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <4 x float>, ptr %x acquire, align 4
+ ret <4 x float> %ret
+}
+
+define <8 x double> @atomic_vec8_double(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec8_double:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $72, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $64, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT: addq $72, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec8_double:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $72, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $64, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT: addq $72, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec8_double:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $72, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $64, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movapd (%rsp), %xmm0
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT: addq $72, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec8_double:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $72, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $64, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movapd (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT: addq $72, %rsp
+; CHECK-SSE-O0-NEXT: retq
+ %ret = load atomic <8 x double>, ptr %x acquire, align 4
+ ret <8 x double> %ret
+}
+
+define <16 x bfloat> @atomic_vec16_bfloat(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $40, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $32, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: addq $40, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $40, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $32, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: addq $40, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: subq $40, %rsp
+; CHECK-AVX-O3-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O3-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O3-NEXT: movl $32, %edi
+; CHECK-AVX-O3-NEXT: movl $2, %ecx
+; CHECK-AVX-O3-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O3-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX-O3-NEXT: addq $40, %rsp
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $40, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $32, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: addq $40, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $40, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $32, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: addq $40, %rsp
+; CHECK-SSE-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: atomic_vec16_bfloat:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: subq $40, %rsp
+; CHECK-AVX-O0-NEXT: movq %rdi, %rsi
+; CHECK-AVX-O0-NEXT: movl $32, %edi
+; CHECK-AVX-O0-NEXT: movq %rsp, %rdx
+; CHECK-AVX-O0-NEXT: movl $2, %ecx
+; CHECK-AVX-O0-NEXT: callq __atomic_load@PLT
+; CHECK-AVX-O0-NEXT: vmovups (%rsp), %ymm0
+; CHECK-AVX-O0-NEXT: addq $40, %rsp
+; CHECK-AVX-O0-NEXT: retq
+ %ret = load atomic <16 x bfloat>, ptr %x acquire, align 4
+ ret <16 x bfloat> %ret
+}
+
+define <32 x half> @atomic_vec32_half(ptr %x) nounwind {
+; CHECK-O3-LABEL: atomic_vec32_half:
+; CHECK-O3: # %bb.0:
+; CHECK-O3-NEXT: subq $72, %rsp
+; CHECK-O3-NEXT: movq %rdi, %rsi
+; CHECK-O3-NEXT: movq %rsp, %rdx
+; CHECK-O3-NEXT: movl $64, %edi
+; CHECK-O3-NEXT: movl $2, %ecx
+; CHECK-O3-NEXT: callq __atomic_load@PLT
+; CHECK-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O3-NEXT: addq $72, %rsp
+; CHECK-O3-NEXT: retq
+;
+; CHECK-SSE-O3-LABEL: atomic_vec32_half:
+; CHECK-SSE-O3: # %bb.0:
+; CHECK-SSE-O3-NEXT: subq $72, %rsp
+; CHECK-SSE-O3-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O3-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O3-NEXT: movl $64, %edi
+; CHECK-SSE-O3-NEXT: movl $2, %ecx
+; CHECK-SSE-O3-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O3-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O3-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O3-NEXT: addq $72, %rsp
+; CHECK-SSE-O3-NEXT: retq
+;
+; CHECK-O0-LABEL: atomic_vec32_half:
+; CHECK-O0: # %bb.0:
+; CHECK-O0-NEXT: subq $72, %rsp
+; CHECK-O0-NEXT: movq %rdi, %rsi
+; CHECK-O0-NEXT: movl $64, %edi
+; CHECK-O0-NEXT: movq %rsp, %rdx
+; CHECK-O0-NEXT: movl $2, %ecx
+; CHECK-O0-NEXT: callq __atomic_load@PLT
+; CHECK-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-O0-NEXT: addq $72, %rsp
+; CHECK-O0-NEXT: retq
+;
+; CHECK-SSE-O0-LABEL: atomic_vec32_half:
+; CHECK-SSE-O0: # %bb.0:
+; CHECK-SSE-O0-NEXT: subq $72, %rsp
+; CHECK-SSE-O0-NEXT: movq %rdi, %rsi
+; CHECK-SSE-O0-NEXT: movl $64, %edi
+; CHECK-SSE-O0-NEXT: movq %rsp, %rdx
+; CHECK-SSE-O0-NEXT: movl $2, %ecx
+; CHECK-SSE-O0-NEXT: callq __atomic_load@PLT
+; CHECK-SSE-O0-NEXT: movaps (%rsp), %xmm0
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-SSE-O0-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-SSE-O0-NEXT: addq $72, %rsp
+; CHECK-SSE-O0-NEXT: retq
+ %ret = load atomic <32 x half>, ptr %x acquire, align 4
+ ret <32 x half> %ret
+}
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll b/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
index 4e76262..423e318 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-empty-function.ll
@@ -19,7 +19,7 @@ entry:
; CHECK: func:
; CHECK: .Lfunc_begin1:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 0 # feature
; PGO-NEXT: .byte 3 # feature
; CHECK-NEXT: .quad .Lfunc_begin1 # function address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll b/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
index f610b04..e32e522 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-function-sections.ll
@@ -10,7 +10,7 @@ define dso_local i32 @_Z3barv() {
; CHECK-LABEL: _Z3barv:
; CHECK-NEXT: [[BAR_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3barv{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 0 # feature
; CHECK-NEXT: .quad [[BAR_BEGIN]] # function address
@@ -23,7 +23,7 @@ define dso_local i32 @_Z3foov() {
; CHECK-LABEL: _Z3foov:
; CHECK-NEXT: [[FOO_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3foov{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 32 # feature
; CHECK-NEXT: .quad [[FOO_BEGIN]] # function address
@@ -36,6 +36,6 @@ define linkonce_odr dso_local i32 @_Z4fooTIiET_v() comdat {
; CHECK-LABEL: _Z4fooTIiET_v:
; CHECK-NEXT: [[FOOCOMDAT_BEGIN:.Lfunc_begin[0-9]+]]:
; CHECK: .section .llvm_bb_addr_map,"oG",@llvm_bb_addr_map,.text._Z4fooTIiET_v,_Z4fooTIiET_v,comdat{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 0 # feature
; CHECK-NEXT: .quad [[FOOCOMDAT_BEGIN]] # function address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll b/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
index ba76f3e..12b1297 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-pgo-features.ll
@@ -69,7 +69,7 @@ declare i32 @__gxx_personality_v0(...)
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 32 # feature
; PGO-ALL-NEXT: .byte 39 # feature
; FEC-ONLY-NEXT:.byte 33 # feature
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
index 6157f1a..aeb6dc95 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-basic-block-sections.ll
@@ -47,7 +47,7 @@ declare i32 @__gxx_personality_v0(...)
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text.hot._Z3bazb
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 40 # feature
; CHECK-NEXT: .byte 2 # number of basic block ranges
; CHECK-NEXT: .quad .Lfunc_begin0 # base address
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll
new file mode 100644
index 0000000..a5678877
--- /dev/null
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-emit-bb-hash.ll
@@ -0,0 +1,94 @@
+; Check that the basic block address map option works when used along with -emit-bb-hash.
+; RUN: llc < %s -mtriple=x86_64 -function-sections -unique-section-names=true -basic-block-address-map -emit-bb-hash | FileCheck %s --check-prefixes=CHECK,UNIQ
+
+define void @_Z3bazb(i1 zeroext, i1 zeroext) personality ptr @__gxx_personality_v0 {
+ br i1 %0, label %3, label %8
+
+3:
+ %4 = invoke i32 @_Z3barv()
+ to label %8 unwind label %6
+ br label %10
+
+6:
+ landingpad { ptr, i32 }
+ catch ptr null
+ br label %12
+
+8:
+ %9 = call i32 @_Z3foov()
+ br i1 %1, label %12, label %10
+
+10:
+ %11 = select i1 %1, ptr blockaddress(@_Z3bazb, %3), ptr blockaddress(@_Z3bazb, %12) ; <ptr> [#uses=1]
+ indirectbr ptr %11, [label %3, label %12]
+
+12:
+ ret void
+}
+
+declare i32 @_Z3barv() #1
+
+declare i32 @_Z3foov() #1
+
+declare i32 @__gxx_personality_v0(...)
+
+; UNIQ: .section .text._Z3bazb,"ax",@progbits{{$}}
+; NOUNIQ: .section .text,"ax",@progbits,unique,1
+; CHECK-LABEL: _Z3bazb:
+; CHECK-LABEL: .Lfunc_begin0:
+; CHECK-LABEL: .LBB_END0_0:
+; CHECK-LABEL: .LBB0_1:
+; CHECK-LABEL: .LBB0_1_CS0:
+; CHECK-LABEL: .LBB_END0_1:
+; CHECK-LABEL: .LBB0_2:
+; CHECK-LABEL: .LBB0_2_CS0:
+; CHECK-LABEL: .LBB_END0_2:
+; CHECK-LABEL: .LBB0_3:
+; CHECK-LABEL: .LBB_END0_3:
+; CHECK-LABEL: .Lfunc_end0:
+
+; UNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
+;; Verify that with -unique-section-names=false, the unique id of the text section gets assigned to the llvm_bb_addr_map section.
+; NOUNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text,unique,1
+; CHECK-NEXT: .byte 4 # version
+; CHECK-NEXT: .byte 96 # feature
+; CHECK-NEXT: .quad .Lfunc_begin0 # function address
+; CHECK-NEXT: .byte 6 # number of basic blocks
+; CHECK-NEXT: .byte 0 # BB id
+; CHECK-NEXT: .uleb128 .Lfunc_begin0-.Lfunc_begin0
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_0-.Lfunc_begin0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 1 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_1-.LBB_END0_0
+; CHECK-NEXT: .byte 1 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB0_1_CS0-.LBB0_1
+; CHECK-NEXT: .uleb128 .LBB_END0_1-.LBB0_1_CS0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 3 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_2-.LBB_END0_1
+; CHECK-NEXT: .byte 1 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB0_2_CS0-.LBB0_2
+; CHECK-NEXT: .uleb128 .LBB_END0_2-.LBB0_2_CS0
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 4 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_3-.LBB_END0_2
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_3-.LBB0_3
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 5 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_4-.LBB_END0_3
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_4-.LBB0_4
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .quad {{-?[0-9]+}}
+; CHECK-NEXT: .byte 2 # BB id
+; CHECK-NEXT: .uleb128 .LBB0_5-.LBB_END0_4
+; CHECK-NEXT: .byte 0 # number of callsites
+; CHECK-NEXT: .uleb128 .LBB_END0_5-.LBB0_5
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .quad {{-?[0-9]+}}
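+;; With -emit-bb-hash, each block entry carries a trailing .quad hash; the
+;; feature byte becomes 96, presumably the callsite-offsets feature (32) seen
+;; in the sibling tests plus a BB-hash bit (64).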
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll b/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
index 1e8cee4..d49b313 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map-with-mfs.ll
@@ -58,7 +58,7 @@ declare i32 @qux()
; CHECK-LABEL: .Lfunc_end0:
; CHECK: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text.hot.foo
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; BASIC-NEXT: .byte 40 # feature
; PGO-NEXT: .byte 47 # feature
; CHECK-NEXT: .byte 2 # number of basic block ranges
diff --git a/llvm/test/CodeGen/X86/basic-block-address-map.ll b/llvm/test/CodeGen/X86/basic-block-address-map.ll
index 5c8f3a6..64cf2c7 100644
--- a/llvm/test/CodeGen/X86/basic-block-address-map.ll
+++ b/llvm/test/CodeGen/X86/basic-block-address-map.ll
@@ -52,7 +52,7 @@ declare i32 @__gxx_personality_v0(...)
; UNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text._Z3bazb{{$}}
;; Verify that with -unique-section-names=false, the unique id of the text section gets assigned to the llvm_bb_addr_map section.
; NOUNIQ: .section .llvm_bb_addr_map,"o",@llvm_bb_addr_map,.text,unique,1
-; CHECK-NEXT: .byte 3 # version
+; CHECK-NEXT: .byte 4 # version
; CHECK-NEXT: .byte 32 # feature
; CHECK-NEXT: .quad .Lfunc_begin0 # function address
; CHECK-NEXT: .byte 6 # number of basic blocks