; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s

; Check code generation for memmoves with statically unknown size and all
; combinations of the following address spaces:
;   destination address space: 0, 1, 3, 5
;   source address space: 0, 1, 3, 4, 5

; Reading guide:
; - Every function is a thin wrapper around a single llvm.memmove call whose
;   size operand (%sz) is a runtime value.
; - The assertion lines are generated (see the NOTE above); regenerate them with
;   the script instead of editing them by hand.
; - In the expected assembly the size is split into a multiple-of-16 part
;   (sz & -16) and a residual part (sz & 15). The %memmove_copy_forward and
;   %memmove_copy_backwards paths each contain a main loop that moves 16 bytes
;   per iteration and a residual loop that moves one byte per iteration.

; dst: addrspace(0), src: addrspace(0).
; Expected loads are flat_load_*, expected stores are flat_store_*.
define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p0:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
; CHECK-NEXT:    v_mov_b32_e32 v9, 0
; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
; CHECK-NEXT:    v_mov_b32_e32 v7, v5
; CHECK-NEXT:    s_mov_b32 s6, exec_lo
; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
; CHECK-NEXT:  ; %bb.1: ; %Flow35
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB0_10
; CHECK-NEXT:  .LBB0_2: ; %Flow36
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
; CHECK-NEXT:  .LBB0_3: ; %memmove_copy_forward
; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB0_6
; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT:    v_mov_b32_e32 v5, v3
; CHECK-NEXT:    v_mov_b32_e32 v11, v1
; CHECK-NEXT:    v_mov_b32_e32 v13, v7
; CHECK-NEXT:    v_mov_b32_e32 v4, v2
; CHECK-NEXT:    v_mov_b32_e32 v10, v0
; CHECK-NEXT:    v_mov_b32_e32 v12, v6
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB0_5: ; %memmove_fwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flat_load_dwordx4 v[14:17], v[4:5]
; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB0_5
; CHECK-NEXT:  .LBB0_6: ; %Flow30
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
; CHECK-NEXT:    s_cbranch_execz .LBB0_9
; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB0_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3]
; CHECK-NEXT:    v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4
; CHECK-NEXT:    v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB0_8
; CHECK-NEXT:  .LBB0_9: ; %Flow28
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT:    ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execz .LBB0_2
; CHECK-NEXT:  .LBB0_10: ; %memmove_copy_backwards
; CHECK-NEXT:    s_and_saveexec_b32 s7, s4
; CHECK-NEXT:    s_cbranch_execz .LBB0_13
; CHECK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT:    s_mov_b32 s8, 0
; CHECK-NEXT:    v_add_co_u32 v4, s4, v0, v10
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, v1, v11, s4
; CHECK-NEXT:    v_add_co_u32 v10, s4, v2, v10
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, v3, v11, s4
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB0_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flat_load_ubyte v12, v[10:11]
; CHECK-NEXT:    v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT:    v_add_co_u32 v10, s4, v10, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v11, s4
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT:    s_or_b32 s8, s4, s8
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[4:5], v12
; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_cbranch_execnz .LBB0_12
; CHECK-NEXT:  .LBB0_13: ; %Flow34
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB0_16
; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT:    s_mov_b32 s7, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB0_15: ; %memmove_bwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[4:5]
; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT:    v_mov_b32_e32 v7, v5
; CHECK-NEXT:    v_mov_b32_e32 v6, v4
; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB0_15
; CHECK-NEXT:  .LBB0_16: ; %Flow32
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
  ret void
}

; dst: addrspace(0), src: addrspace(1).
; Expected loads are global_load_*, expected stores are flat_store_*.
define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p1:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
; CHECK-NEXT:    v_mov_b32_e32 v9, 0
; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
; CHECK-NEXT:    v_mov_b32_e32 v7, v5
; CHECK-NEXT:    s_mov_b32 s6, exec_lo
; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT:    s_cbranch_execnz .LBB1_3
; CHECK-NEXT:  ; %bb.1: ; %Flow37
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB1_10
; CHECK-NEXT:  .LBB1_2: ; %Flow38
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
; CHECK-NEXT:  .LBB1_3: ; %memmove_copy_forward
; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB1_6
; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT:    v_mov_b32_e32 v5, v3
; CHECK-NEXT:    v_mov_b32_e32 v11, v1
; CHECK-NEXT:    v_mov_b32_e32 v13, v7
; CHECK-NEXT:    v_mov_b32_e32 v4, v2
; CHECK-NEXT:    v_mov_b32_e32 v10, v0
; CHECK-NEXT:    v_mov_b32_e32 v12, v6
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB1_5: ; %memmove_fwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB1_5
; CHECK-NEXT:  .LBB1_6: ; %Flow32
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
; CHECK-NEXT:    s_cbranch_execz .LBB1_9
; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB1_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off
; CHECK-NEXT:    v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4
; CHECK-NEXT:    v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB1_8
; CHECK-NEXT:  .LBB1_9: ; %Flow30
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT:    ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execz .LBB1_2
; CHECK-NEXT:  .LBB1_10: ; %memmove_copy_backwards
; CHECK-NEXT:    s_and_saveexec_b32 s7, s4
; CHECK-NEXT:    s_cbranch_execz .LBB1_13
; CHECK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT:    s_mov_b32 s8, 0
; CHECK-NEXT:    v_add_co_u32 v4, s4, v2, v10
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, v3, v11, s4
; CHECK-NEXT:    v_add_co_u32 v10, s4, v0, v10
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, v1, v11, s4
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB1_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    global_load_ubyte v12, v[4:5], off
; CHECK-NEXT:    v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT:    v_add_co_u32 v4, s4, v4, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, -1, v5, s4
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT:    s_or_b32 s8, s4, s8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[10:11], v12
; CHECK-NEXT:    v_add_co_u32 v10, s5, v10, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v11, s5
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_cbranch_execnz .LBB1_12
; CHECK-NEXT:  .LBB1_13: ; %Flow36
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB1_16
; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT:    s_mov_b32 s7, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB1_15: ; %memmove_bwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT:    v_mov_b32_e32 v7, v5
; CHECK-NEXT:    v_mov_b32_e32 v6, v4
; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB1_15
; CHECK-NEXT:  .LBB1_16: ; %Flow34
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
  ret void
}

; dst: addrspace(0), src: addrspace(3).
; Expected loads are ds_read_b128 / ds_read_u8 through a single 32-bit address
; register, expected stores are flat_store_*.
define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p3:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v7, 15, v3
; CHECK-NEXT:    v_mov_b32_e32 v8, 0
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT:    v_and_b32_e32 v5, -16, v3
; CHECK-NEXT:    v_mov_b32_e32 v6, v4
; CHECK-NEXT:    s_mov_b32 s6, exec_lo
; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT:    v_cndmask_b32_e32 v9, -1, v0, vcc_lo
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
; CHECK-NEXT:    v_cmpx_ge_u32_e64 v2, v9
; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT:    s_cbranch_execnz .LBB2_3
; CHECK-NEXT:  ; %bb.1: ; %Flow39
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB2_10
; CHECK-NEXT:  .LBB2_2: ; %Flow40
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
; CHECK-NEXT:  .LBB2_3: ; %memmove_copy_forward
; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB2_6
; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT:    v_mov_b32_e32 v10, v1
; CHECK-NEXT:    v_mov_b32_e32 v12, v6
; CHECK-NEXT:    v_mov_b32_e32 v9, v0
; CHECK-NEXT:    v_mov_b32_e32 v11, v5
; CHECK-NEXT:    v_mov_b32_e32 v4, v2
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB2_5: ; %memmove_fwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ds_read_b128 v[13:16], v4
; CHECK-NEXT:    v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v12, null, -1, v12, s5
; CHECK-NEXT:    v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[9:10], v[13:16]
; CHECK-NEXT:    v_add_co_u32 v9, s6, v9, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB2_5
; CHECK-NEXT:  .LBB2_6: ; %Flow34
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
; CHECK-NEXT:    s_cbranch_execz .LBB2_9
; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v5
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v6, s5
; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v3
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB2_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ds_read_u8 v3, v2
; CHECK-NEXT:    v_add_co_u32 v7, s5, v7, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, null, -1, v8, s5
; CHECK-NEXT:    v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[7:8]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3
; CHECK-NEXT:    v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB2_8
; CHECK-NEXT:  .LBB2_9: ; %Flow32
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT:    ; implicit-def: $vgpr3_vgpr4
; CHECK-NEXT:    ; implicit-def: $vgpr2
; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT:    ; implicit-def: $vgpr7_vgpr8
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execz .LBB2_2
; CHECK-NEXT:  .LBB2_10: ; %memmove_copy_backwards
; CHECK-NEXT:    s_and_saveexec_b32 s7, s4
; CHECK-NEXT:    s_cbranch_execz .LBB2_13
; CHECK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v9, s4, v3, v0
; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, null, v4, v1, s4
; CHECK-NEXT:    v_add3_u32 v4, v3, v2, -1
; CHECK-NEXT:    v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, null, -1, v10, s4
; CHECK-NEXT:    s_mov_b32 s8, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB2_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ds_read_u8 v11, v4
; CHECK-NEXT:    v_add_co_u32 v7, s4, v7, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, null, -1, v8, s4
; CHECK-NEXT:    v_add_nc_u32_e32 v4, -1, v4
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT:    s_or_b32 s8, s4, s8
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_byte v[9:10], v11
; CHECK-NEXT:    v_add_co_u32 v9, s5, v9, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, null, -1, v10, s5
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_cbranch_execnz .LBB2_12
; CHECK-NEXT:  .LBB2_13: ; %Flow38
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB2_16
; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT:    v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT:    s_mov_b32 s7, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB2_15: ; %memmove_bwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ds_read_b128 v[7:10], v2
; CHECK-NEXT:    v_add_co_u32 v3, vcc_lo, v5, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v4, null, -1, v6, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v11, vcc_lo, v0, v5
; CHECK-NEXT:    v_add_co_ci_u32_e64 v12, null, v1, v6, vcc_lo
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[3:4]
; CHECK-NEXT:    v_mov_b32_e32 v6, v4
; CHECK-NEXT:    v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT:    v_mov_b32_e32 v5, v3
; CHECK-NEXT:    s_or_b32 s7, s4, s7
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[7:10]
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB2_15
; CHECK-NEXT:  .LBB2_16: ; %Flow36
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false)
  ret void
}

; dst: addrspace(0), src: addrspace(4).
; Expected loads are global_load_*, expected stores are flat_store_*.
define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p4:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
; CHECK-NEXT:    v_mov_b32_e32 v9, 0
; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
; CHECK-NEXT:    v_mov_b32_e32 v7, v5
; CHECK-NEXT:    s_mov_b32 s6, exec_lo
; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT:    s_cbranch_execnz .LBB3_3
; CHECK-NEXT:  ; %bb.1: ; %Flow34
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB3_10
; CHECK-NEXT:  .LBB3_2: ; %Flow35
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
; CHECK-NEXT:  .LBB3_3: ; %memmove_copy_forward
; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB3_6
; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT:    v_mov_b32_e32 v5, v3
; CHECK-NEXT:    v_mov_b32_e32 v11, v1
; CHECK-NEXT:    v_mov_b32_e32 v13, v7
; CHECK-NEXT:    v_mov_b32_e32 v4, v2
; CHECK-NEXT:    v_mov_b32_e32 v10, v0
; CHECK-NEXT:    v_mov_b32_e32 v12, v6
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB3_5: ; %memmove_fwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[4:5], off
; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[10:11], v[14:17]
; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB3_5
; CHECK-NEXT:  .LBB3_6: ; %Flow29
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
; CHECK-NEXT:    s_cbranch_execz .LBB3_9
; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB3_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off
; CHECK-NEXT:    v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v4
; CHECK-NEXT:    v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB3_8
; CHECK-NEXT:  .LBB3_9: ; %Flow27
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT:    ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execz .LBB3_2
; CHECK-NEXT:  .LBB3_10: ; %memmove_copy_backwards
; CHECK-NEXT:    s_and_saveexec_b32 s7, s4
; CHECK-NEXT:    s_cbranch_execz .LBB3_13
; CHECK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT:    s_mov_b32 s8, 0
; CHECK-NEXT:    v_add_co_u32 v4, s4, v0, v10
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, v1, v11, s4
; CHECK-NEXT:    v_add_co_u32 v10, s4, v2, v10
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, v3, v11, s4
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB3_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    global_load_ubyte v12, v[10:11], off
; CHECK-NEXT:    v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT:    v_add_co_u32 v10, s4, v10, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v11, s4
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT:    s_or_b32 s8, s4, s8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[4:5], v12
; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_cbranch_execnz .LBB3_12
; CHECK-NEXT:  .LBB3_13: ; %Flow33
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB3_16
; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT:    s_mov_b32 s7, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB3_15: ; %memmove_bwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, null, v1, v7, s4
; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off
; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo
; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
; CHECK-NEXT:    v_mov_b32_e32 v7, v5
; CHECK-NEXT:    v_mov_b32_e32 v6, v4
; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB3_15
; CHECK-NEXT:  .LBB3_16: ; %Flow31
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false)
  ret void
}

; dst: addrspace(0), src: addrspace(5).
; Expected loads are buffer_load_* with "offen" addressing via s[0:3]; the
; 16-byte main loops use four buffer_load_dword under one s_clause. Expected
; stores are flat_store_*.
define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p0_p5:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v7, 15, v3
; CHECK-NEXT:    v_mov_b32_e32 v8, 0
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT:    v_and_b32_e32 v5, -16, v3
; CHECK-NEXT:    v_mov_b32_e32 v6, v4
; CHECK-NEXT:    s_mov_b32 s6, exec_lo
; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT:    v_cndmask_b32_e32 v9, -1, v0, vcc_lo
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
; CHECK-NEXT:    v_cmpx_ge_u32_e64 v2, v9
; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT:    s_cbranch_execnz .LBB4_3
; CHECK-NEXT:  ; %bb.1: ; %Flow39
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB4_10
; CHECK-NEXT:  .LBB4_2: ; %Flow40
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
; CHECK-NEXT:  .LBB4_3: ; %memmove_copy_forward
; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB4_6
; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT:    v_mov_b32_e32 v10, v1
; CHECK-NEXT:    v_mov_b32_e32 v12, v6
; CHECK-NEXT:    v_mov_b32_e32 v9, v0
; CHECK-NEXT:    v_mov_b32_e32 v11, v5
; CHECK-NEXT:    v_mov_b32_e32 v4, v2
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB4_5: ; %memmove_fwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v13, v4, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v14, v4, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v15, v4, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v16, v4, s[0:3], 0 offen offset:12
; CHECK-NEXT:    v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v12, null, -1, v12, s5
; CHECK-NEXT:    v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[9:10], v[13:16]
; CHECK-NEXT:    v_add_co_u32 v9, s6, v9, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB4_5
; CHECK-NEXT:  .LBB4_6: ; %Flow34
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
; CHECK-NEXT:    s_cbranch_execz .LBB4_9
; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v5
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v6, s5
; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v3
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB4_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
; CHECK-NEXT:    v_add_co_u32 v7, s5, v7, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, null, -1, v8, s5
; CHECK-NEXT:    v_add_nc_u32_e32 v2, 1, v2
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[7:8]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[0:1], v3
; CHECK-NEXT:    v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB4_8
; CHECK-NEXT:  .LBB4_9: ; %Flow32
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT:    ; implicit-def: $vgpr3_vgpr4
; CHECK-NEXT:    ; implicit-def: $vgpr2
; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT:    ; implicit-def: $vgpr7_vgpr8
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execz .LBB4_2
; CHECK-NEXT:  .LBB4_10: ; %memmove_copy_backwards
; CHECK-NEXT:    s_and_saveexec_b32 s7, s4
; CHECK-NEXT:    s_cbranch_execz .LBB4_13
; CHECK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v9, s4, v3, v0
; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, null, v4, v1, s4
; CHECK-NEXT:    v_add3_u32 v4, v3, v2, -1
; CHECK-NEXT:    v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, null, -1, v10, s4
; CHECK-NEXT:    s_mov_b32 s8, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB4_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    buffer_load_ubyte v11, v4, s[0:3], 0 offen
; CHECK-NEXT:    v_add_co_u32 v7, s4, v7, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, null, -1, v8, s4
; CHECK-NEXT:    v_add_nc_u32_e32 v4, -1, v4
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[7:8]
; CHECK-NEXT:    s_or_b32 s8, s4, s8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_byte v[9:10], v11
; CHECK-NEXT:    v_add_co_u32 v9, s5, v9, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, null, -1, v10, s5
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_cbranch_execnz .LBB4_12
; CHECK-NEXT:  .LBB4_13: ; %Flow38
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB4_16
; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT:    v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT:    s_mov_b32 s7, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB4_15: ; %memmove_bwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_clause 0x3
; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT:    v_add_co_u32 v3, vcc_lo, v5, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v4, null, -1, v6, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v11, vcc_lo, v0, v5
; CHECK-NEXT:    v_add_co_ci_u32_e64 v12, null, v1, v6, vcc_lo
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[3:4]
; CHECK-NEXT:    v_mov_b32_e32 v6, v4
; CHECK-NEXT:    v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT:    v_mov_b32_e32 v5, v3
; CHECK-NEXT:    s_or_b32 s7, s4, s7
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[7:10]
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB4_15
; CHECK-NEXT:  .LBB4_16: ; %Flow36
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false)
  ret void
}

; dst: addrspace(1), src: addrspace(0).
; Expected loads are flat_load_*, expected stores are global_store_*.
define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
; CHECK-LABEL: memmove_p1_p0:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
; CHECK-NEXT:    v_mov_b32_e32 v9, 0
; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
; CHECK-NEXT:    v_mov_b32_e32 v7, v5
; CHECK-NEXT:    s_mov_b32 s6, exec_lo
; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
; CHECK-NEXT:    s_cbranch_execnz .LBB5_3
; CHECK-NEXT:  ; %bb.1: ; %Flow37
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execnz .LBB5_10
; CHECK-NEXT:  .LBB5_2: ; %Flow38
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT:    s_setpc_b64 s[30:31]
; CHECK-NEXT:  .LBB5_3: ; %memmove_copy_forward
; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB5_6
; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
; CHECK-NEXT:    v_mov_b32_e32 v5, v3
; CHECK-NEXT:    v_mov_b32_e32 v11, v1
; CHECK-NEXT:    v_mov_b32_e32 v13, v7
; CHECK-NEXT:    v_mov_b32_e32 v4, v2
; CHECK-NEXT:    v_mov_b32_e32 v10, v0
; CHECK-NEXT:    v_mov_b32_e32 v12, v6
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB5_5: ; %memmove_fwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flat_load_dwordx4 v[14:17], v[4:5]
; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, null, -1, v13, s5
; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_dwordx4 v[10:11], v[14:17], off
; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v11, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB5_5
; CHECK-NEXT:  .LBB5_6: ; %Flow32
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
; CHECK-NEXT:    s_cbranch_execz .LBB5_9
; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v7, s5
; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, v3, v7, s5
; CHECK-NEXT:    s_mov_b32 s9, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB5_8: ; %memmove_fwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flat_load_ubyte v4, v[2:3]
; CHECK-NEXT:    v_add_co_u32 v8, s5, v8, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v9, s5
; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, s5
; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[8:9]
; CHECK-NEXT:    s_or_b32 s9, s5, s9
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_byte v[0:1], v4, off
; CHECK-NEXT:    v_add_co_u32 v0, s6, v0, 1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, s6
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT:    s_cbranch_execnz .LBB5_8
; CHECK-NEXT:  .LBB5_9: ; %Flow30
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
; CHECK-NEXT:    ; implicit-def: $vgpr8_vgpr9
; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
; CHECK-NEXT:    s_cbranch_execz .LBB5_2
; CHECK-NEXT:  .LBB5_10: ; %memmove_copy_backwards
; CHECK-NEXT:    s_and_saveexec_b32 s7, s4
; CHECK-NEXT:    s_cbranch_execz .LBB5_13
; CHECK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v10, s4, v4, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v5, s4
; CHECK-NEXT:    s_mov_b32 s8, 0
; CHECK-NEXT:    v_add_co_u32 v4, s4, v0, v10
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, v1, v11, s4
; CHECK-NEXT:    v_add_co_u32 v10, s4, v2, v10
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, v3, v11, s4
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB5_12: ; %memmove_bwd_residual_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flat_load_ubyte v12, v[10:11]
; CHECK-NEXT:    v_add_co_u32 v8, s4, v8, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v9, null, -1, v9, s4
; CHECK-NEXT:    v_add_co_u32 v10, s4, v10, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, -1, v11, s4
; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[8:9]
; CHECK-NEXT:    s_or_b32 s8, s4, s8
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    global_store_byte v[4:5], v12, off
; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, -1
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, -1, v5, s5
; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT:    s_cbranch_execnz .LBB5_12
; CHECK-NEXT:  .LBB5_13: ; %Flow36
; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT:    s_cbranch_execz .LBB5_16
; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
; CHECK-NEXT:    s_mov_b32 s7, 0
; CHECK-NEXT:    .p2align 6
; CHECK-NEXT:  .LBB5_15: ; %memmove_bwd_main_loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, null, v3, v7,
vcc_lo ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[4:5] ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] ; CHECK-NEXT: v_mov_b32_e32 v7, v5 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB5_15 ; CHECK-NEXT: .LBB5_16: ; %Flow34 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p1_p1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 ; CHECK-NEXT: v_mov_b32_e32 v9, 0 ; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 ; CHECK-NEXT: v_mov_b32_e32 v7, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB6_3 ; CHECK-NEXT: ; %bb.1: ; %Flow41 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB6_10 ; CHECK-NEXT: .LBB6_2: ; %Flow42 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB6_3: ; %memmove_copy_forward ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB6_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 
v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v13, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 ; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off ; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, s5 ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off ; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB6_5 ; CHECK-NEXT: .LBB6_6: ; %Flow36 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB6_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off ; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[0:1], v4, off ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, 
null, 0, v1, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB6_8 ; CHECK-NEXT: .LBB6_9: ; %Flow34 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB6_2 ; CHECK-NEXT: .LBB6_10: ; %memmove_copy_backwards ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB6_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader ; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, s4 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off ; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[4:5], v12, off ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB6_12 ; CHECK-NEXT: .LBB6_13: ; %Flow40 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB6_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 ; CHECK-NEXT: 
v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB6_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 ; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] ; CHECK-NEXT: v_mov_b32_e32 v7, v5 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB6_15 ; CHECK-NEXT: .LBB6_16: ; %Flow38 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p1_p3: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB7_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .LBB7_2: ; 
%loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[10:13], v9 ; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v15, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB7_2 ; CHECK-NEXT: .LBB7_3: ; %Flow9 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB7_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo ; CHECK-NEXT: .LBB7_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v7, v2 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_byte v[3:4], v7, off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB7_5 ; CHECK-NEXT: ; %bb.6: ; %Flow ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: .LBB7_7: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) 
noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p1_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v8, 15, v4 ; CHECK-NEXT: v_mov_b32_e32 v9, 0 ; CHECK-NEXT: v_and_b32_e32 v6, -16, v4 ; CHECK-NEXT: v_mov_b32_e32 v7, v5 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9] ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB8_3 ; CHECK-NEXT: ; %bb.1: ; %Flow38 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB8_10 ; CHECK-NEXT: .LBB8_2: ; %Flow39 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB8_3: ; %memmove_copy_forward ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB8_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v11, v1 ; CHECK-NEXT: v_mov_b32_e32 v13, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v10, v0 ; CHECK-NEXT: v_mov_b32_e32 v12, v6 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_dwordx4 v[14:17], v[4:5], off ; CHECK-NEXT: v_add_co_u32 v12, s5, v12, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, s5 ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[12:13] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[10:11], v[14:17], off ; CHECK-NEXT: v_add_co_u32 v10, s6, v10, 16 ; CHECK-NEXT: 
v_add_co_ci_u32_e64 v11, null, 0, v11, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB8_5 ; CHECK-NEXT: .LBB8_6: ; %Flow33 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB8_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v7, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s5 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v4, v[2:3], off ; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s5 ; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9] ; CHECK-NEXT: s_or_b32 s9, s5, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[0:1], v4, off ; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB8_8 ; CHECK-NEXT: .LBB8_9: ; %Flow31 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB8_2 ; CHECK-NEXT: .LBB8_10: ; %memmove_copy_backwards ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB8_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader ; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, s4 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: v_add_co_u32 
v4, s4, v0, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v11, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, v3, v11, s4 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: global_load_ubyte v12, v[10:11], off ; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v9, s4 ; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v11, s4 ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9] ; CHECK-NEXT: s_or_b32 s8, s4, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[4:5], v12, off ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB8_12 ; CHECK-NEXT: .LBB8_13: ; %Flow37 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB8_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB8_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, v3, v7, vcc_lo ; CHECK-NEXT: v_add_co_u32 v12, s4, v0, v6 ; CHECK-NEXT: v_add_co_ci_u32_e64 v13, null, v1, v7, s4 ; CHECK-NEXT: global_load_dwordx4 v[8:11], v[4:5], off ; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v6, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v7, vcc_lo ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5] ; CHECK-NEXT: v_mov_b32_e32 v7, v5 ; CHECK-NEXT: v_mov_b32_e32 v6, v4 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, 
s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[12:13], v[8:11], off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB8_15 ; CHECK-NEXT: .LBB8_16: ; %Flow35 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p1_p5: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB9_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB9_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 ; CHECK-NEXT: buffer_load_dword v10, v9, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v11, v9, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v12, v9, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v13, v9, s[0:3], 0 offen offset:12 ; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v15, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off ; CHECK-NEXT: s_andn2_b32 exec_lo, 
exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB9_2 ; CHECK-NEXT: .LBB9_3: ; %Flow9 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB9_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc_lo ; CHECK-NEXT: .LBB9_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_byte v[3:4], v7, off ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB9_5 ; CHECK-NEXT: ; %bb.6: ; %Flow ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: .LBB9_7: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p3_p0: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base ; CHECK-NEXT: v_and_b32_e32 v7, -16, 
v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6] ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo ; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc_lo ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[7:8] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[1:2], v[9:10] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB10_3 ; CHECK-NEXT: ; %bb.1: ; %Flow39 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB10_10 ; CHECK-NEXT: .LBB10_2: ; %Flow40 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB10_3: ; %memmove_copy_forward ; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB10_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v2 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v9, v1 ; CHECK-NEXT: v_mov_b32_e32 v11, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10] ; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, -1, v12, s5 ; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12] ; CHECK-NEXT: s_or_b32 s9, s6, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: ds_write_b128 v4, v[13:16] ; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB10_5 ; CHECK-NEXT: .LBB10_6: ; %Flow34 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB10_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: 
s_mov_b32 s9, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3 ; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v8, s5 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB10_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v2, v[0:1] ; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s5 ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6] ; CHECK-NEXT: s_or_b32 s9, s6, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: ds_write_b8 v3, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB10_8 ; CHECK-NEXT: .LBB10_9: ; %Flow32 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB10_2 ; CHECK-NEXT: .LBB10_10: ; %memmove_copy_backwards ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB10_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB10_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v11, v[9:10] ; CHECK-NEXT: v_add_co_u32 v5, s4, v5, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s4 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 ; 
CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6] ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: ds_write_b8 v4, v11 ; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB10_12 ; CHECK-NEXT: .LBB10_13: ; %Flow38 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB10_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, -1, v2, vcc_lo ; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v2, v8, vcc_lo ; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v8, vcc_lo ; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4] ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8] ; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[3:6] ; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB10_15 ; CHECK-NEXT: .LBB10_16: ; %Flow36 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p3_p1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB11_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .LBB11_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] ; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v9, v[10:13] ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB11_2 ; CHECK-NEXT: .LBB11_3: ; %Flow9 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB11_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo ; CHECK-NEXT: .LBB11_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] ; CHECK-NEXT: global_load_ubyte v3, v[3:4], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, 
s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b8 v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB11_5 ; CHECK-NEXT: ; %bb.6: ; %Flow ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: .LBB11_7: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p3_p3: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v5, 0 ; CHECK-NEXT: v_and_b32_e32 v4, 15, v2 ; CHECK-NEXT: v_and_b32_e32 v6, -16, v2 ; CHECK-NEXT: v_mov_b32_e32 v7, v3 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5] ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB12_3 ; CHECK-NEXT: ; %bb.1: ; %Flow46 ; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB12_10 ; CHECK-NEXT: .LBB12_2: ; %Flow47 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB12_3: ; %memmove_copy_forward ; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB12_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v3, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[9:12], v3 ; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; 
CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_write_b128 v8, v[9:12] ; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB12_5 ; CHECK-NEXT: .LBB12_6: ; %Flow41 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB12_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader ; CHECK-NEXT: v_and_b32_e32 v2, -16, v2 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; CHECK-NEXT: .LBB12_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v2, v1 ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5 ; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5] ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_write_b8 v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB12_8 ; CHECK-NEXT: .LBB12_9: ; %Flow39 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6 ; CHECK-NEXT: s_cbranch_execz .LBB12_2 ; CHECK-NEXT: .LBB12_10: ; %memmove_copy_backwards ; CHECK-NEXT: s_and_saveexec_b32 s6, s4 ; CHECK-NEXT: s_cbranch_execz .LBB12_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader ; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v6, v0, v7 ; CHECK-NEXT: v_add_nc_u32_e32 v7, v1, v7 ; CHECK-NEXT: .LBB12_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v8, v7 ; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v7 ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[4:5] ; CHECK-NEXT: s_or_b32 s7, s4, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_write_b8 v6, v8 ; CHECK-NEXT: v_add_nc_u32_e32 v6, -1, v6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB12_12 ; CHECK-NEXT: .LBB12_13: ; %Flow45 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB12_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader ; CHECK-NEXT: v_and_b32_e32 v5, -16, v2 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4 ; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5 ; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; CHECK-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v3, vcc_lo ; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[5:8], v4 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4 ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_write_b128 v2, v[5:8] ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB12_15 ; CHECK-NEXT: .LBB12_16: ; %Flow43 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr 
addrspace(4) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p3_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB13_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .LBB13_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] ; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v9, v[10:13] ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB13_2 ; CHECK-NEXT: .LBB13_3: ; %Flow9 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB13_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo ; CHECK-NEXT: .LBB13_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; 
CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] ; CHECK-NEXT: global_load_ubyte v3, v[3:4], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b8 v0, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB13_5 ; CHECK-NEXT: ; %bb.6: ; %Flow ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: .LBB13_7: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p3_p5: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB14_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 ; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v10, v7, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v11, v7, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v12, v7, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, 
s[4:5], v[2:3] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v8, v[9:12] ; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB14_2 ; CHECK-NEXT: .LBB14_3: ; %Flow14 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB14_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; CHECK-NEXT: .LBB14_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b8 v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB14_5 ; CHECK-NEXT: ; %bb.6: ; %Flow ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: .LBB14_7: ; %Flow12 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p5_p0: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, -1, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6] ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo ; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc_lo ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[7:8] ; CHECK-NEXT: v_cmpx_ge_u64_e64 v[1:2], v[9:10] ; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB15_3 ; CHECK-NEXT: ; %bb.1: ; %Flow39 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB15_10 ; CHECK-NEXT: .LBB15_2: ; %Flow40 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB15_3: ; %memmove_copy_forward ; CHECK-NEXT: s_and_saveexec_b32 s6, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB15_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v10, v2 ; CHECK-NEXT: v_mov_b32_e32 v12, v8 ; CHECK-NEXT: v_mov_b32_e32 v9, v1 ; CHECK-NEXT: v_mov_b32_e32 v11, v7 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB15_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10] ; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v12, null, -1, v12, s5 ; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_store_dword v16, v4, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v15, v4, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_store_dword v14, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4 ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; 
CHECK-NEXT: s_cbranch_execnz .LBB15_5 ; CHECK-NEXT: .LBB15_6: ; %Flow34 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_and_saveexec_b32 s8, s4 ; CHECK-NEXT: s_cbranch_execz .LBB15_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3 ; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v8, s5 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB15_8: ; %memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v2, v[0:1] ; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s5 ; CHECK-NEXT: v_add_co_u32 v0, s5, v0, 1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s5 ; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[5:6] ; CHECK-NEXT: s_or_b32 s9, s6, s9 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v3, 1, v3 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9 ; CHECK-NEXT: s_cbranch_execnz .LBB15_8 ; CHECK-NEXT: .LBB15_9: ; %Flow32 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 ; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: ; implicit-def: $vgpr1_vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7 ; CHECK-NEXT: s_cbranch_execz .LBB15_2 ; CHECK-NEXT: .LBB15_10: ; %memmove_copy_backwards ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB15_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader ; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: .p2align 6 ; 
CHECK-NEXT: .LBB15_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flat_load_ubyte v11, v[9:10] ; CHECK-NEXT: v_add_co_u32 v5, s4, v5, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v6, s4 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6] ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_store_byte v11, v4, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB15_12 ; CHECK-NEXT: .LBB15_13: ; %Flow38 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB15_16 ; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, -1, v2, vcc_lo ; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB15_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, v2, v8, vcc_lo ; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v8, vcc_lo ; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8] ; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0 ; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_cbranch_execnz .LBB15_15 ; 
CHECK-NEXT: .LBB15_16: ; %Flow36 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p5_p1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB16_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB16_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] ; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB16_2 ; CHECK-NEXT: .LBB16_3: ; %Flow9 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; 
CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB16_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo ; CHECK-NEXT: .LBB16_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] ; CHECK-NEXT: global_load_ubyte v3, v[3:4], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB16_5 ; CHECK-NEXT: ; %bb.6: ; %Flow ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: .LBB16_7: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p5_p3: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v4, v2 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v4 ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3] ; CHECK-NEXT: s_cbranch_execz .LBB17_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v7, v1 ; 
CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_b128 v[9:12], v7 ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB17_2 ; CHECK-NEXT: .LBB17_3: ; %Flow14 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execz .LBB17_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: v_and_b32_e32 v2, -16, v4 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; CHECK-NEXT: .LBB17_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ds_read_u8 v2, v1 ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB17_5 ; CHECK-NEXT: ; %bb.6: ; %Flow ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: .LBB17_7: ; %Flow12 ; 
CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p5_p4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v7, -16, v3 ; CHECK-NEXT: v_mov_b32_e32 v8, v4 ; CHECK-NEXT: v_and_b32_e32 v5, 15, v3 ; CHECK-NEXT: v_mov_b32_e32 v6, 0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[7:8] ; CHECK-NEXT: s_cbranch_execz .LBB18_3 ; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader ; CHECK-NEXT: v_mov_b32_e32 v9, v0 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB18_2: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v11, null, s5, v2, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8] ; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9 ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB18_2 ; CHECK-NEXT: .LBB18_3: ; %Flow9 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[5:6] ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; 
CHECK-NEXT: s_cbranch_execz .LBB18_7 ; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: v_and_b32_e32 v3, -16, v3 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; CHECK-NEXT: v_add_co_ci_u32_e64 v2, null, v2, v4, vcc_lo ; CHECK-NEXT: .LBB18_5: ; %loop-memcpy-residual ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v2, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6] ; CHECK-NEXT: global_load_ubyte v3, v[3:4], off ; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB18_5 ; CHECK-NEXT: ; %bb.6: ; %Flow ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: .LBB18_7: ; %Flow7 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src, i64 %sz) { ; CHECK-LABEL: memmove_p5_p5: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v5, 0 ; CHECK-NEXT: v_and_b32_e32 v4, 15, v2 ; CHECK-NEXT: v_and_b32_e32 v6, -16, v2 ; CHECK-NEXT: v_mov_b32_e32 v7, v3 ; CHECK-NEXT: s_mov_b32 s6, exec_lo ; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5] ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; CHECK-NEXT: v_cmpx_ge_u32_e64 v1, v0 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB19_3 ; CHECK-NEXT: ; %bb.1: ; %Flow46 ; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6 ; 
CHECK-NEXT: s_cbranch_execnz .LBB19_10 ; CHECK-NEXT: .LBB19_2: ; %Flow47 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB19_3: ; %memmove_copy_forward ; CHECK-NEXT: s_and_saveexec_b32 s7, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB19_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader ; CHECK-NEXT: v_mov_b32_e32 v3, v1 ; CHECK-NEXT: v_mov_b32_e32 v8, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB19_5: ; %memmove_fwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 ; CHECK-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen ; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v7, s5 ; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3 ; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7] ; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8 ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB19_5 ; CHECK-NEXT: .LBB19_6: ; %Flow41 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB19_9 ; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader ; CHECK-NEXT: v_and_b32_e32 v2, -16, v2 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; CHECK-NEXT: .LBB19_8: ; 
%memmove_fwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen ; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s5 ; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[4:5] ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execnz .LBB19_8 ; CHECK-NEXT: .LBB19_9: ; %Flow39 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: s_andn2_saveexec_b32 s5, s6 ; CHECK-NEXT: s_cbranch_execz .LBB19_2 ; CHECK-NEXT: .LBB19_10: ; %memmove_copy_backwards ; CHECK-NEXT: s_and_saveexec_b32 s6, s4 ; CHECK-NEXT: s_cbranch_execz .LBB19_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader ; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v2 ; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v6, v0, v7 ; CHECK-NEXT: v_add_nc_u32_e32 v7, v1, v7 ; CHECK-NEXT: .LBB19_12: ; %memmove_bwd_residual_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: buffer_load_ubyte v8, v7, s[0:3], 0 offen ; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, s4 ; CHECK-NEXT: v_add_nc_u32_e32 v7, -1, v7 ; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[4:5] ; CHECK-NEXT: s_or_b32 s7, s4, s7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_byte v8, v6, s[0:3], 0 offen ; CHECK-NEXT: v_add_nc_u32_e32 v6, -1, v6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7 ; CHECK-NEXT: s_cbranch_execnz .LBB19_12 ; CHECK-NEXT: .LBB19_13: ; %Flow45 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB19_16 ; 
CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader ; CHECK-NEXT: v_and_b32_e32 v5, -16, v2 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4 ; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5 ; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4 ; CHECK-NEXT: v_sub_co_ci_u32_e64 v1, null, 0, v3, vcc_lo ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB19_15: ; %memmove_bwd_main_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3 ; CHECK-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:12 ; CHECK-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v4, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v7, v4, s[0:3], 0 offen ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4 ; CHECK-NEXT: s_waitcnt vmcnt(3) ; CHECK-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen ; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB19_15 ; CHECK-NEXT: .LBB19_16: ; %Flow43 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 %sz, i1 false) ret void } declare void @llvm.memmove.p0.p0.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0 declare void 
@llvm.memmove.p0.p1.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
; Intrinsic declarations, continued. In each name @llvm.memmove.p<D>.p<S>.i64,
; <D> is the address space of the first (destination, writeonly) pointer,
; <S> is the address space of the second (source, readonly) pointer, and
; i64 is the type of the third (size) operand. The trailing i1 operand is
; marked immarg.
;
; Destination address space 0, remaining source address spaces 3, 4, 5.
declare void @llvm.memmove.p0.p3.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p0.p4.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0

; Destination address space 1, source address spaces 0, 1, 3, 4, 5.
declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p4.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0

; Destination address space 3, source address spaces 0, 1, 3, 4, 5.
declare void @llvm.memmove.p3.p0.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p1.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p3.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p4.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p3.p5.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0

; Destination address space 5, source address spaces 0, 1, 3, 4, 5.
declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p3.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p4.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #0

; Attribute group #0 is the one attached to each llvm.memmove declaration
; in this list; its memory(...) clause limits the declared memory effects to
; reads and writes of argument memory.
attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }