; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s

; Every function below issues a series of (async LDS load, asyncmark) pairs and
; then waits on a marker index before reading LDS. The directives in front of
; each function pin the s_waitcnt that must directly follow the wait_asyncmark
; annotation in the generated assembly.

; Case 1: a single trip through the loop body already goes past
; MaxAsyncMarkers.
;   before the loop : 5 markers
;   loop body       : 18 markers
; CHECK-LABEL: test_loop_exceeds_max_first_iteration:
; CHECK: ; wait_asyncmark(3)
; CHECK-NEXT: s_waitcnt vmcnt(3)
define void @test_loop_exceeds_max_first_iteration(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) {
entry:
  ; Five load/marker pairs ahead of the loop.
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  br label %loop_header

loop_header:
  ; Counted loop: keep iterating while the induction variable is below %n.
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop_body ]
  %iv.next = add i32 %iv, 1
  %keep.going = icmp slt i32 %iv, %n
  br i1 %keep.going, label %loop_body, label %exit

loop_body:
  ; Eighteen load/marker pairs per iteration.
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  br label %loop_header

exit:
  ; Wait on marker 3, then read LDS and store the value to %out.
  call void @llvm.amdgcn.wait.asyncmark(i16 3)
  %loaded = load i32, ptr addrspace(3) %lds
  store i32 %loaded, ptr addrspace(1) %out
  ret void
}

; Case 2: a single trip through the loop body stays within MaxAsyncMarkers.
;   before the loop : 5 markers
;   loop body       : 5 markers
; CHECK-LABEL: test_loop_needs_more_iterations:
; CHECK: ; wait_asyncmark(3)
; CHECK-NEXT: s_waitcnt vmcnt(3)
define void @test_loop_needs_more_iterations(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) {
entry:
  ; Five load/marker pairs ahead of the loop.
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  br label %loop_header

loop_header:
  ; Counted loop: keep iterating while the induction variable is below %n.
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop_body ]
  %iv.next = add i32 %iv, 1
  %keep.going = icmp slt i32 %iv, %n
  br i1 %keep.going, label %loop_body, label %exit

loop_body:
  ; Five load/marker pairs per iteration.
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  br label %loop_header

exit:
  ; Wait on marker 3, then read LDS and store the value to %out.
  call void @llvm.amdgcn.wait.asyncmark(i16 3)
  %loaded = load i32, ptr addrspace(3) %lds
  store i32 %loaded, ptr addrspace(1) %out
  ret void
}

; Case 3: two predecessors with different marker counts (5 and 18) meet at a
; join block; the merge goes past MaxAsyncMarkers.
; CHECK-LABEL: max_when_merged:
; CHECK: ; wait_asyncmark(17)
; CHECK-NEXT: s_waitcnt vmcnt(15)
define void @max_when_merged(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) {
entry:
  %take.then = icmp slt i32 0, %n
  br i1 %take.then, label %then, label %else

then:
  ; Five load/marker pairs on this path.
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  br label %endif

else:
  ; Eighteen load/marker pairs on this path.
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  br label %endif

endif:
  ; Wait on marker 17, then read LDS and store the value to %out.
  call void @llvm.amdgcn.wait.asyncmark(i16 17)
  %loaded = load i32, ptr addrspace(3) %lds
  store i32 %loaded, ptr addrspace(1) %out
  ret void
}

; Case 4: a single basic block goes past MaxAsyncMarkers; the expected vmcnt
; still equals the requested marker index (17).
; CHECK-LABEL: no_max_in_straightline:
; CHECK: ; wait_asyncmark(17)
; CHECK-NEXT: s_waitcnt vmcnt(17)
define void @no_max_in_straightline(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) {
  ; Eighteen load/marker pairs with no control flow in between.
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()
  call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.asyncmark()

  ; Wait on marker 17, then read LDS and store the value to %out.
  call void @llvm.amdgcn.wait.asyncmark(i16 17)
  %loaded = load i32, ptr addrspace(3) %lds
  store i32 %loaded, ptr addrspace(1) %out
  ret void
}