# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s

---
# Test: DS prefetch with flush points in a single-block loop.
#
# The preheader (bb.0) issues three DS loads whose results are live into the
# loop (bb.1). Each loop iteration issues four more DS loads:
#   * offsets 64 and 80 - a result of the second one ($vgpr24/$vgpr25) is read
#     later in the same iteration, which creates a flush point;
#   * offsets 96 and 112 - these redefine $vgpr28 and $vgpr32, which are only
#     read near the top of the next iteration, so they are still unflushed at
#     the backedge (prefetches).
#
# Expected: s_wait_dscnt 0 in preheader, non-zero waits in loop.
name: ds_prefetch_flushed
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
body: |
  ; CHECK-LABEL: name: ds_prefetch_flushed
  ; CHECK: bb.0:
  ; CHECK-NEXT: successors: %bb.1(0x80000000)
  ; CHECK-NEXT: liveins: $sgpr0, $vgpr0
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 0
  ; CHECK-NEXT: S_BRANCH %bb.1
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.1:
  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 1
  ; CHECK-NEXT: $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec
  ; CHECK-NEXT: S_BARRIER
  ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 2
  ; CHECK-NEXT: $vgpr52 = V_ADD_F32_e32 $vgpr32, $vgpr32, implicit $mode, implicit $exec
  ; CHECK-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT: S_WAIT_DSCNT 2
  ; CHECK-NEXT: $vgpr53 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec
  ; CHECK-NEXT: $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
  ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
  ; CHECK-NEXT: S_BRANCH %bb.2
  ; CHECK-NEXT: {{ $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT: S_ENDPGM 0
  bb.0:
    successors: %bb.1
    liveins: $sgpr0, $vgpr0

    ; Preheader: three DS loads whose results are all read inside the loop.
    ; The expected output waits for all of them (S_WAIT_DSCNT 0) before the
    ; branch into the loop.
    $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
    $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
    $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2
    liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32

    ; Use preheader value: $vgpr10/$vgpr11 are only ever defined in bb.0, and
    ; the expected output needs no wait in front of this use.
    $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec

    ; Use preheader and prefetched value: on the first iteration $vgpr28 comes
    ; from the bb.0 load, on later iterations from the offset-96 load issued at
    ; the end of the previous iteration. Expected wait here: S_WAIT_DSCNT 1.
    $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec

    ; Barrier
    S_BARRIER

    ; First two will be "flushed" by the same-iteration use of $vgpr24/$vgpr25
    ; below. Note that $vgpr20-$vgpr23 is never read in this function.
    $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128 $vgpr0, 64, 0, implicit $m0, implicit $exec
    $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128 $vgpr0, 80, 0, implicit $m0, implicit $exec

    ; Use preheader and prefetched value: $vgpr32 comes from the bb.0 load on
    ; the first iteration and from the offset-112 load of the previous
    ; iteration afterwards. Expected wait here: S_WAIT_DSCNT 2.
    $vgpr52 = V_ADD_F32_e32 $vgpr32, $vgpr32, implicit $mode, implicit $exec

    ; These two remain unflushed - true prefetches for next iteration. They
    ; redefine $vgpr28 and $vgpr32, which are not read again before the
    ; backedge.
    $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128 $vgpr0, 96, 0, implicit $m0, implicit $exec
    $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128 $vgpr0, 112, 0, implicit $m0, implicit $exec

    ; Use vgpr24 - creates flush point, flushes loads 1-2. Expected wait here:
    ; S_WAIT_DSCNT 2, with the two prefetch loads above issued after the load
    ; being waited on.
    $vgpr53 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec

    ; Loop control: decrement the counter in $sgpr0 and branch on SCC.
    $sgpr0 = S_ADD_I32 $sgpr0, -1, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0
...