author     Tom Stellard <thomas.stellard@amd.com>    2016-11-29 03:41:26 +0000
committer  Tom Stellard <thomas.stellard@amd.com>    2016-11-29 03:41:26 +0000
commit     2064ab4ae69a0fde9b7fb003844f89ab2c441f4a (patch)
tree       57abba27e9710adf4328f4f2e8bf959819cce4e1
parent     572c582019c7b6fdd110217fc074e98656663543 (diff)
Merging r277504:
------------------------------------------------------------------------
r277504 | nhaehnle | 2016-08-02 12:31:14 -0700 (Tue, 02 Aug 2016) | 21 lines

AMDGPU: Stay in WQM for non-intrinsic stores

Summary:
Two types of stores are possible in pixel shaders: stores to memory that
are explicitly requested at the API level, and stores that are an
implementation detail of register spilling or lowering of arrays.

For the first kind of store, we must ensure that helper pixels have no
effect and hence WQM must be disabled. The second kind of store must
always be executed, because the written value may be loaded again in a
way that is relevant for helper pixels as well -- and there are no
externally visible effects anyway.

This is a candidate for the 3.9 release branch.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: arsenm, kzhuravl, llvm-commits

Differential Revision: https://reviews.llvm.org/D22675
------------------------------------------------------------------------

llvm-svn: 288104
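The rule above boils down to a single decision per store: intrinsic (API-level) stores force Exact execution, while scratch/alloca stores may stay in WQM. Below is a minimal, standalone C++ sketch of just that decision -- it is not the actual SIWholeQuadMode pass, and the StoreKind/ExecMode names and the requiredMode() helper are hypothetical, chosen only to mirror the reasoning in the commit message.

// Standalone illustration (hypothetical types, not the LLVM API):
// classify which exec mode a pixel-shader store requires.
#include <cstdio>

enum class StoreKind {
  IntrinsicBuffer,  // API-level store, e.g. llvm.amdgcn.buffer.store.*
  ScratchSpill      // spill / lowered-array store to a private alloca
};

enum class ExecMode {
  Exact,  // helper lanes masked off: s_and_b64 exec, exec, [[LIVE]]
  WQM     // whole quad mode kept:    s_wqm_b64 exec, exec
};

// API-level stores are externally visible, so helper lanes must not execute
// them; scratch stores have no visible effects and their values may later
// feed derivative computations, so they are left in WQM.
static ExecMode requiredMode(StoreKind K) {
  return K == StoreKind::IntrinsicBuffer ? ExecMode::Exact : ExecMode::WQM;
}

int main() {
  std::printf("intrinsic buffer store -> %s\n",
              requiredMode(StoreKind::IntrinsicBuffer) == ExecMode::Exact ? "Exact" : "WQM");
  std::printf("scratch/alloca store   -> %s\n",
              requiredMode(StoreKind::ScratchSpill) == ExecMode::Exact ? "Exact" : "WQM");
}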
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wqm.ll  41
1 file changed, 41 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index b272190..809a7ba 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -374,6 +374,47 @@ break:
ret <4 x float> %c.iv
}
+; Only intrinsic stores need exact execution -- other stores do not have
+; externally visible effects and may require WQM for correctness.
+;
+; CHECK-LABEL: {{^}}test_alloca:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+
+; CHECK: image_sample
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dwordx4
+define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
+entry:
+ %array = alloca [32 x i32], align 4
+
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+
+ %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
+ store volatile i32 %a, i32* %s.gep, align 4
+
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
+
+ %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
+ %c = load i32, i32* %c.gep, align 4
+
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+
+ ret void
+}
+
+
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
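Usage note: the CHECK patterns in this hunk are driven by the RUN lines at the top of wqm.ll, which are outside this diff. A typical invocation of that era looked roughly like the command below; the exact -mcpu value is an assumption, since the RUN lines are not shown here.

  llc -march=amdgcn -mcpu=verde -verify-machineinstrs < llvm/test/CodeGen/AMDGPU/wqm.ll | FileCheck llvm/test/CodeGen/AMDGPU/wqm.ll

The paired s_wqm_b64 / s_and_b64 CHECK lines then pin down where the compiler switches between whole quad mode and exact execution around each store.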