author    Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>  2019-08-01 22:18:56 +0000
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>  2019-08-01 22:18:56 +0000
commit    eee9312a85c020f90beb186fe95835e36ae8ce1c
tree      7d31ae2f30d7052c4e0bea84cf4507ca9fb0ec8d
parent    ac7e5788ca03dac225fd92ac937593454bd36d21
Relax load store vectorizer pointer strip checks
The previous change, which fixed a crash in the vectorizer, introduced
performance regressions. The condition that preserved the pointer address
space during the search is too tight; we only need the pointer sizes to match.

Differential Revision: https://reviews.llvm.org/D65600

llvm-svn: 367624
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp                         |  5
-rw-r--r--  llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll | 51
2 files changed, 47 insertions(+), 9 deletions(-)
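The heart of the change is easiest to read outside the diff context. The sketch below is a simplified, free-standing restatement of the relaxed guard in Vectorizer::areConsecutivePointers; the helper name basesHaveMatchingPointerSize is invented here for illustration and is not part of the pass. It uses only the DataLayout and Value calls that appear in the diff: after stripping in-bounds constant offsets, the two bases merely have to match the original pointer width, instead of having to stay in the original address space.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Sketch of the relaxed check: accept the pointer pair as long as the
// stripped bases have the same store size as the original pointer type.
static bool basesHaveMatchingPointerSize(const DataLayout &DL,
                                         Value *PtrA, Value *PtrB) {
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);

  // Walk through in-bounds constant GEPs (and casts) to the underlying
  // bases, accumulating the constant byte offsets along the way.
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  // Old guard: both bases had to remain in PtrA's original address space.
  // New guard: their pointer sizes only have to match the original width.
  // Equal-width address spaces (e.g. p0 and p1 in the test's data layout)
  // are still accepted, while a 32-bit p5 base mixed with a 64-bit pointer
  // is rejected up front, which preserves the earlier crash fix.
  return DL.getTypeStoreSizeInBits(PtrA->getType()) == PtrBitWidth &&
         DL.getTypeStoreSizeInBits(PtrB->getType()) == PtrBitWidth;
}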
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index f799a45..19afe41 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -339,14 +339,13 @@ bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
const APInt &PtrDelta,
unsigned Depth) const {
unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
- unsigned PtrAS = PtrA->getType()->getPointerAddressSpace();
APInt OffsetA(PtrBitWidth, 0);
APInt OffsetB(PtrBitWidth, 0);
PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
- if (PtrA->getType()->getPointerAddressSpace() != PtrAS ||
- PtrB->getType()->getPointerAddressSpace() != PtrAS)
+ if (DL.getTypeStoreSizeInBits(PtrA->getType()) != PtrBitWidth ||
+ DL.getTypeStoreSizeInBits(PtrB->getType()) != PtrBitWidth)
return false;
APInt OffsetDelta = OffsetB - OffsetA;
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
index 197a439..1e9ffdb 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
@@ -1,18 +1,57 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S < %s | FileCheck %s
+; RUN: opt -load-store-vectorizer -S < %s | FileCheck %s
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32"
+target datalayout = "e-p:64:64-p1:64:64-p5:32:32"
-; CHECK-LABEL: @test
+; Size mismatch between the 32-bit pointer in address space 5 and the 64-bit
+; pointer in address space 0 it was cast to caused the test below to crash.
+; The p5:32:32 portion of the data layout is critical for the test.
+
+; CHECK-LABEL: @cast_to_ptr
; CHECK: store i32* undef, i32** %tmp9, align 8
; CHECK: store i32* undef, i32** %tmp7, align 8
-define amdgpu_kernel void @test() {
+define void @cast_to_ptr() {
entry:
- %a10.ascast.i = addrspacecast i32* addrspace(5)* null to i32**
+ %ascast = addrspacecast i32* addrspace(5)* null to i32**
%tmp4 = icmp eq i32 undef, 0
%tmp6 = select i1 false, i32** undef, i32** undef
%tmp7 = select i1 %tmp4, i32** null, i32** %tmp6
- %tmp9 = select i1 %tmp4, i32** %a10.ascast.i, i32** null
+ %tmp9 = select i1 %tmp4, i32** %ascast, i32** null
store i32* undef, i32** %tmp9, align 8
store i32* undef, i32** %tmp7, align 8
unreachable
}
+
+; CHECK-LABEL: @cast_to_cast
+; CHECK: %tmp4 = load i32*, i32** %tmp1, align 8
+; CHECK: %tmp5 = load i32*, i32** %tmp3, align 8
+define void @cast_to_cast() {
+entry:
+ %a.ascast = addrspacecast i32* addrspace(5)* undef to i32**
+ %b.ascast = addrspacecast i32* addrspace(5)* null to i32**
+ %tmp1 = select i1 false, i32** %a.ascast, i32** undef
+ %tmp3 = select i1 false, i32** %b.ascast, i32** undef
+ %tmp4 = load i32*, i32** %tmp1, align 8
+ %tmp5 = load i32*, i32** %tmp3, align 8
+ unreachable
+}
+
+; CHECK-LABEL: @all_to_cast
+; CHECK: load <4 x float>
+define void @all_to_cast(i8* nocapture readonly align 16 dereferenceable(16) %alloc1) {
+entry:
+ %alloc16 = addrspacecast i8* %alloc1 to i8 addrspace(1)*
+ %tmp = bitcast i8 addrspace(1)* %alloc16 to float addrspace(1)*
+ %tmp1 = load float, float addrspace(1)* %tmp, align 16, !invariant.load !0
+ %tmp6 = getelementptr inbounds i8, i8 addrspace(1)* %alloc16, i64 4
+ %tmp7 = bitcast i8 addrspace(1)* %tmp6 to float addrspace(1)*
+ %tmp8 = load float, float addrspace(1)* %tmp7, align 4, !invariant.load !0
+ %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %alloc16, i64 8
+ %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)*
+ %tmp17 = load float, float addrspace(1)* %tmp16, align 8, !invariant.load !0
+ %tmp24 = getelementptr inbounds i8, i8 addrspace(1)* %alloc16, i64 12
+ %tmp25 = bitcast i8 addrspace(1)* %tmp24 to float addrspace(1)*
+ %tmp26 = load float, float addrspace(1)* %tmp25, align 4, !invariant.load !0
+ ret void
+}
+
+!0 = !{}