diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2020-01-31 19:39:02 -0500 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2020-02-03 14:26:01 -0800 |
| commit | 7d3aace3f52f6b3f87aac432aa41ae1cdeb348eb (patch) | |
| tree | a38b231f7e7ff5b131d45817af41e1e1a8286f2f | |
| parent | f8c4d70d11388ddbf3ccc63ca4ea35d09a987d41 (diff) | |
| download | llvm-7d3aace3f52f6b3f87aac432aa41ae1cdeb348eb.zip llvm-7d3aace3f52f6b3f87aac432aa41ae1cdeb348eb.tar.gz llvm-7d3aace3f52f6b3f87aac432aa41ae1cdeb348eb.tar.bz2 | |
AMDGPU: Add flag to control mem intrinsic expansion
GlobalISel doesn't implement the expansion for these yet, so add a
flag to force expanding these so it's possible to avoid these for a
while.
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 12 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll | 123 |
2 files changed, 133 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 1503296..54c15e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -22,7 +22,15 @@ using namespace llvm; namespace { -const unsigned MaxStaticSize = 1024; +static int MaxStaticSize; + +static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt( + "amdgpu-mem-intrinsic-expand-size", + cl::desc("Set minimum mem intrinsic size to expand in IR"), + cl::location(MaxStaticSize), + cl::init(1024), + cl::Hidden); + class AMDGPULowerIntrinsics : public ModulePass { private: @@ -57,7 +65,7 @@ INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false, // require splitting based on alignment) static bool shouldExpandOperationWithSize(Value *Size) { ConstantInt *CI = dyn_cast<ConstantInt>(Size); - return !CI || (CI->getZExtValue() > MaxStaticSize); + return !CI || (CI->getSExtValue() > MaxStaticSize); } bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) { diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll new file mode 100644 index 0000000..a83715d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s +; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s +; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s +; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s + +; Test the -amdgpu-mem-intrinsic-expand-size flag works. + +; Make sure we can always eliminate the intrinsic, even at 0. +define amdgpu_kernel void @memset_size_0(i8 addrspace(1)* %dst, i8 %val) { +; OPT8-LABEL: @memset_size_0( +; OPT8-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false) +; OPT8-NEXT: ret void +; +; OPT4-LABEL: @memset_size_0( +; OPT4-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false) +; OPT4-NEXT: ret void +; +; OPT0-LABEL: @memset_size_0( +; OPT0-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false) +; OPT0-NEXT: ret void +; +; OPT_NEG-LABEL: @memset_size_0( +; OPT_NEG-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] +; OPT_NEG: loadstoreloop: +; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] +; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]] +; OPT_NEG-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]] +; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0 +; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] +; OPT_NEG: split: +; OPT_NEG-NEXT: ret void +; + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 0, i1 false) + ret void +} + +define amdgpu_kernel void @memset_size_4(i8 addrspace(1)* %dst, i8 %val) { +; OPT8-LABEL: @memset_size_4( +; OPT8-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false) +; OPT8-NEXT: ret void +; +; OPT4-LABEL: @memset_size_4( +; OPT4-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false) +; OPT4-NEXT: ret void +; +; OPT0-LABEL: @memset_size_4( +; OPT0-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] +; OPT0: loadstoreloop: +; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] +; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]] +; OPT0-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]] +; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4 +; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] +; OPT0: split: +; OPT0-NEXT: ret void +; +; OPT_NEG-LABEL: @memset_size_4( +; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] +; OPT_NEG: loadstoreloop: +; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] +; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]] +; OPT_NEG-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]] +; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4 +; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] +; OPT_NEG: split: +; OPT_NEG-NEXT: ret void +; + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 4, i1 false) + ret void +} + +define amdgpu_kernel void @memset_size_8(i8 addrspace(1)* %dst, i8 %val) { +; OPT8-LABEL: @memset_size_8( +; OPT8-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false) +; OPT8-NEXT: ret void +; +; OPT4-LABEL: @memset_size_8( +; OPT4-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] +; OPT4: loadstoreloop: +; OPT4-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] +; OPT4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]] +; OPT4-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]] +; OPT4-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; OPT4-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8 +; OPT4-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] +; OPT4: split: +; OPT4-NEXT: ret void +; +; OPT0-LABEL: @memset_size_8( +; OPT0-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] +; OPT0: loadstoreloop: +; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] +; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]] +; OPT0-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]] +; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8 +; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] +; OPT0: split: +; OPT0-NEXT: ret void +; +; OPT_NEG-LABEL: @memset_size_8( +; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]] +; OPT_NEG: loadstoreloop: +; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] +; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]] +; OPT_NEG-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]] +; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 +; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8 +; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] +; OPT_NEG: split: +; OPT_NEG-NEXT: ret void +; + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 8, i1 false) + ret void +} + +declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture writeonly, i8, i64, i1 immarg) #0 + +attributes #0 = { argmemonly nounwind willreturn writeonly } |
