aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Matt Arsenault <Matthew.Arsenault@amd.com> 2020-01-31 19:39:02 -0500
committer Matt Arsenault <Matthew.Arsenault@amd.com> 2020-02-03 14:26:01 -0800
commit 7d3aace3f52f6b3f87aac432aa41ae1cdeb348eb (patch)
tree a38b231f7e7ff5b131d45817af41e1e1a8286f2f
parent f8c4d70d11388ddbf3ccc63ca4ea35d09a987d41 (diff)
downloadllvm-7d3aace3f52f6b3f87aac432aa41ae1cdeb348eb.zip
llvm-7d3aace3f52f6b3f87aac432aa41ae1cdeb348eb.tar.gz
llvm-7d3aace3f52f6b3f87aac432aa41ae1cdeb348eb.tar.bz2
AMDGPU: Add flag to control mem intrinsic expansion
GlobalISel doesn't implement the expansion for the mem intrinsics yet, so add a flag that forces them to be expanded in IR, making it possible to avoid them in GlobalISel for a while.
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll 123
2 files changed, 133 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 1503296..54c15e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -22,7 +22,15 @@ using namespace llvm;
namespace {
-const unsigned MaxStaticSize = 1024;
+static int MaxStaticSize;
+
+static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
+ "amdgpu-mem-intrinsic-expand-size",
+ cl::desc("Set minimum mem intrinsic size to expand in IR"),
+ cl::location(MaxStaticSize),
+ cl::init(1024),
+ cl::Hidden);
+
class AMDGPULowerIntrinsics : public ModulePass {
private:
@@ -57,7 +65,7 @@ INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
// require splitting based on alignment)
static bool shouldExpandOperationWithSize(Value *Size) {
ConstantInt *CI = dyn_cast<ConstantInt>(Size);
- return !CI || (CI->getZExtValue() > MaxStaticSize);
+ return !CI || (CI->getSExtValue() > MaxStaticSize);
}
bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
new file mode 100644
index 0000000..a83715d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
+; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
+; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
+; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
+
+; Test the -amdgpu-mem-intrinsic-expand-size flag works.
+
+; Make sure we can always eliminate the intrinsic, even at 0.
+define amdgpu_kernel void @memset_size_0(i8 addrspace(1)* %dst, i8 %val) {
+; OPT8-LABEL: @memset_size_0(
+; OPT8-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
+; OPT8-NEXT: ret void
+;
+; OPT4-LABEL: @memset_size_0(
+; OPT4-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
+; OPT4-NEXT: ret void
+;
+; OPT0-LABEL: @memset_size_0(
+; OPT0-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
+; OPT0-NEXT: ret void
+;
+; OPT_NEG-LABEL: @memset_size_0(
+; OPT_NEG-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
+; OPT_NEG: loadstoreloop:
+; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
+; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
+; OPT_NEG-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]]
+; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
+; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
+; OPT_NEG: split:
+; OPT_NEG-NEXT: ret void
+;
+ call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 0, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memset_size_4(i8 addrspace(1)* %dst, i8 %val) {
+; OPT8-LABEL: @memset_size_4(
+; OPT8-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
+; OPT8-NEXT: ret void
+;
+; OPT4-LABEL: @memset_size_4(
+; OPT4-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
+; OPT4-NEXT: ret void
+;
+; OPT0-LABEL: @memset_size_4(
+; OPT0-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
+; OPT0: loadstoreloop:
+; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
+; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
+; OPT0-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]]
+; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4
+; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
+; OPT0: split:
+; OPT0-NEXT: ret void
+;
+; OPT_NEG-LABEL: @memset_size_4(
+; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
+; OPT_NEG: loadstoreloop:
+; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
+; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
+; OPT_NEG-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]]
+; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4
+; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
+; OPT_NEG: split:
+; OPT_NEG-NEXT: ret void
+;
+ call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 4, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memset_size_8(i8 addrspace(1)* %dst, i8 %val) {
+; OPT8-LABEL: @memset_size_8(
+; OPT8-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false)
+; OPT8-NEXT: ret void
+;
+; OPT4-LABEL: @memset_size_8(
+; OPT4-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
+; OPT4: loadstoreloop:
+; OPT4-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
+; OPT4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
+; OPT4-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]]
+; OPT4-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; OPT4-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
+; OPT4-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
+; OPT4: split:
+; OPT4-NEXT: ret void
+;
+; OPT0-LABEL: @memset_size_8(
+; OPT0-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
+; OPT0: loadstoreloop:
+; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
+; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
+; OPT0-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]]
+; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
+; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
+; OPT0: split:
+; OPT0-NEXT: ret void
+;
+; OPT_NEG-LABEL: @memset_size_8(
+; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
+; OPT_NEG: loadstoreloop:
+; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
+; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
+; OPT_NEG-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]]
+; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
+; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
+; OPT_NEG: split:
+; OPT_NEG-NEXT: ret void
+;
+ call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 8, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture writeonly, i8, i64, i1 immarg) #0
+
+attributes #0 = { argmemonly nounwind willreturn writeonly }