[PPCGCodeGen] Differentiate kernels based on their parent Scop

Summary: Add a sequence number that identifies a ptx_kernel's parent Scop within a function to its name to differentiate it from other kernels produced from the same function, yet different Scops. Kernels produced from different Scops can end up having the same name. Consider a function with 2 Scops and each Scop being able to produce just one kernel. Both of these kernels have the name "kernel_0". This can lead to the wrong kernel being launched when the runtime picks a kernel from its cache based on the name alone. This patch supplements D33985 by differentiating kernels across Scops as well. Previously (even before D33985) while profiling kernels generated through JIT e.g. Julia, [[ https://groups.google.com/d/msg/polly-dev/J1j587H3-Qw/mR-jfL16BgAJ | kernels associated with different functions, and even different SCoPs within a function, would be grouped together due to the common name ]]. This patch prevents this grouping and the kernels are reported separately. Reviewers: grosser, bollu Reviewed By: grosser Subscribers: mehdi_amini, nemanjai, pollydev, kbarton Tags: #polly Differential Revision: https://reviews.llvm.org/D35176 llvm-svn: 307814
author: Singapuram Sanjay Srivallabh <singapuram.sanjay@gmail.com> 2017-07-12 16:46:19 +0000
committer: Singapuram Sanjay Srivallabh <singapuram.sanjay@gmail.com> 2017-07-12 16:46:19 +0000
commit: 1abd9ffa3753fa4942ee415b7d81f655a38d2770 (patch)
tree: 6fed380ddccfb2584cb67c430a8e6cc620365356 /polly
parent: ada11923fa4e1a7de31ac33ce68db7d129e49718 (diff)
download: llvm-1abd9ffa3753fa4942ee415b7d81f655a38d2770.zip
llvm-1abd9ffa3753fa4942ee415b7d81f655a38d2770.tar.gz
llvm-1abd9ffa3753fa4942ee415b7d81f655a38d2770.tar.bz2
10 files changed, 167 insertions, 15 deletions
diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h
index 5898512..11c7b16 100644
--- a/polly/include/polly/ScopInfo.h
+++ b/polly/include/polly/ScopInfo.h
@@ -1616,6 +1616,12 @@ private:
   /// The name of the SCoP (identical to the regions name)
   std::string name;
 
+  /// The ID to be assigned to the next Scop in a function
+  static int NextScopID;
+
+  /// The name of the function currently under consideration
+  static std::string CurrentFunc;
+
   // Access functions of the SCoP.
   //
   // This owns all the MemoryAccess objects of the Scop created in this pass.
@@ -1808,6 +1814,12 @@ private:
   /// The smallest statement index not yet assigned.
   long StmtIdx = 0;
 
+  /// A number that uniquely represents a Scop within its function
+  const int ID;
+
+  /// Return the ID for a new Scop within a function
+  static int getNextID(std::string ParentFunc);
+
   /// Scop constructor; invoked from ScopBuilder::buildScop.
   Scop(Region &R, ScalarEvolution &SE, LoopInfo &LI,
        ScopDetection::DetectionContext &DC);
@@ -2378,6 +2390,9 @@ public:
   /// Check if the SCoP is to be skipped by ScopPass passes.
   bool isToBeSkipped() const { return SkipScop; }
 
+  /// Return the ID of the Scop
+  int getID() const { return ID; }
+
   /// Get the name of the entry and exit blocks of this Scop.
   ///
   /// These along with the function name can uniquely identify a Scop.
diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp
index bf69723..49a49f1 100644
--- a/polly/lib/Analysis/ScopInfo.cpp
+++ b/polly/lib/Analysis/ScopInfo.cpp
@@ -3499,6 +3499,18 @@ static Loop *getLoopSurroundingScop(Scop &S, LoopInfo &LI) {
   return L ? (S.contains(L) ? L->getParentLoop() : L) : nullptr;
 }
 
+int Scop::NextScopID = 0;
+
+std::string Scop::CurrentFunc = "";
+
+int Scop::getNextID(std::string ParentFunc) {
+  if (ParentFunc != CurrentFunc) {
+    CurrentFunc = ParentFunc;
+    NextScopID = 0;
+  }
+  return NextScopID++;
+}
+
 Scop::Scop(Region &R, ScalarEvolution &ScalarEvolution, LoopInfo &LI,
            ScopDetection::DetectionContext &DC)
     : SE(&ScalarEvolution), R(R), name(R.getNameStr()), IsOptimized(false),
@@ -3506,7 +3518,8 @@ Scop::Scop(Region &R, ScalarEvolution &ScalarEvolution, LoopInfo &LI,
       MaxLoopDepth(0), CopyStmtsNum(0), SkipScop(false), DC(DC),
       IslCtx(isl_ctx_alloc(), isl_ctx_free), Context(nullptr),
       Affinator(this, LI), AssumedContext(nullptr), InvalidContext(nullptr),
-      Schedule(nullptr) {
+      Schedule(nullptr),
+      ID(getNextID((*R.getEntry()->getParent()).getName().str())) {
   if (IslOnErrorAbort)
     isl_options_set_on_error(getIslCtx(), ISL_ON_ERROR_ABORT);
   buildContext();
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index d2e4b2b..93a8417 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -686,8 +686,8 @@ private:
 };
 
 std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) {
-  return "FUNC_" + S.getFunction().getName().str() + "_KERNEL_" +
-         std::to_string(Kernel_id);
+  return "FUNC_" + S.getFunction().getName().str() + "_SCOP_" +
+         std::to_string(S.getID()) + "_KERNEL_" + std::to_string(Kernel_id);
 }
 
 void GPUNodeBuilder::initializeAfterRTH() {
diff --git a/polly/test/GPGPU/cuda-annotations.ll b/polly/test/GPGPU/cuda-annotations.ll
index 57b2e160..9a66279 100644
--- a/polly/test/GPGPU/cuda-annotations.ll
+++ b/polly/test/GPGPU/cuda-annotations.ll
@@ -4,11 +4,11 @@
 
 ; REQUIRES: pollyacc
 
-; KERNEL: define ptx_kernel void @FUNC_foo_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %n) #0 {
+; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %n) #0 {
 
 ; KERNEL: !nvvm.annotations = !{!0}
 
-; KERNEL: !0 = !{void (i8 addrspace(1)*, i64)* @FUNC_foo_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
+; KERNEL: !0 = !{void (i8 addrspace(1)*, i64)* @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll
index 9d2af1d..cc6ec52 100644
--- a/polly/test/GPGPU/cuda-managed-memory-simple.ll
+++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll
@@ -54,7 +54,7 @@
 ; CHECK-NEXT:  %22 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 3
 ; CHECK-NEXT:  %23 = bitcast i32* %polly_launch_0_param_size_1 to i8*
 ; CHECK-NEXT:  store i8* %23, i8** %22
-; CHECK-NEXT:  %24 = call i8* @polly_getKernel(i8* getelementptr inbounds ([810 x i8], [810 x i8]* @FUNC_copy_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([19 x i8], [19 x i8]* @FUNC_copy_KERNEL_0_name, i32 0, i32 0))
+; CHECK-NEXT:  %24 = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
 ; CHECK-NEXT:  call void @polly_launchKernel(i8* %24, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
 ; CHECK-NEXT:  call void @polly_freeKernel(i8* %24)
 ; CHECK-NEXT:  call void @polly_synchronizeDevice()
diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll
index 15f66bb..f01398e 100644
--- a/polly/test/GPGPU/host-control-flow.ll
+++ b/polly/test/GPGPU/host-control-flow.ll
@@ -42,7 +42,7 @@
 ; IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99
 ; IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
 
-; KERNEL-IR: define ptx_kernel void @FUNC_foo_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0)
+; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0)
 ; KERNEL-IR-LABEL: entry:
 ; KERNEL-IR-NEXT:   %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; KERNEL-IR-NEXT:   %b0 = zext i32 %0 to i64
diff --git a/polly/test/GPGPU/invariant-load-hoisting.ll b/polly/test/GPGPU/invariant-load-hoisting.ll
index 58a6689..5b90014 100644
--- a/polly/test/GPGPU/invariant-load-hoisting.ll
+++ b/polly/test/GPGPU/invariant-load-hoisting.ll
@@ -21,7 +21,7 @@
 ; HOST-IR:      call void @polly_launchKernel(i8* %215, i32 %221, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
 ; HOST-IR-NEXT: call void @polly_freeKernel(i8* %215)
 ;
-; KERNEL-IR: define ptx_kernel void @FUNC_f_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* %MemRef_A, i32 %n, i32 %tmp12) #0 {
+; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* %MemRef_A, i32 %n, i32 %tmp12) #0 {
 ;
 ; Check that we generate correct GPU code in case of invariant load hoisting.
 ;
diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll
index a2233cc..3134e4f 100644
--- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll
+++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll
@@ -16,12 +16,12 @@
 ;        B[i] += 42;
 ;    }
 
-; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_KERNEL_0'
-; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_KERNEL_0"
+; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0'
+; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0"
 ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
 
-; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_KERNEL_0(i8 addrspace(1)* %MemRef_A)
+; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A)
 ; KERNEL-NEXT:   entry:
 ; KERNEL-NEXT:     %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
@@ -31,12 +31,12 @@
 ; KERNEL:     ret void
 ; KERNEL-NEXT: }
 
-; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_KERNEL_1'
-; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_KERNEL_1"
+; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1'
+; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1"
 ; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 ; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
 
-; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_KERNEL_1(i8 addrspace(1)* %MemRef_B)
+; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B)
 ; KERNEL-NEXT:   entry:
 ; KERNEL-NEXT:     %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
diff --git a/polly/test/GPGPU/kernel-params-scop-parameter.ll b/polly/test/GPGPU/kernel-params-scop-parameter.ll
index d2d5aba..3cbd404 100644
--- a/polly/test/GPGPU/kernel-params-scop-parameter.ll
+++ b/polly/test/GPGPU/kernel-params-scop-parameter.ll
@@ -9,7 +9,7 @@
 ;        A[i] += 42;
 ;    }
 
-; KERNEL-IR: define ptx_kernel void @FUNC_kernel_params_scop_parameter_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %n)
+; KERNEL-IR: define ptx_kernel void @FUNC_kernel_params_scop_parameter_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %n)
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll b/polly/test/GPGPU/kernels-names-across-scops-funcs.ll
new file mode 100644
index 0000000..22ad3f5
--- /dev/null
+++ b/polly/test/GPGPU/kernels-names-across-scops-funcs.ll
@@ -0,0 +1,124 @@
+; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen-ppcg \
+; RUN: -polly-acc-dump-kernel-ir -disable-output < %s | \
+; RUN: FileCheck -check-prefix=KERNEL %s
+
+; REQUIRES: pollyacc
+
+; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_arg1, i32 %arg) #0 {
+; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_1_KERNEL_0(i8 addrspace(1)* %MemRef_arg1, i32 %arg) #0 {
+; KERNEL: define ptx_kernel void @FUNC_foo2_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_arg1, i32 %arg) #0 {
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind uwtable
+define void @foo(i32 %arg, i32* %arg1) #0 {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb
+  %tmp = icmp sgt i32 %arg, 0
+  br i1 %tmp, label %bb3, label %bb13
+
+bb3:                                              ; preds = %bb2
+  br label %bb4
+
+bb4:                                              ; preds = %bb4, %bb3
+  %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ]
+  %tmp6 = getelementptr inbounds i32, i32* %arg1, i64 %tmp5
+  %tmp7 = load i32, i32* %tmp6, align 4, !tbaa !2
+  %tmp8 = add nsw i32 %tmp7, 1
+  store i32 %tmp8, i32* %tmp6, align 4, !tbaa !2
+  %tmp9 = add nuw nsw i64 %tmp5, 1
+  %tmp10 = zext i32 %arg to i64
+  %tmp11 = icmp ne i64 %tmp9, %tmp10
+  br i1 %tmp11, label %bb4, label %bb12
+
+bb12:                                             ; preds = %bb4
+  br label %bb13
+
+bb13:                                             ; preds = %bb12, %bb2
+  %tmp14 = tail call i64 @clock() #3
+  %tmp15 = icmp eq i64 %tmp14, 0
+  br i1 %tmp15, label %bb16, label %bb29
+
+bb16:                                             ; preds = %bb13
+  %tmp17 = icmp sgt i32 %arg, 0
+  br i1 %tmp17, label %bb18, label %bb28
+
+bb18:                                             ; preds = %bb16
+  br label %bb19
+
+bb19:                                             ; preds = %bb19, %bb18
+  %tmp20 = phi i64 [ 0, %bb18 ], [ %tmp24, %bb19 ]
+  %tmp21 = getelementptr inbounds i32, i32* %arg1, i64 %tmp20
+  %tmp22 = load i32, i32* %tmp21, align 4, !tbaa !2
+  %tmp23 = add nsw i32 %tmp22, 1
+  store i32 %tmp23, i32* %tmp21, align 4, !tbaa !2
+  %tmp24 = add nuw nsw i64 %tmp20, 1
+  %tmp25 = zext i32 %arg to i64
+  %tmp26 = icmp ne i64 %tmp24, %tmp25
+  br i1 %tmp26, label %bb19, label %bb27
+
+bb27:                                             ; preds = %bb19
+  br label %bb28
+
+bb28:                                             ; preds = %bb27, %bb16
+  br label %bb29
+
+bb29:                                             ; preds = %bb28, %bb13
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare i64 @clock() #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define void @foo2(i32 %arg, i32* %arg1) #0 {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb
+  %tmp = icmp sgt i32 %arg, 0
+  br i1 %tmp, label %bb3, label %bb13
+
+bb3:                                              ; preds = %bb2
+  br label %bb4
+
+bb4:                                              ; preds = %bb4, %bb3
+  %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ]
+  %tmp6 = getelementptr inbounds i32, i32* %arg1, i64 %tmp5
+  %tmp7 = load i32, i32* %tmp6, align 4, !tbaa !2
+  %tmp8 = add nsw i32 %tmp7, 1
+  store i32 %tmp8, i32* %tmp6, align 4, !tbaa !2
+  %tmp9 = add nuw nsw i64 %tmp5, 1
+  %tmp10 = zext i32 %arg to i64
+  %tmp11 = icmp ne i64 %tmp9, %tmp10
+  br i1 %tmp11, label %bb4, label %bb12
+
+bb12:                                             ; preds = %bb4
+  br label %bb13
+
+bb13:                                             ; preds = %bb12, %bb2
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (http://llvm.org/git/clang 98cf823022d1d71065c71e9338226ebf8bfa36ba) (http://llvm.org/git/llvm.git 4efa61f12928015bad233274ffa2e60c918e9a10)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
author	Singapuram Sanjay Srivallabh <singapuram.sanjay@gmail.com>	2017-07-12 16:46:19 +0000
committer	Singapuram Sanjay Srivallabh <singapuram.sanjay@gmail.com>	2017-07-12 16:46:19 +0000
commit	1abd9ffa3753fa4942ee415b7d81f655a38d2770 (patch)
tree	6fed380ddccfb2584cb67c430a8e6cc620365356 /polly
parent	ada11923fa4e1a7de31ac33ce68db7d129e49718 (diff)
download	llvm-1abd9ffa3753fa4942ee415b7d81f655a38d2770.zip llvm-1abd9ffa3753fa4942ee415b7d81f655a38d2770.tar.gz llvm-1abd9ffa3753fa4942ee415b7d81f655a38d2770.tar.bz2