[PPCGCodeGeneration] Teach Polly to start using live range reordering.

Polly did not use PPCG's live range reordering feature. Teach PPCGCodeGeneration to use this. Documentation on this is sparse, so much of the code is conservative. We currently kill all phi nodes in a Scop by appending them to the must_kill map we pass to PPCG. I do not have a proof of correctness, but it seems to be intuitively correct. We also do not handle `array_order`, which, quoting PPCG, is: PPCG/gpu.h: "Order dependences on non-scalars." It seems to consist of RAW dependences between arrays. We need to pass this information for more complex privatization cases. Differential Revision: https://reviews.llvm.org/D34941 llvm-svn: 307163
author: Siddharth Bhat <siddu.druid@gmail.com> 2017-07-05 14:57:04 +0000
committer: Siddharth Bhat <siddu.druid@gmail.com> 2017-07-05 14:57:04 +0000
commit: a82f2d264a05dbaf53c381c87e34f90354daed55 (patch)
tree: 6e6157c4239ed0c8b1f1971baa25577d1a203674 /polly
parent: d54759b236b8e61468e7f0a4e05bda004fe61f63 (diff)
download: llvm-a82f2d264a05dbaf53c381c87e34f90354daed55.zip
llvm-a82f2d264a05dbaf53c381c87e34f90354daed55.tar.gz
llvm-a82f2d264a05dbaf53c381c87e34f90354daed55.tar.bz2
5 files changed, 243 insertions, 13 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index eb46d45..b882455 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -112,6 +112,111 @@ static cl::opt<int>
                cl::desc("Minimal number of compute statements to run on GPU."),
                cl::Hidden, cl::init(10 * 512 * 512));
 
+/// Used to store information PPCG wants for kills. This information is
+/// used by live range reordering.
+///
+/// @see computeLiveRangeReordering
+/// @see GPUNodeBuilder::createPPCGScop
+/// @see GPUNodeBuilder::createPPCGProg
+struct MustKillsInfo {
+  /// Collection of all kill statements that will be sequenced at the end of
+  /// PPCGScop->schedule.
+  ///
+  /// The nodes in `KillsSchedule` will be merged using `isl_schedule_set`
+  /// which merges schedules in *arbitrary* order.
+  /// (we don't care about the order of the kills anyway).
+  isl::schedule KillsSchedule;
+  /// Map from kill statement instances to scalars that need to be
+  /// killed.
+  ///
+  /// We currently only derive kill information for phi nodes, as phi nodes
+  /// allow us to easily derive kill information. PHI nodes are not alive
+  /// outside the scop and can consequently all be "killed". [params] -> {
+  /// [Stmt_phantom[] -> ref_phantom[]] -> phi_ref[] }
+  isl::union_map TaggedMustKills;
+
+  MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){};
+};
+
+/// Compute must-kills needed to enable live range reordering with PPCG.
+///
+/// @params S The Scop to compute live range reordering information
+/// @returns live range reordering information that can be used to setup
+/// PPCG.
+static MustKillsInfo computeMustKillsInfo(const Scop &S) {
+  const isl::space ParamSpace(isl::manage(S.getParamSpace()));
+  MustKillsInfo Info;
+
+  // 1. Collect phi nodes in scop.
+  SmallVector<isl::id, 4> KillMemIds;
+  for (ScopArrayInfo *SAI : S.arrays()) {
+    if (!SAI->isPHIKind())
+      continue;
+
+    KillMemIds.push_back(isl::manage(SAI->getBasePtrId()));
+  }
+
+  Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
+
+  // Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
+  // schedule:
+  //     - filter: "[control] -> { }"
+  // So, we choose to not create this to keep the output a little nicer,
+  // at the cost of some code complexity.
+  Info.KillsSchedule = nullptr;
+
+  for (isl::id &phiId : KillMemIds) {
+    isl::id KillStmtId = isl::id::alloc(
+        S.getIslCtx(), std::string("SKill_phantom_").append(phiId.get_name()),
+        nullptr);
+
+    // NOTE: construction of tagged_must_kill:
+    // 2. We need to construct a map:
+    //     [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> phi_ref }
+    // To construct this, we use `isl_map_domain_product` on 2 maps`:
+    // 2a. StmtToPhi:
+    //         [param] -> { Stmt_phantom[] -> phi_ref[] }
+    // 2b. PhantomRefToPhi:
+    //         [param] -> { ref_phantom[] -> phi_ref[] }
+    //
+    // Combining these with `isl_map_domain_product` gives us
+    // TaggedMustKill:
+    //     [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] }
+
+    // 2a. [param] -> { S_2[] -> phi_ref[] }
+    isl::map StmtToPhi = isl::map::universe(isl::space(ParamSpace));
+    StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::in, isl::id(KillStmtId));
+    StmtToPhi = StmtToPhi.set_tuple_id(isl::dim::out, isl::id(phiId));
+
+    isl::id PhantomRefId = isl::id::alloc(
+        S.getIslCtx(), std::string("ref_phantom") + phiId.get_name(), nullptr);
+
+    // 2b. [param] -> { phantom_ref[] -> memref[] }
+    isl::map PhantomRefToPhi = isl::map::universe(isl::space(ParamSpace));
+    PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::in, PhantomRefId);
+    PhantomRefToPhi = PhantomRefToPhi.set_tuple_id(isl::dim::out, phiId);
+
+    // 2. [param] -> { [Stmt[] -> phantom_ref[]] -> memref[] }
+    isl::map TaggedMustKill = StmtToPhi.domain_product(PhantomRefToPhi);
+    Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);
+
+    // 3. Create the kill schedule of the form:
+    //     "[param] -> { Stmt_phantom[] }"
+    // Then add this to Info.KillsSchedule.
+    isl::space KillStmtSpace = ParamSpace;
+    KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId);
+    isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace);
+
+    isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain);
+    if (Info.KillsSchedule)
+      Info.KillsSchedule = Info.KillsSchedule.set(KillSchedule);
+    else
+      Info.KillsSchedule = KillSchedule;
+  }
+
+  return Info;
+}
+
 /// Create the ast expressions for a ScopStmt.
 ///
 /// This function is a callback for to generate the ast expressions for each
@@ -2114,6 +2219,8 @@ public:
     auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
 
     PPCGScop->options = createPPCGOptions();
+    // enable live range reordering
+    PPCGScop->options->live_range_reordering = 1;
 
     PPCGScop->start = 0;
     PPCGScop->end = 0;
@@ -2129,10 +2236,9 @@ public:
     PPCGScop->tagged_must_writes = getTaggedMustWrites();
     PPCGScop->must_writes = S->getMustWrites();
     PPCGScop->live_out = nullptr;
-    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
     PPCGScop->tagger = nullptr;
-
-    PPCGScop->independence = nullptr;
+    PPCGScop->independence =
+        isl_union_map_empty(isl_set_get_space(PPCGScop->context));
     PPCGScop->dep_flow = nullptr;
     PPCGScop->tagged_dep_flow = nullptr;
     PPCGScop->dep_false = nullptr;
@@ -2141,8 +2247,15 @@ public:
     PPCGScop->tagged_dep_order = nullptr;
 
     PPCGScop->schedule = S->getScheduleTree();
-    PPCGScop->names = getNames();
 
+    MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
+    // If we have something non-trivial to kill, add it to the schedule
+    if (KillsInfo.KillsSchedule.get())
+      PPCGScop->schedule = isl_schedule_sequence(
+          PPCGScop->schedule, KillsInfo.KillsSchedule.take());
+    PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();
+
+    PPCGScop->names = getNames();
     PPCGScop->pet = nullptr;
 
     compute_tagger(PPCGScop);
@@ -2414,7 +2527,13 @@ public:
     PPCGProg->to_inner = getArrayIdentity();
     PPCGProg->to_outer = getArrayIdentity();
     PPCGProg->any_to_outer = nullptr;
-    PPCGProg->array_order = nullptr;
+
+    // this needs to be set when live range reordering is enabled.
+    // NOTE: I believe that is conservatively correct. I'm not sure
+    //       what the semantics of this is.
+    // Quoting PPCG/gpu.h: "Order dependences on non-scalars."
+    PPCGProg->array_order =
+        isl_union_map_empty(isl_set_get_space(PPCGScop->context));
     PPCGProg->n_stmts = std::distance(S->begin(), S->end());
     PPCGProg->stmts = getStatements();
     PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
@@ -2424,7 +2543,6 @@ public:
     createArrays(PPCGProg);
 
     PPCGProg->may_persist = compute_may_persist(PPCGProg);
-
     return PPCGProg;
   }
 
diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll
index ba2674c..465acaf 100644
--- a/polly/test/GPGPU/non-read-only-scalars.ll
+++ b/polly/test/GPGPU/non-read-only-scalars.ll
@@ -67,7 +67,6 @@
 
 ; CODE:   }
 ; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_sum_0__phi, dev_MemRef_sum_0__phi, sizeof(float), cudaMemcpyDeviceToHost));
 ; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: }
 
diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll
index e2780cb..eb32030 100644
--- a/polly/test/GPGPU/phi-nodes-in-kernel.ll
+++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll
@@ -24,9 +24,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CODE-NEXT:     cudaCheckKernel();
 ; CODE-NEXT:   }
 
-; CODE:   cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055__phi, dev_MemRef_out_l_055__phi, sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE:       cudaCheckReturn(cudaMemcpy(&MemRef_out_l_055, dev_MemRef_out_l_055, sizeof(i32), cudaMemcpyDeviceToHost));
+; CODE-NEXT:  cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
 ; CODE-NEXT: }
 
 ; CODE: # kernel0
@@ -41,9 +40,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; IR:      [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
 ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
 
-; IR:      [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4)
-; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
+; IR: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
 ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)
 
 ; KERNEL-IR: entry:
diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll
new file mode 100644
index 0000000..a648437
--- /dev/null
+++ b/polly/test/GPGPU/privatization-simple.ll
@@ -0,0 +1,56 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
+
+; SCOP:      Function: f
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth:  1
+
+; Check that kernel launch is generated in host IR.
+; the declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; void f(int A[], int B[], int control, int C[]) {
+;     int x;
+; #pragma scop
+;     for(int i = 0; i < 1000; i ++) {
+;         x = 0;
+;         if(control) x = C[i];
+;         B[i] = x * A[i];
+; 
+;     }
+; #pragma endscop
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %A, i32* %B, i32 %control, i32* %C) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %if.end
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
+  %tobool = icmp eq i32 %control, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %tmp4 = load i32, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp8 = load i32, i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %tmp8, %x.0
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %if.end
+  ret void
+}
diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll
new file mode 100644
index 0000000..f2a4ef2
--- /dev/null
+++ b/polly/test/GPGPU/privatization.ll
@@ -0,0 +1,60 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
+
+; SCOP:      Function: checkPrivatization
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth:  1
+
+
+; Check that kernel launch is generated in host IR.
+; the declare would not be generated unless a call to a kernel exists.
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; 
+;
+;    void checkPrivatization(int A[], int B[], int C[], int control) {
+;      int x;
+;    #pragma scop
+;      for (int i = 0; i < 1000; i++) {
+;        x = 0;
+;        if (control)
+;          x += C[i];
+;
+;        B[i] = x * A[i];
+;      }
+;    #pragma endscop
+;    }
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @checkPrivatization(i32* %A, i32* %B, i32* %C, i32 %control) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %if.end
+  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
+  %tobool = icmp eq i32 %control, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %tmp4 = load i32, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp9 = load i32, i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %tmp9, %x.0
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %if.end
+  ret void
+}
author	Siddharth Bhat <siddu.druid@gmail.com>	2017-07-05 14:57:04 +0000
committer	Siddharth Bhat <siddu.druid@gmail.com>	2017-07-05 14:57:04 +0000
commit	a82f2d264a05dbaf53c381c87e34f90354daed55 (patch)
tree	6e6157c4239ed0c8b1f1971baa25577d1a203674 /polly
parent	d54759b236b8e61468e7f0a4e05bda004fe61f63 (diff)
download	llvm-a82f2d264a05dbaf53c381c87e34f90354daed55.zip llvm-a82f2d264a05dbaf53c381c87e34f90354daed55.tar.gz llvm-a82f2d264a05dbaf53c381c87e34f90354daed55.tar.bz2