[NewPM] Disable non-trivial loop-unswitch on targets with divergence

Unswitching a loop on a non-trivial divergent branch is expensive since it serializes the execution of both version of the loop. But identifying a divergent branch needs divergence analysis, which is a function level analysis. The legacy pass manager handles this dependency by isolating such a loop transform and rerunning the required function analyses. This functionality is currently missing in the new pass manager, and there is no safe way for the SimpleLoopUnswitch pass to depend on DivergenceAnalysis. So we conservatively assume that all non-trivial branches are divergent if the target has divergence. Reviewed By: tra Differential Revision: https://reviews.llvm.org/D98958
author: Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com> 2021-03-25 11:27:10 +0000
committer: Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com> 2021-03-25 11:27:10 +0000
commit: b92c8c22b924969fe6cbe1b9faf874333d4eafd0 (patch)
tree: 776d0b339ad0003bc0997d7ac2fe176f51a2f765
parent: 1e56e8717f09cc287d2c1329d4009ae38acfa54c (diff)
download: llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.zip
llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.tar.gz
llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.tar.bz2
3 files changed, 68 insertions, 48 deletions
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 92461ea..cf77cf7 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2901,10 +2901,20 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
     return true;
   }
 
-  // If we're not doing non-trivial unswitching, we're done. We both accept
-  // a parameter but also check a local flag that can be used for testing
-  // a debugging.
-  if (!NonTrivial && !EnableNonTrivialUnswitch)
+  // Check whether we should continue with non-trivial conditions.
+  // EnableNonTrivialUnswitch: Global variable that forces non-trivial
+  //                           unswitching for testing and debugging.
+  // NonTrivial: Parameter that enables non-trivial unswitching for this
+  //             invocation of the transform. But this should be allowed only
+  //             for targets without branch divergence.
+  //
+  // FIXME: If divergence analysis becomes available to a loop
+  // transform, we should allow unswitching for non-trivial uniform
+  // branches even on targets that have divergence.
+  // https://bugs.llvm.org/show_bug.cgi?id=48819
+  bool ContinueWithNonTrivial =
+      EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence());
+  if (!ContinueWithNonTrivial)
     return false;
 
   // Skip non-trivial unswitching for optsize functions.
diff --git a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
index 873a765..4014664 100644
--- a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
+++ b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
@@ -1,47 +1,4 @@
-; RUN: opt -mtriple=amdgcn-- -O3 -S -enable-new-pm=0 %s | FileCheck %s
-
-; This fails with the new pass manager:
-; https://bugs.llvm.org/show_bug.cgi?id=48819
-
-; Check that loop unswitch happened and condition hoisted out of the loop.
-; Condition is uniform so all targets should perform unswitching.
-
-; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
-; CHECK: entry:
-; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
-; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
-; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
-; CHECK-NEXT: br i1
-
-define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
-entry:
-  %cmp6 = icmp sgt i32 %n, 0
-  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:                                   ; preds = %entry
-  %cmp1 = icmp eq i32 %x, 123456
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret void
-
-for.body:                                         ; preds = %for.inc, %for.body.lr.ph
-  %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-  br i1 %cmp1, label %if.then, label %for.inc
-
-if.then:                                          ; preds = %for.body
-  %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07
-  store i32 %i.07, i32 * %arrayidx, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
+; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s
 
 ; Check that loop unswitch does not happen if condition is divergent.
 
diff --git a/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll
new file mode 100644
index 0000000..943a533
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll
@@ -0,0 +1,53 @@
+; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s
+; XFAIL: *
+
+; Check that loop unswitch happened and condition hoisted out of the loop.
+; Condition is uniform so even targets with divergence should perform unswitching.
+
+; This fails with the new pass manager:
+; https://bugs.llvm.org/show_bug.cgi?id=48819
+; The correct behaviour (allow uniform non-trivial branches to be
+; unswitched on all targets) requires access to the function-level
+; divergence analysis from a loop transform, which is currently not
+; supported in the new pass manager.
+
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
+; CHECK: entry:
+; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
+; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
+; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
+; CHECK-NEXT: br i1
+
+define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp1 = icmp eq i32 %x, 123456
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07
+  store i32 %i.07, i32 * %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
author	Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com>	2021-03-25 11:27:10 +0000
committer	Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com>	2021-03-25 11:27:10 +0000
commit	b92c8c22b924969fe6cbe1b9faf874333d4eafd0 (patch)
tree	776d0b339ad0003bc0997d7ac2fe176f51a2f765
parent	1e56e8717f09cc287d2c1329d4009ae38acfa54c (diff)
download	llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.zip llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.tar.gz llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.tar.bz2