diff options
author | Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com> | 2021-03-25 11:27:10 +0000 |
---|---|---|
committer | Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com> | 2021-03-25 11:27:10 +0000 |
commit | b92c8c22b924969fe6cbe1b9faf874333d4eafd0 (patch) | |
tree | 776d0b339ad0003bc0997d7ac2fe176f51a2f765 | |
parent | 1e56e8717f09cc287d2c1329d4009ae38acfa54c (diff) | |
download | llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.zip llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.tar.gz llvm-b92c8c22b924969fe6cbe1b9faf874333d4eafd0.tar.bz2 |
[NewPM] Disable non-trivial loop-unswitch on targets with divergence
Unswitching a loop on a non-trivial divergent branch is expensive
since it serializes the execution of both versions of the
loop. But identifying a divergent branch needs divergence analysis,
which is a function level analysis.
The legacy pass manager handles this dependency by isolating such a
loop transform and rerunning the required function analyses. This
functionality is currently missing in the new pass manager, and there
is no safe way for the SimpleLoopUnswitch pass to depend on
DivergenceAnalysis. So we conservatively assume that all non-trivial
branches are divergent if the target has divergence.
Reviewed By: tra
Differential Revision: https://reviews.llvm.org/D98958
3 files changed, 68 insertions, 48 deletions
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 92461ea..cf77cf7 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2901,10 +2901,20 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, return true; } - // If we're not doing non-trivial unswitching, we're done. We both accept - // a parameter but also check a local flag that can be used for testing - // a debugging. - if (!NonTrivial && !EnableNonTrivialUnswitch) + // Check whether we should continue with non-trivial conditions. + // EnableNonTrivialUnswitch: Global variable that forces non-trivial + // unswitching for testing and debugging. + // NonTrivial: Parameter that enables non-trivial unswitching for this + // invocation of the transform. But this should be allowed only + // for targets without branch divergence. + // + // FIXME: If divergence analysis becomes available to a loop + // transform, we should allow unswitching for non-trivial uniform + // branches even on targets that have divergence. + // https://bugs.llvm.org/show_bug.cgi?id=48819 + bool ContinueWithNonTrivial = + EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence()); + if (!ContinueWithNonTrivial) return false; // Skip non-trivial unswitching for optsize functions. diff --git a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll index 873a765..4014664 100644 --- a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll +++ b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll @@ -1,47 +1,4 @@ -; RUN: opt -mtriple=amdgcn-- -O3 -S -enable-new-pm=0 %s | FileCheck %s - -; This fails with the new pass manager: -; https://bugs.llvm.org/show_bug.cgi?id=48819 - -; Check that loop unswitch happened and condition hoisted out of the loop. 
-; Condition is uniform so all targets should perform unswitching. - -; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch -; CHECK: entry: -; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp -; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456 -; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]] -; CHECK-NEXT: br i1 - -define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) { -entry: - %cmp6 = icmp sgt i32 %n, 0 - br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup - -for.body.lr.ph: ; preds = %entry - %cmp1 = icmp eq i32 %x, 123456 - br label %for.body - -for.cond.cleanup.loopexit: ; preds = %for.inc - br label %for.cond.cleanup - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - ret void - -for.body: ; preds = %for.inc, %for.body.lr.ph - %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] - br i1 %cmp1, label %if.then, label %for.inc - -if.then: ; preds = %for.body - %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07 - store i32 %i.07, i32 * %arrayidx, align 4 - br label %for.inc - -for.inc: ; preds = %for.body, %if.then - %inc = add nuw nsw i32 %i.07, 1 - %exitcond = icmp eq i32 %inc, %n - br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body -} +; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s ; Check that loop unswitch does not happen if condition is divergent. diff --git a/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll new file mode 100644 index 0000000..943a533 --- /dev/null +++ b/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll @@ -0,0 +1,53 @@ +; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s +; XFAIL: * + +; Check that loop unswitch happened and condition hoisted out of the loop. +; Condition is uniform so even targets with divergence should perform unswitching. 
+ +; This fails with the new pass manager: +; https://bugs.llvm.org/show_bug.cgi?id=48819 +; The correct behaviour (allow uniform non-trivial branches to be +; unswitched on all targets) requires access to the function-level +; divergence analysis from a loop transform, which is currently not +; supported in the new pass manager. + +; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch +; CHECK: entry: +; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp +; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456 +; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]] +; CHECK-NEXT: br i1 + +define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp eq i32 %x, 123456 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.inc, %for.body.lr.ph + %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07 + store i32 %i.07, i32 * %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } |