From 6fa65f8a98967a5d2d2a6863e0f67a40d2961905 Mon Sep 17 00:00:00 2001
From: Michael Kruse
Date: Wed, 29 Jun 2022 16:44:57 -0500
Subject: [Polly][MatMul] Abandon dependence analysis.

The copy statements inserted by the matrix-multiplication optimization
introduce new dependencies between the copy statements and other
statements. As a result, the DependenceInfo must be recomputed.

Not recomputing them caused IslAstInfo to deduce that some loops are
parallel, causing race conditions when accessing the packed arrays. As a
result, matrix-matrix multiplication currently cannot be parallelized.

Also see discussion at https://reviews.llvm.org/D125202
---
 polly/include/polly/DependenceInfo.h         | 16 +++++++
 polly/lib/Analysis/DependenceInfo.cpp        | 10 ++++
 polly/lib/Transform/MatmulOptimizer.cpp      |  3 --
 polly/lib/Transform/ScheduleOptimizer.cpp    | 56 ++++++++++++----------
 polly/test/CodeGen/OpenMP/matmul-parallel.ll | 24 +++-------
 .../pattern-matching-based-opts.ll           |  3 +-
 6 files changed, 63 insertions(+), 49 deletions(-)

diff --git a/polly/include/polly/DependenceInfo.h b/polly/include/polly/DependenceInfo.h
index d1d6aa2..7526a29 100644
--- a/polly/include/polly/DependenceInfo.h
+++ b/polly/include/polly/DependenceInfo.h
@@ -208,6 +208,15 @@ struct DependenceAnalysis final : public AnalysisInfoMixin<DependenceAnalysis> {
     /// Recompute dependences from schedule and memory accesses.
     const Dependences &recomputeDependences(Dependences::AnalysisLevel Level);
+
+    /// Invalidate the dependence information and recompute it when needed
+    /// again.
+    /// May be required when the underlying Scop was changed in a way that
+    /// would add new dependencies (e.g. between new statement instances
+    /// inserted into the SCoP) or intentionally breaks existing ones. It is
+    /// not required when updating the schedule such that it conforms to the
+    /// existing dependencies.
+    void abandonDependences();
   };
 
   Result run(Scop &S, ScopAnalysisManager &SAM,
              ScopStandardAnalysisResults &SAR);
@@ -241,6 +250,13 @@ public:
   /// Recompute dependences from schedule and memory accesses.
   const Dependences &recomputeDependences(Dependences::AnalysisLevel Level);
 
+  /// Invalidate the dependence information and recompute it when needed again.
+  /// May be required when the underlying Scop was changed in a way that would
+  /// add new dependencies (e.g. between new statement instances inserted into
+  /// the SCoP) or intentionally breaks existing ones. It is not required when
+  /// updating the schedule such that it conforms to the existing dependencies.
+  void abandonDependences();
+
   /// Compute the dependence information for the SCoP @p S.
   bool runOnScop(Scop &S) override;
 
diff --git a/polly/lib/Analysis/DependenceInfo.cpp b/polly/lib/Analysis/DependenceInfo.cpp
index f29355ff..d58dc99 100644
--- a/polly/lib/Analysis/DependenceInfo.cpp
+++ b/polly/lib/Analysis/DependenceInfo.cpp
@@ -848,6 +848,11 @@ const Dependences &DependenceAnalysis::Result::recomputeDependences(
   return *D[Level];
 }
 
+void DependenceAnalysis::Result::abandonDependences() {
+  for (std::unique_ptr<Dependences> &Deps : D)
+    Deps.release();
+}
+
 DependenceAnalysis::Result
 DependenceAnalysis::run(Scop &S, ScopAnalysisManager &SAM,
                         ScopStandardAnalysisResults &SAR) {
@@ -890,6 +895,11 @@ DependenceInfo::recomputeDependences(Dependences::AnalysisLevel Level) {
   return *D[Level];
 }
 
+void DependenceInfo::abandonDependences() {
+  for (std::unique_ptr<Dependences> &Deps : D)
+    Deps.release();
+}
+
 bool DependenceInfo::runOnScop(Scop &ScopVar) {
   S = &ScopVar;
   return false;
diff --git a/polly/lib/Transform/MatmulOptimizer.cpp b/polly/lib/Transform/MatmulOptimizer.cpp
index bad05df..4120cd8 100644
--- a/polly/lib/Transform/MatmulOptimizer.cpp
+++ b/polly/lib/Transform/MatmulOptimizer.cpp
@@ -491,9 +491,6 @@ createMacroKernel(isl::schedule_node Node,
   Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
   Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
 
-  // Mark the outermost loop as parallelizable.
-  Node = Node.as<isl::schedule_node_band>().member_set_coincident(0, true);
-
   return Node.child(0).child(0);
 }
 
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index c036651..99645d0 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -228,6 +228,7 @@ struct OptimizerAdditionalInfoTy {
   bool PatternOpts;
   bool Postopts;
   bool Prevect;
+  bool &DepsChanged;
 };
 
 class ScheduleTreeOptimizer final {
@@ -526,6 +527,7 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
         tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
     if (!PatternOptimizedSchedule.is_null()) {
       MatMulOpts++;
+      OAI->DepsChanged = true;
       return PatternOptimizedSchedule.release();
     }
   }
@@ -676,21 +678,21 @@ static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
       &Version);
 }
 
-static bool runIslScheduleOptimizer(
+static void runIslScheduleOptimizer(
     Scop &S,
     function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
     TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
-    isl::schedule &LastSchedule) {
+    isl::schedule &LastSchedule, bool &DepsChanged) {
 
   // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
   if (S.isToBeSkipped())
-    return false;
+    return;
 
   // Skip empty SCoPs but still allow code generation as it will delete the
   // loops present but not needed.
   if (S.getSize() == 0) {
     S.markAsOptimized();
-    return false;
+    return;
   }
 
   ScopsProcessed++;
@@ -706,7 +708,7 @@ static bool runIslScheduleOptimizer(
         &S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
     if (ManuallyTransformed.is_null()) {
       LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
-      return false;
+      return;
     }
 
     if (ManuallyTransformed.get() != Schedule.get()) {
@@ -724,18 +726,18 @@ static bool runIslScheduleOptimizer(
   // metadata earlier in ScopDetection.
   if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
     LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
-    return false;
+    return;
   }
 
   // Get dependency analysis.
   const Dependences &D = GetDeps(Dependences::AL_Statement);
   if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
     LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
-    return false;
+    return;
   }
   if (!D.hasValidDependences()) {
     LLVM_DEBUG(dbgs() << "Dependency information not available\n");
-    return false;
+    return;
   }
 
   // Apply ISL's algorithm only if not overriden by the user. Note that
@@ -769,7 +771,7 @@ static bool runIslScheduleOptimizer(
   isl::union_set Domain = S.getDomains();
 
   if (Domain.is_null())
-    return false;
+    return;
 
   isl::union_map Validity = D.getDependences(ValidityKinds);
   isl::union_map Proximity = D.getDependences(ProximityKinds);
@@ -847,7 +849,7 @@ static bool runIslScheduleOptimizer(
   // In cases the scheduler is not able to optimize the code, we just do not
   // touch the schedule.
   if (Schedule.is_null())
-    return false;
+    return;
 
   if (GreedyFusion) {
     isl::union_map Validity = D.getDependences(
@@ -858,10 +860,12 @@ static bool runIslScheduleOptimizer(
 
   // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
   const OptimizerAdditionalInfoTy OAI = {
-      TTI, const_cast<Dependences *>(&D),
+      TTI,
+      const_cast<Dependences *>(&D),
       /*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
       /*Postopts=*/!HasUserTransformation && EnablePostopts,
-      /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE};
+      /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE,
+      DepsChanged};
   if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
     Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
     Schedule = hoistExtensionNodes(Schedule);
@@ -872,7 +876,7 @@ static bool runIslScheduleOptimizer(
   // Skip profitability check if user transformation(s) have been applied.
   if (!HasUserTransformation &&
       !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
-    return false;
+    return;
 
   auto ScopStats = S.getStatistics();
   ScopsOptimized++;
@@ -885,8 +889,6 @@ static bool runIslScheduleOptimizer(
 
   if (OptimizedScops)
     errs() << S;
-
-  return false;
 }
 
 bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
@@ -904,7 +906,13 @@ bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
       getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
   TargetTransformInfo *TTI =
       &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-  return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule);
+
+  bool DepsChanged = false;
+  runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule,
+                          DepsChanged);
+  if (DepsChanged)
+    getAnalysis<DependenceInfo>().abandonDependences();
+  return false;
 }
 
 static void runScheduleOptimizerPrinter(raw_ostream &OS,
@@ -971,22 +979,18 @@ runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
   OptimizationRemarkEmitter ORE(&S.getFunction());
   TargetTransformInfo *TTI = &SAR.TTI;
   isl::schedule LastSchedule;
-  bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule);
+  bool DepsChanged = false;
+  runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged);
+  if (DepsChanged)
+    Deps.abandonDependences();
+
   if (OS) {
     *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
         << S.getName() << "' in function '" << S.getFunction().getName()
         << "':\n";
     runScheduleOptimizerPrinter(*OS, LastSchedule);
   }
-
-  if (!Modified)
-    return PreservedAnalyses::all();
-
-  PreservedAnalyses PA;
-  PA.preserveSet<AllAnalysesOn<Module>>();
-  PA.preserveSet<AllAnalysesOn<Function>>();
-  PA.preserveSet<AllAnalysesOn<Loop>>();
-  return PA;
+  return PreservedAnalyses::all();
 }
 
 llvm::PreservedAnalyses
diff --git a/polly/test/CodeGen/OpenMP/matmul-parallel.ll b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
index 4a9d9c8..29a0939 100644
--- a/polly/test/CodeGen/OpenMP/matmul-parallel.ll
+++ b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
@@ -2,20 +2,10 @@
 ; RUN: opt %loadPolly -polly-parallel -polly-opt-isl -polly-codegen -S < %s | FileCheck --check-prefix=CODEGEN %s
 ; REQUIRES: asserts
 
-; Parellization of detected matrix-multiplication. The allocations
-; Packed_A and Packed_B must be passed to the outlined function.
-; llvm.org/PR43164
-;
-; #define N 1536
-; int foo(float A[N][N],float B[N][N],float C[N][N]) {
-;   for (int i = 0; i < N; i++) {
-;     for (int j = 0; j < N; j++) {
-;       for (int k = 0; k < N; k++)
-;         C[i][j] = C[i][j] + A[i][k] * B[k][j];
-;     }
-;   }
-;   return 0;
-; }
+; Parallelization of detected matrix-multiplication.
+; Currently, this is not supported. Because Packed_A/Packed_B are not
+; private per-thread, the outer loops cannot be parallelized, and a
+; '#pragma omp parallel for' on an inner loop may impose too much overhead.
 
 target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-windows-msvc19.16.27034"
@@ -65,8 +55,6 @@ for.body8:
 }
 
-; AST: #pragma omp parallel for
+; AST-NOT: parallel
 
-; CODGEN-LABEL: define internal void @init_array_polly_subfn(i8* %polly.par.userContext)
-; CODEGEN: %polly.subfunc.arg.Packed_A = load
-; CODEGEN: %polly.subfunc.arg.Packed_B = load
+; CODEGEN-NOT: subfunc
 
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index 90d8cd8..69f43d1 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -15,8 +15,7 @@
 ;
 ; CHECK-NOT: The matrix multiplication pattern was detected
 ; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
-; PARALLEL-AST: #pragma known-parallel
-; PARALLEL-AST: #pragma known-parallel
+; PARALLEL-AST-NOT: #pragma known-parallel
 ; STATS: 1 polly-opt-isl - Number of matrix multiplication patterns detected and optimized
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
--
cgit v1.1
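
For background, a deliberately simplified, hypothetical C sketch follows; it is
not taken from Polly's output, and all names, tile sizes, and the loop structure
are assumptions. It only illustrates the situation the commit message describes:
a single, shared packing buffer is rewritten by the inserted copy statements on
every iteration of the outer tile loops, so those loops must not be treated as
parallel unless the new dependences are taken into account.

/* Hypothetical, simplified sketch -- not Polly's generated code. */
#include <stddef.h>

enum { N = 64, MC = 16, KC = 16 };   /* assumed problem and tile sizes */

/* Single shared scratch buffer, in the spirit of Packed_A in the test. */
static float Packed_A[MC * KC];

/* The "copy statement": writes the shared buffer. */
static void pack_A(const float A[N][N], size_t i0, size_t k0) {
  for (size_t i = 0; i < MC; ++i)
    for (size_t k = 0; k < KC; ++k)
      Packed_A[i * KC + k] = A[i0 + i][k0 + k];
}

void matmul_tiled(const float A[N][N], const float B[N][N], float C[N][N]) {
  /* Marking this loop parallel (e.g. '#pragma omp parallel for') would let
   * every thread write and read the same Packed_A concurrently: a race.
   * The dependences that forbid this only exist once the copy statements
   * have been inserted, so a stale DependenceInfo does not see them. */
  for (size_t i0 = 0; i0 < N; i0 += MC) {
    for (size_t k0 = 0; k0 < N; k0 += KC) {
      pack_A(A, i0, k0);
      for (size_t i = 0; i < MC; ++i)   /* kernel reads Packed_A */
        for (size_t k = 0; k < KC; ++k)
          for (size_t j = 0; j < N; ++j)
            C[i0 + i][j] += Packed_A[i * KC + k] * B[k0 + k][j];
    }
  }
}

With up-to-date dependences, the writes in the packing step keep the tile loops
sequential; with the stale DependenceInfo those dependences are missing, which
is why the updated test above now checks that no '#pragma omp parallel for' and
no outlined subfunction are generated for the detected matrix multiplication.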