diff options
author | Valentin Clement (バレンタイン クレメン) <clementval@gmail.com> | 2025-07-08 10:52:15 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-07-08 10:52:15 -0700 |
commit | 46caad52ac14cefd6f9cf3188863818e330f3844 (patch) | |
tree | 408654b7580b750b7b9dad9378de4f8eae18e234 | |
parent | 1e3f6a6c4fd8dafb208216ffcabe31fa978ca502 (diff) | |
download | llvm-users/david-salinas/fix_offload_compression.zip llvm-users/david-salinas/fix_offload_compression.tar.gz llvm-users/david-salinas/fix_offload_compression.tar.bz2 |
[flang][cuda] Do not produce data transfer in offloaded do concurrent (#147435)users/david-salinas/fix_offload_compression
If a `do concurrent` loop is offloaded then there should be no CUDA data
transfer in it. Update the semantic and lowering to take that into
account.
`AssignmentChecker` has to be put into a separate pass because the
checkers in `SemanticsVisitor` cannot have the same `Enter/Leave`
functions. The `DoForallChecker` already has `Eneter/Leave` functions
for the `DoConstruct`.
-rw-r--r-- | flang/include/flang/Optimizer/Builder/CUFCommon.h | 3 | ||||
-rw-r--r-- | flang/include/flang/Support/Fortran-features.h | 2 | ||||
-rw-r--r-- | flang/lib/Lower/Bridge.cpp | 5 | ||||
-rw-r--r-- | flang/lib/Optimizer/Builder/CUFCommon.cpp | 6 | ||||
-rw-r--r-- | flang/lib/Semantics/assignment.cpp | 56 | ||||
-rw-r--r-- | flang/lib/Semantics/assignment.h | 8 | ||||
-rw-r--r-- | flang/lib/Semantics/check-cuda.cpp | 65 | ||||
-rw-r--r-- | flang/lib/Semantics/check-cuda.h | 10 | ||||
-rw-r--r-- | flang/test/Lower/CUDA/cuda-data-transfer.cuf | 16 | ||||
-rw-r--r-- | flang/tools/bbc/bbc.cpp | 10 |
10 files changed, 109 insertions, 72 deletions
diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index 65b9cce..5c56dd6b 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -27,7 +27,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod, mlir::SymbolTable &symTab); bool isCUDADeviceContext(mlir::Operation *op); -bool isCUDADeviceContext(mlir::Region &); +bool isCUDADeviceContext(mlir::Region &, + bool isDoConcurrentOffloadEnabled = false); bool isRegisteredDeviceGlobal(fir::GlobalOp op); bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr); diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h index 8a2b7b2..857de94 100644 --- a/flang/include/flang/Support/Fortran-features.h +++ b/flang/include/flang/Support/Fortran-features.h @@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank, IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor, ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy, - InaccessibleDeferredOverride, CudaWarpMatchFunction) + InaccessibleDeferredOverride, CudaWarpMatchFunction, DoConcurrentOffload) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index ff35840a..6964736 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4886,7 +4886,10 @@ private: mlir::Location loc = getCurrentLocation(); fir::FirOpBuilder &builder = getFirOpBuilder(); - bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion()); + bool isInDeviceContext = cuf::isCUDADeviceContext( + builder.getRegion(), + getFoldingContext().languageFeatures().IsEnabled( + Fortran::common::LanguageFeature::DoConcurrentOffload)); bool isCUDATransfer = IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext; diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index 5f286c0..dcbf499 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -43,7 +43,8 @@ bool cuf::isCUDADeviceContext(mlir::Operation *op) { // for it. // If the insertion point is inside an OpenACC region op, it is considered // device context. -bool cuf::isCUDADeviceContext(mlir::Region ®ion) { +bool cuf::isCUDADeviceContext(mlir::Region ®ion, + bool isDoConcurrentOffloadEnabled) { if (region.getParentOfType<cuf::KernelOp>()) return true; if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>()) @@ -56,6 +57,9 @@ bool cuf::isCUDADeviceContext(mlir::Region ®ion) { cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice; } } + if (isDoConcurrentOffloadEnabled && + region.getParentOfType<fir::DoConcurrentLoopOp>()) + return true; return false; } diff --git a/flang/lib/Semantics/assignment.cpp b/flang/lib/Semantics/assignment.cpp index 43e23a9..88e0888 100644 --- a/flang/lib/Semantics/assignment.cpp +++ b/flang/lib/Semantics/assignment.cpp @@ -42,7 +42,6 @@ public: void Analyze(const parser::AssignmentStmt &); void Analyze(const parser::PointerAssignmentStmt &); void Analyze(const parser::ConcurrentControl &); - int deviceConstructDepth_{0}; SemanticsContext &context() { return context_; } private: @@ -97,21 +96,6 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) { if (whereDepth_ > 0) { CheckShape(lhsLoc, &lhs); } - if (context_.foldingContext().languageFeatures().IsEnabled( - common::LanguageFeature::CUDA)) { - const auto &scope{context_.FindScope(lhsLoc)}; - const Scope &progUnit{GetProgramUnitContaining(scope)}; - if (!IsCUDADeviceContext(&progUnit) && deviceConstructDepth_ == 0) { - if (Fortran::evaluate::HasCUDADeviceAttrs(lhs) && - Fortran::evaluate::HasCUDAImplicitTransfer(rhs)) { - if (GetNbOfCUDAManagedOrUnifiedSymbols(lhs) == 1 && - GetNbOfCUDAManagedOrUnifiedSymbols(rhs) == 1 && - GetNbOfCUDADeviceSymbols(rhs) == 1) - return; // This is a special case handled on the host. - context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US); - } - } - } } } @@ -254,46 +238,6 @@ void AssignmentChecker::Enter(const parser::MaskedElsewhereStmt &x) { void AssignmentChecker::Leave(const parser::MaskedElsewhereStmt &) { context_.value().PopWhereContext(); } -void AssignmentChecker::Enter(const parser::CUFKernelDoConstruct &x) { - ++context_.value().deviceConstructDepth_; -} -void AssignmentChecker::Leave(const parser::CUFKernelDoConstruct &) { - --context_.value().deviceConstructDepth_; -} -static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) { - const auto &beginBlockDirective = - std::get<Fortran::parser::AccBeginBlockDirective>(x.t); - const auto &blockDirective = - std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t); - if (blockDirective.v == llvm::acc::ACCD_parallel || - blockDirective.v == llvm::acc::ACCD_serial || - blockDirective.v == llvm::acc::ACCD_kernels) { - return true; - } - return false; -} -void AssignmentChecker::Enter(const parser::OpenACCBlockConstruct &x) { - if (IsOpenACCComputeConstruct(x)) { - ++context_.value().deviceConstructDepth_; - } -} -void AssignmentChecker::Leave(const parser::OpenACCBlockConstruct &x) { - if (IsOpenACCComputeConstruct(x)) { - --context_.value().deviceConstructDepth_; - } -} -void AssignmentChecker::Enter(const parser::OpenACCCombinedConstruct &) { - ++context_.value().deviceConstructDepth_; -} -void AssignmentChecker::Leave(const parser::OpenACCCombinedConstruct &) { - --context_.value().deviceConstructDepth_; -} -void AssignmentChecker::Enter(const parser::OpenACCLoopConstruct &) { - ++context_.value().deviceConstructDepth_; -} -void AssignmentChecker::Leave(const parser::OpenACCLoopConstruct &) { - --context_.value().deviceConstructDepth_; -} } // namespace Fortran::semantics template class Fortran::common::Indirection< diff --git a/flang/lib/Semantics/assignment.h b/flang/lib/Semantics/assignment.h index 4a1bb92..ba53774 100644 --- a/flang/lib/Semantics/assignment.h +++ b/flang/lib/Semantics/assignment.h @@ -46,14 +46,6 @@ public: void Leave(const parser::EndWhereStmt &); void Enter(const parser::MaskedElsewhereStmt &); void Leave(const parser::MaskedElsewhereStmt &); - void Enter(const parser::CUFKernelDoConstruct &); - void Leave(const parser::CUFKernelDoConstruct &); - void Enter(const parser::OpenACCBlockConstruct &); - void Leave(const parser::OpenACCBlockConstruct &); - void Enter(const parser::OpenACCCombinedConstruct &); - void Leave(const parser::OpenACCCombinedConstruct &); - void Enter(const parser::OpenACCLoopConstruct &); - void Leave(const parser::OpenACCLoopConstruct &); SemanticsContext &context(); diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp index 8decfb0..b011476 100644 --- a/flang/lib/Semantics/check-cuda.cpp +++ b/flang/lib/Semantics/check-cuda.cpp @@ -685,18 +685,67 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) { std::get<std::list<parser::CUFReduction>>(directive.t)) { CheckReduce(context_, reduce); } - inCUFKernelDoConstruct_ = true; + ++deviceConstructDepth_; +} + +static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) { + const auto &beginBlockDirective = + std::get<Fortran::parser::AccBeginBlockDirective>(x.t); + const auto &blockDirective = + std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t); + if (blockDirective.v == llvm::acc::ACCD_parallel || + blockDirective.v == llvm::acc::ACCD_serial || + blockDirective.v == llvm::acc::ACCD_kernels) { + return true; + } + return false; } void CUDAChecker::Leave(const parser::CUFKernelDoConstruct &) { - inCUFKernelDoConstruct_ = false; + --deviceConstructDepth_; +} +void CUDAChecker::Enter(const parser::OpenACCBlockConstruct &x) { + if (IsOpenACCComputeConstruct(x)) { + ++deviceConstructDepth_; + } +} +void CUDAChecker::Leave(const parser::OpenACCBlockConstruct &x) { + if (IsOpenACCComputeConstruct(x)) { + --deviceConstructDepth_; + } +} +void CUDAChecker::Enter(const parser::OpenACCCombinedConstruct &) { + ++deviceConstructDepth_; +} +void CUDAChecker::Leave(const parser::OpenACCCombinedConstruct &) { + --deviceConstructDepth_; +} +void CUDAChecker::Enter(const parser::OpenACCLoopConstruct &) { + ++deviceConstructDepth_; +} +void CUDAChecker::Leave(const parser::OpenACCLoopConstruct &) { + --deviceConstructDepth_; +} +void CUDAChecker::Enter(const parser::DoConstruct &x) { + if (x.IsDoConcurrent() && + context_.foldingContext().languageFeatures().IsEnabled( + common::LanguageFeature::DoConcurrentOffload)) { + ++deviceConstructDepth_; + } +} +void CUDAChecker::Leave(const parser::DoConstruct &x) { + if (x.IsDoConcurrent() && + context_.foldingContext().languageFeatures().IsEnabled( + common::LanguageFeature::DoConcurrentOffload)) { + --deviceConstructDepth_; + } } void CUDAChecker::Enter(const parser::AssignmentStmt &x) { auto lhsLoc{std::get<parser::Variable>(x.t).GetSource()}; const auto &scope{context_.FindScope(lhsLoc)}; const Scope &progUnit{GetProgramUnitContaining(scope)}; - if (IsCUDADeviceContext(&progUnit) || inCUFKernelDoConstruct_) { + if (IsCUDADeviceContext(&progUnit) || deviceConstructDepth_ > 0) { return; // Data transfer with assignment is only perform on host. } @@ -714,6 +763,16 @@ void CUDAChecker::Enter(const parser::AssignmentStmt &x) { context_.Say(lhsLoc, "More than one reference to a CUDA object on the right hand side of the assigment"_err_en_US); } + + if (Fortran::evaluate::HasCUDADeviceAttrs(assign->lhs) && + Fortran::evaluate::HasCUDAImplicitTransfer(assign->rhs)) { + if (GetNbOfCUDAManagedOrUnifiedSymbols(assign->lhs) == 1 && + GetNbOfCUDAManagedOrUnifiedSymbols(assign->rhs) == 1 && + GetNbOfCUDADeviceSymbols(assign->rhs) == 1) { + return; // This is a special case handled on the host. + } + context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US); + } } } // namespace Fortran::semantics diff --git a/flang/lib/Semantics/check-cuda.h b/flang/lib/Semantics/check-cuda.h index 222a2ee..1000025 100644 --- a/flang/lib/Semantics/check-cuda.h +++ b/flang/lib/Semantics/check-cuda.h @@ -41,10 +41,18 @@ public: void Enter(const parser::CUFKernelDoConstruct &); void Leave(const parser::CUFKernelDoConstruct &); void Enter(const parser::AssignmentStmt &); + void Enter(const parser::OpenACCBlockConstruct &); + void Leave(const parser::OpenACCBlockConstruct &); + void Enter(const parser::OpenACCCombinedConstruct &); + void Leave(const parser::OpenACCCombinedConstruct &); + void Enter(const parser::OpenACCLoopConstruct &); + void Leave(const parser::OpenACCLoopConstruct &); + void Enter(const parser::DoConstruct &); + void Leave(const parser::DoConstruct &); private: SemanticsContext &context_; - bool inCUFKernelDoConstruct_ = false; + int deviceConstructDepth_{0}; }; bool CanonicalizeCUDA(parser::Program &); diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index d0032af..68a0202f 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -403,3 +403,19 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub20() ! CHECK-NOT: cuf.data_transfer ! CHECK: hlfir.assign + +subroutine sub21() + real, allocatable,device:: a(:,:), b(:,:) + real:: s + integer:: i,j,N=16 + allocate(a(N,N),b(N,N)) + do concurrent(i=1:N, j=1:N) reduce(+:s) + b(i,j)=a(i,j)**2 + s=s+b(i,j) + end do +end subroutine + +! CHECK-LABEL: func.func @_QPsub21() +! CHECK: fir.do_concurrent.loop +! CHECK-NOT: cuf.data_transfer +! CHECK: hlfir.assign diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index 59372a8..d216069 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -224,6 +224,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda", llvm::cl::init(false)); static llvm::cl::opt<bool> + enableDoConcurrentOffload("fdoconcurrent-offload", + llvm::cl::desc("enable do concurrent offload"), + llvm::cl::init(false)); + +static llvm::cl::opt<bool> disableCUDAWarpFunction("fcuda-disable-warp-function", llvm::cl::desc("Disable CUDA Warp Function"), llvm::cl::init(false)); @@ -608,6 +613,11 @@ int main(int argc, char **argv) { options.features.Enable(Fortran::common::LanguageFeature::CUDA); } + if (enableDoConcurrentOffload) { + options.features.Enable( + Fortran::common::LanguageFeature::DoConcurrentOffload); + } + if (disableCUDAWarpFunction) { options.features.Enable( Fortran::common::LanguageFeature::CudaWarpMatchFunction, false); |