author     Yaxun (Sam) Liu <yaxun.liu@amd.com>    2025-04-23 12:50:28 -0400
committer  GitHub <noreply@github.com>            2025-04-23 12:50:28 -0400
commit     83c309b90550aa768ff9aa11b70898ee2c56b71e (patch)
tree       25ca8d7844eb8491a36ad657547fb7195813a862
parent     d7215c0ee2e4bca1ce87b956335ef6a2cddaf16f (diff)
[CUDA][HIP] capture possible ODR-used var (#136645)
In a lambda, a call may resolve to host and device overloads with
different signatures. In particular, a constexpr local variable may be
passed by value to the device overload but by reference to the host
overload, so the variable is captured by the lambda in the host
compilation but not in the device compilation. The resulting
discrepancy in the lambda captures violates the ODR and causes
undefined behavior for kernels that use such lambdas.

This PR fixes the issue by identifying constexpr local variables whose
ODR/non-ODR use differs between the host and device overloads they are
passed to, and capturing them conservatively.
Fixes: https://github.com/llvm/llvm-project/issues/132068
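
To make the failure mode concrete, here is a minimal sketch of the pattern this patch addresses, adapted from the DevByVal case in the new test below (the fun overloads, small, and the kernel harness follow that test; only the comments are added):

#include "Inputs/cuda.h"

// The device overload takes its arguments by value; the host overload
// takes them by const reference.
__device__ float fun(float x, float y) { return x; }
float fun(const float &x, const float &y) { return x; }

template <typename F> __global__ void kernel(F f) { f(1); }

void test(const float *fl, const float *A, float *Vf) {
  float constexpr small(1.0e-25);

  // Host compilation resolves fun() to the by-reference overload and
  // odr-uses `small`, so the lambda captures it; device compilation
  // resolves to the by-value overload and, before this patch, did not
  // capture it. The two compilations then disagree on the lambda's
  // capture list and layout, which is an ODR violation for the kernel.
  auto lambda = [=] __device__ __host__(unsigned int n) {
    float const value = fun(small, fl[0]);
    Vf[0] = value * A[0];
  };
  kernel<<<1, 1>>>(lambda);
}

With this change, small is recorded as potentially ODR-used and captured by value in both compilations, so the host and device lambda layouts match (see the %class.anon CHECK lines in the new test).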
-rw-r--r--  clang/include/clang/Sema/ScopeInfo.h                 |   3
-rw-r--r--  clang/include/clang/Sema/SemaCUDA.h                  |   4
-rw-r--r--  clang/lib/Sema/SemaCUDA.cpp                          |  47
-rw-r--r--  clang/lib/Sema/SemaExpr.cpp                          |  24
-rw-r--r--  clang/lib/Sema/SemaOverload.cpp                      |   2
-rw-r--r--  clang/test/CodeGenCUDA/lambda-constexpr-capture.cu   | 135
6 files changed, 212 insertions(+), 3 deletions(-)
diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h
index 958d650..6bf9ae8 100644
--- a/clang/include/clang/Sema/ScopeInfo.h
+++ b/clang/include/clang/Sema/ScopeInfo.h
@@ -949,6 +949,9 @@ public:
 
   SourceLocation PotentialThisCaptureLocation;
 
+  /// Variables that are potentially ODR-used in CUDA/HIP.
+  llvm::SmallPtrSet<VarDecl *, 4> CUDAPotentialODRUsedVars;
+
   LambdaScopeInfo(DiagnosticsEngine &Diag)
       : CapturingScopeInfo(Diag, ImpCap_None) {
     Kind = SK_Lambda;
diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h
index 71f05e8..dbc14328 100644
--- a/clang/include/clang/Sema/SemaCUDA.h
+++ b/clang/include/clang/Sema/SemaCUDA.h
@@ -274,6 +274,10 @@ public:
   /// parameters specified via <<<>>>.
   std::string getConfigureFuncName() const;
 
+  /// Record variables that are potentially ODR-used in CUDA/HIP.
+  void recordPotentialODRUsedVariable(MultiExprArg Args,
+                                      OverloadCandidateSet &CandidateSet);
+
 private:
   unsigned ForceHostDeviceDepth = 0;
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 0e5fc5e..0a8c24f 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -18,6 +18,7 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
+#include "clang/Sema/Overload.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/Template.h"
@@ -1100,3 +1101,49 @@ std::string SemaCUDA::getConfigureFuncName() const {
   // Legacy CUDA kernel configuration call
   return "cudaConfigureCall";
 }
+
+// Record any local constexpr variables that are passed one way on the host
+// and another on the device.
+void SemaCUDA::recordPotentialODRUsedVariable(
+    MultiExprArg Arguments, OverloadCandidateSet &Candidates) {
+  sema::LambdaScopeInfo *LambdaInfo = SemaRef.getCurLambda();
+  if (!LambdaInfo)
+    return;
+
+  for (unsigned I = 0; I < Arguments.size(); ++I) {
+    auto *DeclRef = dyn_cast<DeclRefExpr>(Arguments[I]);
+    if (!DeclRef)
+      continue;
+    auto *Variable = dyn_cast<VarDecl>(DeclRef->getDecl());
+    if (!Variable || !Variable->isLocalVarDecl() || !Variable->isConstexpr())
+      continue;
+
+    bool HostByValue = false, HostByRef = false;
+    bool DeviceByValue = false, DeviceByRef = false;
+
+    for (OverloadCandidate &Candidate : Candidates) {
+      FunctionDecl *Callee = Candidate.Function;
+      if (!Callee || I >= Callee->getNumParams())
+        continue;
+
+      CUDAFunctionTarget Target = IdentifyTarget(Callee);
+      if (Target == CUDAFunctionTarget::InvalidTarget ||
+          Target == CUDAFunctionTarget::Global)
+        continue;
+
+      bool CoversHost = (Target == CUDAFunctionTarget::Host ||
+                         Target == CUDAFunctionTarget::HostDevice);
+      bool CoversDevice = (Target == CUDAFunctionTarget::Device ||
+                           Target == CUDAFunctionTarget::HostDevice);
+
+      bool IsRef = Callee->getParamDecl(I)->getType()->isReferenceType();
+      HostByValue |= CoversHost && !IsRef;
+      HostByRef |= CoversHost && IsRef;
+      DeviceByValue |= CoversDevice && !IsRef;
+      DeviceByRef |= CoversDevice && IsRef;
+    }
+
+    if ((HostByValue && DeviceByRef) || (HostByRef && DeviceByValue))
+      LambdaInfo->CUDAPotentialODRUsedVars.insert(Variable);
+  }
+}
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 2e6ce17..4186999 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -19517,11 +19517,29 @@ static ExprResult rebuildPotentialResultsAsNonOdrUsed(Sema &S, Expr *E,
     return false;
   };
 
+  // Check whether this expression may be odr-used in CUDA/HIP.
+  auto MaybeCUDAODRUsed = [&]() -> bool {
+    if (!S.LangOpts.CUDA)
+      return false;
+    LambdaScopeInfo *LSI = S.getCurLambda();
+    if (!LSI)
+      return false;
+    auto *DRE = dyn_cast<DeclRefExpr>(E);
+    if (!DRE)
+      return false;
+    auto *VD = dyn_cast<VarDecl>(DRE->getDecl());
+    if (!VD)
+      return false;
+    return LSI->CUDAPotentialODRUsedVars.count(VD);
+  };
+
   // Mark that this expression does not constitute an odr-use.
   auto MarkNotOdrUsed = [&] {
-    S.MaybeODRUseExprs.remove(E);
-    if (LambdaScopeInfo *LSI = S.getCurLambda())
-      LSI->markVariableExprAsNonODRUsed(E);
+    if (!MaybeCUDAODRUsed()) {
+      S.MaybeODRUseExprs.remove(E);
+      if (LambdaScopeInfo *LSI = S.getCurLambda())
+        LSI->markVariableExprAsNonODRUsed(E);
+    }
   };
 
   // C++2a [basic.def.odr]p2:
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 5b224b6..042de8d 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -14706,6 +14706,8 @@ ExprResult Sema::BuildOverloadedCallExpr(Scope *S, Expr *Fn,
   // the UnresolvedLookupExpr was type-dependent.
   if (OverloadResult == OR_Success) {
     const FunctionDecl *FDecl = Best->Function;
+    if (LangOpts.CUDA)
+      CUDA().recordPotentialODRUsedVariable(Args, CandidateSet);
     if (FDecl && FDecl->isTemplateInstantiation() &&
         FDecl->getReturnType()->isUndeducedType()) {
diff --git a/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu b/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu
new file mode 100644
index 0000000..1a1db63
--- /dev/null
+++ b/clang/test/CodeGenCUDA/lambda-constexpr-capture.cu
@@ -0,0 +1,135 @@
+// RUN: %clang_cc1 -emit-llvm -x hip %s -o - -triple x86_64-linux-gnu \
+// RUN:   | FileCheck -check-prefixes=CHECK,HOST %s
+// RUN: %clang_cc1 -emit-llvm -x hip %s -o - -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   | FileCheck -check-prefixes=CHECK,DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %class.anon = type { ptr, float, ptr, ptr }
+// CHECK: %class.anon.0 = type { ptr, float, ptr, ptr }
+// CHECK: %class.anon.1 = type { ptr, ptr, ptr }
+// CHECK: %class.anon.2 = type { ptr, float, ptr, ptr }
+
+// HOST: call void @_ZN8DevByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon)
+// DEV: define amdgpu_kernel void @_ZN8DevByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon)
+
+// Only the device function passes arguments by value.
+namespace DevByVal {
+__device__ float fun(float x, float y) {
+  return x;
+}
+
+float fun(const float &x, const float &y) {
+  return x;
+}
+
+template<typename F>
+void __global__ kernel(F f)
+{
+  f(1);
+}
+
+void test(float const * fl, float const * A, float * Vf)
+{
+  float constexpr small(1.0e-25);
+
+  auto lambda = [=] __device__ __host__ (unsigned int n) {
+    float const value = fun(small, fl[0]);
+    Vf[0] = value * A[0];
+  };
+  kernel<<<1, 1>>>(lambda);
+}
+}
+
+// HOST: call void @_ZN9HostByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.0)
+// DEV: define amdgpu_kernel void @_ZN9HostByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.0)
+
+// Only the host function passes arguments by value.
+namespace HostByVal {
+float fun(float x, float y) {
+  return x;
+}
+
+__device__ float fun(const float &x, const float &y) {
+  return x;
+}
+
+template<typename F>
+void __global__ kernel(F f)
+{
+  f(1);
+}
+
+void test(float const * fl, float const * A, float * Vf)
+{
+  float constexpr small(1.0e-25);
+
+  auto lambda = [=] __device__ __host__ (unsigned int n) {
+    float const value = fun(small, fl[0]);
+    Vf[0] = value * A[0];
+  };
+  kernel<<<1, 1>>>(lambda);
+}
+}
+
+// HOST: call void @_ZN9BothByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.1)
+// DEV: define amdgpu_kernel void @_ZN9BothByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.1)
+
+// Both the host and device functions pass arguments by value.
+namespace BothByVal {
+float fun(float x, float y) {
+  return x;
+}
+
+__device__ float fun(float x, float y) {
+  return x;
+}
+
+template<typename F>
+void __global__ kernel(F f)
+{
+  f(1);
+}
+
+void test(float const * fl, float const * A, float * Vf)
+{
+  float constexpr small(1.0e-25);
+
+  auto lambda = [=] __device__ __host__ (unsigned int n) {
+    float const value = fun(small, fl[0]);
+    Vf[0] = value * A[0];
+  };
+  kernel<<<1, 1>>>(lambda);
+}
+}
+
+// HOST: call void @_ZN12NeitherByVal21__device_stub__kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr noundef byval(%class.anon.2)
+// DEV: define amdgpu_kernel void @_ZN12NeitherByVal6kernelIZNS_4testEPKfS2_PfEUljE_EEvT_(ptr addrspace(4) noundef byref(%class.anon.2)
+
+// Neither the host nor device function passes arguments by value.
+namespace NeitherByVal {
+float fun(const float& x, const float& y) {
+  return x;
+}
+
+__device__ float fun(const float& x, const float& y) {
+  return x;
+}
+
+template<typename F>
+void __global__ kernel(F f)
+{
+  f(1);
+}
+
+void test(float const * fl, float const * A, float * Vf)
+{
+  float constexpr small(1.0e-25);
+
+  auto lambda = [=] __device__ __host__ (unsigned int n) {
+    float const value = fun(small, fl[0]);
+    Vf[0] = value * A[0];
+  };
+  kernel<<<1, 1>>>(lambda);
+}
+}