Generate HIP global symbol stub redirection

koparasy · koparasy · commit 1b14d8549507 · 2025-02-14T09:42:57.000-08:00
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
@@ -14,14 +14,35 @@
 
 #include "CIRGenCUDARuntime.h"
 #include "CIRGenFunction.h"
+#include "mlir/IR/Operation.h"
 #include "clang/Basic/Cuda.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include <iostream>
 
 using namespace clang;
 using namespace clang::CIRGen;
 
 CIRGenCUDARuntime::~CIRGenCUDARuntime() {}
 
+CIRGenCUDARuntime::CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {
+  if (cgm.getLangOpts().OffloadViaLLVM)
+    llvm_unreachable("NYI");
+  else if (cgm.getLangOpts().HIP)
+    Prefix = "hip";
+  else
+    Prefix = "cuda";
+}
+
+std::string CIRGenCUDARuntime::addPrefixToName(StringRef FuncName) const {
+  return (Prefix + FuncName).str();
+}
+std::string
+CIRGenCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
+  return ("__" + Prefix + FuncName).str();
+}
+
 void CIRGenCUDARuntime::emitDeviceStubBodyLegacy(CIRGenFunction &cgf,
                                                  cir::FuncOp fn,
                                                  FunctionArgList &args) {
@@ -31,16 +52,14 @@ void CIRGenCUDARuntime::emitDeviceStubBodyLegacy(CIRGenFunction &cgf,
 void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
                                               cir::FuncOp fn,
                                               FunctionArgList &args) {
-  if (cgm.getLangOpts().HIP)
-    llvm_unreachable("NYI");
 
   // This requires arguments to be sent to kernels in a different way.
   if (cgm.getLangOpts().OffloadViaLLVM)
     llvm_unreachable("NYI");
 
   auto &builder = cgm.getBuilder();
 
-  // For cudaLaunchKernel, we must add another layer of indirection
+  // For [cuda|hip]LaunchKernel, we must add another layer of indirection
   // to arguments. For example, for function `add(int a, float b)`,
   // we need to pass it as `void *args[2] = { &a, &b }`.
 
@@ -71,7 +90,8 @@ void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
       LangOptions::GPUDefaultStreamKind::PerThread)
     llvm_unreachable("NYI");
 
-  std::string launchAPI = "cudaLaunchKernel";
+  std::string launchAPI = addPrefixToName("LaunchKernel");
+  std::cout << "LaunchAPI is " << launchAPI << "\n";
   const IdentifierInfo &launchII = cgm.getASTContext().Idents.get(launchAPI);
   FunctionDecl *launchFD = nullptr;
   for (auto *result : dc->lookup(&launchII)) {
@@ -86,11 +106,11 @@ void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
   }
 
   // Use this function to retrieve arguments for cudaLaunchKernel:
-  // int __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
+  // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
   //                                *sharedMem, cudaStream_t *stream)
   //
-  // Here cudaStream_t, while also being the 6th argument of cudaLaunchKernel,
-  // is a pointer to some opaque struct.
+  // Here [cuda|hip]Stream_t, while also being the 6th argument of
+  // [cuda|hip]LaunchKernel, is a pointer to some opaque struct.
 
   mlir::Type dim3Ty =
       cgf.getTypes().convertType(launchFD->getParamDecl(1)->getType());
@@ -114,26 +134,45 @@ void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
       cir::FuncType::get({gridDim.getType(), blockDim.getType(),
                           sharedMem.getType(), stream.getType()},
                          cgm.SInt32Ty),
-      "__cudaPopCallConfiguration");
+      addUnderscoredPrefixToName("PopCallConfiguration"));
   cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
 
   // Now emit the call to cudaLaunchKernel
-  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
+  // dim3 blockDim,
   //                              void **args, size_t sharedMem,
-  //                              cudaStream_t stream);
-  auto kernelTy =
-      cir::PointerType::get(&cgm.getMLIRContext(), fn.getFunctionType());
+  //                              [cuda|hip]Stream_t stream);
 
-  mlir::Value kernel =
-      builder.create<cir::GetGlobalOp>(loc, kernelTy, fn.getSymName());
-  mlir::Value func = builder.createBitcast(kernel, cgm.VoidPtrTy);
+  // We now either pick the function or the stub global for cuda, hip
+  // resepectively.
+  auto kernel = [&]() {
+    if (auto globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
+            KernelHandles[fn.getSymName()])) {
+      auto kernelTy =
+          cir::PointerType::get(&cgm.getMLIRContext(), globalOp.getSymType());
+      mlir::Value kernel = builder.create<cir::GetGlobalOp>(
+          loc, kernelTy, globalOp.getSymName());
+      return kernel;
+    }
+    if (auto funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
+            KernelHandles[fn.getSymName()])) {
+      auto kernelTy = cir::PointerType::get(&cgm.getMLIRContext(),
+                                            funcOp.getFunctionType());
+      mlir::Value kernel =
+          builder.create<cir::GetGlobalOp>(loc, kernelTy, funcOp.getSymName());
+      mlir::Value func = builder.createBitcast(kernel, cgm.VoidPtrTy);
+      return func;
+    }
+    assert(false && "Expected stub handle to be cir::GlobalOp or funcOp");
+  }();
+  // mlir::Value func = builder.createBitcast(kernel, cgm.VoidPtrTy);
   CallArgList launchArgs;
 
   mlir::Value kernelArgsDecayed =
       builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
                          cir::PointerType::get(cgm.VoidPtrTy));
 
-  launchArgs.add(RValue::get(func), launchFD->getParamDecl(0)->getType());
+  launchArgs.add(RValue::get(kernel), launchFD->getParamDecl(0)->getType());
   launchArgs.add(
       RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
       launchFD->getParamDecl(1)->getType());
@@ -157,13 +196,16 @@ void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
 
 void CIRGenCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
                                        FunctionArgList &args) {
-  // Device stub and its handle might be different.
-  if (cgm.getLangOpts().HIP)
-    llvm_unreachable("NYI");
-
+  if (auto globalOp =
+          llvm::dyn_cast<cir::GlobalOp>(KernelHandles[fn.getSymName()])) {
+    auto symbol = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
+    // Set the initializer for the global
+    cgm.setInitializer(globalOp, symbol);
+  }
   // CUDA 9.0 changed the way to launch kernels.
   if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
+      (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
       cgm.getLangOpts().OffloadViaLLVM)
     emitDeviceStubBodyNew(cgf, fn, args);
   else
@@ -189,3 +231,57 @@ RValue CIRGenCUDARuntime::emitCUDAKernelCallExpr(CIRGenFunction &cgf,
 
   return RValue::get(nullptr);
 }
+
+mlir::Operation *CIRGenCUDARuntime::getKernelHandle(cir::FuncOp fn,
+                                                    GlobalDecl GD) {
+
+  // Check if we already have a kernel handle for this function
+  auto Loc = KernelHandles.find(fn.getSymName());
+  if (Loc != KernelHandles.end()) {
+    auto OldHandle = Loc->second;
+    // Here we know that the fn did not change. Return it
+    if (KernelStubs[OldHandle] == fn)
+      return OldHandle;
+
+    // We've found the function name, but F itself has changed, so we need to
+    // update the references.
+    if (cgm.getLangOpts().HIP) {
+      // For HIP compilation the handle itself does not change, so we only need
+      // to update the Stub value.
+      KernelStubs[OldHandle] = fn;
+      return OldHandle;
+    }
+    // For non-HIP compilation, erase the old Stub and fall-through to creating
+    // new entries.
+    KernelStubs.erase(OldHandle);
+  }
+
+  // If not targeting HIP, store the function itself
+  if (!cgm.getLangOpts().HIP) {
+    KernelHandles[fn.getSymName()] = fn;
+    KernelStubs[fn] = fn;
+    return fn;
+  }
+
+  // Create a new CIR global variable to represent the kernel handle
+  auto &builder = cgm.getBuilder();
+  auto globalName = cgm.getMangledName(
+      GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
+  auto globalOp = cgm.getOrInsertGlobal(
+      fn->getLoc(), globalName, fn.getFunctionType(), [&] {
+        return CIRGenModule::createGlobalOp(
+            cgm, fn->getLoc(), globalName,
+            builder.getPointerTo(fn.getFunctionType()), true, /* addrSpace=*/{},
+            /*insertPoint=*/nullptr, fn.getLinkage());
+      });
+
+  globalOp->setAttr("alignment", builder.getI64IntegerAttr(
+                                     cgm.getPointerAlign().getQuantity()));
+  globalOp->setAttr("visibility", fn->getAttr("sym_visibility"));
+
+  // Store references
+  KernelHandles[fn.getSymName()] = globalOp;
+  KernelStubs[globalOp] = fn;
+
+  return globalOp;
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
@@ -29,15 +29,26 @@ class ReturnValueSlot;
 class CIRGenCUDARuntime {
 protected:
   CIRGenModule &cgm;
+  StringRef Prefix;
+
+  // Map a device stub function to a symbol for identifying kernel in host code.
+  // For CUDA, the symbol for identifying the kernel is the same as the device
+  // stub function. For HIP, they are different.
+  llvm::DenseMap<StringRef, mlir::Operation *> KernelHandles;
+
+  // Map a kernel handle to the kernel stub.
+  llvm::DenseMap<mlir::Operation *, mlir::Operation *> KernelStubs;
 
 private:
   void emitDeviceStubBodyLegacy(CIRGenFunction &cgf, cir::FuncOp fn,
                                 FunctionArgList &args);
   void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
                              FunctionArgList &args);
+  std::string addPrefixToName(StringRef FuncName) const;
+  std::string addUnderscoredPrefixToName(StringRef FuncName) const;
 
 public:
-  CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {}
+  CIRGenCUDARuntime(CIRGenModule &cgm);
   virtual ~CIRGenCUDARuntime();
 
   virtual void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
@@ -46,6 +57,7 @@ class CIRGenCUDARuntime {
   virtual RValue emitCUDAKernelCallExpr(CIRGenFunction &cgf,
                                         const CUDAKernelCallExpr *expr,
                                         ReturnValueSlot retValue);
+  virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD);
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -651,9 +651,10 @@ void CIRGenModule::emitGlobalFunctionDefinition(GlobalDecl GD,
 
   // Get or create the prototype for the function.
   auto Fn = dyn_cast_if_present<cir::FuncOp>(Op);
-  if (!Fn || Fn.getFunctionType() != Ty)
-    Fn = GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/true,
-                           ForDefinition);
+  if (!Fn || Fn.getFunctionType() != Ty) {
+    Fn = GetAddrOfFunction(GD, Ty, /*ForVTable=*/false,
+                           /*DontDefer=*/true, ForDefinition);
+  }
 
   // Already emitted.
   if (!Fn.isDeclaration())
@@ -2356,10 +2357,17 @@ cir::FuncOp CIRGenModule::GetAddrOfFunction(clang::GlobalDecl GD, mlir::Type Ty,
 
   // As __global__ functions (kernels) always reside on device,
   // when we access them from host, we must refer to the kernel handle.
-  // For CUDA, it's just the device stub. For HIP, it's something different.
-  if (langOpts.CUDA && !langOpts.CUDAIsDevice && langOpts.HIP &&
+  // For HIP, we should never directly access the host device addr, but
+  // instead the Global Variable of that stub. For CUDA, it's just the device
+  // stub. For HIP, it's something different.
+  if ((langOpts.HIP || langOpts.CUDA) && !langOpts.CUDAIsDevice &&
       cast<FunctionDecl>(GD.getDecl())->hasAttr<CUDAGlobalAttr>()) {
-    llvm_unreachable("NYI");
+    auto *stubHandle = getCUDARuntime().getKernelHandle(F, GD);
+    if (IsForDefinition)
+      return F;
+
+    if (langOpts.HIP)
+      llvm_unreachable("NYI");
   }
 
   return F;
@@ -3169,15 +3177,15 @@ CIRGenModule::GetAddrOfGlobal(GlobalDecl GD, ForDefinition_t IsForDefinition) {
     auto FInfo =
         &getTypes().arrangeCXXMethodDeclaration(cast<CXXMethodDecl>(D));
     auto Ty = getTypes().GetFunctionType(*FInfo);
-    return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/false,
-                             IsForDefinition);
+    return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false,
+                             /*DontDefer=*/false, IsForDefinition);
   }
 
   if (isa<FunctionDecl>(D)) {
     const CIRGenFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
     auto Ty = getTypes().GetFunctionType(FI);
-    return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/false,
-                             IsForDefinition);
+    return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false,
+                             /*DontDefer=*/false, IsForDefinition);
   }
 
   return getAddrOfGlobalVar(cast<VarDecl>(D), /*Ty=*/nullptr, IsForDefinition)
diff --git a/clang/test/CIR/CodeGen/HIP/simple-device.cpp b/clang/test/CIR/CodeGen/HIP/simple-device.cpp
diff --git a/clang/test/CIR/CodeGen/HIP/simple.cpp b/clang/test/CIR/CodeGen/HIP/simple.cpp
@@ -1,16 +1,34 @@
 #include "../Inputs/cuda.h"
 
-// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
+// RUN:            -x hip -fhip-new-launch-api \
 // RUN:            -emit-cir %s -o %t.cir
-// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s
 
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip \
+// RUN:            -fcuda-is-device -fhip-new-launch-api \
+// RUN:              -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s
+
+// Attribute for global_fn
+// CIR-HOST: [[Kernel:#[a-zA-Z_0-9]+]] = {{.*}}#cir.cuda_kernel_name<_Z9global_fni>{{.*}}
 
-// This should emit as a normal C++ function.
-__host__ void host_fn(int *a, int *b, int *c) {}
 
-// CIR: cir.func @_Z7host_fnPiS_S_
+__host__ void host_fn(int *a, int *b, int *c) {}
+// CIR-HOST: cir.func @_Z7host_fnPiS_S_
+// CIR-DEVICE-NOT: cir.func @_Z7host_fnPiS_S_
 
-// This shouldn't emit.
 __device__ void device_fn(int* a, double b, float c) {}
+// CIR-HOST-NOT: cir.func @_Z9device_fnPidf
+// CIR-DEVICE: cir.func @_Z9device_fnPidf
+
+__global__ void global_fn(int a) {}
+// CIR-DEVICE: @_Z9global_fni
+
+// CIR-HOST: cir.alloca {{.*}}"kernel_args"
+// CIR-HOST: cir.call @__hipPopCallConfiguration
 
-// CHECK-NOT: cir.func @_Z9device_fnPidf
+// Host access the global stub instead of the functiond evice stub.
+// The stub has the mangled name of the function
+// CIR-HOST: cir.get_global @_Z9global_fni
+// CIR-HOST: cir.call @hipLaunchKernel