Skip to content

Commit b510e50

Browse files
authored
[CIR][CUDA] Handle shared and local variables (#1368)
CUDA shared variables are device-only, accessible from all threads in a block of some kernel. It's similar to `local` variables in OpenCL which all threads in a work-group can access. Hence they are realized as `static` variables in addrspace(local). On the other hand, the local variables inside a kernel (without special attributes) are just regular variables, typically emitted by `CreateTempAlloca`. They are in the default address space. OG checks if the expected address space, denoted by the type, is the same as the actual address space indicated by attributes. If they aren't the same, a `addrspacecast` is emitted when a global variable is accessed. In CIR however, `cir.get_global` alreadys carries that information in `!cir.ptr` type, so we don't need a cast.
1 parent 2b94285 commit b510e50

File tree

4 files changed

+45
-8
lines changed

4 files changed

+45
-8
lines changed

clang/lib/CIR/CodeGen/CIRGenDecl.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -479,9 +479,10 @@ CIRGenModule::getOrCreateStaticVarDecl(const VarDecl &D,
479479
// OpenCL variables in local address space and CUDA shared
480480
// variables cannot have an initializer.
481481
mlir::Attribute Init = nullptr;
482-
if (D.hasAttr<CUDASharedAttr>() || D.hasAttr<LoaderUninitializedAttr>())
482+
if (D.hasAttr<LoaderUninitializedAttr>())
483483
llvm_unreachable("CUDA is NYI");
484-
else if (Ty.getAddressSpace() != LangAS::opencl_local)
484+
else if (Ty.getAddressSpace() != LangAS::opencl_local &&
485+
!D.hasAttr<CUDASharedAttr>())
485486
Init = builder.getZeroInitAttr(convertType(Ty));
486487

487488
cir::GlobalOp GV = builder.createVersionedGlobal(
@@ -499,9 +500,14 @@ CIRGenModule::getOrCreateStaticVarDecl(const VarDecl &D,
499500

500501
setGVProperties(GV, &D);
501502

502-
// Make sure the result is of the correct type.
503-
if (AS != builder.getAddrSpaceAttr(Ty.getAddressSpace()))
504-
llvm_unreachable("address space cast NYI");
503+
// OG checks if the expected address space, denoted by the type, is the
504+
// same as the actual address space indicated by attributes. If they aren't
505+
// the same, an addrspacecast is emitted when this variable is accessed.
506+
// In CIR however, cir.get_global alreadys carries that information in
507+
// !cir.ptr type - if this global is in OpenCL local address space, then its
508+
// type would be !cir.ptr<..., addrspace(offload_local)>. Therefore we don't
509+
// need an explicit address space cast in CIR: they will get emitted when
510+
// lowering to LLVM IR.
505511

506512
// Ensure that the static local gets initialized by making sure the parent
507513
// function gets emitted eventually.

clang/lib/CIR/CodeGen/CIRGenModule.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1604,7 +1604,14 @@ LangAS CIRGenModule::getGlobalConstantAddressSpace() const {
16041604
LangAS CIRGenModule::getLangTempAllocaAddressSpace() const {
16051605
if (getLangOpts().OpenCL)
16061606
return LangAS::opencl_private;
1607-
if (getLangOpts().SYCLIsDevice || getLangOpts().CUDAIsDevice ||
1607+
1608+
// For temporaries inside functions, CUDA treats them as normal variables.
1609+
// LangAS::cuda_device, on the other hand, is reserved for those variables
1610+
// explicitly marked with __device__.
1611+
if (getLangOpts().CUDAIsDevice)
1612+
return LangAS::Default;
1613+
1614+
if (getLangOpts().SYCLIsDevice ||
16081615
(getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice))
16091616
llvm_unreachable("NYI");
16101617
return LangAS::Default;
@@ -3983,8 +3990,11 @@ LangAS CIRGenModule::getGlobalVarAddressSpace(const VarDecl *D) {
39833990
(!D || D->getType().getAddressSpace() == LangAS::Default))
39843991
llvm_unreachable("NYI");
39853992

3986-
if (langOpts.CUDA && langOpts.CUDAIsDevice)
3993+
if (langOpts.CUDA && langOpts.CUDAIsDevice) {
3994+
if (D && D->hasAttr<CUDASharedAttr>())
3995+
return LangAS::cuda_shared;
39873996
llvm_unreachable("NYI");
3997+
}
39883998

39893999
if (langOpts.OpenMP)
39904000
llvm_unreachable("NYI");

clang/lib/CIR/Dialect/IR/CIRAttrs.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,9 @@ AddressSpaceAttr::getValueFromLangAS(clang::LangAS langAS) {
645645
case LangAS::opencl_global:
646646
return Kind::offload_global;
647647
case LangAS::opencl_local:
648+
case LangAS::cuda_shared:
649+
// Local means local among the work-group (OpenCL) or block (CUDA).
650+
// All threads inside the kernel can access local memory.
648651
return Kind::offload_local;
649652
case LangAS::opencl_constant:
650653
return Kind::offload_constant;
@@ -657,7 +660,6 @@ AddressSpaceAttr::getValueFromLangAS(clang::LangAS langAS) {
657660
case LangAS::opencl_global_host:
658661
case LangAS::cuda_device:
659662
case LangAS::cuda_constant:
660-
case LangAS::cuda_shared:
661663
case LangAS::sycl_global:
662664
case LangAS::sycl_global_device:
663665
case LangAS::sycl_global_host:
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#include "../Inputs/cuda.h"
2+
3+
// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
4+
// RUN: -fcuda-is-device -emit-cir -target-sdk-version=12.3 \
5+
// RUN: %s -o %t.cir
6+
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
7+
8+
__global__ void fn() {
9+
int i = 0;
10+
__shared__ int j;
11+
j = i;
12+
}
13+
14+
// CIR: cir.global "private" internal dsolocal addrspace(offload_local) @_ZZ2fnvE1j : !s32i
15+
// CIR: cir.func @_Z2fnv
16+
// CIR: [[Local:%[0-9]+]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
17+
// CIR: [[Shared:%[0-9]+]] = cir.get_global @_ZZ2fnvE1j : !cir.ptr<!s32i, addrspace(offload_local)>
18+
// CIR: [[Tmp:%[0-9]+]] = cir.load [[Local]] : !cir.ptr<!s32i>, !s32i
19+
// CIR: cir.store [[Tmp]], [[Shared]] : !s32i, !cir.ptr<!s32i, addrspace(offload_local)>

0 commit comments

Comments
 (0)