diff --git a/src/thorin/be/c/c.cpp b/src/thorin/be/c/c.cpp
index f8a6c510d..c3430f524 100644
--- a/src/thorin/be/c/c.cpp
+++ b/src/thorin/be/c/c.cpp
@@ -512,6 +512,9 @@ void CCodeGen::emit_module() {
             stream_.fmt("__device__ inline int blockDim_{}() {{ return blockDim.{}; }}\n", x, x);
             stream_.fmt("__device__ inline int gridDim_{}() {{ return gridDim.{}; }}\n", x, x);
         }
+
+        stream_.fmt("\n"
+                    "extern __shared__ unsigned char __dynamic_smem[];\n");
     }
 
     stream_.endl() << func_impls_.str();
@@ -742,7 +745,10 @@ void CCodeGen::emit_epilogue(Continuation* cont) {
         bb.tail.fmt("goto {};", label_name(callee));
     } else if (auto callee = body->callee()->isa_nom<Continuation>(); callee && callee->is_intrinsic()) {
         if (callee->intrinsic() == Intrinsic::Reserve) {
+            assert(body->num_args() == 3 && "incorrect number of arguments");
+
             emit_unsafe(body->arg(0));
+
             if (!body->arg(1)->isa<PrimLit>())
                 world().edef(body->arg(1), "reserve_shared: couldn't extract memory size");
 
@@ -758,6 +764,16 @@ void CCodeGen::emit_epilogue(Continuation* cont) {
             }
             bb.tail.fmt("p_{} = {}_reserved;\n", ret_cont->param(1)->unique_name(), cont->unique_name());
             bb.tail.fmt("goto {};", label_name(ret_cont));
+        } else if (callee->intrinsic() == Intrinsic::LocalMemory) {
+            if (lang_ == Lang::HLS)
+                world().edef(body, "local_memory not supported for HLS");
+            assert(body->num_args() == 2 && "incorrect number of arguments");
+
+            emit_unsafe(body->arg(0));
+
+            auto ret_cont = body->arg(1)->as_nom<Continuation>();
+            bb.tail.fmt("p_{} = __dynamic_smem;\n", ret_cont->param(1)->unique_name());
+            bb.tail.fmt("goto {};", label_name(ret_cont));
         } else if (callee->intrinsic() == Intrinsic::Pipeline) {
             assert((lang_ == Lang::OpenCL || lang_ == Lang::HLS) && "pipelining not supported on this backend");
 
@@ -1440,6 +1456,12 @@ std::string CCodeGen::emit_fun_head(Continuation* cont, bool is_proto) {
         }
         needs_comma = true;
     }
+
+    if (cont->is_exported() && lang_ == Lang::OpenCL) {
+        if (needs_comma) s.fmt(", ");
+        s.fmt("__local unsigned char* __dynamic_smem");
+    }
+
     s << ")";
     return s.str();
 }
diff --git a/src/thorin/be/codegen.h b/src/thorin/be/codegen.h
index c46cee7c0..3f18969cd 100644
--- a/src/thorin/be/codegen.h
+++ b/src/thorin/be/codegen.h
@@ -33,6 +33,7 @@ struct LaunchArgs {
         Device,
         Space,
         Config,
+        LocalMem,
         Body,
         Return,
         Num
diff --git a/src/thorin/be/llvm/amdgpu.cpp b/src/thorin/be/llvm/amdgpu.cpp
index d5e0d1d3a..36fed7cd2 100644
--- a/src/thorin/be/llvm/amdgpu.cpp
+++ b/src/thorin/be/llvm/amdgpu.cpp
@@ -76,4 +76,8 @@ llvm::Value* AMDGPUCodeGen::emit_reserve(llvm::IRBuilder<>& irbuilder, const Con
     return emit_reserve_shared(irbuilder, continuation, true);
 }
 
+llvm::Value* AMDGPUCodeGen::emit_local_memory(llvm::IRBuilder<>& irbuilder, const Continuation* continuation) {
+    return emit_local_memory_base_ptr(irbuilder, continuation);
+}
+
 }
diff --git a/src/thorin/be/llvm/amdgpu.h b/src/thorin/be/llvm/amdgpu.h
index 277b8d595..7715b3bee 100644
--- a/src/thorin/be/llvm/amdgpu.h
+++ b/src/thorin/be/llvm/amdgpu.h
@@ -21,6 +21,7 @@ class AMDGPUCodeGen : public CodeGen {
     llvm::Value* emit_global(const Global*) override;
     llvm::Value* emit_mathop(llvm::IRBuilder<>&, const MathOp*) override;
     llvm::Value* emit_reserve(llvm::IRBuilder<>&, const Continuation*) override;
+    llvm::Value* emit_local_memory(llvm::IRBuilder<>&, const Continuation*) override;
     std::string get_alloc_name() const override { return "malloc"; }
 
     const Cont2Config& kernel_config_;
diff --git a/src/thorin/be/llvm/llvm.cpp b/src/thorin/be/llvm/llvm.cpp
index ecea16edd..72470084c 100644
--- a/src/thorin/be/llvm/llvm.cpp
+++ b/src/thorin/be/llvm/llvm.cpp
@@ -1300,6 +1300,7 @@ std::vector<llvm::Value*> CodeGen::emit_intrinsic(llvm::IRBuilder<>& irbuilder,
         case Intrinsic::CmpXchgWeak:  return emit_cmpxchg(irbuilder, continuation, true);
         case Intrinsic::Fence:        emit_fence(irbuilder, continuation); break;
         case Intrinsic::Reserve:      return { emit_reserve(irbuilder, continuation) };
+        case Intrinsic::LocalMemory:  return { emit_local_memory(irbuilder, continuation) };
         case Intrinsic::CUDA:         runtime_->emit_host_code(*this, irbuilder, Runtime::CUDA_PLATFORM,   ".cu",     continuation); break;
         case Intrinsic::NVVM:         runtime_->emit_host_code(*this, irbuilder, Runtime::CUDA_PLATFORM,   ".nvvm",   continuation); break;
         case Intrinsic::OpenCL:       runtime_->emit_host_code(*this, irbuilder, Runtime::OPENCL_PLATFORM, ".cl",     continuation); break;
@@ -1420,7 +1421,7 @@ llvm::Value* CodeGen::emit_reserve(llvm::IRBuilder<>&, const Continuation* conti
 llvm::Value* CodeGen::emit_reserve_shared(llvm::IRBuilder<>& irbuilder, const Continuation* continuation, bool init_undef) {
     assert(continuation->has_body());
     auto body = continuation->body();
-    assert(body->num_args() == 3 && "required arguments are missing");
+    assert(body->num_args() == 3 && "incorrect number of arguments");
     if (!body->arg(1)->isa<PrimLit>())
         world().edef(body->arg(1), "reserve_shared: couldn't extract memory size");
     auto num_elems = body->arg(1)->as<PrimLit>()->ps32_value();
@@ -1437,6 +1438,28 @@ llvm::Value* CodeGen::emit_reserve_shared(llvm::IRBuilder<>& irbuilder, const Co
     return call;
 }
 
+llvm::Value* CodeGen::emit_local_memory(llvm::IRBuilder<>&, const Continuation* continuation) {
+    world().edef(continuation, "local_memory: only allowed in device code");
+    THORIN_UNREACHABLE;
+}
+
+llvm::Value* CodeGen::emit_local_memory_base_ptr(llvm::IRBuilder<>& irbuilder, const Continuation* continuation) {
+    static constexpr auto name = "__dynamic_smem";
+
+    assert(continuation->has_body());
+    auto body = continuation->body();
+    assert(body->num_args() == 2 && "incorrect number of arguments");
+    auto cont = body->arg(1)->as_nom<Continuation>();
+
+    if (auto found = module().getGlobalVariable(name))
+        return found;
+
+    auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(context()), 0);
+    auto global = new llvm::GlobalVariable(module(), type, false, llvm::GlobalValue::ExternalLinkage, nullptr, name, nullptr, llvm::GlobalVariable::NotThreadLocal, 3);
+    global->setAlignment(llvm::Align(16));
+    return global;
+}
+
 /*
  * backend-specific stuff
  */
diff --git a/src/thorin/be/llvm/llvm.h b/src/thorin/be/llvm/llvm.h
index 7d5414331..596ff9f54 100644
--- a/src/thorin/be/llvm/llvm.h
+++ b/src/thorin/be/llvm/llvm.h
@@ -85,6 +85,8 @@ class CodeGen : public thorin::CodeGen, public thorin::Emitter<llvm::Value*, llv
 
     virtual llvm::Value* emit_reserve(llvm::IRBuilder<>&, const Continuation*);
     llvm::Value* emit_reserve_shared(llvm::IRBuilder<>&, const Continuation*, bool=false);
+    virtual llvm::Value* emit_local_memory(llvm::IRBuilder<>&, const Continuation*);
+    llvm::Value* emit_local_memory_base_ptr(llvm::IRBuilder<>& irbuilder, const Continuation* continuation);
 
     virtual std::string get_alloc_name() const = 0;
     llvm::BasicBlock* cont2bb(Continuation* cont) { return cont2bb_[cont].first; }
diff --git a/src/thorin/be/llvm/nvvm.cpp b/src/thorin/be/llvm/nvvm.cpp
index 88c488da8..00255d651 100644
--- a/src/thorin/be/llvm/nvvm.cpp
+++ b/src/thorin/be/llvm/nvvm.cpp
@@ -246,6 +246,10 @@ llvm::Value* NVVMCodeGen::emit_reserve(llvm::IRBuilder<>& irbuilder, const Conti
     return emit_reserve_shared(irbuilder, continuation);
 }
 
+llvm::Value* NVVMCodeGen::emit_local_memory(llvm::IRBuilder<>& irbuilder, const Continuation* continuation) {
+    return emit_local_memory_base_ptr(irbuilder, continuation);
+}
+
 llvm::Value* NVVMCodeGen::emit_mathop(llvm::IRBuilder<>& irbuilder, const MathOp* mathop) {
     auto make_key = [] (MathOpTag tag, unsigned bitwidth) { return (static_cast<unsigned>(tag) << 16) | bitwidth; };
     static const std::unordered_map<unsigned, std::string> libdevice_functions = {
diff --git a/src/thorin/be/llvm/nvvm.h b/src/thorin/be/llvm/nvvm.h
index 3c5b8596d..592729f00 100644
--- a/src/thorin/be/llvm/nvvm.h
+++ b/src/thorin/be/llvm/nvvm.h
@@ -29,6 +29,7 @@ class NVVMCodeGen : public CodeGen {
     llvm::Value* emit_mathop(llvm::IRBuilder<>&, const MathOp*) override;
 
     llvm::Value* emit_reserve(llvm::IRBuilder<>&, const Continuation*) override;
+    llvm::Value* emit_local_memory(llvm::IRBuilder<>&, const Continuation*) override;
 
     llvm::Value* emit_global(const Global*) override;
 
diff --git a/src/thorin/be/llvm/runtime.cpp b/src/thorin/be/llvm/runtime.cpp
index 375713091..b59592d24 100644
--- a/src/thorin/be/llvm/runtime.cpp
+++ b/src/thorin/be/llvm/runtime.cpp
@@ -64,11 +64,13 @@ void Runtime::emit_host_code(CodeGen& code_gen, llvm::IRBuilder<>& builder, Plat
     assert(continuation->has_body());
     auto body = continuation->body();
     // to-target is the desired kernel call
-    // target(mem, device, (dim.x, dim.y, dim.z), (block.x, block.y, block.z), body, return, free_vars)
+    // target(mem, device, (dim.x, dim.y, dim.z), (block.x, block.y, block.z), lmem, body, return, free_vars)
     auto target = body->callee()->as_nom<Continuation>();
     assert_unused(target->is_intrinsic());
     assert(body->num_args() >= LaunchArgs::Num && "required arguments are missing");
 
+    auto& world = continuation->world();
+
     // arguments
     auto target_device_id = code_gen.emit(body->arg(LaunchArgs::Device));
     auto target_platform = builder.getInt32(platform);
@@ -78,7 +80,6 @@ void Runtime::emit_host_code(CodeGen& code_gen, llvm::IRBuilder<>& builder, Plat
     auto it_config = body->arg(LaunchArgs::Config);
     auto kernel = body->arg(LaunchArgs::Body)->as<Global>()->init()->as<Continuation>();
 
-    auto& world = continuation->world();
     //auto kernel_name = builder.CreateGlobalStringPtr(kernel->name() == "hls_top" ? kernel->name() : kernel->name());
     auto kernel_name = builder.CreateGlobalStringPtr(kernel->name());
     auto file_name = builder.CreateGlobalStringPtr(world.name() + ext);
@@ -179,9 +180,12 @@ void Runtime::emit_host_code(CodeGen& code_gen, llvm::IRBuilder<>& builder, Plat
     allocs     = builder.CreateInBoundsGEP(llvm::cast<llvm::AllocaInst>(allocs)->getAllocatedType(),     allocs,     gep_first_elem);
     types      = builder.CreateInBoundsGEP(llvm::cast<llvm::AllocaInst>(types)->getAllocatedType(),      types,      gep_first_elem);
 
+    auto lmem = code_gen.emit(body->arg(LaunchArgs::LocalMem));
+
     launch_kernel(code_gen, builder, target_device,
                   file_name, kernel_name,
                   grid_size, block_size,
+                  lmem,
                   args, sizes, aligns, allocs, types,
                   builder.getInt32(num_kernel_args));
 }
@@ -190,10 +194,11 @@ llvm::Value* Runtime::launch_kernel(
     CodeGen& code_gen, llvm::IRBuilder<>& builder, llvm::Value* device,
     llvm::Value* file, llvm::Value* kernel,
     llvm::Value* grid, llvm::Value* block,
+    llvm::Value* lmem,
     llvm::Value* args, llvm::Value* sizes, llvm::Value* aligns, llvm::Value* allocs, llvm::Value* types,
     llvm::Value* num_args)
 {
-    llvm::Value* launch_args[] = { device, file, kernel, grid, block, args, sizes, aligns, allocs, types, num_args };
+    llvm::Value* launch_args[] = { device, file, kernel, grid, block, lmem, args, sizes, aligns, allocs, types, num_args };
     return builder.CreateCall(get(code_gen, "anydsl_launch_kernel"), launch_args);
 }
 
diff --git a/src/thorin/be/llvm/runtime.h b/src/thorin/be/llvm/runtime.h
index 9038040a0..3655ca0e2 100644
--- a/src/thorin/be/llvm/runtime.h
+++ b/src/thorin/be/llvm/runtime.h
@@ -32,6 +32,7 @@ class Runtime {
         CodeGen&, llvm::IRBuilder<>&, llvm::Value* device,
         llvm::Value* file, llvm::Value* kernel,
         llvm::Value* grid, llvm::Value* block,
+        llvm::Value* lmem,
         llvm::Value* args, llvm::Value* sizes, llvm::Value* aligns, llvm::Value* allocs, llvm::Value* types,
         llvm::Value* num_args);
 
diff --git a/src/thorin/be/llvm/runtime.inc b/src/thorin/be/llvm/runtime.inc
index 44ee7f9f8..fe4ab4144 100644
--- a/src/thorin/be/llvm/runtime.inc
+++ b/src/thorin/be/llvm/runtime.inc
@@ -6,7 +6,7 @@ namespace thorin {
         declare noalias ptr @anydsl_alloc(i32, i64);
         declare noalias ptr @anydsl_alloc_unified(i32, i64);
         declare void @anydsl_release(i32, ptr);
-        declare void @anydsl_launch_kernel(i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, i32);
+        declare void @anydsl_launch_kernel(i32, ptr, ptr, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr, i32);
         declare void @anydsl_parallel_for(i32, i32, i32, ptr, ptr);
         declare void @anydsl_fibers_spawn(i32, i32, i32, ptr, ptr);
         declare i32  @anydsl_spawn_thread(ptr, ptr);
diff --git a/src/thorin/continuation.cpp b/src/thorin/continuation.cpp
index 9001206b4..da8af3ceb 100644
--- a/src/thorin/continuation.cpp
+++ b/src/thorin/continuation.cpp
@@ -245,28 +245,29 @@ const Filter* Continuation::all_true_filter() const {
 
 bool Continuation::is_accelerator() const { return Intrinsic::AcceleratorBegin <= intrinsic() && intrinsic() < Intrinsic::AcceleratorEnd; }
 void Continuation::set_intrinsic() {
-    if      (name() == "cuda")           attributes().intrinsic = Intrinsic::CUDA;
-    else if (name() == "nvvm")           attributes().intrinsic = Intrinsic::NVVM;
-    else if (name() == "opencl")         attributes().intrinsic = Intrinsic::OpenCL;
-    else if (name() == "amdgpu_hsa")     attributes().intrinsic = Intrinsic::AMDGPUHSA;
-    else if (name() == "amdgpu_pal")     attributes().intrinsic = Intrinsic::AMDGPUPAL;
-    else if (name() == "shady_compute")  attributes().intrinsic = Intrinsic::ShadyCompute;
-    else if (name() == "hls")            attributes().intrinsic = Intrinsic::HLS;
-    else if (name() == "parallel")       attributes().intrinsic = Intrinsic::Parallel;
-    else if (name() == "fibers")         attributes().intrinsic = Intrinsic::Fibers;
-    else if (name() == "spawn")          attributes().intrinsic = Intrinsic::Spawn;
-    else if (name() == "sync")           attributes().intrinsic = Intrinsic::Sync;
-    else if (name() == "vectorize")      attributes().intrinsic = Intrinsic::Vectorize;
-    else if (name() == "pe_info")        attributes().intrinsic = Intrinsic::PeInfo;
-    else if (name() == "pipeline")       attributes().intrinsic = Intrinsic::Pipeline;
-    else if (name() == "reserve_shared") attributes().intrinsic = Intrinsic::Reserve;
-    else if (name() == "atomic")         attributes().intrinsic = Intrinsic::Atomic;
-    else if (name() == "atomic_load")    attributes().intrinsic = Intrinsic::AtomicLoad;
-    else if (name() == "atomic_store")   attributes().intrinsic = Intrinsic::AtomicStore;
-    else if (name() == "cmpxchg")        attributes().intrinsic = Intrinsic::CmpXchg;
-    else if (name() == "cmpxchg_weak")   attributes().intrinsic = Intrinsic::CmpXchgWeak;
-    else if (name() == "fence")          attributes().intrinsic = Intrinsic::Fence;
-    else if (name() == "undef")          attributes().intrinsic = Intrinsic::Undef;
+    if      (name() == "cuda")              attributes().intrinsic = Intrinsic::CUDA;
+    else if (name() == "nvvm")              attributes().intrinsic = Intrinsic::NVVM;
+    else if (name() == "opencl")            attributes().intrinsic = Intrinsic::OpenCL;
+    else if (name() == "amdgpu_hsa")        attributes().intrinsic = Intrinsic::AMDGPUHSA;
+    else if (name() == "amdgpu_pal")        attributes().intrinsic = Intrinsic::AMDGPUPAL;
+    else if (name() == "shady_compute")     attributes().intrinsic = Intrinsic::ShadyCompute;
+    else if (name() == "hls")               attributes().intrinsic = Intrinsic::HLS;
+    else if (name() == "parallel")          attributes().intrinsic = Intrinsic::Parallel;
+    else if (name() == "fibers")            attributes().intrinsic = Intrinsic::Fibers;
+    else if (name() == "spawn")             attributes().intrinsic = Intrinsic::Spawn;
+    else if (name() == "sync")              attributes().intrinsic = Intrinsic::Sync;
+    else if (name() == "vectorize")         attributes().intrinsic = Intrinsic::Vectorize;
+    else if (name() == "pe_info")           attributes().intrinsic = Intrinsic::PeInfo;
+    else if (name() == "pipeline")          attributes().intrinsic = Intrinsic::Pipeline;
+    else if (name() == "reserve_shared")    attributes().intrinsic = Intrinsic::Reserve;
+    else if (name() == "local_memory_base") attributes().intrinsic = Intrinsic::LocalMemory;
+    else if (name() == "atomic")            attributes().intrinsic = Intrinsic::Atomic;
+    else if (name() == "atomic_load")       attributes().intrinsic = Intrinsic::AtomicLoad;
+    else if (name() == "atomic_store")      attributes().intrinsic = Intrinsic::AtomicStore;
+    else if (name() == "cmpxchg")           attributes().intrinsic = Intrinsic::CmpXchg;
+    else if (name() == "cmpxchg_weak")      attributes().intrinsic = Intrinsic::CmpXchgWeak;
+    else if (name() == "fence")             attributes().intrinsic = Intrinsic::Fence;
+    else if (name() == "undef")             attributes().intrinsic = Intrinsic::Undef;
     else world().ELOG("unsupported thorin intrinsic '{}'", name());
 }
 
diff --git a/src/thorin/continuation.h b/src/thorin/continuation.h
index 73e47e36a..2aa8995dd 100644
--- a/src/thorin/continuation.h
+++ b/src/thorin/continuation.h
@@ -106,6 +106,7 @@ enum class Intrinsic : uint8_t {
     Vectorize,                  ///< External vectorizer.
     AcceleratorEnd,
     Reserve = AcceleratorEnd,   ///< Intrinsic memory reserve function
+    LocalMemory,                ///< Intrinsic get local memory base pointer
     Atomic,                     ///< Intrinsic atomic function
     AtomicLoad,                 ///< Intrinsic atomic load function
     AtomicStore,                ///< Intrinsic atomic store function