diff --git a/src/thorin/be/c/c.cpp b/src/thorin/be/c/c.cpp index f8a6c510d..c3430f524 100644 --- a/src/thorin/be/c/c.cpp +++ b/src/thorin/be/c/c.cpp @@ -512,6 +512,9 @@ void CCodeGen::emit_module() { stream_.fmt("__device__ inline int blockDim_{}() {{ return blockDim.{}; }}\n", x, x); stream_.fmt("__device__ inline int gridDim_{}() {{ return gridDim.{}; }}\n", x, x); } + + stream_.fmt("\n" + "extern __shared__ unsigned char __dynamic_smem[];\n"); } stream_.endl() << func_impls_.str(); @@ -742,7 +745,10 @@ void CCodeGen::emit_epilogue(Continuation* cont) { bb.tail.fmt("goto {};", label_name(callee)); } else if (auto callee = body->callee()->isa_nom(); callee && callee->is_intrinsic()) { if (callee->intrinsic() == Intrinsic::Reserve) { + assert(body->num_args() == 3 && "incorrect number of arguments"); + emit_unsafe(body->arg(0)); + if (!body->arg(1)->isa()) world().edef(body->arg(1), "reserve_shared: couldn't extract memory size"); @@ -758,6 +764,16 @@ void CCodeGen::emit_epilogue(Continuation* cont) { } bb.tail.fmt("p_{} = {}_reserved;\n", ret_cont->param(1)->unique_name(), cont->unique_name()); bb.tail.fmt("goto {};", label_name(ret_cont)); + } else if (callee->intrinsic() == Intrinsic::LocalMemory) { + if (lang_ == Lang::HLS) + world().edef(body, "local_memory not supported for HLS"); + assert(body->num_args() == 2 && "incorrect number of arguments"); + + emit_unsafe(body->arg(0)); + + auto ret_cont = body->arg(1)->as_nom(); + bb.tail.fmt("p_{} = __dynamic_smem;\n", ret_cont->param(1)->unique_name()); + bb.tail.fmt("goto {};", label_name(ret_cont)); } else if (callee->intrinsic() == Intrinsic::Pipeline) { assert((lang_ == Lang::OpenCL || lang_ == Lang::HLS) && "pipelining not supported on this backend"); @@ -1440,6 +1456,12 @@ std::string CCodeGen::emit_fun_head(Continuation* cont, bool is_proto) { } needs_comma = true; } + + if (cont->is_exported() && lang_ == Lang::OpenCL) { + if (needs_comma) s.fmt(", "); + s.fmt("__local unsigned char* __dynamic_smem"); + } + s << ")"; return s.str(); } diff --git a/src/thorin/be/codegen.h b/src/thorin/be/codegen.h index c46cee7c0..3f18969cd 100644 --- a/src/thorin/be/codegen.h +++ b/src/thorin/be/codegen.h @@ -33,6 +33,7 @@ struct LaunchArgs { Device, Space, Config, + LocalMem, Body, Return, Num diff --git a/src/thorin/be/llvm/amdgpu.cpp b/src/thorin/be/llvm/amdgpu.cpp index d5e0d1d3a..36fed7cd2 100644 --- a/src/thorin/be/llvm/amdgpu.cpp +++ b/src/thorin/be/llvm/amdgpu.cpp @@ -76,4 +76,8 @@ llvm::Value* AMDGPUCodeGen::emit_reserve(llvm::IRBuilder<>& irbuilder, const Con return emit_reserve_shared(irbuilder, continuation, true); } +llvm::Value* AMDGPUCodeGen::emit_local_memory(llvm::IRBuilder<>& irbuilder, const Continuation* continuation) { + return emit_local_memory_base_ptr(irbuilder, continuation); +} + } diff --git a/src/thorin/be/llvm/amdgpu.h b/src/thorin/be/llvm/amdgpu.h index 277b8d595..7715b3bee 100644 --- a/src/thorin/be/llvm/amdgpu.h +++ b/src/thorin/be/llvm/amdgpu.h @@ -21,6 +21,7 @@ class AMDGPUCodeGen : public CodeGen { llvm::Value* emit_global(const Global*) override; llvm::Value* emit_mathop(llvm::IRBuilder<>&, const MathOp*) override; llvm::Value* emit_reserve(llvm::IRBuilder<>&, const Continuation*) override; + llvm::Value* emit_local_memory(llvm::IRBuilder<>&, const Continuation*) override; std::string get_alloc_name() const override { return "malloc"; } const Cont2Config& kernel_config_; diff --git a/src/thorin/be/llvm/llvm.cpp b/src/thorin/be/llvm/llvm.cpp index ecea16edd..72470084c 100644 --- a/src/thorin/be/llvm/llvm.cpp +++ b/src/thorin/be/llvm/llvm.cpp @@ -1300,6 +1300,7 @@ std::vector CodeGen::emit_intrinsic(llvm::IRBuilder<>& irbuilder, case Intrinsic::CmpXchgWeak: return emit_cmpxchg(irbuilder, continuation, true); case Intrinsic::Fence: emit_fence(irbuilder, continuation); break; case Intrinsic::Reserve: return { emit_reserve(irbuilder, continuation) }; + case Intrinsic::LocalMemory: return { emit_local_memory(irbuilder, continuation) }; case Intrinsic::CUDA: runtime_->emit_host_code(*this, irbuilder, Runtime::CUDA_PLATFORM, ".cu", continuation); break; case Intrinsic::NVVM: runtime_->emit_host_code(*this, irbuilder, Runtime::CUDA_PLATFORM, ".nvvm", continuation); break; case Intrinsic::OpenCL: runtime_->emit_host_code(*this, irbuilder, Runtime::OPENCL_PLATFORM, ".cl", continuation); break; @@ -1420,7 +1421,7 @@ llvm::Value* CodeGen::emit_reserve(llvm::IRBuilder<>&, const Continuation* conti llvm::Value* CodeGen::emit_reserve_shared(llvm::IRBuilder<>& irbuilder, const Continuation* continuation, bool init_undef) { assert(continuation->has_body()); auto body = continuation->body(); - assert(body->num_args() == 3 && "required arguments are missing"); + assert(body->num_args() == 3 && "incorrect number of arguments"); if (!body->arg(1)->isa()) world().edef(body->arg(1), "reserve_shared: couldn't extract memory size"); auto num_elems = body->arg(1)->as()->ps32_value(); @@ -1437,6 +1438,28 @@ llvm::Value* CodeGen::emit_reserve_shared(llvm::IRBuilder<>& irbuilder, const Co return call; } +llvm::Value* CodeGen::emit_local_memory(llvm::IRBuilder<>&, const Continuation* continuation) { + world().edef(continuation, "local_memory: only allowed in device code"); + THORIN_UNREACHABLE; +} + +llvm::Value* CodeGen::emit_local_memory_base_ptr(llvm::IRBuilder<>& irbuilder, const Continuation* continuation) { + static constexpr auto name = "__dynamic_smem"; + + assert(continuation->has_body()); + auto body = continuation->body(); + assert(body->num_args() == 2 && "incorrect number of arguments"); + auto cont = body->arg(1)->as_nom(); + + if (auto found = module().getGlobalVariable(name)) + return found; + + auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(context()), 0); + auto global = new llvm::GlobalVariable(module(), type, false, llvm::GlobalValue::ExternalLinkage, nullptr, name, nullptr, llvm::GlobalVariable::NotThreadLocal, 3); + global->setAlignment(llvm::Align(16)); + return global; +} + /* * backend-specific stuff */ diff --git a/src/thorin/be/llvm/llvm.h b/src/thorin/be/llvm/llvm.h index 7d5414331..596ff9f54 100644 --- a/src/thorin/be/llvm/llvm.h +++ b/src/thorin/be/llvm/llvm.h @@ -85,6 +85,8 @@ class CodeGen : public thorin::CodeGen, public thorin::Emitter&, const Continuation*); llvm::Value* emit_reserve_shared(llvm::IRBuilder<>&, const Continuation*, bool=false); + virtual llvm::Value* emit_local_memory(llvm::IRBuilder<>&, const Continuation*); + llvm::Value* emit_local_memory_base_ptr(llvm::IRBuilder<>& irbuilder, const Continuation* continuation); virtual std::string get_alloc_name() const = 0; llvm::BasicBlock* cont2bb(Continuation* cont) { return cont2bb_[cont].first; } diff --git a/src/thorin/be/llvm/nvvm.cpp b/src/thorin/be/llvm/nvvm.cpp index 88c488da8..00255d651 100644 --- a/src/thorin/be/llvm/nvvm.cpp +++ b/src/thorin/be/llvm/nvvm.cpp @@ -246,6 +246,10 @@ llvm::Value* NVVMCodeGen::emit_reserve(llvm::IRBuilder<>& irbuilder, const Conti return emit_reserve_shared(irbuilder, continuation); } +llvm::Value* NVVMCodeGen::emit_local_memory(llvm::IRBuilder<>& irbuilder, const Continuation* continuation) { + return emit_local_memory_base_ptr(irbuilder, continuation); +} + llvm::Value* NVVMCodeGen::emit_mathop(llvm::IRBuilder<>& irbuilder, const MathOp* mathop) { auto make_key = [] (MathOpTag tag, unsigned bitwidth) { return (static_cast(tag) << 16) | bitwidth; }; static const std::unordered_map libdevice_functions = { diff --git a/src/thorin/be/llvm/nvvm.h b/src/thorin/be/llvm/nvvm.h index 3c5b8596d..592729f00 100644 --- a/src/thorin/be/llvm/nvvm.h +++ b/src/thorin/be/llvm/nvvm.h @@ -29,6 +29,7 @@ class NVVMCodeGen : public CodeGen { llvm::Value* emit_mathop(llvm::IRBuilder<>&, const MathOp*) override; llvm::Value* emit_reserve(llvm::IRBuilder<>&, const Continuation*) override; + llvm::Value* emit_local_memory(llvm::IRBuilder<>&, const Continuation*) override; llvm::Value* emit_global(const Global*) override; diff --git a/src/thorin/be/llvm/runtime.cpp b/src/thorin/be/llvm/runtime.cpp index 375713091..b59592d24 100644 --- a/src/thorin/be/llvm/runtime.cpp +++ b/src/thorin/be/llvm/runtime.cpp @@ -64,11 +64,13 @@ void Runtime::emit_host_code(CodeGen& code_gen, llvm::IRBuilder<>& builder, Plat assert(continuation->has_body()); auto body = continuation->body(); // to-target is the desired kernel call - // target(mem, device, (dim.x, dim.y, dim.z), (block.x, block.y, block.z), body, return, free_vars) + // target(mem, device, (dim.x, dim.y, dim.z), (block.x, block.y, block.z), lmem, body, return, free_vars) auto target = body->callee()->as_nom(); assert_unused(target->is_intrinsic()); assert(body->num_args() >= LaunchArgs::Num && "required arguments are missing"); + auto& world = continuation->world(); + // arguments auto target_device_id = code_gen.emit(body->arg(LaunchArgs::Device)); auto target_platform = builder.getInt32(platform); @@ -78,7 +80,6 @@ void Runtime::emit_host_code(CodeGen& code_gen, llvm::IRBuilder<>& builder, Plat auto it_config = body->arg(LaunchArgs::Config); auto kernel = body->arg(LaunchArgs::Body)->as()->init()->as(); - auto& world = continuation->world(); //auto kernel_name = builder.CreateGlobalStringPtr(kernel->name() == "hls_top" ? kernel->name() : kernel->name()); auto kernel_name = builder.CreateGlobalStringPtr(kernel->name()); auto file_name = builder.CreateGlobalStringPtr(world.name() + ext); @@ -179,9 +180,12 @@ void Runtime::emit_host_code(CodeGen& code_gen, llvm::IRBuilder<>& builder, Plat allocs = builder.CreateInBoundsGEP(llvm::cast(allocs)->getAllocatedType(), allocs, gep_first_elem); types = builder.CreateInBoundsGEP(llvm::cast(types)->getAllocatedType(), types, gep_first_elem); + auto lmem = code_gen.emit(body->arg(LaunchArgs::LocalMem)); + launch_kernel(code_gen, builder, target_device, file_name, kernel_name, grid_size, block_size, + lmem, args, sizes, aligns, allocs, types, builder.getInt32(num_kernel_args)); } @@ -190,10 +194,11 @@ llvm::Value* Runtime::launch_kernel( CodeGen& code_gen, llvm::IRBuilder<>& builder, llvm::Value* device, llvm::Value* file, llvm::Value* kernel, llvm::Value* grid, llvm::Value* block, + llvm::Value* lmem, llvm::Value* args, llvm::Value* sizes, llvm::Value* aligns, llvm::Value* allocs, llvm::Value* types, llvm::Value* num_args) { - llvm::Value* launch_args[] = { device, file, kernel, grid, block, args, sizes, aligns, allocs, types, num_args }; + llvm::Value* launch_args[] = { device, file, kernel, grid, block, lmem, args, sizes, aligns, allocs, types, num_args }; return builder.CreateCall(get(code_gen, "anydsl_launch_kernel"), launch_args); } diff --git a/src/thorin/be/llvm/runtime.h b/src/thorin/be/llvm/runtime.h index 9038040a0..3655ca0e2 100644 --- a/src/thorin/be/llvm/runtime.h +++ b/src/thorin/be/llvm/runtime.h @@ -32,6 +32,7 @@ class Runtime { CodeGen&, llvm::IRBuilder<>&, llvm::Value* device, llvm::Value* file, llvm::Value* kernel, llvm::Value* grid, llvm::Value* block, + llvm::Value* lmem, llvm::Value* args, llvm::Value* sizes, llvm::Value* aligns, llvm::Value* allocs, llvm::Value* types, llvm::Value* num_args); diff --git a/src/thorin/be/llvm/runtime.inc b/src/thorin/be/llvm/runtime.inc index 44ee7f9f8..fe4ab4144 100644 --- a/src/thorin/be/llvm/runtime.inc +++ b/src/thorin/be/llvm/runtime.inc @@ -6,7 +6,7 @@ namespace thorin { declare noalias ptr @anydsl_alloc(i32, i64); declare noalias ptr @anydsl_alloc_unified(i32, i64); declare void @anydsl_release(i32, ptr); - declare void @anydsl_launch_kernel(i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, i32); + declare void @anydsl_launch_kernel(i32, ptr, ptr, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr, i32); declare void @anydsl_parallel_for(i32, i32, i32, ptr, ptr); declare void @anydsl_fibers_spawn(i32, i32, i32, ptr, ptr); declare i32 @anydsl_spawn_thread(ptr, ptr); diff --git a/src/thorin/continuation.cpp b/src/thorin/continuation.cpp index 9001206b4..da8af3ceb 100644 --- a/src/thorin/continuation.cpp +++ b/src/thorin/continuation.cpp @@ -245,28 +245,29 @@ const Filter* Continuation::all_true_filter() const { bool Continuation::is_accelerator() const { return Intrinsic::AcceleratorBegin <= intrinsic() && intrinsic() < Intrinsic::AcceleratorEnd; } void Continuation::set_intrinsic() { - if (name() == "cuda") attributes().intrinsic = Intrinsic::CUDA; - else if (name() == "nvvm") attributes().intrinsic = Intrinsic::NVVM; - else if (name() == "opencl") attributes().intrinsic = Intrinsic::OpenCL; - else if (name() == "amdgpu_hsa") attributes().intrinsic = Intrinsic::AMDGPUHSA; - else if (name() == "amdgpu_pal") attributes().intrinsic = Intrinsic::AMDGPUPAL; - else if (name() == "shady_compute") attributes().intrinsic = Intrinsic::ShadyCompute; - else if (name() == "hls") attributes().intrinsic = Intrinsic::HLS; - else if (name() == "parallel") attributes().intrinsic = Intrinsic::Parallel; - else if (name() == "fibers") attributes().intrinsic = Intrinsic::Fibers; - else if (name() == "spawn") attributes().intrinsic = Intrinsic::Spawn; - else if (name() == "sync") attributes().intrinsic = Intrinsic::Sync; - else if (name() == "vectorize") attributes().intrinsic = Intrinsic::Vectorize; - else if (name() == "pe_info") attributes().intrinsic = Intrinsic::PeInfo; - else if (name() == "pipeline") attributes().intrinsic = Intrinsic::Pipeline; - else if (name() == "reserve_shared") attributes().intrinsic = Intrinsic::Reserve; - else if (name() == "atomic") attributes().intrinsic = Intrinsic::Atomic; - else if (name() == "atomic_load") attributes().intrinsic = Intrinsic::AtomicLoad; - else if (name() == "atomic_store") attributes().intrinsic = Intrinsic::AtomicStore; - else if (name() == "cmpxchg") attributes().intrinsic = Intrinsic::CmpXchg; - else if (name() == "cmpxchg_weak") attributes().intrinsic = Intrinsic::CmpXchgWeak; - else if (name() == "fence") attributes().intrinsic = Intrinsic::Fence; - else if (name() == "undef") attributes().intrinsic = Intrinsic::Undef; + if (name() == "cuda") attributes().intrinsic = Intrinsic::CUDA; + else if (name() == "nvvm") attributes().intrinsic = Intrinsic::NVVM; + else if (name() == "opencl") attributes().intrinsic = Intrinsic::OpenCL; + else if (name() == "amdgpu_hsa") attributes().intrinsic = Intrinsic::AMDGPUHSA; + else if (name() == "amdgpu_pal") attributes().intrinsic = Intrinsic::AMDGPUPAL; + else if (name() == "shady_compute") attributes().intrinsic = Intrinsic::ShadyCompute; + else if (name() == "hls") attributes().intrinsic = Intrinsic::HLS; + else if (name() == "parallel") attributes().intrinsic = Intrinsic::Parallel; + else if (name() == "fibers") attributes().intrinsic = Intrinsic::Fibers; + else if (name() == "spawn") attributes().intrinsic = Intrinsic::Spawn; + else if (name() == "sync") attributes().intrinsic = Intrinsic::Sync; + else if (name() == "vectorize") attributes().intrinsic = Intrinsic::Vectorize; + else if (name() == "pe_info") attributes().intrinsic = Intrinsic::PeInfo; + else if (name() == "pipeline") attributes().intrinsic = Intrinsic::Pipeline; + else if (name() == "reserve_shared") attributes().intrinsic = Intrinsic::Reserve; + else if (name() == "local_memory_base") attributes().intrinsic = Intrinsic::LocalMemory; + else if (name() == "atomic") attributes().intrinsic = Intrinsic::Atomic; + else if (name() == "atomic_load") attributes().intrinsic = Intrinsic::AtomicLoad; + else if (name() == "atomic_store") attributes().intrinsic = Intrinsic::AtomicStore; + else if (name() == "cmpxchg") attributes().intrinsic = Intrinsic::CmpXchg; + else if (name() == "cmpxchg_weak") attributes().intrinsic = Intrinsic::CmpXchgWeak; + else if (name() == "fence") attributes().intrinsic = Intrinsic::Fence; + else if (name() == "undef") attributes().intrinsic = Intrinsic::Undef; else world().ELOG("unsupported thorin intrinsic '{}'", name()); } diff --git a/src/thorin/continuation.h b/src/thorin/continuation.h index 73e47e36a..2aa8995dd 100644 --- a/src/thorin/continuation.h +++ b/src/thorin/continuation.h @@ -106,6 +106,7 @@ enum class Intrinsic : uint8_t { Vectorize, ///< External vectorizer. AcceleratorEnd, Reserve = AcceleratorEnd, ///< Intrinsic memory reserve function + LocalMemory, ///< Intrinsic get local memory base pointer Atomic, ///< Intrinsic atomic function AtomicLoad, ///< Intrinsic atomic load function AtomicStore, ///< Intrinsic atomic store function