I'm writing a CUDA kernel that is compiled at runtime using NVRTC (CUDA version 9.2 with NVRTC version 7.5), which needs the stdint.h
header, in order to have the int32_t
etc. types.
If I write the kernel source code without the include, it works correctly. For example the kernel
extern "C" __global__ void f() { ... }
Compiles to PTX code where f is defined as .visible .entry f
.
But if the kernel source code is
#include <stdint.h>
extern "C" __global__ void f() { ... }
it reports A function without execution space annotations (__host__/__device__/__global__) is considered a host function, and host functions are not allowed in JIT mode.
(also without extern "C"
).
Passing -default-device
makes the PTX code .visible .func f
, so the function cannot be called from the host.
Is there a way to include headers in the source code, and still have a __global__
entry function? Or alternately, a way to know which integer size convention is used on the by the NVRTC compiler, so that the int32_t
etc. types can be manually defined?
Edit: Example program that shows the problem:
#include <cstdlib>
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
[[noreturn]] void fail(const std::string& msg, int code) {
std::cerr << "error: " << msg << " (" << code << ')' << std::endl;
std::exit(EXIT_FAILURE);
}
std::unique_ptr<char[]> compile_to_ptx(const char* program_source) {
nvrtcResult rv;
// create nvrtc program
nvrtcProgram prog;
rv = nvrtcCreateProgram(
&prog,
program_source,
"program.cu",
0,
nullptr,
nullptr
);
if(rv != NVRTC_SUCCESS) fail("nvrtcCreateProgram", rv);
// compile nvrtc program
std::vector<const char*> options = {
"--gpu-architecture=compute_30"
};
//options.push_back("-default-device");
rv = nvrtcCompileProgram(prog, options.size(), options.data());
if(rv != NVRTC_SUCCESS) {
std::size_t log_size;
rv = nvrtcGetProgramLogSize(prog, &log_size);
if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLogSize", rv);
auto log = std::make_unique<char[]>(log_size);
rv = nvrtcGetProgramLog(prog, log.get());
if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLog", rv);
assert(log[log_size - 1] == '\0');
std::cerr << "Compile error; log:\n" << log.get() << std::endl;
fail("nvrtcCompileProgram", rv);
}
// get ptx code
std::size_t ptx_size;
rv = nvrtcGetPTXSize(prog, &ptx_size);
if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTXSize", rv);
auto ptx = std::make_unique<char[]>(ptx_size);
rv = nvrtcGetPTX(prog, ptx.get());
if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTX", rv);
assert(ptx[ptx_size - 1] == '\0');
nvrtcDestroyProgram(&prog);
return ptx;
}
const char program_source[] = R"%%%(
//#include <stdint.h>
extern "C" __global__ void f(int* in, int* out) {
out[threadIdx.x] = in[threadIdx.x];
}
)%%%";
int main() {
CUresult rv;
// initialize CUDA
rv = cuInit(0);
if(rv != CUDA_SUCCESS) fail("cuInit", rv);
// compile program to ptx
auto ptx = compile_to_ptx(program_source);
std::cout << "PTX code:\n" << ptx.get() << std::endl;
}
When //#include <stdint.h>
in the kernel source is uncommented it no longer compiles. When //options.push_back("-default-device");
is uncommented it compiles but does not mark the function f
as .entry
.
CMakeLists.txt to compile it (needs CUDA driver API + NVRTC)
cmake_minimum_required(VERSION 3.4)
project(cudabug CXX)
find_package(CUDA REQUIRED)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED 14)
add_executable(cudabug cudabug.cc)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})
target_link_libraries(cudabug PUBLIC ${CUDA_LIBRARIES} nvrtc cuda)
[Preface: this is a very hacky answer, and is specific to the GNU toolchain (although I suspect the problem in the question is also specific to the GNU toolchain)].
It would appear that the problem here is with the GNU standard header features.h
, which gets pulled into stdint.h
and then winds up defining a lot of stub functions which have the default __host__
compilation space. This causes nvrtc to blow up. It also seems that the -default-device
option will result in a resolved glibC compiler feature set which makes the whole nvrtc compiler fail.
You can defeat this (in a very hacky way) by predefining a feature set for the standard library which excludes all the host functions. Changing your JIT kernel code to
const char program_source[] = R"%%%(
#define __ASSEMBLER__
#define __extension__
#include <stdint.h>
extern "C" __global__ void f(int32_t* in, int32_t* out) {
out[threadIdx.x] = in[threadIdx.x];
}
)%%%";
got me this:
$ nvcc -std=c++14 -ccbin=g++-7 jit_header.cu -o jitheader -lnvrtc -lcuda
$ ./jitheader
PTX code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-24330188
// Cuda compilation tools, release 9.2, V9.2.148
// Based on LLVM 3.4svn
//
.version 6.2
.target sm_30
.address_size 64
// .globl f
.visible .entry f(
.param .u64 f_param_0,
.param .u64 f_param_1
)
{
.reg .b32 %r<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [f_param_0];
ld.param.u64 %rd2, [f_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.u32 %r2, [%rd6];
add.s64 %rd7, %rd3, %rd5;
st.global.u32 [%rd7], %r2;
ret;
}
Big caveat: This worked on the glibC system I tried it on. It probably won't work with other toolchains or libC implementations (if, indeed, they have this problem).
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With