I have a CUDA template library, in which one function is actually not a template, but is defined within a .cuh
header. (vector_add_kernel
in kernel.cuh
below.)
If multiple .cu
files include kernel.cuh
and call vector_add[_kernel]
, it will result in multiple definition errors at link-time. In C++, one can use the inline
qualifier to avoid such errors.
However, inline __global__ ...
- while preventing the multiple definition errors on my system - results in a warning that the inline
qualifier has been ignored.
Q: Is there a better way to avoid the multiple definition error, or a way to suppress this warning only for this function? And is inline __global__
even safe, or might other host compilers truly ignore it?
I could simply move the vector_add_kernel
to a separate .cu
file, but it would be the only non-header file. I could also template vector_add_kernel
, but in my library that makes little sense.
A (not-so-minimal, sorry) working example (tested with CUDA 7.0, gcc 4.7.2 on Debian) is below.
To clarify, main.cu
is some user's code; lib.cu
is some external library not belonging to me; and kernel.cuh
is part of my template library. So, both the external lib
and the user's main
are using my template library, kernel.cuh
- but separately.
main.cu
:
#include "lib.hpp"
#include "kernel.cuh"
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cstddef>
#include <cstdlib>
#include <iostream>
int main(void)
{
const size_t N = 1u << 7;
float* a = (float*) malloc(N * sizeof(float));
float* b = (float*) malloc(N * sizeof(float));
float* c = (float*) malloc(N * sizeof(float));
for (int i = 0; i < N; ++i) {
a[i] = b[i] = 2.0f * i;
}
lib_vector_add(a, b, c, N);
for (int i = 0; i < N; ++i) {
if (c[i] != 2.0f * i + 2.0f * i)
std::cout << "Error, lib, element " << i << std::endl;
}
thrust::device_vector<float> d_a(a, a + N);
thrust::device_vector<float> d_b(b, b + N);
thrust::device_vector<float> d_c(N);
vector_add(d_a, d_b, d_c);
thrust::host_vector<float> h_c = d_c;
for (int i = 0; i < N; ++i) {
if (h_c[i] != 2.0f * i + 2.0f * i)
std::cout << "Error, element " << i << std::endl;
}
}
lib.cu
,
#include <kernel.cuh>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
void lib_vector_add(float* a, float* b, float* c, size_t N)
{
thrust::host_vector<float> h_a(a, a + N);
thrust::host_vector<float> h_b(b, b + N);
thrust::device_vector<float> d_a = h_a;
thrust::device_vector<float> d_b = h_b;
thrust::device_vector<float> d_c(N);
vector_add(d_a, d_b, d_c);
thrust::host_vector<float> h_c = d_c;
for (int i = 0; i < N; ++i)
{
c[i] = h_c[i];
}
}
lib.hpp
,
#pragma once
#include <cstddef>
void lib_vector_add(float*, float*, float*, size_t);
kernel.cuh
- this form results in a linker error. Uncomment the first inline
to get a working code.
#pragma once
#include <thrust/device_vector.h>
#include <cstddef>
// inline keyword avoids multiple definition errors, but produces warnings.
// UNCOMMENT TO GET A WORKING EXECUTABLE.
// inline
__global__ void vector_add_kernel(
const float *const a,
const float *const b,
float *const c,
const size_t N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N)
{
c[tid] = a[tid] + b[tid];
tid += blockDim.x * gridDim.x;
}
}
// inline produces no warnings.
inline
void vector_add(
const thrust::device_vector<float>& d_a,
const thrust::device_vector<float>& d_b,
thrust::device_vector<float>& d_c)
{
const float *const a_ptr = thrust::raw_pointer_cast(d_a.data());
const float *const b_ptr = thrust::raw_pointer_cast(d_b.data());
float *const c_ptr = thrust::raw_pointer_cast(d_c.data());
const size_t N = d_a.size();
dim3 block(128);
dim3 grid((N + 127) / 128);
vector_add_kernel<<<grid, block>>>(a_ptr, b_ptr, c_ptr, N);
}
Makefile
OBJS = main.o lib.o
DEPS = kernel.cuh
CU_ARCH = -gencode arch=compute_20,code=sm_20
all: app
app: $(OBJS)
nvcc $(CU_ARCH) $(OBJS) -o app
%.o: %.cu $(DEPS)
nvcc $(CU_ARCH) -dc -I./ $< -o $@
clean:
-rm *.o
If you want to keep your current code organisation, you have a very simple solution which is to declare your kernel static
(in place of your inline
keyword). This will prevent the linker from complaining, but will however generate as many different versions of the kernel as there will be of compilation units (object files) where the kernel.cuh
will have been included.
Another solution would be to templatise your kernel. I know you already dismissed this possibility, but you should reconsider it, since your kernel is a natural template for the float
type of the input parameters...
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With