'inline' for __global__ functions to avoid multiple definition error

Question

I have a CUDA template library, in which one function is actually not a template, but is defined within a .cuh header. (vector_add_kernel in kernel.cuh below.)

If multiple .cu files include kernel.cuh and call vector_add[_kernel], it will result in multiple definition errors at link-time. In C++, one can use the inline qualifier to avoid such errors.

However, inline __global__ ... - while preventing the multiple definition errors on my system - results in a warning that the inline qualifier has been ignored.

Q: Is there a better way to avoid the multiple definition error, or a way to suppress this warning only for this function? And is inline __global__ even safe, or might other host compilers truly ignore it?

I could simply move the vector_add_kernel to a separate .cu file, but it would be the only non-header file. I could also template vector_add_kernel, but in my library that makes little sense.

A (not-so-minimal, sorry) working example (tested with CUDA 7.0, gcc 4.7.2 on Debian) is below.

To clarify, main.cu is some user's code; lib.cu is some external library not belonging to me; and kernel.cuh is part of my template library. So, both the external lib and the user's main are using my template library, kernel.cuh - but separately.

main.cu:

#include "lib.hpp"
#include "kernel.cuh"

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cstddef>
#include <cstdlib>
#include <iostream>

int main(void)
{
    const size_t N = 1u << 7;

    float* a = (float*) malloc(N * sizeof(float));
    float* b = (float*) malloc(N * sizeof(float));
    float* c = (float*) malloc(N * sizeof(float));

    for (int i = 0; i < N; ++i) {
        a[i] = b[i] = 2.0f * i;
    }

    lib_vector_add(a, b, c, N);
    for (int i = 0; i < N; ++i) {
        if (c[i] != 2.0f * i + 2.0f * i)
            std::cout << "Error, lib, element " << i << std::endl;
    }

    thrust::device_vector<float> d_a(a, a + N);
    thrust::device_vector<float> d_b(b, b + N);
    thrust::device_vector<float> d_c(N);

    vector_add(d_a, d_b, d_c);
    thrust::host_vector<float> h_c = d_c;
    for (int i = 0; i < N; ++i) {
        if (h_c[i] != 2.0f * i + 2.0f * i)
            std::cout << "Error, element " << i << std::endl;
    }
}

lib.cu,

#include <kernel.cuh>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

void lib_vector_add(float* a, float* b, float* c, size_t N)
{
    thrust::host_vector<float> h_a(a, a + N);
    thrust::host_vector<float> h_b(b, b + N);

    thrust::device_vector<float> d_a = h_a;
    thrust::device_vector<float> d_b = h_b;
    thrust::device_vector<float> d_c(N);

    vector_add(d_a, d_b, d_c);

    thrust::host_vector<float> h_c = d_c;
    for (int i = 0; i < N; ++i)
    {
        c[i] = h_c[i];
    }
}

lib.hpp,

#pragma once

#include <cstddef>

void lib_vector_add(float*, float*, float*, size_t);

kernel.cuh - this form results in a linker error. Uncomment the first inline to get a working code.

#pragma once

#include <thrust/device_vector.h>
#include <cstddef>

// inline keyword avoids multiple definition errors, but produces warnings.
// UNCOMMENT TO GET A WORKING EXECUTABLE.
// inline
__global__ void vector_add_kernel(
    const float *const a,
    const float *const b,
    float *const c,
    const size_t N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    while (tid < N)
    {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

// inline produces no warnings.
inline
void vector_add(
    const thrust::device_vector<float>& d_a,
    const thrust::device_vector<float>& d_b,
    thrust::device_vector<float>& d_c)
{
    const float *const a_ptr = thrust::raw_pointer_cast(d_a.data());
    const float *const b_ptr = thrust::raw_pointer_cast(d_b.data());
    float *const c_ptr = thrust::raw_pointer_cast(d_c.data());

    const size_t N = d_a.size();

    dim3 block(128);
    dim3 grid((N + 127) / 128);

    vector_add_kernel<<<grid, block>>>(a_ptr, b_ptr, c_ptr, N);
}

Makefile

OBJS = main.o lib.o
DEPS = kernel.cuh
CU_ARCH = -gencode arch=compute_20,code=sm_20

all: app

app: $(OBJS)
    nvcc $(CU_ARCH) $(OBJS) -o app

%.o: %.cu $(DEPS)
    nvcc $(CU_ARCH) -dc -I./ $< -o $@

clean:
    -rm *.o

Gilles · Accepted Answer

If you want to keep your current code organisation, you have a very simple solution which is to declare your kernel static (in place of your inline keyword). This will prevent the linker from complaining, but will however generate as many different versions of the kernel as there will be of compilation units (object files) where the kernel.cuh will have been included.

Another solution would be to templatise your kernel. I know you already dismissed this possibility, but you should reconsider it, since your kernel is a natural template for the float type of the input parameters...

'inline' for global functions to avoid multiple definition error

Tags:

c++

cuda

linker

Sam

1 Answers

Gilles

Recent Activity

Donate For Us