CUDA 7.5 experimental __host__ __device__ lambdas

I played a bit with the experimental device lambdas that were introduced in CUDA 7.5 and promoted in the blog post by Mark Harris.

For the following example I removed everything that is not needed to demonstrate the problem (my actual implementation looks a bit nicer...).

I tried to write a foreach function that operates on vectors either on the device (one thread per element) or on the host (serially), depending on a template parameter. With this foreach function I can easily implement BLAS functions. As an example, I use assigning a scalar to each component of a vector (the complete code is attached at the end):

template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=] __host__ __device__ ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

However, this code gives a compiler error because of the __host__ __device__ lambda:

The closure type for a lambda ("lambda ->void") cannot be used in the template argument type of a __global__ function template instantiation, unless the lambda is defined within a __device__ or __global__ function

I get the same error if I remove __device__ from the lambda expression, and no compile error if I remove __host__ (leaving a __device__-only lambda), but then the host path is never executed...

If I define the lambda as either __host__ or __device__ separately, the code compiles and works as expected.

template<bool onDevice> void assignScalar2( size_t size, double* vector, double a )
{
    if( onDevice )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }
    else
    {
        auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }
}

However, this introduces code duplication and actually defeats the purpose of using lambdas for this example.
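For comparison, the pre-lambda alternative that does work under CUDA 7.5 is a hand-written functor with a __host__ __device__ call operator; it avoids the duplication, but it brings back exactly the boilerplate the lambda was supposed to remove. A minimal sketch (the struct and function names are mine):

struct AssignScalar
{
    double* vector;
    double  a;
    __host__ __device__ void operator()( size_t index ) const { vector[index] = a; }
};

template<bool onDevice> void assignScalarFunctor( size_t size, double* vector, double a )
{
    AssignScalar assign{ vector, a };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}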

Is there a way to accomplish what I want, or is this a bug in the experimental feature? Defining a __host__ __device__ lambda is explicitly mentioned in the first example of the programming guide. Even for that simpler example (a lambda that just returns a constant value) I could not find a way to use the lambda expression on both host and device.
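For reference, here is a minimal sketch of that simpler case as I tried it (the kernel and all names are mine); calling the lambda on the host compiles, but the kernel instantiation triggers the same error under CUDA 7.5:

template<typename Operation> __global__ void kernel_constant( int* out, Operation o )
{
    *out = o();
}

int tryConstantLambda( int* dResult )
{
    auto seven = [] __host__ __device__ () { return 7; };
    kernel_constant<<<1,1>>>( dResult, seven );  // same compile error under CUDA 7.5
    return seven();                              // calling it on the host is fine
}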

Here is the full code; compile with nvcc -std=c++11 --expt-extended-lambda:

#include <cstdio>   // printf (used in the device kernel)
#include <iostream>
using namespace std;

template<typename Operation> void foreachHost( size_t size, Operation o )
{
    for( size_t i = 0; i < size; ++i )
    {
        o( i );
    }
}

template<typename Operation> __global__ void kernel_foreach( Operation o )
{
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    o( index );
}

template<typename Operation> void foreachDevice( size_t size, Operation o )
{
    size_t blocksize = 32;
    size_t gridsize = size/blocksize;   // note: assumes size is a multiple of blocksize
    kernel_foreach<<<gridsize,blocksize>>>( o );
}

__global__ void printFirstElementOnDevice( double* vector )
{
    printf( "dVector[0] = %f\n", vector[0] );
}

void assignScalarHost( size_t size, double* vector, double a )
{
    auto assign = [=] ( size_t index ) { vector[index] = a; };
    foreachHost( size, assign );
}

void assignScalarDevice( size_t size, double* vector, double a )
{
    auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
    foreachDevice( size, assign );
}

// compile error:
template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=]  __host__ __device__ ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

// works:
template<bool onDevice> void assignScalar2( size_t size, double* vector, double a )
{
    if( onDevice )
    {
        auto assign = [=] __device__ ( size_t index ) { vector[index] = a; };
        foreachDevice( size, assign );
    }
    else
    {
        auto assign = [=] __host__ ( size_t index ) { vector[index] = a; };
        foreachHost( size, assign );
    }
}

int main()
{
    size_t SIZE = 32;

    double* hVector = new double[SIZE];
    double* dVector;
    cudaMalloc( &dVector, SIZE*sizeof(double) );

    // clear memory
    for( size_t i = 0; i < SIZE; ++i )
    {
        hVector[i] = 0;
    }
    cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

    assignScalarHost( SIZE, hVector, 1.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalarDevice( SIZE, dVector, 2.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

    assignScalar2<false>( SIZE, hVector, 3.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalar2<true>( SIZE, dVector, 4.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

//  assignScalar<false>( SIZE, hVector, 5.0 );
//  cout << "hVector[0] = " << hVector[0] << endl;
//
//  assignScalar<true>( SIZE, dVector, 6.0 );
//  printFirstElementOnDevice<<<1,1>>>( dVector );
//  cudaDeviceSynchronize();

    cudaError_t error = cudaGetLastError();
    if( error != cudaSuccess )
    {
        cout << "ERROR: " << cudaGetErrorString(error) << endl;
    }

    cudaFree( dVector );
    delete[] hVector;
}

I used the production release of CUDA 7.5.

Update

I tried a third version of the assignScalar function:

template<bool onDevice> void assignScalar3( size_t size, double* vector, double a )
{
#ifdef __CUDA_ARCH__
#define LAMBDA_HOST_DEVICE __device__
#else
#define LAMBDA_HOST_DEVICE __host__
#endif

    auto assign = [=] LAMBDA_HOST_DEVICE ( size_t index ) { vector[index] = a; };
    if( onDevice )
    {
        foreachDevice( size, assign );
    }
    else
    {
        foreachHost( size, assign );
    }
}

It compiles and runs without errors, but the device version (assignScalar3<true>) is not executed. I had thought that __CUDA_ARCH__ would always be undefined here (since the function is not __device__), but I checked explicitly and there is a compilation pass in which it is defined.
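As far as I can tell this matches nvcc's compilation model: the translation unit is compiled in several passes, one for the host and one per target device architecture, and __CUDA_ARCH__ is defined during the device passes even while host-side code is preprocessed. The macro therefore selects a different annotation in each compilation pass rather than distinguishing host from device execution. A rough illustration (the function name is mine):

void whichPass()
{
#ifdef __CUDA_ARCH__
    // this branch is preprocessed during each device compilation pass
#else
    // this branch is preprocessed during the host compilation pass
#endif
}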

1 Answer

The task I tried to accomplish with the examples in the question is not possible with CUDA 7.5, even though it was not explicitly excluded from the allowed cases for the experimental lambda support.

NVIDIA announced that CUDA Toolkit 8.0 will support __host__ __device__ lambdas as an experimental feature, according to the blog post CUDA 8 Features Revealed.

I verified that my example works with the CUDA 8 Release Candidate (Cuda compilation tools, release 8.0, V8.0.26).

Here is the code that I finally used, compiled with nvcc -std=c++11 --expt-extended-lambda:

#include <cstdio>   // printf (used in the device kernel)
#include <iostream>
using namespace std;

template<typename Operation> __global__ void kernel_foreach( Operation o )
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    o( i );
}

template<bool onDevice, typename Operation> void foreach( size_t size, Operation o )
{
    if( onDevice )
    {
        size_t blocksize = 32;
        size_t gridsize = size/blocksize;   // note: assumes size is a multiple of blocksize
        kernel_foreach<<<gridsize,blocksize>>>( o );
    }
    else
    {
        for( size_t i = 0; i < size; ++i )
        {
            o( i );
        }
    }
}

__global__ void printFirstElementOnDevice( double* vector )
{
    printf( "dVector[0] = %f\n", vector[0] );
}

template<bool onDevice> void assignScalar( size_t size, double* vector, double a )
{
    auto assign = [=]  __host__ __device__ ( size_t i ) { vector[i] = a; };
    foreach<onDevice>( size, assign );
}

int main()
{
    size_t SIZE = 32;

    double* hVector = new double[SIZE];
    double* dVector;
    cudaMalloc( &dVector, SIZE*sizeof(double) );

    // clear memory
    for( size_t i = 0; i < SIZE; ++i )
    {
        hVector[i] = 0;
    }
    cudaMemcpy( dVector, hVector, SIZE*sizeof(double), cudaMemcpyHostToDevice );

    assignScalar<false>( SIZE, hVector, 3.0 );
    cout << "hVector[0] = " << hVector[0] << endl;

    assignScalar<true>( SIZE, dVector, 4.0 );
    printFirstElementOnDevice<<<1,1>>>( dVector );
    cudaDeviceSynchronize();

    cudaError_t error = cudaGetLastError();
    if( error != cudaSuccess )
    {
        cout << "ERROR: " << cudaGetErrorString(error) << endl;
    }

    cudaFree( dVector );
    delete[] hVector;
}
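
For reference, with a working CUDA 8 setup the program should print:

hVector[0] = 3
dVector[0] = 4.000000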