Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

CUDA device runtime api cudaMemsetAsync doesn't work

I am trying to call cudaMemsetAsync from kernel (so called "dynamic parallelism"). But no matter what value I use, it always set memory to 0.

Here is my test code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>

const int size = 5;

__global__ void kernel(int *c)
{
    cudaMemsetAsync(c, 0x7FFFFFFF, size * 4, NULL);
}

int main()
{
    cudaError_t cudaStatus;
    int c[size] = { 12, 12, 12, 12, 12 };
    int *dev_c = 0;

    cudaStatus = cudaSetDevice(0);
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    kernel <<< 1, 1 >>>(dev_c);
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_c);
    cudaStatus = cudaDeviceReset();

    printf("%d\n", cudaStatus);
    printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}

And if I run it, I got output like this:

>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
   Creating library a.lib and object a.exp
0
{0,0,0,0,0}

When I call memory set, I use value 0x7FFFFFFF. I'm expecting non-zero numbers, but it always shows zero.

Is this a bug? or I did something wrong? I'm using CUDA 8.0

like image 848
Xiang Zhang Avatar asked May 17 '17 14:05

Xiang Zhang


People also ask

Is cudaMemcpy blocked?

Data transfers in either direction between the host and device using assignment statements or the function cudaMemcpy() are blocking transfers; that is, control is returned to the host thread only after the data transfer is complete.

What does cudaDeviceSynchronize do?

cudaDeviceSynchronize() will force the program to ensure the stream(s)'s kernels/memcpys are complete before continuing, which can make it easier to find out where the illegal accesses are occuring (since the failure will show up during the sync).

What is Cudamemset?

Fills the first count bytes of the memory area pointed to by devPtr with the constant byte value value. Parameters: devPtr - Pointer to device memory value - Value to set for each byte of specified memory count - Size in bytes to set.

What is a Cuda error?

A CUDA Error: Device-Side Assert Triggered can either be caused by an inconsistency between the number of labels and output units or an incorrect input for a loss function.


1 Answers

I can confirm this appears not to work in CUDA 8 on the systems I tested it with.

If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The kernel will emit a byte sized loop inline within your kernel and the operation will be handled by each running thread.

If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:

#include <cstring>
#include <cstdio>

const int size = 5;

__global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
{
    size_t tid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned char* _p = (unsigned char*)p;
    for(; tid < sz; tid += blockDim.x * gridDim.x) {
       _p[tid] = val;
    }
}

__device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
{
    const dim3 blocksz(256,1,1); 
    size_t nblocks = (sz + blocksz.x -1) / blocksz.x;

    unsigned charval = val & 0xff;
    myMemset_kernel<<< dim3(nblocks,1,1), blocksz, 0, s >>>(p, charval, sz); 
}

__global__ void kernel(int *c)
{
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    myMemset(c, 0x7FFFFFFF, size * 4, s);
    cudaDeviceSynchronize();
}

int main()
{
    int c[size];
    int *dev_c;

    memset(&c[0], 0xffffff0c, size * sizeof(int));
    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);

    cudaMalloc((void**)&dev_c, size * sizeof(int));
    cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    kernel <<< 1, 1 >>>(dev_c);
    cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_c);

    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}

which compiles and does this:

$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
$ ./memset 
{0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
{ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}

A final point -- note the values above and read this question and answer. In your code, it is not possible to use cudaMemset to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway.


like image 128
2 revs Avatar answered Nov 02 '22 23:11

2 revs