I am trying to call <code>cudaMemsetAsync</code> from kernel (so called "dynamic parallelism"). But no matter what value I use, it always set memory to 0. Here is my test code: <pre class="prettyprint"><code>#include "cuda_runtime.h" #include "device_launch_parameters.h" #include "cuda_device_runtime_api.h" #include <stdio.h> const int size = 5; __global__ void kernel(int *c) { cudaMemsetAsync(c, 0x7FFFFFFF, size * 4, NULL); } int main() { cudaError_t cudaStatus; int c[size] = { 12, 12, 12, 12, 12 }; int *dev_c = 0; cudaStatus = cudaSetDevice(0); cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice); kernel <<< 1, 1 >>>(dev_c); cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_c); cudaStatus = cudaDeviceReset(); printf("%d\n", cudaStatus); printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]); return 0; } </code></pre> And if I run it, I got output like this: <pre class="prettyprint"><code>>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt kernel.cu Creating library a.lib and object a.exp 0 {0,0,0,0,0} </code></pre> When I call memory set, I use value <code>0x7FFFFFFF</code>. I'm expecting non-zero numbers, but it always shows zero. Is this a bug? or I did something wrong? I'm using CUDA 8.0

I can confirm this appears not to work in CUDA 8 on the systems I tested it with. If you want a single thread to perform the operation, you can use <code>memset</code> directly in device code (it, like <code>memcpy</code>, has been supported forever). The kernel will emit a byte sized loop inline within your kernel and the operation will be handled by each running thread. If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this: <pre class="prettyprint"><code>#include <cstring> #include <cstdio> const int size = 5; __global__ void myMemset_kernel(void* p, unsigned char val, size_t sz) { size_t tid = threadIdx.x + blockDim.x * blockIdx.x; unsigned char* _p = (unsigned char*)p; for(; tid < sz; tid += blockDim.x * gridDim.x) { _p[tid] = val; } } __device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL) { const dim3 blocksz(256,1,1); size_t nblocks = (sz + blocksz.x -1) / blocksz.x; unsigned charval = val & 0xff; myMemset_kernel<<< dim3(nblocks,1,1), blocksz, 0, s >>>(p, charval, sz); } __global__ void kernel(int *c) { cudaStream_t s; cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); myMemset(c, 0x7FFFFFFF, size * 4, s); cudaDeviceSynchronize(); } int main() { int c[size]; int *dev_c; memset(&c[0], 0xffffff0c, size * sizeof(int)); printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]); cudaMalloc((void**)&dev_c, size * sizeof(int)); cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice); kernel <<< 1, 1 >>>(dev_c); cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_c); printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]); return 0; } </code></pre> which compiles and does this: <pre class="prettyprint"><code>$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt $ ./memset {0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c} {ffffffff,ffffffff,ffffffff,ffffffff,ffffffff} </code></pre> A final point -- note the values above and read this question and answer. In your code, it is not possible to use <code>cudaMemset</code> to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, <code>cudaMemset</code> and its relatives work like regular <code>memset</code> and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway. <hr>

CUDA device runtime api cudaMemsetAsync doesn't work

Tags:

cuda

dynamic-parallelism

I am trying to call cudaMemsetAsync from kernel (so called "dynamic parallelism"). But no matter what value I use, it always set memory to 0.

Here is my test code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>

const int size = 5;

__global__ void kernel(int *c)
{
    cudaMemsetAsync(c, 0x7FFFFFFF, size * 4, NULL);
}

int main()
{
    cudaError_t cudaStatus;
    int c[size] = { 12, 12, 12, 12, 12 };
    int *dev_c = 0;

    cudaStatus = cudaSetDevice(0);
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    kernel <<< 1, 1 >>>(dev_c);
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_c);
    cudaStatus = cudaDeviceReset();

    printf("%d\n", cudaStatus);
    printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}

And if I run it, I got output like this:

>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
   Creating library a.lib and object a.exp
0
{0,0,0,0,0}

When I call memory set, I use value 0x7FFFFFFF. I'm expecting non-zero numbers, but it always shows zero.

Is this a bug? or I did something wrong? I'm using CUDA 8.0

848

asked May 17 '17 14:05

Xiang Zhang

1 Answers

I can confirm this appears not to work in CUDA 8 on the systems I tested it with.

If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The kernel will emit a byte sized loop inline within your kernel and the operation will be handled by each running thread.

If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:

#include <cstring>
#include <cstdio>

const int size = 5;

__global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
{
    size_t tid = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned char* _p = (unsigned char*)p;
    for(; tid < sz; tid += blockDim.x * gridDim.x) {
       _p[tid] = val;
    }
}

__device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
{
    const dim3 blocksz(256,1,1); 
    size_t nblocks = (sz + blocksz.x -1) / blocksz.x;

    unsigned charval = val & 0xff;
    myMemset_kernel<<< dim3(nblocks,1,1), blocksz, 0, s >>>(p, charval, sz); 
}

__global__ void kernel(int *c)
{
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    myMemset(c, 0x7FFFFFFF, size * 4, s);
    cudaDeviceSynchronize();
}

int main()
{
    int c[size];
    int *dev_c;

    memset(&c[0], 0xffffff0c, size * sizeof(int));
    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);

    cudaMalloc((void**)&dev_c, size * sizeof(int));
    cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    kernel <<< 1, 1 >>>(dev_c);
    cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_c);

    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}

which compiles and does this:

$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
$ ./memset 
{0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
{ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}

A final point -- note the values above and read this question and answer. In your code, it is not possible to use cudaMemset to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway.

128

answered Nov 02 '22 23:11

2 revs

Related questions
                            
                                Are GPUs good for case-based image filtering?
                            
                                Multi-GPU programming strategies using CUDA
                            
                                How to perform deep copying of struct with CUDA? [duplicate]
                            
                                installing Cuda NVIDIA Graphic Driver failed [closed]
                            
                                will cudaMalloc synchronize host and device?
                            
                                How to determine why a CUDA stream is blocking
                            
                                CUDA Profiler: Calculate memory and compute utilization
                            
                                Is there build-in cross and dot products in CUDA?
                            
                                Getting stack overflows with a CUDA kernel
                            
                                cuda-memcheck, how to get from address to source code?
                            
                                How is 3D texture memory cached?
                            
                                How to write a pointer-chasing benchmark using 64-bit pointers in CUDA?
                            
                                How to get cmake to enable cuda when compiling yolo (darknet)?
                            
                                Generalized sliding-window computation on the GPU
                            
                                Can I use thrust::host_vector or I must use cudaHostAlloc for zero-copy with Thrust?
                            
                                CUDA kernel as member function of a class
                            
                                NVCC warning level
                            
                                Segmentation fault in __pthread_getspecific called from libcuda.so.1
                            
                                Global memory access and L1 cache in Kepler
                            
                                How to have Apache Spark running on GPU?

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With