I am working on a stochastic process and I want to generate a different series of random numbers in a CUDA kernel each time the program is run. This is similar to what we do in C++ by declaring seed = time(NULL), followed by srand(seed) and rand().
I can pass seeds from the host to the device via the kernel, but the problem with doing this is that I would have to pass an entire array of seeds into the kernel so that each thread has a different random seed each time. Is there a way I could generate a random seed, the process id, the machine time, or something like that within the kernel and pass it as a seed?
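(For reference, the host-side C/C++ pattern I mean is the standard one below; just an illustrative sketch, nothing CUDA-specific.)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main() {
    unsigned int seed = (unsigned int)time(NULL);   // different seed on each run
    srand(seed);                                    // seed the generator once
    printf("%f\n", rand() / (float)RAND_MAX);       // a uniform number in [0, 1]
    return 0;
}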
You don't need to pass an array of random seeds. When you use the cuRAND library, you can set the sequence number parameter of curand_init to the thread index, so that every thread draws from a different subsequence of the same generator. For example [disclaimer: it is a non-tested function]:
__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        // Same seed for every thread, but the thread index i is used as the
        // sequence number, so each thread draws from an independent subsequence.
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}
You can also avoid passing the seed from outside altogether if you change the curand_init instruction to
curand_init(clock64(), i, 0, &state);
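For completeness, a minimal host-side sketch of how the first variant could be launched (untested; Np, the launch configuration, and the time(NULL)-based seed are just illustrative choices, and generate_random_numbers is the kernel defined above):
#include <stdio.h>
#include <time.h>
#include <curand_kernel.h>

int main() {
    const int Np   = 131072;              // number of random values to generate
    const int nTPB = 256;                 // threads per block

    float *d_numbers;
    cudaMalloc((void**)&d_numbers, Np * sizeof(float));

    // Host-generated seed: different on every run, a single scalar passed to all threads.
    unsigned long seed = (unsigned long)time(NULL);

    generate_random_numbers<<<(Np + nTPB - 1) / nTPB, nTPB>>>(d_numbers, seed, Np);
    cudaDeviceSynchronize();

    cudaFree(d_numbers);
    return 0;
}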
EDIT
Following Roger Dahl's comment, I have done a comparison (on a Kepler K20c) between four different possibilities for generating arrays of 131072 elements:

1. separate kernels for cuRAND initialization and random number generation: 861 ms;
2. a single kernel doing both initialization and generation: 852 ms;
3. separate kernels, with multiple random number generations from the stored cuRAND states: 866 ms;
4. a single kernel, with the cuRAND state re-initialized for each generation: 2556 ms.

I hope I have correctly understood the performance issue raised by Roger Dahl. The full code is below.
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192*16
#define nTPB 256
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/*************************/
/* CURAND INITIALIZATION */
/*************************/
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}
__global__ void testrand1(curandState *state, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] = curand_uniform(&state[idx]);
}
__global__ void testrand2(unsigned long seed, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    a[idx] = curand_uniform(&state);
}
/********/
/* MAIN */
/********/
int main() {
    int n_iter = 20;

    curandState *devState;  gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));
    float *d_a;             gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels: %3.1f ms \n", time);

    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels: %3.1f ms \n", time);

    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels with multiple random number generation: %3.1f ms \n", time);

    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels for multiple random number generation: %3.1f ms \n", time);

    getchar();
}
Output on GTX660:
Elapsed time for separate kernels: 1960.3 ms
Elapsed time for single kernels: 1536.9 ms
Elapsed time for separate kernels with multiple random number generation: 1576.0 ms
Elapsed time for single kernels for multiple random number generation: 4612.2 ms
Output on GTX570:
Elapsed time for separate kernels: 957.2 ms
Elapsed time for single kernels: 947.7 ms
Elapsed time for separate kernels with multiple random number generation: 964.6 ms
Elapsed time for single kernels for multiple random number generation: 2839.0 ms
Approximately the same performance as the K20c.