How could we generate random numbers in CUDA C with a different seed on each run?

Tags:

cuda

gpu

nvidia

I am working on a stochastic process, and I want to generate a different series of random numbers in the CUDA kernel each time I run the program. This is similar to what we do in C++ by declaring seed = time(NULL), followed by srand(seed) and rand().
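
For reference, the host-side pattern I mean is something like this (plain C++, just to illustrate the behaviour I want on the device):

#include <cstdlib>
#include <ctime>

int main() {
    unsigned seed = (unsigned)time(NULL); // different value on each run
    srand(seed);
    int r = rand();                       // so this sequence differs per run
    return 0;
}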

I can pass seeds from the host to the device via the kernel, but the problem with doing this is that I would have to pass an entire array of seeds into the kernel for each thread to have a different random seed each time. Is there a way to generate a random seed, process ID, machine time, or something like that within the kernel and use it as a seed?

asked Mar 15 '14 by user2624901



1 Answer

You don't need to pass an array of random seeds. If you use the cuRAND library, you can instead set the sequence number parameter of curand_init: with the same seed, each thread then draws from its own statistically independent subsequence. For example (disclaimer: an untested function):

__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        // Same seed for every thread; the thread index i is the sequence
        // number, so each thread draws from an independent subsequence.
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}
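
A minimal host-side driver for this kernel could look like the following untested sketch (the array size Np and the block size 256 are arbitrary choices); seeding from time(NULL) on the host gives a different sequence on every run:

#include <time.h>
#include <curand_kernel.h>

int main() {
    const int Np = 131072;
    float *d_numbers;
    cudaMalloc((void**)&d_numbers, Np * sizeof(float));

    unsigned long seed = (unsigned long)time(NULL); // changes between runs
    generate_random_numbers<<<(Np + 255) / 256, 256>>>(d_numbers, seed, Np);
    cudaDeviceSynchronize();

    cudaFree(d_numbers);
    return 0;
}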

You can also avoid passing the seed from the host altogether by changing the curand_init call to

curand_init(clock64(), i, 0, &state);

Since clock64() reads the device clock counter, the seed differs on every run. Be aware, however, that this makes runs non-reproducible, which can complicate debugging.
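
Putting that together, a self-seeding variant of the kernel above would be (again, an untested sketch):

__global__ void generate_random_numbers_selfseed(float* numbers, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        curand_init(clock64(), i, 0, &state); // device clock as the seed
        numbers[i] = curand_uniform(&state);
    }
}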

EDIT

Following Roger Dahl's comment, I have run a comparison on a Kepler K20c between four different approaches to generating arrays of 131072 random numbers:

  1. Single random number generation: separate kernels for initialization and generation;
  2. Single random number generation: a single fused kernel for initialization and generation;
  3. Multiple random number generations: separate kernels for initialization and generation;
  4. Multiple random number generations: a single fused kernel for initialization and generation.

Below is the code. The timings on the K20c were:

  1. 861ms;
  2. 852ms;
  3. 866ms;
  4. 2556ms;

I hope I have correctly understood the performance issue raised by Roger Dahl.

#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
#include <curand_kernel.h>

#define DSIZE 8192*16
#define nTPB 256

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
        if (abort) exit(code);
    }
}

/*************************/
/* CURAND INITIALIZATION */
/*************************/
// Persistent-state approach: every thread initializes its own generator
// state once; later kernels reuse it. DSIZE is a multiple of nTPB, so no
// bounds check is needed.
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

// Draws one uniform number per thread from previously initialized state.
__global__ void testrand1(curandState *state, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] = curand_uniform(&state[idx]);
}

// Fused approach: initialization and generation in a single kernel; the
// state is local to the thread and discarded after the draw.
__global__ void testrand2(unsigned long seed, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    a[idx] = curand_uniform(&state);
}

/********/
/* MAIN */
/********/
int main() {

    int n_iter = 20;

    curandState *devState;  gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));

    float *d_a;             gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float time;
    cudaEvent_t start, stop;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels with multiple random number generation:  %3.1f ms \n", time);

    cudaEventRecord(start, 0);

    for (int i=0; i<n_iter; i++) {

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels for multiple random number generation:  %3.1f ms \n", time);

    getchar();

    gpuErrchk(cudaFree(devState));
    gpuErrchk(cudaFree(d_a));
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}
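
For completeness, a typical compile line for the code above (the file name is arbitrary; adjust -arch to your GPU; the device-side cuRAND API is header-only, so no -lcurand is needed):

nvcc -arch=sm_35 -o curand_bench curand_bench.cu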

Output on GTX660:

Elapsed time for separate kernels:  1960.3 ms
Elapsed time for single kernels:  1536.9 ms
Elapsed time for separate kernels with multiple random number generation:  1576.0 ms
Elapsed time for single kernels for multiple random number generation:  4612.2 ms

Output on GTX570:

Elapsed time for separate kernels:  957.2 ms 
Elapsed time for single kernels:  947.7 ms 
Elapsed time for separate kernels with multiple random number generation:  964.6 ms 
Elapsed time for single kernels for multiple random number generation:  2839.0 ms 

Approximately the same performance as on the K20c.

answered Sep 21 '22 by Vitality