I am working on a stochastic process and I want to generate a different series of random numbers in a CUDA kernel each time the program is run. This is similar to what we do in C++ by declaring seed = time(NULL), followed by srand(seed) and rand().
I can pass seeds from the host to the device via the kernel, but the problem with doing this is that I would have to pass an entire array of seeds into the kernel so that each thread has a different random seed each time. Is there a way I could generate a random seed, the process id, the machine time, or something like that within the kernel and pass it as a seed?
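(For reference, the host-side C/C++ pattern I mean is the standard one below; just an illustrative sketch, nothing CUDA-specific.)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main() {
    unsigned int seed = (unsigned int)time(NULL);   // different seed on each run
    srand(seed);                                    // seed the generator once
    printf("%f\n", rand() / (float)RAND_MAX);       // a uniform number in [0, 1]
    return 0;
}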
You don't need to pass an array of random seeds. When you use the cuRAND library, you can set the sequence number parameter of curand_init to the thread index, so that every thread draws from a different subsequence of the same generator. For example [disclaimer: it is a non-tested function]:
__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        // Same seed for every thread, but the thread index i is used as the
        // sequence number, so each thread draws from an independent subsequence.
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}
You can also avoid passing the seed from outside altogether if you change the curand_init instruction to
curand_init(clock64(), i, 0, &state);
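For completeness, a minimal host-side sketch of how the first variant could be launched (untested; Np, the launch configuration, and the time(NULL)-based seed are just illustrative choices, and generate_random_numbers is the kernel defined above):
#include <stdio.h>
#include <time.h>
#include <curand_kernel.h>

int main() {
    const int Np   = 131072;              // number of random values to generate
    const int nTPB = 256;                 // threads per block

    float *d_numbers;
    cudaMalloc((void**)&d_numbers, Np * sizeof(float));

    // Host-generated seed: different on every run, a single scalar passed to all threads.
    unsigned long seed = (unsigned long)time(NULL);

    generate_random_numbers<<<(Np + nTPB - 1) / nTPB, nTPB>>>(d_numbers, seed, Np);
    cudaDeviceSynchronize();

    cudaFree(d_numbers);
    return 0;
}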
EDIT
Following Roger Dahl's comment, I have done a comparison (on a Kepler K20c) between four different possibilities for generating arrays of 131072 elements:

1. separate kernels for cuRAND initialization and random number generation: 861 ms;
2. a single kernel doing both initialization and generation: 852 ms;
3. separate kernels, with multiple random number generations from the stored cuRAND states: 866 ms;
4. a single kernel, with the cuRAND state re-initialized for each generation: 2556 ms.

I hope I have correctly understood the performance issue raised by Roger Dahl. The full code is below.
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192*16
#define nTPB 256
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/*************************/
/* CURAND INITIALIZATION */
/*************************/
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}
__global__ void testrand1(curandState *state, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] = curand_uniform(&state[idx]);
}
__global__ void testrand2(unsigned long seed, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    a[idx] = curand_uniform(&state);
}
/********/
/* MAIN */
/********/
int main() {
    int n_iter = 20;

    curandState *devState;  gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));
    float *d_a;             gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels: %3.1f ms \n", time);

    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels: %3.1f ms \n", time);

    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels with multiple random number generation: %3.1f ms \n", time);

    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels for multiple random number generation: %3.1f ms \n", time);

    getchar();
}
Output on GTX660:
Elapsed time for separate kernels: 1960.3 ms
Elapsed time for single kernels: 1536.9 ms
Elapsed time for separate kernels with multiple random number generation: 1576.0 ms
Elapsed time for single kernels for multiple random number generation: 4612.2 ms
Output on GTX570:
Elapsed time for separate kernels: 957.2 ms
Elapsed time for single kernels: 947.7 ms
Elapsed time for separate kernels with multiple random number generation: 964.6 ms
Elapsed time for single kernels for multiple random number generation: 2839.0 ms
Approximately the same performance as the K20c.