I've got a strange problem with CUDA.
In the snippet below:
// Minimal reproduction kernel. $PRECISION is a template placeholder that is
// substituted with a concrete C type before the source is compiled.
#include <stdio.h>
// Number of consecutive output elements written by each thread.
#define OUTPUT_SIZE 26
typedef $PRECISION REAL;
extern "C"
{
// Each thread fills OUTPUT_SIZE consecutive elements of `results`, starting at
// OUTPUT_SIZE * id, with its own global thread index `id`.
// There is no bounds check, so `results` must hold at least OUTPUT_SIZE
// elements for every launched thread.
__global__ void test_coeff ( REAL* results )
{
// Global thread index; only the x dimension of the launch is used.
int id = blockDim.x * blockIdx.x + threadIdx.x;
// First element of this thread's slice of `results`.
int out_index = OUTPUT_SIZE * id;
for (int i=0; i<OUTPUT_SIZE; i++)
{
// The integer index is implicitly converted to REAL on assignment.
results[out_index+i]=id;
// Device-side printf, executed once per written element.
printf("q");
}
}
}
When I compile and run the code (via pycuda), it works as expected. When I remove the printf, then the results are weird - most of the array is populated correctly, but some of it seems completely random.
Here's the full Python code:
import numpy as np
import string
#pycuda stuff
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
class MC:
    """Compile the ``test_coeff`` CUDA kernel and launch it twice for comparison.

    The kernel source is stored as a ``string.Template``; its ``$PRECISION``
    placeholder is replaced with the C type that matches the NumPy precision
    passed to the constructor.
    """

    cudacodetemplate = """
#include <stdio.h>
#define OUTPUT_SIZE 26
typedef $PRECISION REAL;
extern "C"
{
__global__ void test_coeff ( REAL* results )
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
int out_index = OUTPUT_SIZE * id;
for (int i=0; i<OUTPUT_SIZE; i++)
{
results[out_index+i]=id;
//printf("q");
}
}
}
"""

    @staticmethod
    def _c_type_for(prec):
        """Return the C type name ('float' or 'double') matching *prec*.

        *prec* is normalised with ``np.dtype`` first, so ``np.float32``,
        ``'float32'`` and ``np.dtype('float32')`` are all treated alike.

        Raises:
            ValueError: if *prec* is not a 32-bit or 64-bit float type.  The
                kernel's REAL type must match the host array's dtype, so
                other precisions are rejected instead of being compiled as
                ``double``.
        """
        dtype = np.dtype(prec)
        if dtype == np.float32:
            return 'float'
        if dtype == np.float64:
            return 'double'
        raise ValueError(
            "unsupported precision %r; expected np.float32 or np.float64"
            % (prec,))

    def __init__(self, size, prec = np.float32):
        """Set the device malloc heap limit and compile the kernel.

        Args:
            size: exponent of the block size; ``test`` launches ``2**size``
                threads per block.
            prec: NumPy float type used for the host arrays and, through
                the ``$PRECISION`` substitution, for the kernel's REAL type.

        Raises:
            ValueError: if *prec* is neither float32 nor float64.
        """
        # 800 MiB should be enough.  The limit has to be applied through
        # Context.set_limit(); assigning to drv.limit.MALLOC_HEAP_SIZE would
        # only rebind the enum attribute and leave the device heap unchanged.
        drv.Context.set_limit(drv.limit.MALLOC_HEAP_SIZE, 1024*1024*800)
        self.size = size
        self.prec = prec
        template = string.Template(MC.cudacodetemplate)
        self.cudacode = template.substitute(PRECISION=MC._c_type_for(prec))
        # The template carries its own extern "C" block, so SourceModule is
        # told not to wrap the source in another one.  Passing
        # options=['--ptxas-options=-v'] as well makes ptxas print verbose
        # compilation output.
        self.module = SourceModule(self.cudacode, no_extern_c=True)

    def test(self, out_size):
        """Launch ``test_coeff`` twice on fresh buffers and return the difference.

        Args:
            out_size: number of output elements reserved per thread.
                NOTE: the kernel always writes OUTPUT_SIZE (26) elements per
                thread regardless of this value, so it should be at least 26
                to keep the writes inside the buffer.

        Returns:
            The element-wise difference between the two results, an array of
            shape ``(128, out_size * 2**size)``.  It is all zeros when both
            launches produce the same output.
        """
        shape = (128, out_size * (2**self.size))
        test = np.zeros(shape, dtype=self.prec)
        test2 = np.zeros(shape, dtype=self.prec)
        test_coeff = self.module.get_function('test_coeff')
        # 128 blocks of 2**size threads each; the same launch is issued twice.
        launch = dict(block=(2**self.size, 1, 1), grid=(128, 1))
        test_coeff(drv.Out(test), **launch)
        test_coeff(drv.Out(test2), **launch)
        return test - test2
if __name__ == '__main__':
    # 2**5 = 32 threads per block, double precision.
    p1 = MC(5, np.float64)
    # 26 matches the kernel's OUTPUT_SIZE, so each thread's slice fits exactly.
    err = p1.test(26)
    # Both launches are identical, so the expected difference is 0 everywhere.
    # The call form of print works on both Python 2 and Python 3.
    print(err.max())
    print(err.min())
Basically, with the printf in the kernel, err is 0; without it, the script prints a seemingly random error (on my machine, around 2452 for the max and -2583 for the min).
I have no idea why.
I'm running CUDA 4.2 with PyCUDA 2012.2 (Windows 7 64-bit) on a GeForce 570.
Thanks.
This is most likely due to compiler optimization. You are setting a block of memory OUTPUT_SIZE in length to the loop-constant value of id. In my experience the compiler will optimize that to a memcpy or something similar unless there is something else going on in the loop, i.e., your print statement. Furthermore, if you do not use that block of memory, the compiler may optimize the entire loop away. Try fiddling with your optimization levels and see whether you get different results.
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With