I have a problem with passing a pointer to the struct to the device function. I want to create a struct in local memory (i know it's slow, it's just an example) and pass it to the other function by pointer. The problem is that when i debug it with memcheck on, i get error:
Program received signal CUDA_EXCEPTION_1, Lane Illegal Address.
Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 7, warp 0, lane 0
0x0000000000977608 in foo (st=0x3fffc38) at test.cu:15
15 st->m_tx = 99;
If I debug it without memcheck on, it works fine and gives expected results. My OS is RedHat 6.3 64-bits with Kernel 2.6.32-220. I use GTX680, CUDA 5.0 and compile the program with sm=30.
Code I used for testing this is below:
typedef struct __align__(8) {
int m_x0;
int m_tx;
} myStruct;
__device__ void foo(myStruct *st) {
st->m_tx = 99;
st->m_x0 = 123;
}
__global__ void myKernel(){
myStruct m_struct ;
m_struct.m_tx = 45;
m_struct.m_x0 = 90;
foo(&m_struct);
}
int main(void) {
myKernel <<<1,1 >>>();
cudaThreadSynchronize();
return 0;
}
Any suggestions? Thanks for any help.
Your example code is completely optimised away by the compiler because none of the code contributes to a global memory write. This is easily proved by compiling the kernel to a cubin file and disassembling the result with cuobjdump:
$ nvcc -arch=sm_20 -Xptxas="-v" -cubin struct.cu
ptxas info : Compiling entry function '_Z8myKernelv' for 'sm_20'
ptxas info : Function properties for _Z8myKernelv
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 2 registers, 32 bytes cmem[0]
$ cuobjdump -sass struct_dumb.cubin
code for sm_20
Function : _Z8myKernelv
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x00001de780000000*/ EXIT;
.............................
ie. the kernel is completely empty. The debugger can't debug the code you want to investigate because it does not exist in what the compiler/assembler emitted. If we take a few liberties with your code:
typedef struct __align__(8) {
int m_x0;
int m_tx;
} myStruct;
__device__ __noinline__ void foo(myStruct *st) {
st->m_tx = 99;
st->m_x0 = 123;
}
__global__ void myKernel(int dowrite, int *output){
myStruct m_struct ;
m_struct.m_tx = 45;
m_struct.m_x0 = 90;
if (dowrite) {
foo(&m_struct);
output[threadIdx.x] = m_struct.m_tx + m_struct.m_x0;
}
}
int main(void) {
int * output;
cudaMalloc((void **)(&output), sizeof(int));
myKernel <<<1,1 >>>(1, output);
cudaThreadSynchronize();
return 0;
}
and repeat the same compilation and disassembly steps, things look somewhat different:
$ nvcc -arch=sm_20 -Xptxas="-v" -cubin struct_dumb.cu
ptxas info : Compiling entry function '_Z8myKerneliPi' for 'sm_20'
ptxas info : Function properties for _Z8myKerneliPi
8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for _Z3fooP8myStruct
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 5 registers, 40 bytes cmem[0]
$ /usr/local/cuda/bin/cuobjdump -sass struct_dumb.cubin
code for sm_20
Function : _Z8myKerneliPi
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x20105d034800c000*/ IADD R1, R1, -0x8;
/*0010*/ /*0x68009de218000001*/ MOV32I R2, 0x5a;
/*0018*/ /*0xb400dde218000000*/ MOV32I R3, 0x2d;
/*0020*/ /*0x83f1dc23190e4000*/ ISETP.EQ.AND P0, pt, RZ, c [0x0] [0x20], pt;
/*0028*/ /*0x00101c034800c000*/ IADD R0, R1, 0x0;
/*0030*/ /*0x00109ca5c8000000*/ STL.64 [R1], R2;
/*0038*/ /*0x000001e780000000*/ @P0 EXIT;
/*0040*/ /*0x10011c0348004000*/ IADD R4, R0, c [0x0] [0x4];
/*0048*/ /*0xc001000750000000*/ CAL 0x80;
/*0050*/ /*0x00009ca5c0000000*/ LDL.64 R2, [R0];
/*0058*/ /*0x84011c042c000000*/ S2R R4, SR_Tid_X;
/*0060*/ /*0x90411c4340004000*/ ISCADD R4, R4, c [0x0] [0x24], 0x2;
/*0068*/ /*0x0c201c0348000000*/ IADD R0, R2, R3;
/*0070*/ /*0x00401c8590000000*/ ST [R4], R0;
/*0078*/ /*0x00001de780000000*/ EXIT;
/*0080*/ /*0x8c00dde218000001*/ MOV32I R3, 0x63;
/*0088*/ /*0xec009de218000001*/ MOV32I R2, 0x7b;
/*0090*/ /*0x1040dc8590000000*/ ST [R4+0x4], R3;
/*0098*/ /*0x00409c8590000000*/ ST [R4], R2;
/*00a0*/ /*0x00001de790000000*/ RET;
...............................
we get actual code in the assembler output. You might have more luck in the debugger with that.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With