here's a simple OpenCL Matrix Multiplication kernel which is driving me crazy:
By the way I am using pyopencl.
__kernel void matrixMul( __global int* C,
__global int* A,
__global int* B,
int wA, int wB){
int row = get_global_id(1); //2D Threas ID x
int col = get_global_id(0); //2D Threas ID y
//Perform dot-product accumulated into value
int value = 0;
for ( int k = 0; k < wA; k++ ){
value += A[row*wA + k] * B[k*wB+col];
}
C[row*wA+col] = value; //Write to the device memory
}
Where (inputs)
A = [72 45
75 61]
B = [26 53
46 76]
wA = wB = 2
Output I am getting:
Sometime I get:
C = [3942 0
0 5472]
Else I get:
C = [3942 7236
3312 5472]
But the output should be:
C = [3942 7236
4756 8611]
I don't know what mistake I am making here. I have spent the entire day with no luck.
Please help me with this
Here's the full python code:
import pyopencl as cl
import numpy as np
import os
ORDER = 2
LEN = ORDER*ORDER
ctx = cl.create_some_context()
commandQueue = cl.CommandQueue( ctx )
A = np.array((72, 45, 75, 61), dtype = np.int32)
B = np.array((26, 53, 46, 76), dtype = np.int32)
C = np.empty_like(A)
in_buf1 = cl.Buffer( ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
hostbuf = A )
in_buf2 = cl.Buffer( ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
hostbuf = B )
out_buf = cl.Buffer( ctx, cl.mem_flags.WRITE_ONLY, C.nbytes )
kernelSrc1 = """__kernel void
matrixMul( /*const int Mdim,
const int Ndim,
const int Pdim,*/
__global int* C,
__global int* A,
__global int* B,
int wA, int wB)
{
int row = get_global_id(1); //2D Threas ID x
int col = get_global_id(0); //2D Threas ID y
//Perform dot-product accumulated into value
int value = 0;
for ( int k = 0; k < wA; k++ ){
value += A[row*wA + k] * B[k*wB+col];
}
C[row*wA+col] = value; //Write to the device memory
}"""
program1 = cl.Program(ctx, kernelSrc1 ).build()
event1 = program1.matrixMul( commandQueue, (LEN, ), None,
out_buf, in_buf1, in_buf2, np.int32(ORDER), np.int32(ORDER));
event1.wait()
cl.enqueue_copy(commandQueue, C, out_buf)
print C
I am using Python 2.7.x, pyopencl 2012.1, AMD APP SDK
You are setting your global size argument incorrectly. Since you are using two dimensions of global size in your kernel, you need to set your global size to (ORDER,ORDER). When you change it to that, you get:
[3942 7236
4756 8611]
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With