I made an OpenCL program and use pinned memory (CL_MEM_ALLOC_HOST_PTR
) to get a higher transfer rate from device to host.
The transfer rate is increased as I expected (get transfer rate using AMD APP Profiler 2.4). The problem is the transfer rate is higher than PCIe bandwidth (93703 GB /s) for matrix 4096 x 4096 (64 MB).
It happened too when I use zero copy buffer ( CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer). I search some information that it is true if pinned memory and zero copy buffer have high transfer rate but it still limited with PCIe bandwidth for discrete GPU. So, is it normal if the transfer rate exceed PCIe bandwidth (using PCIe bandwidth 2.0 x 16)?
My OS is Windows 7 64 bit. I use AMD APP SDK 2.6 and discrete GPU AMD HD 6630M.
Edit: Here is the code:
#include <Windows.h>
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
cl_context context = NULL;
cl_command_queue queue = NULL;
cl_program program = NULL;
void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)
{
cl_int err;
cl_kernel naive;
// Create Kernel Object Bound To Kernel Function
naive = clCreateKernel(program, "naiveAlgorithm", &err);
//Set size of global work item and work tem in each work goups
int globalsize = size;
int localsize;
if(globalsize >= 16)
{
localsize =16;
}else
{
localsize = globalsize;
}
size_t global_work_items [2] = {globalsize, globalsize};
size_t local_work_items [2] = {localsize, localsize};
// Setup Kernel Argument
err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);
err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);
err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);
err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);
// Execute OpenCL kernel for Naive Algorithm
err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);
clFinish(queue);
//Release Kernel
err = clReleaseKernel(naive);
}
void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)
{
int err;
// OpenCL device memory for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
// Allocate Device Memory For Input And Output
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_B = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
// Copy Host Memory To Memory Device
err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL);
MatrixMul(d_A, d_B, d_C, size);
err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);
err = clReleaseMemObject(d_A);
err = clReleaseMemObject(d_B);
err = clReleaseMemObject(d_C);
}
//Main Function
int main(int argc, char **argv)
{
//Size of matrix for Strassen Algorithm
cl_int size = 4096;
//Matrix for input and output
cl_float * matrixA;
cl_float * matrixB;
cl_float * matrixC;
//Allocate and init memory for the host
matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));
matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));
//Fill matrix
fillMatrix(matrixA,size);
fillMatrix(matrixB,size);
//print input for matrix A and B
cout<<"Input for matrix A :"<<endl;
printMatrix(matrixA, size*size, size);
cout<<"Input for matrix B :"<<endl;
printMatrix(matrixB, size*size, size);
cl_int err; // error code
cl_platform_id* platforms;
cl_uint platformCount;
cl_device_id device;
int platformtype = 0; //if 0 using amd app sdk but if 1 using intel sdk
clGetPlatformIDs(0, NULL, &platformCount); //get number of platform
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
clGetPlatformIDs(platformCount, platforms, NULL); //get list of platform
clGetDeviceIDs (platforms [platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get list of devices
const cl_context_properties contextProperties [] =
{CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties> (platforms [platformtype]),
0, 0
};
context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);
![enter image description here][2]queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
//Load Kernel Source
FILE *fp;
const char fileName[] = "./MatMul_Kernel.cl";
size_t source_size;
char *source_str;
fp = fopen(fileName, "r");
if (!fp)
{
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Create Program Object
program = clCreateProgramWithSource(context, 1, (const char **) &source_str,(const size_t *),
&source_size, &err);
// Build Program
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
Naive(matrixA, matrixB, matrixC, size);
//Cleanup all memory
err = clFlush(queue);
err = clFinish(queue);
err = clReleaseProgram(program);
err = clReleaseCommandQueue(queue);
err = clReleaseContext(context);
// Display result of matrix multiplication
cout<<"Output for matrix C :"<<endl;
printMatrix(matrixC, size*size, size);
cout<<endl;
free(matrixA);
free(matrixB);
free(matrixC);
free(source_str);
return 0;
}
And here is the kernel code:
__kernel void naiveAlgorithm(__global float *A, __global float *B, __global float *C, int size) {
int tx = get_global_id(0); //2D Thread IDx
int ty = get_global_id(1); //2D Thread IDy
float sum = 0;
//Calculate result of one element of Matrix C
for (int k = 0; k < size; k++) {
sum += A[ty*size+k] * B[k*size+tx];
}
C[ty*size+tx] = sum;
}
And here is the image:
I see that your output array is actually located in host memory because of the CL_MEM_ALLOC_HOST_PTR
flag in the following line:
d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR ,sizeof(cl_float)*size*size, 0,&err);
This means that you should be using clEnqueueMapBuffer
, followed by using the matrix in whatever way you see fit, followed by clEnqueueUnmapMemObject
. There is no need for the array matrixC since d_C is already in host memory.
The data transfer from GPU to host actually happens while your kernel is running. The map call makes sure that all data has finished moving from the GPU to the CPU. That is why the transfer times are actually so small.
I can't find any documentation on whether clEnqueueReadBuffer
works for pinned memory or not. I also see that you are retrieving the error codes of each operation but do not check these error codes, hence your code may be silently failing.
Regarding the large difference between the time taken by clEnqueueReadBuffer
and the time spent transferring data, note that all queued operations don't immediately get dispatched to the GPU. One source of delay is the Windows display driver model (WDDM) for graphics cards. The +-20 micro-seconds used for the clEnqueueReadBuffer
sounds right for this delay (I've actually seen longer delays).
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With