
OpenCL: Can one kernel call the other kernel

Tags:

opencl

Hi,
I am trying to run the available convolution code in OpenCL.
I have a heterogeneous system with:
1) a CPU
2) a GPU
Please find below the code base that is running on my system:

convolution.cl

// Apply a maskWidth x maskWidth convolution mask to the input signal.
__kernel 
void convolve(
    const __global uint * const input,
    __constant uint     * const mask,
    __global uint       * const output,
    const int                   inputWidth,
    const int                   maskWidth){

        const int x = get_global_id(0);
        const int y = get_global_id(1);

        uint sum = 0;

        for (int r = 0; r < maskWidth; r++)
        {
            const int idxIntmp = (y + r) * inputWidth + x;
            for (int c = 0; c < maskWidth; c++)
            {
                sum += mask[(r * maskWidth) + c] * input[idxIntmp + c];
            }
        }

        output[y * get_global_size(0) + x] = sum;
}

and convolution.cpp:

//Convolution: process of applying a 3×3 mask to an 8×8 input signal, resulting in a 6×6 output signal

    #include "CL/cl.h"
    #include "vector"
    #include "iostream"
    #include "time.h"

    #include <fstream>
    #include <sstream>
    #include <string>

using namespace std;

// Constants
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;

cl_uint inputSignal[inputSignalWidth][inputSignalHeight] =
{
    {3, 1, 1, 4, 8, 2, 1, 3},
    {4, 2, 1, 1, 2, 1, 2, 3},
    {4, 4, 4, 4, 3, 2, 2, 2},
    {9, 8, 3, 8, 9, 0, 0, 0},
    {9, 3, 3, 9, 0, 0, 0, 0},
    {0, 9, 0, 8, 0, 0, 0, 0},
    {3, 0, 8, 8, 9, 4, 4, 4},
    {5, 9, 8, 1, 8, 1, 1, 1}
};

const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;

cl_uint outputSignal[outputSignalHeight][outputSignalWidth];

const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;

cl_uint mask[maskWidth][maskHeight] =
{
    {1, 1, 1}, 
    {1, 0, 1}, 
    {1, 1, 1},
};

inline void checkErr(cl_int err, const char * name)
{
    if (err != CL_SUCCESS)
    {
        std::cerr << "ERROR: " << name
            << " (" << err << ")" << std::endl;
        exit(EXIT_FAILURE);
    }
}

void CL_CALLBACK contextCallback(
    const char * errInfo,
    const void * private_info,
    size_t cb,
    void * user_data)
{
    std::cout << "Error occurred during context use: "<< errInfo << std::endl;
    exit(EXIT_FAILURE);
}

int main(int argc, char *argv[]){
    cl_int errNum;

    cl_uint numPlatforms;
    cl_uint numDevices;

    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;

    cl_context context = NULL;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;

    cl_mem inputSignalBuffer;
    cl_mem outputSignalBuffer;
    cl_mem maskBuffer;

    clock_t start, end; // timer variables

    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);

    checkErr(
        (errNum != CL_SUCCESS) ? errNum :
        (numPlatforms <= 0 ? -1 : CL_SUCCESS),
        "clGetPlatformIDs");

    platformIDs = (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);

    checkErr(
        (errNum != CL_SUCCESS) ? errNum :
        (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

    deviceIDs = NULL;

    cl_uint i;

    for (i = 0; i < numPlatforms; i++)
    {
        errNum = clGetDeviceIDs(
            platformIDs[i],
            CL_DEVICE_TYPE_GPU,
            0,
            NULL,
            &numDevices);
        if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
        {
            checkErr(errNum, "clGetDeviceIDs");
        }
        else if (numDevices > 0)
        {
            deviceIDs = (cl_device_id *)malloc(
                sizeof(cl_device_id) * numDevices);

            errNum = clGetDeviceIDs(
                platformIDs[i], 
                CL_DEVICE_TYPE_GPU, 
                numDevices,
                &deviceIDs[0], 
                NULL);

            checkErr(errNum, "clGetDeviceIDs");

            break;
        }
    }
    if (deviceIDs == NULL) {
        std::cout << "No CPU device found" << std::endl;
        exit(-1);
    }
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,(cl_context_properties)platformIDs[i], 0
    };

    context = clCreateContext(
        contextProperties, numDevices, deviceIDs,
        &contextCallback, NULL, &errNum);

    checkErr(errNum, "clCreateContext");

    std::ifstream srcFile("convolution.cl");

    checkErr(srcFile.is_open() ? CL_SUCCESS : -1,
        "reading convolution.cl");

    std::string srcProg(
        std::istreambuf_iterator<char>(srcFile),
        (std::istreambuf_iterator<char>()));

    const char * src = srcProg.c_str();
    size_t length = srcProg.length();

    program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);

    checkErr(errNum, "clCreateProgramWithSource");

    errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);

    checkErr(errNum, "clBuildProgram");

    kernel = clCreateKernel(program, "convolve", &errNum);

    checkErr(errNum, "clCreateKernel");

    inputSignalBuffer = clCreateBuffer(
        context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_uint) * inputSignalHeight * inputSignalWidth,
        static_cast<void *>(inputSignal), &errNum);

    checkErr(errNum, "clCreateBuffer(inputSignal)");    

    maskBuffer = clCreateBuffer(
        context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_uint) * maskHeight * maskWidth,
        static_cast<void *>(mask), &errNum);

    checkErr(errNum, "clCreateBuffer(mask)");

    outputSignalBuffer = clCreateBuffer(
        context, CL_MEM_WRITE_ONLY,
        sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
        NULL, &errNum);

    checkErr(errNum, "clCreateBuffer(outputSignal)");

    queue = clCreateCommandQueue(
        context, deviceIDs[0], 0, &errNum);
    checkErr(errNum, "clCreateCommandQueue");

    errNum = clSetKernelArg(
        kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
    errNum |= clSetKernelArg(
        kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(
        kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
    errNum |= clSetKernelArg(
        kernel, 3, sizeof(cl_uint), &inputSignalWidth);
    errNum |= clSetKernelArg(
        kernel, 4, sizeof(cl_uint), &maskWidth);

    checkErr(errNum, "clSetKernelArg");

    // The kernel indexes with get_global_id(0) and get_global_id(1),
    // so launch a 2D NDRange of outputWidth x outputHeight work-items.
    const size_t globalWorkSize[2] = { outputSignalWidth, outputSignalHeight };
    const size_t localWorkSize[2] = { 1, 1 };

    start = clock();

    errNum = clEnqueueNDRangeKernel(
                                    queue,
                                    kernel,
                                    2, // work_dim: matches the 2D NDRange above
                                    NULL,
                                    globalWorkSize,
                                    localWorkSize,
                                    0,
                                    NULL,
                                    NULL
                                    );

    checkErr(errNum, "clEnqueueNDRangeKernel");

    errNum = clEnqueueReadBuffer(
        queue, outputSignalBuffer, CL_TRUE, 0,
        sizeof(cl_uint) * outputSignalHeight * outputSignalWidth,
        outputSignal, 0, NULL, NULL);

    checkErr(errNum, "clEnqueueReadBuffer");

    end = clock();
    cout << "Time in ms = " << ((double)(end - start) / CLOCKS_PER_SEC) * 1000 << endl;

    for (int y = 0; y < outputSignalHeight; y++)
    {
        for (int x = 0; x < outputSignalWidth; x++)
        {
            std::cout << outputSignal[x][y] << " ";
        }
        std::cout << std::endl;
    }

    return 0;
}

Questions: I have the following doubts:
1) When I use device type CL_DEVICE_TYPE_GPU, I get an execution time of 267 ms. When I use CL_DEVICE_TYPE_CPU, the execution time changes to 467 ms. I want to know the difference between running the convolution code on a CPU without a GPU and on a CPU in a system that also has a GPU (by selecting device type CL_DEVICE_TYPE_CPU).
2) In the convolution.cl file there is a nested for loop that executes 3 times. Can I call another kernel to do this operation from the available kernel file?

I am asking because I am new to OpenCL coding and want to understand this.

Asked Mar 18 '23 by Ashwin


1 Answer

  1. Both the CPU and the GPU are OpenCL devices. By choosing CL_DEVICE_TYPE_CPU, you tell the OpenCL runtime to compile the kernel code to CPU machine code and run it on the CPU. By choosing CL_DEVICE_TYPE_GPU, the kernel code is compiled to GPU machine code and executed on your video card. The ability to change the device type without rewriting the source code is one of the main OpenCL features. It doesn't matter whether your CPU has an integrated GPU and/or a discrete GPU is installed; you just pick an available device and run the kernel on it (see the device-selection sketch after this list).

  2. In OpenCL 1.2 and older, you can't call a kernel from a kernel. Device-side enqueue (dynamic parallelism) was introduced in OpenCL 2.0 (see the sketch at the end of this answer).
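
For point 1, here is a minimal sketch of the device selection (assuming a cl_platform_id named platform has already been obtained, as in the question's code). Only the device-type flag changes; the runtime recompiles the same kernel source for whichever device is selected:

cl_device_id device;
cl_int err;

// Ask for a GPU first; fall back to the CPU if the platform has none.
// CL_DEVICE_TYPE_ALL would accept either kind of device.
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err == CL_DEVICE_NOT_FOUND)
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);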

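For point 2, a minimal sketch of device-side enqueue in OpenCL C 2.0 (the kernel name parent and its arguments are illustrative, not from the question; the host must build the program with "-cl-std=CL2.0" and create a default on-device queue with CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT):

__kernel void parent(__global uint * data, int n)
{
    if (get_global_id(0) == 0) // enqueue the child grid exactly once
    {
        enqueue_kernel(get_default_queue(),
                       CLK_ENQUEUE_FLAGS_NO_WAIT,
                       ndrange_1D(n),
                       ^{
                           // Child work: runs as its own n-item NDRange.
                           size_t i = get_global_id(0);
                           data[i] *= 2u;
                       });
    }
}

On OpenCL 1.2 and older, the practical alternatives are to factor the loop body into an ordinary function called from the kernel, or to enqueue a second kernel from the host.
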
Answered May 13 '23 by Roman Arzumanyan