On my laptop I have two graphics cards: an Intel Iris and an Nvidia GeForce GT 750M. I am trying to do a simple vector add using OpenCL. I know that the Nvidia card is much faster and can do the job better. In principle, I could put an if statement in the code that looks for NVIDIA in the VENDOR attribute, but I'd like something more elegant. What is the best way to choose the better (faster) GPU programmatically in OpenCL C/C++?
As Nvidia describes it, OpenCL™ (Open Computing Language) is a low-level API for heterogeneous computing that runs on CUDA-powered GPUs. Using the OpenCL API, developers can launch compute kernels written using a limited subset of the C programming language on a GPU.
If you have an Nvidia card, then use CUDA. It's considered faster than OpenCL much of the time. Note too that Nvidia cards do support OpenCL. The general consensus is that they're not as good at it as AMD cards are, but they're coming closer all the time.
A study that directly compared CUDA programs with OpenCL on NVIDIA GPUs showed that CUDA was 30% faster than OpenCL.
I developed a real-time ray tracer (not just a ray caster) which programmatically chose two GPUs and a CPU and rendered and balanced the load on all three in real time. Here is how I did it.
Let's say there are three devices: d1, d2, and d3. Assign each device a weight: w1, w2, and w3. Call the number of pixels to be rendered n. Assume a free parameter called alpha.

1. Initialize the weights equally (w1 = w2 = w3 = 1/3) and set alpha = 0.5.
2. Render the first n1 = w1*n pixels on d1, the next n2 = w2*n pixels on d2, and the last n3 = w3*n pixels on d3, and record the render times for each device: t1, t2, and t3.
3. Calculate vsum = n1/t1 + n2/t2 + n3/t3.
4. Recalculate the weights: w_i = alpha*w_i + (1-alpha)*n_i/t_i/vsum.
5. Repeat from step 2 for the next frame.

The point of the value alpha is to allow a smooth transition: instead of reassigning all the weight based on the latest times, it mixes in some of the old weight. Without using alpha I got instabilities. The value alpha can be tuned. In practice it can probably be set around 1% but not 0%.
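Here is a minimal C++ sketch of steps 3-4 (the n, t, and alpha values are whatever your renderer measured; this is an illustration under those assumptions, not the original code):

#include <vector>

// One rebalancing step: given the pixel counts n[i] each device just
// rendered and the times t[i] they took, blend the old weights with
// the newly measured throughput. alpha controls how much old weight
// is kept (alpha = 0 would reassign everything from the latest frame).
void update_weights(std::vector<double>& w,
                    const std::vector<double>& n,
                    const std::vector<double>& t,
                    double alpha)
{
    double vsum = 0.0; // sum of velocities n_i/t_i
    for (size_t i = 0; i < w.size(); i++)
        vsum += n[i] / t[i];
    for (size_t i = 0; i < w.size(); i++)
        w[i] = alpha * w[i] + (1.0 - alpha) * (n[i] / t[i]) / vsum;
}

Feeding it the numbers from the example below (n = {300, 300, 300}, t = {10, 10, 100}, alpha = 0.5) reproduces the w1 = w2 ≈ 0.4 and w3 ≈ 0.2 computed there.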
Let's choose an example.
I had a GTX 590 which was a dual GPU card with two under-clocked GTX580s. I also had a Sandy Bridge 2600K processor. The GPUs were much faster than the CPU. Let's assume they were about 10 times faster. Let's also say there were 900 pixels.
Render the first 300 pixels with GPU1, the next 300 pixels with GPU2, and the last 300 pixels with the CPU, and record times of 10 s, 10 s, and 100 s respectively. So one GPU alone would take 30 s for the whole image, the CPU alone would take 300 s, and both GPUs together would take 15 s.
Calculate vsum = 300/10 + 300/10 + 300/100 = 30 + 30 + 3 = 63, then recalculate the weights: w1 = w2 = 0.5*(1/3) + 0.5*(300/10)/63 ≈ 0.4 and w3 = 0.5*(1/3) + 0.5*(300/100)/63 ≈ 0.2.
Render the next frame: 360 pixels with GPU1, 360 pixels with GPU2, and 180 pixels with the CPU. The times become a bit more balanced, say 11 s, 11 s, and 55 s.
After a number of frames the (1-alpha) term dominates until eventually the weights are all based on that term. In this case the weights become about 47% (427 pixels), 47%, and 6% (46 pixels) respectively, and the times become, say, 14 s, 14 s, and 14 s. In this case the CPU improves the result over using the GPUs alone by only one second.
I assumed a uniform load in this calculation. In a real ray tracer the load varies per scan-line and pixel, but the algorithm for determining the weights stays the same.
In practice, once the weights are found they don't change much unless the load of the scene changes significantly, e.g. if one region of the scene has high refraction and reflection and the rest is diffuse. But even in that case I limit the tree depth, so this does not have a dramatic effect.
It's easy to extend this method to multiple devices with a loop. I tested my ray tracer on four devices once: two 12-core Xeon CPUs and two GPUs. In that case the CPUs had a lot more influence, but the GPUs still dominated.
In case anyone is wondering: I created a context for each device and used each context in a separate thread (using pthreads). For three devices I used three threads.
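A bare-bones sketch of that arrangement (the DeviceJob struct and the render_slice step are hypothetical names for illustration; real code needs error checks):

#include <CL/cl.h>
#include <pthread.h>

// Hypothetical per-device bundle: each worker thread owns one context
// and queue and renders its own slice of the frame.
struct DeviceJob {
    cl_context ctx;
    cl_command_queue queue;
    size_t first_pixel, num_pixels;
};

static void* device_thread(void* arg)
{
    DeviceJob* job = static_cast<DeviceJob*>(arg);
    // A render_slice(job) call would go here: enqueue the kernel for
    // job->num_pixels pixels starting at job->first_pixel, then block
    // until the queue finishes.
    (void)job;
    return NULL;
}

// One thread per device; the frame is done when all threads have joined.
void render_frame(DeviceJob* jobs, int num_devices)
{
    pthread_t tid[8]; // assumes at most 8 devices
    for (int i = 0; i < num_devices; i++)
        pthread_create(&tid[i], NULL, device_thread, &jobs[i]);
    for (int i = 0; i < num_devices; i++)
        pthread_join(tid[i], NULL);
}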
In fact, you can use this to run the same device with drivers from different vendors. For example, I used both the AMD and Intel CPU drivers simultaneously (each generating about half the frame) on my 2600K to see which vendor was better. When I first did this (2012), if I recall correctly, AMD beat Intel, ironically, on an Intel CPU.
In case anyone is interested in how I came up with the formula for the weights, I used an idea from physics (my background is physics, not programming).

Speed (v) = distance/time. In this case the distance (d) is the number of pixels to process. The total distance is then

d = v1*t1 + v2*t2 + v3*t3

and we want each device to finish in the same time t, so

d = (v1 + v2 + v3)*t

Then, to get the weight, define

v_i*t = w_i*d

which gives

w_i = v_i*t/d

and replacing t/d from d = (v1 + v2 + v3)*t gives:

w_i = v_i/(v1 + v2 + v3)

It's easy to see this generalizes to any number of devices k:

w_i = v_i/(v1 + v2 + ... + v_k)

So vsum in my algorithm stands for the "sum of the velocities". Lastly, since v_i is pixels over time, it is n_i/t_i, which finally gives

w_i = (n_i/t_i)/(n1/t1 + n2/t2 + ... + n_k/t_k)

which is the second term in my formula to calculate the weights.
Good: Just pick the first compatible device. On most systems, there is only one.
Better: You can very roughly estimate device performance by multiplying the CL_DEVICE_MAX_COMPUTE_UNITS device info result by the CL_DEVICE_MAX_CLOCK_FREQUENCY device info result (a rough sketch appears at the end of this answer). Depending on your workload, you might want to include other metrics such as memory size. You could blend these based on what your workload is.
Best: Benchmark with your exact workload on each device. It's really the only way to know for sure, since anything else is just a guess.
Finally, the user might care about which of their GPUs you are using, so you should have some way to override your programmatic choice regardless of which method you choose.
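As a sketch of the "Better" estimate above (assuming you already have a cl_device_id in hand; real code should check the error returns):

#include <CL/cl.h>

// Very rough performance score: compute units times clock frequency (MHz).
// This ignores per-unit width, memory bandwidth, and architecture, so
// treat it as a tie-breaker, not a benchmark.
cl_ulong rough_device_score(cl_device_id dev)
{
    cl_uint units = 0, mhz = 0;
    clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS,
                    sizeof(units), &units, NULL);
    clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY,
                    sizeof(mhz), &mhz, NULL);
    return (cl_ulong)units * mhz;
}

You would then loop over all devices from clGetDeviceIDs and keep the one with the highest score.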
If it is simply a vector add and your data resides on the host side, then the CPU will win; or, even better, the integrated GPU will be much faster. Overall performance depends on the algorithms, the OpenCL buffer types (use_host_ptr, read_write, etc.), and the compute-to-data ratio. Even if you don't copy but pin the array and access it directly, the CPU's latency would be smaller than the PCI-E latency.
If you are going to use OpenGL + OpenCL interop, then you will need to know whether your compute device is the same device as your rendering output device (if your screen gets its data from the iGPU then it is the Iris; if not, it is the Nvidia).
If you just need to do some operations on C++ arrays (host side) and get results the fastest way, then I suggest "load balancing".
Here is an example of a vector add of 4k elements on a Core i7-5775C with Iris Pro and two GT 750Ms (one overclocked by 10%).

First, give an equal number of NDRange ranges to all devices. At the end of each calculation phase, check the timings.
CPU       iGPU      dGPU-1    dGPU-2 (oc)
Intel     Intel     Nvidia    Nvidia
1024      1024      1024      1024
34 ms     5 ms      10 ms     9 ms
Then calculate weighted (based on the last NDRange range) but relaxed (not exact, just close) approximations of each device's calculation bandwidth, and change the NDRange ranges accordingly:
Intel     Intel     Nvidia    Nvidia
512       1536      1024      1024
16 ms     8 ms      10 ms     9 ms
Then continue calculating until it really becomes stable:
Intel     Intel     Nvidia    Nvidia
256       1792      1024      1024
9 ms      10 ms     10 ms     9 ms
Or until you can enable finer grains:
Intel     Intel     Nvidia    Nvidia
320       1728      1024      1024
10 ms     10 ms     10 ms     9 ms

Intel     Intel     Nvidia    Nvidia
320       1728      960       1088
10 ms     10 ms     10 ms     10 ms

(The two Nvidia columns are capped by PCI-E bandwidth, no more than 16 GB/s per device; the CPU and iGPU are closer to RAM, so they get better bandwidth, 20-40 GB/s, and less kernel overhead.)
Instead of using just the latest iteration for balancing, you can take the average (or a PID) of the last 10 results to eliminate spikes that mislead the balancing. Also, buffer copies can take more time than the computing itself; if you include this in the balancing, you can shut down unnecessary or non-benefiting devices.
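A tiny sketch of that smoothing (a plain moving average over the last 10 timings per device; a PID controller would add proportional, integral, and derivative terms instead):

#include <array>

// Keeps the last N timings for one device and returns their average,
// so a single spiky frame doesn't swing the load balance.
struct TimingSmoother {
    static const int N = 10;
    std::array<double, N> samples;
    int count = 0, next = 0;

    double add(double t_ms) {
        samples[next] = t_ms;          // overwrite the oldest sample
        next = (next + 1) % N;
        if (count < N) count++;
        double sum = 0.0;
        for (int i = 0; i < count; i++) sum += samples[i];
        return sum / count;            // average of what we have so far
    }
};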
If you make a library out of this, then you won't have to write a benchmark for every new project of yours. The work will be auto-balanced between devices when you accelerate matrix multiplications, fluid movements, SQL table joins, and financial approximations.
For the solution of the balancing itself: if you can solve a linear system with n unknowns (the loads per device) and n equations (the benchmark results of all devices), you can find the target loads in a single step. If you choose the iterative approach, you need more steps until it converges. The latter is not harder than writing a benchmark; the former is harder for me, but it should be more efficient over time.
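A sketch of the single-step version, assuming each device's last element count and timing are known (the split is proportional to measured throughput; names are illustrative):

#include <vector>

// Single-step balancing: give each device a share of the n_total
// elements proportional to its measured throughput elems[i]/ms[i].
std::vector<size_t> solve_loads(const std::vector<double>& elems,
                                const std::vector<double>& ms,
                                size_t n_total)
{
    double vsum = 0.0;
    for (size_t i = 0; i < elems.size(); i++)
        vsum += elems[i] / ms[i];
    std::vector<size_t> loads(elems.size());
    for (size_t i = 0; i < elems.size(); i++)
        loads[i] = (size_t)(n_total * (elems[i] / ms[i]) / vsum);
    return loads;
}

Fed with the first table above (1024 elements each in 34, 5, 10, and 9 ms), this jumps straight to roughly 273, 1860, 930, and 1033 elements. The measured times then shift, since the overheads are not linear in the load, which is why the iterative version settles on somewhat different numbers.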
Although a vector-add-only kernel is not a real-world scenario, here is a real benchmark from my system:
__kernel void bench(__global float *a, __global float *b, __global float *c)
{
    int i = get_global_id(0);
    c[i] = a[i] + b[i];
}
AMD FX(tm)-8150 Eight-Core    Oland    Pitcairn
2560                          768      768

This is after several iterations (the FX is faster even with the extra buffer copies, not using any host pointer). Even the Oland GPU is catching Pitcairn because their PCI-E bandwidth is the same.
Now with some trigonometric functions:
__kernel void bench(__global float *a, __global float *b, __global float *c)
{
    int i = get_global_id(0);
    c[i] = sin(a[i]) + cos(b[i]) + sin(cos((float)i));
}
AMD FX(tm)-8150    Oland    Pitcairn
1792               1024     1280
Testing GDDR3 128-bit vs GDDR5 256-bit (overclocked) and caching:
__kernel void bench(__global float *a, __global float *b, __global float *c)
{
    int i = get_global_id(0);
    c[i] = a[i] + b[i] - a[i] - b[i] + a[i] + b[i] - b[i] - a[i] + b[i];
    for (int j = 0; j < 12000; j++)
        c[i] += a[i] + b[i] - a[i] - b[i] + a[i] + b[i] - b[i] - a[i] + b[i];
}
AMD FX(tm)-8150    Oland    Pitcairn
256                256      3584
High compute-to-data ratio:
__kernel void bench(__global float *a, __global float *b, __global float *c)
{
    int i = get_global_id(0);
    c[i] = 0.0f; float c0 = c[i]; float a0 = a[i]; float b0 = b[i];
    for (int j = 0; j < 12000; j++)
        c0 += sin(a0) + cos(b0 * a0) + cos(sin(b0) * 19.95f);
    c[i] = c0;
}
AMD FX(tm)-8150    Oland    Pitcairn
256                2048     1792
Now the Oland GPU is worthy again and won, even with just 320 cores, because the 4k elements easily wrapped around all 320 cores more than 10 times, while the Pitcairn GPU (1280 cores) was not filled with enough folded arrays (wavefronts); this led to lower occupation of its execution units, so it could not hide latencies. Low-end devices are better for low loads, I think. Maybe I could use this when DirectX 12 comes out with some load balancer, so this Oland could compute the physics of 5000-10000 particles from in-game explosions while Pitcairn computes smoke densities.
Take a look at this code for telling the GPUs apart (it benchmarks every device and picks the fastest):
#include <iostream>
#include <cstdlib> // for malloc/free
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#pragma comment (lib, "x86_64/opencl.lib")
#endif
//OpenCL saxpy kernel used for benchmarking
const char* saxpy_kernel =
"__kernel \n"
"void saxpy_kernel(float alpha, \n"
" __global float *A, \n"
" __global float *B, \n"
" __global float *C) \n"
"{ \n"
" int idx = get_global_id(0); \n"
" C[idx] = alpha * A[idx] + B[idx]; \n"
"} ";
const char* clErrName[] = {
"CL_SUCCESS", //0
"CL_DEVICE_NOT_FOUND", //-1
"CL_DEVICE_NOT_AVAILABLE", //-2
"CL_COMPILER_NOT_AVAILABLE", //-3
"CL_MEM_OBJECT_ALLOCATION_FAILURE", //-4
"CL_OUT_OF_RESOURCES", //-5
"CL_OUT_OF_HOST_MEMORY", //-6
"CL_PROFILING_INFO_NOT_AVAILABLE", //-7
"CL_MEM_COPY_OVERLAP", //-8
"CL_IMAGE_FORMAT_MISMATCH", //-9
"CL_IMAGE_FORMAT_NOT_SUPPORTED", //-10
"CL_BUILD_PROGRAM_FAILURE", //-11
"CL_MAP_FAILURE", //-12
"CL_MISALIGNED_SUB_BUFFER_OFFSET", //-13
"CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST", //-14
"CL_COMPILE_PROGRAM_FAILURE", //-15
"CL_LINKER_NOT_AVAILABLE", //-16
"CL_LINK_PROGRAM_FAILURE", //-17
"CL_DEVICE_PARTITION_FAILED", //-18
"CL_KERNEL_ARG_INFO_NOT_AVAILABLE", //-19
"CL_UNDEFINED_ERROR_20", //-20
"CL_UNDEFINED_ERROR_21", //-21
"CL_UNDEFINED_ERROR_22", //-22
"CL_UNDEFINED_ERROR_23", //-23
"CL_UNDEFINED_ERROR_24", //-24
"CL_UNDEFINED_ERROR_25", //-25
"CL_UNDEFINED_ERROR_26", //-26
"CL_UNDEFINED_ERROR_27", //-27
"CL_UNDEFINED_ERROR_28", //-28
"CL_UNDEFINED_ERROR_29", //-29
"CL_INVALID_VALUE", //-30
"CL_INVALID_DEVICE_TYPE", //-31
"CL_INVALID_PLATFORM", //-32
"CL_INVALID_DEVICE", //-33
"CL_INVALID_CONTEXT", //-34
"CL_INVALID_QUEUE_PROPERTIES", //-35
"CL_INVALID_COMMAND_QUEUE", //-36
"CL_INVALID_HOST_PTR", //-37
"CL_INVALID_MEM_OBJECT", //-38
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", //-39
"CL_INVALID_IMAGE_SIZE", //-40
"CL_INVALID_SAMPLER", //-41
"CL_INVALID_BINARY", //-42
"CL_INVALID_BUILD_OPTIONS", //-43
"CL_INVALID_PROGRAM", //-44
"CL_INVALID_PROGRAM_EXECUTABLE", //-45
"CL_INVALID_KERNEL_NAME", //-46
"CL_INVALID_KERNEL_DEFINITION", //-47
"CL_INVALID_KERNEL", //-48
"CL_INVALID_ARG_INDEX", //-49
"CL_INVALID_ARG_VALUE", //-50
"CL_INVALID_ARG_SIZE", //-51
"CL_INVALID_KERNEL_ARGS", //-52
"CL_INVALID_WORK_DIMENSION", //-53
"CL_INVALID_WORK_GROUP_SIZE", //-54
"CL_INVALID_WORK_ITEM_SIZE", //-55
"CL_INVALID_GLOBAL_OFFSET", //-56
"CL_INVALID_EVENT_WAIT_LIST", //-57
"CL_INVALID_EVENT", //-58
"CL_INVALID_OPERATION", //-59
"CL_INVALID_GL_OBJECT", //-60
"CL_INVALID_BUFFER_SIZE", //-61
"CL_INVALID_MIP_LEVEL", //-62
"CL_INVALID_GLOBAL_WORK_SIZE", //-63
"CL_INVALID_PROPERTY", //-64
"CL_INVALID_IMAGE_DESCRIPTOR", //-65
"CL_INVALID_COMPILER_OPTIONS", //-66
"CL_INVALID_LINKER_OPTIONS", //-67
"CL_INVALID_DEVICE_PARTITION_COUNT", //-68
"CL_INVALID_PIPE_SIZE", //-69
"CL_INVALID_DEVICE_QUEUE", //-70
};
const int MAX_ERR_CODE = 70;
inline bool __clCallSuccess(cl_int err_code, const char* source_file, const int source_line)
{
if (err_code == CL_SUCCESS)
return true;
if ((err_code > 0) || (err_code < -MAX_ERR_CODE))
std::clog << "\t - unknown CL error: " << err_code;
else
std::clog << "\t - CL call error: " << clErrName[-err_code];
std::clog << " [" << source_file << " : " << source_line << "]" << std::endl;
return false;
}
#define clCallSuccess(err_code) __clCallSuccess(err_code, __FILE__, __LINE__)
float cl_BenchmarkDevice(cl_context context, cl_command_queue command_queue, cl_device_id device_id)
{
float microSeconds = -1.;
int i;
cl_int clStatus;
const int VECTOR_SIZE = 512 * 1024;
// Allocate space for vectors A, B and C
float* A = (float*)malloc(sizeof(float) * VECTOR_SIZE); if(A) {
float* B = (float*)malloc(sizeof(float) * VECTOR_SIZE); if(B) {
float* C = (float*)malloc(sizeof(float) * VECTOR_SIZE); if(C) {
for (i = 0; i < VECTOR_SIZE; i++)
{
A[i] = (float)i;
B[i] = (float)(VECTOR_SIZE - i);
C[i] = 0;
}
// Create memory buffers on the device for each vector
cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); if (clCallSuccess(clStatus)) {
cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); if (clCallSuccess(clStatus)) {
cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); if (clCallSuccess(clStatus)) {
// Copy the Buffer A and B to the device
clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL); if (clCallSuccess(clStatus)) {
clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL); if (clCallSuccess(clStatus)) {
// Create a program from the kernel source and build it
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&saxpy_kernel, NULL, &clStatus); if (clCallSuccess(clStatus) && program) {
clStatus = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (clCallSuccess(clStatus)) {
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", &clStatus); if (clCallSuccess(clStatus) && kernel) {
float alpha = 2.5;
// Set the arguments of the kernel
clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void*)&alpha); if (clCallSuccess(clStatus)) {
clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&A_clmem); if (clCallSuccess(clStatus)) {
clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&B_clmem); if (clCallSuccess(clStatus)) {
clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&C_clmem); if (clCallSuccess(clStatus)) {
// Execute the OpenCL kernel on the list
cl_event event;
size_t global_size = VECTOR_SIZE; // Process the entire lists
size_t local_size = 512; // work-group size (only used by the commented-out variant below)
//clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event);
clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &event); if (clCallSuccess(clStatus)) {
clStatus = clWaitForEvents(1, &event); if (clCallSuccess(clStatus)) {
//measure duration
cl_ulong time_start;
cl_ulong time_end;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
microSeconds = (float)(time_end - time_start) / 1000.0f;
std::clog << "\nOpenCl benchmarking time: " << microSeconds << " microseconds \n";
std::clog << "\n\t*****************************\n\n";
}
// Read the cl memory C_clmem on device to the host variable C
clCallSuccess(clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL));
// Clean up and wait for all the commands to complete.
clCallSuccess(clFlush(command_queue));
clCallSuccess(clFinish(command_queue));
} //Kernel
}}}} //SetKernelArg
// Finally release all OpenCL allocated objects and host buffers.
clCallSuccess(clReleaseKernel(kernel)); }
} //BuildProgram
clCallSuccess(clReleaseProgram(program)); }
} } //EnqueueWriteBuffer
clCallSuccess(clReleaseMemObject(C_clmem)); }
clCallSuccess(clReleaseMemObject(B_clmem)); }
clCallSuccess(clReleaseMemObject(A_clmem)); }
free(C); }
free(B); }
free(A); }
return microSeconds;
}
/*
struct _dev_info {
cl_platform_id platfID;
cl_device_id devID;
};
typedef struct _dev_info dev_info;
*/
cl_device_id cl_GetBestDevice(void)
{
cl_int err;
cl_uint numPlatforms, numDevices;
cl_platform_id platfIDs[10];
cl_device_id devIDsAll[10];
int countGPUs = 0;
cl_device_id best_device = NULL;
float best_perf = 100000000.;
if (clCallSuccess(clGetPlatformIDs(10, platfIDs, &numPlatforms)))
{
std::clog << "OpenCL platforms detected: " << numPlatforms << std::endl;
for (unsigned int i = 0; i < numPlatforms; i++)
{
std::clog << "PlatformInfo for platform no." << (i + 1) << std::endl;
const int SZ_INFO = 1024;
char info[SZ_INFO];
size_t sz;
if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_NAME, SZ_INFO, info, &sz)))
std::clog << " - - Name: " << info << std::endl;
if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_VENDOR, SZ_INFO, info, &sz)))
std::clog << " - - Vendor: " << info << std::endl;
if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_PROFILE, SZ_INFO, info, &sz)))
std::clog << " - - Profile: " << info << std::endl;
if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_VERSION, SZ_INFO, info, &sz)))
std::clog << " - - Version: " << info << std::endl;
if (clCallSuccess(clGetPlatformInfo(platfIDs[i], CL_PLATFORM_EXTENSIONS, SZ_INFO, info, &sz)))
std::clog << " - - Extensions: " << info << std::endl;
if (clCallSuccess(clGetDeviceIDs(platfIDs[i], CL_DEVICE_TYPE_ALL, 10, devIDsAll, &numDevices)))
{
cl_context_properties cProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platfIDs[i]), 0 };
cl_command_queue_properties qProperties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
for (unsigned int ii = 0; ii < numDevices; ii++)
{
cl_uint val;
cl_ulong memsz;
cl_device_type dt;
size_t mws;
std::clog << " >> DeviceInfo for device no." << (ii + 1) << std::endl;
if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_NAME, SZ_INFO, info, &sz)))
std::clog << "\t - Name: " << info << std::endl;
if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_VENDOR, SZ_INFO, info, &sz)))
std::clog << "\t - Vendor: " << info << std::endl;
if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_VERSION, SZ_INFO, info, &sz)))
std::clog << "\t - Version: " << info << std::endl;
if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_TYPE, sizeof(dt), &dt, &sz)))
{
std::clog << "\t - Type: ";
switch (dt)
{
case CL_DEVICE_TYPE_CPU: std::clog << "CPU"; break;
case CL_DEVICE_TYPE_GPU: std::clog << "GPU"; break;
case CL_DEVICE_TYPE_ACCELERATOR: std::clog << "Accelerator"; break;
case CL_DEVICE_TYPE_DEFAULT: std::clog << "Default"; break;
default: std::clog << "ERROR";
}
std::clog << std::endl;
}
if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(memsz), &memsz, &sz)))
std::clog << "\t - Memory: " << (memsz / 1024 / 1024) << " MB" << std::endl;
if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(val), &val, &sz)))
std::clog << "\t - Max Frequency: " << val << " MHz" << std::endl;
if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(val), &val, &sz)))
std::clog << "\t - Compute units: " << val << std::endl;
if (clCallSuccess(clGetDeviceInfo(devIDsAll[ii], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(mws), &mws, &sz)))
std::clog << "\t - Max workgroup size: " << mws << std::endl;
// Create an OpenCL context
cl_context context = clCreateContext(NULL, 1, devIDsAll+ii, NULL, NULL, &err);
if (clCallSuccess(err) && context)
{
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, devIDsAll[ii], qProperties, &err);
if (clCallSuccess(err) && command_queue)
{
float perf = cl_BenchmarkDevice(context, command_queue, devIDsAll[ii]);
if ((perf > 0) && (perf < best_perf))
{
best_perf = perf;
best_device = devIDsAll[ii];
}
clCallSuccess(clReleaseCommandQueue(command_queue));
}
clCallSuccess(clReleaseContext(context));
}
}
}
}
}
return best_device;
}
Here is the output on my PC.