pyopencl - pycuda performance difference

Tags:

Comparing multiple matrix multiplication calculations with pyopencl and pycuda show differences in performance.

System:

Ubuntu 14.04 with GeForce 920m

Pyopencl code:

#-*- coding: utf-8 -*-
import pyopencl as cl
import pyopencl.array 
from jinja2 import Template
import time
import numpy as np
from scipy.sparse import csr_matrix

KERNEL = Template("""
    {{header}}

    #include <pyopencl-complex.h>

    __kernel
    void complex_mat_mul(__global const {{complex_type}} *a, __global const {{complex_type}} *b, __global {{complex_type}} *res)
    {
        int row = get_local_id(1);
        int col = get_local_id(0);

        int mat_id = get_group_id(0) * get_num_groups(0) + get_group_id(1);

        //printf("mat_id: %d, row: %d, col: %d ----- ", mat_id, row, col);

        {{complex_type}} entry = 0;
        for (int e = 0; e < {{mat_dim}}; ++e) {
            entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] *  b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
        }
        res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;
    }
""")

def get_ctx_queue(devices=[0]):
    """
    optain context and queue for spcified devices
    """
    platform         = cl.get_platforms()[0]
    platform_devices = platform.get_devices()

    ctx = cl.Context(devices=[platform_devices[x] for x in devices])
    return (ctx, cl.CommandQueue(ctx))

data_types = {
    'cfloat_t': np.complex64,
    'cdouble_t': np.complex128,
    'float': np.float32,
    'double': np.float64
}

def render_kernel(complex_type, real_type, mat_dim):
    header = ""
    if data_types[complex_type] == np.complex128:
        header = """
            #pragma OPENCL EXTENSION cl_khr_fp64 : enable
            #define PYOPENCL_DEFINE_CDOUBLE
        """
    templ = KERNEL.render(
        header=header,
        complex_type=complex_type,
        real_type=real_type,
        mat_dim=mat_dim,
    )
    print(templ)

    return templ

complex_type = 'cdouble_t'
real_type    = 'float'
mat_dim      = 25
mats_count   = 200 # x*x

ctx, queue = get_ctx_queue()

program= cl.Program(ctx, render_kernel(complex_type, real_type, mat_dim)).build()

mats_1 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
mats_2 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])

start = time.time()
numpy_result = np.array([np.dot(mats_1[i], mats_2[i]) for i in range(mats_count**2)])
print("numpy time: %.3f" % (time.time()-start))

a = cl.array.to_device(queue, mats_1)
b = cl.array.to_device(queue, mats_2)
c = cl.array.to_device(queue, np.zeros((mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type]))

start = time.time()
program.complex_mat_mul(queue, (mats_count*mat_dim, mats_count*mat_dim, 1), (mat_dim, mat_dim, 1), a.data,b.data,c.data)
queue.finish()
queue.flush()
result = c.get()
print("opencl time: %.3f" % (time.time()-start))

assert np.allclose(numpy_result.flatten(), result.flatten(), atol=0), "FAIL opencl"
print("Success")

Pycuda code:

#-*- coding: utf-8 -*-
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
from jinja2 import Template
import time
import numpy as np
from scipy.sparse import csr_matrix

KERNEL = Template("""
    #include <stdio.h>
    #include <pycuda-complex.hpp>

    typedef pycuda::complex<float> scmplx;
    typedef pycuda::complex<double> dcmplx;

    __global__ void complex_mat_mul(const {{complex_type}} *a, const {{complex_type}} *b, {{complex_type}} *res)
    {
        int row = threadIdx.y;
        int col = threadIdx.x;

        int mat_id = blockIdx.x * gridDim.x + blockIdx.y;

        //printf("mat_id: %d, row: %d, col: %d ----- ", mat_id, row, col);

        {{complex_type}} entry = 0;
        for (int e = 0; e < {{mat_dim}}; ++e) {
            entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] *  b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
        }
        res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;
    }
""")

data_types = {
    'scmplx': np.complex64,
    'dcmplx': np.complex128,
    'float': np.float32,
    'double': np.float64
}

def render_kernel(complex_type, real_type, mat_dim, block, gird):
    templ = KERNEL.render(
        complex_type=complex_type,
        real_type=real_type,
        mat_dim=mat_dim,
        blockDim_x=block[0],
        blockDim_y=block[1]
    )
    print(templ)

    return templ

complex_type = 'dcmplx'
real_type    = 'double'
mat_dim      = 25
mats_count   = 200 # x*x

block = (mat_dim,mat_dim,1)
grid  = (mats_count,mats_count)

program = SourceModule(render_kernel(complex_type, real_type, mat_dim, block, grid))

complex_mat_mul = program.get_function("complex_mat_mul")

mats_1 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
mats_2 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
result = np.zeros((mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])

start = time.time()
numpy_result = np.array([np.dot(mats_1[i], mats_2[i]) for i in range(mats_count**2)])
print("numpy time: %.3f" % (time.time()-start))

a = drv.In(mats_1)
b = drv.In(mats_2)
c = drv.Out(result)

start = time.time()
complex_mat_mul(a, b, c,
    block=block, 
    grid=grid
)
print("cuda time: %.3f" % (time.time()-start))

assert np.array_equal(numpy_result.flatten(), result.flatten()), "FAIL"
print("Success")

In single and double precision pyopencl performs at least 2 times faster. Changing the number of matrices doesn't change the result.

Both kernels get called the same number of times and I suppose that the spots of the benchmark are appropriate.

What am I missing?

590

asked Mar 20 '16 11:03

Jesse

1 Answers

Running more tests gave the following results.

Using the Kernel (opencl is equivalent except for the determination of row, col and mat_id):

int row = threadIdx.y;
int col = threadIdx.x;

int mat_id = blockIdx.x * gridDim.x + blockIdx.y;

for (int i=0; i < 10; i++) {

{{complex_type}} entry = 0;
for (int e = 0; e < {{mat_dim}}; ++e) {
    entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] *  b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
}
res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;

}

Added the extra loop to execute more floating point operations
Measuring the performance includes writing and reading the buffers on the GPU (compared to the actual execution time, those transferoperations are very fast)

For different matrix dimensions (constant number of matrices: 22500): enter image description here

For a different number of matrices (constant matrix dimension: 25): enter image description here

As in the comments discussed I shortly had better result with pycuda, when I included the initial buffer operations. But by adding the extra loop to increase floating point operations, the pycuda head start vanished. Thus leaving this question open, since we would expect pycuda to perform better.

answered Nov 15 '22 07:11

Jesse

Related questions
                            
                                Troubleshooting creeping CPU utilization on Azure websites
                            
                                Logging and Monitoring Spring MVC Web application
                            
                                Compare ways of causing digest
                            
                                Sorting a stack using another stack
                            
                                Analyzing MongoDb aggregation performance
                            
                                How can I make my Python script faster?
                            
                                Efficient swapping of elements in numpy array
                            
                                Interpretation of perf stat output
                            
                                Performance issues with saving object reference to array
                            
                                Why are math libraries often compared by FLOPS?
                            
                                What is the meaning of "% Time in GC" performance counter
                            
                                Haskell / GHC: Unsafely deconstruct a constructor?
                            
                                How does Scala compiler handle unused variable values?
                            
                                Android Application onLowMemory() on API >= 14
                            
                                mix-blend-mode scroll lag issues
                            
                                MATLAB performance benefit from clearing temporary variables?
                            
                                How to process two long lines with constant memory?
                            
                                How to find the server processing time by IIS logs?
                            
                                Why will for-loop with multithreading not have as great performance as with single-thread?
                            
                                WPF 2D in 3D view Animations : Performance issue

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

pyopencl - pycuda performance difference

Tags:

performance

pycuda

pyopencl

Jesse

People also ask

1 Answers

Jesse

Recent Activity

Donate For Us