 

OpenGL Shader vs CUDA

I was using this FXAA shader for anti-aliasing in my OpenGL program. I then reimplemented the code in CUDA and tested it. The resulting images are identical, but the CUDA version is much slower: the shader runs at 60 FPS with vsync, while CUDA drops to ~40 FPS.

Here is the CUDA code:

// Clamped surface read of a single uchar4 texel.
__device__ uchar4 readChar(int x, int y){
    return surf2Dread<uchar4>(surfaceRead, x*sizeof(uchar4), y, cudaBoundaryModeClamp);
}

// Manual bilinear interpolation from four clamped texel reads.
// The uchar4/float arithmetic relies on custom operator overloads
// (see the helper sketch after the kernel).
__device__ uchar4 readFloatBilin2(float x, float y){
    int x1 = floor(x);
    int y1 = floor(y);

    uchar4 z11 = readChar(x1,   y1);
    uchar4 z12 = readChar(x1,   y1+1);
    uchar4 z21 = readChar(x1+1, y1);
    uchar4 z22 = readChar(x1+1, y1+1);

    float u_ratio    = x - x1;
    float v_ratio    = y - y1;
    float u_opposite = 1 - u_ratio;
    float v_opposite = 1 - v_ratio;

    uchar4 result = (z11 * u_opposite + z21 * u_ratio) * v_opposite +
                    (z12 * u_opposite + z22 * u_ratio) * v_ratio;

    return result;
}
// Rec. 601 luma of an 8-bit texel, normalized to [0,1].
__device__ float fluma(const uchar4 &c){
    return c.x*0.299 * (1.0/255) + c.y*0.587 * (1.0/255) + c.z*0.114 * (1.0/255);
}
__global__ void filter_fxaa_opt(TextureData data)
{

    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;

    if(x >= data.w || y >= data.h)
    {
        return;
    }

    uchar4 out_color;

    const float FXAA_SPAN_MAX = 8.0;
    const float FXAA_REDUCE_MUL = 1.0/8.0;
    const float FXAA_REDUCE_MIN = (1.0/128.0);



    // Luma of the four diagonal neighbours and the center pixel.
    float lumaNW = fluma(readChar(x-1,y-1));
    float lumaNE = fluma(readChar(x+1,y-1));
    float lumaSW = fluma(readChar(x-1,y+1));
    float lumaSE = fluma(readChar(x+1,y+1));
    float lumaM  = fluma(readChar(x,y));

    float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
    float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));

    float2 dir;
    dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
    dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE));

    float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25 * FXAA_REDUCE_MUL), FXAA_REDUCE_MIN);

    float rcpDirMin = 1.0/(min(abs(dir.x), abs(dir.y)) + dirReduce);


    dir = clamp(dir * rcpDirMin, -FXAA_SPAN_MAX, FXAA_SPAN_MAX);


    uchar4 rgbA = (
                readFloatBilin2(x+ dir.x * (1.0/3.0 - 0.5),y+ dir.y * (1.0/3.0 - 0.5))*0.5f+
                readFloatBilin2(x+ dir.x * (2.0/3.0 - 0.5),y+ dir.y * (2.0/3.0 - 0.5))*0.5f);
    uchar4 rgbB = rgbA * (1.0/2.0) +  (
                readFloatBilin2(x+ dir.x * (0.0/3.0 - 0.5),y+ dir.y * (0.0/3.0 - 0.5))*0.25f+
                readFloatBilin2(x+ dir.x * (3.0/3.0 - 0.5),y+ dir.y * (3.0/3.0 - 0.5))*0.25f);
    float lumaB = fluma(rgbB);


    if((lumaB < lumaMin) || (lumaB > lumaMax)){
        out_color=rgbA;
    } else {
        out_color=rgbB;
    }

    surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
}
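
The kernel relies on arithmetic operators for uchar4 and float2 that CUDA does not define on its own, so a helper_math.h-style header is assumed. A minimal sketch of what those helpers could look like (assumed for illustration, not the program's actual definitions):

// Assumed helpers: uchar4/float arithmetic that truncates back to
// 8 bits at every step (one reason this path loses precision), plus
// a float2 scale and clamp.
inline __device__ uchar4 operator*(const uchar4 &a, float s){
    return make_uchar4(a.x*s, a.y*s, a.z*s, a.w*s);
}
inline __device__ uchar4 operator+(const uchar4 &a, const uchar4 &b){
    return make_uchar4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
}
inline __device__ float2 operator*(const float2 &a, float s){
    return make_float2(a.x*s, a.y*s);
}
inline __device__ float2 clamp(const float2 &v, float lo, float hi){
    return make_float2(fminf(fmaxf(v.x, lo), hi),
                       fminf(fmaxf(v.y, lo), hi));
}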

Setup:

//called for the 'src' and 'dst' texture once at the beginning
checked_cuda( cudaGraphicsGLRegisterImage(&res, gl_buffer,gl_target, cudaGraphicsRegisterFlagsSurfaceLoadStore));

//called for the 'src' and 'dst' texture every frame
checked_cuda( cudaGraphicsMapResources(1, &res, 0));
checked_cuda( cudaGraphicsSubResourceGetMappedArray(&array, res, 0,0));

//kernel call every frame
dim3 block_size(8, 8);
dim3 grid_size;
grid_size.x = (src->w + block_size.x - 1) / (block_size.x); // round up so the
grid_size.y = (src->h + block_size.y - 1) / (block_size.y); // bounds check covers edge pixels
checked_cuda(cudaBindSurfaceToArray(surfaceRead, (cudaArray *)src->d_data));
checked_cuda(cudaBindSurfaceToArray(surfaceWrite, (cudaArray *)dst->d_data));
filter_fxaa_opt<<<grid_size, block_size>>>(*src);
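
Note that the snippet leaves out the corresponding unmap: a resource mapped with cudaGraphicsMapResources each frame must also be unmapped again before OpenGL uses the texture, along these lines:

//called for the 'src' and 'dst' texture every frame, after the kernel
checked_cuda( cudaGraphicsUnmapResources(1, &res, 0));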

System:

Ubuntu 14.04
OpenGL version: 4.4.0 NVIDIA 331.113
Renderer: GeForce GTX 760M/PCIe/SSE2
CUDA 5.5

Question: What does the OpenGL shader do better, and why is it so much faster?

asked Jan 04 '15 by dari


1 Answer

As njuffa pointed out, the main problem was the manual interpolation and normalization. After switching from a CUDA surface to a CUDA texture for reading, the built-in hardware interpolation can be used by calling tex2D(...) instead of surf2Dread(...).

The modified CUDA code is now almost identical to the OpenGL shader and does indeed perform equally well.
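
The kernel below assumes a texture reference bound with linear filtering, unnormalized coordinates, and normalized-float reads, plus a small toChar helper to pack the result back into 8 bits. A sketch of that setup (the names follow the kernel; the exact setup code is not shown in the answer):

// Texture reference; reads return float4 values normalized to [0,1].
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> texRef;

// Pack a normalized float4 back into an 8-bit texel (with rounding).
__device__ uchar4 toChar(const float4 &c){
    return make_uchar4(c.x*255.0f + 0.5f, c.y*255.0f + 0.5f,
                       c.z*255.0f + 0.5f, c.w*255.0f + 0.5f);
}

// Host side, in place of cudaBindSurfaceToArray for the source:
void bindSourceTexture(cudaArray *array){
    texRef.filterMode     = cudaFilterModeLinear; // hardware bilinear interpolation
    texRef.normalized     = false;                // coordinates in texels
    texRef.addressMode[0] = cudaAddressModeClamp;
    texRef.addressMode[1] = cudaAddressModeClamp;
    checked_cuda( cudaBindTextureToArray(texRef, array));
}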

__global__ void filter_fxaa2(TextureData data)
{

    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;

    if(x >= data.w || y >= data.h)
    {
        return;
    }

    uchar4 out_color;

    const float FXAA_SPAN_MAX = 8.0f;
    const float FXAA_REDUCE_MUL = 1.0f/8.0f;
    const float FXAA_REDUCE_MIN = (1.0f/128.0f);

    float u = x + 0.5f; // unnormalized coordinates,
    float v = y + 0.5f; // offset to the texel center

    float4 rgbNW = tex2D( texRef, u-1.0f,v-1.0f);
    float4 rgbNE = tex2D( texRef, u+1.0f,v-1.0f);
    float4 rgbSW = tex2D( texRef, u-1.0f,v+1.0f);
    float4 rgbSE = tex2D( texRef, u+1.0f,v+1.0f);
    float4 rgbM = tex2D( texRef, u,v);

    const float4 luma = make_float4(0.299f, 0.587f, 0.114f,0.0f);
    float lumaNW = dot(rgbNW, luma);
    float lumaNE = dot(rgbNE, luma);
    float lumaSW = dot(rgbSW, luma);
    float lumaSE = dot(rgbSE, luma);
    float lumaM = dot( rgbM, luma);

    float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
    float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));

    float2 dir;
    dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
    dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE));

    float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25f * FXAA_REDUCE_MUL), FXAA_REDUCE_MIN);

    float rcpDirMin = 1.0f/(min(abs(dir.x), abs(dir.y)) + dirReduce);


    float2 test = dir * rcpDirMin;
    dir = clamp(test,-FXAA_SPAN_MAX,FXAA_SPAN_MAX);

    float4 rgbA = (1.0f/2.0f) * (
                tex2D( texRef,u+ dir.x * (1.0f/3.0f - 0.5f),v+ dir.y * (1.0f/3.0f - 0.5f))+
                tex2D( texRef,u+ dir.x * (2.0f/3.0f - 0.5f),v+ dir.y * (2.0f/3.0f - 0.5f)));
    float4 rgbB = rgbA * (1.0f/2.0f) + (1.0f/4.0f) * (
                tex2D( texRef,u+ dir.x * (0.0f/3.0f - 0.5f),v+ dir.y * (0.0f/3.0f - 0.5f))+
                tex2D( texRef,u+ dir.x * (3.0f/3.0f - 0.5f),v+ dir.y * (3.0f/3.0f - 0.5f)));
    float lumaB = dot(rgbB, luma);


    if((lumaB < lumaMin) || (lumaB > lumaMax)){
        out_color=toChar(rgbA);
    } else {
        out_color=toChar(rgbB);
    }


    surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
}

Update:

Performance measured with cudaEvents:

  • Old Version: ~12.8ms
  • New Version: ~1.2ms
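
A minimal sketch of such a measurement with CUDA events around the kernel launch (illustrative; not the exact benchmarking code):

cudaEvent_t start, stop;
checked_cuda( cudaEventCreate(&start));
checked_cuda( cudaEventCreate(&stop));

checked_cuda( cudaEventRecord(start, 0));
filter_fxaa2<<<grid_size, block_size>>>(*src);
checked_cuda( cudaEventRecord(stop, 0));
checked_cuda( cudaEventSynchronize(stop));

float ms = 0.0f;
checked_cuda( cudaEventElapsedTime(&ms, start, stop)); // kernel time in milliseconds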

Conclusion:

Use CUDA surfaces only for writing and not for reading textures!

answered Oct 07 '22 by dari