Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

float1 vs float in CUDA

Tags:

c++

c

cuda

I have noticed that there is a float1 struct type in CUDA. Is there any performance benefit over a plain float, for example when using a float1 array instead of a float array?

// Single-component vector type quoted from the CUDA headers: it wraps exactly one float.
// NOTE(review): __device_builtin__ is presumably a toolkit annotation macro — confirm in the CUDA headers.
struct __device_builtin__ float1
{
    float x;  // the only member
};

With float4 there is a performance benefit in some cases, since the type is aligned to 4 x 4 bytes = 16 bytes. Is float1 only meant for special uses, such as __device__ functions that take float1 parameters?

Thanks in advance.

like image 356
BugShotGG Avatar asked Jun 12 '14 13:06

BugShotGG


1 Answers

Following @talonmies' comment to the post CUDA Thrust reduction with double2 arrays, I have compared the calculation of the norm of a vector using CUDA Thrust, switching between float and float1. I have considered an array of N=1000000 elements on a GT210 card (cc 1.2). It seems that the calculation of the norm takes exactly the same time in both cases, namely about 3.4s, so there is no performance improvement. As the code below shows, float is perhaps slightly more comfortable to use than float1.

Finally, notice that the advantage of float4 stems from its alignment attribute, __builtin_align__(16), rather than from __device_builtin__.

#include <cmath>
#include <cstdio>
#include <cstdlib>

#include <thrust\device_vector.h>
#include <thrust/host_vector.h>
#include <thrust\transform_reduce.h>

// Unary functor returning the square of a float.
// Qualified __host__ __device__, so it can be called from host and device code.
struct square
{
    // Returns x * x. Declared const because the functor holds no state, which
    // lets it be invoked through const objects and const references.
    __host__ __device__ float operator()(float x) const
    {
        return x * x;
    }
};

// Unary functor returning the square of the single component of a float1.
// Qualified __host__ __device__, so it can be called from host and device code.
struct square1
{
    // Returns x.x * x.x as a plain float. Declared const because the functor
    // holds no state, which lets it be invoked through const objects and
    // const references.
    __host__ __device__ float operator()(float1 x) const
    {
        return x.x * x.x;
    }
};

// Prints a diagnostic to stderr and exits when a CUDA runtime call fails.
//   status: return code of the call being checked
//   what:   short label of the call, used in the error message
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Computes the Euclidean norm of an N-element vector twice with
// thrust::transform_reduce — once stored as float, once as float1 — and
// prints the elapsed time (measured with CUDA events) and the result of each.
// Returns 0 on success; exits with EXIT_FAILURE if a CUDA runtime call fails.
int main() {

    const int N = 1000000;

    float time = 0.0f;
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // --- float version: N elements, all equal to 3 ---
    thrust::device_vector<float> d_vec(N, 3.f);

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    float reduction = sqrtf(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Wait for the stop event before reading the elapsed time.
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
    printf("Elapsed time reduction:  %3.1f ms \n", time);

    printf("Result of reduction = %f\n", reduction);

    // --- float1 version: same data, built on the host and copied to the device ---
    thrust::host_vector<float1>   h_vec1(N, make_float1(3.f));
    thrust::device_vector<float1> d_vec1 = h_vec1;

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    float reduction1 = sqrtf(thrust::transform_reduce(d_vec1.begin(), d_vec1.end(), square1(), 0.0f, thrust::plus<float>()));
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
    printf("Elapsed time reduction1:  %3.1f ms \n", time);

    printf("Result of reduction1 = %f\n", reduction1);

    // Release the timing events now that both measurements are done.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    // Block until a character is read from stdin, as the original program did.
    getchar();

    return 0;
}
like image 194
Vitality Avatar answered Oct 31 '22 17:10

Vitality