float1 vs float in CUDA

Question

I have noticed that CUDA provides a float1 struct type. Is there any performance benefit over a plain float, for example when using an array of float1 instead of an array of float?

// CUDA built-in vector type with a single component: it wraps exactly one
// float, accessed as `.x`.
struct __device_builtin__ float1
{
    float x;
};

With float4 there can be a performance benefit, depending on the use case, because the type is aligned to 4 x 4 bytes = 16 bytes. Is float1 only meant for special uses, such as __device__ functions that take float1 parameters?

Thanks in advance.

Vitality · Accepted Answer

Following @talonmies' comment to the post CUDA Thrust reduction with double2 arrays, I have compared the calculation of the norm of a vector using CUDA Thrust, switching between float and float1. I have considered an array of N=1000000 elements on a GT210 card (cc 1.2). The calculation of the norm takes exactly the same time in both cases, namely about 3.4s, so there is no performance improvement. As the code below shows, float is perhaps slightly more comfortable to use than float1.

Finally, notice that the advantage of float4 stems from its alignment specifier, __builtin_align__(16), rather than from __device_builtin__.

#include <cmath>
#include <cstdio>
#include <cstdlib>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>

// Unary functor returning the square of a float.
// Usable from both host and device code; intended as the transform step of
// thrust::transform_reduce when accumulating a sum of squares.
struct square
{
    // Returns x * x. Declared const so the functor can be invoked through a
    // const object or const reference (it holds no state to mutate).
    __host__ __device__ float operator()(float x) const
    {
        return x * x;
    }
};

// Unary functor returning the square of the single component of a float1.
// Usable from both host and device code; the float1 counterpart of a plain
// float squaring functor, returning a plain float.
struct square1
{
    // Returns x.x * x.x. Declared const so the functor can be invoked through
    // a const object or const reference (it holds no state to mutate).
    __host__ __device__ float operator()(float1 x) const
    {
        return x.x * x.x;
    }
};

// Prints a diagnostic and terminates the program if a CUDA runtime call
// did not return cudaSuccess. `what` names the call for the error message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmarks the Euclidean norm of an N-element vector computed with
// thrust::transform_reduce, once over float and once over float1, timing
// each run with CUDA events and printing the elapsed time and the result.
int main() {

    const int N = 1000000;

    float time = 0.0f;
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // --- float version -----------------------------------------------------
    thrust::device_vector<float> d_vec(N, 3.f);

    // NOTE(review): this is the first timed region in the program, so it may
    // also include one-time startup overhead -- consider a warm-up run before
    // comparing the two timings.
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    float reduction = sqrtf(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
    printf("Elapsed time reduction:  %3.1f ms \n", time);

    printf("Result of reduction = %f\n", reduction);

    // --- float1 version ----------------------------------------------------
    // Fill directly on construction, mirroring the float case above, instead
    // of populating a host vector element by element and copying it over.
    thrust::device_vector<float1> d_vec1(N, make_float1(3.f));

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    float reduction1 = sqrtf(thrust::transform_reduce(d_vec1.begin(), d_vec1.end(), square1(), 0.0f, thrust::plus<float>()));
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
    printf("Elapsed time reduction1:  %3.1f ms \n", time);

    printf("Result of reduction1 = %f\n", reduction1);

    // Release the timing events now that both measurements are done.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    // Keep the console window open until a key is pressed.
    getchar();

    return 0;
}

float1 vs float in CUDA

Tags:

c++

c

cuda

BugShotGG

1 Answers

Vitality

Recent Activity

Donate For Us

float1 vs float in CUDA

Tags:

c++

c

cuda

BugShotGG

1 Answers

Vitality

Related questions

Recent Activity

Donate For Us