Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Separating even and odd numbers in CUDA

Tags:

cuda

I have an array of numbers as {1,2,3,4,5,6,7,8,9,10} and I want to separate even and odd numbers as:

even = {2,4,6,8,10}

and:

odd = {1,3,5,7,9}

I am aware of atomic operations in CUDA, and I am also aware that the output must not suffer from race conditions. However, I don't want to use atomic operations. How can I achieve this without using atomic functions?

CODE:

#include <stdio.h>
#include <cuda.h>

// Kernel that splits `total` into its even and odd values without atomics.
//
// Launch layout: 1D grid of 1D blocks with at least N threads in total;
// thread `idx` handles total[idx] and threads with idx >= N do nothing.
//
// Parameters:
//   total - input array of N values (integers stored as float)
//   even  - output array; receives the even values in their input order
//   odd   - output array; receives the odd values in their input order
//   N     - number of elements in `total`; `even` and `odd` must each be able
//           to hold every value of their parity (N elements always suffices)
//
// Each thread counts how many elements before its own share its parity and
// uses that count as its output slot. Slots are therefore unique per thread,
// so no two threads ever write the same location and no atomics are needed.
// The cost is O(N) reads per thread; for large inputs a scan-based partition
// (e.g. thrust::partition_copy) is the better tool.
__global__ void square_array(float *total,float *even,float *odd, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= N) return;  // surplus threads must not touch memory

  const float value = total[idx];
  const bool isEven = (((int)value) % 2) == 0;

  // Rank of this element among the earlier elements of the same parity.
  int slot = 0;
  for (int i = 0; i < idx; i++)
  {
    const bool otherIsEven = (((int)total[i]) % 2) == 0;
    if (otherIsEven == isEven) slot++;
  }

  if (isEven)
  {
    even[slot] = value;
  }
  else
  {
    odd[slot] = value;
  }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
  if (err == cudaSuccess) return true;
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
  return false;
}

// main routine that executes on the host.
// Fills an array with 1..N, runs square_array on the device to split it into
// even and odd values, and prints the input and both output arrays.
// Returns 0 on success and 1 if any allocation or CUDA call fails.
int main(void)
{
  const int N = 10;  // Number of elements in arrays
  const size_t size = N * sizeof(float);

  // Host arrays
  float *total_h = (float *)malloc(size);
  float *even_h = (float *)malloc(size);
  float *odd_h = (float *)malloc(size);

  // Device arrays; kept NULL until allocated so cleanup is always safe
  float *total_d = NULL;
  float *even_d = NULL;
  float *odd_d = NULL;

  bool ok = (total_h != NULL) && (even_h != NULL) && (odd_h != NULL);
  if (!ok) fprintf(stderr, "host allocation failed\n");

  // Initialize host array
  if (ok)
  {
    for (int i=0; i<N; i++) total_h[i] = (float)(i+1);
  }

  // Allocate device arrays; outputs are zeroed so unused slots print as 0
  ok = ok && cudaOk(cudaMalloc((void **) &total_d, size), "cudaMalloc(total_d)");
  ok = ok && cudaOk(cudaMalloc((void **) &even_d, size), "cudaMalloc(even_d)");
  ok = ok && cudaOk(cudaMalloc((void **) &odd_d, size), "cudaMalloc(odd_d)");
  ok = ok && cudaOk(cudaMemset(even_d, 0, size), "cudaMemset(even_d)");
  ok = ok && cudaOk(cudaMemset(odd_d, 0, size), "cudaMemset(odd_d)");

  // Copy input to the CUDA device
  ok = ok && cudaOk(cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice),
                    "cudaMemcpy(total_d)");

  // Do calculation on device: one block with exactly one thread per element
  // (N must not exceed the device's maximum threads per block).
  if (ok)
  {
    square_array <<< 1,N >>> (total_d,even_d,odd_d, N);
    // A launch does not return a status; launch errors surface here.
    ok = cudaOk(cudaGetLastError(), "square_array launch");
  }

  // Retrieve results from device; these blocking copies also report any
  // error raised while the kernel was executing.
  ok = ok && cudaOk(cudaMemcpy(even_h, even_d, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(even_h)");
  ok = ok && cudaOk(cudaMemcpy(odd_h, odd_d, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(odd_h)");

  // Print results
  if (ok)
  {
    printf("total Numbers\n");
    for (int i=0; i<N; i++) printf("%f\n",total_h[i]);

    printf("EVEN Numbers\n");
    for (int i=0; i<N; i++) printf("%f\n",even_h[i]);

    printf("ODD Numbers\n");
    for (int i=0; i<N; i++) printf("%f\n",odd_h[i]);
  }

  // Cleanup (free(NULL) and cudaFree(NULL) are no-ops)
  free(total_h);
  free(even_h);
  free(odd_h);

  cudaFree(total_d);
  cudaFree(even_d);
  cudaFree(odd_d);

  return ok ? 0 : 1;
}

OUTPUT: enter image description here

like image 383
Laxmi Kadariya Avatar asked Oct 21 '22 05:10

Laxmi Kadariya


1 Answers

As suggested by Jared Hoberock, it would be much easier to use the efficient partitioning algorithm available in CUDA Thrust than to develop a partitioning routine of your own. Below is a complete worked example.

#include <cstdio>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/partition.h>

// Unary predicate usable on host and device: true when x is divisible by 2.
// operator() is const so the predicate can be invoked through a const object
// or const reference.
struct is_even {
    __host__ __device__ bool operator()(const int &x) const { return (x % 2) == 0; }
};

// Partitions the integers 0..N-1 into even and odd values on the device with
// thrust::partition_copy and prints both groups.
int main() {

    const int N = 10;

    thrust::host_vector<int> h_data(N);
    for (int i=0; i<N; i++) h_data[i] = i;

    thrust::device_vector<int> d_data(h_data);
    // Size both outputs for the worst case (all elements of one parity) so the
    // copy can never run past the end, whatever N or the data are.
    thrust::device_vector<int> d_evens(N);
    thrust::device_vector<int> d_odds(N);

    // partition_copy returns the end of each output range; use it instead of
    // assuming that exactly N/2 elements landed in each output.
    typedef thrust::device_vector<int>::iterator DeviceIter;
    thrust::pair<DeviceIter, DeviceIter> ends =
        thrust::partition_copy(d_data.begin(), d_data.end(), d_evens.begin(), d_odds.begin(), is_even());

    // Copy each result back in one transfer rather than one transfer per
    // element through device_vector::operator[].
    thrust::host_vector<int> h_evens(d_evens.begin(), ends.first);
    thrust::host_vector<int> h_odds(d_odds.begin(), ends.second);

    printf("Even numbers\n");
    for (int i=0; i<(int)h_evens.size(); i++) {
        int val = h_evens[i];
        printf("evens[%i] = %i\n",i,val);
    }

    printf("Odd numbers\n");
    for (int i=0; i<(int)h_odds.size(); i++) {
        int val = h_odds[i];
        printf("odds[%i] = %i\n",i,val);
    }

    return 0;
}
like image 80
Vitality Avatar answered Oct 23 '22 23:10

Vitality