Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why is cuBLAS on a GTX Titan slower than single-threaded CPU code?

I am testing the NVIDIA cuBLAS library on my GTX Titan. I have the following code:

#include "cublas.h"
#include <stdlib.h>
#include <conio.h>
#include <Windows.h>
#include <iostream>
#include <iomanip>

/* Vector size */
#define N (1024 * 1024 * 32)

/* Main */
int main(int argc, char** argv)
{
  LARGE_INTEGER frequency;
  LARGE_INTEGER t1, t2;

  float* h_A;
  float* h_B;
  float* d_A = 0;
  float* d_B = 0;

  /* Initialize CUBLAS */
  cublasInit();

  /* Allocate host memory for the vectors */
  h_A = (float*)malloc(N * sizeof(h_A[0]));
  h_B = (float*)malloc(N * sizeof(h_B[0]));

  /* Fill the vectors with test data */
  for (int i = 0; i < N; i++)
  {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
  }

  QueryPerformanceFrequency(&frequency);
  QueryPerformanceCounter(&t1);
  /* Allocate device memory for the vectors */
  cublasAlloc(N, sizeof(d_A[0]), (void**)&d_A);
  cublasAlloc(N, sizeof(d_B[0]), (void**)&d_B);

  /* Initialize the device matrices with the host vectors */
  cublasSetVector(N, sizeof(h_A[0]), h_A, 1, d_A, 1);
  cublasSetVector(N, sizeof(h_B[0]), h_B, 1, d_B, 1);

  /* Performs operation using cublas */
  float res = cublasSdot(N, d_A, 1, d_B, 1);  

  /* Memory clean up */
  cublasFree(d_A);
  cublasFree(d_B);

  QueryPerformanceCounter(&t2);
  double elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
  std::cout << "GPU time = " << std::setprecision(16) << elapsedTime << std::endl;
  std::cout << "GPU result = " << res << std::endl;

  QueryPerformanceFrequency(&frequency);
  QueryPerformanceCounter(&t1);
  float sum = 0.;
  for (int i = 0; i < N; i++) {
      sum += h_A[i] * h_B[i];
  }
  QueryPerformanceCounter(&t2);
  elapsedTime = (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart;
  std::cout << "CPU time = " << std::setprecision(16) << elapsedTime << std::endl;
  std::cout << "CPU result = " << sum << std::endl;

  free(h_A);
  free(h_B);

  /* Shutdown */
  cublasShutdown();

  getch();

  return EXIT_SUCCESS;
}

When I run the code I get the following result:

GPU time = 164.7487009845991
GPU result = 8388851
CPU time = 45.22368030957917
CPU result = 7780599.5

Why is using the cuBLAS library on a GTX Titan 3 times slower than the same calculation on one core of a 2.4 GHz Ivy Bridge Xeon? When I increase or decrease the vector size, I get the same result: the GPU is slower than the CPU. Switching to double precision doesn't change it.

like image 472
whatisgto Avatar asked Dec 06 '22 03:12

whatisgto


1 Answer

Because a dot product uses each vector element only once. The timed GPU section includes copying both vectors to the video card (the cublasSetVector calls), and that transfer takes much longer than computing everything on the CPU, because PCI Express is much slower than RAM.

like image 190
Alex Telishev Avatar answered Dec 08 '22 18:12

Alex Telishev