I have been trying for days to get a Qt project file running on a 32-bit Windows 7 system, in which I want/need to include Cuda code. This combination of things is either so simple that no one ever bothered to put an example online, or so difficult that nobody ever succeeded, it seems. Whatever way, the only helpful forum threads I found were the same issue on Linux or Mac, or with Visual Studio on a Windows.
All of these give all sorts of different errors, however, whether due to linking or clashing libraries, or spaces in file names or non-existing folders in the Windows version of the Cuda SDK.
Is there someone who has a clear .pro
file to offer that does the trick?
I am aiming to compile a simple programme with ordinary C++ code in Qt style, with Qt 4.8 libraries, which reference several Cuda modules in .cu files. Something of the form:
TestCUDA \
TestCUDA.pro
main.cpp
test.cu
So I finally managed to assemble a .pro
file that works on my and probably on all Windows systems. The following is an easy test programme that should probably do the trick. The following is a small project file plus test programme that works at least on my system.
The file system looks as follows:
TestCUDA \
TestCUDA.pro
main.cpp
vectorAddition.cu
The project file reads:
TARGET = TestCUDA
# Define output directories
DESTDIR = release
OBJECTS_DIR = release/obj
CUDA_OBJECTS_DIR = release/cuda
# Source files
SOURCES += src/main.cpp
# This makes the .cu files appear in your project
OTHER_FILES += vectorAddition.cu
# CUDA settings <-- may change depending on your system
CUDA_SOURCES += src/cuda/vectorAddition.cu
CUDA_SDK = "C:/ProgramData/NVIDIA Corporation/NVIDIA GPU Computing SDK 4.2/C" # Path to cuda SDK install
CUDA_DIR = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v4.2" # Path to cuda toolkit install
SYSTEM_NAME = Win32 # Depending on your system either 'Win32', 'x64', or 'Win64'
SYSTEM_TYPE = 32 # '32' or '64', depending on your system
CUDA_ARCH = sm_11 # Type of CUDA architecture, for example 'compute_10', 'compute_11', 'sm_10'
NVCC_OPTIONS = --use_fast_math
# include paths
INCLUDEPATH += $$CUDA_DIR/include \
$$CUDA_SDK/common/inc/ \
$$CUDA_SDK/../shared/inc/
# library directories
QMAKE_LIBDIR += $$CUDA_DIR/lib/$$SYSTEM_NAME \
$$CUDA_SDK/common/lib/$$SYSTEM_NAME \
$$CUDA_SDK/../shared/lib/$$SYSTEM_NAME
# Add the necessary libraries
LIBS += -lcuda -lcudart
# The following library conflicts with something in Cuda
QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
QMAKE_LFLAGS_DEBUG = /NODEFAULTLIB:msvcrtd.lib
# The following makes sure all path names (which often include spaces) are put between quotation marks
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
# Configuration of the Cuda compiler
CONFIG(debug, debug|release) {
# Debug mode
cuda_d.input = CUDA_SOURCES
cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
cuda_d.commands = $$CUDA_DIR/bin/nvcc.exe -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
cuda_d.dependency_type = TYPE_C
QMAKE_EXTRA_COMPILERS += cuda_d
}
else {
# Release mode
cuda.input = CUDA_SOURCES
cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
cuda.commands = $$CUDA_DIR/bin/nvcc.exe $$NVCC_OPTIONS $$CUDA_INC $$LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
cuda.dependency_type = TYPE_C
QMAKE_EXTRA_COMPILERS += cuda
}
Note the QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
: it took me a long time to figure out, but this library seems to clash with other things in Cuda, which produces strange linking warnings and errors. If someone has an explanation for this, and potentially a prettier way to get around this, I'd like to hear it.
Also, since Windows file paths often include spaces (and NVIDIA's SDK by default does so too), it is necessary to artificially add quotation marks around the include paths. Again, if someone knows a more elegant way of solving this problem, I'd be interested to know.
The main.cpp
file looks like this:
#include <cuda.h>
#include <builtin_types.h>
#include <drvapi_error_string.h>
#include <QtCore/QCoreApplication>
#include <QDebug>
// Forward declare the function in the .cu file
void vectorAddition(const float* a, const float* b, float* c, int n);
void printArray(const float* a, const unsigned int n) {
QString s = "(";
unsigned int ii;
for (ii = 0; ii < n - 1; ++ii)
s.append(QString::number(a[ii])).append(", ");
s.append(QString::number(a[ii])).append(")");
qDebug() << s;
}
int main(int argc, char* argv [])
{
QCoreApplication(argc, argv);
int deviceCount = 0;
int cudaDevice = 0;
char cudaDeviceName [100];
unsigned int N = 50;
float *a, *b, *c;
cuInit(0);
cuDeviceGetCount(&deviceCount);
cuDeviceGet(&cudaDevice, 0);
cuDeviceGetName(cudaDeviceName, 100, cudaDevice);
qDebug() << "Number of devices: " << deviceCount;
qDebug() << "Device name:" << cudaDeviceName;
a = new float [N]; b = new float [N]; c = new float [N];
for (unsigned int ii = 0; ii < N; ++ii) {
a[ii] = qrand();
b[ii] = qrand();
}
// This is the function call in which the kernel is called
vectorAddition(a, b, c, N);
qDebug() << "input a:"; printArray(a, N);
qDebug() << "input b:"; printArray(b, N);
qDebug() << "output c:"; printArray(c, N);
if (a) delete a;
if (b) delete b;
if (c) delete c;
}
The Cuda file vectorAddition.cu
, which describes a simple vector addition, look like this:
#include <cuda.h>
#include <builtin_types.h>
extern "C"
__global__ void vectorAdditionCUDA(const float* a, const float* b, float* c, int n)
{
int ii = blockDim.x * blockIdx.x + threadIdx.x;
if (ii < n)
c[ii] = a[ii] + b[ii];
}
void vectorAddition(const float* a, const float* b, float* c, int n) {
float *a_cuda, *b_cuda, *c_cuda;
unsigned int nBytes = sizeof(float) * n;
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
// allocate and copy memory into the device
cudaMalloc((void **)& a_cuda, nBytes);
cudaMalloc((void **)& b_cuda, nBytes);
cudaMalloc((void **)& c_cuda, nBytes);
cudaMemcpy(a_cuda, a, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(b_cuda, b, nBytes, cudaMemcpyHostToDevice);
vectorAdditionCUDA<<<blocksPerGrid, threadsPerBlock>>>(a_cuda, b_cuda, c_cuda, n);
// load the answer back into the host
cudaMemcpy(c, c_cuda, nBytes, cudaMemcpyDeviceToHost);
cudaFree(a_cuda);
cudaFree(b_cuda);
cudaFree(c_cuda);
}
If you get this to work, then more complicated examples are self-evident, I think.
Edit (24-1-2013): I added the QMAKE_LFLAGS_DEBUG = /NODEFAULTLIB:msvcrtd.lib
and the CONFIG(debug)
with the extra D_DEBUG
flag, such that it also compiles in debug mode.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With