Use of unique_ptr and cudaMalloc

Tags:

I've been thinking about playing around with using std::unique_ptr with device pointers in CUDA. What I was wondering is whether the current C++11 unique_ptr can be used in conjunction with cudaMalloc. I know it can be used with normal malloc (Is it possible to use a C++ smart pointers together with C's malloc?), but cudaMalloc doesn't return the pointer as the function's return value. Instead, it returns an error code, and the pointer is written through an output parameter (a pointer to the pointer).

This blog post recommends the following technique:

// Deleter handed to unique_ptr: releases the pointer with cudaFree rather than delete[].
auto deleter=[&](float* ptr){ cudaFree(ptr); };
// d_in is constructed owning a host array created by new float[size].
std::unique_ptr<float[], decltype(deleter)> d_in(new float[size], deleter);
// The address of the unique_ptr object itself is cast to void** and passed to
// cudaMalloc as its output argument.
// NOTE(review): this depends on where unique_ptr keeps its stored pointer inside the
// object, which the C++ standard does not specify - confirm before reusing.
cudaMalloc((void **) &d_in, size * sizeof(float));

Question: I'm concerned that this allocates host memory that never gets deleted (i.e. the new float[size] in d_in(new float[size], deleter);) - unless new float[size] doesn't actually allocate host memory here, or the allocation is somehow overridden. If the above doesn't in fact work, could I define my own cudaMalloc wrapper and pass the pointer it returns to unique_ptr?

Something like:

// Allocates mySize bytes with cudaMalloc and returns the resulting pointer by value,
// so it can be passed straight to a smart-pointer constructor.
//
// mySize: number of bytes to allocate.
// Returns: the pointer written by cudaMalloc; p starts as nullptr so the function
//          never returns an indeterminate value if cudaMalloc does not write to it.
//
// checkCUDAerrorMacro is not defined in this snippet; it is assumed to be the
// caller's own error-checking macro wrapping a cudaError_t expression.
void* myCudaMalloc(size_t mySize){
    void * p = nullptr;
    checkCUDAerrorMacro(cudaMalloc(&p, mySize));
    return p;
}

...

// Deleter that releases the pointer with cudaFree instead of delete[].
auto deleter=[](float* ptr){ cudaFree(ptr); };
// d_in adopts the pointer returned by myCudaMalloc. The argument is a byte count
// (size elements of float, as in the snippet from the blog post), and the void*
// result must be converted explicitly because C++ has no implicit void* -> float*.
std::unique_ptr<float[], decltype(deleter)> d_in(static_cast<float*>(myCudaMalloc(size * sizeof(float))), deleter);

841

asked Nov 20 '17 00:11

dada_dave

1 Answer

After some work I figured out how to test 3 versions of it - tl;dr the blog post's version (v1) does indeed leak, but can be tweaked so that it doesn't (v2) and improved (v3):

common code:

// Owning handle to a float array; the release policy D is supplied by the caller.
template <class D> using unique_p = std::unique_ptr<float[], D>;

// Element count used by every version below.
constexpr int length{20};

v1: (what is recommended in the blog post)

// v1: the blog post's approach, kept as-is to demonstrate the host-memory leak.
void version1(){
    // Deleter frees with cudaFree and prints a marker so the release is visible.
    auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted1\n"; };
    // d_in starts out owning a host array from new float[length]; the deleter only
    // ever calls cudaFree, so this host array is never passed to delete[].
    unique_p<decltype(deleter)> d_in(new float[length],deleter);
    // The address of the unique_ptr object itself is handed to cudaMalloc as void**.
    // NOTE(review): relies on the stored pointer sitting at the start of the
    // unique_ptr object, which the C++ standard does not specify - confirm.
    cudaMalloc((void **) &d_in, length * sizeof(float));

    ...
}

v2: (similar to above, but initializes d_in with nullptr)

// v2: same as v1 except that d_in starts out empty, so no host array is allocated.
void version2(){
    // Deleter frees with cudaFree and prints a marker so the release is visible.
    auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted2\n"; };
    // d_in is constructed holding nullptr.
    unique_p<decltype(deleter)> d_in(nullptr,deleter);
    // The address of the unique_ptr object itself is handed to cudaMalloc as void**.
    // NOTE(review): relies on the stored pointer sitting at the start of the
    // unique_ptr object, which the C++ standard does not specify - confirm.
    cudaMalloc((void **) &d_in, length * sizeof(float));

    ...
}

v3: (d_in "adopts" pointer initialized with cudaMalloc)

// v3: allocate first, then let d_in adopt the pointer through its constructor.
void version3(){
    // Wrapper returning the pointer by value. ptr starts as nullptr so that a failed
    // cudaMalloc cannot hand an indeterminate pointer to d_in (and later to cudaFree).
    auto  myCudaMalloc = [](size_t mySize) { void* ptr = nullptr; cudaMalloc(&ptr, mySize); return ptr; };
    // Deleter frees with cudaFree and prints a marker so the release is visible.
    auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted3\n"; };
    // The void* result is converted explicitly to the element pointer type.
    unique_p<decltype(deleter)> d_in(static_cast<float*>(myCudaMalloc(length*sizeof(float))),deleter);

    ...
}

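All 3 create proper device pointers. However, version 1 definitely leaks host memory (tested using valgrind with the cuda warnings suppressed: Valgrind and CUDA: Are reported leaks real?): the array from new float[length] is never passed to delete[], because the deleter only calls cudaFree and the cudaMalloc call overwrites the pointer d_in was holding. Neither v2 nor v3 leaks host memory. cuda-memcheck also confirmed that there were no device-side memory leaks for any of the versions.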

Between versions 2 and 3, I prefer version 3, as it makes it clearer that unique_ptr owns the pointer and it follows the usual idiom of passing the result of new or malloc straight to the unique_ptr constructor. You also only have to define the allocating function/lambda once and can then reuse it, so it takes fewer lines of code.

========================

Full test code (compiled with nvcc -std=c++14):

#include <cuda_runtime.h>
#include <memory>
#include <iostream>

// Owning handle to a float array; the release policy D is supplied by the caller.
template <class D> using unique_p = std::unique_ptr<float[], D>;

// Debug kernel: prints the first num floats of d_in, each followed by a tab, and then
// a newline. Every thread that executes it walks the whole array sequentially; the
// callers in this file launch it with <<<1,1>>>.
__global__ void printArray(float * d_in, int num){
    int idx = 0;
    while(idx < num){
        printf("%f\t",d_in[idx]);
        ++idx;
    }
    printf("\n");
}

// Functor alternative to the lambda deleters: frees the pointer with cudaFree and
// prints a marker. It is only referenced from a commented-out line in version3.
struct myDeleter{
    void operator()(float* ptr){
        cudaFree(ptr);
        std::cout << "\nDeleted\n";
    }
};

// Number of float elements allocated, filled, copied and printed by version1-3.
constexpr int length = 20;

// v1: the blog post's approach, kept as-is to demonstrate the host-memory leak.
void version1(){
    // Deleter frees with cudaFree and prints a marker so the release is visible.
    auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted1\n"; };
    // d_in starts out owning a host array from new float[length]; the deleter only
    // ever calls cudaFree, so this host array is never passed to delete[].
    unique_p<decltype(deleter)> d_in(new float[length],deleter);
    // The address of the unique_ptr object itself is handed to cudaMalloc as void**.
    // NOTE(review): relies on the stored pointer sitting at the start of the
    // unique_ptr object, which the C++ standard does not specify - confirm.
    cudaMalloc((void **) &d_in, length * sizeof(float));

    // Host staging buffer, owned by a default-deleter unique_ptr.
    std::unique_ptr<float[]> h_out(new float[length]);

    // Fill the host buffer with 0, 1, ..., length-1.
    for(int i = 0; i < length; i++){ h_out[i] = i; }

    // Copy the host buffer to the address held by d_in.
    cudaMemcpy(d_in.get(), h_out.get(),length*sizeof(float),cudaMemcpyHostToDevice);


    // Print the device array from a single thread.
    printArray<<<1,1>>>(d_in.get(),length);
}

// v2: same as v1 except that d_in starts out empty, so no host array is allocated.
void version2(){
    // Deleter frees with cudaFree and prints a marker so the release is visible.
    auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted2\n"; };
    // d_in is constructed holding nullptr.
    unique_p<decltype(deleter)> d_in(nullptr,deleter);
    // The address of the unique_ptr object itself is handed to cudaMalloc as void**.
    // NOTE(review): relies on the stored pointer sitting at the start of the
    // unique_ptr object, which the C++ standard does not specify - confirm.
    cudaMalloc((void **) &d_in, length * sizeof(float));

    // Host staging buffer, owned by a default-deleter unique_ptr.
    std::unique_ptr<float[]> h_out(new float[length]);

    // Fill the host buffer with 0, 1, ..., length-1.
    for(int i = 0; i < length; i++){ h_out[i] = i; }

    // Copy the host buffer to the address held by d_in.
    cudaMemcpy(d_in.get(), h_out.get(),length*sizeof(float),cudaMemcpyHostToDevice);


    // Print the device array from a single thread.
    printArray<<<1,1>>>(d_in.get(),length);
}


// v3: allocate first, then let d_in adopt the pointer through its constructor.
// After that it fills a host buffer with 0..length-1, copies it to the device and
// prints it from a single-thread kernel.
void version3(){
    // Wrapper returning the pointer by value. ptr starts as nullptr so that a failed
    // cudaMalloc cannot hand an indeterminate pointer to d_in (and later to cudaFree).
    auto  myCudaMalloc = [](size_t mySize) { void* ptr = nullptr; cudaMalloc(&ptr, mySize); return ptr; };
    // Deleter frees with cudaFree and prints a marker so the release is visible.
    auto deleter = [](float* ptr) { cudaFree(ptr); std::cout<<"\nDeleted3\n"; };
    // The void* result is converted explicitly to the element pointer type.
    unique_p<decltype(deleter)> d_in(static_cast<float*>(myCudaMalloc(length*sizeof(float))),deleter);
    //unique_p<myDeleter> d_in((float*)myCudaMalloc(20*sizeof(float)));

    // Nothing to copy to or print from if the allocation did not produce a pointer.
    if(!d_in){ std::cerr<<"\ncudaMalloc failed in version3\n"; return; }

    // Host staging buffer, owned by a default-deleter unique_ptr.
    std::unique_ptr<float[]> h_out(new float[length]);
    for(int i = 0; i < length; i++){ h_out[i] = i; }

    // Copy the host buffer to the device allocation owned by d_in.
    cudaMemcpy(d_in.get(), h_out.get(),length*sizeof(float),cudaMemcpyHostToDevice);

    // Print the device array from a single thread.
    printArray<<<1,1>>>(d_in.get(),length);
}

// Runs the three variants in order, then resets the device before exiting.
int main(){
    // Same call order as listing them one by one: version1, version2, version3.
    void (*const demos[])() = { version1, version2, version3 };
    for(auto run : demos){
        run();
    }

    cudaDeviceReset();
    return 0;
}

191

answered Sep 28 '22 09:09

dada_dave

Related questions
                            
                                Why are std::allocator::construct and std::allocator::destroy templated on element type?
                            
                                Requirements for std::thread::id. Can it be atomized?
                            
                                How do I iterate through a sequence of shared_ptr objects?
                            
                                Why CR LF is changed to LF in Windows?
                            
                                Moving std::thread
                            
                                Can I override std::hash?
                            
                                Determining the "optimal" common numeric type in a template parameter pack
                            
                                Change boost.build jamfile for C++11 support?
                            
                                Why does decltype remove const from return types for built-in types?
                            
                                How do I efficiently remove_if only a single element from a forward_list?
                            
                                std::tolower and Visual Studio 2013
                            
                                C++ Passing std::string by reference to function in dll
                            
                                how to iterate all regex matches in a std::string with their starting positions in c++11 std::regex?
                            
                                Why is std::endl generating this cryptic error message?
                            
                                Converting std::array to std::vector
                            
                                How do I convert a C string to a int at compile time?
                            
                                What is the behavior of a defaulted default constructor with in-class initialization?
                            
                                c++ Should I define lambda out of for loop or inside it to keep small scope, what is best practice
                            
                                Boost build fails C++11 feature checks when using (custom) GCC 4.x or 5.x
                            
                                Range-based for over pair list

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

Use of unique_ptr and cudaMalloc

Tags:

pointers

c++11

cuda

unique-ptr

dada_dave

People also ask

1 Answer

dada_dave

Recent Activity

Donate For Us