How to asynchronously copy memory from the host to the device using thrust and CUDA streams

Tags:

I would like to copy memory from the host to the device using thrust as in

thrust::host_vector<float> h_vec(1 << 28);
thrust::device_vector<float> d_vec(1 << 28);
thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

using CUDA streams analogously to how you would copy memory from the device to the device using streams:

cudaStream_t s;
cudaStreamCreate(&s);

thrust::device_vector<float> d_vec1(1 << 28), d_vec2(1 << 28);
thrust::copy(thrust::cuda::par.on(s), d_vec1.begin(), d_vec1.end(), d_vec2.begin());

cudaStreamSynchronize(s);
cudaStreamDestroy(s);

The problem is that I can't set the execution policy to CUDA to specify the stream when copying from the host to the device, because, in that case, thrust would assume that both vectors are stored on the device. Is there a way to get around this problem? I'm using the latest thrust version from github (it says 1.8 in the version.h file).

776

asked Jul 31 '14 16:07

Tom

2 Answers

As indicated in the comments, I don't think this will be possible directly with thrust::copy. However we can use cudaMemcpyAsync in a thrust application to achieve the goal of asynchronous copies and overlap of copy with compute.

Here is a worked example:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>

// DSIZE determines duration of H2D and D2H transfers
#define DSIZE (1048576*8)
// SSIZE,LSIZE determine duration of kernel launched by thrust
#define SSIZE (1024*512)
#define LSIZE 1
// KSIZE determines size of thrust kernels (number of threads per block)
#define KSIZE 64
#define TV1 1
#define TV2 2

typedef int mytype;
typedef thrust::host_vector<mytype, thrust::cuda::experimental::pinned_allocator<mytype> > pinnedVector;

struct sum_functor
{
  mytype *dptr;
  sum_functor(mytype* _dptr) : dptr(_dptr) {};
  __host__ __device__ void operator()(mytype &data) const
    {
      mytype result = data;
      for (int j = 0; j < LSIZE; j++)
        for (int i = 0; i < SSIZE; i++)
          result += dptr[i];
      data = result;
    }
};

int main(){

  pinnedVector hi1(DSIZE);
  pinnedVector hi2(DSIZE);
  pinnedVector ho1(DSIZE);
  pinnedVector ho2(DSIZE);
  thrust::device_vector<mytype> di1(DSIZE);
  thrust::device_vector<mytype> di2(DSIZE);
  thrust::device_vector<mytype> do1(DSIZE);
  thrust::device_vector<mytype> do2(DSIZE);
  thrust::device_vector<mytype> dc1(KSIZE);
  thrust::device_vector<mytype> dc2(KSIZE);

  thrust::fill(hi1.begin(), hi1.end(),  TV1);
  thrust::fill(hi2.begin(), hi2.end(),  TV2);
  thrust::sequence(do1.begin(), do1.end());
  thrust::sequence(do2.begin(), do2.end());

  cudaStream_t s1, s2;
  cudaStreamCreate(&s1); cudaStreamCreate(&s2);

  cudaMemcpyAsync(thrust::raw_pointer_cast(di1.data()), thrust::raw_pointer_cast(hi1.data()), di1.size()*sizeof(mytype), cudaMemcpyHostToDevice, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(di2.data()), thrust::raw_pointer_cast(hi2.data()), di2.size()*sizeof(mytype), cudaMemcpyHostToDevice, s2);

  thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
  thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));

  cudaMemcpyAsync(thrust::raw_pointer_cast(ho1.data()), thrust::raw_pointer_cast(do1.data()), do1.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho2.data()), thrust::raw_pointer_cast(do2.data()), do2.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s2);

  cudaDeviceSynchronize();
  for (int i=0; i < KSIZE; i++){
    if (ho1[i] != ((LSIZE*SSIZE*TV1) + i)) { std::cout << "mismatch on stream 1 at " << i << " was: " << ho1[i] << " should be: " << ((DSIZE*TV1)+i) << std::endl; return 1;}
    if (ho2[i] != ((LSIZE*SSIZE*TV2) + i)) { std::cout << "mismatch on stream 2 at " << i << " was: " << ho2[i] << " should be: " << ((DSIZE*TV2)+i) << std::endl; return 1;}
    }
  std::cout << "Success!" << std::endl;
  return 0;
}

For my test case, I used RHEL5.5, Quadro5000, and cuda 6.5RC. This example is designed to have thrust create very small kernels (only a single threadblock, as long as KSIZE is small, say 32 or 64), so that the kernels that thrust creates from thrust::for_each are able to run concurrently.

When I profile this code, I see:

nvvp output for thrust streams application

This indicates that we are achieving proper overlap both between thrust kernels, and between copy operations and thrust kernels, as well as asynchronous data copying at the completion of the kernels. Note that the cudaDeviceSynchronize() operation "fills" the timeline, indicating that all the async operations (data copying, thrust functions) were issued asynchronously and control returned to the host thread before any of the operations were underway. All of this is expected, proper behavior for full concurrency between host, GPU, and data copying operations.

answered Nov 13 '22 12:11

Robert Crovella

Here's a worked example using thrust::cuda::experimental::pinned_allocator<T>:

// Compile with:
// nvcc --std=c++11 mem_async.cu -o mem_async

#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/fill.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#define LEN 1024

int main(int argc, char *argv[]) {
    thrust::host_vector<float, thrust::cuda::experimental::pinned_allocator<float>> h_vec(LEN);
    thrust::device_vector<float> d_vec(LEN);

    thrust::fill(d_vec.begin(), d_vec.end(), -1.0);

    cudaMemcpyAsync(thrust::raw_pointer_cast(h_vec.data()),
                    thrust::raw_pointer_cast(d_vec.data()),
                    d_vec.size()*sizeof(float),
                    cudaMemcpyDeviceToHost);

    // Comment out this line to see what happens.
    cudaDeviceSynchronize();

    std::cout << h_vec[0] << std::endl;
}

Comment out the synchronize step and you should get 0 printed to the console due to the async memory transfer.

answered Nov 13 '22 11:11

Andreas Yankopolus

Related questions
                            
                                Debug Assertion Failed Expression: _pFirstBlock == pHead using OpenCV and C++ trying to call SurfFeatureDetector
                            
                                What are "annotated fallthrough" and "partly annotated method" in Clang's wording?
                            
                                Fastest way of computing the power that a "power of 2" number used?
                            
                                constexpr bug in clang but not in gcc?
                            
                                std::shared_ptr::owner_before and std::owner_less: What exactly is meant by "owner-based order"?
                            
                                std::sort comparing elements to null
                            
                                C++ boost::split first of
                            
                                setting a row of a matrix to 0, in eigen
                            
                                Windows Multi-monitor: How can I determine if a target is physically connected to a source when the target is available but not active?
                            
                                SFINAE to enable nontemplate member function
                            
                                C++11 code with inheritance doesn't compile with clang
                            
                                Is it fair to always recommend std::vector over realloc?
                            
                                Netbeans: cannot find include file <iostream>, also, unable to resolve identifier std, cout
                            
                                Cannot find Magick++ header files
                            
                                Is there a way in Qt to prohibit the computer from going to sleep?
                            
                                Can pop_back() ever reduce the capacity of a vector? (C++)
                            
                                Binding Checkbox 'checked' property with a C++ object Q_PROPERTY
                            
                                Show image in a column of QTableView from QSqlTableModel
                            
                                Thread Working Directory
                            
                                How long does a string constant live in c++?

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

How to asynchronously copy memory from the host to the device using thrust and CUDA streams

Tags:

c++

asynchronous

cuda

thrust

Tom

People also ask

2 Answers

Robert Crovella

Andreas Yankopolus

Recent Activity

Donate For Us