I've been reading through the CUDA documentation, and it seems to me that every buffer that needs to interface with OpenGL has to be created as an OpenGL buffer object.
According to the NVIDIA programming guide, this has to be done like this:
// OpenGL name (handle) of the vertex buffer object that holds the positions.
GLuint positionsVBO;
// CUDA graphics-resource handle for positionsVBO; filled in by
// cudaGraphicsGLRegisterBuffer in main().
struct cudaGraphicsResource* positionsVBO_CUDA;
// Program entry point: selects the CUDA device used for OpenGL interop, sets
// up OpenGL/GLUT, creates the vertex buffer object, registers it with CUDA,
// and enters the GLUT rendering loop.
int main() {
    // Explicitly set device
    cudaGLSetGLDevice(0);
    // Initialize OpenGL and GLUT
    ...
    glutDisplayFunc(display);
    // Create buffer object and register it with CUDA.
    // glGenBuffers writes the generated name through a pointer, so it needs
    // the address of positionsVBO.
    glGenBuffers(1, &positionsVBO);
    // glBindBuffer takes the buffer name by value.
    glBindBuffer(GL_ARRAY_BUFFER, positionsVBO);
    // One 4-component float vertex per (x, y) grid point.
    unsigned int size = width * height * 4 * sizeof(float);
    // Allocate storage without uploading any data (null data pointer); the
    // contents are produced later by the CUDA kernel.
    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    // WriteDiscard: CUDA only writes the buffer and does not need its
    // previous contents.
    cudaGraphicsGLRegisterBuffer(&positionsVBO_CUDA, positionsVBO, cudaGraphicsMapFlagsWriteDiscard);
    // Launch rendering loop
    glutMainLoop();
}
// GLUT display callback (registered in main() via glutDisplayFunc).
// Each frame it maps the registered buffer object into CUDA, runs the
// createVertices kernel on the mapped pointer, unmaps the buffer, and then
// draws its contents as points with OpenGL.
void display() {
    // Map buffer object for writing from CUDA
    float4* positions;
    cudaGraphicsMapResources(1, &positionsVBO_CUDA, 0);
    size_t num_bytes;
    // The device pointer is fetched after every map call rather than cached
    // across frames.
    cudaGraphicsResourceGetMappedPointer((void**)&positions, &num_bytes, positionsVBO_CUDA);
    // Execute kernel
    dim3 dimBlock(16, 16, 1);
    // Integer division: the grid covers the whole width x height domain only
    // when both are multiples of the block dimensions.
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    createVertices<<<dimGrid, dimBlock>>>(positions, time, width, height);
    // Unmap buffer object so OpenGL can use it again
    cudaGraphicsUnmapResources(1, &positionsVBO_CUDA, 0);
    // Render from buffer object
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glBindBuffer(GL_ARRAY_BUFFER, positionsVBO);
    // 4 floats per vertex, tightly packed, starting at offset 0 of the
    // bound buffer.
    glVertexPointer(4, GL_FLOAT, 0, 0);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDrawArrays(GL_POINTS, 0, width * height);
    glDisableClientState(GL_VERTEX_ARRAY);
    // Swap buffers and request the next frame
    glutSwapBuffers();
    glutPostRedisplay();
}
// Releases the interop buffer: unregisters the CUDA graphics resource, then
// deletes the OpenGL buffer object.
void deleteVBO() {
// Drop CUDA's registration of the buffer first ...
cudaGraphicsUnregisterResource(positionsVBO_CUDA);
// ... then delete the OpenGL buffer object itself.
glDeleteBuffers(1, &positionsVBO);
}
// Kernel launched from display() on a 2D grid of 16x16 blocks. It receives
// the mapped buffer pointer (positions), a time value, and the width/height
// of the vertex grid.
// NOTE(review): the body is elided here; presumably each thread writes one
// float4 vertex into positions -- confirm against the full implementation.
__global__ void createVertices(float4* positions, float time, unsigned int width, unsigned int height) {
// [....]
}
Is there a way to give memory allocated with cudaMalloc directly to OpenGL? I already have working code written in CUDA, and I want to put my float4 array directly into OpenGL.
Say I already have code like:
// cudaMalloc returns a cudaError_t and writes the device pointer through its
// first argument; it does not return the allocation.
float4 *cd = 0;
cudaMalloc((void**)&cd, elements*sizeof(float4));
do_something<<<16,1>>>(cd);
And I wanted to display the output of do_something through OpenGL.
Side note: why is the cudaGraphicsResourceGetMappedPointer function run on every timestep?
As of CUDA 4.0, OpenGL interop is one-way. That means to do what you want (run a CUDA kernel that writes data to a GL buffer or texture image), you have to map the buffer to a device pointer, and pass that pointer to your kernel, as shown in your example.
As for your side note: cudaGraphicsResourceGetMappedPointer is called every time display() is called because cudaGraphicsMapResources is called every frame. Any time you re-map a resource you should re-get the mapped pointer, because it may have changed. Why re-map every frame? Well, OpenGL sometimes moves buffer objects around in memory, for performance reasons (especially in memory-intensive GL applications). If you leave the resource mapped all the time, it can't do this, and performance may suffer. I believe GL's ability and need to virtualize memory objects is also one of the reasons the current GL interop API is one-way (the GL is not allowed to move CUDA allocations around, and therefore you can't map a CUDA-allocated device pointer into a GL buffer object).
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With