For the record this is homework so help as little or as much with that in mind. We are using constant memory to store a "mask matrix" that will be used to perform a convolution on a larger matrix. When I am in the host code I am copying the mask to constant memory using the cudaMemcpyToSymbol().
My question is once this is copied over and I launch my device kernel code how does the device know where to access the constant memory mask matrix. Is there a pointer that I need to pass in on kernel launch. Most of the code that the professor gave us is not supposed to be changed (there is no pointer to the mask passed in) but there is always the possibility that he made a mistake ( although it is most likely my understanding of something)
Is the constant memeory declaratoin supposed to be included in the seperate kernel.cu file?
I am minimizing the code to just show the things having to do with the constant memory. As such please don't point out if something is not initialized ect. There is code for that but that is not of concern at this time.
main.cu:
#include <stdio.h>
#include "kernel.cu"
__constant__ float M_d[FILTER_SIZE * FILTER_SIZE];
int main(int argc, char* argv[])
{
Matrix M_h, N_h, P_h; // M: filter, N: input image, P: output image
/* Allocate host memory */
M_h = allocateMatrix(FILTER_SIZE, FILTER_SIZE);
N_h = allocateMatrix(imageHeight, imageWidth);
P_h = allocateMatrix(imageHeight, imageWidth);
/* Initialize filter and images */
initMatrix(M_h);
initMatrix(N_h);
cudaError_t cudda_ret = cudaMemcpyToSymbol(M_d, M_h.elements, M_h.height * M_h.width * sizeof(float), 0, cudaMemcpyHostToDevice);
//char* cudda_ret_pointer = cudaGetErrorString(cudda_ret);
if( cudda_ret != cudaSuccess){
printf("\n\ncudaMemcpyToSymbol failed\n\n");
printf("%s, \n\n", cudaGetErrorString(cudda_ret));
}
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
//INSERT CODE HERE
//block size is 16x16
// \\\\\\\\\\\\\**DONE**
dim_grid = dim3(ceil(N_h.width / (float) BLOCK_SIZE), ceil(N_h.height / (float) BLOCK_SIZE));
dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE);
//KERNEL Launch
convolution<<<dim_grid, dim_block>>>(N_d, P_d);
return 0;
}
kernel.cu: THIS IS WHERE I DO NOT KNOW HOW TO ACCESS THE CONSTANT MEMORY.
//__constant__ float M_c[FILTER_SIZE][FILTER_SIZE];
__global__ void convolution(Matrix N, Matrix P)
{
/********************************************************************
Determine input and output indexes of each thread
Load a tile of the input image to shared memory
Apply the filter on the input image tile
Write the compute values to the output image at the correct indexes
********************************************************************/
//INSERT KERNEL CODE HERE
//__shared__ float N_shared[BLOCK_SIZE][BLOCK_SIZE];
//int row = (blockIdx.y * blockDim.y) + threadIdx.y;
//int col = (blockIdx.x * blockDim.x) + threadIdx.x;
}
In "classic" CUDA compilation you must define all code and symbols (textures, constant memory, device functions) and any host API calls which access them (including kernel launches, binding to textures, copying to symbols) within the same translation unit. This means, effectively, in the same file (or via multiple include statements within the same file). This is because "classic" CUDA compilation doesn't include a device code linker.
Since CUDA 5 was released, there is the possibility of using separate compilation mode and linking different device code objects into a single fatbinary payload on architectures which support it. In that case, you need to declare any __constant__ variables using the extern keyword and define the symbol exactly once.
If you can't use separate compilation, then the usual workaround is to define the __constant__ symbol in the same .cu file as your kernel, and include a small host wrapper function which just calls cudaMemcpyToSymbol
to set the __constant__ symbol in question. You would probably do the same with kernel calls and texture operations.
Below is a "minimum-sized" example showing the use of __constant__
symbols. You do not need to pass any pointer to the __global__
function.
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
__constant__ float test_const;
__global__ void test_kernel(float* d_test_array) {
d_test_array[threadIdx.x] = test_const;
}
#include <conio.h>
int main(int argc, char **argv) {
float test = 3.f;
int N = 16;
float* test_array = (float*)malloc(N*sizeof(float));
float* d_test_array;
cudaMalloc((void**)&d_test_array,N*sizeof(float));
cudaMemcpyToSymbol(test_const, &test, sizeof(float));
test_kernel<<<1,N>>>(d_test_array);
cudaMemcpy(test_array,d_test_array,N*sizeof(float),cudaMemcpyDeviceToHost);
for (int i=0; i<N; i++) printf("%i %f\n",i,test_array[i]);
getch();
return 0;
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With