How to read successfully from a 2D texture

Question

How can I:

Bind cudaMallocPitch float memory to a 2D texture reference
Copy some host data to the 2D array on the device
Add one to the texture reference and write to either a.) the Pitch 2D array OR b.) write to a linear memory array
Read the answer back and display it.

Below is a code that should accomplish this. Note that for NxN array sizes, my code works. For NxM where N!=M, my code bites the dust (not the correct result). If you can solve this problem I will award you 1 internets (supply limited). Maybe I'm crazy, but according to the documentation this should work (and it does work for square arrays!). The attached code should run with 'nvcc whateveryoucallit.cu -o runit'.

Help is appreciated!

#include<stdio.h>
#include<cuda.h>
#include<iostream>
#define height 16
#define width 11
#define BLOCKSIZE 16

using namespace std;

// Device Kernels

//Texture reference Declaration
texture<float,2> texRefEx;


__global__ void kernel_w_textures(float* devMPPtr, float * devMPtr, int pitch)
{
 // Thread indexes
        unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
        unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;

 // Texutre Coordinates
 float u=(idx)/float(width);
 float v=(idy)/float(height);
 devMPtr[idy*width+idx]=devMPPtr[idy*pitch/sizeof(float)+idx];
 // Write Texture Contents to malloc array +1
 devMPtr[idy*width+idx]= tex2D(texRefEx,u,v);//+1.0f;
}
int main()
{
 // memory size
 size_t memsize=height*width;
 size_t offset;
 float * data,  // input from host
  *h_out,  // host space for output
  *devMPPtr, // malloc Pitch ptr
  *devMPtr; // malloc ptr

 size_t pitch;

 // Allocate space on the host
 data=(float *)malloc(sizeof(float)*memsize);
 h_out=(float *)malloc(sizeof(float)*memsize);


// Define data
for (int i = 0; i <  height; i++)
 for (int j=0; j < width; j++)
  data[i*width+j]=float(j);

// Define the grid
dim3 grid((int)(width/BLOCKSIZE)+1,(int)(height/BLOCKSIZE)+1), threads(BLOCKSIZE,BLOCKSIZE);

// allocate Malloc Pitch
cudaMallocPitch((void**)&devMPPtr,&pitch, width * sizeof(float), height);

// Print the pitch
printf("The pitch is %d 
",pitch/sizeof(float));

// Texture Channel Description
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat);

// Bind texture to pitch mem:
cudaBindTexture2D(&offset,&texRefEx,devMPPtr,&channelDesc,width,height,pitch);
cout << "My Description x is " << channelDesc.x << endl;
cout << "My Description y is " << channelDesc.y << endl;
cout << "My Description z is " << channelDesc.z << endl;
cout << "My Description w is " << channelDesc.w << endl;
cout << "My Description kind is " << channelDesc.f << endl;
cout << "Offset is " << offset << endl;

// Set mutable properties:
texRefEx.normalized=true;
texRefEx.addressMode[0]=cudaAddressModeWrap;
texRefEx.addressMode[1]=cudaAddressModeWrap;
texRefEx.filterMode= cudaFilterModePoint;

// Allocate cudaMalloc memory
cudaMalloc((void**)&devMPtr,memsize*sizeof(float));

// Read data from host to device
cudaMemcpy2D((void*)devMPPtr,pitch,(void*)data,sizeof(float)*width,
  sizeof(float)*width,height,cudaMemcpyHostToDevice);

//Read back and check this memory
cudaMemcpy2D((void*)h_out,width*sizeof(float),(void*)devMPPtr,pitch,
  sizeof(float)*width,height,cudaMemcpyDeviceToHost);

// Print the memory
 for (int i=0; i<height; i++){
  for (int j=0; j<width; j++){
   printf("%2.2f ",h_out[i*width+j]);
  }
 cout << endl;
 }

 cout << "Done" << endl;
// Memory is fine... 

kernel_w_textures<<<grid,threads>>>(devMPPtr, devMPtr, pitch);

// Copy back data to host
cudaMemcpy((void*)h_out,(void*)devMPtr,width*height*sizeof(float),cudaMemcpyDeviceToHost);


// Print the Result
 cout << endl;
 for (int i=0; i<height; i++){
  for (int j=0; j<width; j++){
   printf("%2.2f ",h_out[i*width+j]);
  }
 cout << endl;
 }
 cout << "Done" << endl;

return(0);
}

Edit October 17: So I still haven't found a solution to this issue. Nvidia is pretty silent on this seems that the world is too. I found a workaround using shared mem but if anyone has a texture solution I would be very please.

Edit Octoboer 26: Still no soltuion, but still interested in one if anyone knows.

Edit July 26: Wow it has been 9 months - and I had overlooked the correct answer the whole time. The trick was:

if ( idx < width  && idy < height){//.... code }

As had been pointed out before. Thanks to all of those who contributed!

tkerwin · Accepted Answer

 // Texutre Coordinates
 float u=(idx + 0.5)/float(width);
 float v=(idy + 0.5)/float(height);

You need an offset to get to the center of the texel. I think there might have been some rounding error for your non-multiple of 16 textures. I tried this and it worked for me (both outputs were identical).

user1668176 · Answer

I think:

 float u=(idx)/float(width);
 float v=(idy)/float(height);

should be

 float u=(idx+0.5f)/float(width);
 float v=(idy+0.5f)/float(height);

In order the get identical input/output, otherwise the second column of output equals the first column of input rather than second and the second last column of output is also wrong.

Please correct me if you have different observation.

How to read successfully from a 2D texture

Tags:

c++

cuda

textures

Marm0t

2 Answers

tkerwin

user1668176

Recent Activity

Donate For Us

How to read successfully from a 2D texture

Tags:

c++

cuda

textures

Marm0t

2 Answers

tkerwin

user1668176

Related questions

Recent Activity

Donate For Us