GPU code running slower than CPU version

Tags:

I am working on an application which divides a string into pieces and assigns each to a block. Within each block the the text is scanned character by character and a shared array of int, D is to be updated by different threads in parallel based on the character read. At the end of each iteration the last element of D is checked, and if it satisfied the condition, a global int array m is set to 1 at the position corresponding to the text. This code was executed on a NVIDIA GEForce Fermi 550, and runs even slower than the CPU version. I have just included the kernel here:

__global__ void match(uint32_t* BB_d,const char* text_d,int n, int m,int k,int J,int lc,int start_addr,int tBlockSize,int overlap ,int* matched){
    __shared__ int D[MAX_THREADS+2];
    __shared__ char Text_S[MAX_PATTERN_SIZE];
    __shared__ int DNew[MAX_THREADS+2];
    __shared__ int BB_S[4][MAX_THREADS];
    int w=threadIdx.x+1;

    for(int i=0;i<4;i++)
    {
        BB_S[i][threadIdx.x]= BB_d[i*J+threadIdx.x];
    }

    {
        D[threadIdx.x] = 0;
        {
            D[w] = (1<<(k+1)) -1;

            for(int i = 0; i < lc - 1; i++)
            {
                D[w] = (D[w] << k+2) + (1<<(k+1)) -1;
            }
        }
        D[J+1] = (1<<((k+2)*lc)) - 1;
    }
    int startblock=(blockIdx.x == 0?start_addr:(start_addr+(blockIdx.x * (tBlockSize-overlap))));
    int size= (((startblock + tBlockSize) > n )? ((n- (startblock))):( tBlockSize));

    int copyBlock=(size/J)+ ((size%J)==0?0:1);
    if((threadIdx.x * copyBlock) <= size)
        memcpy(Text_S+(threadIdx.x*copyBlock),text_d+(startblock+threadIdx.x*copyBlock),(((((threadIdx.x*copyBlock))+copyBlock) > size)?(size-(threadIdx.x*copyBlock)):copyBlock));
    memcpy(DNew, D, (J+2)*sizeof(int));
    __syncthreads();
    uint32_t initial = D[1];
    uint32_t x;
    uint32_t mask = 1;
    for(int i = 0; i < lc - 1; i++)mask = (mask<<(k+2)) + 1;
    for(int i = 0; i < size;i++)
    {
        {
            x =  ((D[w] >> (k+2)) | (D[w - 1] << ((k + 2)* (lc - 1))) | (BB_S[(((int)Text_S[i])/2)%4][w-1])) & ((1 << (k + 2)* lc) - 1);
            DNew[w] = ((D[w]<<1) | mask)
                & (((D[w] << k+3) | mask|((D[w +1] >>((k+2)*(lc - 1)))<<1)))
                & (((x + mask) ^ x) >> 1)
                & initial;
        }
        __syncthreads();
        memcpy(D, DNew, (J+2)*sizeof(int));
        if(!(D[J] & 1<<(k + (k + 2)*(lc*J -m + k ))))
        {
            matched[startblock+i] = 1;
            D[J] |= ((1<<(k + 1 + (k + 2)*(lc*J -m + k ))) - 1);
        }
    }
}

I am not very familiar with CUDA so I dont quite understand issues such as shared memory bank conflicts. Could that be the bottleneck here?

As asked, this is the code where I launch the kernels:

#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#define uint32_t unsigned int
#define MAX_THREADS 512
#define MAX_PATTERN_SIZE 1024
#define MAX_BLOCKS 8
#define MAX_STREAMS 16
#define TEXT_MAX_LENGTH 1000000000
void calculateBBArray(uint32_t** BB,const char* pattern_h,int m,int k , int lc , int J){};
void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err) 
    {   
            fprintf(stderr, "Cuda error: %s: %s.\n", msg, 
                            cudaGetErrorString( err) );
            exit(EXIT_FAILURE);
    }    
}
char* getTextString() {
 FILE *input, *output;
 char c;
 char * inputbuffer=(char *)malloc(sizeof(char)*TEXT_MAX_LENGTH);

int numchars = 0, index  = 0;

input = fopen("sequence.fasta", "r");
c = fgetc(input);
while(c != EOF)
{
inputbuffer[numchars] = c;
numchars++;
c = fgetc(input);
}
fclose(input);
inputbuffer[numchars] = '\0'; 
return inputbuffer;
}

int main(void) {
const char pattern_h[] = "TACACGAGGAGAGGAGAAGAACAACGCGACAGCAGCAGACTTTTTTTTTTTTACAC";
char * text_h=getTextString();  //reading text from file, supported upto 200MB currently

int k = 13;
int i;
int count=0;
char *pattern_d, *text_d;     // pointers to device memory
char* text_new_d;
int* matched_d;
int* matched_new_d;
uint32_t* BB_d;
uint32_t* BB_new_d;
int* matched_h = (int*)malloc(sizeof(int)* strlen(text_h));
cudaMalloc((void **) &pattern_d, sizeof(char)*strlen(pattern_h)+1);
cudaMalloc((void **) &text_d, sizeof(char)*strlen(text_h)+1);
cudaMalloc((void **) &matched_d, sizeof(int)*strlen(text_h));
cudaMemcpy(pattern_d, pattern_h, sizeof(char)*strlen(pattern_h)+1, cudaMemcpyHostToDevice);
cudaMemcpy(text_d, text_h, sizeof(char)*strlen(text_h)+1, cudaMemcpyHostToDevice);
cudaMemset(matched_d, 0,sizeof(int)*strlen(text_h));

int m = strlen(pattern_h);
int n = strlen(text_h);

uint32_t* BB_h[4];
    unsigned int maxLc = ((((m-k)*(k+2)) > (31))?(31/(k+2)):(m-k));
unsigned int lc=2;   // Determines the number of threads per block
    // can be varied upto maxLc for tuning performance
if(lc>maxLc)
{
    exit(0);
}
unsigned int noWordorNfa =((m-k)/lc) + (((m-k)%lc)  == 0?0:1);
cudaMalloc((void **) &BB_d, sizeof(int)*noWordorNfa*4);
if(noWordorNfa >= MAX_THREADS)
{
    printf("Error: max threads\n");
    exit(0);
}

calculateBBArray(BB_h,pattern_h,m,k,lc,noWordorNfa);  // not included this function

for(i=0;i<4;i++)
{
    cudaMemcpy(BB_d+ i*noWordorNfa, BB_h[i], sizeof(int)*noWordorNfa, cudaMemcpyHostToDevice);
}
int overlap=m;
int textBlockSize=(((m+k+1)>n)?n:(m+k+1));
cudaStream_t stream[MAX_STREAMS];
for(i=0;i<MAX_STREAMS;i++) {
    cudaStreamCreate( &stream[i] );
    }

int start_addr=0,index=0,maxNoBlocks=0;
if(textBlockSize>n)
{
    maxNoBlocks=1;
}
else
{
     maxNoBlocks=((1 + ((n-textBlockSize)/(textBlockSize-overlap)) + (((n-textBlockSize)%(textBlockSize-overlap)) == 0?0:1)));
}
int kernelBlocks = ((maxNoBlocks > MAX_BLOCKS)?MAX_BLOCKS:maxNoBlocks);
int blocksRemaining =maxNoBlocks;
printf(" maxNoBlocks %d kernel Blocks %d \n",maxNoBlocks,kernelBlocks);
while(blocksRemaining >0)
{
kernelBlocks = ((blocksRemaining > MAX_BLOCKS)?MAX_BLOCKS:blocksRemaining);
printf(" Calling %d Blocks with starting Address %d , textBlockSize %d \n",kernelBlocks,start_addr,textBlockSize);
match<<<kernelBlocks,noWordorNfa,0,stream[(index++)%MAX_STREAMS]>>>(BB_d,text_d,n,m,k,noWordorNfa,lc,start_addr,textBlockSize,overlap,matched_d);
start_addr+=kernelBlocks*(textBlockSize-overlap);;
blocksRemaining -= kernelBlocks;
}
cudaMemcpy(matched_h, matched_d, sizeof(int)*strlen(text_h), cudaMemcpyDeviceToHost);
checkCUDAError("Matched Function");
for(i=0;i<MAX_STREAMS;i++)
    cudaStreamSynchronize( stream[i] ); 
    // do stuff with matched
    // ....
    // ....
free(matched_h);cudaFree(pattern_d);cudaFree(text_d);cudaFree(matched_d);
    return 0;

}

Number of threads launched per block depends upon the length pattern_h(could be at most maxLc above). I expect it to be around 30 in this case. Shoudn't that be enough to see a good amount of concurrency? As for blocks, I see no point in launching more than MAX_BLOCKS (=10) at a time since the hardware can schedule only 8 simultaneously

NOTE: I don't have GUI access.

735

asked Nov 27 '12 20:11

Bug Killer

3 Answers

With all the shared memory you're using, you could be running into bank conflicts if consecutive threads are not reading from consecutive addresses in the shared arrays ... that could cause serialization of the memory accesses, which in turn will kill the parallel performance of your algorithm.

196

answered Sep 16 '22 21:09

Jason

I breifly looked at your code but it looks like your sending data to the gpu back and forth creating a bottle neck on the bus? did you try profiling it?

answered Sep 16 '22 21:09

pyCthon

I found that I was copying the whole array Dnew to D in each thread rather than copying only the portion each thread was supposed to update D[w]. This would cause the threads to execute serially, although I don't know if it could be called a shared memory bank conflict. Now it gives 8-9x speedup for large enough patterns(=more threads). This is much less than what I expected. I will try to increase number of blocks as suggested. I dont know how to increase the # of threads

answered Sep 19 '22 21:09

Bug Killer

Related questions
                            
                                use va_list from another thread
                            
                                Override a function call in C from the same compilation unit
                            
                                Algorithm for ASTC software texture compression/decompression [closed]
                            
                                Dealing with C library anonymous struct types in C++
                            
                                Making a C library thread safe
                            
                                Add all elements in a lane
                            
                                Correct cyclic garbage-collection in extension modules
                            
                                How can I add a thousands separator to a double in C on Windows?
                            
                                How to compare more than two numbers in parallel?
                            
                                cropping IplImage most effectively
                            
                                atomic compare(not equal) and swap
                            
                                Send multiple files via HTTP POST with libcurl
                            
                                "##" in printk, what does ## mean
                            
                                Elegant way to add/remove descriptors to/from poll
                            
                                C - Pros/Cons of Enum-Indexed Arrays [closed]
                            
                                freeing memory in Circular Doubly Linked List
                            
                                What does ovly_debug_event do in chrome?
                            
                                Open a fullscreen window on 2nd monitor with SFML?
                            
                                MD5 HMAC With OpenSSL
                            
                                kernel module: Accessing member of structure defined in another module header

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

GPU code running slower than CPU version

Tags:

c

cuda

gpgpu

Bug Killer

People also ask

3 Answers

Jason

pyCthon

Bug Killer

Recent Activity

Donate For Us