Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Using realloc to expand buffer while reading from file crashes

Tags:

c

realloc

fasta

I am writing some code that needs to read fasta files, so part of my code (included below) is a fasta parser. As a single sequence can span multiple lines in the fasta format, I need to concatenate multiple successive lines read from the file into a single string. I do this, by realloc'ing the string buffer after reading every line, to be the current length of the sequence plus the length of the line read in. I do some other stuff, like stripping white space etc. All goes well for the first sequence, but fasta files can contain multiple sequences. So similarly, I have a dynamic array of structs with a two strings (title, and actual sequence), being "char *". Again, as I encounter a new title (introduced by a line beginning with '>') I increment the number of sequences, and realloc the sequence list buffer. The realloc segfaults on allocating space for the second sequence with

*** glibc detected *** ./stackoverflow: malloc(): memory corruption: 0x09fd9210 ***
Aborted

For the life of me I can't see why. I've run it through gdb and everything seems to be working (i.e. everything is initialised, the values seems sane)... Here's the code:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include <errno.h>

//a struture to keep a record of sequences read in from file, and their titles
typedef struct {
    char *title;
    char *sequence;
} sequence_rec;

//string convenience functions

//checks whether a string consists entirely of white space
int empty(const char *s) {
    int i;
    i = 0;
    while (s[i] != 0) {
        if (!isspace(s[i])) return 0;
        i++;
    }
    return 1;
}

//substr allocates and returns a new string which is a substring of s from i to
//j exclusive, where i < j; If i or j are negative they refer to distance from
//the end of the s
char *substr(const char *s, int i, int j) {
    char *ret;
    if (i < 0) i = strlen(s)-i;
    if (j < 0) j = strlen(s)-j;
    ret = malloc(j-i+1);
    strncpy(ret,s,j-i);
    return ret;
}

//strips white space from either end of the string
void strip(char **s) {
    int i, j, len;
    char *tmp = *s;
    len = strlen(*s);
    i = 0;
    while ((isspace(*(*s+i)))&&(i < len)) {
        i++;
    }
    j = strlen(*s)-1;
    while ((isspace(*(*s+j)))&&(j > 0)) {
        j--;
    }
    *s = strndup(*s+i, j-i);
    free(tmp);
}


int main(int argc, char**argv) {
    sequence_rec *sequences = NULL;
    FILE *f = NULL;
    char *line = NULL;
    size_t linelen;
    int rcount;
    int numsequences = 0;

    f = fopen(argv[1], "r");
    if (f == NULL) {
        fprintf(stderr, "Error opening %s: %s\n", argv[1], strerror(errno));
        return EXIT_FAILURE;
    }
    rcount = getline(&line, &linelen, f);
    while (rcount != -1) {
        while (empty(line)) rcount = getline(&line, &linelen, f);
        if (line[0] != '>') {
            fprintf(stderr,"Sequence input not in valid fasta format\n");
            return EXIT_FAILURE;
        }

        numsequences++;
        sequences = realloc(sequences,sizeof(sequence_rec)*numsequences);
        sequences[numsequences-1].title = strdup(line+1); strip(&sequences[numsequences-1].title);
        rcount = getline(&line, &linelen, f);
        sequences[numsequences-1].sequence = malloc(1); sequences[numsequences-1].sequence[0] = 0;
        while ((!empty(line))&&(line[0] != '>')) {
            strip(&line);
            sequences[numsequences-1].sequence = realloc(sequences[numsequences-1].sequence, strlen(sequences[numsequences-1].sequence)+strlen(line)+1);
            strcat(sequences[numsequences-1].sequence,line);
            rcount = getline(&line, &linelen, f);
        }
    }
    return EXIT_SUCCESS;
}
like image 418
sirlark Avatar asked Jan 23 '12 14:01

sirlark


1 Answers

You should use strings that look something like this:

struct string {
    int len;
    char *ptr;
};

This prevents strncpy bugs like what it seems you saw, and allows you to do strcat and friends faster.

You should also use a doubling array for each string. This prevents too many allocations and memcpys. Something like this:

int sstrcat(struct string *a, struct string *b)
{
    int len = a->len + b->len;
    int alen = a->len;
    if (a->len < len) {
        while (a->len < len) {
            a->len *= 2;
        }
        a->ptr = realloc(a->ptr, a->len);
        if (a->ptr == NULL) {
            return ENOMEM;
        }
    }
    memcpy(&a->ptr[alen], b->ptr, b->len);
    return 0;
}

I now see you are doing bioinformatics, which means you probably need more performance than I thought. You should use strings like this instead:

struct string {
    int len;
    char ptr[0];
};

This way, when you allocate a string object, you call malloc(sizeof(struct string) + len) and avoid a second call to malloc. It's a little more work but it should help measurably, in terms of speed and also memory fragmentation.

Finally, if this isn't actually the source of error, it looks like you have some corruption. Valgrind should help you detect it if gdb fails.

like image 135
leif Avatar answered Nov 15 '22 05:11

leif