Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to parse files that cannot fit entirely in memory RAM

I have created a framework to parse text files of reasonable size that can fit in memory RAM, and for now, things are going well. I have no complaints, however what if I encountered a situation where I have to deal with large files, say, greater than 8GB(which is the size of mine)? What would be an efficient approach to deal with such large files?

My framework:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

int Parse(const char *filename,
    const char *outputfile);

int main(void)
{
    clock_t t1 = clock();
    /* ............................................................................................................................. */
    Parse("file.txt", NULL);
    /* ............................................................................................................................. */
    clock_t t2 = clock();
    fprintf(stderr, "time elapsed: %.4f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);
    fprintf(stderr, "Press any key to continue . . . ");
    getchar();
    return 0;
}

long GetFileSize(FILE * fp)
{
    long f_size;
    fseek(fp, 0L, SEEK_END);
    f_size = ftell(fp);
    fseek(fp, 0L, SEEK_SET);
    return f_size;
}

char *dump_file_to_array(FILE *fp,
    size_t f_size)
{
    char *buf = (char *)calloc(f_size + 1, 1);
    if (buf) {
        size_t n = 0;
        while (fgets(buf + n, INT_MAX, fp)) {
            n += strlen(buf + n);
        }
    }
    return buf;
}

int Parse(const char *filename,
    const char *outputfile)
{
    /* open file for reading in text mode */
    FILE *fp = fopen(filename, "r");
    if (!fp) {
        perror(filename);
        return 1;
    }
    /* store file in dynamic memory and close file */
    size_t f_size = GetFileSize(fp);
    char *buf = dump_file_to_array(fp, f_size);
    fclose(fp);
    if (!buf) {
        fputs("error: memory allocation failed.\n", stderr);
        return 2;
    }
    /* state machine variables */
    // ........

    /* array index variables */
    size_t x = 0;
    size_t y = 0;
    /* main loop */
    while (buf[x]) {
        switch (buf[x]) {
            /* ... */
        }
        x++;
    }
    /* NUL-terminate array at y */
    buf[y] = '\0';
    /* write buffer to file and clean up */
    outputfile ? fp = fopen(outputfile, "w") :
                 fp = fopen(filename, "w");
    if (!fp) {
        outputfile ? perror(outputfile) :
                     perror(filename);
    }
    else {
        fputs(buf, fp);
        fclose(fp);
    }
    free(buf);
    return 0;
}

Pattern deletion function based on the framework:

int delete_pattern_in_file(const char *filename,
    const char *pattern, const char *outputfile)
{
    /* open file for reading in text mode */
    FILE *fp = fopen(filename, "r");
    if (!fp) {
        perror(filename);
        return 1;
    }
    /* copy file contents to buffer and close file */
    size_t f_size = GetFileSize(fp);
    char *buf = dump_file_to_array(fp, f_size);
    fclose(fp);
    if (!buf) {
        fputs("error - memory allocation failed", stderr);
        return 2;
    }
    /* delete first match */
    size_t n = 0, pattern_len = strlen(pattern);
    char *tmp, *ptr = strstr(buf, pattern);
    if (!ptr) {
        fputs("No match found.\n", stderr);
        free(buf);
        return -1;
    }
    else {
        n = ptr - buf;
        ptr += pattern_len;
        tmp = ptr;
    }
    /* delete the rest */
    while (ptr = strstr(ptr, pattern)) {
        while (tmp < ptr) {
            buf[n++] = *tmp++;
        }
        ptr += pattern_len;
        tmp = ptr;
    }
    /* copy the rest of the buffer */
    strcpy(buf + n, tmp);
    /* open file for writing and print the processed buffer to it */
    outputfile ? fp = fopen(outputfile, "w") :
                 fp = fopen(filename, "w");
    if (!fp) {
        outputfile ? perror(outputfile) :
                     perror(filename);
    }
    else {
        fputs(buf, fp);
        fclose(fp);
    }
    free(buf);
    return 0;
}
like image 565
machine_1 Avatar asked Dec 21 '16 09:12

machine_1


3 Answers

If you wish to stick with your current design, an option might be to mmap() the file instead of reading it into a memory buffer.

You could change the function dump_file_to_array to the following (linux-specific):

char *dump_file_to_array(FILE *fp, size_t f_size) {
   buf = mmap(NULL, f_size, PROT_READ, MAP_SHARED, fileno(fp), 0);
   if (buf == MAP_FAILED)
       return NULL;
   return buf;
}

Now you can read over the file, the memory manager will take automatically care to only hold the relevant potions of the file in memory. For Windows, similar mechanisms exist.

like image 69
Ctx Avatar answered Nov 06 '22 20:11

Ctx


Chances you are parsing the file line-by line. So read in a large block (4k or 16k) and parse all the lines in that. Copy the small remainder to the beginning of the 4k or 16k buffer and read in the rest of the buffer. Rinse and repeat.

For JSON or XML you will need an event based parser that can accept multiple blocks or input.

like image 2
doron Avatar answered Nov 06 '22 21:11

doron


There are multiple issues with your approach.

The concept of maximum and available memory are not so evident: technically, you are not limited by the RAM size, but by the quantity of memory your environment will let you allocate and use for your program. This depends on various factors:

  • What ABI you compile for: the maximum memory size accessible to your program is limited to less than 4 GB if you compile for 32-bit code, even if your system has more RAM than that.
  • What quota the system is configured to let your program use. This may be less than available memory.
  • What strategy the system uses when more memory is requested than is physically available: most modern systems use virtual memory and share physical memory between processes and system tasks (such as the disk cache) using very advanced algorithms that cannot be describe in a few lines. It is possible on some systems for your program to allocate and use more memory than is physically installed on the motherboard, swapping memory pages to disk as more memory is accessed, at a huge cost in lag time.

There are further issues in your code:

  • The type long might be too small to hold the size of the file: on Windows systems, long is 32-bit even on 64-bit versions where memory can be allocated in chunks larger than 2GB. You must use different API to request the file size from the system.
  • You read the file with an series of calls to fgets(). This is inefficient, a single call to fread() would suffice. Furthermore, if the file contains embedded null bytes ('\0' characters), chunks from the file will be missing in memory. However you could not deal with embedded null bytes if you use string functions such as strstr() and strcpy() to handle your string deletion task.
  • the condition in while (ptr = strstr(ptr, pattern)) is an assignment. While not strictly incorrect, it is poor style as it confuses readers of your code and prevents life saving warnings by the compiler where such assignment-conditions are coding errors. You might think that could never happen, but anyone can make a typo and a missing = in a test is difficult to spot and has dire consequences.
  • you short-hand use of the ternary operator in place of if statements is quite confusing too: outputfile ? fp = fopen(outputfile, "w") : fp = fopen(filename, "w");
  • rewriting the input file in place is risky too: if anything goes wrong, the input file will be lost.

Note that you can implement the filtering on the fly, without a buffer, albeit inefficiently:

#include <stdio.h>
#include <string.h>

int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "usage: delpat PATTERN < inputfile > outputfile\n");
        return 1;
    }
    unsigned char *pattern = (unsigned char*)argv[1];
    size_t i, j, n = strlen(argv[1]);
    size_t skip[n + 1];
    int c;

    skip[0] = 0;
    for (i = j = 1; i < n; i++) {
        while (memcmp(pattern, pattern + j, i - j)) {
            j++;
        }
        skip[i] = j;
    }

    i = 0;
    while ((c = getchar()) != EOF) {
        for (;;) {
            if (i < n && c == pattern[i]) {
                if (++i == n) {
                    i = 0; /* match found, consumed */
                }
                break;
            }
            if (i == 0) {
                putchar(c);
                break;
            }
            for (j = 0; j < skip[i]; j++) {
                putchar(pattern[j]);
            }
            i -= skip[i];
        }
    }
    for (j = 0; j < i; j++) {
        putchar(pattern[j]);
    }
    return 0;
}
like image 2
chqrlie Avatar answered Nov 06 '22 20:11

chqrlie