Grepping a 20g file in bash

Question

Question about code performance: I'm trying to run ~25 regex rules against a ~20 GB text file. The script should write the matches to text files, with each regex rule producing its own output file. See the pseudocode below:

regex_rules=~/Documents/rulesfiles/regexrulefile.txt
for tmp in *.unique20gbfile.suffix; do
    # Each $line in the looped-through file contains a regex rule, e.g.,
    # egrep -i '(^| )justin ?bieber|(^| )selena ?gomez'
    # $rname is a unique rule name generated by a separate bash function
    # exported to the current shell.
    #
    # IFS= and -r hand the rule over untouched: a plain `read` strips
    # backslashes (so '\.' would silently become '.') and trims whitespace.
    while IFS= read -r line; do
        cmd="$line $tmp > ~/outputdir/$tmp.$rname.filter.piped &"
        # Quote the command string: unquoted, the shell would word-split and
        # glob-expand it first, so a '?' or '*' inside a regex could be
        # replaced by matching file names before eval ever parses it.
        eval "$cmd"
    done < "$regex_rules"
done

Couple thoughts:

Is there a way to loop the text file just once, evaluating all rules and splitting to individual files in one go? Would this be faster?
Is there a different tool I should be using for this job?

Thanks.

Alan Curry · Accepted Answer

This is the reason grep has a -f option. Reduce your regexrulefile.txt to just the regexps, one per line, and run

egrep -f regexrulefile.txt the_big_file

This produces all the matches in a single output stream, but you can do your loop thing on it afterward to separate them out. Assuming the combined list of matches isn't huge, this will be a performance win.

Spencer Rathbun · Answer

I did something similar with lex. Of course, it runs every other day, so YMMV. It is very fast, even on files of several hundred megabytes on a remote Windows share; it takes only a few seconds to process them. I don't know how comfortable you are hacking up a quick C program, but I've found this to be the fastest, easiest solution for large-scale regex problems.

Parts redacted to protect the guilty:

    /************************************************** 
        start of definitions section

    ***************************************************/


%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <getopt.h>
#include <errno.h>

/* Input name as given on the command line, or a placeholder when reading stdin. */
char inputName[256];
// static insert variables

/*
 * Number of records written to the "unknown" output file. The scanner rules
 * increment it, but no declaration was visible in this file, so it is
 * declared here to let the generated scanner compile.
 */
static unsigned long unknownCount = 0;

//other variables
char tempString[256];        /* not referenced by the code visible here */
char myHolder[256];          /* text of the record currently being scanned */
char fileName[256];          /* copy of inputName used to derive output names */
char unknownFileName[256];   /* path of the "unknown" output file */
char stuffFileName[256];     /* path of the "stuff" output file */
char buffer[5];              /* not referenced by the code visible here */

/* we are using pointers to hold the file locations, and allow us to dynamically open and close new files */
/* also, it allows us to obfuscate which file we are writing to, otherwise this couldn't be done */

FILE *yyTemp;      /* output the current record will be written to */
FILE *yyUnknown;   /* default output for records no rule has claimed */
FILE *yyStuff;     /* output for records matched by the "stuff" rule */

// flags for command line options
static int help_flag = 0;

%}

%option 8bit 
%option nounput nomain noyywrap 
%option warn

%%
    /************************************************ 
        start of rules section

        Each input record is buffered in myHolder one match at a time.
        yyTemp names the file the record will go to: it starts out as
        yyUnknown and is switched when a classifying pattern is seen.
        At the end of the record the buffer is written to yyTemp and
        the state is reset for the next record.

        Appends are bounded by the size of myHolder, so an over-long
        record is truncated instead of overrunning the buffer.
    *************************************************/


(\"A\",\"(1330|1005|1410|1170)\") {
    /* stuff files: a record containing this field pair goes to yyStuff */
    strncat(myHolder, yytext, sizeof(myHolder) - strlen(myHolder) - 1);
    yyTemp = yyStuff;
}

. {
    /* any other character is only buffered until the record ends */
    strncat(myHolder, yytext, sizeof(myHolder) - strlen(myHolder) - 1);
}

\n {
    /* end of record: count it if unclaimed, then write it out */
    if (yyTemp == yyUnknown)
        unknownCount += 1;
    strncat(myHolder, yytext, sizeof(myHolder) - strlen(myHolder) - 1);
    /* print to file we are pointing at, whatever it is */
    fputs(myHolder, yyTemp);
    myHolder[0] = '\0';
    yyTemp = yyUnknown;
}

<<EOF>> {
    /* flush a final record that has no trailing newline, then stop */
    strncat(myHolder, yytext, sizeof(myHolder) - strlen(myHolder) - 1);
    fputs(myHolder, yyTemp);
    myHolder[0] = '\0';
    yyTemp = yyUnknown;

    yyterminate();
}

%%
    /**************************************************** 
        start of code section


    *****************************************************/


int main(int argc, char **argv);

int main (argc,argv)
int argc;
char **argv;
{
    /****************************************************
        The main method drives the program. It gets the filename from the
        command line, and opens the initial files to write to. Then it calls the lexer.
        After the lexer returns, the main method finishes out the report file,
        closes all of the open files, and prints out to the command line to let the
        user know it is finished.
    ****************************************************/

    int c;

    // the gnu getopt library is used to parse the command line for flags
    // afterwards, the final option is assumed to be the input file

    while (1) {
        static struct option long_options[] = {
            /* These options set a flag. */
            {"help",   no_argument,     &help_flag, 1},
            /* These options don't set a flag. We distinguish them by their indices. */
            {0, 0, 0, 0}
        };
           /* getopt_long stores the option index here. */
        int option_index = 0;
        c = getopt_long (argc, argv, "h",
            long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c) {
            case 0:
               /* If this option set a flag, do nothing else now. */
               if (long_options[option_index].flag != 0)
                 break;
               printf ("option %s", long_options[option_index].name);
               if (optarg)
                 printf (" with arg %s", optarg);
               printf ("
");
               break;

            case 'h':
                help_flag = 1;
                break;

            case '?':
               /* getopt_long already printed an error message. */
               break;

            default:
               abort ();
            }
    }

    if (help_flag == 1) {
        printf("proper syntax is: yourProgram.exe [OPTIONS]... INFILE
");
        printf("splits csv file into multiple files")
        printf("Option list: 
");
        printf("--help                  print help to screen
");
        printf("
");
        return 0;
    }

    //get the filename off the command line and redirect it to input
    //if there is no filename then use stdin

    if (optind < argc) {
        FILE *file;

        file = fopen(argv[optind], "r");
        if (!file) {
            fprintf (stderr, "%s: Couldn't open file %s; %s
", argv[0], argv[optind], strerror (errno));
            exit(errno);
        }
        yyin = file;
        strcpy(inputName, argv[optind]);
    }
    else {
        printf("no input file set, using stdin. Press ctrl-c to quit");
        yyin = stdin;
        strcpy(inputName, "\b\b\b\b\bagainst stdin");
    }

    //set up initial file names

    strcpy(fileName, inputName);
    strncpy(unknownFileName, fileName, strlen(fileName)-4);
    strncpy(stuffFileName, fileName, strlen(fileName)-4);

    strcat(unknownFileName, "_UNKNOWN_1.csv");
    strcat(stuffFileName, "_STUFF_1.csv");

    //open files for writing

    yyout = stdout;
    yyTemp = malloc(sizeof(FILE));
    yyUnknown = fopen(unknownFileName,"w");
    yyTemp = &(*yyUnknown);

    yyStuff = fopen(stuffFileName,"w");

    yylex();

    //close open files

    fclose(yyUnknown);

    printf("Lexer finished running %s",fileName);

    return 0;

}

To build this flex program, have flex installed, and use this makefile (adjust the paths):

# Builds the flex scanner. The default target cross-compiles a Windows
# executable with the MinGW compiler; the `linux` target builds a native
# test binary with gcc. Recipe lines must start with a tab character.
TARGET = project.exe
TESTBUILD = project
LEX = flex
LFLAGS = -Cf
CC = i586-mingw32msvc-gcc
CFLAGS = -O -Wall
INSTALLDIR = /mnt/J/Systems/executables

.PHONY: default all clean install uninstall cleanall linux

default: $(TARGET)

all: default install

# Despite the name, OBJECTS lists the C sources flex generates, one per .l file.
OBJECTS = $(patsubst %.l, %.c, $(wildcard *.l))

%.c: %.l
	$(LEX) $(LFLAGS) -o $@ $<

.PRECIOUS: $(TARGET) $(OBJECTS)

$(TARGET): $(OBJECTS)
	$(CC) $(OBJECTS) $(CFLAGS) -o $@

linux: $(OBJECTS)
	gcc $(OBJECTS) $(CFLAGS) -lm -g -o $(TESTBUILD)

cleanall: clean uninstall

# Remove only the generated sources so hand-written .c files survive a clean.
clean:
	-rm -f $(OBJECTS)
	-rm -f $(TARGET)
	-rm -f $(TESTBUILD)

uninstall:
	-rm -f $(INSTALLDIR)/$(TARGET)

# Depend on the target so `make install` never copies a stale or missing binary.
install: $(TARGET)
	cp -f $(TARGET) $(INSTALLDIR)

Grepping a 20g file in bash

Tags:

performance

regex

grep

bash

unix

Zack

2 Answers

Alan Curry

Spencer Rathbun

Recent Activity

Donate For Us

Grepping a 20g file in bash

Tags:

performance

regex

grep

bash

unix

Zack

2 Answers

Alan Curry

Spencer Rathbun

Related questions

Recent Activity

Donate For Us