Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Grepping a 20g file in bash

Question about code performance: I'm trying to run ~25 regex rules against a ~20g text file. The script should output matches to text files; each regex rule generates its own file. See the pseudocode below:

regex_rules=~/Documents/rulesfiles/regexrulefile.txt
for tmp in *.unique20gbfile.suffix; do
    # Each $line in the looped-through file contains a regex rule, e.g.,
    # egrep -i '(^| )justin ?bieber|(^| )selena ?gomez'
    # $rname is a unique rule name generated by a separate bash function
    # exported to the current shell.
    # IFS= and -r keep leading whitespace and backslashes in each rule intact.
    while IFS= read -r line; do
        cmd="$line $tmp > ~/outputdir/$tmp.$rname.filter.piped &"
        # Quote the expansion so the regex is not word-split or glob-expanded
        # by the shell before eval re-parses the command.
        eval "$cmd"
    done < "$regex_rules"
done

Couple thoughts:

  • Is there a way to loop through the text file just once, evaluating all rules and splitting to individual files in one go? Would this be faster?

  • Is there a different tool I should be using for this job?

Thanks.

like image 945
Zack Avatar asked Mar 16 '26 02:03

Zack


2 Answers

This is the reason grep has a -f option. Reduce your regexrulefile.txt to just the regexps, one per line, and run

egrep -f regexrulefile.txt the_big_file

This produces all the matches in a single output stream, but you can do your loop thing on it afterward to separate them out. Assuming the combined list of matches isn't huge, this will be a performance win.

like image 118
Alan Curry Avatar answered Mar 17 '26 20:03

Alan Curry


I did something similar with lex. Of course, it runs every other day, so YMMV. It is very fast, even on files of several hundred megabytes on a remote Windows share; it takes only a few seconds to process them. I don't know how comfortable you are hacking up a quick C program, but I've found this to be the fastest, easiest solution for large-scale regex problems.

Parts redacted to protect the guilty:

    /************************************************** 
        start of definitions section

    ***************************************************/


%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <getopt.h>
#include <errno.h>

/* Name of the input file; filled in by main(). */
char inputName[256];
// static insert variables

//other variables
char tempString[256];
/* Text of the current input line, accumulated by the rules until its newline is seen. */
char myHolder[256];
/* Copy of the input name; main() derives the output file names from it. */
char fileName[256];
char unknownFileName[256];
char stuffFileName[256];
char buffer[5];

/* Number of lines written to the "unknown" output; incremented by the newline rule. */
int unknownCount = 0;

/* we are using pointers to hold the file locations, and allow us to dynamically open and close new files */
/* also, it allows us to obfuscate which file we are writing to, otherwise this couldn't be done */

/* Output the current line will be written to; the rules point it at yyStuff or yyUnknown. */
FILE *yyTemp;
/* Default output for lines that match none of the routing rules. */
FILE *yyUnknown;
/* Output for lines matched by the "stuff" rule. */
FILE *yyStuff;

// flags for command line options
/* Set to 1 by --help / -h. */
static int help_flag = 0;

%}

%option 8bit 
%option nounput nomain noyywrap 
%option warn

%%
    /************************************************ 
        start of rules section

        Each input line is accumulated in myHolder. If the line contains
        the "A","<code>" field pair below it is written to the stuff file,
        otherwise to the unknown file. Appends are bounded by the size of
        myHolder: text beyond its capacity is dropped instead of being
        written past the end of the array.
    *************************************************/


(\"A\",\"(1330|1005|1410|1170)\") {
    /* stuff files: route the current line to the stuff output */
    strncat(myHolder, yytext, sizeof(myHolder) - strlen(myHolder) - 1);
    yyTemp = yyStuff;
}

. {
    /* any other character is just added to the pending line */
    strncat(myHolder, yytext, sizeof(myHolder) - strlen(myHolder) - 1);
}

\n  {
    if (yyTemp == yyUnknown)
        unknownCount += 1;
    /* print to file we are pointing at, whatever it is; the newline is
       written separately so it is kept even if the line was truncated */
    fputs(myHolder, yyTemp);
    fputs(yytext, yyTemp);
    myHolder[0] = '\0';
    /* the next line goes to the unknown file unless a rule says otherwise */
    yyTemp = yyUnknown;
}

<<EOF>> {
    /* flush a final line that had no trailing newline; no text is matched
       at end of input, so there is nothing to append from yytext */
    fputs(myHolder, yyTemp);
    myHolder[0] = '\0';
    yyTemp = yyUnknown;

    yyterminate();
}

%%
    /**************************************************** 
        start of code section


    *****************************************************/


/*
 * Program entry point.
 *
 * Parses the command line (--help / -h), opens the input file named by the
 * first non-option argument (or uses stdin when none is given), derives the
 * two output file names from the input name, opens them, runs the lexer and
 * closes every file it opened.
 *
 * Output names are "<input without extension>_UNKNOWN_1.csv" and
 * "<input without extension>_STUFF_1.csv"; with stdin the base name is
 * "stdin".
 *
 * Returns 0 on success (and after printing help), EXIT_FAILURE if a file
 * cannot be opened or closed, or a file name does not fit its buffer.
 */
int main(int argc, char **argv)
{
    static struct option long_options[] = {
        /* These options set a flag. */
        {"help",   no_argument,     &help_flag, 1},
        {0, 0, 0, 0}
    };
    char outputBase[sizeof fileName];
    FILE *input = NULL;
    char *dot;
    int status = EXIT_SUCCESS;
    int n;
    int c;

    // the gnu getopt library is used to parse the command line for flags
    // afterwards, the final option is assumed to be the input file
    for (;;) {
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long(argc, argv, "h", long_options, &option_index);
        if (c == -1)
            break;      /* end of the options */

        switch (c) {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0)
                break;
            printf("option %s", long_options[option_index].name);
            if (optarg)
                printf(" with arg %s", optarg);
            printf("\n");
            break;

        case 'h':
            help_flag = 1;
            break;

        case '?':
            /* getopt_long already printed an error message. */
            break;

        default:
            abort();
        }
    }

    if (help_flag == 1) {
        printf("proper syntax is: yourProgram.exe [OPTIONS]... INFILE\n");
        printf("splits csv file into multiple files\n");
        printf("Option list: \n");
        printf("--help                  print help to screen\n");
        printf("\n");
        return 0;
    }

    // get the filename off the command line and redirect it to input
    // if there is no filename then use stdin
    if (optind < argc) {
        input = fopen(argv[optind], "r");
        if (input == NULL) {
            fprintf(stderr, "%s: Couldn't open file %s; %s\n",
                    argv[0], argv[optind], strerror(errno));
            return EXIT_FAILURE;
        }
        n = snprintf(inputName, sizeof inputName, "%s", argv[optind]);
        if (n < 0 || (size_t)n >= sizeof inputName) {
            fprintf(stderr, "%s: input file name is too long: %s\n",
                    argv[0], argv[optind]);
            fclose(input);
            return EXIT_FAILURE;
        }
        yyin = input;
    } else {
        printf("no input file set, using stdin. Press ctrl-c to quit\n");
        yyin = stdin;
        strcpy(inputName, "stdin");
    }

    // set up initial file names
    snprintf(fileName, sizeof fileName, "%s", inputName);
    snprintf(outputBase, sizeof outputBase, "%s", fileName);

    /* Drop the extension, but only when the last '.' really starts one:
       it must not be the first character of the name or of a path
       component, and no path separator may follow it. */
    dot = strrchr(outputBase, '.');
    if (dot != NULL && dot > outputBase
            && dot[-1] != '/' && dot[-1] != '\\'
            && strchr(dot, '/') == NULL && strchr(dot, '\\') == NULL)
        *dot = '\0';

    n = snprintf(unknownFileName, sizeof unknownFileName,
                 "%s_UNKNOWN_1.csv", outputBase);
    if (n < 0 || (size_t)n >= sizeof unknownFileName)
        status = EXIT_FAILURE;
    n = snprintf(stuffFileName, sizeof stuffFileName,
                 "%s_STUFF_1.csv", outputBase);
    if (n < 0 || (size_t)n >= sizeof stuffFileName)
        status = EXIT_FAILURE;
    if (status != EXIT_SUCCESS) {
        fprintf(stderr, "%s: output file name is too long for %s\n",
                argv[0], fileName);
        if (input != NULL)
            fclose(input);
        return status;
    }

    // open files for writing
    yyout = stdout;
    yyUnknown = fopen(unknownFileName, "w");
    if (yyUnknown == NULL) {
        fprintf(stderr, "%s: Couldn't open file %s; %s\n",
                argv[0], unknownFileName, strerror(errno));
        if (input != NULL)
            fclose(input);
        return EXIT_FAILURE;
    }
    yyStuff = fopen(stuffFileName, "w");
    if (yyStuff == NULL) {
        fprintf(stderr, "%s: Couldn't open file %s; %s\n",
                argv[0], stuffFileName, strerror(errno));
        fclose(yyUnknown);
        if (input != NULL)
            fclose(input);
        return EXIT_FAILURE;
    }

    /* lines go to the unknown file until a rule redirects them */
    yyTemp = yyUnknown;

    yylex();

    // close open files; fclose flushes, so a failure here means lost output
    if (fclose(yyUnknown) != 0) {
        fprintf(stderr, "%s: error closing %s; %s\n",
                argv[0], unknownFileName, strerror(errno));
        status = EXIT_FAILURE;
    }
    if (fclose(yyStuff) != 0) {
        fprintf(stderr, "%s: error closing %s; %s\n",
                argv[0], stuffFileName, strerror(errno));
        status = EXIT_FAILURE;
    }
    if (input != NULL)
        fclose(input);

    printf("Lexer finished running %s\n", fileName);

    return status;
}

To build this flex program, have flex installed, and use this makefile (adjust the paths):

# Builds the flex scanner: every *.l file is turned into a C source by flex,
# then compiled either with the MinGW cross compiler (default target) or
# with the native gcc ("linux" target). Recipe lines must start with a TAB.

TARGET = project.exe
TESTBUILD = project
LEX = flex
LFLAGS = -Cf
CC = i586-mingw32msvc-gcc
CFLAGS = -O -Wall
INSTALLDIR = /mnt/J/Systems/executables

.PHONY: default all clean install uninstall cleanall linux

default: $(TARGET)

all: default install

# Despite the name, these are the C sources generated from the .l files.
OBJECTS = $(patsubst %.l, %.c, $(wildcard *.l))

%.c: %.l
	$(LEX) $(LFLAGS) -o $@ $<

.PRECIOUS: $(TARGET) $(OBJECTS)

$(TARGET): $(OBJECTS)
	$(CC) $(OBJECTS) $(CFLAGS) -o $@

# Native test build.
linux: $(OBJECTS)
	gcc $(OBJECTS) $(CFLAGS) -lm -g -o $(TESTBUILD)

cleanall: clean uninstall

# Remove only the generated sources, not every C file in the directory.
clean:
	-rm -f $(OBJECTS)
	-rm -f $(TARGET)
	-rm -f $(TESTBUILD)

uninstall:
	-rm -f $(INSTALLDIR)/$(TARGET)

# Depend on the target so it is built before it is copied.
install: $(TARGET)
	cp -f $(TARGET) $(INSTALLDIR)
like image 30
Spencer Rathbun Avatar answered Mar 17 '26 18:03

Spencer Rathbun



Donate For Us

If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!