Question about code performance: I'm trying to run ~25 regex rules against a ~20 GB text file. The script should write the matches to text files, with each regex rule producing its own output file. See the pseudocode below:
regex_rules=~/Documents/rulesfiles/regexrulefile.txt
for tmp in *.unique20gbfile.suffix; do
    # Each line of the rules file is a complete grep command, e.g.,
    #   egrep -i '(^| )justin ?bieber|(^| )selena ?gomez'
    # $rname is a unique rule name generated by a separate bash function
    # exported to the current shell.
    #
    # IFS= and -r keep the rule exactly as written (no trimming, no backslash
    # striping of regex escapes); the `|| [ -n ... ]` clause still processes a
    # final rule that is not terminated by a newline.
    while IFS= read -r line || [ -n "$line" ]; do
        # One background grep per rule; every job scans the whole input file.
        cmd="$line $tmp > ~/outputdir/$tmp.$rname.filter.piped &"
        # Quoted so the shell does not word-split or glob-expand the regex
        # before eval re-parses the command line.
        eval "$cmd"
    done < "$regex_rules"
done
Couple thoughts:
Is there a way to loop the text file just once, evaluating all rules and splitting to individual files in one go? Would this be faster?
Is there a different tool I should be using for this job?
Thanks.
This is the reason grep has a -f option. Reduce your regexrulefile.txt to just the regexps, one per line, and run
egrep -f regexrulefile.txt the_big_file
This produces all the matches in a single output stream, but you can do your loop thing on it afterward to separate them out. Assuming the combined list of matches isn't huge, this will be a performance win.
I did something similar with lex. Of course, it only runs every other day, so YMMV. It is very fast, even on files of several hundred megabytes on a remote Windows share: it takes only a few seconds to process them. I don't know how comfortable you are hacking up a quick C program, but I've found this to be the fastest, easiest solution for large-scale regex problems.
Parts redacted to protect the guilty:
/**************************************************
start of definitions section
***************************************************/
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <getopt.h>
#include <errno.h>
/* Name of the input as given on the command line; main() fills it in
   (or stores a placeholder string when reading from stdin). */
char inputName[256];
// static insert variables
//other variables
/* Not referenced in the visible part of this file (parts were redacted). */
char tempString[256];
/* Text of the line currently being scanned; the rules append each match to
   it and write it out when the line ends. */
char myHolder[256];
/* Copy of inputName made by main(); used to derive the output file names
   and echoed in the final status message. */
char fileName[256];
/* Paths of the two output files, built by main() from fileName. */
char unknownFileName[256];
char stuffFileName[256];
/* Not referenced in the visible part of this file (parts were redacted). */
char buffer[5];
/* we are using pointers to hold the file locations, and allow us to dynamically open and close new files */
/* also, it allows us to obfuscate which file we are writing to, otherwise this couldn't be done */
/* yyTemp: stream the current line will be written to when it ends. */
FILE *yyTemp;
/* yyUnknown: stream for lines that matched no classifying rule. */
FILE *yyUnknown;
/* yyStuff: stream for lines that matched the "stuff" rule. */
FILE *yyStuff;
// flags for command line options
/* Set to 1 by --help (through getopt_long's flag pointer) or by -h. */
static int help_flag = 0;
%}
/* Generate an 8-bit scanner. */
%option 8bit
/* nounput: do not generate unput(); nomain: this file supplies its own
   main(); noyywrap: no yywrap() is needed, scanning stops at end of input. */
%option nounput nomain noyywrap
/* Ask flex to emit warnings. */
%option warn
%%
/************************************************
start of rules section
*************************************************/
(\"A\",\"(1330|1005|1410|1170)\") {
        /* "stuff" records: a line containing this field pair is routed to
           the stuff file instead of the unknown file. */
        /* Refuse to overflow the fixed-size line buffer: a line longer than
           the buffer would otherwise corrupt memory. */
        if (strlen(myHolder) + (size_t)yyleng >= sizeof myHolder) {
            fprintf(stderr, "input line longer than %lu bytes; aborting\n",
                    (unsigned long)(sizeof myHolder - 1));
            exit(EXIT_FAILURE);
        }
        strcat(myHolder, yytext);
        yyTemp = yyStuff;
    }
. {
        /* Any other character is just accumulated into the current line. */
        if (strlen(myHolder) + (size_t)yyleng >= sizeof myHolder) {
            fprintf(stderr, "input line longer than %lu bytes; aborting\n",
                    (unsigned long)(sizeof myHolder - 1));
            exit(EXIT_FAILURE);
        }
        strcat(myHolder, yytext);
    }
\n {
        /* End of line: count it if no rule re-routed it, write the whole
           line to the selected file, then reset for the next line. */
        /* NOTE(review): unknownCount is not declared in the visible part of
           this file - presumably it lives in the redacted section. */
        if (yyTemp == yyUnknown) {
            unknownCount += 1;
        }
        if (strlen(myHolder) + (size_t)yyleng >= sizeof myHolder) {
            fprintf(stderr, "input line longer than %lu bytes; aborting\n",
                    (unsigned long)(sizeof myHolder - 1));
            exit(EXIT_FAILURE);
        }
        strcat(myHolder, yytext);
        /* print to the file we are pointing at, whatever it is */
        fputs(myHolder, yyTemp);
        myHolder[0] = '\0';
        yyTemp = yyUnknown;
    }
<<EOF>> {
        /* End of input: flush a final line that had no trailing newline.
           No input is matched by this rule, so there is nothing to append
           from yytext. */
        fputs(myHolder, yyTemp);
        myHolder[0] = '\0';
        yyTemp = yyUnknown;
        yyterminate();
    }
%%
/****************************************************
start of code section
*****************************************************/
int main(int argc, char **argv);

/* Writes the first stem_len characters of stem followed by suffix into dst.
   Returns 0 on success, or -1 if the result does not fit in dst_size bytes
   (dst must not be used in that case). */
static int build_output_name(char *dst, size_t dst_size, const char *stem,
                             size_t stem_len, const char *suffix)
{
    int written = snprintf(dst, dst_size, "%.*s%s", (int)stem_len, stem, suffix);

    return (written < 0 || (size_t)written >= dst_size) ? -1 : 0;
}

/****************************************************
The main method drives the program. It parses the command line, selects the
input (a file named on the command line, or stdin when none is given), opens
the two output files and runs the lexer. After the lexer returns it closes
every file it opened and prints a message so the user knows it is finished.

Output files are named after the input with its last four characters (the
assumed ".csv" extension) replaced by "_UNKNOWN_1.csv" and "_STUFF_1.csv";
when reading stdin the names are based on "stdin".

Returns 0 on success or after printing help. Exits with the saved errno
value when a file cannot be opened, and with EXIT_FAILURE when a name is too
long for the fixed-size buffers or an output file cannot be fully written.
****************************************************/
int main(int argc, char **argv)
{
    /* the gnu getopt library is used to parse the command line for flags;
       afterwards, the first remaining argument is taken as the input file */
    static struct option long_options[] = {
        /* These options set a flag. */
        {"help", no_argument, &help_flag, 1},
        /* End-of-table marker required by getopt_long. */
        {0, 0, 0, 0}
    };
    FILE *input = NULL;          /* stays NULL when reading from stdin */
    const char *stem;
    size_t stem_len;
    int status = 0;
    int c;

    for (;;) {
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long(argc, argv, "h", long_options, &option_index);
        /* Detect the end of the options. */
        if (c == -1) {
            break;
        }
        switch (c) {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0) {
                break;
            }
            printf("option %s", long_options[option_index].name);
            if (optarg) {
                printf(" with arg %s", optarg);
            }
            printf("\n");
            break;
        case 'h':
            help_flag = 1;
            break;
        case '?':
            /* getopt_long already printed an error message. */
            break;
        default:
            abort();
        }
    }

    if (help_flag == 1) {
        printf("proper syntax is: yourProgram.exe [OPTIONS]... INFILE\n");
        printf("splits csv file into multiple files\n");
        printf("Option list: \n");
        printf("--help print help to screen\n");
        printf("\n");
        return 0;
    }

    /* get the filename off the command line and redirect it to input;
       if there is no filename then use stdin */
    if (optind < argc) {
        int copied;

        input = fopen(argv[optind], "r");
        if (!input) {
            /* Save errno first: fprintf itself may overwrite it. */
            int saved_errno = errno;

            fprintf(stderr, "%s: Couldn't open file %s; %s\n", argv[0],
                    argv[optind], strerror(saved_errno));
            exit(saved_errno);
        }
        yyin = input;
        copied = snprintf(inputName, sizeof inputName, "%s", argv[optind]);
        if (copied < 0 || (size_t)copied >= sizeof inputName) {
            fprintf(stderr, "%s: input file name is too long: %s\n", argv[0],
                    argv[optind]);
            exit(EXIT_FAILURE);
        }
    } else {
        printf("no input file set, using stdin. Press ctrl-c to quit");
        yyin = stdin;
        strcpy(inputName, "\b\b\b\b\bagainst stdin");
    }

    /* set up initial file names */
    strcpy(fileName, inputName);   /* same size as inputName, so it fits */
    if (input) {
        /* Drop the last four characters (the assumed ".csv" extension), but
           never more than the name actually has. */
        size_t name_len = strlen(fileName);

        stem = fileName;
        stem_len = name_len > 4 ? name_len - 4 : name_len;
    } else {
        /* The stdin placeholder contains backspaces and must not end up in
           a file name. */
        stem = "stdin";
        stem_len = strlen(stem);
    }
    if (build_output_name(unknownFileName, sizeof unknownFileName, stem,
                          stem_len, "_UNKNOWN_1.csv") != 0 ||
        build_output_name(stuffFileName, sizeof stuffFileName, stem,
                          stem_len, "_STUFF_1.csv") != 0) {
        fprintf(stderr, "%s: output file name is too long\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* open files for writing */
    yyout = stdout;
    yyUnknown = fopen(unknownFileName, "w");
    if (!yyUnknown) {
        int saved_errno = errno;

        fprintf(stderr, "%s: Couldn't open file %s; %s\n", argv[0],
                unknownFileName, strerror(saved_errno));
        exit(saved_errno);
    }
    yyStuff = fopen(stuffFileName, "w");
    if (!yyStuff) {
        int saved_errno = errno;

        fprintf(stderr, "%s: Couldn't open file %s; %s\n", argv[0],
                stuffFileName, strerror(saved_errno));
        exit(saved_errno);
    }
    /* Lines go to the unknown file until a rule says otherwise. */
    yyTemp = yyUnknown;

    yylex();

    /* close open files; fclose flushes, so a failure means lost output */
    if (fclose(yyUnknown) != 0) {
        fprintf(stderr, "%s: error writing %s\n", argv[0], unknownFileName);
        status = EXIT_FAILURE;
    }
    if (fclose(yyStuff) != 0) {
        fprintf(stderr, "%s: error writing %s\n", argv[0], stuffFileName);
        status = EXIT_FAILURE;
    }
    if (input) {
        fclose(input);
    }
    printf("Lexer finished running %s", fileName);
    return status;
}
To build this flex program, have flex installed, and use this makefile (adjust the paths):
# Builds the flex scanner: `make` cross-compiles the Windows executable,
# `make linux` produces a native debug build for testing.
# Recipe lines below must start with a TAB character.
TARGET = project.exe
TESTBUILD = project
LEX = flex
LFLAGS = -Cf
CC = i586-mingw32msvc-gcc
CFLAGS = -O -Wall
INSTALLDIR = /mnt/J/Systems/executables

# None of these names are files, so make must always run them.
.PHONY: default all clean install uninstall cleanall linux

default: $(TARGET)

all: default install

# Despite the name, OBJECTS holds the C sources flex generates, one per .l file.
OBJECTS = $(patsubst %.l, %.c, $(wildcard *.l))

%.c: %.l
	$(LEX) $(LFLAGS) -o $@ $<

.PRECIOUS: $(TARGET) $(OBJECTS)

$(TARGET): $(OBJECTS)
	$(CC) $(OBJECTS) $(CFLAGS) -o $@

linux: $(OBJECTS)
	gcc $(OBJECTS) $(CFLAGS) -lm -g -o $(TESTBUILD)

cleanall: clean uninstall

# Remove only the generated C files, not every .c file in the directory.
clean:
	-rm -f $(OBJECTS)
	-rm -f $(TARGET)
	-rm -f $(TESTBUILD)

uninstall:
	-rm -f $(INSTALLDIR)/$(TARGET)

# Depends on the target so `make install` (or a parallel `make all`) never
# copies a missing or stale executable.
install: $(TARGET)
	cp -f $(TARGET) $(INSTALLDIR)
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With