我用 lex 做过类似的事情。当然,它大约每隔一天才运行一次,所以效果因人而异(YMMV)。它非常快,即使处理远程 Windows 共享上数百兆字节的文件也是如此,整个处理只需几秒钟。我不知道你对动手写一个简单的 C 程序有多大把握,但我发现这是解决大规模正则表达式问题最快、最简单的方案。
以下代码的部分内容已做删改(以保护“有罪之人”):
/**************************************************
start of definitions section
***************************************************/
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <getopt.h>
#include <errno.h>

char inputName[256];
// static insert variables
/* Number of lines written to the "unknown" file. The rules section uses this
   counter; its declaration was missing from the listing, so the type is a
   best guess - adjust if other code expects something different. */
static int unknownCount = 0;
//other variables
char tempString[256];
/* Text of the line currently being scanned. It is a heap buffer that grows on
   demand (a fixed 256-byte array overflowed on long CSV lines). It is kept
   NUL-terminated, but holderLen is the authoritative length because 8-bit
   input may contain NUL bytes. */
char *myHolder = NULL;
static size_t holderLen = 0;   /* bytes currently stored in myHolder */
static size_t holderCap = 0;   /* bytes allocated for myHolder */
char fileName[256];
char unknownFileName[256];
char stuffFileName[256];
char buffer[5];
/* we are using pointers to hold the file locations, and allow us to dynamically open and close new files */
/* yyTemp is the file the current line will be written to once the line is complete */
FILE *yyTemp;
FILE *yyUnknown;
FILE *yyStuff;
// flags for command line options
static int help_flag = 0;

/* Append len bytes of text to the pending line, growing the buffer as needed.
   Terminates the program if memory cannot be obtained. */
static void holderAppend(const char *text, size_t len)
{
    size_t needed = holderLen + len + 1;   /* +1 for the terminating NUL */

    if (needed > holderCap) {
        size_t newCap = holderCap ? holderCap : 256;
        char *grown;

        while (newCap < needed) {
            newCap *= 2;
        }
        grown = realloc(myHolder, newCap);
        if (grown == NULL) {
            fprintf(stderr, "out of memory while buffering an input line\n");
            exit(EXIT_FAILURE);
        }
        myHolder = grown;
        holderCap = newCap;
    }
    memcpy(myHolder + holderLen, text, len);
    holderLen += len;
    myHolder[holderLen] = '\0';
}

/* Write the pending line to whichever file yyTemp currently selects, count it
   if that file is the unknown file, then reset the buffer and make the
   unknown file the destination for the next line. Does nothing but the reset
   when no text is pending (e.g. at EOF right after a newline). */
static void holderFlush(void)
{
    if (holderLen > 0) {
        if (yyTemp == yyUnknown) {
            unknownCount += 1;
        }
        if (fwrite(myHolder, 1, holderLen, yyTemp) != holderLen) {
            fprintf(stderr, "write error: %s\n", strerror(errno));
            exit(EXIT_FAILURE);
        }
        holderLen = 0;
        myHolder[0] = '\0';
    }
    yyTemp = yyUnknown;
}
%}
%option 8bit
%option nounput nomain noyywrap
%option warn
%%
    /************************************************
    start of rules section
    (this comment must stay indented: in the rules section flex treats
    anything starting in column 0 as a pattern)
    *************************************************/
(\"A\",\"(1330|1005|1410|1170)\")    {
        /* stuff files: a line containing this field pair is routed to the stuff file */
        holderAppend(yytext, (size_t)yyleng);
        yyTemp = yyStuff;
    }
.    { holderAppend(yytext, (size_t)yyleng); }
\n    {
        holderAppend(yytext, (size_t)yyleng);
        /* print to file we are pointing at, whatever it is */
        holderFlush();
    }
<<EOF>>    {
        /* yytext holds no matched text at end of file, so only flush a
           final line that was not terminated by a newline */
        holderFlush();
        yyterminate();
    }
%%
/****************************************************
start of code section
*****************************************************/
/* Build an output file name in dst: the input name with its final extension
   removed (if it has one), followed by suffix.
   Returns 0 on success, -1 if the result does not fit in dstSize bytes. */
static int buildOutputName(char *dst, size_t dstSize, const char *input, const char *suffix)
{
    size_t stemLen = strlen(input);
    const char *dot = strrchr(input, '.');
    int written;

    /* Only strip when the dot belongs to the last path component and is not
       that component's first character (so "dir.d/file" and ".hidden" are
       left intact). */
    if (dot != NULL && dot > input && dot[-1] != '/' && dot[-1] != '\\'
        && strpbrk(dot, "/\\") == NULL) {
        stemLen = (size_t)(dot - input);
    }
    written = snprintf(dst, dstSize, "%.*s%s", (int)stemLen, input, suffix);
    return (written < 0 || (size_t)written >= dstSize) ? -1 : 0;
}

/****************************************************
The main function drives the program. It parses the command line, takes the
input file from the first non-option argument (or stdin when there is none),
opens the two output files named after the input, and calls the lexer.
After the lexer returns it closes every file it opened and tells the user
that it is finished.
Returns EXIT_SUCCESS on success, EXIT_FAILURE if a file cannot be opened,
a name is too long, or an output file cannot be closed cleanly.
****************************************************/
int main(int argc, char **argv)
{
    static struct option long_options[] = {
        /* These options set a flag. */
        {"help", no_argument, &help_flag, 1},
        {0, 0, 0, 0}
    };
    FILE *input = NULL;
    const char *sourceName;
    int status = EXIT_FAILURE;
    int c;

    // the gnu getopt library is used to parse the command line for flags
    // afterwards, the final option is assumed to be the input file
    for (;;) {
        /* getopt_long stores the option index here. */
        int option_index = 0;

        c = getopt_long(argc, argv, "h", long_options, &option_index);
        /* Detect the end of the options. */
        if (c == -1) {
            break;
        }
        switch (c) {
        case 0:
            /* If this option set a flag, do nothing else now. */
            if (long_options[option_index].flag != 0) {
                break;
            }
            printf("option %s", long_options[option_index].name);
            if (optarg) {
                printf(" with arg %s", optarg);
            }
            printf("\n");
            break;
        case 'h':
            help_flag = 1;
            break;
        case '?':
            /* getopt_long already printed an error message. */
            break;
        default:
            abort();
        }
    }

    if (help_flag == 1) {
        printf("proper syntax is: yourProgram.exe [OPTIONS]... INFILE\n");
        printf("splits csv file into multiple files\n");
        printf("Option list: \n");
        printf("--help print help to screen\n");
        printf("\n");
        return 0;
    }

    // get the filename off the command line and redirect it to input
    // if there is no filename then use stdin
    if (optind < argc) {
        sourceName = argv[optind];
        input = fopen(sourceName, "r");
        if (!input) {
            fprintf(stderr, "%s: Couldn't open file %s; %s\n", argv[0], sourceName, strerror(errno));
            return EXIT_FAILURE;
        }
        yyin = input;
    } else {
        printf("no input file set, using stdin. Press ctrl-c to quit\n");
        yyin = stdin;
        /* a plain name, because it also becomes the stem of the output file names */
        sourceName = "stdin";
    }

    // set up initial file names
    if (snprintf(inputName, sizeof inputName, "%s", sourceName) >= (int)sizeof inputName) {
        fprintf(stderr, "%s: input file name is too long: %s\n", argv[0], sourceName);
        goto cleanup;
    }
    strcpy(fileName, inputName);   /* same size as inputName, so this cannot overflow */
    if (buildOutputName(unknownFileName, sizeof unknownFileName, fileName, "_UNKNOWN_1.csv") != 0
        || buildOutputName(stuffFileName, sizeof stuffFileName, fileName, "_STUFF_1.csv") != 0) {
        fprintf(stderr, "%s: output file name is too long for %s\n", argv[0], fileName);
        goto cleanup;
    }

    // open files for writing
    yyout = stdout;
    yyUnknown = fopen(unknownFileName, "w");
    if (!yyUnknown) {
        fprintf(stderr, "%s: Couldn't open file %s; %s\n", argv[0], unknownFileName, strerror(errno));
        goto cleanup;
    }
    yyStuff = fopen(stuffFileName, "w");
    if (!yyStuff) {
        fprintf(stderr, "%s: Couldn't open file %s; %s\n", argv[0], stuffFileName, strerror(errno));
        goto cleanup;
    }
    /* lines go to the unknown file until a rule selects another one */
    yyTemp = yyUnknown;

    yylex();
    status = EXIT_SUCCESS;

cleanup:
    // close open files; fclose flushes, so a failure here means lost output
    if (yyStuff != NULL) {
        if (fclose(yyStuff) != 0) {
            fprintf(stderr, "%s: error closing %s; %s\n", argv[0], stuffFileName, strerror(errno));
            status = EXIT_FAILURE;
        }
        yyStuff = NULL;
    }
    if (yyUnknown != NULL) {
        if (fclose(yyUnknown) != 0) {
            fprintf(stderr, "%s: error closing %s; %s\n", argv[0], unknownFileName, strerror(errno));
            status = EXIT_FAILURE;
        }
        yyUnknown = NULL;
    }
    yyTemp = NULL;
    if (input != NULL) {
        fclose(input);
    }
    if (status == EXIT_SUCCESS) {
        printf("Lexer finished running %s\n", fileName);
    }
    return status;
}
要构建这个 flex 程序,安装 flex,并使用这个 makefile(调整路径):
# Builds the flex scanner. Adjust CC and INSTALLDIR for your environment.
TARGET = project.exe
TESTBUILD = project
LEX = flex
LFLAGS = -Cf
CC = i586-mingw32msvc-gcc
CFLAGS = -O -Wall
INSTALLDIR = /mnt/J/Systems/executables

.PHONY: default all clean install uninstall cleanall linux

default: $(TARGET)

all: default install

# Despite the name, OBJECTS holds the C sources that flex generates from each *.l file.
OBJECTS = $(patsubst %.l, %.c, $(wildcard *.l))

# Recipe lines below must start with a TAB character.
%.c: %.l
	$(LEX) $(LFLAGS) -o $@ $<

.PRECIOUS: $(TARGET) $(OBJECTS)

# Cross-compiled Windows executable.
$(TARGET): $(OBJECTS)
	$(CC) $(OBJECTS) $(CFLAGS) -o $@

# Native build with debug info for local testing.
linux: $(OBJECTS)
	gcc $(OBJECTS) $(CFLAGS) -lm -g -o $(TESTBUILD)

cleanall: clean uninstall

# Remove only the generated sources, not every *.c file in the directory.
clean:
	-rm -f $(OBJECTS)
	-rm -f $(TARGET)
	-rm -f $(TESTBUILD)

uninstall:
	-rm -f $(INSTALLDIR)/$(TARGET)

# Depends on the target so the executable exists before it is copied.
install: $(TARGET)
	cp -f $(TARGET) $(INSTALLDIR)