仅靠正则表达式是行不通的——你需要构建一个状态机,因为必须区分下面这些情况,例如:
- 三字符组(trigraph,例如 ??= 代表 #)
- 续行
- /* 可能出现在字符串内部——这种情况下它并不表示注释的开始
- 反过来,如果起始的 /* 位于字符串之外,那么注释内部也可能出现 " 字符——它同样不表示字符串的开始
这件事用正则表达式是做不到的,只能用状态机。
我知道你想要的是 Python 版本,但我前几天刚好用 Perl 做过类似的事情,所以就直接贴在这里了。你可以自行把它转换成 Python。它也许不是最快、最好的实现,但已经够用了:
######################################################################################
#### Before going any further perform all 4 stages of preprocessing
#### described here http://gcc.gnu.org/onlinedocs/cpp/Initial-processing.html
############################# 1 - break file into lines ##############################
# Read the whole input file into @lines, one array element per physical line.
# Line terminators are kept as read: nothing is stripped here.
#
# The three-argument form of open with a lexical handle is used so that a
# file name starting with a mode character ('<', '>', '|', ...) can never be
# interpreted as an open mode or as a command to run.
# NOTE: unlike the two-argument form, a name of '-' is opened as a literal
# file called '-', not as STDIN.
#
# Dies with the operating system's reason ($!) when the file cannot be opened.
open my $inputHandle, '<', $file
    or die "file [$file] could not be opened: $!\n";
my @lines = <$inputHandle>;
close $inputHandle;
################################ 2 - handle trigraphs ################################
{
    # Maps the third character of a "??x" trigraph to the single character
    # the trigraph stands for.
    my %trigraphChar = (
        '=' => '#',
        '/' => '\\',
        "'" => '^',
        '(' => '[',
        ')' => ']',
        '!' => '|',
        '<' => '{',
        '>' => '}',
        '-' => '~',
    );

    # The loop variable aliases the array element, so every line of @lines
    # is rewritten in place.  Trigraphs cannot overlap (their last character
    # is never '?') and no replacement character can complete another
    # trigraph, so one left-to-right pass handles all nine of them.
    for my $text ( @lines )
    {
        $text =~ s{\?\?([=/'()!<>-])}{$trigraphChar{$1}}g;
    }
}
################################ 3 - merge continued lines ###########################
# everything in C/C++ may be spanned across many lines so we must merge continued
# lines to handle things correctly
# we do not delete lines that are merged with preceeding line - we just leave an
# empty line to preserve overal location of all things which will be needed later
# to properly report line numbers if we find sth that we are intersted in
# Merge continued lines in @lines.
#
# A line whose last non-blank character is a backslash is joined with the
# line(s) that follow it.  The joined text is stored in the line where the
# continuation started, and every absorbed line is replaced by a lone "\n",
# so the number of lines - and therefore every later line number - does not
# change.
{
    # Index of the line that receives continuation text; undef while no
    # continuation is in progress.  A plain lexical scoped to this block, so
    # it always starts out undefined.
    my $appendHere;

    for my $i ( 0 .. $#lines )
    {
        # Strictly a continued line ends with a backslash, but preprocessors
        # also accept blanks after it, so those are accepted (and removed)
        # here too, as is the "\r" of a CRLF line ending.
        # The vertical tab is written as \x0B on purpose: in a Perl regex
        # "\v" means *any* vertical whitespace, newline included, which
        # would also strip the line terminator of the matched line.
        my $continuedLine = ( $lines[$i] =~ s#\\[ \t\x0B\f\r]*$## ) ? 1 : 0;

        if ( !defined $appendHere )
        {
            # Not inside a continuation: remember where one starts, if any.
            $appendHere = $i if $continuedLine;
            next;
        }

        # Inside a continuation: glue the current line onto the line that
        # started it and leave an empty line behind.
        chomp $lines[$appendHere];
        chomp $lines[$i];
        $lines[$appendHere] .= "$lines[$i]\n";
        $lines[$i] = "\n";

        # The continuation ends with the first line that is not itself
        # continued.
        $appendHere = undef if !$continuedLine;
    }
}
#printFileFormatted();
######################## 4 - handle comments and strings ######################################
# similarly substituting a comment body with a single space may spoil our line numbers so
# we are just replacing comments with spaces preserving newlines where necessary
# Stage 4: blank out comments and string literals, character by character.
#
# Reads and rewrites @lines in place.  Every character of a comment or of a
# string literal (quotes included) is replaced by exactly one placeholder
# character, and newlines are kept, so line numbers and column positions of
# the remaining code do not move.
#
# Results left for the code that follows:
#   $state - state of the machine after the last character; one of 'out',
#            'comment?', '//comment', '/*comment', '/*comment end?',
#            'string char', 'string esc seq', 'string hex marker',
#            'string hex', 'string oct'
#   $error - undef on success, otherwise a message for the first problem
#            found; scanning stops at that line, which is left unmodified
#
# Character literals ('x', '"', '\'') are copied through unchanged; they are
# tracked only so that a quote or slash inside them is not mistaken for the
# start of a string or comment.  They never span lines: an apostrophe that
# is not closed on its own line (e.g. in "#error don't") is forgotten at the
# end of that line.  The two character-literal states are internal and are
# never visible in $state after the scan.
my $state = "out";
my $error;
my $COMMENT_SUBST = ' '; #'@';
my $STRING_SUBST = ' '; #'%';
ERROR: for ( my $line = 0; $line <= $#lines; $line++ )
{
    # Value and digit count of the numeric escape currently being read.
    # A string cannot cross a line boundary (that is reported as an error),
    # so per-line variables are sufficient.
    my $hexVal    = 0;
    my $octVal    = 0;
    my $octDigits = 0;

    my @chars   = split //, $lines[$line];
    my $newLine = "";
    for ( my $i = 0; $i <= $#chars; $i++ )
    {
        my $c = $chars[$i];
        if ( $state eq 'out' ) # ------------------------------------------
        {
            if ( $c eq '/' )
            {
                # may start a comment; kept until the next character decides
                $state = 'comment?';
                $newLine .= $c;
            }
            elsif ( $c eq '"' )
            {
                $state = 'string char';
                $newLine .= $STRING_SUBST;
            }
            elsif ( $c eq "'" )
            {
                $state = 'char literal';
                $newLine .= $c;
            }
            else
            {
                $newLine .= $c;
            }
        }
        elsif ( $state eq 'comment?' ) # ----------------------------------
        {
            if ( $c eq '/' )
            {
                $state = '//comment';
                chop $newLine;                  # blank the first '/' too
                $newLine .= $COMMENT_SUBST x 2;
            }
            elsif ( $c eq '*' )
            {
                $state = '/*comment';
                chop $newLine;                  # blank the '/' too
                $newLine .= $COMMENT_SUBST x 2;
            }
            else
            {
                # The '/' was an ordinary character.  Look at the current
                # character again as regular code, so that a quote directly
                # after a '/' still opens a string or character literal.
                $state = 'out';
                redo;
            }
        }
        elsif ( $state eq '//comment' ) # ---------------------------------
        {
            if ( $c eq "\n" )
            {
                $state = 'out';
                $newLine .= $c;
            }
            else
            {
                $newLine .= $COMMENT_SUBST;
            }
        }
        elsif ( $state eq '/*comment' ) # ---------------------------------
        {
            if ( $c eq '*' )
            {
                $state = '/*comment end?';
                $newLine .= $COMMENT_SUBST;
            }
            elsif ( $c eq "\n" )
            {
                $newLine .= $c;
            }
            else
            {
                $newLine .= $COMMENT_SUBST;
            }
        }
        elsif ( $state eq '/*comment end?' ) # ----------------------------
        {
            if ( $c eq '*' )
            {
                $newLine .= $COMMENT_SUBST;
            }
            elsif ( $c eq "\n" )
            {
                # '*' and '/' separated by a newline do not close a comment
                $state = '/*comment';
                $newLine .= $c;
            }
            elsif ( $c eq '/' )
            {
                $state = 'out';
                $newLine .= $COMMENT_SUBST;
            }
            else
            {
                $state = '/*comment';
                $newLine .= $COMMENT_SUBST;
            }
        }
        elsif ( $state eq 'string char' ) # -------------------------------
        {
            # Whether a '"' terminates the string cannot be decided from the
            # previous character alone ("abc\\" ends, "abc\"" does not), so
            # escape sequences are followed in states of their own.
            if ( $c eq '"' )
            {
                $state = 'out';
                $newLine .= $STRING_SUBST;
            }
            elsif ( $c eq "\\" )
            {
                $state = 'string esc seq';
                $newLine .= $STRING_SUBST;
            }
            elsif ( $c eq "\n" )
            {
                $error = "line [".($line+1)."] - error - a newline within a string\n";
                last ERROR;
            }
            else
            {
                $newLine .= $STRING_SUBST;
            }
        }
        elsif ( $state eq 'string esc seq' ) # ----------------------------
        {
            # simple escapes: \' \" \? \\ \a \b \f \n \r \t \v
            # octal:          \o \oo \ooo  (at most three digits, value <= 255)
            # hex:            \xh \xhh ... (any number of digits, value <= 255)
            # any other character after the backslash is taken as itself
            if ( $c eq 'x' )
            {
                $state = 'string hex marker';
                $newLine .= $STRING_SUBST;
            }
            elsif ( $c =~ m#^[0-7]$# )
            {
                $state = 'string oct';
                $octVal = oct($c);
                $octDigits = 1;
                $newLine .= $STRING_SUBST;
            }
            elsif ( $c eq "\n" )
            {
                $error = "line [".($line+1)."] - error - a newline within a string\n";
                last ERROR;
            }
            else
            {
                # Simple and unknown escapes are handled alike: the escaped
                # character is blanked with ONE placeholder (the backslash
                # already got its own), keeping the line length unchanged.
                $state = 'string char';
                $newLine .= $STRING_SUBST;
            }
        }
        elsif ( $state eq 'string hex marker' ) # -------------------------
        {
            if ( $c =~ m#^[0-9a-fA-F]$# )
            {
                $state = 'string hex';
                $hexVal = hex($c);
                $newLine .= $STRING_SUBST;
            }
            else
            {
                $error = "line [".($line+1)."] - error - hex escape sequence not finished\n";
                last ERROR;
            }
        }
        elsif ( $state eq 'string hex' ) # --------------------------------
        {
            if ( $c =~ m#^[0-9a-fA-F]$# )
            {
                $hexVal <<= 4;
                $hexVal += hex($c);
                # treat as regular 8bit character sequence - no fancy long chars etc
                if ( $hexVal > 255 )
                {
                    $error = "line [".($line+1)."] - error - hex escape sequence too big for a character\n";
                    last ERROR;
                }
                $newLine .= $STRING_SUBST;
            }
            else
            {
                # End of the escape.  The current character is examined again
                # as ordinary string content, so a closing quote, a newline
                # or the backslash of a following escape is handled exactly
                # as anywhere else in a string.
                $hexVal = 0;
                $state = 'string char';
                redo;
            }
        }
        elsif ( $state eq 'string oct' ) # --------------------------------
        {
            # An octal escape has at most three digits; a fourth octal digit
            # is an ordinary character ("\1011" is 'A' followed by '1').
            if ( $octDigits < 3 && $c =~ m#^[0-7]$# )
            {
                $octVal <<= 3;
                $octVal += oct($c);
                $octDigits++;
                # treat as regular 8bit character sequence - no fancy long chars etc
                if ( $octVal > 255 )
                {
                    $error = "line [".($line+1)."] - error - oct esc sequence too big for a character\n";
                    last ERROR;
                }
                $newLine .= $STRING_SUBST;
            }
            else
            {
                # End of the escape; re-examine the current character as
                # ordinary string content (see 'string hex').
                $octVal = 0;
                $octDigits = 0;
                $state = 'string char';
                redo;
            }
        }
        elsif ( $state eq 'char literal' ) # ------------------------------
        {
            if ( $c eq "'" )
            {
                $state = 'out';
            }
            elsif ( $c eq "\\" )
            {
                $state = 'char literal esc';
            }
            $newLine .= $c;                     # copied through unchanged
        }
        elsif ( $state eq 'char literal esc' ) # --------------------------
        {
            # the escaped character, whatever it is, cannot end the literal
            $state = 'char literal';
            $newLine .= $c;
        }
        else
        {
            $error = "line [".($line+1)."] - error - state machine problem - unknown state\n";
            last ERROR;
        }
    }#for ( my $i = 0; $i <= $#chars; $i++ )

    # A character literal never continues on the next line; an apostrophe
    # left open here was not a literal at all, so it is simply dropped.
    if ( $state eq 'char literal' || $state eq 'char literal esc' )
    {
        $state = 'out';
    }
    $lines[ $line ] = $newLine;
}#for ( my $line = 0; $line <= $#lines; $line++ )
# Report the outcome of the comment/string scan.
# A problem found while scanning ($error) is printed as is.  Otherwise the
# state the machine was left in at end of file decides: ending inside a
# /* */ comment or inside a string is an error.  Every error path prints its
# message to standard output and terminates the program with exit status 1.
if ( $error )
{
    print "$error";
    exit(1);
}
else
{
    # States in which reaching end of file is fine.  'comment?' only means
    # the last character was a '/', which may be a division after all and
    # is not a preprocessing problem.
    my %fineAtEof = map { $_ => 1 } ( 'out', 'comment?', '//comment' );

    # States in which end of file is an error, with the message to print.
    my $inComment = "EOF reached within /* */ comment\n";
    my $inString  = "EOF reached within string\n";
    my %eofComplaint = (
        '/*comment'         => $inComment,
        '/*comment end?'    => $inComment,
        'string char'       => $inString,
        'string esc seq'    => $inString,
        'string hex marker' => $inString,
        'string hex'        => $inString,
        'string oct'        => $inString,
    );

    if ( !$fineAtEof{$state} )
    {
        my $complaint = exists $eofComplaint{$state}
                      ? $eofComplaint{$state}
                      : "EOF reached and state machine is in unknown state\n";
        print $complaint;
        exit(1);
    }
}