我正在尝试用 C 代码从一个 C 文件中删除注释和字符串。下面的例子只讨论注释。我有一个滑动窗口,所以在任何时刻我手里只有第 n 个和第 n-1 个字符。我想找出一种尽可能不使用嵌套 while 的算法,不过我至少需要一个 while 来用 getchar 读取输入。我最初的想法是:先用 while 循环找到 n=* 且 (n-1)=/ 的位置,然后再用 while 循环一直读到 n=/ 且 (n-1)=* 为止;但这样就是嵌套的 while,我觉得效率不高。如果必须如此我也可以这样做,但我想知道是否有人有更好的解决方案。
3 回答
要把这件事做对,比人们最初想象的要复杂,正如这里的其他评论已经指出的那样。我强烈建议编写一个表驱动的有限状态机(FSM),并借助状态转换图来保证转换正确。在我看来,用 case 语句处理超过几个状态的逻辑非常容易出错。
这是一个 dot/graphviz 格式的图,您可以直接根据它编写状态表。请注意,我完全没有测试过它,所以效果可能因人而异(YMMV)。
该图的语义是:标签 <ch> 表示默认转移——当该状态下其他输入均不匹配时,就走这条边。文件结尾在除 S0 之外的任何状态下都是错误;在没有 <ch> 转移的状态下,任何未明确列出的字符也是错误。扫描到的每个字符都会被打印,但处于注释中(S4 和 S5)以及正在检测注释开始(S1)时除外。在检测注释开始时,您必须先把字符缓冲起来:如果结果并不是注释的开始,就把它们打印出来;一旦确定确实是注释,就把它们丢弃。
在 dot 图中,sq 表示单引号 ',dq 表示双引号 "。
// State transition diagram for a scanner that strips comments from C source.
// Edge labels: sq = single quote, dq = double quote, "<ch>" = any character
// not matched by another edge leaving the same state.
digraph state_machine {
    rankdir=LR;
    size="8,5";
    node [shape=doublecircle]; S0 /* init */;
    node [shape=circle];

    // S0: ordinary code
    S0 /* init */ -> S1 /* begin_cmt */ [label = "'/'"];
    S0 /* init */ -> S2 /* in_str */ [label = dq];
    S0 /* init */ -> S3 /* in_ch */ [label = sq];
    S0 /* init */ -> S0 /* init */ [label = "<ch>"];

    // S1: a '/' was seen and may open a comment
    S1 /* begin_cmt */ -> S4 /* in_slc */ [label = "'/'"];
    S1 /* begin_cmt */ -> S5 /* in_mlc */ [label = "'*'"];
    S1 /* begin_cmt */ -> S0 /* init */ [label = "<ch>"];
    S1 /* begin_cmt */ -> S1 /* begin_cmt */ [label = "'\\n'"]; // handle "/\n/" and "/\n*"

    // S2: inside a string literal.  The edge back to S0 is taken on the
    // closing double quote; a backslash goes to S6 instead (previously both
    // edges were labelled with the backslash, so the string could never end).
    S2 /* in_str */ -> S0 /* init */ [label = dq];
    S2 /* in_str */ -> S6 /* str_esc */ [label = "'\\'"];
    S2 /* in_str */ -> S2 /* in_str */ [label = "<ch>"];

    S3 /* in_ch */ -> S0 /* init */ [label = sq];

    // S4: single-line comment, ends at newline
    S4 /* in_slc */ -> S4 /* in_slc */ [label = "<ch>"];
    S4 /* in_slc */ -> S0 /* init */ [label = "'\\n'"];

    // S5/S7: multi-line comment; S7 means a '*' was just seen
    S5 /* in_mlc */ -> S7 /* end_mlc */ [label = "'*'"];
    S5 /* in_mlc */ -> S5 /* in_mlc */ [label = "<ch>"];
    S7 /* end_mlc */ -> S7 /* end_mlc */ [label = "'*'|'\\n'"];
    S7 /* end_mlc */ -> S0 /* init */ [label = "'/'"];
    S7 /* end_mlc */ -> S5 /* in_mlc */ [label = "<ch>"];

    // S6 and S8..S11: escape sequences inside a string literal
    // NOTE(review): S8..S11 have no "<ch>" edge, so short octal/hex escapes
    // such as "\0" are not accepted by this graph as drawn.
    S6 /* str_esc */ -> S8 /* oct */ [label = "[0-3]"];
    S6 /* str_esc */ -> S9 /* hex */ [label = "'x'"];
    S6 /* str_esc */ -> S2 /* in_str */ [label = "<ch>"];
    S8 /* oct */ -> S10 /* o1 */ [label = "[0-7]"];
    S10 /* o1 */ -> S2 /* in_str */ [label = "[0-7]"];
    S9 /* hex */ -> S11 /* h1 */ [label = hex];
    S11 /* h1 */ -> S2 /* in_str */ [label = hex];

    // S3, S12, S13: character constants
    S3 /* in_ch */ -> S12 /* ch_esc */ [label = "'\\'"];
    S3 /* in_ch */ -> S13 /* out_ch */ [label = "<ch>"];
    S13 /* out_ch */ -> S0 /* init */ [label = sq];
    S12 /* ch_esc */ -> S3 /* in_ch */ [label = sq];
    S12 /* ch_esc */ -> S12 /* ch_esc */ [label = "<ch>"];
}
用一个 while 循环编写的算法可能如下所示:
// Skeleton of the single-loop approach: read the input one byte at a time
// and echo only the bytes that are not part of a comment.  The "..." parts
// are placeholders for the comment-detection logic.
// NOTE: c has to be an int (not a char) so that EOF can be told apart from
// every valid byte returned by getchar().
while ((c = getchar()) != EOF)
{
... // looking at the byte that was just read
if (...) // the symbol is not inside a comment
{
putchar(c);
}
}
要判断输入的某个 char 是否属于注释,可以使用状态机。在下面的例子中,它有 4 个状态,并给出了转移到下一个状态的规则。
int state = 0;
int next_state;
while ((c = getchar()) != EOF)
{
switch (state)
{
case 0: next_state = (c == '/' ? 1 : 0); break;
case 1: next_state = (c == '*' ? 2 : c == '/' ? 1 : 0); break;
case 2: next_state = (c == '*' ? 3 : 2); break;
case 3: next_state = (c == '/' ? 0 : c == '*' ? 3 : 2); break;
default: next_state = state; // will never happen
}
if (state == 1 && next_state == 0)
{
putchar('/'); // for correct output when a slash is not followed by a star
}
if (state == 0 && next_state == 0)
{
putchar(c);
}
state = next_state;
}
上面的例子非常简单:它无法正确处理出现在非注释上下文(例如 C 字符串)中的 /*;它也不支持 // 注释,等等。
由于您希望缓冲区只用两个字符,并且只用一个 while 循环,我建议再用第三个字符来跟踪状态(即当前是否正在跳过文本)。我为您编写了一个测试程序,其中用内联注释解释了逻辑:
// Program to strip comments and strings from a C file
//
// Build:
// gcc -o strip-comments strip-comments.c
//
// Test:
// ./strip-comments strip-comments.c
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
/* The following is a block of strings, and comments for testing
 * the code.  The program is meant to be run on its own source file, so the
 * unusual comment placement and the escape sequences below are deliberate
 * test input and should be kept exactly as written.
 */
/* test if three comments *//* chained together */// will be removed.
static int value = 128 /* test comment within valid code *// 2;
const char * test1 = "This is a test of \" processing"; /* testing inline comment */
const char * test2 = "this is a test of \n within strings."; // testing inline comment
// this is the last test
/* Scanner states used by strip_c_code(). */
enum strip_state
{
    STRIP_CODE,          /* ordinary source text */
    STRIP_SLASH,         /* read a slash that may open a comment; it is held back */
    STRIP_LINE_COMMENT,  /* inside a line comment */
    STRIP_BLOCK_COMMENT, /* inside a block comment */
    STRIP_BLOCK_STAR,    /* inside a block comment, previous character was a star */
    STRIP_STRING,        /* inside a string literal */
    STRIP_STRING_ESC,    /* inside a string literal, previous character was a backslash */
    STRIP_CHAR,          /* inside a character constant */
    STRIP_CHAR_ESC       /* inside a character constant, previous character was a backslash */
};

/* Handles one character read while in ordinary code and returns the next state. */
static enum strip_state strip_code_char(int c, FILE * out)
{
    switch (c)
    {
        case '/':
            // may open a comment: hold it back until the next character is known
            return STRIP_SLASH;
        case '"':
            // the opening quote is dropped; "NULL" is written when the string closes
            return STRIP_STRING;
        case '\'':
            fputc(c, out);
            return STRIP_CHAR;
        default:
            fputc(c, out);
            return STRIP_CODE;
    }
}

/*
 * Copies C source text from `in` to `out` with comments removed and every
 * string literal replaced by the token NULL.
 *
 *  - Line comments are removed up to, but not including, the newline.
 *  - Block comments are removed entirely.
 *  - Character constants are copied unchanged, so a quote or slash inside
 *    one does not start a string or a comment.
 *  - Backslash escapes inside strings and character constants are honoured,
 *    including an escaped backslash right before the closing quote.
 *
 * Each byte is kept in an int so that EOF can be told apart from every valid
 * byte value (a char cannot represent both).
 *
 * Returns 0 on success.  If the input ends inside a block comment or a
 * string literal, a message is written to stderr and -1 is returned.  A line
 * comment that is ended by the end of the input is not an error.
 */
int strip_c_code(FILE * in, FILE * out)
{
    enum strip_state state = STRIP_CODE;
    int c;

    while ((c = fgetc(in)) != EOF)
    {
        switch (state)
        {
            case STRIP_CODE:
                state = strip_code_char(c, out);
                break;

            case STRIP_SLASH:
                if (c == '/')
                {
                    state = STRIP_LINE_COMMENT;
                }
                else if (c == '*')
                {
                    state = STRIP_BLOCK_COMMENT;
                }
                else
                {
                    // plain slash (e.g. division): emit it, then treat the
                    // current character as ordinary code
                    fputc('/', out);
                    state = strip_code_char(c, out);
                }
                break;

            case STRIP_LINE_COMMENT:
                if (c == '\n')
                {
                    fputc(c, out); // the newline itself is kept
                    state = STRIP_CODE;
                }
                break;

            case STRIP_BLOCK_COMMENT:
                if (c == '*')
                {
                    state = STRIP_BLOCK_STAR;
                }
                break;

            case STRIP_BLOCK_STAR:
                if (c == '/')
                {
                    state = STRIP_CODE;
                }
                else if (c != '*')
                {
                    state = STRIP_BLOCK_COMMENT;
                }
                break;

            case STRIP_STRING:
                if (c == '\\')
                {
                    state = STRIP_STRING_ESC;
                }
                else if (c == '"')
                {
                    fprintf(out, "NULL"); // replace string with NULL
                    state = STRIP_CODE;
                }
                break;

            case STRIP_STRING_ESC:
                // whatever follows a backslash is part of the string
                state = STRIP_STRING;
                break;

            case STRIP_CHAR:
                fputc(c, out);
                if (c == '\\')
                {
                    state = STRIP_CHAR_ESC;
                }
                else if (c == '\'' || c == '\n')
                {
                    // a character constant cannot span lines, so a newline
                    // also ends it; this limits the damage of a stray quote
                    state = STRIP_CODE;
                }
                break;

            case STRIP_CHAR_ESC:
                fputc(c, out);
                state = STRIP_CHAR;
                break;

            default:
                break;
        }
    }

    // a slash held back at the end of the input was ordinary code
    if (state == STRIP_SLASH)
    {
        fputc('/', out);
    }

    // verify that the comment or string was terminated properly
    if (state == STRIP_BLOCK_COMMENT || state == STRIP_BLOCK_STAR ||
        state == STRIP_STRING || state == STRIP_STRING_ESC)
    {
        fprintf(stderr, "Unterminated comment or string\n");
        return(-1);
    }

    return(0);
}
/*
 * Entry point: strips comments and strings from the file named by the single
 * command-line argument and writes the result to stdout.
 *
 * Returns 0 on success, and 1 on a usage error, when the file cannot be
 * opened, or when strip_c_code() reports a failure.
 */
int main(int argc, char * argv[])
{
    FILE * fs;
    int status;

    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s <filename>\n", argv[0]);
        return(1);
    }

    if ((fs = fopen(argv[1], "r")) == NULL)
    {
        perror("fopen()");
        return(1);
    }

    // report a stripping failure through the exit status instead of ignoring it
    status = (strip_c_code(fs, stdout) == 0) ? 0 : 1;

    fclose(fs);
    return(status);
}
/* end of source file */
我还在 Github 上发布了这段代码,以便于下载和编译: