我知道它不是用 C 语言编写的,但这份演示文稿或许能为您提供一些关于如何高效解决该问题的思路。
https://web.archive.org/web/20120115060003/http://cuddle.googlecode.com/hg/talk/lex.html#landing-slide
我还根据您最初的需求,用 JavaScript 编写了一个非常简单的解析器示例(同样不是 C,但希望您也了解 JS)。所谓"非常简单",是指它不会解析任何属性,也不会处理自闭合标签,以及 HTML 规范要求处理的许多其他情况。它将以如下格式生成解析树:
{
cn: [{
tag: 'html',
cn: [{
tag: 'body',
cn: [
{ tag: 'h1', cn: ['test'] },
' some text ',
...
]
}]
}]
}
以下是代码和对应的 JSFiddle 演示:http://jsfiddle.net/LUpyZ/3/
请注意,空白字符不会被忽略,而是会被保留在文本节点中。
// Sample markup fed to the parser below. The spaces around "some text" and
// before <p> are deliberate: the parser keeps whitespace, so they end up in
// text nodes of the resulting tree.
var html = '<html><body><h1>test</h1> some text <div> <p>text</p></div></body></html>';
/**
 * Parse a very small subset of HTML into a tree of nested nodes.
 *
 * Supported syntax: opening tags (`<name>`), closing tags (`</name>`) and
 * text. Attributes, self-closing tags, comments, entities and everything else
 * in the HTML specification are NOT handled. Whitespace is preserved and
 * captured in text nodes.
 *
 * The result is a root object `{ cn: [...] }`. Each child is either a string
 * (a text node) or an element `{ tag, parentNode, cn }`, where `parentNode`
 * points back at the containing node (so the tree is cyclic).
 *
 * All parser state is local to the call, so the function can be invoked any
 * number of times with different inputs.
 *
 * @param {string} html - Markup to parse.
 * @returns {{cn: Array}} Root of the parse tree.
 * @throws {TypeError} If `html` is not a string.
 * @throws {Error} On an invalid or empty tag name, a closing tag that does not
 *     match the currently open element, input that ends inside a tag, or an
 *     element that is never closed.
 */
function parseHTML(html) {
    if (typeof html !== 'string') {
        throw new TypeError('parseHTML expects a string');
    }

    var tagNameCharRx = /\w/,
        parseTree = { cn: [] },
        currentNode = parseTree,
        stateFn = parseText,   // current state of the state machine
        text = '',             // pending text-node content
        tag = '',              // tag name being accumulated
        len = html.length,
        i;

    // Reject any character that may not appear in a tag name.
    function assertValidTagNameChar(c) {
        if (!tagNameCharRx.test(c)) {
            throw new Error('Invalid tag name char at ' + i);
        }
    }

    // Append the pending text (if any) to the current node and reset it.
    function flushText() {
        if (text) {
            currentNode.cn.push(text);
            text = '';
        }
    }

    // State: just consumed '<'. Decide between an opening and a closing tag.
    function parseTag(token) {
        if (token === '/') {
            return parseCloseTag;
        }
        // The token is already the first character of an opening tag name, so
        // hand it straight to that state instead of rewinding the input.
        return parseOpenTag(token);
    }

    // State: inside '</name'. On '>' the name must match the open element.
    function parseCloseTag(token) {
        if (token === '>') {
            // At the root, currentNode.tag is undefined, so a stray closing
            // tag is rejected here as well.
            if (currentNode.tag !== tag) {
                throw new Error('Wrong closed tag at char ' + i);
            }
            tag = '';
            currentNode = currentNode.parentNode;
            return parseText;
        }
        assertValidTagNameChar(token);
        tag += token;
        return parseCloseTag;
    }

    // State: inside '<name'. On '>' create the element and descend into it.
    function parseOpenTag(token) {
        var node;
        if (token === '>') {
            if (!tag) {
                throw new Error('Empty tag name at char ' + i);
            }
            node = { tag: tag, parentNode: currentNode, cn: [] };
            currentNode.cn.push(node);
            currentNode = node;
            tag = '';
            return parseText;
        }
        assertValidTagNameChar(token);
        tag += token;
        return parseOpenTag;
    }

    // State: plain text. '<' ends the text node and starts a tag.
    function parseText(token) {
        if (token === '<') {
            flushText();
            return parseTag;
        }
        text += token;
        return parseText;
    }

    for (i = 0; i < len; i++) {
        stateFn = stateFn(html[i]);
    }

    // Any state other than parseText means the input stopped mid-tag.
    if (stateFn !== parseText) {
        throw new Error('Unexpected end of input inside a tag at char ' + i);
    }

    // Text after the last tag (or input with no tags at all) is still pending.
    flushText();

    if (currentNode !== parseTree) {
        throw new Error('Unbalanced tags: ' + currentNode.tag + ' is never closed.');
    }
    return parseTree;
}
// Parse the sample markup and print the resulting tree to the console.
// Element nodes hold a parentNode back-reference, so the logged structure is
// cyclic.
console.log(parseHTML(html));