所有关于使用正则表达式解析 HTML 的常见警告都适用。
#include <boost/regex.hpp>
#include <iostream>
#include <string>
int main()
{
boost::regex double_br("<br/?>[ \\r\\n\\s]*<br/?>", boost::regex::icase);
boost::regex fonts("</?font[^>]*>", boost::regex::icase);
std::string cases[] = {
"foo<br><br>bar",
"one<br/><br>two",
"a<br> <br/>b",
"a<br><br>c<br/><br/>d",
"<font attr=\"value\">w00t!</font>",
"<font attr=\"value\">hello</font><font>bye</font>",
""
};
for (std::string *s = cases; *s != ""; ++s) {
std::cout << *s << ":\n";
std::string result;
result = boost::regex_replace(*s, double_br, "</p><p>");
result = boost::regex_replace(result, fonts, "");
std::cout << " - [" << result << "]\n";
}
return 0;
}
输出:
foo<br><br>bar:
- [foo</p><p>bar]
one<br/><br>two:
- [one</p><p>two]
a<br> <br/>b:
- [a</p><p>b]
a<br><br>c<br/><br/>d:
- [a</p><p>c</p><p>d]
<font attr="value">w00t!</font>:
- [w00t!]
<font attr="value">hello</font><font>bye</font>:
- [hellobye]