我知道用正则表达式解析 HTML是不好的,并且它不能适用于所有情况(在 Stack Overflow 上有很多关于此的主题)。但我仍然想尝试使用基于白名单方法的正则表达式来清理 HTML。
我想向您展示我的代码(用 PHP 5.2 编写)。它似乎工作正常,但我仍然想知道是否存在安全问题。
那么,我有什么问题吗?
基本原理是使用 Html_Sanitizer::sanitize()
- 该函数首先用标记替换没有属性的允许标记。然后解析带有属性的标签并用令牌替换它们。
- 然后解析 HTML 标记以检测允许的属性(使用 cleanTag 函数)。因此,HTML 标记以(让希望)安全的方式重新构建。
- htmlspecialchars 用于确保剩余代码是干净的
- 令牌被替换为安全标签。
代码:
class Html_Sanitizer
{
const VALIDATOR_CSS_UNIT = '(([\+\-]?[0-9\.]+)(em|ex|px|in|cm|mm|pt|pc|\%))|0';
const VALIDATOR_URL = 'http://\\S+';
const VALIDATOR_CSS_PROPERTY = '[a-z\-]+';
const VALIDATOR_STYLE = '[^"]*';
protected static $_tags = 'a|b|blockquote|br|cite|d[ldt]|h[1-6]|i|img|li|ol|p|span|strong|u|ul';
protected static $_attributes = array(
'img' => array(
'width' => '[0-9]+',
'height' => '[0-9]+',
'src' => self::VALIDATOR_URL,
'style' => self::VALIDATOR_STYLE
),
'span' => array(
'style' => self::VALIDATOR_STYLE
),
'p' => array(
'style' => self::VALIDATOR_STYLE
),
'a' => array(
'href' => self::VALIDATOR_URL
)
);
protected static $_styleValidators = array(
'color' => '(\#[a-fA-F0-9]+)|([a-z ]+)',
'background-color' => '\#[a-zA-Z0-9]+',
'font-style' => '(normal|italic|oblique)',
'font-size' => '[\-a-z]+',
'margin-left' => self::VALIDATOR_CSS_UNIT,
'margin-right' => self::VALIDATOR_CSS_UNIT,
'text-align' => '(left|right|center|justify)',
'text-indent' => self::VALIDATOR_CSS_UNIT,
'text-decoration' => '(none|overline|underline|blink|line-through)',
'width' => self::VALIDATOR_CSS_UNIT,
'height' => self::VALIDATOR_CSS_UNIT
);
public static function sanitize($str)
{
$tokens = array();
//tokenize opening tags with no attributes
$pattern = '#<(/)?('. self::$_tags .')>#';
$replace = '__SAFE_TAG_$1$2__';
$str = preg_replace($pattern, $replace, $str);
// tokenize tags with attributes
$pattern = '#<('. self::$_tags .')(?:\s+(?:[a-z]+)="(?:[^"\\\]*(?:\\\"[^"\\\]*)*)")*\s*(/)?>#';
preg_match_all($pattern, $str, $matches, PREG_SET_ORDER);
foreach($matches as $i => $match) {
$tokens[$i] = self::cleanTag($match[1], $match[0]);
$str = str_replace($match[0], '__SAFE_TOKEN_'.$i.'__', $str);
}
$str = htmlspecialchars($str);
foreach ($tokens as $i => $cleanTag) {
$str = str_replace('__SAFE_TOKEN_'.$i.'__', $cleanTag, $str);
}
$pattern = '#__SAFE_TAG_(/?(?:'. self::$_tags .'))__#';
$replace = '<$1>';
$str = preg_replace($pattern, $replace, $str);
return $str;
}
public static function cleanTag($tag, $str)
{
$cleanTag = '<' . $tag;
if ($tag === 'a') {
$cleanTag .= ' rel="nofolow" target="_blank"';
}
if (isset(self::$_attributes[$tag])) {
foreach(self::$_attributes[$tag] as $attr => $attrPattern) {
$pattern = '#'.$attr.'="('. $attrPattern .')"#';
preg_match($pattern, $str, $match);
if (isset($match[1])) {
if ($attr == 'style') {
$cleanTag .= ' style="' . self::cleanStyle($match[1]) . '"';
} else {
$cleanTag .= ' ' . $attr . '="' . $match[1] . '"';
}
}
}
}
if ($tag === 'img') {
$cleanTag .= ' /';
}
$cleanTag .= '>';
return $cleanTag;
}
public static function cleanStyle($style)
{
$cleanStyle = '';
foreach(self::$_styleValidators as $stl => $stlPattern) {
$pattern = '#[; ]?' . $stl . '\s*:\s*(' . $stlPattern . ')\s*;#i';
preg_match($pattern, $style, $match);
if (isset($match[1])) {
$cleanStyle .= ($cleanStyle ? ' ' : '') . $stl . ':' . $match[1] . ';';
}
}
return $cleanStyle;
}
}