0

我正在验证 HTML 输入(来自 RSS 提要),以便在 MVC 视图中显示

我正在使用以下白名单方法来清理我的 html

// Matches anything that looks like an HTML tag: a '<', any run of characters other
// than '>', and then either the closing '>' or the end of the input (unterminated tag).
private static readonly Regex _tags = new Regex("<[^>]*(>|$)",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

// Attribute-free tags that are allowed as-is, in opening or closing form:
// b, blockquote, code, dd, dt, dl, del, em, h1-h3, i, kbd, u, li, ol, p, pre,
// s, sub, sup, strong, strike, ul - plus <br> / <hr> with an optional space and slash.
// The pattern is lower-case only; Sanitize lower-cases each tag before matching.
// IgnorePatternWhitespace lets the pattern span several lines.
private static readonly Regex _whitelist = new Regex(@"
^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|u|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
^<(b|h)r\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

// Allowed anchor tags: <a href="..."> where href is either "#<digits>" or an
// http/https/ftp URL built from a restricted character set, optionally followed
// by a title attribute; or the closing </a>.
private static readonly Regex _whitelist_a = new Regex(@"
^<a\s
href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
(\stitle=""[^""<>]+"")?\s?>$|
^</a>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

// Allowed image tags: <img src="http(s)://..."> followed by optional width, height
// (1-3 digits each), alt and title attributes - in exactly that order - and an
// optional self-closing slash.
private static readonly Regex _whitelist_img = new Regex(@"
^<img\s
src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
(\swidth=""\d{1,3}"")?
(\sheight=""\d{1,3}"")?
(\salt=""[^""<>]*"")?
(\stitle=""[^""<>]*"")?
\s?/?>$",
    RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);


/// <summary>
/// Strips every tag that is not on one of the whitelists from the supplied raw HTML,
/// leaving the "safe" tags and all text outside of tags untouched.
/// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
/// </summary>
/// <param name="html">Raw HTML to clean; may be null or empty.</param>
/// <returns>
/// The input with all non-whitelisted tags removed. A null or empty input is
/// returned unchanged.
/// </returns>
public static string Sanitize(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Every tag-like match is lower-cased and tested against the three whitelists.
    // A match that passes is written back verbatim (original casing); any other
    // match is replaced by nothing.
    return _tags.Replace(html, delegate(Match candidate)
    {
        string lowered = candidate.Value.ToLowerInvariant();

        bool allowed = _whitelist.IsMatch(lowered)
            || _whitelist_a.IsMatch(lowered)
            || _whitelist_img.IsMatch(lowered);

        return allowed ? candidate.Value : String.Empty;
    });
}

我还想允许使用 iFrames 或 html5 视频标签显示来自 Youtube 或 Vimeo 的视频内容

有人能为我指明方向,告诉我如何写出更灵活的正则表达式吗?

这是我对 iframe 的尝试

// Allowed iframe tags: <iframe src="http(s)://player.vimeo.com/..."> or
// <iframe src="http(s)://www.youtube.com/...">, followed by optional width, height,
// frameborder (1-3 digits each) and allowfullscreen - in that order - or </iframe>.
// The dots in the host names are escaped so that they only match a literal '.';
// unescaped, '.' matches any character and look-alike hosts such as
// "wwwxyoutube.com" would pass the whitelist.
// NOTE(review): the Sanitize method shown with the other whitelists does not test
// this regex; it still has to be added to that check to take effect.
private static readonly Regex _whitelist_iframe = new Regex(@"
             ^<iframe\s
            src=""https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+""
            (\swidth=""\d{1,3}"")?
            (\sheight=""\d{1,3}"")?
            (\sframeborder=""\d{1,3}"")?
            (\sallowfullscreen)?
            \s?>$|^</iframe>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);
4

1 回答 1

1

上面的 RegEx 方法太严格了,更不用说凯文的观点了!

这是我所做的:

使用html-agility-pack解析 Html,并按照此stackoverflow 答案中所述对其进行清理

我还添加了一些代码,用正则表达式检查图像或 iframe 的 src 属性。(我很确定它可以做得更好)

/// <summary>
/// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
/// Elements that are not on the whitelist are removed while their children are kept;
/// attributes that are not listed for an element are removed; an img or iframe whose
/// src does not match the expected URL pattern is removed as well.
/// </summary>
/// <remarks>
/// Sanitize keeps per-call working state in an instance field, so a single instance
/// must not be used from several threads at the same time.
/// NOTE(review): href values on a elements are not validated here, so links such as
/// "javascript:..." pass through unchanged. Restricting them would change which
/// links callers see, so it is flagged rather than silently changed.
/// </remarks>
public class HtmlSanitizer
{
    // Temporary element name given to every node that has to be stripped. It is
    // lower-case so that the name written to the intermediate markup, the name read
    // back after re-parsing and the name used in the XPath query are all identical.
    private const string RemovableNodeName = "removeablenode";

    // Allowed src for img: an absolute http(s) URL made of a restricted character set.
    private static readonly Regex _srcAttribute = new Regex(@"^https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Allowed src for iframe: an http(s) URL on player.vimeo.com or www.youtube.com.
    // The pattern is anchored at both ends and the dots in the host names are escaped;
    // without that, any value merely containing such a URL (for example a
    // "javascript:" URL) or a look-alike host would be accepted.
    private static readonly Regex _iframeSrc = new Regex(@"^https?://(player\.vimeo\.com|www\.youtube\.com)/[-a-z0-9+&@#/%?=~_|!:,.;\(\)|\s]+$", RegexOptions.Singleline | RegexOptions.IgnoreCase
                         | RegexOptions.ExplicitCapture | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Allowed element names mapped to their allowed attribute names;
    // a null value means the element may not carry any attribute.
    private readonly IDictionary<string, string[]> _whitelist;

    // Element names that have to be stripped after the current Sanitize call has
    // walked the document. Cleared at the start of every call.
    private readonly List<string> _deletableNodesXpath = new List<string>();

    /// <summary>
    /// Creates a sanitizer with the built-in element/attribute whitelist.
    /// </summary>
    public HtmlSanitizer()
    {
        _whitelist = new Dictionary<string, string[]>
                        {
                            {"a", new[] {"href", "target", "title"}},
                            {"img", new[] {"src", "alt", "width", "height"}},
                            {"iframe", new[] {"src", "width", "height", "frameborder", "allowfullscreen" }},
                            {"strong", null},
                            {"em", null},
                            {"blockquote", null},
                            {"b", null},
                            {"p", null},
                            {"ul", null},
                            {"ol", null},
                            {"li", null},
                            {"div", new[] {"align"}},
                            {"strike", null},
                            {"u", null},
                            {"sub", null},
                            {"sup", null},
                            {"table", null},
                            {"tr", null},
                            {"td", null},
                            {"th", null},
                            {"dd", null},
                            {"dt", null},
                            {"dl", null},
                            {"h1", null},
                            {"h2", null},
                            {"h3", null},
                        };
    }

    /// <summary>
    /// Removes all non-whitelisted elements and attributes from the given HTML.
    /// </summary>
    /// <param name="input">Raw HTML; may be null, empty or whitespace.</param>
    /// <returns>
    /// The sanitized markup, or an empty string when <paramref name="input"/> is
    /// null, empty or consists only of whitespace.
    /// </returns>
    public string Sanitize(string input)
    {
        if (string.IsNullOrWhiteSpace(input))
        {
            return string.Empty;
        }

        // Start from a clean slate: names collected by an earlier call must not
        // leak into the XPath built for this one.
        _deletableNodesXpath.Clear();

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);

        // First pass: drop disallowed attributes and rename disallowed elements.
        SanitizeNode(htmlDocument.DocumentNode);
        string xPath = CreateXPath();

        // Second pass: re-parse the rewritten markup and strip the renamed elements.
        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), xPath);
    }

    // Sanitizes every child of parentNode. Iterates backwards by index, as the
    // original implementation did.
    private void SanitizeChildren(HtmlNode parentNode)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    // Sanitizes a single node and then its descendants: a non-whitelisted element
    // is marked for removal, a whitelisted one has its attributes filtered.
    private void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!_whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                MarkForRemoval(node);
            }
            else if (node.HasAttributes)
            {
                SanitizeAttributes(node, allowedAttributes);
            }
        }

        // Children are always visited, also below a node marked for removal,
        // because they stay in the output when their parent is stripped.
        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

    // Removes every attribute of a whitelisted element that is not in
    // allowedAttributes (all of them when allowedAttributes is null) and marks an
    // img/iframe for removal when its src does not match the expected pattern.
    private void SanitizeAttributes(HtmlNode node, string[] allowedAttributes)
    {
        // Captured up front: MarkForRemoval renames the node, and the whitelist
        // must never be consulted again with the temporary name.
        string elementName = node.Name;

        for (int i = node.Attributes.Count - 1; i >= 0; i--)
        {
            HtmlAttribute currentAttribute = node.Attributes[i];

            if (allowedAttributes == null || !allowedAttributes.Contains(currentAttribute.Name))
            {
                node.Attributes.Remove(currentAttribute);
                continue;
            }

            if (currentAttribute.Name != "src")
            {
                continue;
            }

            Regex srcPattern = null;
            if (elementName == "img")
            {
                srcPattern = _srcAttribute;
            }
            else if (elementName == "iframe")
            {
                srcPattern = _iframeSrc;
            }

            if (srcPattern != null && !srcPattern.IsMatch(currentAttribute.Value ?? string.Empty))
            {
                // The whole element goes away, so its remaining attributes no
                // longer matter.
                MarkForRemoval(node);
                return;
            }
        }
    }

    // Renames the node to the temporary removable name and records that name
    // (once) so that CreateXPath can select it later.
    private void MarkForRemoval(HtmlNode node)
    {
        node.Name = RemovableNodeName;
        if (!_deletableNodesXpath.Contains(RemovableNodeName))
        {
            _deletableNodesXpath.Add(RemovableNodeName);
        }
    }

    // Parses html again and removes every node selected by xPath while keeping
    // the removed nodes' children in place. Returns the resulting markup.
    private string StripHtml(string html, string xPath)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);
        if (xPath.Length > 0)
        {
            // HtmlAgilityPack's SelectNodes returns null rather than an empty
            // collection when nothing matches.
            HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(xPath);
            if (invalidNodes != null)
            {
                foreach (HtmlNode node in invalidNodes)
                {
                    // Second argument true: keep the node's children.
                    node.ParentNode.RemoveChild(node, true);
                }
            }
        }
        return htmlDoc.DocumentNode.WriteContentTo();
    }

    // Builds an XPath union ("//name1|//name2|...") selecting every element name
    // recorded for removal; returns an empty string when there is none.
    private string CreateXPath()
    {
        return string.Join("|", _deletableNodesXpath.Select(name => "//" + name).ToArray());
    }
}
于 2012-05-28T17:13:27.157 回答