我想使用 C# 将 html 转换为纯文本到目前为止我已经实现了这段代码:->
public static string HTMLToText(string HTMLCode)
{
if (HTMLCode == null)
return null;
// Remove new lines since they are not visible in HTML
HTMLCode = HTMLCode.Replace("\n", " ");
// Remove tab spaces
HTMLCode = HTMLCode.Replace("\t", " ");
// Remove multiple white spaces from HTML
HTMLCode = System.Text.RegularExpressions.Regex.Replace(HTMLCode, "\\s+", " ");
// Remove HEAD tag
HTMLCode = System.Text.RegularExpressions.Regex.Replace(HTMLCode, "<head.*?</head>", ""
, System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Singleline);
// Remove any JavaScript
HTMLCode = System.Text.RegularExpressions.Regex.Replace(HTMLCode, "<script.*?</script>", ""
, System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Singleline);
// Replace special characters like &, <, >, " etc.
StringBuilder sbHTML = new StringBuilder(HTMLCode);
// Note: There are many more special characters, these are just
// most common. You can add new characters in this arrays if needed
string[] OldWords = { " ", "&", """, "<", ">", "®", "©", "•", "™" };
string[] NewWords = { " ", "&", "\"", "<", ">", "®", "©", "•", "™" };
for (int i = 0; i < OldWords.Length; i++)
{
sbHTML.Replace(OldWords[i], NewWords[i]);
}
// Check if there are line breaks (<br>) or paragraph (<p>)
sbHTML.Replace("<br>", "\n");
sbHTML.Replace("<br ", "\n ");
sbHTML.Replace("<p> ", "\n ");
sbHTML.Replace("<p ", "\n ");
sbHTML.Replace("<span ", "\n ");
sbHTML.Replace("style", " ");
sbHTML.Replace("FONT", " ");
sbHTML.Replace("FONT-SIZE", " ");
sbHTML.Replace("</span> ", "\n");
// Finally, remove all HTML tags and return plain text
//Regex reg = new Regex("<[^>]+>", RegexOptions.IgnoreCase);
return System.Text.RegularExpressions.Regex.Replace(sbHTML.ToString(), @"</?[a-z][a-z0-9][^<>]>", "", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}
<p>.....<p>...</p>..</p>
但它并没有删除所有标签,并且像这样的递归标签也<p>
被删除了,但内部标签仍然是他们的......
他们有什么办法可以得到纯文本吗?