补充一点我的看法,因为这里的方法都没有满足我的需求(删除一组给定的标签,如 p 和 div,并在保留内部标签的同时正确处理嵌套)。
下面是我想出的方案,它通过了我所有的单元测试,并覆盖了我认为需要处理的大多数情况:
// Parse the incoming markup into a DOM tree.
var doc = new HtmlDocument();
doc.LoadHtml(html);

// Select every descendant whose tag name appears in tagNames, then visit that
// sequence back to front (the reverse of the order Descendants() yields).
var matchingNodes = doc.DocumentNode
    .Descendants()
    .Where(candidate => tagNames.Contains(candidate.Name))
    .Reverse();

// Unwrap each selected element: hoist its children into its parent at the
// position the element occupied, then drop the element itself.
foreach (var target in matchingNodes)
{
    var parent = target.ParentNode;

    if (target.PreviousSibling != null)
    {
        // There is a node in front of the element: place each child after it.
        // PreviousSibling is deliberately re-read on every pass instead of cached.
        // NOTE(review): this presumably relies on InsertAfter updating sibling
        // links so the children keep their order - confirm against HtmlAgilityPack.
        foreach (HtmlNode child in target.ChildNodes)
        {
            parent.InsertAfter(child, target.PreviousSibling);
        }
    }
    else
    {
        // The element is its parent's first child: push the children onto the
        // front of the parent, walking them in reverse so they end up in order.
        foreach (HtmlNode child in target.ChildNodes.Reverse())
        {
            parent.PrependChild(child);
        }
    }

    // Detach the now-unwrapped element from the tree.
    target.Remove();
}

// Serialize the transformed document and trim surrounding whitespace.
html = doc.DocumentNode.WriteContentTo().Trim();
以下是我用来测试的案例:
/// <summary>
/// Stripping "p" from a single paragraph element should leave only its text.
/// </summary>
[TestMethod]
public void StripTags_CanStripSingleTag()
{
    const string markup = "<p>tag</p>";

    string stripped = HtmlUtilities.StripTags(markup, "p");

    Assert.AreEqual("tag", stripped);
}
/// <summary>
/// Stripping "p" from a paragraph nested inside another paragraph should leave
/// the text of both levels.
/// </summary>
[TestMethod]
public void StripTags_CanStripNestedTag()
{
    const string markup = "<p>tag <p>inner</p></p>";

    string stripped = HtmlUtilities.StripTags(markup, "p");

    Assert.AreEqual("tag inner", stripped);
}
/// <summary>
/// Stripping "p" and "div" from two sibling top-level elements should leave
/// their text separated by the original whitespace.
/// </summary>
[TestMethod]
public void StripTags_CanStripTwoTopLevelTags()
{
    const string markup = "<p>tag</p> <div>block</div>";

    string stripped = HtmlUtilities.StripTags(markup, "p", "div");

    Assert.AreEqual("tag block", stripped);
}
/// <summary>
/// Stripping "p" and "div" should unwrap a div nested inside a paragraph.
/// </summary>
[TestMethod]
public void StripTags_CanStripMultipleNestedTags_2LevelsDeep()
{
    const string markup = "<p>tag <div>inner</div></p>";

    string stripped = HtmlUtilities.StripTags(markup, "p", "div");

    Assert.AreEqual("tag inner", stripped);
}
/// <summary>
/// Stripping "p" and "div" should unwrap a p-in-div-in-p nesting, keeping the
/// text of all three levels in order.
/// </summary>
[TestMethod]
public void StripTags_CanStripMultipleNestedTags_3LevelsDeep()
{
    const string markup = "<p>tag <div>inner <p>superinner</p></div></p>";

    string stripped = HtmlUtilities.StripTags(markup, "p", "div");

    Assert.AreEqual("tag inner superinner", stripped);
}
/// <summary>
/// Stripping "p" and "div" should handle two top-level elements that each
/// contain nested strippable tags, keeping all text in document order.
/// </summary>
[TestMethod]
public void StripTags_CanStripTwoTopLevelMultipleNestedTags_3LevelsDeep()
{
    const string markup = "<p>tag <div>inner <p>superinner</p></div></p> <div><p>inner</p> toplevel</div>";

    string stripped = HtmlUtilities.StripTags(markup, "p", "div");

    Assert.AreEqual("tag inner superinner inner toplevel", stripped);
}
/// <summary>
/// Tags that are not named in the call must survive: first an inner anchor,
/// then an outer wrapper element, while "p" and "div" are still removed.
/// </summary>
[TestMethod]
public void StripTags_IgnoresTagsThatArentSpecified()
{
    // Scenario 1: an unlisted tag nested inside the stripped tags is kept.
    const string innerMarkup = "<p>tag <div>inner <a>superinner</a></div></p>";
    string innerResult = HtmlUtilities.StripTags(innerMarkup, "p", "div");
    Assert.AreEqual("tag inner <a>superinner</a>", innerResult);

    // Scenario 2: an unlisted tag wrapping the stripped tags is kept.
    const string outerMarkup = "<wrapper><p>tag <div>inner</div></p></wrapper>";
    string outerResult = HtmlUtilities.StripTags(outerMarkup, "p", "div");
    Assert.AreEqual("<wrapper>tag inner</wrapper>", outerResult);
}
/// <summary>
/// Stripping "p" and "br" should remove both the unclosed and the self-closing
/// form of the br tag, leaving only the paragraph text.
/// </summary>
[TestMethod]
public void StripTags_CanStripSelfClosingAndUnclosedTagsLikeBr()
{
    const string markup = "<p>tag</p><br><br/>";

    string stripped = HtmlUtilities.StripTags(markup, "p", "br");

    Assert.AreEqual("tag", stripped);
}
它可能无法处理所有事情,但它可以满足我的需要。