如果使用得当,XML::LibXML 在这类清理工作上也可能出人意料地好用。它的速度也非常快;一旦越过了学习曲线,你会发现它功能深入而且灵活。
#!/usr/bin/env perl
# Demonstrates cleaning up messy HTML fragments with XML::LibXML.
#
# Each fragment is parsed with the forgiving HTML parser, any <style>
# elements found inside <body> are removed, and the remaining children of
# <body> are printed below the original fragment for comparison.
use strictures;
use XML::LibXML;

# Sample fragments: one with an inline <style> block, one with a stray
# <html> document nested inside a <div>.
my @craptastic = (
    '<div class="article"><style>@font-face{ font-family: "Cambria" }</style>Article starts here</div>',
    '<div class="highlight"><html><head></head><body><p>Note that ...</p></html></div>',
);

# The inline setting of recover_silently is broken/non-functional so
# we do the method calls to set.
my $parser = XML::LibXML->new();
$parser->recover_silently(1);
$parser->keep_blanks(1);

for my $crap (@craptastic) {
    my $doc = $parser->load_html( string => $crap );

    # Optional example for killing style tags not in the <head/>
    $_->parentNode->removeChild($_) for $doc->findnodes("//body//style");

    # Echo the original fragment, then a separator before the cleaned form.
    print $/, $crap, $/;
    my ($body) = $doc->findnodes("//body");
    print "-" x 60, $/;

    # findnodes() may return nothing for a fragment the parser could not
    # build a <body> from; skip it rather than calling a method on undef.
    if ( defined $body ) {
        print $_->serialize(1) for $body->childNodes;
    }
    else {
        warn "No <body> element found; nothing to print for this fragment\n";
    }
    print $/, $/;
}
输出如下:
<div class="article"><style>@font-face{ font-family: "Cambria" }</style>Article starts here</div>
------------------------------------------------------------
<div class="article">Article starts here</div>
<div class="highlight"><html><head></head><body><p>Note that ...</p></html></div>
------------------------------------------------------------
<div class="highlight">
<p>Note that ...</p>
</div>