/**
 * Filter an HTML string, keeping only the elements whose tag name is listed in $allowed.
 *
 * Every element returned by find('*') on the parsed document is handled as follows:
 *  - allowed tag that contains markup: its inner HTML is filtered recursively;
 *  - disallowed tag that contains markup: the element is replaced by its recursively
 *    filtered inner HTML;
 *  - disallowed tag that holds only text, or any element whose plain text is
 *    blank: the element is removed.
 *
 * Note: the function recurses once per nesting level that contains markup.
 *
 * @param string $string  Input HTML
 * @param array  $allowed Tag names to keep; compared strictly (===) against each element's tag
 * @return bool|simple_html_dom The filtered DOM, or false when str_get_html() did not return a DOM object
 */
function crl_parse_html($string, $allowed = array())
{
    // String --> DOM elements. Bail out with false instead of calling a
    // method on a non-object when the parser rejects the input.
    $dom = str_get_html($string);
    if (!is_object($dom)) {
        return false;
    }

    foreach ($dom->find('*') as $child) {
        // Evaluate the shared conditions once per element.
        $inner = $child->innertext;
        // True when the inner HTML contains at least one tag-like token.
        $hasMarkup = preg_match('/<[^<]+?>/is', $inner) === 1;
        // Strict comparison so non-string entries in $allowed cannot match by type juggling.
        $isAllowed = in_array($child->tag, $allowed, true);

        if ($hasMarkup && $isAllowed) {
            // Kept element with nested markup: filter its content recursively.
            $child->innertext = crl_parse_html($inner, $allowed);
        } elseif ($hasMarkup) {
            // Dropped element with nested markup: remove a plain-text run that
            // directly precedes an inner opening tag, then replace the whole
            // element by its filtered content.
            $stripped = preg_replace('/(?<=^|>)[^><]+?(?=<|$)(<[^\/]+?>.+)/is', '$1', $inner);
            if ($stripped !== null) {
                // preg_replace() yields null on a PCRE error; keep the inner HTML untouched then.
                $child->innertext = $stripped;
            }
            $child->outertext = crl_parse_html($child->innertext, $allowed);
        } elseif (
            // Dropped element holding only plain text ...
            (!$isAllowed && preg_match('/(?<=^|>)[^><]+?(?=<|$)/is', $inner) === 1)
            // ... or any element whose plain text is blank.
            || trim($child->plaintext) === ''
        ) {
            $child->outertext = '';
        }
    }

    return $dom;
}
这是我最终实现的解决方案。我把它发布在这里,供有需要的人参考,并以此结束这个问题。
注意:这个函数使用了递归,因此输入数据过大时会成为严重问题。决定使用此函数前请仔细考虑。