您的方法看起来已经足够好了。我只会再给某些标签/属性分配权重,并用 XPath 查询按权重依次检查它们,直到找到一个存在且内容不为空的结果为止。类似这样:
# Pseudocode: walk the images in document order and return the first usable label.
# XPath positions are 1-based, so the counter starts at 1 (index 0 never matches).
# (//img)[i] is the i-th image of the whole document; //img[i] would instead mean
# "every img that is the i-th img child of its own parent".
i = 1
while ((//img)[i][@src])
    if ((//img)[i][@alt])
        return alt
    else if ((//img)[i][@description])
        return description
    else if ((//img)[i]/../p[1])
        return p
    i++
# No image provided a label: fall back to the page title.
return (//title)
一个简单的 XPath 示例(从我的框架移植的函数):
/**
 * Parse HTML (or reuse an already parsed document) and optionally run an XPath query on it.
 *
 * @param mixed       $html  HTML markup as a string, or an object exposing xpath()
 *                           (the string branch passes a SimpleXMLElement here on recursion).
 * @param string|null $xpath XPath expression to evaluate; when null the parsed document is returned.
 *
 * @return mixed The result of xpath() when $xpath is given, the document object otherwise,
 *               or false when $html is neither an object nor a parsable, non-empty string.
 */
function ph_DOM($html, $xpath = null)
{
    if (is_object($html) === true)
    {
        if (isset($xpath) === true)
        {
            $html = $html->xpath($xpath);
        }

        return $html;
    }

    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8, a warning before),
    // so empty strings are treated like any other unparsable input.
    if (is_string($html) !== true || $html === '')
    {
        return false;
    }

    $dom = new DOMDocument();

    // Buffer parser diagnostics instead of emitting warnings for sloppy real-world markup.
    // The previous setting is remembered so this global flag does not leak to the caller.
    $previous = libxml_use_internal_errors(true);

    try
    {
        // NOTE(review): mb_html_entities() is a framework helper not visible here; presumably it
        // turns multibyte characters into HTML entities so libxml does not misread the encoding.
        $loaded = $dom->loadHTML(ph()->Text->Unicode->mb_html_entities($html));
    }
    finally
    {
        // Drop the buffered diagnostics (nobody reads them) and restore the caller's setting.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    if ($loaded === true)
    {
        // Re-enter through the object branch; a failed import is not an object and yields false.
        return ph_DOM(simplexml_import_dom($dom), $xpath);
    }

    return false;
}
以及实际用法:
// Fetch the page once and parse it once; ph_DOM() returns an object as-is, so passing the
// parsed document back in runs each query without re-parsing the markup.
// If the download or the parse fails, $doc is false and every query below yields false too.
$html = file_get_contents('http://en.wikipedia.org/wiki/Photography');
$doc = ph_DOM($html);

print_r(ph_DOM($doc, '//img')); // all images
print_r(ph_DOM($doc, '//img[@src]')); // only the images that have a src attribute
print_r(ph_DOM($doc, '//img[@src]/..')); // the parent elements of images that have a src
print_r(ph_DOM($doc, '//img[@src]/../..')); // their grandparent elements, and so on...
print_r(ph_DOM($doc, '//title')); // the title element of the page