使用 DOM 有什么问题吗?它能让您更好地掌握信息所处的上下文,而且提取出的内容确实是 URL 的可能性也更高。
<?php
/**
 * Collect image URLs (gif, jpg/jpeg, png) from an HTML document.
 *
 * Two sources are scanned: the attribute values selected by the given XPath
 * expressions, and image-looking file names mentioned in the document text.
 *
 * @param string   $html    Raw HTML markup, e.g. the body of a cURL response.
 * @param string[] $queries XPath expressions selecting URL-bearing attributes.
 *
 * @return string[] Unique URLs in order of first appearance.
 */
function extractImageUrls($html, array $queries)
{
    // DOMDocument::loadHTML() rejects empty input (a warning on older PHP,
    // a ValueError on PHP 8), and there is nothing to extract anyway.
    if (trim((string) $html) === '') {
        return array();
    }

    // Real-world markup is rarely valid. Let libxml collect its parse
    // complaints internally instead of silencing everything with "@",
    // then restore whatever setting the caller had.
    $previousSetting = libxml_use_internal_errors(true);

    // loadHTML() is an instance method; calling it statically is a fatal
    // error on PHP 8, so the document is created explicitly.
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($html);

    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if (!$loaded) {
        return array();
    }

    // URLs are stored as array keys so duplicates collapse automatically.
    $urls = array();

    $xpath = new DOMXPath($dom);
    foreach ($queries as $query) {
        $nodes = $xpath->query($query);
        if ($nodes === false) {
            // Malformed XPath expression: skip it rather than iterate over false.
            continue;
        }

        foreach ($nodes as $link) {
            // Keep only attribute values that end in an image extension.
            if (preg_match('@\.(gif|jpe?g|png)$@', $link->textContent)) {
                $urls[$link->textContent] = true;
            }
        }
    }

    // Also pick up image file names that are merely mentioned in the text.
    if (preg_match_all('@\b[^\s]+\.(?:gif|jpe?g|png)\b@', $dom->textContent, $matches)) {
        // With the default PREG_PATTERN_ORDER, $matches[0] is the list of all
        // full-pattern matches; iterate that list so none of them is dropped.
        foreach ($matches[0] as $match) {
            $urls[$match] = true;
        }
    }

    return array_keys($urls);
}

// Sample markup standing in for the body of a cURL response.
$resultFromCurl = '
<html>
<body>
<img src="hello.jpg" />
<a href="yep.jpg">Yep</a>
<table background="yep.jpg">
</table>
<p>
Perhaps you should check out foo.jpg! I promise it
is safe for work.
</p>
</body>
</html>
';

// XPath expressions for the attributes that can contain URLs; the list is
// not guaranteed to be exhaustive.
$queries = array(
    '//table/@background',
    '//img/@src',
    '//input/@src',
    '//a/@href',
    '//area/@href',
    '//img/@longdesc',
);

$urls = extractImageUrls($resultFromCurl, $queries);
var_dump($urls);