Readability.php 本身工作得很好,但我发现,如果改用 cURL 获取 HTML 内容并伪装用户代理(User-Agent),成功率会更高。还可以让 cURL 自动跟随重定向,以防要访问的 URL 出现问题。下面是我目前使用的代码,它在另一篇帖子("PHP cURL 跟随重定向")的基础上稍作修改。希望对你有用。
/**
 * Downloads the body of a URL with cURL, presenting a browser-like
 * User-Agent and following redirects.
 *
 * The URL is trimmed, URL-decoded, and any HTML-escaped ampersands
 * ("&amp;") are turned back into plain "&" before the request is made.
 * A temporary cookie jar is used for the duration of the request and is
 * removed again before returning.
 *
 * @param string $url Address to fetch; may contain "&amp;" entities.
 *
 * @return string|false The response body, or false if the transfer failed.
 */
function getData($url) {
    // URLs copied out of HTML often carry "&amp;" instead of "&".
    $url = str_replace('&amp;', '&', urldecode(trim($url)));
    $timeout = 5;

    // Scratch file that receives cookies set during the transfer.
    $cookie = tempnam('/tmp', 'CURLCOOKIE');

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1');
    curl_setopt($ch, CURLOPT_URL, $url);
    if ($cookie !== false) {
        // tempnam() returns false on failure; only configure the jar when a file exists.
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
    }
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // Empty string asks cURL to advertise every encoding it supports and decode the reply.
    curl_setopt($ch, CURLOPT_ENCODING, '');
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

    $content = curl_exec($ch);
    curl_close($ch);
    // The cookie jar is written when the handle is destroyed. On PHP 8+
    // curl_close() no longer does that, so drop the last reference first;
    // otherwise the file would be re-created after it is deleted below.
    unset($ch);

    // Remove the scratch cookie file so repeated calls do not pile up temp files.
    if ($cookie !== false && is_file($cookie)) {
        unlink($cookie);
    }

    return $content;
}
执行:
// Placeholder value: replace with the address of the page to process.
$url = 'http://';

// Fetch through getData() (cURL) rather than file_get_contents($url).
$html = getData($url);
if ($html === false) {
    // getData() returns false when the cURL transfer fails; stop here
    // instead of handing a non-string to Tidy and Readability.
    throw new \RuntimeException('Failed to fetch ' . $url);
}

// Clean up the markup with Tidy when the extension is available.
if (function_exists('tidy_parse_string')) {
    $tidy = tidy_parse_string($html, array(), 'UTF8');
    // tidy_parse_string() can return false; keep the raw HTML in that case.
    if ($tidy !== false) {
        $tidy->cleanRepair();
        $html = $tidy->value;
    }
}

$readability = new Readability($html, $url);
//...