嘿,试图运行 curl 脚本来抓取网站,但它一直超时。它在本地工作,但不在此服务器上。据我所知,一切都设置正确。
请注意,我已将此代码示例的站点全名替换为 XXXXX。
<?php
error_reporting(E_ALL);
$url = "http://xxxxx.eu/search?f=b" ;
$header = getHeaders(0);
$request = array();
$request['url'] = $url ;
$request['method'] = 'get' ;
$request['header'] = $header ;
echo getPageCURL($request);
die();
function getHeaders($content_length=0)
{
$header = array();
$header[] = "Host: xxxxx.eu";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Accept-Encoding: gzip, deflate";
$header[] = "User-Agent: Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0";
$header[] = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
if($content_length>10) $header[] = "Content Size: ".$content_length;
$header[] = "Content-Type: text/html; charset=utf-8";
$header[] = "Content-Encoding: gzip";
$header[] = "Vary: Accept-Encoding";
$header[] = "Content-Length: 22";
$header[] = "Connection: keep-alive";
return $header;
}
function getPageCURL($request)
{
$page = '';
$verified = '';
$page_type = 'O';
$filter = true;
$page_header = 0;
$followlocation = true;
$cookies = true;
$bad_url = false;
if(is_array($request))
{
foreach($request as $request_key=>$request_value)
$$request_key = $request_value;
}
else $url = $request;
if(empty($url)) return '';
$url = str_replace(' ', '+', $url);
//echo "\nCalling : ".$url;
for($i=1; $i<=10; $i++)
{
$curl = curl_init();
if(isset($header) && is_array($header))
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
if(isset($referer) && !empty($referer))
curl_setopt($curl, CURLOPT_REFERER, $referer);
if(isset($ssl))
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
if(isset($method) && $method=='post')
{
curl_setopt($curl, CURLOPT_POST, true);
if(isset($post_data) && $post_data!='')
{
curl_setopt($curl, CURLOPT_POSTFIELDS, $post_data);
}
}
if($cookies)
{
curl_setopt($curl, CURLOPT_COOKIEFILE, 'cookie.txt');
curl_setopt($curl, CURLOPT_COOKIEJAR, 'cookie.txt');
}
curl_setopt($curl, CURLOPT_TIMEOUT, 120);
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
curl_setopt($curl, CURLOPT_HEADER, $page_header);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, $followlocation);
$page = curl_exec($curl);
if(($page===false || trim($page) == '' || empty($page)) && (curl_errno($curl) == 6 || curl_errno($curl) == 7))
{
curl_close ($curl);
//echo "\nNetwork problem...";
sleep(10);
$i--;
continue;
}
else if(curl_errno($curl) == 6)
{
file_put_contents('cu_failed-'.time().'.txt', $page, FILE_APPEND);
}
curl_close ($curl);
}
$page = str_replace(array("\n", "\r", "\t"), " ", $page);
return $page;
}
?>
如果我尝试在有问题的网站上做一个基本的 wget,我会得到这个响应:
root@vps38132:~# wget http://xxxxx.eu
--2013-10-31 10:30:02-- http://xxxxx.eu/
Resolving xxxxxx.eu (xxxxxx.eu)... 31.7.58.171, 31.7.58.172, 31.7.58.170
Connecting to xxxxxx.eu (xxxxxx.eu)|31.7.58.171|:80...
有任何想法吗 ?我能够 curl/wget 其他站点,而不是这个,发生了什么?