I am using this class for multi-threaded cURL file downloads;
when I download the file contents on my local machine, the files end up empty;
what am I missing?
<?php
/**
 * Crawler class file.
 *
 * CRAWLER
 */
class CrawlerCommand extends CConsoleCommand {

    private $instance_crawler_id;
    private $instance_crawler_url_limit = 10;
    public $multi_exec_curl_files = array();
    public $collection = array();

    public static $userAgents = array(
        'FireFox3' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pl; rv:1.9) Gecko/2008052906 Firefox/3.0',
        'GoogleBot' => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'IE7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Netscape' => 'Mozilla/4.8 [en] (Windows NT 6.0; U)',
        'Opera' => 'Opera/9.25 (Windows NT 6.0; U; en)'
    );
    public static $options = array(
        CURLOPT_USERAGENT => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        CURLOPT_AUTOREFERER => true,
        CURLOPT_FOLLOWLOCATION => true,
        // RETURNTRANSFER must stay enabled: curl_multi_getcontent() only
        // returns the body when the transfer is buffered instead of echoed
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FRESH_CONNECT => true,
        CURLOPT_COOKIEJAR => 'cookies.txt',
        CURLOPT_COOKIEFILE => 'cookies.txt',
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_TIMEOUT => 10,
        //CURLOPT_COOKIESESSION => false,
    );
    /**
     * Executes the command.
     * @param array $args command line parameters for this command.
     */
    public function run($args) {
        // print a batch of blank lines to clear the console output
        for ($i = 0; $i < 50; $i++) {
            echo PHP_EOL;
        }
        // check whether we need to create a new crawler instance; if all HTML
        // sources for the active URLs have already been downloaded today,
        // there is no point in recrawling
        $day = date('d');
        $sql = "select * from `url` where `instance_crawler_day`!='$day' and `status`='1' order by `id` asc limit $this->instance_crawler_url_limit;";
        $cmd = Yii::app()->db->createCommand($sql);
        $rows = $cmd->queryAll();
        $actual_files = count($rows);
        // are there any files left to download today?
        if ($actual_files > 0) {
            // create a new unique crawler instance
            $model_instance_crawler = new InstanceCrawler();
            $model_instance_crawler->day = $day;
            $model_instance_crawler->month = date('m');
            $model_instance_crawler->year = date('Y');
            if ($model_instance_crawler->save()) {
                // lock a batch of URLs ($instance_crawler_url_limit at a time) for this crawler instance to download
                $sql = "update `url` set `instance_crawler_id`='$model_instance_crawler->id', `instance_crawler_day`='$model_instance_crawler->day' where `instance_crawler_day`!='$model_instance_crawler->day' and `status`='1' limit $this->instance_crawler_url_limit;";
                $cmd = Yii::app()->db->createCommand($sql);
                $cmd->query();
                $robots_txt = new RobotsTXT();
                $robots_txt->load_robots_from_db();
                $time = strtotime('-60 days');
                $sql = "SELECT u.*
                        FROM url AS u
                        JOIN product p ON p.`url_id` = u.id
                        JOIN product_follower pf ON pf.`product_id` = p.`id` AND pf.`created`>:time
                        JOIN `user` us ON us.id = pf.`user_id`
                        WHERE us.`status` = 1
                        GROUP BY u.id
                        ORDER BY `u`.`website_id` ASC";
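                // NOTE: the query above is built but never executed; the
                // assignment below overwrites $sql (and :time is never bound)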
                // select the rows that belong to the new crawler instance
                $sql = "select * from `url` where `instance_crawler_id`='$model_instance_crawler->id' and `instance_crawler_day`='$day' and `status`='1' order by `id` asc;";
                $cmd = Yii::app()->db->createCommand($sql);
                $rows = $cmd->queryAll();
                if (count($rows) > 0) {
                    // group the URLs by website
                    foreach ($rows as $row) {
                        $this->collection[$row['website_id']]['items'][] = $row;
                    }
                    // reindex the collection numerically and add a per-website download counter
                    $collection2 = array();
                    foreach (array_keys($this->collection) as $key) {
                        $this->collection[$key]['urls'] = 0;
                        $collection2[] = $this->collection[$key];
                    }
                    $this->collection = $collection2;
                    $collection2 = null;
                    $this->processCollection();
                    // on dev, comment this out
                    $this->multiCurl($this->multi_exec_curl_files);
                    //CurlTool::downloadFile($url, $fileName, $fields = null, $verbose = false);
                }
            }
        } else {
            echo 'There are no files left to download today. Come back tomorrow.' . PHP_EOL;
        }
        echo PHP_EOL . 'DONE' . PHP_EOL;
    }
    /**
     * Provides the command description.
     * This method may be overridden to return the actual command description.
     * @return string the command description. Defaults to 'Usage: php entry-script.php command-name'.
     */
    public function getHelp() {
        return 'Usage: how to use this command';
    }
    /**
     * Returns true while at least one website still has URLs left to queue.
     */
    private function checkCounters() {
        foreach ($this->collection as $value) {
            if ($value['urls'] < count($value['items'])) {
                return true;
            }
        }
        return false;
    }
    public function processCollection() {
        // queue the URLs round-robin across websites, one URL per website per
        // pass, so requests to the same host are spread out
        while ($this->checkCounters()) {
            foreach ($this->collection as $key => $value) {
                if ($value['urls'] < count($value['items'])) {
                    echo 'downloading file: ' . $value['items'][$value['urls']]['id'] . '.html' . PHP_EOL;
                    // prepare the array for the multi-threaded cURL download
                    $this->multi_exec_curl_files[$value['items'][$value['urls']]['id']] = array('link' => $value['items'][$value['urls']]['link']);
                    //CurlTool::downloadFile($value['items'][$value['urls']]['link'], ($value['items'][$value['urls']]['id']) . '.html');
                    $this->collection[$key]['urls']++;
                }
            }
        }
    }
    public function multiCurl($res, $options = null) {
        if (empty($res)) {
            return false;
        }
        $handles = array();
        if (!$options) { // fall back to the default options
            $options = self::$options;
        }
        // create one cURL handle per URL and apply the options to it
        // (the original $ch{$k} curly-brace offset syntax is invalid here)
        foreach ($res as $k => $row) {
            $ch = curl_init();
            $options[CURLOPT_URL] = $row['link'];
            curl_setopt_array($ch, $options);
            $handles[$k] = $ch;
        }
        $mh = curl_multi_init();
        foreach ($handles as $handle) {
            curl_multi_add_handle($mh, $handle);
        }
        // execute the handles; the status variable must be the same one tested
        // in the loop conditions ($cme and $status were undefined before, so
        // the loops exited before the transfers finished)
        $running_handles = null;
        do {
            $status_cme = curl_multi_exec($mh, $running_handles);
        } while ($status_cme == CURLM_CALL_MULTI_PERFORM);
        while ($running_handles && $status_cme == CURLM_OK) {
            // wait for activity on any handle, then let cURL process it
            if (curl_multi_select($mh) == -1) {
                usleep(100000); // select failed; back off briefly instead of busy-looping
            }
            do {
                $status_cme = curl_multi_exec($mh, $running_handles);
            } while ($status_cme == CURLM_CALL_MULTI_PERFORM);
        }
        // collect the results and write each body to disk
        foreach ($res as $k => $row) {
            $res[$k]['error'] = curl_error($handles[$k]);
            if (!empty($res[$k]['error'])) {
                echo $res[$k]['error'] . PHP_EOL;
                $res[$k]['data'] = '';
            } else {
                // CURLOPT_RETURNTRANSFER is set, so the body is available here
                file_put_contents(CRAWLER_FILES . $k . '.html', curl_multi_getcontent($handles[$k]));
            }
            // close the current handle
            curl_multi_remove_handle($mh, $handles[$k]);
            curl_close($handles[$k]);
        }
        curl_multi_close($mh);
        return $res; // return the responses
}
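
For comparison, here is a minimal, self-contained curl_multi sketch that writes each response to disk. The URL list and output directory are placeholder values for this example, not part of the class above:

<?php
// minimal curl_multi sketch; $urls and $outDir are placeholders
$urls = array(1 => 'http://example.com/', 2 => 'http://example.org/');
$outDir = '/tmp/crawler/'; // assumed to exist and be writable

$mh = curl_multi_init();
$handles = array();
foreach ($urls as $id => $url) {
    $ch = curl_init($url);
    curl_setopt_array($ch, array(
        // RETURNTRANSFER is what lets curl_multi_getcontent() return the body
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_TIMEOUT => 10,
    ));
    curl_multi_add_handle($mh, $ch);
    $handles[$id] = $ch;
}

// drive all transfers with a single status variable
$running = null;
do {
    $status = curl_multi_exec($mh, $running);
} while ($status == CURLM_CALL_MULTI_PERFORM);
while ($running && $status == CURLM_OK) {
    if (curl_multi_select($mh) == -1) {
        usleep(100000); // back off if select fails
    }
    do {
        $status = curl_multi_exec($mh, $running);
    } while ($status == CURLM_CALL_MULTI_PERFORM);
}

// save each successful response under its id
foreach ($handles as $id => $ch) {
    if (curl_error($ch) === '') {
        file_put_contents($outDir . $id . '.html', curl_multi_getcontent($ch));
    }
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
}
curl_multi_close($mh);

The key points are that the same status variable drives both exec loops and that CURLOPT_RETURNTRANSFER stays enabled; otherwise curl_multi_getcontent() returns nothing and the saved files come out empty, which matches the symptom described above.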