0

我使用这个类进行多线程 cURL 文件下载;

在本地机器上运行下载时,保存下来的文件内容是空的;

我错过了什么?

<?php

/**
 * Crawler class file.
 *
 * CRAWLER
 *
 */
class CrawlerCommand extends CConsoleCommand {

    // Id of the InstanceCrawler row for the current run.
    // NOTE(review): never written in the visible code — presumably reserved
    // for future bookkeeping; confirm before removing.
    private $instance_crawler_id;
    // Maximum number of URLs one crawler instance locks per run.
    private $instance_crawler_url_limit = 10;
    // Map of url-id => array('link' => url), consumed by multiCurl().
    public $multi_exec_curl_files = array();
    // URL rows grouped per website, consumed by processCollection().
    public $collection = array();
    // Common browser user-agent strings available for spoofing.
    public static $userAgents = array(
        'FireFox3' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pl; rv:1.9) Gecko/2008052906 Firefox/3.0',
        'GoogleBot' => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'IE7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Netscape' => 'Mozilla/4.8 [en] (Windows NT 6.0; U)',
        'Opera' => 'Opera/9.25 (Windows NT 6.0; U; en)'
    );
    // Default cURL options applied to every download handle.
    public static $options = array(
        CURLOPT_USERAGENT => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        CURLOPT_AUTOREFERER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FRESH_CONNECT => true,
        CURLOPT_COOKIEJAR => "cookies.txt",
        CURLOPT_COOKIEFILE => "cookies.txt",
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_TIMEOUT => 10,
            //CURLOPT_COOKIESESSION => false,
    );

    /**
     * Executes the command.
     *
     * Locks up to $instance_crawler_url_limit active URLs that have not yet
     * been crawled today, groups them per website, and downloads their HTML
     * sources in parallel into CRAWLER_FILES/<url-id>.html.
     *
     * NOTE(review): queries interpolate values directly into SQL. The values
     * here are internally generated (date(), auto-increment ids), but bound
     * parameters via createCommand()->bindValue() would be safer.
     *
     * @param array $args command line parameters for this command (unused).
     */
    public function run($args) {
        // Push any previous console output out of view.
        echo str_repeat(PHP_EOL, 50);

        // If all active URLs were already crawled today, there is no point
        // creating another crawler instance.
        $day = date('d');
        $sql = "select * from `url` where `instance_crawler_day`!='$day' and `status`='1' order by `id` asc limit $this->instance_crawler_url_limit;";
        $rows = Yii::app()->db->createCommand($sql)->queryAll();

        if (count($rows) > 0) {

            // Create a new, unique crawler instance for today.
            $model_instance_crawler = new InstanceCrawler();
            $model_instance_crawler->day = $day;
            $model_instance_crawler->month = date('m');
            $model_instance_crawler->year = date('Y');
            if ($model_instance_crawler->save()) {

                // Lock a batch of URLs so concurrent runs do not pick them up.
                $sql = "update `url` set `instance_crawler_id`='$model_instance_crawler->id', `instance_crawler_day`='$model_instance_crawler->day' where `instance_crawler_day`!='$model_instance_crawler->day' and `status`='1' limit $this->instance_crawler_url_limit;";
                Yii::app()->db->createCommand($sql)->query();

                // Load robots.txt rules. The object is not referenced again
                // here — presumably load_robots_from_db() has side effects;
                // TODO confirm, otherwise this can be dropped.
                $robots_txt = new RobotsTXT();
                $robots_txt->load_robots_from_db();

                // Fetch the rows that now belong to the new crawler instance.
                $sql = "select * from `url` where `instance_crawler_id`='$model_instance_crawler->id' and `instance_crawler_day`='$day' and `status`='1' order by `id` asc;";
                $rows = Yii::app()->db->createCommand($sql)->queryAll();

                if (count($rows) > 0) {

                    // Group URLs by website so downloads can be interleaved
                    // across hosts rather than hammering one site at a time.
                    foreach ($rows as $row) {
                        $this->collection[$row['website_id']]['items'][] = $row;
                    }

                    // Re-index the groups sequentially and give each group a
                    // 'urls' cursor pointing at the next item to schedule.
                    $grouped = array();
                    foreach ($this->collection as $group) {
                        $group['urls'] = 0;
                        $grouped[] = $group;
                    }
                    $this->collection = $grouped;

                    $this->processCollection();

                    //on dew, comment
                    $this->multiCurl($this->multi_exec_curl_files);
                    //CurlTool::downloadFile($url, $fileName, $fields = null, $verbose = false);
                }
            }
        } else {
            echo 'There are no files left to download today. Come back tomorrow.' . PHP_EOL;
        }

        echo PHP_EOL . 'DONE' . PHP_EOL;
    }

    /**
     * Provides the command description.
     * This method may be overridden to return the actual command description.
     * @return string the command description. Defaults to 'Usage: php entry-script.php command-name'.
     */
    public function getHelp() {
        return 'Usage: how to use this command';
    }

    /**
     * Reports whether any website group still has URLs left to schedule.
     * @return bool true when at least one group's cursor has not reached the
     *              end of its 'items' list.
     */
    private function checkCounters() {
        foreach ($this->collection as $value) {
            if ($value['urls'] < count($value['items'])) {
                return true; // early exit: an unfinished group exists
            }
        }
        return false;
    }

    /**
     * Walks the website groups round-robin, queueing one URL per group per
     * pass into $multi_exec_curl_files until every group is exhausted.
     * (The original kept a dead wrap-around counter $w that was never read;
     * it has been removed.)
     */
    public function processCollection() {
        while ($this->checkCounters()) {
            foreach ($this->collection as $key => $value) {
                if ($value['urls'] < count($value['items'])) {
                    $item = $value['items'][$value['urls']];
                    echo 'downloading file: ' . $item['id'] . '.html' . PHP_EOL;
                    // Queue this URL for the parallel cURL download step.
                    $this->multi_exec_curl_files[$item['id']] = array('link' => $item['link']);
                    $this->collection[$key]['urls']++;
                }
            }
        }
    }

    /**
     * Downloads every queued URL in parallel with curl_multi and writes each
     * response body to CRAWLER_FILES/<id>.html.
     *
     * Bug fixes versus the original:
     *  - the pump loops compared the undefined variables $cme and $status
     *    instead of $status_cme, so curl_multi_exec() was never driven to
     *    completion and the saved files ended up empty;
     *  - $ch{$k} curly-brace offset access (removed in PHP 8.0) replaced
     *    with $ch[$k] bracket syntax;
     *  - easy handles are now closed after being removed from the multi
     *    handle, and a short usleep() avoids busy-spinning when
     *    curl_multi_select() returns -1.
     *
     * @param array        $res     map of id => array('link' => url)
     * @param array|string $options optional cURL option overrides; falls back
     *                              to self::$options when empty.
     * @return array|false the input map with 'error' (and 'data' on failure)
     *                     annotations, or false when $res is empty.
     */
    public function multiCurl($res, $options = "") {

        if (count($res) <= 0)
            return false;

        if (!$options) // fall back to the shared default options
            $options = self::$options;

        // Create one configured easy handle per URL.
        $handles = array();
        foreach ($res as $k => $row) {
            $handles[$k] = curl_init();
            $options[CURLOPT_URL] = $row['link'];
            curl_setopt_array($handles[$k], $options);
        }

        $mh = curl_multi_init();

        foreach ($handles as $handle) {
            curl_multi_add_handle($mh, $handle);
        }

        // Start the transfers.
        $running_handles = null;
        do {
            $status_cme = curl_multi_exec($mh, $running_handles);
        } while ($status_cme == CURLM_CALL_MULTI_PERFORM);

        // Pump the multi handle until every transfer has finished.
        while ($running_handles && $status_cme == CURLM_OK) {
            if (curl_multi_select($mh) != -1) {
                do {
                    $status_cme = curl_multi_exec($mh, $running_handles);
                } while ($status_cme == CURLM_CALL_MULTI_PERFORM);
            } else {
                // select() can return -1 on some platforms; avoid a hot spin.
                usleep(100);
            }
        }

        // Collect results, persist successful downloads, and clean up.
        foreach ($res as $k => $row) {
            $res[$k]['error'] = curl_error($handles[$k]);
            print_r($res[$k]['error']);
            if (!empty($res[$k]['error'])) {
                $res[$k]['data'] = '';
            } else {
                file_put_contents(CRAWLER_FILES . $k . '.html', curl_multi_getcontent($handles[$k]));
            }

            curl_multi_remove_handle($mh, $handles[$k]);
            curl_close($handles[$k]);
        }
        curl_multi_close($mh);
        return $res; // return response
    }

}
4

1 回答 1

0

使用http://www.somacon.com/p537.php找到了答案

我弄错了一些变量名,通过比较两个代码,我找到了它们

<?php
// LICENSE: PUBLIC DOMAIN
// The author disclaims copyright to this source code.
// AUTHOR: Shailesh N. Humbad
// SOURCE: http://www.somacon.com/p539.php
// DATE: 6/4/2008

// index.php
// Fire off the parallel GET requests and report the total wall-clock time.
$startedAt = microtime(true);

// The URLs to fetch concurrently.
$urls = array(
  "http://localhost/r.php?echo=request1",
  "http://localhost/r.php?echo=request2",
  "http://localhost/r.php?echo=request3"
);

$fetcher = new ParallelGet($urls);

print "<br />total time: ".round(microtime(true) - $startedAt, 4)." seconds";

// Runs several HTTP GET requests in parallel via curl_multi and prints the
// collected response bodies.
class ParallelGet
{
  function __construct($urls)
  {
    $multiHandle = curl_multi_init();
    $easyHandles = array();

    // One easy handle per URL, all attached to the multi handle.
    foreach ($urls as $idx => $url)
    {
      $easyHandles[$idx] = curl_init($url);
      curl_setopt($easyHandles[$idx], CURLOPT_RETURNTRANSFER, 1);
      curl_multi_add_handle($multiHandle, $easyHandles[$idx]);
    }

    // Kick off the transfers.
    do {
        $multiStatus = curl_multi_exec($multiHandle, $stillRunning);
    } while ($multiStatus == CURLM_CALL_MULTI_PERFORM);

    // Pump the multi handle until every transfer completes.
    while ($stillRunning && $multiStatus == CURLM_OK) {
      // Block until there is activity on at least one handle (or a timeout).
      if (curl_multi_select($multiHandle) != -1) {
        // Pull in newly arrived data, or at least service timeouts.
        do {
          $multiStatus = curl_multi_exec($multiHandle, $stillRunning);
        } while ($multiStatus == CURLM_CALL_MULTI_PERFORM);
      }
    }

    // Surface any multi-level read error.
    if ($multiStatus != CURLM_OK) {
      trigger_error("Curl multi read error " . $multiStatus . "\n", E_USER_WARNING);
    }

    // Harvest the results, then detach and close every handle.
    foreach ($urls as $idx => $url)
    {
      $err = curl_error($easyHandles[$idx]);
      if ($err == "") {
        $res[$idx] = curl_multi_getcontent($easyHandles[$idx]);
      } else {
        print "Curl error on handle " . $idx . ": " . $err . "\n";
      }
      curl_multi_remove_handle($multiHandle, $easyHandles[$idx]);
      curl_close($easyHandles[$idx]);
    }
    // Release the curl_multi handle itself.
    curl_multi_close($multiHandle);

    // Dump the response data.
    print_r($res);
  }

}
?>

<?php
// r.php
// Helper endpoint for the parallel-GET demo: runs for a variable amount of
// time, emits a variable amount of blank output, then echoes back the
// "echo" request parameter together with the elapsed time.

$startedAt = microtime(true);

// Emit between 500 and 1000 lines of blank space, pausing briefly each time.
$lineCount = rand(500, 1000);
for ($line = 0; $line < $lineCount; $line++) {
  print "         \n";
  usleep(10);
}

// Report the value of the "echo" parameter (if any) and the time taken.
print isset($_REQUEST["echo"]) ? $_REQUEST["echo"] : "";
print " in ";
print round(microtime(true) - $startedAt, 4)." seconds";
exit();
?>
于 2012-11-16T12:59:19.800 回答