0

I'm using the following code to check for broken links in the given list of URLs, but the process is very slow. I need to speed it up.

// List of URLs to probe for dead links.
// NOTE: the original read `$$url_list` (a variable-variable on an undefined
// variable) — that is a typo and the array was never assigned to $url_list.
$url_list = array(
    "http://goog528le.com",
    "http://facebook.com",
    "http://google.com",
    "http://youtube.com",
    "http://yahoo.com",
    "http://amazon.com",
    "http://baidu.com",
    "http://wikipedia.org",
    "http://live.com",
    "http://qq.com",
    "http://taobao.com",
    "http://google.co.in",
    "http://twitter.com",
    "http://blogspot.com",
    "http://yahoo.co.jp",
    "http://linkedin.com",
    "http://bing.com",
    "http://sina.com.cn",
    "http://yandex.ru"
);

// 1. multi handle that coordinates all parallel transfers
$mh = curl_multi_init();
$max_connections = 10;      // size of the rolling window of concurrent requests
$dead_urls = array();       // URLs with no HTTP response at all
$not_found_urls = array();  // URLs that answered 404

// 2. prime the multi handle with the first window of URLs;
//    add_url_to_multi_handle() keeps its own cursor into $url_list
for ($i = 0; $i < $max_connections; $i++) {
    add_url_to_multi_handle($mh, $url_list);
}

// 3. initial execution: pump curl until it no longer asks to be called
//    again immediately
do {
    $mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);

// collects URLs that answered with a usable (non-404) status.
// NOTE: this was never initialized in the original, triggering notices.
$working_urls = array();

// 4. main loop while transfers remain active.
//    NOTE: in the original paste the `while (...) {` header was fused into
//    the comment on this line, so the whole loop was dead text.
while ($active && $mrc == CURLM_OK) {

    // 5. wait until at least one handle has activity instead of spinning
    if (curl_multi_select($mh) == -1) {
        // curl_multi_select() can legitimately return -1 on some libcurl
        // builds even when nothing is wrong; back off briefly rather than
        // skipping the work (skipping would busy-loop at 100% CPU)
        usleep(100);
    }

    // 6. do work
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    // 7. drain every completed transfer — several may be queued at once,
    //    so loop instead of reading a single message per iteration
    while ($mhinfo = curl_multi_info_read($mh)) {
        // 8. get the info on the finished curl handle
        $chinfo = curl_getinfo($mhinfo['handle']);

        if (!$chinfo['http_code']) {
            // 9. no HTTP status at all -> connection failed: dead link
            $dead_urls[] = $chinfo['url'];
        } else if ($chinfo['http_code'] == 404) {
            // 10. server reachable but resource missing
            $not_found_urls[] = $chinfo['url'];
        } else {
            // 11. any other status counts as working
            $working_urls[] = $chinfo['url'];
        }

        // 12. remove and free the finished handle
        curl_multi_remove_handle($mh, $mhinfo['handle']);
        curl_close($mhinfo['handle']);

        // 13. refill the window with the next queued URL, if any remain
        if (add_url_to_multi_handle($mh, $url_list)) {
            do {
                $mrc = curl_multi_exec($mh, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);
        }
    }
}

// 14. finished — release the multi handle.
//     NOTE: in the original paste this call was swallowed into the comment.
curl_multi_close($mh);

echo "==Dead URLs==\n";
echo implode("\n", $dead_urls) . "\n\n";

echo "==404 URLs==\n";
echo implode("\n", $not_found_urls) . "\n\n";

echo "==Working URLs==\n";
echo implode("\n", $working_urls);

// 15. adds a url to the multi handle
// 15. Adds the next queued URL to the multi handle.
//
// @param resource $mh       curl multi handle to attach the new easy handle to
// @param array    $url_list full list of URLs; a static cursor tracks progress
// @return bool  true if a URL was queued, false when the list is exhausted
function add_url_to_multi_handle($mh, $url_list)
{
    // position of the next URL to hand out; persists across calls so each
    // invocation queues exactly one new, not-yet-seen URL
    static $index = 0;

    // guard clause: nothing left (or an empty entry) -> stop refilling
    if (!isset($url_list[$index]) || !$url_list[$index]) {
        return false;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url_list[$index]);
    // prevent the response from being echoed to stdout
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // follow redirects so a 301/302 is not misclassified
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    // HEAD-style request: only the status code is needed, skip the body
    curl_setopt($ch, CURLOPT_NOBODY, 1);
    // hard limits so one slow or dead host cannot stall the whole run —
    // without these, libcurl waits indefinitely, which is the main reason
    // a broken-link scan feels "very slow"
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);

    curl_multi_add_handle($mh, $ch);
    // advance the cursor so the next call uses the next URL
    $index++;

    return true;
}
4

1 回答 1

3

解决方案是在完成后立即处理每个请求。这消除了因忙等待而浪费的 CPU 周期。创建一个 cURL 请求队列以实现最大吞吐量也是一个好主意。每次请求完成时,我都会从队列中添加一个新请求。通过动态添加和删除链接,我们始终保持恒定数量的链接下载。这为我们提供了一种限制我们发送的同时请求数量的方法。结果是一种更快、更有效的并行处理大量 cURL 请求的方法。

资料来源:onlineaspect.com

这是一个供参考的函数:

// Fetches many URLs in parallel with a fixed-size "rolling window": every
// time one request completes, the next queued URL is started, so exactly
// $rolling_window transfers are in flight at any moment.
//
// NOTE: the pasted original was HTML-entity mangled (&lt;, =&gt;) and would
// not parse; those are decoded back to real operators here.
//
// @param array         $urls           URLs to fetch
// @param callable      $callback       invoked with the body of each 200 response
// @param array|null    $custom_options extra CURLOPT_* options merged over defaults
// @return bool always true
function rolling_curl($urls, $callback, $custom_options = null) {

    // make sure the rolling window isn't greater than the # of urls
    $rolling_window = 5;
    $rolling_window = (sizeof($urls) < $rolling_window) ? sizeof($urls) : $rolling_window;

    $master = curl_multi_init();

    // default options; $custom_options wins for duplicate keys via `+`
    $std_options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 5,
    );
    $options = ($custom_options) ? ($std_options + $custom_options) : $std_options;

    // start the first batch of requests
    for ($i = 0; $i < $rolling_window; $i++) {
        $ch = curl_init();
        $options[CURLOPT_URL] = $urls[$i];
        curl_setopt_array($ch, $options);
        curl_multi_add_handle($master, $ch);
    }

    do {
        while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
        if ($execrun != CURLM_OK) {
            break;
        }
        // a request was just completed -- find out which one
        while ($done = curl_multi_info_read($master)) {
            $info = curl_getinfo($done['handle']);
            if ($info['http_code'] == 200) {
                // request successful: hand the body to the caller
                $output = curl_multi_getcontent($done['handle']);
                $callback($output);
            }
            // else: request failed -- add error handling here if needed

            // Start a replacement request BEFORE removing the old one so the
            // window never drains. Two fixes over the original:
            //  - bounds check: the original indexed $urls[$i++] unguarded and
            //    ran past the end of the list once all URLs were handed out;
            //  - this refill (and the removal below) now happens for failed
            //    requests too — the original only did it inside the 200
            //    branch, so every failure shrank the window and leaked a handle.
            if ($i < sizeof($urls)) {
                $ch = curl_init();
                $options[CURLOPT_URL] = $urls[$i++];
                curl_setopt_array($ch, $options);
                curl_multi_add_handle($master, $ch);
            }

            // remove and free the curl handle that just completed
            curl_multi_remove_handle($master, $done['handle']);
            curl_close($done['handle']);
        }
    } while ($running);

    curl_multi_close($master);
    return true;
}

希望这可以帮助!

于 2013-07-17T09:55:30.903 回答