-3

这是一个代理抓取器和测试器脚本。

我已经尝试了很多东西。我喜欢 Rolling Curl,但我似乎无法使用它。有什么方法可以加快速度,或者用 JavaScript 限制它?处理时间和资源太高了!

现在它只有在有一个或两个来源的情况下才能工作,否则它只会永远运行。在 PYTHON 和 Windows 应用程序中有大量这样的应用程序可以收集和检查数千个代理。我只是想知道 PHP 是否有可能这样做。

// Settings
error_reporting(E_ALL);
ini_set('max_execution_time', 0);
require_once ('classes/class.multicurl.php');

set_time_limit(0);
// Short Delay
$delay = rand(2, 4);
// Long Delay
$longdelay = rand(4, 7);
// Checking Proxies
$fileName = "leeched/proxies.txt";
// where to save successful proxies
$success = "goodproxies/success.txt";
$source = file('sources/sources.txt');
// SET Cookie
$c = new Curl;
foreach($source as $sources) {
  // Request To Delete Duplicate Proxies
  $c->addRequest(trim($sources));
}
$c->chunk(25);
$c->perform();
$proxies = array();
foreach($c->results as $url => $res) {
  // REGEX MATCH
  preg_match_all('@[0-9]{1,4}\.[0-9]{1,4}\.[0-9]{1,4}\.[0-9]{1,4}:[0-9]    {1,6}@', $res, $m);
  $eachproxy = stream_get_contents($res);
  $proxies[$url] = $m[0]; {
    while ($proxies == time() && $eachproxy > 4) { // go into "waiting" when       we going to fast
      usleep(100000); // wait .1 second and ask again
    }
    if ($proxies != time()) { // remember to reset this second and the cnt
      $proxies = time();
      $eachproxy = 0;
    }
  }
  foreach($proxies as $url => $parr) {
    $str = implode("\n", $parr);
    file_put_contents('leeched/proxies.txt', $str);
    $k = count($parr);
    $str2 = date('h:i:s d m') . " | \t" . $k . "\t" . $url . "\n";
    file_put_contents('logs/counts.txt', $str2, FILE_APPEND);
  }
  $uar = file('leeched/proxies.txt');
  $uar = array_unique($uar);
  $str = implode("\n", $uar) . "\n";
  $str = preg_replace('/^\h*\v+/m', '', $str);
  file_put_contents('leeched/proxies.txt', $str);
}
// Proxy Testing
if (!is_file($fileName)) die('Proxy file not available');
$proxies = file($fileName);
for ($p = 0; $p < count($proxies); $p++) {
  $ch = curl_init(); //initizlize and set url
  curl_setopt($ch, CURLOPT_URL, "http://www.yordomain.com/check.php");
  curl_setopt($ch, CURLOPT_HEADER, 1);
  curl_setopt($ch, CURLOPT_HTTPGET, 1);
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
  curl_setopt($ch, CURLOPT_HEADER, FALSE);
  curl_setopt($ch, CURLOPT_VERBOSE, TRUE);
  curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 7);
  curl_setopt($ch, CURLOPT_TIMEOUT, 5);
  curl_setopt($ch, CURLOPT_PROXY, trim($proxies[$p]));
  $data = curl_exec($ch);
  usleep(100000);
  if (strpos($data, 'Anonymous') !== false) {
    usleep(100000);
    echo "<img src=\"images/good.png\">&nbsp;&nbsp;<font  color=\"#7CFC00\"><strong>" . $proxies[$p] . " </font></strong><font color=\"#FFFFE0\"><strong>   THIS IS A WORKING ANONYMOUS PROXY SAVED TO /goodproxies/success.txt</font></strong><font color=\"yellow\"><strong> " . "Total time: " . curl_getinfo($ch, CURLINFO_TOTAL_TIME) . " seconds!</font></strong><img src=\"images/small.png\"> <br/><br/>";
    $f = fopen($success, "a");
    fwrite($f, $proxies[$p]);
    fclose($f);
  }
  elseif (curl_errno($ch)) {
    usleep(100000);
    echo "<img src=\"images/bad.png\">&nbsp;&nbsp;<font color=\"white\"><strong>" . $proxies[$p] . " </font></strong><font color=\"red\"><strong>ERROR:</font></strong><font color=\"#00FFFF\"><strong> " . curl_error($ch) . " </font></strong><img src=\"images/redx.png\"> <br/><br/>";
  }
  else {
    echo "<img src=\"images/warning.png\">&nbsp;&nbsp;<font color=\"#7CFC00\"><strong> " . $proxies[$p] . "   </font></strong><font color=\"white\"><strong> THERE WAS NO ERROR CONNECTING BUT THIS PROXY IS NOT ANONYMOUS!  NOT SAVED</font></strong> <font color=\"#FF69B4\"><strong>(No content from source)</font></strong><img src=\"images/redx.png\"> <br/><br/>";
  }
  flush();
  curl_close($ch);
}
$done = "done";
echo $done;
4

1 回答 1

0

如果你这样做会更快,curl_setopt($ch, CURLOPT_ENCODING, '');并且你的 libcurl 是用 gzip/deflate 支持编译的,并且目标网站至少支持其中的 1 个(几乎总是这样)

于 2017-03-11T11:39:32.633 回答