这是一个代理抓取器和测试器脚本。
我已经尝试了很多方法。我很喜欢 Rolling Curl,但似乎没法把它用起来。有没有办法加快脚本的速度,或者用 JavaScript 对它进行限流?目前的处理时间和资源占用都太高了!
现在它只有在只有一两个来源时才能正常工作,来源再多就会一直运行、永不结束。用 Python 编写的工具和 Windows 应用程序中,有大量可以收集并检查数千个代理的程序。我只是想知道用 PHP 是否也能做到这一点。
// Settings
// Report every notice/warning while the script is being developed.
error_reporting(E_ALL);
// Harvesting and testing can run for a long time, so remove the execution
// time limit (both the ini setting and the runtime timer are cleared).
ini_set('max_execution_time', 0);
require_once ('classes/class.multicurl.php');
set_time_limit(0);
// Short delay in seconds (random 2-4). Not referenced in the visible part of
// the script; kept in case code elsewhere relies on it.
$delay = rand(2, 4);
// Long delay in seconds (random 4-7). Same remark as above.
$longdelay = rand(4, 7);
// File the harvested proxies are written to and later tested from.
$fileName = "leeched/proxies.txt";
// Where to save successful (working, anonymous) proxies.
$success = "goodproxies/success.txt";
// One source URL per line. Line endings and empty lines are dropped here so
// that no empty request is queued.
$source = file('sources/sources.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($source === false) {
    // file() has already emitted a warning; continue with no sources so the
    // rest of the script can still test an existing proxy list.
    $source = array();
}
// Multi-request helper from classes/class.multicurl.php.
// NOTE(review): the exact semantics of addRequest()/chunk()/perform() live in
// that class; presumably chunk(25) caps the number of parallel requests and
// perform() runs them and fills $c->results - confirm against the class.
$c = new Curl;
foreach ($source as $sources) {
    $sourceUrl = trim($sources);
    if ($sourceUrl === '') {
        // Whitespace-only line: nothing to fetch.
        continue;
    }
    // Queue the source page for download.
    $c->addRequest($sourceUrl);
}
$c->chunk(25);
$c->perform();
// Proxy harvesting
//
// Scans every downloaded source page for "ip:port" strings, logs how many
// each source yielded, and writes the de-duplicated list (one proxy per
// line) to the proxy file exactly once.

if (!function_exists('extract_proxies')) {
    /**
     * Extract every "a.b.c.d:port" occurrence from a block of text.
     *
     * Octets are 1-3 digits and the port is 1-5 digits; numeric ranges are
     * not validated. Matches may repeat if the text repeats them.
     *
     * @param mixed $text Page body; anything that is not a non-empty string
     *                    (e.g. false from a failed download) yields no matches.
     * @return array List of matched "ip:port" strings, in order of appearance.
     */
    function extract_proxies($text)
    {
        if (!is_string($text) || $text === '') {
            return array();
        }
        // The look-arounds stop the pattern from matching inside a longer
        // run of digits/dots (e.g. a 6-digit "port").
        $pattern = '@(?<![0-9.])(?:[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]{1,5}(?![0-9])@';
        if (!preg_match_all($pattern, $text, $m)) {
            return array();
        }
        return $m[0];
    }
}

// Results of the multi-request run, keyed by source URL.
// NOTE(review): assumes $c->results maps URL => response body string - confirm
// against classes/class.multicurl.php.
$results = (isset($c) && isset($c->results)) ? $c->results : array();
if (!is_array($results) && !($results instanceof Traversable)) {
    $results = array();
}

// Same file the testing stage reads; fall back to the literal path if the
// setting is not defined.
$proxyFile = isset($fileName) ? $fileName : 'leeched/proxies.txt';

$proxies = array();   // source URL => list of proxies found on that page
$harvested = array(); // "ip:port" => true, used as a set for de-duplication
$logLines = '';
$processed = 0;
foreach ($results as $url => $res) {
    $found = extract_proxies($res);
    $proxies[$url] = $found;
    foreach ($found as $proxy) {
        $harvested[$proxy] = true;
    }
    // One log line per source: time | count | url
    $logLines .= date('h:i:s d m') . " | \t" . count($found) . "\t" . $url . "\n";
    $processed++;
}

if ($processed > 0) {
    file_put_contents('logs/counts.txt', $logLines, FILE_APPEND);
    // Write the whole list in one go. When no source produced a result the
    // existing proxy file is left untouched.
    $unique = array_keys($harvested);
    $str = $unique ? implode("\n", $unique) . "\n" : '';
    file_put_contents($proxyFile, $str);
}
// Proxy Testing
//
// Requests the check page through every proxy in $fileName, prints a
// coloured HTML status line per proxy and appends proxies whose response
// contains the word "Anonymous" to $success.
if (!is_file($fileName)) die('Proxy file not available');
$proxies = file($fileName, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($proxies === false) die('Proxy file not available');
// Strip stray whitespace, drop blank entries and avoid testing a proxy twice.
$proxies = array_values(array_unique(array_filter(array_map('trim', $proxies), 'strlen')));

// Options shared by every check request.
$checkOptions = array(
    CURLOPT_URL            => "http://www.yordomain.com/check.php",
    CURLOPT_HTTPGET        => 1,
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_FOLLOWLOCATION => 1,
    // Body only; the check looks for a marker word in the page content.
    CURLOPT_HEADER         => FALSE,
    // CURLOPT_TIMEOUT covers the whole transfer including the connect phase,
    // so a connect timeout above it can never take effect. Both are 5 seconds,
    // which keeps the effective per-proxy cap unchanged.
    CURLOPT_CONNECTTIMEOUT => 5,
    CURLOPT_TIMEOUT        => 5,
);

foreach ($proxies as $proxy) {
    $ch = curl_init(); // fresh handle per proxy
    curl_setopt_array($ch, $checkOptions);
    curl_setopt($ch, CURLOPT_PROXY, $proxy);
    $data = curl_exec($ch); // response body, or false on failure
    // Brief pause between requests so the check page is not hammered.
    usleep(100000);
    // Proxy text comes from scraped pages; escape it before echoing as HTML.
    $shown = htmlspecialchars($proxy);
    if (is_string($data) && strpos($data, 'Anonymous') !== false) {
        echo "<img src=\"images/good.png\"> <font color=\"#7CFC00\"><strong>" . $shown . " </font></strong><font color=\"#FFFFE0\"><strong> THIS IS A WORKING ANONYMOUS PROXY SAVED TO /goodproxies/success.txt</font></strong><font color=\"yellow\"><strong> " . "Total time: " . curl_getinfo($ch, CURLINFO_TOTAL_TIME) . " seconds!</font></strong><img src=\"images/small.png\"> <br/><br/>";
        // Always terminate the entry with a newline so appended proxies never
        // run together; the lock guards against concurrent runs.
        file_put_contents($success, $proxy . "\n", FILE_APPEND | LOCK_EX);
    }
    elseif (curl_errno($ch)) {
        echo "<img src=\"images/bad.png\"> <font color=\"white\"><strong>" . $shown . " </font></strong><font color=\"red\"><strong>ERROR:</font></strong><font color=\"#00FFFF\"><strong> " . htmlspecialchars(curl_error($ch)) . " </font></strong><img src=\"images/redx.png\"> <br/><br/>";
    }
    else {
        // Transfer succeeded but the marker word was not in the response.
        echo "<img src=\"images/warning.png\"> <font color=\"#7CFC00\"><strong> " . $shown . " </font></strong><font color=\"white\"><strong> THERE WAS NO ERROR CONNECTING BUT THIS PROXY IS NOT ANONYMOUS! NOT SAVED</font></strong> <font color=\"#FF69B4\"><strong>(No content from source)</font></strong><img src=\"images/redx.png\"> <br/><br/>";
    }
    // Push the status line to the browser as soon as it is produced.
    flush();
    curl_close($ch);
}
$done = "done";
echo $done;