I want to write a web crawler that starts from a seed URL, crawls 100 HTML pages it discovers that belong to the same domain as the seed URL, and records the URLs it traverses while avoiding duplicates. I have written the following, but the $url_count value does not seem to increase, and the retrieved URLs even include links from other domains. How can I fix this? Here I have used stackoverflow.com as my starting URL.
use strict;
use warnings;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
##open file to store links
open my $file1, ">>", "extracted_links.txt" or die "Cannot open extracted_links.txt: $!";
select($file1);
##starting URL
my @urls = ('http://stackoverflow.com/');
my $browser = LWP::UserAgent->new(agent => 'IE 6');
$browser->timeout(10);
my %visited;
my $url_count = 0;
while (@urls)
{
    my $url = shift @urls;
    if (exists $visited{$url}) ##check if URL already exists
    {
        next;
    }
    else
    {
        $url_count++;
    }
    my $request = HTTP::Request->new(GET => $url);
    my $response = $browser->request($request);
    if ($response->is_error())
    {
        printf "%s\n", $response->status_line;
    }
    else
    {
        my $contents = $response->content();
        $visited{$url} = 1;
        my @lines = split(/\n/, $contents);
        foreach my $line (@lines)
        {
            $line =~ m@(((http\:\/\/)|(www\.))([a-z]|[A-Z]|[0-9]|[/.]|[~]|[-_]|[()])*[^'">])@g;
            print "$1\n";
            push @urls, $1;
        }
        sleep 60;
        if ($visited{$url} == 100)
        {
            last;
        }
    }
}
close $file1;
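
For reference, here is a rough, untested sketch of the behaviour I am trying to get (same-domain filtering plus a 100-page cap), using the URI module to compare each link's host against the seed's host. The regex and variable names here are only placeholders for illustration, not code I am committed to:

use strict;
use warnings;
use LWP::UserAgent;
use URI;

my $seed      = 'http://stackoverflow.com/';
my $seed_host = URI->new($seed)->host;

my $browser = LWP::UserAgent->new(agent => 'IE 6', timeout => 10);
my @queue   = ($seed);
my %visited;
my $page_count = 0;

while (@queue and $page_count < 100)
{
    my $url = shift @queue;
    next if $visited{$url}++;              ##skip URLs already seen

    my $response = $browser->get($url);
    next unless $response->is_success;
    $page_count++;

    ##pull absolute http:// links out of the page (crude placeholder regex)
    my $content = $response->decoded_content;
    while ($content =~ m{(http://[^'"\s>]+)}g)
    {
        my $link = $1;
        ##only queue links whose host matches the seed's host
        push @queue, $link if URI->new($link)->host eq $seed_host;
    }
}
print "Fetched $page_count pages from $seed_host\n";

In particular, I am not sure whether comparing ->host like this is the right way to keep the crawl on the same domain, which is part of what I am asking.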