I am building a basic search engine using vector-space model and this is the crawler for returning 500 URLs and removes the SGML tags from the content. However, it is very slow (takes more than 30mins for retrieving the URLs only). How can I optimize the code? I have inserted wikipedia.org as an example starting URL.
use warnings;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtor;
my $starting_url = 'http://en.wikipedia.org/wiki/Main_Page';
my @urls = $starting_url;
my %alreadyvisited;
my $browser = LWP::UserAgent->new();
my $url_count = 0;
while (@urls)
my $url = shift @urls;
next if $alreadyvisited{$url}; ## check if already visited
my $request = HTTP::Request->new(GET => $url);
my $response = $browser->request($request);
if ($response->is_error())
print $response->status_line, "\n"; ## check for bad URL
my $contents = $response->content(); ## get contents from URL
push @c, $contents;
my @text = &RemoveSGMLtags(\@c);
#print "@text\n";
$alreadyvisited{$url} = 1; ## store URL in hash for future reference
print "$url\n";
if ($url_count == 500) ## exit if number of crawled pages exceed limit
exit 0;
my ($page_parser) = HTML::LinkExtor->new(undef, $url);
$page_parser->parse($contents)->eof; ## parse page contents
my @links = $page_parser->links;
foreach my $link (@links)
$test = $$link[2];
$test =~ s!^https?://(?:www\.)?!!i;
$test =~ s!/.*!!;
$test =~ s/[\?\#\:].*//;
if ($test eq "en.wikipedia.org") ## check if URL belongs to unt domain
next if ($$link[2] =~ m/^mailto/);
next if ($$link[2] =~ m/s?html?|xml|asp|pl|css|jpg|gif|pdf|png|jpeg/);
push @urls, $$link[2];
sleep 1;
sub RemoveSGMLtags
my ($input) = @_;
my @INPUTFILEcontent = @$input;
my $j;my @raw_text;
for ($j=0; $j<$#INPUTFILEcontent; $j++)
my $INPUTFILEvalue = $INPUTFILEcontent[$j];
use HTML::Parse;
use HTML::FormatText;
my $plain_text = HTML::FormatText->new->format(parse_html($INPUTFILEvalue));
push @raw_text, ($plain_text);
return @raw_text;