2

我正在取得进展,但我遇到了一个新问题。

这是新代码:

#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use HTML::TreeBuilder;

# Fetch the complete O'Reilly catalogue page and parse it into an element tree.
my $url = 'http://oreilly.com/store/complete.html';
my $page = get( $url ) or die "Could not fetch $url";    # get() only returns undef on failure; $! is not meaningful here
my $p = HTML::TreeBuilder->new_from_content( $page );

# Catalogue links are the literal prefix (quoted with \Q...\E) followed by a word-character id.
my @links = $p->look_down(
    _tag => 'a',
    href => qr{^ \Qhttp://www.oreilly.com/catalog/\E \w+ $}x
);
# Assumes each link sits in a cell of a table row, so its grandparent is the row element.
my @rows = map { $_->parent->parent } @links;

my @books;    # one hash reference per catalogue row, filled by the loop below
for my $row (@rows) {    # each $row is the grandparent element of a catalogue link
    my %book;
    my @cells = $row->look_down( _tag => 'td' );    # the <td> elements found under this row
    $book{title}    =$cells[0]->as_trimmed-text;    # script line 23, named in the compile error
    $book{price}    =$cells[2]->as_trimmed-text;    # script line 24, named in the compile error
    $book{price} =~ s/^\$//;    # drop a leading dollar sign so the price can be compared numerically

    # Cell 0 supplies the book's own link; cells 3-5 presumably hold the ebook,
    # Safari and example-code links (inferred from the hash keys) -- confirm
    # against the page layout.
    $book{url}      = get_url( $cells[0] );
    $book{ebook}    = get_url( $cells[3] );
    $book{safari}   = get_url( $cells[4] );
    $book{examples} = get_url( $cells[5] );
    push @books, \%book;    # %book is a fresh lexical on every iteration, so each reference is distinct
}

# Return the href of the first <a> element found at or below $node, with
# trailing whitespace removed.
#
# $node - an element object providing look_down(); may be undef when the row
#         has fewer cells than the caller indexes.
#
# Returns nothing (undef in scalar context) when $node is undefined, when no
# <a> element is found, or when the first <a> has no href attribute.
sub get_url {
    my $node = shift;
    return unless defined $node;

    # List assignment keeps only the first matching element.
    my ($link) = $node->look_down( _tag => 'a' );
    return unless $link;

    my $url = $link->attr('href');
    return unless defined $url;    # e.g. a named anchor without an href

    $url =~ s/\s+$//;
    return $url;
}

$p = $p->delete;    # the catalogue tree is no longer needed; free it explicitly

# Report 1: every book whose title mentions Perl, cheapest first, numbered from 1.
{
    my $count = 1;
    my @perlbooks = sort { $a->{price} <=> $b->{price} }
                    grep { $_->{title} =~ /perl/i } @books;
    # End each record with a newline so the rows do not run together.
    print $count++, "\t", $_->{price}, "\t", $_->{title}, "\n" for @perlbooks;
}

# Report 2: how many titles mention Perl versus Java, and the difference.
{
    my @perlbooks = grep { $_->{title} =~ /perl/i } @books;
    my @javabooks = grep { $_->{title} =~ /java/i } @books;
    my $diff = @javabooks - @perlbooks;    # arrays in numeric context give their sizes
    print "There are " . @perlbooks . " Perl books and " . @javabooks
        . " Java books. $diff more Java than Perl.\n";
}

# Fetch the detail page of one sample book (index 34 of @books), print the
# publication-info span together with the page count, edition and publication
# date extracted from it, then download the cover image.
# The one-element loop gives the block its own scope and lets "next" bail out.
for my $book ( $books[34] ) {
    # Fewer than 35 books were parsed, or the row had no link.
    next unless $book && defined $book->{url};

    my $url  = $book->{url};
    my $page = get( $url );
    unless ( defined $page ) {
        warn "Could not fetch $url\n";
        next;
    }

    my $tree = HTML::TreeBuilder->new_from_content( $page );
    my ($pubinfo) = $tree->look_down(
        _tag  => 'span',
        class => 'secondary2',
    );

    # A page without the span yields empty output instead of a fatal
    # method call on undef.
    my $html = $pubinfo ? $pubinfo->as_HTML : '';
    print $html;

    my ($pages)   = $html =~ /(\d+) pages/;
    my ($edition) = $html =~ /(\d+)(?:st|nd|rd|th) Edition/;    # \d+ so "10th" is not cut to "0"
    my ($date)    = $html =~ /(\w+ (?:19|20)\d\d)/;

    # Any field the page does not provide is printed as an empty string.
    print "\n", join( ' ', map { defined $_ ? $_ : '' } $pages, $edition, $date ), "\n";

    my ($img_node) = $tree->look_down(
        _tag => 'img',
        src  => qr{^/catalog/covers/},
    );
    my $cover;
    if ($img_node) {
        my $img_url = 'http://www.oreilly.com' . $img_node->attr('src');
        $cover = get( $img_url );
    }
    # now save $cover to disk

    $tree->delete;    # free the detail tree, as is done for the catalogue tree
}

现在我收到以下错误:

Bareword "text" not allowed while "strict subs" in use at ./SpiderTutorial_19_06.pl line 23.
Bareword "text" not allowed while "strict subs" in use at ./SpiderTutorial_19_06.pl line 24.
Execution of ./SpiderTutorial_19_06.pl aborted due to compilation errors.

任何帮助将不胜感激。

4

2 回答 2

4

我不知道原始程序,但很可能as_trimmed-text应该是as_trimmed_text.

于 2010-06-25T23:40:16.700 回答
3

问题出在方法名称 as_trimmed-text 上。Perl 的标识符中不允许使用连字符,你的意思可能是 as_trimmed_text。现在它被解析为 $cells[0]->as_trimmed() - text()。

于 2010-06-25T23:40:32.953 回答