0

I asked this question a while ago on Ask Ubuntu and was directed to provide some code here, as it's probably an optimization problem. I've included the whole script.

The general goal is to go through roughly 7000 HTML files in a directory and parse specific information from them and export it to a text file as one line.

#!/usr/bin/perl

use Switch;
use strict;

use HTML::Query 'Query';

my $dir = '/home/mark/Documents/Perl/garchivesfiles/completeresults';

opendir my $dh, $dir or die "Can't open $dir: $!";
my @files = map {"$dir/$_"} grep { $_ !~ /^\./ } readdir $dh;
closedir $dh;

my $total;

my %xlateNum2Text =  qw (0   January
                         1   Febuary
                         2   March
                         3   April
                         4   May
                         5   June
                         6   July
                         7   August
                         8   September
                         9   October
                         10  November
                         11  December                  
                       );


my $inc = 0;
foreach my $file (@files) {
    open FILE, $file;
    my $html = do { local $/; <FILE> };
    my $q = Query(text => $html);

    my @homescore = $q->query("span.homeScore");
    my @awayscore = $q->query("span.awayScore");
    my $singlehomescore = $homescore[0]->as_text();
    my $singleawayscore = $homescore[0]->as_text();

    my @hometeam = $q->query("table.teaminfo td.home span");
    my @awayteam = $q->query("table.teaminfo td.away span");
    my $singlehometeam = rightTeamName($hometeam[0]->as_text());
    my $singleawayteam = rightTeamName($awayteam[0]->as_text());


    my @homegoalstotal;
    my @awaygoalstotal;
    my @datearray;
    my @fixtureinfo;

    my @newhomegoals;
    my @newawaygoals;

    my @allinfogoals;

    if($singlehomescore ne "0" || $singleawayscore ne "0") {
        @homegoalstotal = $q->query("div.home ul li");
        @awaygoalstotal = $q->query("div.away ul li");
        my $i = 0;

        @datearray = $q->query("p.fixtureinfo span");
        my $finaldate = $datearray[0]->as_text();
        my @datecomponents = split(" ", $finaldate);
        my $mysqlyyyy = $datecomponents[3];
        my $mysqlmm = monthConvert($datecomponents[2]); 
        my $mysqldd = $datecomponents[1];

        my $mysqldate;

        if(length($mysqlmm) == 1) {
            $mysqlmm = "0".$mysqlmm;
        }

        if(length($mysqldd) == 1) {
            $mysqldd = "0".$mysqldd;
        }

        $mysqldate = $mysqlyyyy."-".$mysqlmm."-".$mysqldd;


        @fixtureinfo = $q->query("p.fixtureinfo");
        my $fixtureinfoinit = $fixtureinfo[0]->as_text();
        my @fixtureinfobrokenup = split(/ \| /, $fixtureinfoinit);
        my $fixtureinfostring = $fixtureinfobrokenup[1];

        foreach my $goal (@homegoalstotal) {
            my $tempmodifier = $goal->as_text();
            $tempmodifier =~ s/\)//g;
            my @tempcomponents = split(' \(', $tempmodifier);
            my $substitutetemp;
            my @extratimesplit;
            my $compositetime;

            if(index($tempcomponents[1], ",") != -1) {
                my @goaltimes = split('\,', $tempcomponents[1]);
                foreach my $individmultgoal (@goaltimes) {
                    $individmultgoal =~ s/Pen//g;
                    $individmultgoal =~ s/ //g;
                    if(index($individmultgoal, "OG") == -1) {
                        if(index($individmultgoal, "+") != -1) {
                            @extratimesplit = split('\+', $individmultgoal);
                            $compositetime = $extratimesplit[0];
                            push (@{$allinfogoals[$i]}, ($tempcomponents[0], $compositetime, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], ,$datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit[1]));
                            $i++;
                        } else {
                            push (@{$allinfogoals[$i]}, ($tempcomponents[0], $individmultgoal, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], ,$datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                            $i++;
                        }   
                    }
                }
            } else {
                $substitutetemp = $tempcomponents[1];
                $substitutetemp =~ s/Pen//g;
                $substitutetemp =~ s/ //g;
                if(index($substitutetemp, "OG") == -1) {
                    if(index($substitutetemp, "+") != -1) {
                        @extratimesplit = split('\+', $substitutetemp);
                        $compositetime = $extratimesplit[0];
                        push (@{$allinfogoals[$i]}, ($tempcomponents[0], $compositetime, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit[1]));
                        $i++;
                    } else {
                        push (@{$allinfogoals[$i]}, ($tempcomponents[0], $substitutetemp, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                        $i++;
                    }
                }
            }
        }

        foreach my $goal (@awaygoalstotal) {
            my $tempmodifier2 = $goal->as_text();
            $tempmodifier2 =~ s/\)//g;
            my @tempcomponents2 = split(' \(', $tempmodifier2);
            my $substitutetemp2;
            my @extratimesplit2;
            my $compositetime2;

            if(index($tempcomponents2[1], ",") != -1) {
                my @goaltimes2 = split('\,', $tempcomponents2[1]);
                foreach my $individmultgoal2 (@goaltimes2) {
                    $individmultgoal2 =~ s/Pen//g;
                    $individmultgoal2 =~ s/ //g;
                    if(index($individmultgoal2, "OG") == -1) {
                        if(index($individmultgoal2, "+") != -1) {
                            @extratimesplit2 = split('\+', $individmultgoal2);
                            $compositetime2 = $extratimesplit2[0];
                            push (@{$allinfogoals[$i]}, ($tempcomponents2[0], $compositetime2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit2[1]));
                            $i++;
                        } else {
                            push (@{$allinfogoals[$i]}, ($tempcomponents2[0], $individmultgoal2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                            $i++;
                        }
                    }
                }
            } else {
                $substitutetemp2 = $tempcomponents2[1];
                $substitutetemp2 =~ s/Pen//g;
                $substitutetemp2 =~ s/ //g;
                if(index($substitutetemp2, "OG") == -1) {
                    if(index($substitutetemp2, "+") != -1) {
                        @extratimesplit2 = split('\+', $substitutetemp2);
                        $compositetime2 = $extratimesplit2[0];
                        push(@{$allinfogoals[$i]}, ($tempcomponents2[0], $compositetime2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit2[1]));
                        $i++;
                    } else {
                        push(@{$allinfogoals[$i]}, ($tempcomponents2[0], $substitutetemp2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                        $i++;
                    }
                }
            }
        }


        @allinfogoals = sort { $a->[1] <=> $b->[1] || $a->[12] <=> $b->[12] } @allinfogoals;

        open(GOALCSV, '>>goalcsv.txt');

        my $homegoalcount = 0;
        my $awaygoalcount = 0;

        foreach my $row(@allinfogoals){
           foreach my $val(@$row){
                if($val eq "for:".$singlehometeam) {
                    $homegoalcount++;
                    print GOALCSV "$val,".$homegoalcount.",".$awaygoalcount.",true,";
                    print "$val,".$homegoalcount.",".$awaygoalcount.",true,";
                } elsif($val eq "for:".$singleawayteam) {
                    $awaygoalcount++;
                    print GOALCSV "$val,".$awaygoalcount.",".$homegoalcount.",false,";
                    print "$val,".$awaygoalcount.",".$homegoalcount.",false,";
                } else {
                    print GOALCSV "$val,";
                    print "$val,";
                }
           }
           print GOALCSV "\n";
           print "\n";
        }
    }

}

sub rightTeamName{
    my $teamname = $_[0];

    switch($teamname) {
        case "Nott'm Forest" { return "Nottingham Forest" }
        case "QPR"  { return "Queens Park Rangers" }
        case "Southampton" { return "Southampton FC" }
        case "Norwich" { return "Norwich City" }
        case "Tottenham" { return "Tottenham Hotspur" }
        case "Leeds" { return "Leeds United" }
        case "Middlesbrough" { return "Middlesbrough FC" }
        case "Chelsea" { return "Chelsea FC" }
        case "Arsenal" { return "Arsenal FC" }
        case "Oldham" { return "Oldham Athletic" }
        case "Ipswich" { return "Ipswich Town" }
        case "Man Utd" { return "Manchester United" }
        case "Man City" { return "Manchester City" }
        case "Sheffield Wed" { return "Sheffield Wednesday" }
        case "Man City" { return "Manchester City" }
        case "Blackburn" { return "Blackburn Rovers" }
        case "Wimbledon" { return "AFC Wimbledon" }
        case "Liverpool" { return "Liverpool FC" }
        case "Coventry" { return "Coventry City" }
        else        { return $teamname }

    }
}

sub monthConvert{
        switch($_[0]) {
            case "January" { return 1 }
            case "February" { return 2 }
            case "March" { return 3 }
            case "April" { return 4 }
            case "May" { return 5 }
            case "June" { return 6 }
            case "July" { return 7 }
            case "August" { return 8 }
            case "September" { return 9 }
            case "October" { return 10 }
            case "November" { return 11}
            case "December" { return 12 }
        }
}
4

2 回答 2

3

HTML::Query 使用 HTML::Element 和 HTML::TreeBuilder 对文档的节点进行建模。节点以复杂的方式连接,使得 Perl 垃圾收集器无法清理节点。因此,您要么必须

  • 断言你有一个可以使用弱引用的 HTML::Element 版本。这些不会阻止垃圾收集。Ause HTML::TreeBuilder 5 -weak应该做的伎俩。

  • 调用delete该方法的任何结果query

有关更多信息,请参阅文档(例如HTML::Element中的文档)。

下面是您的脚本的清理版本,它试图减少代码重复(原始代码中有明显的复制和粘贴迹象)。它仍然不漂亮,并且仍然存在一些 WTF,但它应该是对可维护性的改进。值得注意的是,我不知道第 12 列到底是什么@allinfogoals(在排序中),或者为什么 CSV 以这种相当奇怪的方式发出(我们已经知道for:列的索引 (→ 2),所以我们不知道) t 已将每一列与预期值相匹配)。

理解一些缺少的 if-else 的提示:当一个字符串不包含某个子字符串时,那么在该子字符串上拆分字符串的返回值等于原始字符串。作为代码:

use Test::More;
my ($string, $substring) = ("foo+bar", "-"); # try it yourself!
my ($split) = split /\Q$substring\E/, $string;
if (-1 == index $string, $substring) {
  is $split, $string;
} else {
  isnt $split, $string;
}
done_testing;

这是清理后的版本:

#!/usr/bin/perl

use strict; use warnings;

use HTML::TreeBuilder 5 -weak;
use HTML::Query;

my $dir = '/home/mark/Documents/Perl/garchivesfiles/completeresults';

opendir my $dh, $dir or die "Can't open $dir: $!";

while (my $filename = readdir $dh) {
    next if $filename =~ /^\./;
    my $q = HTML::Query->new(file => "$dir/$filename");

    my $homescore = $q->query("span.homeScore")->first->as_text;
    my $awayscore = $q->query("span.awayScore")->first->as_text;

    my $hometeam = correctTeamName($q->query("table.teaminfo td.home span")->first->as_text);
    my $awayteam = correctTeamName($q->query("table.teaminfo td.away span")->first->as_text);

    my @allinfogoals;

    if($homescore ne "0" || $awayscore ne "0") {

        my ($fixtureinfo_span) = $q->query("p.fixtureinfo span");
        my (undef, $day, $month, $year) = split ' ', $fixtureinfo_span->as_text;
        my $mysqldate = sprintf '%04d-%02d-%02d', $year, monthConvert($month), $day;

        my ($fixtureinfo) = $q->query('p.fixtureinfo');
        my (undef, $fixtureinfostring) = split / \| /, $fixtureinfo->as_text;

        for my $goal_list (
            [$hometeam, $awayteam, [$q->query("div.home ul li")->as_text]],
            [$awayteam, $hometeam, [$q->query("div.away ul li")->as_text]]
        ) {
            my ($thisteam, $otherteam, $goalstotal) = @$goal_list;
            for my $goal (@$goalstotal) {
                $goal =~ s/\)//g;
                my ($tempcomponent_1, $tempcomponent) = split / \(/, $goal;

                for my $individmultgoal (split/,/, $tempcomponent) {
                    next if -1 != index $individmultgoal, 'OG';
                    $individmultgoal =~ s/Pen//g;
                    $individmultgoal =~ s/ //g;
                    my @timesplit = 
                        (index($individmultgoal, "+") != -1)
                        ? (split /\+/, $individmultgoal)
                        : ($individmultgoal, 0);
                    push @allinfogoals, [
                        $tempcomponent_1,
                        $timesplit[0],
                        "for:$thisteam",
                        $otherteam,
                        $day,
                        $month,
                        $year,
                        $fixtureinfostring,
                        "Barclays Premier League",
                        monthConvert($month),
                        $mysqldate,
                        $timesplit[1],
                    ];
                }
            }
        }

        @allinfogoals = sort { $a->[1] <=> $b->[1] || $a->[12] <=> $b->[12] } @allinfogoals;

        open my $GOALCSV, '>>', 'goalcsv.txt' or die "Can't open goalcsv.txt: $!";

        my $print_both = sub {
            print {$GOALCSV} @_;
            print            @_;
        };

        my $homegoalcount = 0;
        my $awaygoalcount = 0;

        for my $row (@allinfogoals){
            for my $val(@$row){
                if($val eq "for:$hometeam") {
                    $homegoalcount++;
                    $print_both->("$val,$homegoalcount,$awaygoalcount,true,");
                } elsif($val eq "for:$awayteam") {
                    $awaygoalcount++;
                    $print_both->("$val,$awaygoalcount,$homegoalcount,false,");
                } else {
                    $print_both->("$val,");
                }
            }
            $print_both->("\n");
        }
    }
}

closedir $dh;

sub correctTeamName{
    my %teamnames = (
        "Nott'm Forest" => "Nottingham Forest",
        "QPR"           => "Queens Park Rangers",
        "Southampton"   => "Southampton FC",
        "Norwich"       => "Norwich City",
        "Tottenham"     => "Tottenham Hotspur",
        "Leeds"         => "Leeds United",
        "Middlesbrough" => "Middlesbrough FC",
        "Chelsea"       => "Chelsea FC",
        "Arsenal"       => "Arsenal FC",
        "Oldham"        => "Oldham Athletic",
        "Ipswich"       => "Ipswich Town",
        "Man Utd"       => "Manchester United",
        "Man City"      => "Manchester City",
        "Sheffield Wed" => "Sheffield Wednesday",
        "Man City"      => "Manchester City",
        "Blackburn"     => "Blackburn Rovers",
        "Wimbledon"     => "AFC Wimbledon",
        "Liverpool"     => "Liverpool FC",
        "Coventry"      => "Coventry City",
    );
    return exists $teamnames{$_[1]} ? $teamnames{$_[1]} : $_[0];
}

sub monthConvert{
    my $i = 1;
    my %months = map { $_ => $i++ } qw/
        January February    March
        April   May         June
        July    August      September
        October November    December
    /;
    exists $months{$_[0]} or die "Unknown month name $_[0]";
    return $months{$_[0]};
}

注意:代码未经测试,因为没有提供示例文件。至少它编译。

于 2013-05-17T10:28:53.133 回答
3

您的一个或多个文件很可能非常大。

在您浏览文件时打印出文件的名称。您每次都会在其中一个上看到您的代码中断。

于 2013-05-17T04:34:27.437 回答