I asked this question a while ago on Ask Ubuntu and was directed to provide some code here, as it's probably an optimization problem. I've included the whole script.
The general goal is to go through roughly 7000 HTML files in a directory, parse specific information from each of them, and append it to a text file, one line per goal.
#!/usr/bin/perl
use Switch;
use strict;
use HTML::Query 'Query';
# Walk every saved match report in $dir, extract the goals of each match and
# append one comma-separated line per goal to goalcsv.txt.  Every line is
# echoed to STDOUT as well.
#
# Field order of a line:
#   scorer, minute, for:<scoring team>, <goals for>, <goals against>,
#   true|false (home side scored?), opponent, day, month name, year,
#   fixture info, competition, month number, yyyy-mm-dd, added-time minutes
my $dir = '/home/mark/Documents/Perl/garchivesfiles/completeresults';
opendir my $dh, $dir or die "Can't open $dir: $!";
my @files = map {"$dir/$_"} grep { $_ !~ /^\./ } readdir $dh;
closedir $dh;

# The output is opened once, up front, instead of once per match file.
open(my $goalcsv, '>>', 'goalcsv.txt') or die "Can't open goalcsv.txt: $!";

foreach my $file (@files) {
    # Slurp the page through a checked, lexical, three-argument open.
    my $html;
    if (open(my $in, '<', $file)) {
        local $/;
        $html = <$in>;
        close($in);
    }
    else {
        warn "Skipping $file: $!\n";
        next;
    }
    unless (defined $html) {
        warn "Skipping $file: nothing could be read\n";
        next;
    }

    my $q = Query(text => $html);

    my $singlehomescore = _firstText($q, "span.homeScore");
    # The away score has to come from span.awayScore; reading the home span
    # twice made every "0 - n" result look goalless.
    my $singleawayscore = _firstText($q, "span.awayScore");
    my $rawhometeam     = _firstText($q, "table.teaminfo td.home span");
    my $rawawayteam     = _firstText($q, "table.teaminfo td.away span");
    unless (defined $singlehomescore && defined $singleawayscore
        && defined $rawhometeam && defined $rawawayteam) {
        warn "Skipping $file: score or team name not found\n";
        next;
    }
    my $singlehometeam = rightTeamName($rawhometeam);
    my $singleawayteam = rightTeamName($rawawayteam);

    # A goalless match contributes no lines.
    next if $singlehomescore eq "0" && $singleawayscore eq "0";

    my $finaldate       = _firstText($q, "p.fixtureinfo span");
    my $fixtureinfoinit = _firstText($q, "p.fixtureinfo");
    unless (defined $finaldate && defined $fixtureinfoinit) {
        warn "Skipping $file: fixture information not found\n";
        next;
    }

    # The date text is split on whitespace; words 1..3 are used as day,
    # month name and year (word 0 is ignored).
    my @datecomponents = split(" ", $finaldate);
    my $day            = $datecomponents[1];
    my $monthname      = $datecomponents[2];
    my $year           = $datecomponents[3];
    my $monthnumber    = monthConvert($monthname);
    my $mysqldate      = join("-", $year, _zeroPad($monthnumber), _zeroPad($day));

    # The fixture paragraph is split on " | "; the second part is kept.
    my @fixtureinfobrokenup = split(/ \| /, $fixtureinfoinit);
    my $fixtureinfostring   = $fixtureinfobrokenup[1];

    # Fields that are identical for every goal of this match.
    my @matchfields = ($day, $monthname, $year, $fixtureinfostring,
        "Barclays Premier League", $monthnumber, $mysqldate);

    my @allinfogoals;
    foreach my $goal ($q->query("div.home ul li")) {
        push @allinfogoals,
            _goalRows($goal->as_text(), $singlehometeam, $singleawayteam, \@matchfields);
    }
    foreach my $goal ($q->query("div.away ul li")) {
        push @allinfogoals,
            _goalRows($goal->as_text(), $singleawayteam, $singlehometeam, \@matchfields);
    }

    # Chronological order: minute first, added-time minutes as tie-breaker.
    # Rows hold 12 fields, so the added time lives at index 11 (not 12).
    @allinfogoals = sort { $a->[1] <=> $b->[1] || $a->[11] <=> $b->[11] } @allinfogoals;

    # Emit the rows, inserting the running score right after the
    # "for:<team>" field.
    my $homegoalcount = 0;
    my $awaygoalcount = 0;
    foreach my $row (@allinfogoals) {
        my $line = "";
        foreach my $val (@{$row}) {
            if ($val eq "for:" . $singlehometeam) {
                $homegoalcount++;
                $line .= "$val," . $homegoalcount . "," . $awaygoalcount . ",true,";
            }
            elsif ($val eq "for:" . $singleawayteam) {
                $awaygoalcount++;
                $line .= "$val," . $awaygoalcount . "," . $homegoalcount . ",false,";
            }
            else {
                $line .= "$val,";
            }
        }
        $line .= "\n";
        print $goalcsv $line;
        print $line;
    }
}

# Buffered write errors only surface when the handle is closed.
close($goalcsv) or die "Can't close goalcsv.txt: $!";

# Return the text of the first element matching $selector in the query
# object $q, or nothing (undef in scalar context) when no element matches.
sub _firstText {
    my ($q, $selector) = @_;
    my @found = $q->query($selector);
    return unless @found;
    return $found[0]->as_text();
}

# Prefix a one-character value with "0"; any other value (including undef)
# is returned untouched.
sub _zeroPad {
    my ($value) = @_;
    return "0" . $value if defined $value && length($value) == 1;
    return $value;
}

# Turn the text of one scorer entry, e.g. "Name (12, 45 +2, 60 Pen)", into a
# list of row array refs, one per goal.  Entries marked "OG" are left out.
#
#   $goaltext    - text of the list item
#   $scoringteam - team the goal counts for
#   $opponent    - the other team
#   $matchfields - array ref with the seven per-match fields
#
# Row layout: scorer, minute, "for:<team>", opponent, the seven match
# fields, added-time minutes (0 when the goal was not in added time).
sub _goalRows {
    my ($goaltext, $scoringteam, $opponent, $matchfields) = @_;

    $goaltext =~ s/\)//g;
    my @parts     = split(' \(', $goaltext);
    my $scorer    = $parts[0];
    my $timestext = $parts[1];

    # Several goals by the same player are comma separated.
    my @times;
    if (defined $timestext && index($timestext, ",") != -1) {
        @times = split('\,', $timestext);
    }
    else {
        @times = ($timestext);
    }

    my @rows;
    foreach my $time (@times) {
        $time = "" unless defined $time;
        $time =~ s/Pen//g;
        $time =~ s/ //g;
        next if index($time, "OG") != -1;

        my $minute = $time;
        my $extra  = 0;
        if (index($time, "+") != -1) {
            # "90+3" becomes minute 90 with 3 added minutes.
            my @pieces = split('\+', $time);
            $minute = $pieces[0];
            $extra  = $pieces[1];
        }
        push @rows,
            [ $scorer, $minute, "for:" . $scoringteam, $opponent, @{$matchfields}, $extra ];
    }
    return @rows;
}
# Expand the abbreviated club name found on the match pages to the full club
# name.
#
#   $_[0] - team name as it appears on the page
#
# Returns the full name when the abbreviation is listed below, otherwise the
# argument itself, unchanged.
sub rightTeamName{
    my $teamname = $_[0];

    # A plain hash lookup replaces the Switch-based chain: no dependency on
    # the source-filter module, and the "Man City" entry appears only once.
    my %fullname = (
        "Nott'm Forest" => "Nottingham Forest",
        "QPR"           => "Queens Park Rangers",
        "Southampton"   => "Southampton FC",
        "Norwich"       => "Norwich City",
        "Tottenham"     => "Tottenham Hotspur",
        "Leeds"         => "Leeds United",
        "Middlesbrough" => "Middlesbrough FC",
        "Chelsea"       => "Chelsea FC",
        "Arsenal"       => "Arsenal FC",
        "Oldham"        => "Oldham Athletic",
        "Ipswich"       => "Ipswich Town",
        "Man Utd"       => "Manchester United",
        "Man City"      => "Manchester City",
        "Sheffield Wed" => "Sheffield Wednesday",
        "Blackburn"     => "Blackburn Rovers",
        "Wimbledon"     => "AFC Wimbledon",
        "Liverpool"     => "Liverpool FC",
        "Coventry"      => "Coventry City",
    );

    if (defined $teamname && exists $fullname{$teamname}) {
        return $fullname{$teamname};
    }
    return $teamname;
}
# Convert an English month name to its number.
#
#   $_[0] - month name, capitalised and spelled out ("January" .. "December")
#
# Returns 1..12 for a known month name and undef for anything else.
sub monthConvert{
    my $monthname = $_[0];

    # Hash lookup instead of a Switch-based chain: no source filter needed
    # and a single, well-defined result for unknown names.
    my %monthnumber = (
        "January"   => 1,
        "February"  => 2,
        "March"     => 3,
        "April"     => 4,
        "May"       => 5,
        "June"      => 6,
        "July"      => 7,
        "August"    => 8,
        "September" => 9,
        "October"   => 10,
        "November"  => 11,
        "December"  => 12,
    );

    return undef unless defined $monthname;
    return $monthnumber{$monthname};
}