我接到了一项令人沮丧的任务:处理我们刚刚购买的 Royal Mail PAF 文件。我已将所有相关表加载到我们的 SQL 服务器中,并根据相关 ID 将它们关联起来。现在我需要根据多条规则拼出正确的地址,例如只提供了建筑物名称或只提供了门牌号的情况,或者某些特定邮政编码对应邮政信箱(PO Box)的情况。由于这个文件似乎被广泛使用,我希望已经有人做过类似的工作;如果有人能在本周末之前提供任何想法,我将不胜感激。
希望有人可以帮忙,P
确实令人沮丧——这是一项大工程,规则很多,例外也很多。我想您已经看过 PAF 程序员指南中的所有规则了吧?http://www.royalmail.com/sites/default/files/docs/pdf/programmers_guide_edition_7_v5.pdf 如果您希望在本周末之前完成,最好获取“压缩标准”(Compressed Standard)版本,它实际上就是您想要的、已经展开的地址,当然它没有针对数据库做规范化。很久以前,我曾根据关系格式构建过地址。我看看能不能找到代码。
找到了一些丑陋的、8 年前写的 Perl 代码——请谨慎使用
# ##############################################################################
# Main loop
#
# Reads one tab-separated record per line from the PAF filehandle, assembles
# the address lines according to which of sub-building name / building name /
# building number are present, and prints one pipe-delimited record per input
# line to STDOUT.
#
# NOTE: $fmtDepSteet, $fmtSteet, $fmtDepLocality and $fmtLocality are
# deliberately package globals: InsertNumber() prepends the premises number to
# the first non-empty one of them.  $incount is a global running counter.
# ##############################################################################
while ( <PAF> ) # Reading from mysql view - you can work this out from the fields populated by the split /\t/
{
    chomp;
    $incount++;
    print STDERR "Processed ..... $incount\n" if $incount%1000000 == 0;

    # Limit -1 keeps trailing empty fields so the columns never shift.
    my ( $postcode      , $aKey          , $lkey       , $skey ,
         $sekey         , $dskey         , $dsekey     , $bnumber ,
         $bkey          , $sbkey         , $households , $oKey ,
         $fmtPcType     , $concat        , $dps        , $fmtsUser ,
         $poBox         , $version       , $active ,
         $fmtCompany    , $fmtDepartment , $rawSubLine , $rawBldLine ,
         $DepSteet      , $fmtDepSteetEnd, $Steet      , $fmtSteetEnd ,
         $DepLocality   , $Locality      , $PostTown )
        = split /\t/ , $_ , -1 ;

    # Keys are zero-padded to 8 characters; an all-zero organisation key is
    # blanked, and for postcode type "L" the address key is used instead.
    my $fmtaKey = sprintf("%08s",$aKey);
    my $fmtoKey = sprintf("%08s",$oKey);
    $fmtoKey = "" if $fmtoKey eq "00000000" ;
    $fmtoKey = $fmtaKey if $fmtPcType eq "L";

    # Package globals (see note above) - reset for every record.
    $fmtDepLocality = removePunch($DepLocality);
    $fmtLocality    = removePunch($Locality,"dq");
    $fmtDepSteet    = removePunch($DepSteet);
    $fmtSteet       = removePunch($Steet,"dh");
    $rawSubLine     = removePunch($rawSubLine,"d");
    $rawBldLine     = removePunch($rawBldLine,"d");
    my $fmtPostTown = removePunch($PostTown);

    # Split the postcode into a 4-character outward and 3-character inward part.
    my ($opc,$ipc) = unpack("A4A3" ,$postcode);
    $opc =~ s/\s$//;

    # FIX: the original "my $fmtPoBox = ... if (...)" has undefined behaviour
    # (perlsyn); in practice the previous record's PO Box could be carried
    # over into a record that has none.  Always assign a value.
    my $fmtPoBox = length($poBox) > 0 ? "PO BOX $poBox" : "";

    my ($fmtSubBuilding , $fmtBuilding ) = ( "","") ;

    # ##########################################################################
    # Format rules are based on presence of building name, sub-building and
    # number.  See PAF Digest for details.
    # ##########################################################################
    my ($subBuildingFlag,$buildingFlag,$buildingNumberFlag) = ("N","N","N") ;
    $buildingNumberFlag ="Y" if $bnumber ne "0" ;
    $buildingFlag       ="Y" if length($rawBldLine) > 0;
    $subBuildingFlag    ="Y" if length($rawSubLine) > 0;
    my $formatKey="${subBuildingFlag}${buildingFlag}${buildingNumberFlag}";

    # Stays empty when no rule matches (e.g. key "YNN"), which prints the same
    # empty field the original produced from an undefined value.
    my $ruleid = "";

    # Numbers extracted by F1Check() in rules 6 and 7; lexical so nothing can
    # leak between records.
    my ($N1, $N2);

    # --------------------------------------------------------------------------
    # Formatting Rule 1 (Org name only)
    # --------------------------------------------------------------------------
    if ( $formatKey eq "NNN" )
    {
        $ruleid="1";
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 2 (Building number only)
    # --------------------------------------------------------------------------
    elsif ( $formatKey eq "NNY" )
    {
        $ruleid="2";
        InsertNumber ($bnumber) ;
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 3 (Building name only)
    # --------------------------------------------------------------------------
    elsif ($formatKey eq "NYN")
    {
        $ruleid="3";
        my $N="";
        ($fmtBuilding,$N) = split /\|/ , F1Check ($rawBldLine) , -1 ;
        InsertNumber ($N) ;
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 4 (Building name and building number)
    # --------------------------------------------------------------------------
    elsif ($formatKey eq "NYY")
    {
        $ruleid="4";
        $fmtBuilding = "$rawBldLine";
        InsertNumber ($bnumber) ;
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 5 (Sub-building name and building number)
    # --------------------------------------------------------------------------
    elsif ($formatKey eq "YNY")
    {
        $ruleid="5";
        if ($concat eq "Y")
        {
            # Concatenation flag set: number and sub-building form one token.
            my $numSub = "$bnumber $rawSubLine";
            InsertNumber ($numSub) ;
        }
        else
        {
            $fmtSubBuilding = "$rawSubLine";
            InsertNumber ($bnumber) ;
        }
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 6 (Sub-building name and building name)
    # --------------------------------------------------------------------------
    elsif ($formatKey eq "YYN" )
    {
        $ruleid="6";
        ($fmtSubBuilding,$N1) = split /\|/ , F1Check ($rawSubLine) , -1 ;
        ($fmtBuilding,$N2)    = split /\|/ , F1Check ($rawBldLine) , -1 ;
        if ( $fmtSubBuilding eq "" )
        {
            # The sub-building line was entirely a number.
            if ( $N2 =~ /^REAR OF/ )
            {
                $fmtSubBuilding .= "$N1 $N2" ;
                $N1 = "" ;
                $N2 = "" ;
            }
            else
            {
                $fmtSubBuilding .= "$N1 " . $fmtBuilding;
                $fmtBuilding = "";
                $N1 = "" ;
            }
        }
        # Prefer the building's number; fall back to the sub-building's.
        $N2 = $N1 if $N2 eq "";
        InsertNumber ($N2) ;
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 7 (Sub-building name, building name and building number)
    # --------------------------------------------------------------------------
    elsif ($formatKey eq "YYY" )
    {
        $ruleid="7";
        ($fmtSubBuilding,$N1) = split /\|/ , F1Check ($rawSubLine) , -1 ;
        ($fmtBuilding,$N2)    = split /\|/ , F1Check ($rawBldLine) , -1 ;
        if ( $fmtSubBuilding eq "" )
        {
            $fmtSubBuilding .= "$N1 " . $fmtBuilding;
            $fmtBuilding = "";
        }
        InsertNumber ($bnumber) ;
    }

    # ##########################################################################
    # Format the address
    # ##########################################################################
    #----------------------------------------------------------------
    # subbuilding = building if no subbuilding (why?)
    #----------------------------------------------------------------
    if (( length($fmtBuilding) > 0) && ($fmtSubBuilding eq "" ))
    {
        $fmtSubBuilding = $fmtBuilding;
        $fmtBuilding = "";
    }
    #----------------------------------------------------------------
    # Get rid of duplicate lines
    #----------------------------------------------------------------
    if ("${fmtSubBuilding}${fmtBuilding}" eq "${fmtSteet}${fmtSteetEnd}")
    {
        $fmtSubBuilding="";
        $fmtBuilding="";
    }
    #----------------------------------------------------------------
    # Parse out number ranges (including number suffixes)
    #----------------------------------------------------------------
    my ($fmtStreetName ,$lo_num , $low_suf , $hi_num ,$hi_suf) = split /\|/ , getLoHiNum ("$fmtSteet") , -1 ;
    my $dependentTfare = "$fmtDepSteet";
    $dependentTfare .= " $fmtDepSteetEnd" if length($fmtDepSteetEnd) > 0;

    # Example of an output line (as given in the original script):
    # ABERDEEN CITY COUNCIL|EDUCATION DEPARTMENT||ST NICHOLAS HOUSE||||BROAD|STREET|||ABERDEEN|ABERDEENSHIRE|AB10|1AG|1A|0|01901355|01901355|L||
    # FIX: plain assignment; the original appended (.=) to a just-declared variable.
    my $fmtaddr = "$postcode|$dps|$fmtaKey|$fmtoKey|$fmtPoBox|$fmtSubBuilding|$fmtBuilding|$fmtDepSteet|$fmtDepSteetEnd|" ;
    $fmtaddr .= "$fmtSteet|$fmtSteetEnd|$fmtDepLocality|$fmtLocality|$fmtPostTown|$opc|$ipc|" ;
    $fmtaddr .= "$ruleid|$lo_num|$low_suf|$hi_num|$hi_suf|$dependentTfare|$fmtStreetName";
    print "$fmtaddr\n";
}
close PAF;
# # ############################################################################
# SUB F1Check (PAF Digest Note:1 Page 42)
#
# Try to extract a number embedded in the (sub)building name.
# There are loads of exceptions to the rules in the PAF digest,
# hence the horrible regexes.
#
# Argument : the raw (sub)building name line.
# Returns  : the string "NAME|NUMBER".  Either part may be empty:
#              - "|NUMBER"  when the whole line is a number/range or starts
#                           with "REAR OF" and contains a digit;
#              - "NAME|"    when no number is split off (no digit at all, or
#                           the name matches the keep-together list below).
# # ############################################################################
sub F1Check
{
    my ($building) = (@_);
    my $bnumber = "";

    if ($building =~ m/^REAR OF.+\d/ )
    {
        # "REAR OF ..." lines are treated entirely as the number part.
        $bnumber  = $building;
        $building = "" ;
    }
    # Only try to split when there is a digit AND the name is not one of the
    # forms whose number belongs to the name itself (UNIT 3, FLAT 2, ...).
    # The pattern uses /x, so literal spaces must be escaped.
    elsif (($building =~ m/\d+/) && ($building !~ /UNIT[S]?\s|
        FLAT[S]?\s|
        ^DP\d+[A-Z]?$|
        BLOCK[S]?\s|
        ^OFFICE[S]?\s|
        HANG[EA]R\s|
        ^BUILDING[S]?\s|
        ^HOUSE\s|
        HOLDING\s|
        APARTMENT\s|
        CHALET\s|
        ^BUNGALOW\s|
        ^CARAVAN\s|
        ^ANNEXE\s|
        ^PLOT[S]?\s|
        ^HOME[S]?\s|
        ^LODGE\s\d|
        ^PLATFORM\s|
        ^DOMUS\s|
        ^BARLOW\s|
        ^BEALAH\s|
        ^KIOSK[S]?\s|
        ^LEVEL[S]?\s|
        ^VILLA\s|
        MEOTA\s|
        MAXI[NM]\s|
        ^AQUEOUS\s|
        SUITE[S]?\s|
        WING\s|
        ^CAMPUS\s|
        ^STUDIO\s|
        ^COTTAGE\s|
        ^STALL\s|
        ^SHOP\s|
        ^ARCH\s|
        ^QUAY\s|
        ^ABOVE\s\d|
        ^LINK\s|
        JETTY\s|
        WAREHOUSE\s|
        ^HOLDING\s|
        ^PENTHOUSE\s|
        ^MOORING\s|
        ^BOTHY\s|
        ^MAISONETTE\s|
        ^SITE\s|
        ^WORKSHOP\s|
        ^BARN[S]?\s|
        STALL\s|
        ^BOAT\s|
        ^STAND\s|
        ^TOWER\s\d|
        ^YARD\s\d|
        ^STANCE\s|
        ^VAN\s|
        ^BAY\s\d|
        ^MOBILE\ HOME\s|   # FIX: space escaped; under x a bare space is ignored
        ^STABLE\s|
        ^ROOM\s|
        ^[A-Z]\d+$
        /x
        ))
    {
        # Case 1: the whole line is a number, a number range or a single letter.
        if ( $building =~ m/^([A-Z]?\d+[A-Z]{0,2}|
            \d+[A-Z]{0,2}[\-\&\ \\\/]{0,1}\d+[A-Z]{0,2}|
            [A-Z])$/x )
        {
            $bnumber  = $building;
            $building = "" ;
        }
        # Case 2: a name followed by a trailing number (optionally a range).
        elsif ($building =~ m/^([A-Z]?\d*\s*[A-Z\s\.]+)(\d+[A-Z]{0,2}[\-\&\ \\\/]{0,1}\d*[A-Z]{0,2})$/)
        {
            # Names of one or two characters are left untouched.
            if (length($1) > 2)
            {
                $bnumber  = $2;
                $building = $1;
                $building =~ s/\s+$//g;
            }
        }
        # Case 3: any non-digit prefix followed by a full number range.
        elsif ($building =~ m/^(\D+)(\d+[A-Z]{0,2}[\-\&\ \\\/]\d+[A-Z]{0,2})$/)
        {
            $bnumber  = $2;
            $building = $1;
            $building =~ s/\s+$//g;
        }
    }
    return "$building|$bnumber"
}
# # ############################################################################
# SUB InsertNumber
#
# Prepend a premises number to the first non-blank address line, trying in
# order: dependent thoroughfare, thoroughfare, dependent locality.  If all
# three are blank the number is prepended to the locality.
#
# Argument : the number text; ignored unless it contains a digit and is not
#            the literal string "0".
# Effects  : modifies one of the package globals $fmtDepSteet, $fmtSteet,
#            $fmtDepLocality or $fmtLocality in place.
# # ############################################################################
sub InsertNumber
{
    my ($number) = @_;

    # Nothing to insert without a real number.
    return unless $number =~ /\d/ && $number ne "0";

    # First non-empty candidate line wins.
    for my $line_ref (\$fmtDepSteet, \$fmtSteet, \$fmtDepLocality)
    {
        next unless length($$line_ref) > 0;
        $$line_ref = $number . " " . $$line_ref;
        return;
    }

    # Fallback: every candidate was blank.
    $fmtLocality = $number . " " . $fmtLocality;
    return;
}
# # ############################################################################
# SUB getLoHiNum
#
# Extract a leading number or number range (with optional single-letter
# suffixes) from a thoroughfare line.
#
# Argument : the thoroughfare text, e.g. "12A-14B HIGH STREET".
# Returns  : "NAME|LOW|LOW_SUFFIX|HIGH|HIGH_SUFFIX", where NAME is the
#            thoroughfare with the leading number removed.  When the line
#            does not start with a number, NAME is the input unchanged and
#            the other four fields are empty.
# # ############################################################################
sub getLoHiNum
{
    my ($tfare) = @_;
    my ($low, $low_suffix, $high, $high_suffix) = ("", "", "", "");

    # Separate a trailing letter from its number; values that do not end in a
    # letter are handed back untouched together with the current suffix.
    my $split_suffix = sub {
        my ($value, $suffix) = @_;
        if ( $value =~ m/[A-Z]$/ )
        {
            ($value, $suffix) = ($value =~ /(\d+)([A-Z])$/ );
        }
        return ($value, $suffix);
    };

    # Leading number token and the remainder of the line.
    my ($number, $rest) = ($tfare =~ /^(\d+[A-Z]?[\-]?\d*[A-Z]?)\s(.*)$/ );

    if (length($number) > 0)
    {
        $tfare = $rest;

        # A hyphen marks a range; otherwise there is only a low number.
        if ( $number =~ m/[\-]{1}/ )
        {
            ($low, $high) = ($number =~ /(\d+.*)[\-](\d+.*)/ );
        }
        else
        {
            $low = $number;
        }

        ($low,  $low_suffix)  = $split_suffix->($low,  $low_suffix);
        ($high, $high_suffix) = $split_suffix->($high, $high_suffix);
    }

    return "$tfare|$low|$low_suffix|$high|$high_suffix" ;
}
# # ############################################################################
# SUB removePunch
#
# Remove punctuation from a string.
#
# Arguments: the text to clean, and an optional string of flag letters
#            selecting what to strip (all four when omitted or empty):
#              d - delete full stops
#              h - turn hyphens into spaces
#              q - delete single quotes
#              a - delete at-signs
# Returns  : the cleaned text.
# # ############################################################################
sub removePunch
{
    my ($dirtyWord, $punch) = @_;
    $punch = "dhqa" if length($punch) == 0;

    # Flag letter => in-place edit, applied in this fixed order.
    my @cleaners = (
        [ "d", sub { $_[0] =~ s/\.//g  } ],
        [ "h", sub { $_[0] =~ s/\-/ /g } ],
        [ "q", sub { $_[0] =~ s/\'//g  } ],
        [ "a", sub { $_[0] =~ s/\@//g  } ],
    );

    for my $cleaner (@cleaners)
    {
        my ($flag, $apply) = @{$cleaner};
        next if index($punch, $flag) < 0;
        $apply->($dirtyWord);
    }

    return $dirtyWord;
}
以防其他人也遇到这个问题:如果我对问题的理解正确(即从数据文件重新构建地址),RM PAF 文件中各列的顺序是反过来的。因此,要构建一个简单的地址字符串,只需从右到左依次检查各列:如果某列有值,就用空格把它连接到地址字符串中。参考一些用到了每个字段的示例,就能弄清楚空格和逗号该如何处理。