chdir("c:/perl/normalized");
$docid=0;
use List::MoreUtils qw( uniq );
my %hash = ();
@files = <*>;
foreach $file (@files)
{
$docid++;
open (input, $file);
while (<input>)
{
open (output,'>>c:/perl/postinglist/total');
chomp;
(@words) = split(" ");
foreach $word (@words)
{
push @{ $hash{$word} }, $docid;
}
}
}
foreach $key (sort keys %hash)
{
$size = scalar (@{$hash{$key}});
print output "Term: $key, Frequency:$size, Document(s):", join(" ", uniq @{ $hash{$key} }), "\n";
}
close (input);
close (output);
Before adding uniq, the join(" ", @{ $hash{$key} }) output looked like this:

Term:of Frequency:35 Document(s): 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 4 4 4 4 5 6 6 7 7 7 7 7 7 7 7 7

which shows how the frequency is distributed across the documents. With join(" ", uniq @{ $hash{$key} }) the output becomes:

Term:of Frequency:35 Document(s):1 2 3 4 5 6 7

Everything is fine up to this point... I would like to keep a counter for the duplicates that uniq removes, so that my new output becomes:

Term:of Frequency:35 Document(s) of: 1(10) 2(7) 3(2) 4(4) 5(1) 6(2) 7(9)

where the numbers in parentheses are those counters (how many times the term occurs in each document).
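For reference, such per-document counts can also be kept with a plain hash of hashes that is incremented once per word occurrence. The snippet below is only a minimal, self-contained sketch with made-up sample data (%count, %docs and the example text are hypothetical and not part of my script):

#!/usr/bin/perl
use strict;
use warnings;

# Sketch: count how often each word occurs in each document with a hash of
# hashes, then print every term as "term doc(count) doc(count) ...".
my %docs = (
    1 => "of the of of",
    2 => "of cat",
);

my %count;
while ( my ($docid, $text) = each %docs ) {
    $count{$_}{$docid}++ for split " ", $text;   # one increment per occurrence
}

foreach my $term (sort keys %count) {
    my @parts = map { "$_($count{$term}{$_})" }
                sort { $a <=> $b } keys %{ $count{$term} };
    print "$term @parts\n";                      # e.g. of 1(3) 2(1)
}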
In the end I was able to solve my own problem by making a few changes to the source code:
chdir("c:/perl/normalized");
$docid=0;
my %hash = ();
@files = <*>;
foreach $file (@files)
{$counter=0;
$docid++;
open (input, $file);
while (<input>)
{
open (output,'>>c:/perl/tokens/total');
chomp;
(@words) = split(" ");
foreach $word (@words)
{
push @{ $hash{$word}{$docid}},$counter;
@{$hash{$word}{$docid}}[$counter]++;
}
}
}
foreach my $line (sort keys %hash) {
print output "Term:$line \n";
foreach my $elem (sort keys %{$hash{$line}}) {
print output" Doc:$elem " . "freq:".@{$hash{$line}->{$elem}} . "\n";
}
}
close (input);
close (output);
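If the exact "Document(s) of: 1(10) 2(7) ..." layout from the question is wanted instead of one line per document, the final print loop could be rewritten roughly as below. This is only a sketch, not part of the original script: it assumes the %hash{$term}{$docid} arrays built above (whose lengths are the per-document counts) and prints to STDOUT for simplicity.

use List::Util qw( sum );

# Sketch of an alternative print loop for the "doc(count)" layout,
# relying on the arrays built above: their lengths are the per-document counts.
foreach my $term (sort keys %hash) {
    my $docs  = $hash{$term};
    my $total = sum map { scalar @{ $docs->{$_} } } keys %$docs;
    my @parts = map { "$_(" . scalar @{ $docs->{$_} } . ")" }
                sort { $a <=> $b } keys %$docs;
    print "Term:$term Frequency:$total Document(s) of: @parts\n";
}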