性能比较
注意:表中的时间不包括加载文件所需的时间。
| approach | american-english, | big.txt, | time w.r.t. defaultdict |
| | time, seconds | time, seconds | |
|----------------+-------------------+---------------+-------------------------|
| Counter | 0.451 | 3.367 | 3.6 |
| setdefault | 0.348 | 2.320 | 2.5 |
| list | 0.277 | 1.822 | 2 |
| try/except | 0.158 | 1.068 | 1.2 |
| defaultdict | 0.141 | 0.925 | 1 |
| numpy | 0.012 | 0.076 | 0.082 |
| S.Mark's ext. | 0.003 | 0.019 | 0.021 |
| ext. in Cython | 0.001 | 0.008 | 0.0086 |
#+TBLFM: $4=$3/@7$3;%.2g
使用的文件:'/usr/share/dict/american-english'
和'big.txt'
.
比较“Counter”、“setdefault”、“list”、“try/except”、“defaultdict”、“numpy”、“cython”和@S.Mark 解决方案的脚本位于http://gist。 github.com/347000
最快的解决方案是用 Cython 编写的 Python 扩展:
import cython
@cython.locals(
chars=unicode,
i=cython.Py_ssize_t,
L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
for i in range(0x10000): # unicode code points > 0xffff are not supported
L[i] = 0
for c in chars:
L[c] += 1
return {unichr(i): L[i] for i in range(0x10000) if L[i]}
之前的比较:
* python (dict) : 0.5 seconds
* python (list) : 0.5 (ascii) (0.2 if read whole file in memory)
* perl : 0.5
* python (numpy): 0.07
* c++ : 0.05
* c : 0.008 (ascii)
输入数据:
$ tail /usr/share/dict/american-english
éclat's
élan
élan's
émigré
émigrés
épée
épées
étude
étude's
études
$ du -h /usr/share/dict/american-english
912K /usr/share/dict/american-english
蟒蛇(计数器):0.5秒
#!/usr/bin/env python3.1
import collections, fileinput, textwrap
chars = (ch for word in fileinput.input() for ch in word.rstrip())
# faster (0.4s) but less flexible: chars = open(filename).read()
print(textwrap.fill(str(collections.Counter(chars)), width=79))
运行:
$ time -p python3.1 count_char.py /usr/share/dict/american-english
计数器({'e':87823,'s':86620,'i':66548,'a':62778,'n':56696,'r':
56286,'t':51588,'o':48425,'l':39914,'c':30020,'d':28068,'u':25810,
“'”:24511,'g':22262,'p':20917,'m':20747,'h':18453,'b':14137,'y':
12367,'f':10049,'k':7800,'v':7573,'w':6924,'z':3088,'x':2082,'M':
1686,'C':1549,'S':1515,'q':1447,'B':1387,'j':1376,'A':1345,'P':
974,“L”:912,“H”:860,“T”:858,“G”:811,“D”:809,“R”:749,“K”:656,“E”:
618,'J':539,'N':531,'W':507,'F':502,'O':354,'I':344,'V':330,'Z':
150,“Y”:140,“é”:128,“U”:117,“Q”:63,“X”:42,“è”:29,“ö”:12,“ü”:12,
'ó': 10, 'á': 10, 'ä': 7, 'ê': 6, 'â': 6, 'ñ': 6, 'ç': 4, 'å': 3, 'û ': 3, 'í':
2, 'ô': 2, 'Å': 1})
实数 0.44
用户 0.43
系统 0.01
perl:0.5 秒
time -p perl -MData::Dumper -F'' -lanwe'$c{$_}++ for (@F);
END{ $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; print Dumper(\%c) }
' /usr/share/dict/american-english
输出:
{'S' => 1515,'K' => 656,'' => 29,'d' => 28068,'Y' => 140,'E' => 618,'y' => 12367,' g' => 22262,'e' => 87823,'' => 2,'J' => 539,'' => 241,'' => 3,'' => 6,'' => 4, '' => 128,'D' => 809,'q' => 1447,'b' => 14137,'z' => 3088,'w' => 6924,'Q' => 63,'' => 10,'M' => 1686,'C' => 1549,'' => 10,'L' => 912,'X' => 42,'P' => 974,'' => 12 ,'\'' => 24511,'' => 6,'a' => 62778,'T' => 858,'N' => 531,'j' => 1376,'Z' => 150, '你' => 25810,'k' =>7800,'t' => 51588,'' => 6,'W' => 507,'v' => 7573,'s' => 86620,'B' => 1387,'H' => 860, 'c' => 30020,'' => 12,'I' => 344,'' => 3,'G' => 811,'U' => 117,'F' => 502,'' = > 2,'r' => 56286,'x' => 2082,'V' => 330,'h' => 18453,'f' => 10049,'' => 1,'i' => 66548 ,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917,' R' => 749,'o' => 48425}'c' => 30020,'' => 12,'I' => 344,'' => 3,'G' => 811,'U' => 117,'F' => 502,'' = > 2,'r' => 56286,'x' => 2082,'V' => 330,'h' => 18453,'f' => 10049,'' => 1,'i' => 66548 ,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917,' R' => 749,'o' => 48425}'c' => 30020,'' => 12,'I' => 344,'' => 3,'G' => 811,'U' => 117,'F' => 502,'' = > 2,'r' => 56286,'x' => 2082,'V' => 330,'h' => 18453,'f' => 10049,'' => 1,'i' => 66548 ,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917,' R' => 749,'o' => 48425}66548,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917, 'R' => 749,'o' => 48425}66548,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917, 'R' => 749,'o' => 48425}
实数 0.51
用户 0.49
系统 0.02
蟒蛇(numpy):0.07秒
基于Ants Aasma 的回答(修改为支持 unicode):
#!/usr/bin/env python
import codecs, itertools, operator, sys
import numpy
filename = sys.argv[1] if len(sys.argv)>1 else '/usr/share/dict/american-english'
# ucs2 or ucs4 python?
dtype = {2: numpy.uint16, 4: numpy.uint32}[len(buffer(u"u"))]
# count ordinals
text = codecs.open(filename, encoding='utf-8').read()
a = numpy.frombuffer(text, dtype=dtype)
counts = numpy.bincount(a)
# pretty print
counts = [(unichr(i), v) for i, v in enumerate(counts) if v]
counts.sort(key=operator.itemgetter(1))
print ' '.join('("%s" %d)' % c for c in counts if c[0] not in ' \t\n')
输出:
("Å" 1) ("í" 2) ("ô" 2) ("å" 3) ("û" 3) ("ç" 4) ("â" 6) ("ê" 6) ("ñ" 6) ("ä" 7) ("á" 10) ("ó" 10) ("ö" 12) ("ü" 12) ("è" 29) ("X" 42) ("Q" 63) ("U" 117) ("é" 128) ("Y" 140) ("Z" 150) ("V" 330) ("I" 344) ("O" 354) ("F" 502) ("W" 507) ("N" 531) ("J" 539) ("E" 618) ("K" 656) ("R" 749) ("D" 809) ("G" 811) ("T" 858) ("H" 860) ("L" 912) ("P" 974) ("A" 1345) ("j" 1376) ("B" 1387) ("q" 1447) ("S" 1515) ("C" 1549) ("M" 1686) ("x" 2082) ("z" 3088) ("w" 6924) ("v" 7573) ("k" 7800) ("f" 10049) ("y" 12367) ("b" 14137) ("h" 18453) ("m" 20747) ("p" 20917) ("g" 22262) ("'" 24511) ("u" 25810) ("d" 28068) ("c" 30020) ("l" 39914) ("o" 48425) ("t" 51588) ("r" 56286) ("n" 56696) ("a" 62778) ("i" 66548) ("s" 86620) ("e" 87823)
real 0.07
user 0.06
sys 0.01
c++: 0.05 秒
// $ g++ *.cc -lboost_program_options
// $ ./a.out /usr/share/dict/american-english
#include <iostream>
#include <fstream>
#include <cstdlib> // exit
#include <boost/program_options/detail/utf8_codecvt_facet.hpp>
#include <boost/tr1/unordered_map.hpp>
#include <boost/foreach.hpp>
int main(int argc, char* argv[]) {
using namespace std;
// open input file
if (argc != 2) {
cerr << "Usage: " << argv[0] << " <filename>\n";
exit(2);
}
wifstream f(argv[argc-1]);
// assume the file has utf-8 encoding
locale utf8_locale(locale(""),
new boost::program_options::detail::utf8_codecvt_facet);
f.imbue(utf8_locale);
// count characters frequencies
typedef std::tr1::unordered_map<wchar_t, size_t> hashtable_t;
hashtable_t counts;
for (wchar_t ch; f >> ch; )
counts[ch]++;
// print result
wofstream of("output.utf8");
of.imbue(utf8_locale);
BOOST_FOREACH(hashtable_t::value_type i, counts)
of << "(" << i.first << " " << i.second << ") ";
of << endl;
}
结果:
$ cat output.utf8
(í 2) (O 354) (P 974) (Q 63) (R 749) (S 1,515) (ñ 6) (T 858) (U 117) (ó 10) (ô 2) (V 330) (W 507) (X 42) (ö 12) (Y 140) (Z 150) (û 3) (ü 12) (a 62,778) (b 14,137) (c 30,020) (d 28,068) (e 87,823) (f 10,049) (g 22,262) (h 18,453) (i 66,548) (j 1,376) (k 7,800) (l 39,914) (m 20,747) (n 56,696) (o 48,425) (p 20,917) (q 1,447) (r 56,286) (s 86,620) (t 51,588) (u 25,810) (Å 1) (' 24,511) (v 7,573) (w 6,924) (x 2,082) (y 12,367) (z 3,088) (A 1,345) (B 1,387) (C 1,549) (á 10) (â 6) (D 809) (E 618) (F 502) (ä 7) (å 3) (G 811) (H 860) (ç 4) (I 344) (J 539) (è 29) (K 656) (é 128) (ê 6) (L 912) (M 1,686) (N 531)
c (ascii): 0.0079 秒
// $ gcc -O3 cc_ascii.c -o cc_ascii && time -p ./cc_ascii < input.txt
#include <stdio.h>
enum { N = 256 };
size_t counts[N];
int main(void) {
// count characters
int ch = -1;
while((ch = getchar()) != EOF)
++counts[ch];
// print result
size_t i = 0;
for (; i < N; ++i)
if (counts[i])
printf("('%c' %zu) ", (int)i, counts[i]);
return 0;
}