这是给任何感兴趣的人的快速 C 版本。我机器上的标题时间:
Python(>5Gb 内存)
time ./read.py largefile.txt
real 0m48.711s
user 0m46.911s
sys 0m1.783s
C(~1.9Gb 内存)
gcc -O3 read.c -o read
time ./read largefile.txt
real 0m6.250s
user 0m5.676s
sys 0m0.573s
所以在 C 语言中大约快 7.8 倍。:)
我应该补充一点,我的 seq 版本不会在不将命令更改为的情况下创建可用列表:
paste <(seq -f "%.0f" 20000000) <(seq -f "%.0f" 2 20000001) > largefile.txt
下面的代码,必须归功于 Vijay Mathew,他将 C 编程语言第 6.6 节中的 dict 示例复制到他的示例中(我复制到下面的答案中):
在 C 中实现字典的快速方法
====== 编辑 ====== (13/08/2013)
继我的回答的评论 #2 之后,我已将代码更新为代码清单 2 中的代码,以允许单个键具有多个值,并且还开始使用更新的 perl 代码生成测试文件(因此大小只有一半大约一半的执行时间)。
标题时间是:
Python(>5Gb 内存)
time ./read.py largefile.txt
real 0m25.925s
user 0m25.228s
sys 0m0.688s
C(~1.9Gb 内存)
gcc -O3 read.c -o read
time ./read largefile.txt
real 0m3.497s (although sub 3 seconds is possible by reducing the hash size?!?!?)
user 0m3.183s
sys 0m0.315s
因此,尽管 panda 可能很接近,但 C 语言的速度大约快了 7.4 倍。
然而,关于大小的这一点很重要。我可以通过将散列大小减小到一个非常小的数字来“作弊”,这对于多值字典会以查找为代价来提高插入速度。因此,要真正测试这些实现中的任何一个,您确实还需要测试查找速度。
代码 2(多值字典)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct nlist { /* table entry: */
struct nlist *next; /* next entry in chain */
char *name; /* defined name */
char *defn; /* replacement text */
};
#define HASHSIZE 10000001
static struct nlist *hashtab[HASHSIZE]; /* pointer table */
/* hash: form hash value for string s */
unsigned hash(char *s)
{
unsigned hashval;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31 * hashval;
return hashval % HASHSIZE;
}
/* lookup: look for s in hashtab */
struct nlist *lookup(char *s)
{
struct nlist *np;
for (np = hashtab[hash(s)]; np != NULL; np = np->next)
if (strcmp(s, np->name) == 0)
return np; /* found */
return NULL; /* not found */
}
struct nlist * lookup_all(char *key)
{
struct nlist *np, *np2, *ret;
unsigned hashval = hash(key);
ret = NULL;
for (np = hashtab[hashval]; np != NULL; np = np->next) {
if (strcmp(key, np->name) == 0) {
np2 = malloc(sizeof(*np2));
np2->name = np->name;
np2->defn = np->defn;
np2->next = ret;
ret = np2;
}
}
return ret; /* not found */
}
/* install: put (name, defn) in hashtab */
struct nlist *install(char *name, char *defn)
{
struct nlist *np, *np2;
unsigned hashval = hash(name);;
//if ((np = lookup(name, hashval)) == NULL) { /* not found */
np = (struct nlist *) malloc(sizeof(*np));
if (np == NULL || (np->name = strdup(name)) == NULL)
return NULL;
np->next = hashtab[hashval];
hashtab[hashval] = np;
if ((np->defn = strdup(defn)) == NULL)
return NULL;
return np;
}
#ifdef STRDUP
char *strdup(char *s) /* make a duplicate of s */
{
char *p;
p = (char *) malloc(strlen(s)+1); /* +1 for .\0. */
if (p != NULL)
strcpy(p, s);
return p;
}
#endif /* STRDUP */
int main(int argc, char *argv[]) {
FILE *fp;
char str1[20];
char str2[20];
int size = 0;
int progress = 0;
struct nlist *result;
fp = fopen(argv[1],"r");
if(fp==NULL) {return 1;}
fseek(fp, 0, SEEK_END);
size = ftell(fp);
rewind(fp);
while(size != ftell(fp)) {
if(0==fscanf(fp, "%s %s",str1,str2))
break;
(void)install(str1,str2);
}
printf("Done\n");
fclose(fp);
// Test a lookup to see if we get multiple items back.
result = lookup_all("1");
while (result) {
printf("Key = %s Value = %s\n",result->name,result->defn);
result = result->next;
}
return 0;
}
代码 1(单值字典)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct nlist { /* table entry: */
struct nlist *next; /* next entry in chain */
char *name; /* defined name */
char *defn; /* replacement text */
};
#define HASHSIZE 10000001
static struct nlist *hashtab[HASHSIZE]; /* pointer table */
/* hash: form hash value for string s */
unsigned hash(char *s)
{
unsigned hashval;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31 * hashval;
return hashval % HASHSIZE;
}
/* lookup: look for s in hashtab */
struct nlist *lookup(char *s)
{
struct nlist *np;
for (np = hashtab[hash(s)]; np != NULL; np = np->next)
if (strcmp(s, np->name) == 0)
return np; /* found */
return NULL; /* not found */
}
/* install: put (name, defn) in hashtab */
struct nlist *install(char *name, char *defn)
{
struct nlist *np;
unsigned hashval;
if ((np = lookup(name)) == NULL) { /* not found */
np = (struct nlist *) malloc(sizeof(*np));
if (np == NULL || (np->name = strdup(name)) == NULL)
return NULL;
hashval = hash(name);
np->next = hashtab[hashval];
hashtab[hashval] = np;
} else /* already there */
free((void *) np->defn); /*free previous defn */
if ((np->defn = strdup(defn)) == NULL)
return NULL;
return np;
}
#ifdef STRDUP
char *strdup(char *s) /* make a duplicate of s */
{
char *p;
p = (char *) malloc(strlen(s)+1); /* +1 for .\0. */
if (p != NULL)
strcpy(p, s);
return p;
}
#endif /* STRDUP */
int main(int argc, char *argv[]) {
FILE *fp;
char str1[20];
char str2[20];
int size = 0;
int progress = 0;
fp = fopen(argv[1],"r");
if(fp==NULL) {return 1;}
fseek(fp, 0, SEEK_END);
size = ftell(fp);
rewind(fp);
while(size != ftell(fp)) {
if(0==fscanf(fp, "%s %s",str1,str2))
break;
//printf(">%s<>%s<\n",str1,str2);
(void)install(str1,str2);
++progress;
if((progress % 100000)==0)
printf("Progress = %d\n",progress);
}
printf("Done\n");
fclose(fp);
return 0;
}