python
出于教学目的,我只使用 Python 的标准库编写了这个简单的版本。
生产代码应该使用 spaCy 和 pandas。
import collections
from operator import itemgetter as at
def _read_rows(path):
    """Return ``(row_id, text)`` pairs parsed from the file at *path*.

    Each line is cut at its FIRST comma only: everything before it is the
    row id, everything after it is the text. Commas inside the text
    therefore stay in the text instead of producing a third field that
    cannot be unpacked. Lines with no comma at all (e.g. blank lines) are
    skipped.
    """
    rows = []
    with open(path, 'r') as src:
        for line in src:
            row_id, sep, text = line.partition(',')
            if not sep:
                # No id/text separator on this line; nothing to count.
                continue
            rows.append((row_id, text))
    return rows


def _tokens(rows):
    """Flatten ``(row_id, text)`` rows into ``(row_id, word)`` pairs.

    Words are whitespace-separated, so the trailing newline of each line is
    dropped by ``str.split``.
    """
    return [(row_id, word) for row_id, text in rows for word in text.split()]


def _ngrams(tokens, n):
    """Return ``(row_id, phrase)`` for each run of *n* consecutive tokens.

    A run is kept only when ALL of its tokens carry the same row id, so no
    n-gram spans two input rows. (Comparing just the first two ids would let
    the third word of a trigram leak in from the following row.)
    """
    grams = []
    # range() is empty when there are fewer than n tokens.
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        row_id = window[0][0]
        if all(tok_id == row_id for tok_id, _ in window):
            grams.append((row_id, ' '.join(word for _, word in window)))
    return grams


def main(input_path="input.csv", output_path="output.csv"):
    """Count unigrams, bigrams and trigrams per row id and write them out.

    Reads ``id,text`` lines from *input_path* and writes one
    ``id,ngram,count`` line per distinct (id, n-gram) pair to *output_path*:
    all unigrams first, then bigrams, then trigrams.

    NOTE: fields are written verbatim, without CSV quoting, so a word that
    itself contains a comma adds an extra column to its output line.
    """
    tokens = _tokens(_read_rows(input_path))
    with open(output_path, 'w') as out:
        for n in (1, 2, 3):
            counts = collections.Counter(_ngrams(tokens, n))
            for (row_id, phrase), count in counts.items():
                out.write("{i},{w},{c}\n".format(c=count, i=row_id, w=phrase))


if __name__ == "__main__":
    main()