我需要比较存储在数据库中的文档,并得出一个介于 0 和 1 之间的相似度分数。
我需要使用的方法必须非常简单。实现 n-gram 的 vanilla 版本(可以定义使用多少克),以及 tf-idf 和余弦相似度的简单实现。
有没有什么程序可以做到这一点?还是我应该从头开始写这个?
查看 NLTK 包:http ://www.nltk.org它有你需要的一切
对于 cosine_similarity:
def cosine_distance(u, v):
"""
Returns the cosine of the angle between vectors v and u. This is equal to
u.v / |u||v|.
"""
return numpy.dot(u, v) / (math.sqrt(numpy.dot(u, u)) * math.sqrt(numpy.dot(v, v)))
对于 ngram:
def ngrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None):
"""
A utility that produces a sequence of ngrams from a sequence of items.
For example:
>>> ngrams([1,2,3,4,5], 3)
[(1, 2, 3), (2, 3, 4), (3, 4, 5)]
Use ingram for an iterator version of this function. Set pad_left
or pad_right to true in order to get additional ngrams:
>>> ngrams([1,2,3,4,5], 2, pad_right=True)
[(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
@param sequence: the source data to be converted into ngrams
@type sequence: C{sequence} or C{iterator}
@param n: the degree of the ngrams
@type n: C{int}
@param pad_left: whether the ngrams should be left-padded
@type pad_left: C{boolean}
@param pad_right: whether the ngrams should be right-padded
@type pad_right: C{boolean}
@param pad_symbol: the symbol to use for padding (default is None)
@type pad_symbol: C{any}
@return: The ngrams
@rtype: C{list} of C{tuple}s
"""
if pad_left:
sequence = chain((pad_symbol,) * (n-1), sequence)
if pad_right:
sequence = chain(sequence, (pad_symbol,) * (n-1))
sequence = list(sequence)
count = max(0, len(sequence) - n + 1)
return [tuple(sequence[i:i+n]) for i in range(count)]
对于 tf-idf,您必须首先计算分布,我正在使用 Lucene 来执行此操作,但您可以使用 NLTK 做类似的事情,使用 FreqDist:
http://nltk.googlecode.com/svn/trunk/doc/book/ch01.html#frequency_distribution_index_term
如果你喜欢 pylucene,这将告诉你如何计算 tf.idf
# reader = lucene.IndexReader(FSDirectory.open(index_loc))
docs = reader.numDocs()
for i in xrange(docs):
tfv = reader.getTermFreqVector(i, fieldname)
if tfv:
rec = {}
terms = tfv.getTerms()
frequencies = tfv.getTermFrequencies()
for (t,f,x) in zip(terms,frequencies,xrange(maxtokensperdoc)):
df= searcher.docFreq(Term(fieldname, t)) # number of docs with the given term
tmap.setdefault(t, len(tmap))
rec[t] = sim.tf(f) * sim.idf(df, max_doc) #compute TF.IDF
# and normalize the values using cosine normalization
if cosine_normalization:
denom = sum([x**2 for x in rec.values()])**0.5
for k,v in rec.items():
rec[k] = v / denom
如果您有兴趣,我已经完成了关于 tf-idf 和使用Scikits.learn (sklearn) Python 模块的教程系列(第一部分和第二部分)。
第 3 部分具有余弦相似度。
简而言之,这是python
+的答案:numpy
余弦:
def cosine_sim(u,v):
return np.dot(u,v) / (sqrt(np.dot(u,u)) * sqrt(np.dot(v,v)))
Ngram:
def ngrams(sentence, n):
return zip(*[sentence.split()[i:] for i in range(n)])
TF-IDF(这有点奇怪,但它有效):
def tfidf(corpus, vocab):
"""
INPUT:
corpus = [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]),
('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]),
('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
vocab = ['a', 'bar', 'black', 'foo', 'is', 'sentence',
'sheep', 'this']
OUTPUT:
[[0.300, 0.300, 0.0, 0.300, 0.300, 0.0, 0.0, 0.300],
[0.0, 0.600, 0.600, 0.300, 0.0, 0.0, 0.600, 0.0],
[0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
"""
def termfreq(matrix, doc, term):
try: return matrix[doc][term] / float(sum(matrix[doc].values()))
except ZeroDivisionError: return 0
def inversedocfreq(matrix, term):
try:
return float(len(matrix)) /sum([1 for i,_ in enumerate(matrix) if matrix[i][term] > 0])
except ZeroDivisionError: return 0
matrix = [{k:v for k,v in zip(vocab, i[1])} for i in corpus]
tfidf = defaultdict(dict)
for doc,_ in enumerate(matrix):
for term in matrix[doc]:
tf = termfreq(matrix,doc,term)
idf = inversedocfreq(matrix, term)
tfidf[doc][term] = tf*idf
return [[tfidf[doc][term] for term in vocab] for doc,_ in enumerate(tfidf)]
这是测试的长答案:
import numpy as np
from math import sqrt, log
from itertools import chain, product
from collections import defaultdict
def cosine_sim(u,v):
return np.dot(u,v) / (sqrt(np.dot(u,u)) * sqrt(np.dot(v,v)))
def ngrams(sentence, n):
return zip(*[sentence.split()[i:] for i in range(n)])
def tfidf(corpus, vocab):
"""
INPUT:
corpus = [('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]),
('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]),
('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
vocab = ['a', 'bar', 'black', 'foo', 'is', 'sentence',
'sheep', 'this']
OUTPUT:
[[0.300, 0.300, 0.0, 0.300, 0.300, 0.0, 0.0, 0.300],
[0.0, 0.600, 0.600, 0.300, 0.0, 0.0, 0.600, 0.0],
[0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
"""
def termfreq(matrix, doc, term):
try: return matrix[doc][term] / float(sum(matrix[doc].values()))
except ZeroDivisionError: return 0
def inversedocfreq(matrix, term):
try:
return float(len(matrix)) /sum([1 for i,_ in enumerate(matrix) if matrix[i][term] > 0])
except ZeroDivisionError: return 0
matrix = [{k:v for k,v in zip(vocab, i[1])} for i in corpus]
tfidf = defaultdict(dict)
for doc,_ in enumerate(matrix):
for term in matrix[doc]:
tf = termfreq(matrix,doc,term)
idf = inversedocfreq(matrix, term)
tfidf[doc][term] = tf*idf
return [[tfidf[doc][term] for term in vocab] for doc,_ in enumerate(tfidf)]
def corpus2vectors(corpus):
def vectorize(sentence, vocab):
return [sentence.split().count(i) for i in vocab]
vectorized_corpus = []
vocab = sorted(set(chain(*[i.lower().split() for i in corpus])))
for i in corpus:
vectorized_corpus.append((i, vectorize(i, vocab)))
return vectorized_corpus, vocab
def create_test_corpus():
sent1 = "this is a foo bar"
sent2 = "foo bar bar black sheep"
sent3 = "this is a sentence"
all_sents = [sent1,sent2,sent3]
corpus, vocab = corpus2vectors(all_sents)
return corpus, vocab
def test_cosine():
corpus, vocab = create_test_corpus()
for sentx, senty in product(corpus, corpus):
print sentx[0]
print senty[0]
print "cosine =", cosine_sim(sentx[1], senty[1])
print
def test_ngrams():
corpus, vocab = create_test_corpus()
for sentx in corpus:
print sentx[0]
print ngrams(sentx[0],2)
print ngrams(sentx[0],3)
print
def test_tfidf():
corpus, vocab = create_test_corpus()
print corpus
print vocab
print tfidf(corpus, vocab)
print "Testing cosine..."
test_cosine()
print
print "Testing ngrams..."
test_ngrams()
print
print "Testing tfidf..."
test_tfidf()
print
[出去]:
Testing cosine...
this is a foo bar
this is a foo bar
cosine = 1.0
this is a foo bar
foo bar bar black sheep
cosine = 0.507092552837
this is a foo bar
this is a sentence
cosine = 0.67082039325
foo bar bar black sheep
this is a foo bar
cosine = 0.507092552837
foo bar bar black sheep
foo bar bar black sheep
cosine = 1.0
foo bar bar black sheep
this is a sentence
cosine = 0.0
this is a sentence
this is a foo bar
cosine = 0.67082039325
this is a sentence
foo bar bar black sheep
cosine = 0.0
this is a sentence
this is a sentence
cosine = 1.0
Testing ngrams...
this is a foo bar
[('this', 'is'), ('is', 'a'), ('a', 'foo'), ('foo', 'bar')]
[('this', 'is', 'a'), ('is', 'a', 'foo'), ('a', 'foo', 'bar')]
foo bar bar black sheep
[('foo', 'bar'), ('bar', 'bar'), ('bar', 'black'), ('black', 'sheep')]
[('foo', 'bar', 'bar'), ('bar', 'bar', 'black'), ('bar', 'black', 'sheep')]
this is a sentence
[('this', 'is'), ('is', 'a'), ('a', 'sentence')]
[('this', 'is', 'a'), ('is', 'a', 'sentence')]
Testing tfidf...
[('this is a foo bar', [1, 1, 0, 1, 1, 0, 0, 1]), ('foo bar bar black sheep', [0, 2, 1, 1, 0, 0, 1, 0]), ('this is a sentence', [1, 0, 0, 0, 1, 1, 0, 1])]
['a', 'bar', 'black', 'foo', 'is', 'sentence', 'sheep', 'this']
[[0.30000000000000004, 0.30000000000000004, 0.0, 0.30000000000000004, 0.30000000000000004, 0.0, 0.0, 0.30000000000000004], [0.0, 0.6000000000000001, 0.6000000000000001, 0.30000000000000004, 0.0, 0.0, 0.6000000000000001, 0.0], [0.375, 0.0, 0.0, 0.0, 0.375, 0.75, 0.0, 0.375]]
如果您仍然对这个问题感兴趣,我使用Lucene Java和 Jython 做了一些非常相似的事情。这是我的代码中的一些片段。
Lucene 使用所谓的分析器预处理文档和查询。这个使用 Lucene 的内置 n-gram 过滤器:
class NGramAnalyzer(Analyzer):
'''Analyzer that yields n-grams for minlength <= n <= maxlength'''
def __init__(self, minlength, maxlength):
self.minlength = minlength
self.maxlength = maxlength
def tokenStream(self, field, reader):
lower = ASCIIFoldingFilter(LowerCaseTokenizer(reader))
return NGramTokenFilter(lower, self.minlength, self.maxlength)
要将列表ngrams
转换为Document
:
doc = Document()
doc.add(Field('n-grams', ' '.join(ngrams),
Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
要将文档存储在索引中:
wr = IndexWriter(index_dir, NGramAnalyzer(), True,
IndexWriter.MaxFieldLength.LIMITED)
wr.addDocument(doc)
构建查询有点困难,因为 LuceneQueryParser
需要一种带有特殊运算符、引号等的查询语言,但它可以被规避(如部分解释这里)。
对于我们的信息检索课程,我们使用了一些由我们的教授用 Java 编写的代码。抱歉,没有 python 端口。“它仅根据 GNU 通用公共许可证发布用于教育和研究目的。”
您可以查看文档http://userweb.cs.utexas.edu/~mooney/ir-course/doc/
但更具体地说,请查看: http ://userweb.cs.utexas.edu/users/mooney/ir-course/doc/ir/vsr/HashMapVector.html