更新:内存视图获胜。 Cython 使用类型化的内存视图:0.0253449
特别感谢 lothario 指出了几个关键的变化。
快得离谱。当然,现在的问题是似乎不能对内存视图做很多算术运算(例如求和与乘法)。 受 Python (numpy) 实现主题模型启发的原始帖子速度非常慢,我认为对它进行 cythonize 是个好主意。但是我只能用 cython 将时间减半。这里显然有一些数组操作没有被优化——任何想法和建议都将是最受欢迎的。我一直想试试 cython,这似乎是一个好机会!
对于 15 个文档,每个文档大约 300 个单词,python:39.6903322834 cython:19.2733114806 Cython using typed memoryviews:0.547822975
我特别想使用 nogil,因此可以进一步加快速度:1)使用内存视图,将 nogil 添加到循环中是否有帮助?2)我有一个文档列表,每个文档都由一个数字数组表示。什么是最适合我使用的 C 对象?nogil 不适用于 python 对象。目前我将此作为数组列表。
我不是 C 爱好者,但欢迎任何进一步的优化建议。
来自朋友的 Java 实现,1000 个文档,每个文档 300 字,3 秒。
lda_pyx Cython 代码
import numpy as np
cimport numpy as np
cimport cython
# Runtime dtype used when constructing arrays from Python code.
# BUG FIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24
# (it was only an alias for the builtin `int`); `np.int_` is the
# array scalar type that corresponds to the `np.int_t` ctypedef below.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t
cdef class LDA:
    """Collapsed Gibbs sampling state for LDA.

    Holds the count tables and topic assignments and performs in-place
    Gibbs sweeps over every word of every document.
    """
    cdef int iteration, M           # remaining sweeps; number of documents
    cdef int[:] docSizes            # docSizes[m] = word count of document m
    cdef double[:, ::1] n_k_w, n_m_k  # topic-word and document-topic counts
    cdef double[:] n_k              # per-topic totals
    cdef list k_m_n                 # k_m_n[m][n] = topic of n-th word of doc m
    cdef list numbered_docs         # each entry: int array of word ids for one doc

    def __init__(self, int iteration, int M, double[:, ::1] n_k_w, double[:, ::1] n_m_k, double[:] n_k, int[:] docSizes, list numbered_docs, list k_m_n):
        self.iteration = iteration
        self.M = M
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.k_m_n = k_m_n
        self.numbered_docs = numbered_docs
        self.docSizes = docSizes

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self):
        """Run one Gibbs sweep over all documents.

        Mutates the count tables and topic assignments in place and
        returns 1.  The local memoryviews below alias the instance
        attributes, so no write-back is needed afterwards.
        """
        cdef double[:, ::1] n_k_w = self.n_k_w
        cdef double[:, ::1] n_m_k = self.n_m_k
        cdef double[:] n_k = self.n_k
        cdef int[:] docSizes = self.docSizes
        cdef int[:] doc
        cdef int m, n, t, k, new_k
        for m in xrange(self.M):
            doc = self.numbered_docs[m]
            for n in xrange(docSizes[m]):
                t = doc[n]
                # Discount the n-th word t under its current topic k.
                k = self.k_m_n[m][n]
                n_m_k[m, k] -= 1
                n_k_w[k, t] -= 1
                n_k[k] -= 1
                # Placeholder draw for the new topic; the real sampler
                # would draw from p_k = n_k_w[:, t] * n_m_k[m] / n_k.
                new_k = 1
                # Record the new topic and restore the counters.
                self.k_m_n[m][n] = new_k
                n_m_k[m, new_k] += 1
                # BUG FIX: was `n_k_w[new_k][t]`, which materialises a
                # temporary 1-D sliced memoryview on every iteration;
                # tuple indexing matches the other accesses and is O(1).
                n_k_w[new_k, t] += 1
                n_k[new_k] += 1
        # The former `self.n_k_w = n_k_w` (etc.) write-backs were no-ops
        # because the locals alias the attributes; removed.
        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self):
        """Run the remaining `iteration` Gibbs sweeps; returns 1."""
        while self.iteration > 0:
            self._sample()
            self.iteration -= 1
        return 1
def iterate(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n):
    """Python-visible entry point.

    Builds an LDA sampler from the given count tables, runs `iteration`
    Gibbs sweeps, and returns the updated (n_k_w, n_m_k, n_k, k_m_n).
    """
    cdef LDA sampler = LDA(iteration, M, n_k_w, n_m_k, n_k,
                           docSizes, numbered_docs, k_m_n)
    sampler._iterate()
    return sampler.n_k_w, sampler.n_m_k, sampler.n_k, sampler.k_m_n
纯python版本
def gibbs_sample():
    """Pure-Python collapsed Gibbs sweep (reference implementation).

    Mutates the module-level count tables ``n_m_k``, ``n_k_w``, ``n_k``
    and the topic assignments ``k_m_n`` in place, performing
    ``iteration`` full sweeps over all ``M`` documents.
    """
    for _sweep in xrange(iteration):
        for m in xrange(M):
            doc = numbered_docs[m]
            topics = k_m_n[m]  # same list object — updates are visible globally
            for n in xrange(docSizes[m]):
                word = doc[n]
                # Remove word n from the counts under its current topic.
                old_k = topics[n]
                n_m_k[m][old_k] -= 1
                n_k_w[old_k][word] -= 1
                n_k[old_k] -= 1
                # Placeholder draw; the real sampler would use
                # p_k = n_k_w[:, word] * n_m_k[m] / n_k.
                new_k = 1
                # Add the word back under the newly sampled topic.
                topics[n] = new_k
                n_m_k[m][new_k] += 1
                n_k_w[new_k][word] += 1
                n_k[new_k] += 1
cProfile 分析结果:
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 0.419 0.419 <string>:1(<module>)
1 0.419 0.419 0.419 0.419 {lda_pyx.iterate}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}