我带着另一个冗长的问题回来了。在尝试了许多基于 Python 的 Damerau-Levenshtein 编辑距离实现后,我终于找到了下面列出的editdistance_reference()
. 它似乎提供了正确的结果,并且似乎有一个有效的实施。
所以我开始着手将代码转换为 Cython。在我的测试数据中,参考方法能够提供 11,000 次比较的结果(对于 12 个字母长的词对),而 Cythonized 方法每秒进行超过 200,000 次比较。可悲的是,结果不正确:当您查看thisrow
我打印出来进行调试的变量时,无论我向其输入什么数据,我的版本都充满了变量,而参考输出显示了另一张图片。例如,测试'helo'
产生'world'
以下输出(ED
标记我的函数,EDR
是正确工作的参考):
来自editdistance()
:
#ED A [0, 0, 0, 0, 0, 1]
#ED B [1, 0, 0, 0, 0, 1]
#ED B [1, 1, 0, 0, 0, 1]
#ED B [1, 1, 1, 0, 0, 1]
#ED B [1, 1, 1, 1, 0, 1]
#ED B [1, 1, 1, 1, 1, 1]
#ED A [0, 0, 0, 0, 0, 2]
#ED B [1, 0, 0, 0, 0, 2]
#ED B [1, 1, 0, 0, 0, 2]
#ED B [1, 1, 1, 0, 0, 2]
#ED B [1, 1, 1, 1, 0, 2]
#ED B [1, 1, 1, 1, 1, 2]
#ED A [0, 0, 0, 0, 0, 3]
#ED B [1, 0, 0, 0, 0, 3]
#ED B [1, 1, 0, 0, 0, 3]
#ED B [1, 1, 1, 0, 0, 3]
#ED B [1, 1, 1, 1, 0, 3]
#ED B [1, 1, 1, 1, 1, 3]
#ED A [0, 0, 0, 0, 0, 4]
#ED B [1, 0, 0, 0, 0, 4]
#ED B [1, 1, 0, 0, 0, 4]
#ED B [1, 1, 1, 0, 0, 4]
#ED B [1, 1, 1, 1, 0, 4]
#ED B [1, 1, 1, 1, 1, 4]
来自editdistance_reference()
:
#EDR A [0, 0, 0, 0, 0, 1]
#EDR B [1, 0, 0, 0, 0, 1]
#EDR B [1, 2, 0, 0, 0, 1]
#EDR B [1, 2, 3, 0, 0, 1]
#EDR B [1, 2, 3, 4, 0, 1]
#EDR B [1, 2, 3, 4, 5, 1]
#EDR A [0, 0, 0, 0, 0, 2]
#EDR B [2, 0, 0, 0, 0, 2]
#EDR B [2, 2, 0, 0, 0, 2]
#EDR B [2, 2, 3, 0, 0, 2]
#EDR B [2, 2, 3, 4, 0, 2]
#EDR B [2, 2, 3, 4, 5, 2]
#EDR A [0, 0, 0, 0, 0, 3]
#EDR B [3, 0, 0, 0, 0, 3]
#EDR B [3, 3, 0, 0, 0, 3]
#EDR B [3, 3, 3, 0, 0, 3]
#EDR B [3, 3, 3, 3, 0, 3]
#EDR B [3, 3, 3, 3, 4, 3]
#EDR A [0, 0, 0, 0, 0, 4]
#EDR B [4, 0, 0, 0, 0, 4]
#EDR B [4, 4, 0, 0, 0, 4]
#EDR B [4, 4, 4, 0, 0, 4]
#EDR B [4, 4, 4, 4, 0, 4]
#EDR B [4, 4, 4, 4, 4, 4]
我一定很愚蠢,因为错误可能是那些非常明显的事情之一。但我似乎找不到它。
还有第二个问题:三个数组、、 和malloc
的 i 空间,然后它们以循环方式交换。当我尝试等等时,我得到一条 glibc 抱怨的行。我用谷歌搜索过;会不会是指针交换业务让 glibc 有点头晕,从而无法正确释放内存?twoago
oneago
thisrow
free( twoago )
double free or corruption
下面我首先列出setup.py
进行编译所需的内容(/path/to/python3.1 ./setup.py build_ext --inplace
),然后是适当的编辑距离代码,因此有兴趣的人会发现它更容易复制。
还有一件事:这是用 Python3.1 运行的;一件有趣的事情是,在*.pyx
文件内部我们确实有裸 unicode 字符串,但print
它仍然是一个语句,而不是一个函数。
是的,我知道这里有很多代码要粘贴,但问题是,当你把它剪得太多时,代码根本无法运行。我相信除了editdistance()
正常工作之外的所有方法,但请随时指出您发现的任何问题。
setup.py
:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
setup(
name = 'cython_dameraulevenshtein',
ext_modules = [
Extension( 'cython_dameraulevenshtein', [ 'cython_dameraulevenshtein.pyx', ] ), ],
cmdclass = {
'build_ext': build_ext }, )
cython_dameraulevenshtein.pyx
(一直滚动到最后,看看有趣的东西):
############################################################################################################
cdef extern from "stdlib.h":
ctypedef unsigned int size_t
void *malloc(size_t size)
void *realloc( void *ptr, size_t size )
void free(void *ptr)
#-----------------------------------------------------------------------------------------------------------
cdef inline unsigned int _minimum_of_two_uints( unsigned int a, unsigned int b ):
if a < b: return a
return b
#-----------------------------------------------------------------------------------------------------------
cdef inline unsigned int _minimum_of_three_uints( unsigned int a, unsigned int b, unsigned int c ):
if a < b:
if c < a:
return c
return a
if c < b:
return c
return b
#-----------------------------------------------------------------------------------------------------------
cdef inline int _warp( unsigned int limit, int value ):
return value if value >= 0 else limit + value
############################################################################################################
# ARRAYS THAT SAY SIZE ;-)
#-----------------------------------------------------------------------------------------------------------
cdef class Array_of_unsigned_int:
cdef unsigned int *data
cdef unsigned int length
#---------------------------------------------------------------------------------------------------------
def __cinit__( self, unsigned int length, fill_value = None ):
self.length = length
self.data = <unsigned int *>malloc( length * sizeof( unsigned int ) ) ###OBS### must check malloc doesn't return NULL pointer
if fill_value is not None:
self.fill( fill_value )
#---------------------------------------------------------------------------------------------------------
cdef fill( self, unsigned int value ):
cdef unsigned int idx
cdef unsigned int *d = self.data
for idx from 0 <= idx < self.length:
d[ idx ] = value
#---------------------------------------------------------------------------------------------------------
cdef resize( self, unsigned int length ):
self.data = <unsigned int *>realloc( self.data, length * sizeof( unsigned int ) ) ###OBS### must check realloc doesn't return NULL pointer
self.length = length
#---------------------------------------------------------------------------------------------------------
def free( self ):
"""Always remember the milk: Free up memory."""
free( self.data ) ###OBS### should free memory here
#---------------------------------------------------------------------------------------------------------
def as_list( self ):
"""Return the array as a Python list."""
R = []
cdef unsigned int idx
cdef unsigned int *d = self.data
for idx from 0 <= idx < self.length:
R.append( d[ idx ] )
return R
############################################################################################################
# CONVERTING UNICODE TO CHARACTER IDs (CIDs)
#---------------------------------------------------------------------------------------------------------
cdef unsigned int _UMX_surrogate_lower_bound = 0x10000
cdef unsigned int _UMX_surrogate_upper_bound = 0x10ffff
cdef unsigned int _UMX_surrogate_hi_lower_bound = 0xd800
cdef unsigned int _UMX_surrogate_hi_upper_bound = 0xdbff
cdef unsigned int _UMX_surrogate_lo_lower_bound = 0xdc00
cdef unsigned int _UMX_surrogate_lo_upper_bound = 0xdfff
cdef unsigned int _UMX_surrogate_foobar_factor = 0x400
#---------------------------------------------------------------------------------------------------------
cdef Array_of_unsigned_int _cids_from_text( text ):
"""Givn a ``text`` either as a Unicode string or as a ``bytes`` or ``bytearray``, return an instance of
``Array_of_unsigned_int`` that enumerates either the Unicode codepoints of each character or the value of
each byte. Surrogate pairs will be condensed into single values, so on narrow Python builds the length of
the array returned may be less than ``len( text )``."""
#.........................................................................................................
# Make sure ``text`` is either a Unicode string (``str``) or a ``bytes``-like thing:
is_bytes = isinstance( text, ( bytes, bytearray, ) )
assert is_bytes or isinstance( text, str ), '#121'
#.........................................................................................................
# Whether it is a ``str`` or a ``bytes``, we know the result can only have at most as many elements as
# there are characters in ``text``, so we can already reserve that much space (in the case of a Unicode
# text, there may be fewer CIDs if there happen to be surrogate characters):
cdef unsigned int length = <unsigned int>len( text )
cdef Array_of_unsigned_int R = Array_of_unsigned_int( length )
#.........................................................................................................
# If ``text`` is empty, we can return an empty array right away:
if length == 0: return R
#.........................................................................................................
# Otherwise, prepare to copy data:
cdef unsigned int idx = 0
#.........................................................................................................
# If ``text`` is a ``bytes``-like thing, use simplified processing; we just have to copy over all byte
# values and are done:
if is_bytes:
for idx from 0 <= idx < length:
R.data[ idx ] = <unsigned int>text[ idx ]
return R
#.........................................................................................................
cdef unsigned int cid = 0
cdef bool is_surrogate = False
cdef unsigned int hi = 0
cdef unsigned int lo = 0
cdef unsigned int chr_count = 0
#.........................................................................................................
# Iterate over all indexes in text:
for idx from 0 <= idx < length:
#.......................................................................................................
# If we met with a surrogate CID in the last cycle, then that was a high surrogate CID, and the
# corresponding low CID is on the current position. Having both, we can compute the intended CID
# and reset the flag:
if is_surrogate:
lo = <unsigned int>ord( text[ idx ] )
# IIRC, this formula was documented in Unicode 3:
cid = ( ( hi - _UMX_surrogate_hi_lower_bound ) * _UMX_surrogate_foobar_factor
+ ( lo - _UMX_surrogate_lo_lower_bound ) + _UMX_surrogate_lower_bound )
is_surrogate = False
#.......................................................................................................
else:
# Otherwise, we retrieve the CID from the current position:
cid = <unsigned int>ord( text[ idx ] )
#.....................................................................................................
if _UMX_surrogate_hi_lower_bound <= cid <= _UMX_surrogate_hi_upper_bound:
# If this CID is a high surrogate CID, set ``hi`` to this value and set a flag so we'll come back
# in the next cycle:
hi = cid
is_surrogate = True
continue
#.......................................................................................................
R.data[ chr_count ] = cid
chr_count += 1
#.........................................................................................................
# Surrogate CIDs take up two characters but end up as a single resultant CID, so the return value may
# have fewer elements than the naive string length indicated; in this case, we want to free some memory
# and correct array length data:
if chr_count != length:
R.resize( chr_count )
#.........................................................................................................
return R
#---------------------------------------------------------------------------------------------------------
def cids_from_text( text ):
cdef Array_of_unsigned_int c_R =_cids_from_text( text )
R = c_R.as_list()
c_R.free() ###OBS### should free memory here
return R
############################################################################################################
# SECOND-ORDER SIMILARITY
#-----------------------------------------------------------------------------------------------------------
cpdef float similarity( char *a, char *b ):
"""Given two byte strings ``a`` and ``b``, return their Damerau-Levenshtein similarity as a float between
0.0 and 1.1. Similarity is computed as ``1 - relative_editdistance( a, b )``, so a result of ``1.0``
indicates identity, while ``0.0`` indicates complete dissimilarity."""
return 1.0 - relative_editdistance( a, b )
#-----------------------------------------------------------------------------------------------------------
cpdef float relative_editdistance( char *a, char *b ):
"""Given two byte strings ``a`` and ``b``, return their relative Damerau-Levenshtein distance. The return
value is a float between 0.0 and 1.0; it is calculated as the absolute edit distance, divided by the
length of the longer string. Therefore, ``0.0`` indicates identity, while ``1.0`` indicates complete
dissimilarity."""
cdef int length = max( len( a ), len( b ) )
if length == 0: return 0.0
return editdistance( a, b ) / <float>length
############################################################################################################
# EDIT DISTANCE
#-----------------------------------------------------------------------------------------------------------
cpdef unsigned int editdistance( text_a, text_b ):
"""Given texts as Unicode strings or ``bytes`` / ``bytearray`` objects, return their absolute
Damerau-Levenshtein distance. Each deletion, insertion, substitution, and transposition is counted as one
difference, so the edit distance between ``abc`` and ``ab``, ``abcx``, ``abx``, ``acb``, respectively, is
``1``."""
#.........................................................................................................
# This should be fast in Python, as it can (and probably is) implemented by doing an identity check in
# the case of ``bytes`` and ``str`` objects:
if text_a == text_b: return 0
#.........................................................................................................
# Convert Unicode text to C array of unsigned integers:
cdef Array_of_unsigned_int a = _cids_from_text( text_a )
cdef Array_of_unsigned_int b = _cids_from_text( text_b )
R = c_editdistance( a, b )
#.........................................................................................................
# Always remember the milk:
a.free()
b.free()
#.........................................................................................................
return R
#-----------------------------------------------------------------------------------------------------------
cdef unsigned int c_editdistance( Array_of_unsigned_int cids_a, Array_of_unsigned_int cids_b ):
# Conceptually, this is based on a len(a) + 1 * len(b) + 1 matrix.
# However, only the current and two previous rows are needed at once,
# so we only store those.
#.........................................................................................................
# This shortcut is pretty useless if comparison is not very fast; therefore, it is done in the function
# that deals with the Python objects, q.v.
# if cids_a.equals( cids_b ): return 0
#.........................................................................................................
cdef unsigned int a_length = cids_a.length
cdef unsigned int b_length = cids_b.length
#.........................................................................................................
# Another shortcut: if one of the texts is empty, then the edit distance is trivially the length of the
# other text. This also works for two empty texts, but those have already been taken care of by the
# previous shortcut:
#.........................................................................................................
if a_length == 0: return b_length
if b_length == 0: return a_length
#.........................................................................................................
cdef unsigned int row_length = b_length + 1
cdef unsigned int row_length_1 = row_length - 1
cdef unsigned int row_bytecount = sizeof( unsigned int ) * row_length
cdef unsigned int *oneago = <unsigned int *>malloc( row_bytecount ) ###OBS### must check malloc doesn't return NULL pointer
cdef unsigned int *twoago = <unsigned int *>malloc( row_bytecount ) ###OBS### must check malloc doesn't return NULL pointer
cdef unsigned int *thisrow = <unsigned int *>malloc( row_bytecount ) ###OBS### must check malloc doesn't return NULL pointer
cdef unsigned int idx = 0
cdef unsigned int idx_a = 0
cdef unsigned int idx_b = 0
cdef int idx_a_1_text = 0
cdef int idx_b_1_row = 0
cdef int idx_b_2_row = 0
cdef int idx_b_1_text = 0
cdef unsigned int deletion_cost = 0
cdef unsigned int addition_cost = 0
cdef unsigned int substitution_cost = 0
#.........................................................................................................
# Equivalent of ``thisrow = list( range( 1, b_length + 1 ) ) + [ 0 ]``:
#print( '#305', cids_a.as_list(), cids_b.as_list(), a_length, b_length, row_length, row_length_1 )
for idx from 1 <= idx < row_length:
thisrow[ idx - 1 ] = idx
thisrow[ row_length - 1 ] = 0
#.........................................................................................................
for idx_a from 0 <= idx_a < a_length:
idx_a_1_text = _warp( a_length, idx_a - 1 )
twoago, oneago = oneago, thisrow
#.......................................................................................................
# Equivalent of ``thisrow = [ 0 ] * b_length + [ idx_a + 1 ]``:
for idx from 0 <= idx < row_length_1:
thisrow[ idx ] = 0
thisrow[ row_length - 1 ] = idx_a + 1
#.......................................................................................................
# some diagnostic output:
x = []
for idx from 0 <= idx < row_length: x.append( thisrow[ idx ] )
print
print '#ED A', x
#.......................................................................................................
for idx_b from 0 <= idx_b < b_length:
#.....................................................................................................
idx_b_1_row = _warp( row_length, idx_b - 1 )
idx_b_1_text = _warp( b_length, idx_b - 1 )
#.....................................................................................................
assert 0 <= idx_b_1_row < row_length, ( '#323', idx_b_1_row, )
assert 0 <= idx_a_1_text < a_length, ( '#324', idx_a_1_text, )
assert 0 <= idx_b_1_text < b_length, ( '#325', idx_b_1_text, )
#.....................................................................................................
deletion_cost = oneago[ idx_b ] + 1
addition_cost = thisrow[ idx_b_1_row ] + 1
substitution_cost = oneago[ idx_b_1_row ] + ( 1 if cids_a.data[ idx_a ]
!= cids_b.data[ idx_b ] else 0 )
thisrow[ idx_b ] = _minimum_of_three_uints( deletion_cost, addition_cost, substitution_cost )
#.....................................................................................................
# Transpositions:
if ( idx_a > 0
and idx_b > 0
and cids_a.data[ idx_a ] == cids_b.data[ idx_b_1_text ]
and cids_a.data[ idx_a_1_text ] == cids_b.data[ idx_b ]
and cids_a.data[ idx_a ] != cids_b.data[ idx_b ] ):
#...................................................................................................
idx_b_2_row = _warp( row_length, idx_b - 2 )
assert 0 <= idx_b_2_row < row_length, ( '#340', idx_b_2_row, )
thisrow[ idx_b ] = _minimum_of_two_uints( thisrow[ idx_b ], twoago[ idx_b_2_row ] + 1 )
#.....................................................................................................
# some diagnostic output:
x = []
for idx from 0 <= idx < row_length: x.append( thisrow[ idx ] )
print '#ED B', x
#.........................................................................................................
# Here, ``b_length - 1`` can't become negative, since we already tested for ``b_length == 0`` in the
# shortcut above:
cdef unsigned int R = thisrow[ b_length - 1 ]
#.........................................................................................................
# Always remember the milk:
# BUG: Activating below lines leads to glibc failing with ``double free or corruption``
#free( twoago )
#free( oneago )
#free( thisrow )e
#.........................................................................................................
return R
#-----------------------------------------------------------------------------------------------------------
def editdistance_reference( text_a, text_b ):
"""This method is believed to compute a correct Damerau-Levenshtein edit distance, with deletions,
insertions, substitutions, and transpositions. Do not touch it; it is here to validate results returned
from the above method. Code adapted from
http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance"""
# Conceptually, the implementation is based on a ``( len( seq1 ) + 1 ) * ( len( seq2 ) + 1 )`` matrix.
# However, only the current and two previous rows are needed at once, so we only store those. Python
# lists wrap around for negative indices, so we put the leftmost column at the *end* of the list. This
# matches with the zero-indexed strings and saves extra calculation.
b_length = len( text_b )
oneago = None
thisrow = list( range( 1, b_length + 1 ) ) + [ 0 ]
for idx_a in range( len( text_a ) ):
twoago, oneago, thisrow = oneago, thisrow, [ 0 ] * b_length + [ idx_a + 1 ]
#.......................................................................................................
# some diagnostic output:
print
print '#EDR A', thisrow
#.......................................................................................................
for idx_b in range( b_length ):
deletion_cost = oneago[ idx_b ] + 1
addition_cost = thisrow[ idx_b - 1 ] + 1
substitution_cost = oneago[ idx_b - 1 ] + ( text_a[ idx_a ] != text_b[ idx_b ] )
thisrow[ idx_b ] = min( deletion_cost, addition_cost, substitution_cost )
if ( idx_a > 0
and idx_b > 0
and text_a[ idx_a ] == text_b[ idx_b - 1 ]
and text_a[ idx_a - 1 ] == text_b[ idx_b ]
and text_a[ idx_a ] != text_b[ idx_b ] ):
thisrow[ idx_b ] = min( thisrow[ idx_b ], twoago[ idx_b - 2 ] + 1 )
#.....................................................................................................
# some diagnostic output:
print '#EDR B', thisrow
#.....................................................................................................
return thisrow[ len( text_b ) - 1 ]
编辑我还将此文本发布到pastebin和 Cython 列表。