FWIW,这是一个如何在 Python 中执行此操作的粗略示例:
import re
def distinct_words(block, seen, delim):
    """Return the words of *block* that have not been seen before.

    The block is split with the regular expression *delim*; each resulting
    word is kept only if it is not already a key of *seen*. Order of first
    occurrence is preserved.

    Args:
        block: the string to split into words.
        seen: a dict whose keys are the words encountered so far. It is
            updated in place, so passing the same dict to several calls
            makes the words distinct across all of those calls.
        delim: regular expression pattern used to split *block*.

    Returns:
        A list of the words of *block* that were not yet in *seen*.
    """
    unique_words = []
    for word in re.split(delim, block):
        if word not in seen:
            # remember the word so that later occurrences are skipped
            seen[word] = True
            unique_words.append(word)
    return unique_words
def process_line(line):
    """Remove duplicate words from one dictionary line.

    A dictionary line has the form ``original$translated``. The original
    part is a space-separated list of words. The translated part consists
    of blocks separated by ``|``, each block being a ``, ``-separated list
    of words. Every word is kept only at its first occurrence: original
    words are made distinct among themselves, translated words are made
    distinct across all blocks. Blocks that end up empty are dropped.

    Args:
        line: one dictionary line, with or without a trailing line ending.

    Returns:
        The de-duplicated line. A trailing line ending, if present, is
        preserved unchanged. A line without ``$`` is returned as is.
    """
    # safeguard: not a dictionary entry, leave it untouched
    if '$' not in line:
        return line

    # Set the line ending aside; otherwise it sticks to the last word and
    # makes it compare unequal to an earlier, identical word.
    body = line.rstrip('\r\n')
    ending = line[len(body):]

    # Split at the first '$' only: any further '$' stays in the translated
    # part instead of raising ValueError on unpacking.
    original, _, translated = body.partition('$')

    # make original words distinct
    distinct_original = distinct_words(original, {}, r' +')

    # make translated words distinct, but keep the block structure:
    # split the translated part at '|' into blocks, each block at ', '
    # into words, sharing one `seen` dict across all blocks
    seen = {}
    distinct_translated = []
    for block in re.split(r'\s*\|\s*', translated):
        words = distinct_words(block, seen, r', +')
        if words:
            distinct_translated.append(words)

    # put everything back together again
    part_original = ' '.join(distinct_original)
    part_translated = ' | '.join(
        ', '.join(words) for words in distinct_translated
    )
    return part_original + '$' + part_translated + ending
def process_dictionary(filename):
    """Remove duplicate words from every line of a dictionary text file.

    The file is read completely, each line is passed through
    ``process_line``, and the result is written back to the same path,
    i.e. the file is modified in place.

    Args:
        filename: path of the dictionary text file to rewrite.
    """
    # Read everything first: the same path is reopened for writing below.
    with open(filename, 'r') as infile:
        lines = infile.readlines()

    lines_out = []
    for line in lines:
        # Process the text without its line ending and re-attach the ending
        # afterwards, so the output keeps exactly one ending per line
        # (joining unstripped lines with '\n' would insert blank lines).
        body = line.rstrip('\r\n')
        ending = line[len(body):]
        lines_out.append(process_line(body) + ending)

    with open(filename, 'w') as outfile:
        outfile.write(''.join(lines_out))
显然,你会像这样调用 process_dictionary():
# Rewrite 'dict_en_es.txt' in place, passing every line through process_line.
process_dictionary('dict_en_es.txt')
但是为了这个例子,假设你有一行:
# Example: "house" is repeated in the original part and "casa"/"vivienda"
# are repeated across the translated blocks.
line = "house house$casa | casa, vivienda, hogar | edificio, casa | vivienda"
line_out = process_line(line)
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(line_out)
打印想要的结果:
house$casa | vivienda, hogar | edificio