python - 解码 Unicode 时字节序号不在范围内

Question

我使用以下程序编写了一个将英语句子转换为马拉雅拉姆语的程序

# coding=utf-8
import re
import Eng_Manglish
import linecache
import nltk
line =" "
mal_sent=" "
out_file =open("Mal_sent",'w')
input_file = open("frequently_used_eng_sent",'r')
for line in input_file:
    mal_sent=" "
    convert_indicate =[]
    print "*************************"
    line = line.replace('.'," ")
    line =line.replace('[0-9]',' ')
    line =re.sub('[0-9].[\t]*',' ',line)
    line = line.strip()
    print line
    line = line.lower()
    text=nltk.word_tokenize(line)
    data = nltk.pos_tag(text)
    print data
    words = line.split()
    print words
    for word in words:
        print word[0]
        dict_file = open("Dictionary/Eng-Mal-Dict/"+word[0], "r")
        for dicline in dict_file:
            flag=0  
            #print "*********************************"
            dicline = dicline.strip()
            dicline = dicline.split(':')
            #print dicline[0]
            if dicline[0] == word:#fing code for exact match
                print dicline
                print "found"
                convert_indicate.append(1)
                flag =1
                print dicline[2]
                mal_sent=mal_sent+dicline[2]+" "
                print mal_sent.encode('utf-8')
                break
        if flag==0: 
            convert_indicate.append(0)
    #transform word to manglish if not converted
            mal_sent=mal_sent+Eng_Manglish.transform(word)+" "
         dict_file.close()
    print convert_indicate

    print mal_sent 
    out_file.write(mal_sent)

与表格的字典内容

abbey:n:സന്യാസി മഠം

执行此程序后，我在行mal_sent=mal_sent+dicline[2]+" " as UnicodeDecodeError: 'ascii' codec can't decode byte 0xe0 in position 0: ordinal not in range(128) 中遇到错误

你能帮我解决这个错误吗？

python - 解码 Unicode 时字节序号不在范围内

0 回答 0

Related

Reference