我正在尝试使用 Apache Tika 解析一些文档(如文件类型中所列)。这是我在 Python 中的代码。
auth = urllib2.HTTPPasswordMgrWithDefaultRealm()
auth.add_password(None, url, user, password)
urllib2.install_opener(urllib2.build_opener(urllib2.HTTPBasicAuthHandler(auth)))
outpage = urllib2.urlopen(url)
data = json.loads(outpage.read().decode('utf-8'))
dictitems = data.values()
flattened_list = [y for x in dictitems for y in x]
filetypes = [".pdf", ".doc", ".docx", ".txt"]
def tikiparse(fi):
for i in filetypes:
if fi.endswith(i):
text = parser.from_file(fi, "http://localhost:9998/")
extractedcontent = text["content"]
chunked = ne_chunk(pos_tag(word_tokenize(extractedcontent)))
current_chunk = []
cont_chunk = []
for j in chunked:
if type(j) == Tree:
current_chunk.append(" ".join([token for token, pos in j.leaves()]))
elif current_chunk:
named_entity = " ".join(current_chunk)
if named_entity not in cont_chunk:
cont_chunk.append(named_entity)
current_chunk = []
else:
continue
return cont_chunk
该循环完美地运行了一段时间并解析一些文档以提取命名实体。突然,我收到以下错误。代码出了什么问题?
Traceback (most recent call last):
File "C:/Users/Kalapala/PycharmProjects/Attachments/DownloadFiles.py", line 74, in <module>
tikiparse(f)
File "C:/Users/Kalapala/PycharmProjects/Attachments/DownloadFiles.py", line 57, in tikiparse
chunked = ne_chunk(pos_tag(word_tokenize(extractedcontent)))
File "C:\Python27\lib\site-packages\nltk\tokenize\__init__.py", line 130, in word_tokenize
sentences = [text] if preserve_line else sent_tokenize(text, language)
File "C:\Python27\lib\site-packages\nltk\tokenize\__init__.py", line 97, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1235, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1283, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1274, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1314, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 312, in _pair_iter
prev = next(it)
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1287, in _slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: expected string or buffer
Process finished with exit code 1