我尝试了以下三种在网上找到的用 Python 读取 .conll 文件的方法,但只得到了我不理解的错误报告。我还阅读了有关不同类型 .conll 文件的信息,但我不知道我的数据集属于哪一种。我怎样才能知道?是否有必要根据特定类型来实现 .conll 文件的加载?有没有简单的方法来处理 .conll 文件?
from pyconll import load_from_file
data = load_from_file("wnut17train.conll")
ParseError Traceback (most recent call last)
<ipython-input-15-9d2ceebeda92> in <module>()
1 import pyconll
----> 2 data = pyconll.load_from_file("wnut17train.conll")
3 data
/usr/local/lib/python3.6/dist-packages/pyconll/load.py in load_from_file(filename)
46 """
47 with open(filename, encoding='utf-8') as f:
---> 48 c = Conll(f)
49
50 return c
/usr/local/lib/python3.6/dist-packages/pyconll/unit/conll.py in __init__(self, it)
30 self._sentences = []
31
---> 32 for sentence in pyconll._parser.iter_sentences(it):
33 self._sentences.append(sentence)
34
/usr/local/lib/python3.6/dist-packages/pyconll/_parser.py in iter_sentences(lines_it)
51 sent_lines.append(line)
52 elif sent_lines:
---> 53 sentence = _create_sentence(sent_lines)
54 sent_lines.clear()
55
/usr/local/lib/python3.6/dist-packages/pyconll/_parser.py in _create_sentence(sent_lines)
22 """
23 sent_source = '\n'.join(sent_lines)
---> 24 sentence = Sentence(sent_source)
25
26 return sentence
/usr/local/lib/python3.6/dist-packages/pyconll/unit/sentence.py in __init__(self, source)
77 self._meta[k] = None
78 else:
---> 79 token = Token(line)
80 self._tokens.append(token)
81
/usr/local/lib/python3.6/dist-packages/pyconll/unit/token.py in __init__(self, source, empty)
661 error_msg = 'The number of columns per token line must be 10. Invalid token: {}'.format(
662 source)
--> 663 raise ParseError(error_msg)
664
665 # Assign all the field values from the line to internal equivalents.
ParseError: The number of columns per token line must be 10. Invalid token: @paulwalk O
from conllu import parse
train = parse("wnut17train.conll", fields=["id","form","lemma","postag"])
ParseException Traceback (most recent call last)
<ipython-input-21-7be24f5d9e1f> in <module>()
1 from conllu import parse
2
----> 3 train = parse("wnut17train.conll", fields=["id","form","lemma","postag"])
/usr/local/lib/python3.6/dist-packages/conllu/__init__.py in parse(data, fields, field_parsers, metadata_parsers)
18 fields=fields,
19 field_parsers=field_parsers,
---> 20 metadata_parsers=metadata_parsers
21 ))
22
/usr/local/lib/python3.6/dist-packages/conllu/__init__.py in parse_incr(in_file, fields, field_parsers, metadata_parsers)
36 fields=fields,
37 field_parsers=field_parsers,
---> 38 metadata_parsers=metadata_parsers
39 ))
40
/usr/local/lib/python3.6/dist-packages/conllu/parser.py in parse_token_and_metadata(data, fields, field_parsers, metadata_parsers)
94 metadata[key] = value
95 else:
---> 96 tokens.append(parse_line(line, fields, field_parsers))
97
98 return tokens, metadata
/usr/local/lib/python3.6/dist-packages/conllu/parser.py in parse_line(line, fields, field_parsers)
118
119 if len(line_split) == 1:
--> 120 raise ParseException("Invalid line format, line must contain either tabs or two spaces.")
121
122 data = Token()
ParseException: Invalid line format, line must contain either tabs or two spaces.
from nltk.corpus.reader import ConllChunkCorpusReader
TRAIN = ConllChunkCorpusReader("wnut17train.conll", ('NP','VP','PP’), tagset="wsj”, encoding="utf-8”)
File "<ipython-input-26-9362233e7aa7>", line 2
TRAIN = ConllChunkCorpusReader("wnut17train.conll", ('NP','VP','PP’), encoding="utf-8”)
^
SyntaxError: EOL while scanning string literal