0

我尝试了以下三种在网上找到的用 Python 读取 .conll 文件的方法,但都只得到了我看不懂的错误信息。我还阅读了有关不同类型 .conll 文件的资料,但我不知道我的数据集属于哪一种类型。我怎样才能知道?是否有必要针对特定的类型来实现 .conll 文件的加载?有没有简单的方法来处理 .conll 文件?

from pyconll import load_from_file
data = load_from_file("wnut17train.conll")

ParseError                                Traceback (most recent call last)
<ipython-input-15-9d2ceebeda92> in <module>()
      1 import pyconll
----> 2 data = pyconll.load_from_file("wnut17train.conll")
      3 data

/usr/local/lib/python3.6/dist-packages/pyconll/load.py in load_from_file(filename)
     46     """
     47     with open(filename, encoding='utf-8') as f:
---> 48         c = Conll(f)
     49 
     50     return c

/usr/local/lib/python3.6/dist-packages/pyconll/unit/conll.py in __init__(self, it)
     30         self._sentences = []
     31 
---> 32         for sentence in pyconll._parser.iter_sentences(it):
     33             self._sentences.append(sentence)
     34 

/usr/local/lib/python3.6/dist-packages/pyconll/_parser.py in iter_sentences(lines_it)
     51             sent_lines.append(line)
     52         elif sent_lines:
---> 53             sentence = _create_sentence(sent_lines)
     54             sent_lines.clear()
     55 

/usr/local/lib/python3.6/dist-packages/pyconll/_parser.py in _create_sentence(sent_lines)
     22     """
     23     sent_source = '\n'.join(sent_lines)
---> 24     sentence = Sentence(sent_source)
     25 
     26     return sentence

/usr/local/lib/python3.6/dist-packages/pyconll/unit/sentence.py in __init__(self, source)
     77                         self._meta[k] = None
     78                 else:
---> 79                     token = Token(line)
     80                     self._tokens.append(token)
     81 

/usr/local/lib/python3.6/dist-packages/pyconll/unit/token.py in __init__(self, source, empty)
    661             error_msg = 'The number of columns per token line must be 10. Invalid token: {}'.format(
    662                 source)
--> 663             raise ParseError(error_msg)
    664 
    665         # Assign all the field values from the line to internal equivalents.

ParseError: The number of columns per token line must be 10. Invalid token: @paulwalk   O
from conllu import parse
train = parse("wnut17train.conll", fields=["id","form","lemma","postag"])

ParseException                            Traceback (most recent call last)
<ipython-input-21-7be24f5d9e1f> in <module>()
      1 from conllu import parse
      2 
----> 3 train = parse("wnut17train.conll", fields=["id","form","lemma","postag"])

/usr/local/lib/python3.6/dist-packages/conllu/__init__.py in parse(data, fields, field_parsers, metadata_parsers)
     18         fields=fields,
     19         field_parsers=field_parsers,
---> 20         metadata_parsers=metadata_parsers
     21     ))
     22 

/usr/local/lib/python3.6/dist-packages/conllu/__init__.py in parse_incr(in_file, fields, field_parsers, metadata_parsers)
     36             fields=fields,
     37             field_parsers=field_parsers,
---> 38             metadata_parsers=metadata_parsers
     39         ))
     40 

/usr/local/lib/python3.6/dist-packages/conllu/parser.py in parse_token_and_metadata(data, fields, field_parsers, metadata_parsers)
     94                 metadata[key] = value
     95         else:
---> 96             tokens.append(parse_line(line, fields, field_parsers))
     97 
     98     return tokens, metadata

/usr/local/lib/python3.6/dist-packages/conllu/parser.py in parse_line(line, fields, field_parsers)
    118 
    119     if len(line_split) == 1:
--> 120         raise ParseException("Invalid line format, line must contain either tabs or two spaces.")
    121 
    122     data = Token()

ParseException: Invalid line format, line must contain either tabs or two spaces.
from nltk.corpus.reader import ConllChunkCorpusReader
TRAIN = ConllChunkCorpusReader("wnut17train.conll", ('NP','VP','PP’), tagset="wsj”, encoding="utf-8”)

File "<ipython-input-26-9362233e7aa7>", line 2
    TRAIN = ConllChunkCorpusReader("wnut17train.conll", ('NP','VP','PP’),  encoding="utf-8”)
                                                                                            ^
SyntaxError: EOL while scanning string literal
4

0 回答 0