0

我正在尝试分别标记 2 个句子并将输出存储为 datasets.Sequence 作为预训练模型的输入input_idstoken_type_idsattention_mask作为输入。我想做一个像

{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
'context': Value(dtype='string', id=None),
'id': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None),
'title': Value(dtype='string', id=None)}

这显示在文档的展平部分

这是映射数据集的代码。

def tokenize0(ds,verbose: bool = False):
    q1=datasets.Sequence(dict(tokenizer(ds['question1'], padding='max_length', truncation=True, max_length=128).items()))
    q2=datasets.Sequence(dict(tokenizer(ds['question2'], padding='max_length', truncation=True, max_length=128).items()))
    if verbose:
        tokenize.i+=1
        if not(tokenize.i %100):
            print(ds)
            print(tokenize.i)
            print(q1)
            print(q2)
    return {"q1":q1,"q2":q2}

然后它去

ArrowInvalid: Could not convert Sequence(feature={'input_ids':[too_long_to_show], 'token_type_ids':[too_long_to_show],'attention_mask':[too_long_to_show]}, length=-1, id=None) with type Sequence: did not recognize Python value type when inferring an Arrow data type

问题:

我应该如何修改地图功能?

附加信息:

追溯:

---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
/var/folders/km/fkl_bmms3jj4jc45vy6bs0180000gq/T/ipykernel_10393/2893591874.py in <module>
----> 1 dataset_tokenized0 = dataset.map(lambda x:tokenize0(x,verbose=True))
      2 dataset_tokenized0

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/dataset_dict.py in map(self, function, with_indices, input_columns, batched, batch_size, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)
    482             cache_file_names = {k: None for k in self}
    483         return DatasetDict(
--> 484             {
    485                 k: dataset.map(
    486                     function=function,

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/dataset_dict.py in <dictcomp>(.0)
    483         return DatasetDict(
    484             {
--> 485                 k: dataset.map(
    486                     function=function,
    487                     with_indices=with_indices,

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
   2034 
   2035         if num_proc is None or num_proc == 1:
-> 2036             return self._map_single(
   2037                 function=function,
   2038                 with_indices=with_indices,

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
    501             self: "Dataset" = kwargs.pop("self")
    502         # apply actual function
--> 503         out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
    504         datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
    505         for dataset in datasets:

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
    468         }
    469         # apply actual function
--> 470         out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
    471         datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
    472         # re-apply format to the output

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
    404             # Call actual function
    405 
--> 406             out = func(self, *args, **kwargs)
    407 
    408             # Update fingerprint of in-place transforms + update in-place history of transforms

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_dataset.py in _map_single(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc, cache_only)
   2425                 if update_data:
   2426                     if writer is not None:
-> 2427                         writer.finalize()
   2428                     if tmp_file is not None:
   2429                         tmp_file.close()

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_writer.py in finalize(self, close_stream)
    440             # Re-intializing to empty list for next batch
    441             self.hkey_record = []
--> 442         self.write_examples_on_file()
    443         if self.pa_writer is None:
    444             if self._schema is not None:

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_writer.py in write_examples_on_file(self)
    309                 [row[0][col] for row in self.current_examples], type=col_type, try_type=col_try_type, col=col
    310             )
--> 311             pa_array = pa.array(typed_sequence)
    312             inferred_type = pa_array.type
    313             first_example = pa.array(OptimizedTypedSequence(typed_sequence.data[:1], type=inferred_type))[0]

~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/array.pxi in pyarrow.lib.array()

~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/array.pxi in pyarrow.lib._handle_arrow_array_protocol()

~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_writer.py in __arrow_array__(self, type)
    113                 out = list_of_np_array_to_pyarrow_listarray(self.data)
    114             else:
--> 115                 out = pa.array(cast_to_python_objects(self.data, only_1d_for_numpy=True), type=type)
    116             if trying_type and out[0].as_py() != self.data[0]:
    117                 raise TypeError(

~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/array.pxi in pyarrow.lib.array()

~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/array.pxi in pyarrow.lib._sequence_to_array()

~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()

~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
4

0 回答 0