我正在尝试分别标记 2 个句子并将输出存储为 datasets.Sequence 作为预训练模型的输入input_ids
,token_type_ids
并attention_mask
作为输入。我想做一个像
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
'context': Value(dtype='string', id=None),
'id': Value(dtype='string', id=None),
'question': Value(dtype='string', id=None),
'title': Value(dtype='string', id=None)}
这显示在文档的展平部分
这是映射数据集的代码。
def tokenize0(ds,verbose: bool = False):
q1=datasets.Sequence(dict(tokenizer(ds['question1'], padding='max_length', truncation=True, max_length=128).items()))
q2=datasets.Sequence(dict(tokenizer(ds['question2'], padding='max_length', truncation=True, max_length=128).items()))
if verbose:
tokenize.i+=1
if not(tokenize.i %100):
print(ds)
print(tokenize.i)
print(q1)
print(q2)
return {"q1":q1,"q2":q2}
然后它去
ArrowInvalid: Could not convert Sequence(feature={'input_ids':[too_long_to_show], 'token_type_ids':[too_long_to_show],'attention_mask':[too_long_to_show]}, length=-1, id=None) with type Sequence: did not recognize Python value type when inferring an Arrow data type
问题:
我应该如何修改地图功能?
附加信息:
追溯:
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
/var/folders/km/fkl_bmms3jj4jc45vy6bs0180000gq/T/ipykernel_10393/2893591874.py in <module>
----> 1 dataset_tokenized0 = dataset.map(lambda x:tokenize0(x,verbose=True))
2 dataset_tokenized0
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/dataset_dict.py in map(self, function, with_indices, input_columns, batched, batch_size, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)
482 cache_file_names = {k: None for k in self}
483 return DatasetDict(
--> 484 {
485 k: dataset.map(
486 function=function,
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/dataset_dict.py in <dictcomp>(.0)
483 return DatasetDict(
484 {
--> 485 k: dataset.map(
486 function=function,
487 with_indices=with_indices,
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_dataset.py in map(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
2034
2035 if num_proc is None or num_proc == 1:
-> 2036 return self._map_single(
2037 function=function,
2038 with_indices=with_indices,
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
501 self: "Dataset" = kwargs.pop("self")
502 # apply actual function
--> 503 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
504 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
505 for dataset in datasets:
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
468 }
469 # apply actual function
--> 470 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
471 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
472 # re-apply format to the output
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
404 # Call actual function
405
--> 406 out = func(self, *args, **kwargs)
407
408 # Update fingerprint of in-place transforms + update in-place history of transforms
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_dataset.py in _map_single(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc, cache_only)
2425 if update_data:
2426 if writer is not None:
-> 2427 writer.finalize()
2428 if tmp_file is not None:
2429 tmp_file.close()
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_writer.py in finalize(self, close_stream)
440 # Re-intializing to empty list for next batch
441 self.hkey_record = []
--> 442 self.write_examples_on_file()
443 if self.pa_writer is None:
444 if self._schema is not None:
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_writer.py in write_examples_on_file(self)
309 [row[0][col] for row in self.current_examples], type=col_type, try_type=col_try_type, col=col
310 )
--> 311 pa_array = pa.array(typed_sequence)
312 inferred_type = pa_array.type
313 first_example = pa.array(OptimizedTypedSequence(typed_sequence.data[:1], type=inferred_type))[0]
~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/array.pxi in pyarrow.lib._handle_arrow_array_protocol()
~/miniforge3/envs/transformers/lib/python3.9/site-packages/datasets/arrow_writer.py in __arrow_array__(self, type)
113 out = list_of_np_array_to_pyarrow_listarray(self.data)
114 else:
--> 115 out = pa.array(cast_to_python_objects(self.data, only_1d_for_numpy=True), type=type)
116 if trying_type and out[0].as_py() != self.data[0]:
117 raise TypeError(
~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/array.pxi in pyarrow.lib._sequence_to_array()
~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
~/miniforge3/envs/transformers/lib/python3.9/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()