我想在互联网访问受限的企业环境中使用 spacy-transformers,所以我必须手动从 Hugging Face Hub 下载 Transformer 模型,并让它们在 spacy 中工作。
在此示例中,我尝试使用 en_core_web_trf 预训练模型中的转换器管道组件:
# Export the pretrained transformer pipeline component to disk, then load it
# back into a blank pipeline (for offline / restricted-network environments).
import spacy
import spacy_transformers

# Load the RoBERTa-based pretrained English pipeline.
nlp_trf = spacy.load("en_core_web_trf")
# Get the transformer pipeline component from the loaded pipeline.
transformer = nlp_trf.get_pipe("transformer")
# BUG FIX: the original saved to "transfomer_pretrained" (typo, missing "r")
# but loaded from "transformer_pretrained". Because the load path did not
# exist locally, transformers fell back to fetching from the Hub, which
# fails offline with "Connection error, and we cannot find the requested
# files in the cached path." The save and load paths must be identical.
transformer.to_disk("transformer_pretrained")

nlp = spacy.blank("en")
trf = nlp.add_pipe("transformer")
# Load the serialized transformer component back from the same directory.
trf.from_disk("transformer_pretrained", exclude=["vocab"])
我收到以下错误消息:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-23-c66c45181d83> in <module>
1 #trf.model.initialize([nlp.make_doc("hello world")])
----> 2 trf.from_disk("models/transformer_pretrained", exclude=["vocab"])
3 nlp.pipe_names
C:\_Development\Python37\site-packages\spacy_transformers\pipeline_component.py in from_disk(self, path, exclude)
400 "model": load_model,
401 }
--> 402 util.from_disk(path, deserialize, exclude)
403 return self
C:\_Development\Python37\site-packages\spacy\util.py in from_disk(path, readers, exclude)
1172 # Split to support file names like meta.json
1173 if key.split(".")[0] not in exclude:
-> 1174 reader(path / key)
1175 return path
1176
C:\_Development\Python37\site-packages\spacy_transformers\pipeline_component.py in load_model(p)
390 p = Path(p).absolute()
391 tokenizer, transformer = huggingface_from_pretrained(
--> 392 p, self.model.attrs["tokenizer_config"]
393 )
394 self.model.attrs["tokenizer"] = tokenizer
C:\_Development\Python37\site-packages\spacy_transformers\util.py in huggingface_from_pretrained(source, config)
29 else:
30 str_path = source
---> 31 tokenizer = AutoTokenizer.from_pretrained(str_path, **config)
32 transformer = AutoModel.from_pretrained(str_path)
33 ops = get_current_ops()
C:\_Development\Python37\site-packages\transformers\models\auto\tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
388 kwargs["_from_auto"] = True
389 if not isinstance(config, PretrainedConfig):
--> 390 config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
391
392 use_fast = kwargs.pop("use_fast", True)
C:\_Development\Python37\site-packages\transformers\models\auto\configuration_auto.py in from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
396 """
397 kwargs["_from_auto"] = True
--> 398 config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
399 if "model_type" in config_dict:
400 config_class = CONFIG_MAPPING[config_dict["model_type"]]
C:\_Development\Python37\site-packages\transformers\configuration_utils.py in get_config_dict(cls, pretrained_model_name_or_path, **kwargs)
464 local_files_only=local_files_only,
465 use_auth_token=use_auth_token,
--> 466 user_agent=user_agent,
467 )
468 # Load config dict
C:\_Development\Python37\site-packages\transformers\file_utils.py in cached_path(url_or_filename, cache_dir, force_download, proxies, resume_download, user_agent, extract_compressed_file, force_extract, use_auth_token, local_files_only)
1171 user_agent=user_agent,
1172 use_auth_token=use_auth_token,
-> 1173 local_files_only=local_files_only,
1174 )
1175 elif os.path.exists(url_or_filename):
C:\_Development\Python37\site-packages\transformers\file_utils.py in get_from_cache(url, cache_dir, force_download, proxies, etag_timeout, resume_download, user_agent, use_auth_token, local_files_only)
1387 else:
1388 raise ValueError(
-> 1389 "Connection error, and we cannot find the requested files in the cached path."
1390 " Please try again or make sure your Internet connection is on."
1391 )
ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.
如错误消息所述,在缓存路径中找不到请求的文件。有人可以向我解释我必须将哪些文件放在缓存(cache)路径中吗?或者用另一种方式来预下载模型并在 spacy 中使用它们。
版本:
spacy 3.0.5
spacy-transformers 1.0.2
transformers 4.5.1