我正在尝试使用 Google Colab 加载 torchtext 的 Multi30k 数据集。
当我使用 `.de` 加载它时,它工作正常,
但是一旦我把 `.de` 改为 `.fr`,
我就会得到这个错误:
FileNotFoundError: [Errno 2] No such file or directory: '.data/multi30k/train.fr'
这是我使用 `.de` 加载数据集的方式,
并且它可以正常工作:
# Load the German -> English Multi30k splits; this variant is reported to work.
train_data, valid_data, test_data = datasets.Multi30k.splits(
    exts=(".de", ".en"),
    fields=(SRC, TRG),
    root=".data",
)
一旦我把代码中的 `.de`
改为 `.fr`,
如下所示,就会出现上面的错误:
# Same call with the source extension switched to ".fr". This is the variant
# that raises FileNotFoundError for '.data/multi30k/train.fr'.
# NOTE(review): presumably the data torchtext fetches for Multi30k only
# contains the .de/.en files, so train.fr / val.fr / test2016.fr would have to
# be placed under .data/multi30k/ by hand -- confirm against the torchtext docs.
train_data, valid_data, test_data = datasets.Multi30k.splits(
    exts=(".fr", ".en"),
    fields=(SRC, TRG),
    root=".data",
)
导入
import torch
from torch import nn
from torch.nn import functional as F
import spacy, math, random
import numpy as np
from torchtext.legacy import datasets, data
import time
from prettytable import PrettyTable
from matplotlib import pyplot as plt
随机种子
# Seed every random number generator used by the notebook (NumPy, PyTorch on
# CPU and CUDA, and the stdlib `random` module) with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)
torch.cuda.manual_seed(SEED)
# The attribute is spelled `deterministic`; the previous misspelling only
# created an unused attribute, so the cuDNN flag was never actually switched on.
torch.backends.cudnn.deterministic = True
spaCy 分词器
import spacy
# Fetch the French spaCy package first; it must be available before the
# `spacy.load('fr_core_news_sm')` call below can succeed.
spacy.cli.download('fr_core_news_sm')
# Pipelines whose `.tokenizer` is used by tokenize_fr / tokenize_en.
spacy_fr = spacy.load('fr_core_news_sm')
# NOTE(review): 'en_core_web_sm' is loaded without a matching download call --
# presumably it is already installed in the runtime; confirm.
spacy_en = spacy.load('en_core_web_sm')
def tokenize_fr(sent):
    """Split French text into a list of token strings.

    Args:
        sent: Text handed to the tokenizer of the ``spacy_fr`` pipeline.

    Returns:
        A list holding the ``text`` of each token, in order.
    """
    return [tok.text for tok in spacy_fr.tokenizer(sent)]
def tokenize_en(sent):
    """Split English text into a list of token strings.

    Args:
        sent: Text handed to the tokenizer of the ``spacy_en`` pipeline.

    Returns:
        A list holding the ``text`` of each token, in order.
    """
    return [tok.text for tok in spacy_en.tokenizer(sent)]
字段
# Source-side field: tokenized with tokenize_fr, lower-cased, wrapped in
# <sos>/<eos> markers, and created with include_lengths=True.
SRC = data.Field(
    tokenize=tokenize_fr,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    include_lengths=True,
)
# Target-side field: tokenized with tokenize_en, lower-cased, and wrapped in
# <sos>/<eos> markers (no include_lengths here, unlike SRC).
TRG = data.Field(
    tokenize=tokenize_en,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
)
引发错误的单元格
# Cell that raises FileNotFoundError for '.data/multi30k/train.fr'.
# NOTE(review): presumably the data torchtext fetches for Multi30k only
# contains the .de/.en files, so train.fr / val.fr / test2016.fr would have to
# be placed under .data/multi30k/ by hand -- confirm against the torchtext docs.
train_data, valid_data, test_data = datasets.Multi30k.splits(
    exts=(".fr", ".en"),
    fields=(SRC, TRG),
    root=".data",
)