概括
我正在为验证码识别添加字母,但是当添加字母时,pytorch 的 CTC 似乎无法正常工作。
我试过的
起初,我修改BLANK_LABEL
为 62,因为有 62 个标签(0-9,az,AZ),但它给了我运行时错误blank must be in label range
。我也尝试过BLANK_LABEL=0
,然后将 1~63 分配为非空白标签,但它输出 NaN 作为损失。
编码
这是我的代码当前版本的 colab 链接:这里
以下只是代码的核心部分。
常数:
DATASET_PATH = "/home/ik1ne/Downloads/numbers"
MODEL_PATH = "/home/ik1ne/Downloads"
BATCH_SIZE = 50
TRAIN_BATCHES = 180
TEST_BATCHES = 20
TOTAL_BATCHES = TRAIN_BATCHES+TEST_BATCHES
TOTAL_DATASET = BATCH_SIZE*TOTAL_BATCHES
BLANK_LABEL = 63
数据集生成:
!pip install captcha
from captcha.image import ImageCaptcha
import itertools
import os
import random
import string
if not os.path.exists(DATASET_PATH):
os.makedirs(DATASET_PATH)
characters = "0123456789"+string.ascii_lowercase + string.ascii_uppercase
while(len(list(Path(DATASET_PATH).glob('*'))) < TOTAL_BATCHES):
captcha_str = "".join(random.choice(characters) for x in range(6))
if captcha_str in list(Path(DATASET_PATH).glob('*')):
continue
ImageCaptcha().write(captcha_str, f"{DATASET_PATH}/{captcha_str}.png")
数据集:
def convert_strseq_to_numseq(s):
for c in s:
if c >= '0' and c <= '9':
return int(c)
elif c>='a' and c <='z':
return ord(c)-ord('a')+10
else:
return ord(c)-ord('A')+36
class CaptchaDataset(Dataset):
"""CAPTCHA dataset."""
def __init__(self, root_dir, transform=None):
self.root_dir = root_dir
self.image_paths = list(Path(root_dir).glob('*'))
self.transform = transform
def __getitem__(self, index):
image = Image.open(self.image_paths[index])
if self.transform:
image = self.transform(image)
label_sequence = [convert_strseq_to_numseq(c) for c in self.image_paths[index].stem]
return (image, torch.tensor(label_sequence))
def __len__(self):
return len(self.image_paths)
模型:
class StackedLSTM(nn.Module):
def __init__(self, input_size=60, output_size=11, hidden_size=512, num_layers=2):
super(StackedLSTM, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = nn.Dropout()
self.fc = nn.Linear(hidden_size, output_size)
self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
def forward(self, inputs, hidden):
batch_size, seq_len, input_size = inputs.shape
outputs, hidden = self.lstm(inputs, hidden)
outputs = self.dropout(outputs)
outputs = torch.stack([self.fc(outputs[i]) for i in range(width)])
outputs = F.log_softmax(outputs, dim=2)
return outputs, hidden
def init_hidden(self, batch_size):
weight = next(self.parameters()).data
return (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
net = StackedLSTM().to(device)
训练:
net.train() # set network to training phase
epochs = 30
# for each pass of the training dataset
for epoch in range(epochs):
train_loss, train_correct, train_total = 0, 0, 0
h = net.init_hidden(BATCH_SIZE)
# for each batch of training examples
for batch_index, (inputs, targets) in enumerate(train_dataloader):
inputs, targets = inputs.to(device), targets.to(device)
h = tuple([each.data for each in h])
BATCH_SIZE, channels, height, width = inputs.shape
# reshape inputs: NxCxHxW -> WxNx(HxC)
inputs = (inputs
.permute(3, 0, 2, 1)
.contiguous()
.view((width, BATCH_SIZE, -1)))
optimizer.zero_grad() # zero the parameter gradients
outputs, h = net(inputs, h) # forward pass
# compare output with ground truth
input_lengths = torch.IntTensor(BATCH_SIZE).fill_(width)
target_lengths = torch.IntTensor([len(t) for t in targets])
loss = criterion(outputs, targets, input_lengths, target_lengths)
loss.backward() # backpropagation
nn.utils.clip_grad_norm_(net.parameters(), 10) # clip gradients
optimizer.step() # update network weights
# record statistics
prob, max_index = torch.max(outputs, dim=2)
train_loss += loss.item()
train_total += len(targets)
for i in range(BATCH_SIZE):
raw_pred = list(max_index[:, i].cpu().numpy())
pred = [c for c, _ in groupby(raw_pred) if c != BLANK_LABEL]
target = list(targets[i].cpu().numpy())
if pred == target:
train_correct += 1
# print statistics every 10 batches
if (batch_index + 1) % 10 == 0:
print(f'Epoch {epoch + 1}/{epochs}, ' +
f'Batch {batch_index + 1}/{len(train_dataloader)}, ' +
f'Train Loss: {(train_loss/1):.5f}, ' +
f'Train Accuracy: {(train_correct/train_total):.5f}')
train_loss, train_correct, train_total = 0, 0, 0