虽然使用微调的 Bert 进行二进制分类效果很好,但我仍然坚持使用多类分类。我的数据集(德国新闻文章,有 10 个类别)包含大约 10.000 个样本。虽然,训练损失和平均评估损失在 2.2 左右。
一些 NLP 配置变量:
DEBUG=True
VERSION = 1
MAX_LEN = 200 #Set the maximum length according to the diagram above
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-4
MOMENTUM = 0.9
TRAIN_SIZE = 0.7
NUM_LABELS = len(df_data.Labels.unique())
MODEL_NAME = "dbmdz/bert-base-german-cased"
params = {"debug": DEBUG, "max_len": MAX_LEN, "batch_size": BATCH_SIZE, "epochs": EPOCHS, "lr": LEARNING_RATE, "momentum": MOMENTUM, "model": MODEL_NAME, "loss": "BCEWithLogitsLoss", "optimizer": "SGD"}
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
数据加载器:
class NLPDataset(th.utils.data.Dataset):
def __init__(self, dataframe, tokenizer, max_len):
self.tokenizer = tokenizer
self.max_len = max_len
self.data = dataframe
self.text = dataframe.Text
self.targets = dataframe.Labels
self.len = len(self.text)
def __getitem__(self,idx):
text = str(self.text[idx])
text = " ".join(text.split())
inputs = self.tokenizer(
text,
None,
add_special_tokens=True,
max_length=self.max_len,
pad_to_max_length=True,
return_token_type_ids=True,
truncation=True,
padding='max_length',
)
input_ids = inputs['input_ids']
mask = inputs['attention_mask']
token_type_ids = inputs["token_type_ids"]
return {
'input_ids': th.tensor(input_ids, dtype=th.long),
'mask': th.tensor(mask, dtype=th.float),
'token_type_ids': th.tensor(token_type_ids, dtype=th.long),
'targets': th.tensor(self.targets[idx], dtype=th.long)
}
def __len__(self):
return self.len
Pytorch_lightning 模块:
class NLPClassifier(pl.LightningModule):
def __init__(self):
super().__init__()
#changing the configuration to 10 lables instead of 2
config = transformers.AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = NUM_LABELS
self.model = transformers.AutoModelForSequenceClassification.from_config(config)
self.loss = th.nn.CrossEntropyLoss(reduction="none")
def prepare_data(self):
# train/val split
train_dataset = df_data.sample(frac=TRAIN_SIZE)
val_dataset=df_data.drop(train_dataset.index).sample(frac=1).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
# Assign CustomDataset Class
train_set = NLPDataset(train_dataset, tokenizer, MAX_LEN)
val_set = NLPDataset(val_dataset, tokenizer, MAX_LEN)
print("FULL Dataset: {}".format(df_data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(val_dataset.shape))
# assign to use in dataloaders
self.train_ds = train_set
self.val_ds = val_set
#self.test_dataset = mnist_test DO TO
def forward(self, input_ids, mask):
logits, = self.model(input_ids, mask)
# logits.shape: (16, 10)
return logits
def training_step(self, batch, batch_idx):
logits = self.forward(batch['input_ids'], batch['mask']).squeeze()
loss = self.loss(logits, batch['targets']).mean()
run.log(name='train_loss', value=loss.tolist())
return {'loss': loss, 'log': {'train_loss': loss}}
def validation_step(self,batch, batch_idx):
logits = self.forward(batch['input_ids'], batch['mask']).squeeze()
print(logits.shape)
acc = (logits.argmax(-1) == batch['targets']).float()
loss = self.loss(logits, batch['targets'])
run.log_list('loss', loss.tolist())
run.log_list('acc', acc.tolist())
return {'loss': loss, 'acc': acc}
def validation_epoch_end(self, outputs):
loss = th.cat([o['loss'] for o in outputs], 0).mean()
acc = th.cat([o['acc'] for o in outputs], 0).mean()
out = {'val_loss': loss, 'val_acc': acc}
run.log('val_loss', loss.tolist())
run.log('val_acc', acc.tolist())
return {**out, 'log': {'val_loss': loss, 'val_acc': acc}}
def train_dataloader(self):
return th.utils.data.DataLoader(
self.train_ds,
batch_size=BATCH_SIZE,
num_workers=8,
drop_last=True,
shuffle=False,
)
def val_dataloader(self):
return th.utils.data.DataLoader(
self.val_ds,
batch_size=BATCH_SIZE,
num_workers=8,
drop_last=False,
shuffle=False,
)
def configure_optimizers(self):
return transformers.AdamW(
self.model.parameters(),
lr=LEARNING_RATE,
#momentum=MOMENTUM,
)
教练:
model = NLPClassifier()
trainer = pl.Trainer(
gpus=(1 if th.cuda.is_available() else 0),
default_root_dir = f"./models/version_{VERSION}",
max_epochs=EPOCHS,
fast_dev_run=DEBUG,
limit_train_batches=1.0,
val_check_interval=0.5,
limit_val_batches=1.0,
profiler=True,
#logger=wandb_logger
)
trainer.fit(模型)
这是一个样本损失曲线。
我的核心问题是:
- 是否正确使用了 CrossEntropyLoss?
- 优化器是否有效,因为每个样本的预测很快就会变得相同。
- 学习率问题并没有解决问题。我尝试了从 1e-2 到 1e-6 的范围
谢谢你的帮助。:)