我试图在 RoBERTa 模型之上堆叠一个 LSTM,以解决我尝试配置的二进制分类问题: - 冻结 RoBERTa 嵌入 - 微调嵌入
在冻结的情况下,我得到大约 57% 的 F 分数,与常规 RoBERTa 相比,这相对较低,用于序列分类得到相同数据的大约 81%
在微调的情况下,我得到 0% 的 F 分数并且验证损失没有收敛
class RoBERTaLSTMClassifier(nn.Module):
def __init__(self, bert_config, num_classes, hidden_size=None, dropout=0.5):
bert: pretrained bert model
num_classes: the number of num_classes
hidden_size: the number of hiddens which will be used by LSTM layer
dropout: dropout rate
super(RoBERTaLSTMClassifier, self).__init__()
self.num_classes = num_classes
self.model = RobertaModel(bert_config)
if hidden_size is None: self.hidden_size = bert_config.hidden_size
else: self.hidden_size = hidden_size
self.lstm = nn.LSTM(bert_config.hidden_size, self.hidden_size, bidirectional=True,batch_first=True)
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(self.hidden_size * 2, 1)
self.softmax = nn.Softmax()
## add sigmoid non linearity for binary classification
self.sig = nn.Sigmoid()
def forward(self, input_ids, attention_mask, current_batch_size, hidden):
all_layers: whether or not to return all encoded_layers
return: logits in the following format (batch_size, num_classes)
with torch.no_grad():
## freeze embedding from BERT
outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
# last hidden state is input to the LSTM
output, (hidden_h, hidden_c) = self.lstm(outputs[0], hidden)
output_hidden = torch.cat((hidden_h[0], hidden_h[1]), dim=1) #[B, H*2]
logits = self.classifier(self.dropout(output_hidden)) #[B, C]
sig_out = self.sig(logits).view(current_batch_size, -1)
## get the last batch output
sig_out = sig_out[:, -1] # get last batch of labels
hidden = (hidden_h, hidden_c)
return sig_out, hidden
def init_bilstm_hidden(self, batch_size):
h0 = torch.zeros(2, batch_size, self.hidden_size).to(device) # 2 for bidirection
c0 = torch.zeros(2, batch_size, self.hidden_size).to(device)
return (h0, c0)
from sklearn.metrics import f1_score
from tqdm import tqdm, trange
import numpy as np
roberta_conf = RobertaConfig.from_pretrained('roberta-base')
num_classes = 2
hidden_size = 256
LSTMRoBERTaModel = RoBERTaLSTMClassifier(roberta_conf, num_classes=num_classes,hidden_size= hidden_size,dropout=0.5)
criterion = nn.BCELoss() ## binary cross entropy
optimizer = torch.optim.Adam(LSTMRoBERTaModel.parameters(), lr=lr)
epochs = 5
counter = 0
max_grad_norm = 1.0
nb_tr_examples, nb_tr_steps = 0, 0
for _ in trange(epochs, desc="Epoch"):
tr_loss = 0
y_preds = []
y_true = []
hidden_init = LSTMRoBERTaModel.init_bilstm_hidden(batch_size=bs)
h = hidden_init
for step, batch in enumerate(train_dataloader):
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
current_batch_size = b_input_ids.size()[0]
h = tuple([each.data for each in h])
## forward pass
preds, h = LSTMRoBERTaModel.forward(b_input_ids, b_input_mask, current_batch_size,h)
loss = criterion(preds.squeeze(),b_labels.float())
# track train loss
tr_loss += loss.item()
nb_tr_examples += b_input_ids.size(0)
nb_tr_steps += 1
# gradient clipping
torch.nn.utils.clip_grad_norm_(parameters=LSTMRoBERTaModel.parameters(), max_norm=max_grad_norm)
# print train loss per epoch
print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
val_h = LSTMRoBERTaModel.init_bilstm_hidden(bs)
for batch in dev_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
current_batch_size = b_input_ids.size()[0]
with torch.no_grad():
preds, val_h = LSTMRoBERTaModel.forward(b_input_ids, b_input_mask, current_batch_size, val_h)
loss = criterion(preds.squeeze(),b_labels.float())
eval_loss += loss
#print(preds[2], b_labels[2] )
#eval_accuracy += f1_score(torch.tensor.numpy(b_labels.float), toch.tensor.numpy(preds))
nb_eval_examples += b_input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss/nb_eval_steps
print("Validation loss: {}".format(eval_loss))
print("F1 - Score: {}".format(f1_score(y_true,y_preds)))
#print("F1- Score: {}".format(eval_accuracy/nb_eval_steps))