python - 微调 BERT 情绪分析时的过度拟合

Question

一般来说，我是机器学习的新手。我目前正在尝试使用 BERT 和 Transformers 遵循有关情绪分析的教程https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

但是，当我训练模型时，模型似乎过度拟合

我不知道如何解决这个问题。我尝试过减少时期的数量，增加批量大小，洗牌我的数据（这是有序的）并增加验证拆分。到目前为止，没有任何效果。我什至尝试过改变不同的学习率，但我现在使用的是最小的。

下面是我的代码：

PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

MAX_LEN = 40

#Make a PyTorch dataset
class FIDataset(Dataset):

  def __init__(self, texts, targets, tokenizer, max_len):

    self.texts = texts

    self.targets = targets

    self.tokenizer = tokenizer

    self.max_len = max_len

  def __len__(self):

    return len(self.texts)

  def __getitem__(self, item):

    text = str(self.texts[item])

    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(

      text,

      add_special_tokens=True,

      max_length=self.max_len,

      return_token_type_ids=False,

      pad_to_max_length=True,

      return_attention_mask=True,

      return_tensors='pt',

    )

    return {

      'text': text,

      'input_ids': encoding['input_ids'].flatten(),

      'attention_mask': encoding['attention_mask'].flatten(),

      'targets': torch.tensor(target, dtype=torch.long)

    }

#split test and train
df_train, df_test = train_test_split(

  df,

  test_size=0.1,

  random_state=RANDOM_SEED

)

df_val, df_test = train_test_split(

  df_test,

  test_size=0.5,

  random_state=RANDOM_SEED

)


#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):

  ds = FIDataset(

    texts=df.content.to_numpy(),

    targets=df.sentiment.to_numpy(),

    tokenizer=tokenizer,

    max_len=max_len

  )

  return DataLoader(

    ds,

    batch_size=batch_size,

    num_workers=4

  )

BATCH_SIZE = 32

#Load data into train, test, val
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)

val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

#Bert model loading
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):

    super(SentimentClassifier, self).__init__()

    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

    self.drop = nn.Dropout(p=0.1)

    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    returned = self.bert(
        
        input_ids=input_ids,
        attention_mask=attention_mask
    )
    pooled_output = returned["pooler_output"]
    output = self.drop(pooled_output)
    
    return self.out(output)

#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)

model = model.to(device)

#Optimize with AdamW
EPOCHS = 6

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(

  optimizer,

  num_warmup_steps=0,

  num_training_steps=total_steps

)

loss_fn = nn.CrossEntropyLoss().to(device)

#Train each Epoch function
def train_epoch(

  model,

  data_loader,

  loss_fn,

  optimizer,

  device,

  scheduler,

  n_examples

):

  model = model.train()

  losses = []

  correct_predictions = 0

  for d in data_loader:

    input_ids = d["input_ids"].to(device)

    attention_mask = d["attention_mask"].to(device)

    targets = d["targets"].to(device)

    outputs = model(

      input_ids=input_ids,

      attention_mask=attention_mask

    )

    _, preds = torch.max(outputs, dim=1)

    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)

    losses.append(loss.item())

    loss.backward()

    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()

    scheduler.step()

    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

import torch

history = defaultdict(list)

best_accuracy = 0

if __name__ == '__main__':    
    for epoch in range(EPOCHS):

      print(f'Epoch {epoch + 1}/{EPOCHS}')

      print('-' * 10)

      train_acc, train_loss = train_epoch(

        model,

        train_data_loader,

        loss_fn,

        optimizer,

        device,

        scheduler,

        len(df_train)

      )

      print(f'Train loss {train_loss} accuracy {train_acc}')

      val_acc, val_loss = eval_model(

        model,

        val_data_loader,

        loss_fn,

        device,

        len(df_val)

      )

      print(f'Val   loss {val_loss} accuracy {val_acc}')

      print()

      history['train_acc'].append(train_acc)

      history['train_loss'].append(train_loss)

      history['val_acc'].append(val_acc)

      history['val_loss'].append(val_loss)

      if val_acc > best_accuracy:

        torch.save(model.state_dict(), 'best_model_state.bin')

        best_accuracy = val_acc

score 2 · Accepted Answer

从广义上讲，要减少过度拟合，您可以：

增加正则化
降低模型复杂度
执行提前停止
增加训练数据

从您所写的内容来看，您已经尝试了 3 和 4。对于神经网络，您可以通过增加 dropout 来增加正则化。你已经有了它的代码。

# NOTE: You don't need bert_model here since you're creating one inside
# of SentimentClassifier.
#bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):

    super(SentimentClassifier, self).__init__()

    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

    self.drop = nn.Dropout(p=0.1) # <-- INCREASE THIS VALUE

    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

我建议尝试更高的Dropout概率值，正如我在上面的代码中所指出的（“增加这个值”）。跟踪 Dropout 概率和观察到的过度拟合。尝试的概率值0.1, 0.2, 0.3, 0.4, 0.5。

通常，我发现超过 0.5 的 dropout 并没有多大好处。

python - 微调 BERT 情绪分析时的过度拟合

1 回答 1

Related

Reference