我正在使用 BERT 进行文本分类项目
使用 Simple Transformers 我得到了大约 90% 的准确度。但使用我自己的 for 循环训练(未在此处发布)或使用 transformers 库中的 Trainer 模块时,准确度只有 60%。两者都使用了与 Simple Transformers 相同的默认参数。
我真的很难理解为什么性能会有如此大的差异
数据集来自 Kaggle:https://www.kaggle.com/ankurzing/sentiment-analysis-for-financial-news
导入:
from transformers import BertForSequenceClassification, AdamW, BertTokenizer, get_linear_schedule_with_warmup, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from torch.nn import functional as F
from collections import defaultdict
import random
from simpletransformers.classification import ClassificationModel
数据预处理:
#loading phrase bank dataset
# Load the Financial PhraseBank CSV (latin-1 encoded) and flatten it to a
# list of rows. Fail fast when the file is absent: the original
# `while not file_loaded` loop printed "File not Found" forever, because a
# missing file cannot appear by re-checking in a tight loop.
phrase_bank_dataset = "all-data.csv"
phrase_bank_dataset_file = Path(phrase_bank_dataset)
if not phrase_bank_dataset_file.exists():
    raise FileNotFoundError(f"Dataset file not found: {phrase_bank_dataset}")
phrase_bank_dataset = pd.read_csv(phrase_bank_dataset, encoding='latin-1')
phrase_bank_dataset = phrase_bank_dataset.values.tolist()
print("Dataset Loaded")
#correcting the format of phrase bank dataset
# Reshape raw rows into a two-column frame: headline text + integer label.
# Label mapping: negative -> 0, neutral -> 1, anything else (positive) -> 2.
_SENTIMENT_TO_ID = {'negative': 0, 'neutral': 1}
_rows = []
for ele in phrase_bank_dataset:
    news = ele[1]  # column 1 holds the headline text, column 0 the sentiment word
    #converting sentiment text into numbers
    sentiment = _SENTIMENT_TO_ID.get(ele[0], 2)
    _rows.append([news, sentiment])
# Build the DataFrame in one call: appending via `.loc[len(df)]` reallocates
# the frame on every row and is O(n^2) over the whole dataset.
phrase_dataset = pd.DataFrame(_rows, columns=["news", "sentiment"])
print(phrase_dataset)
Simple Transformers 代码:
# Train/evaluate with Simple Transformers (this is the ~90% accuracy path).
model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, use_cuda=True)
# BUG FIX: `labeled_dataset` was never defined anywhere in this script; the
# frame built during preprocessing is `phrase_dataset`.
train, eva = train_test_split(phrase_dataset, test_size=0.2)
# Simple Transformers expects columns named 'text' and 'label'.
train_df = pd.DataFrame({
    'text': train['news'],
    'label': train['sentiment']
})
eval_df = pd.DataFrame({
    'text': eva['news'],
    'label': eva['sentiment']
})
model.train_model(train_df)
result, model_outputs, wrong_predictions = model.eval_model(eval_df)
# model_outputs holds per-example logits; argmax picks the predicted class.
predicted = [np.argmax(arr) for arr in model_outputs]
true = eval_df['label'].tolist()
print(sklearn.metrics.accuracy_score(true, predicted))
Transformers Trainer 代码:
# Manual Transformers setup (the ~60% path under investigation).
# BUG FIX: `stride=0.8` was passed to from_pretrained -- stride is an integer
# token count used only for overflowing-token windows, and a float fraction is
# silently meaningless for single-sentence classification. Removed.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3)
# Select GPU when available, otherwise fall back to CPU.
if torch.cuda.is_available():
    print("\nUsing: ", torch.cuda.get_device_name(0))
    device = torch.device('cuda')
else:
    print("\nUsing: CPU")
    device = torch.device('cpu')
model = model.to(device)
#custom dataset class
class NewsSentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed tokenizer encodings with labels."""

    def __init__(self, encodings, labels):
        # encodings: dict of parallel per-example sequences (input_ids,
        # attention_mask, ...); labels: sequence of integer class ids.
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Slice every encoding field at idx and attach the label under the
        # 'labels' key the Hugging Face Trainer expects.
        sample = {name: torch.tensor(values[idx])
                  for name, values in self.encodings.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        return len(self.labels)
#method for tokenizing dataset list
def tokenize_headlines(headlines, labels, tokenizer):
    """Tokenize a list of headline strings and wrap them with their labels.

    Returns a NewsSentimentDataset ready for a DataLoader / Trainer.
    """
    # `pad_to_max_length` is deprecated; `padding='max_length'` is the modern
    # equivalent. Also request truncation so an unusually long headline cannot
    # overflow the model's maximum input length.
    encodings = tokenizer.batch_encode_plus(
        headlines,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    return NewsSentimentDataset(encodings, labels)
#splitting dataset into training and validation set
all_headlines = phrase_dataset['news'].tolist()
all_labels = phrase_dataset['sentiment'].tolist()
# BUG FIX 1: the split previously referenced undefined names
# `phrase_headlines` / `phrase_labels`; the lists built just above are
# `all_headlines` / `all_labels`.
train_headlines, val_headlines, train_labels, val_labels = train_test_split(
    all_headlines, all_labels, test_size=0.2)
# BUG FIX 2: the training set was tokenized with `val_labels`, pairing every
# training headline with an unrelated label. Training on mislabeled data is
# the most likely cause of the ~60% vs ~90% accuracy gap.
train_dataset = tokenize_headlines(train_headlines, train_labels, tokenizer)
val_dataset = tokenize_headlines(val_headlines, val_labels, tokenizer)
#data loader
train_batch_size = 8
val_batch_size = 8
train_data_loader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=RandomSampler(train_dataset))
val_data_loader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=SequentialSampler(val_dataset))
#optimizer and scheduler
# NOTE(review): the Trainer below creates its own optimizer/scheduler; these
# objects are only used by the hand-written training loop (not shown here).
num_epochs = 1
num_steps = len(train_data_loader) * num_epochs
optimizer = AdamW(model.parameters(), lr=4e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=30, num_training_steps=num_steps)
#training and evaluation with trainer module from huggingface
def compute_metrics(pred):
    """Score a Trainer EvalPrediction: accuracy, precision, recall, F1.

    Note: with average='micro' on single-label data, precision, recall and
    F1 all coincide with the accuracy.
    """
    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold, guessed, average='micro')
    return {
        'accuracy': accuracy_score(gold, guessed),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
# Hyper-parameters for the Trainer run: a single epoch, batch size 8,
# no warmup and no weight decay.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    logging_dir='./logs',            # log directory (e.g. for TensorBoard)
    num_train_epochs=1,              # one pass over the training data
    per_device_train_batch_size=8,   # training batch size per device
    per_device_eval_batch_size=8,    # evaluation batch size per device
    warmup_steps=0,                  # no learning-rate warmup
    weight_decay=0,                  # no weight decay
    logging_steps=10,                # log every 10 optimizer steps
)
# Wire model, arguments, datasets and metrics into the Hugging Face Trainer,
# then run one full train/evaluate cycle.
trainer = Trainer(
    model=model,                   # the instantiated model to fine-tune
    args=training_args,            # hyper-parameters defined above
    train_dataset=train_dataset,   # tokenized training split
    eval_dataset=val_dataset,      # tokenized validation split
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()