0

我有一个二分类文本分类(TC)问题,大约有 10k 个短样本,并且类别平衡。我正在使用预训练的 BERT 和 RoBERTa 进行分类。使用 RoBERTa,我得到的结果比 BERT 好 20%,在相同的数据集、超参数和随机种子下达到几乎完美的 0.99 准确率。这对我来说感觉很奇怪。我确信我的训练/开发/测试拆分是正确的,没有任何样本在不同拆分之间重复。

RoBERTa 是否在做一些 BERT 没有做的事情,比如对最终预测设置阈值,还是它只是一个更好的模型?会不会是技术问题,例如某些缓存把开发/测试数据泄漏到了 RoBERTa 的训练过程中?

# --- Tokenization / dataset-construction settings ---
MAX_LENGTH = 128  # max tokens per example; longer texts are truncated
#MODEL_NAME = 'fav-kky/FERNET-C5'
MODEL_NAME = 'ufal/robeczech-base'
TARGET_NAMES = ['0', '1']  # binary classification label names
# Placeholder: the three splits are produced elsewhere; the author states no
# sample is repeated across splits.
(train_texts, dev_texts, test_texts, train_labels, dev_labels, test_labels) = ...

set_seed(SEED)  # fix RNG state so BERT vs. RoBERTa runs are comparable
# NOTE(review): slow tokenizer deliberately chosen to work around a reported
# fast-tokenizer bug for this RoBERTa checkpoint — confirm still needed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False) # buggy fast tokenizer in Roberta 

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

# `Dataset` is a project-local wrapper (defined outside this snippet) pairing
# the tokenizer encodings with integer labels for the Trainer.
train_dataset = Dataset(train_encodings, train_labels)
dev_dataset = Dataset(dev_encodings, dev_labels)

# Pick the architecture-specific classification wrapper for the selected
# checkpoint: robeczech is RoBERTa-based, the (commented-out) FERNET-C5
# alternative is BERT-based.
model_cls = (
    RobertaForSequenceClassification
    if MODEL_NAME == 'ufal/robeczech-base'
    else BertForSequenceClassification
)
model = model_cls.from_pretrained(MODEL_NAME, num_labels=len(TARGET_NAMES)).to("cuda")

# Training configuration.
#
# BUG FIX: the original left save_strategy/save_steps at their defaults
# (checkpoint every 500 steps) while evaluating every 25 steps with
# load_best_model_at_end=True. "Best model" can only be restored from a SAVED
# checkpoint, so 19 out of every 20 evaluations could never be selected — the
# reloaded "best" model was effectively arbitrary. Pin the save cadence to the
# eval cadence and make the selection metric explicit.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,                       # explicit (defaults to logging_steps)
    save_strategy="steps",               # must match evaluation_strategy
    save_steps=25,                       # checkpoint at every evaluation
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",   # explicit criterion for "best"
    learning_rate=1e-5,
    seed=SEED,                           # keep Trainer seeding consistent with set_seed(SEED)
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,            # dev split only — test stays held out
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()  # final dev-set metrics on the restored best checkpoint

def get_probs(text):
    """Return class probabilities (shape [1, num_labels]) for a single text.

    BUG FIX: the original ran the forward pass without ``model.eval()`` and
    without ``torch.no_grad()``. If the model is left in train mode, dropout
    stays active and predictions become nondeterministic; without ``no_grad``
    every call also builds an autograd graph, leaking GPU memory across the
    per-sample test loop below.
    """
    import torch  # local import: keeps this block self-contained

    inputs = tokenizer(text, padding=True, truncation=True,
                       max_length=MAX_LENGTH, return_tensors="pt").to("cuda")
    model.eval()  # disable dropout for deterministic inference
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs[0].softmax(1)

# Score the held-out test set one sample at a time, then report
# label-frequency-weighted F1 and the confusion matrix.
prob_rows = []
for text in test_texts:
    row = get_probs(text).cpu().detach().numpy()[0]
    prob_rows.append(row)
predictions = np.array(prob_rows)
predicted_labels = np.argmax(predictions, -1)
print(f1_score(test_labels, predicted_labels, average='weighted'))
print(confusion_matrix(test_labels, predicted_labels))
4

0 回答 0