I am trying to fine-tune a pretrained BERT model on the amazon-review dataset. To do so, I extended the run_classifier file with the following processor:
class AmazonProcessor(DataProcessor):
  """Processor for the Amazon data set."""

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1", "2"]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      # header
      if i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[13])
      label = tokenization.convert_to_unicode(line[7])
      # only train on 3 labels instead of 5
      if int(label) <= 2: label = "0"
      if int(label) == 3: label = "1"
      if int(label) >= 4: label = "2"
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
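As a quick aside, here is a minimal sketch of how the remapped label distribution produced by this processor can be checked (a hypothetical check, not part of run_classifier; it assumes the same TSV layout as above, with the review text in column 13 and the star rating in column 7):

from collections import Counter

# Hypothetical sanity check: count the remapped labels the processor emits.
processor = AmazonProcessor()
examples = processor.get_train_examples("drive/My Drive/csv_dataset")
print(Counter(example.label for example in examples))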
I am training in a Colab notebook on a GPU, so I also adjusted the main method accordingly:
processors = {
    "cola": run_classifier.ColaProcessor,
    "mnli": run_classifier.MnliProcessor,
    "mrpc": run_classifier.MrpcProcessor,
    "xnli": run_classifier.XnliProcessor,
    "amazon": run_classifier.AmazonProcessor,
}
bert_config_file = os.path.join(BERT_FOLDER, "bert_config.json")
max_seq_length = 128
output_dir = "drive/My Drive/model"
task_name = "amazon"
vocab_file = os.path.join(BERT_FOLDER, "vocab.txt")
do_lower_case = False
master = None
tpu_cluster_resolver = None
save_checkpoints_steps = 1000
iterations_per_loop = 1000
use_tpu = False
data_dir = "drive/My Drive/csv_dataset"
learning_rate = 5e-5
warmup_proportion = 0.1
train_batch_size = 16
eval_batch_size = 1
predict_batch_size = 1
num_train_epochs = 10.0
num_train_steps = 10000
num_tpu_cores = 8
#init_checkpoint = os.path.join(BERT_FOLDER, "bert_model.ckpt")
init_checkpoint = "drive/My Drive/model2/model.ckpt-41000"
do_train = True
do_eval = True
tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)
bert_config = modeling.BertConfig.from_json_file(bert_config_file)
print(bert_config)
task_name = task_name.lower()
processor = processors[task_name]()
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file, do_lower_case=do_lower_case)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=master,
    model_dir=output_dir,
    save_checkpoints_steps=save_checkpoints_steps,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=iterations_per_loop,
        num_shards=num_tpu_cores,
        per_host_input_for_training=is_per_host))
train_examples = None
num_train_steps = None
num_warmup_steps = None
if do_train:
  train_examples = processor.get_train_examples(data_dir)
  num_train_steps = int(
      len(train_examples) / train_batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)
model_fn = run_classifier.model_fn_builder(
    bert_config=bert_config,
    num_labels=len(label_list),
    init_checkpoint=init_checkpoint,
    learning_rate=learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=use_tpu,
    use_one_hot_embeddings=use_tpu)

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    predict_batch_size=predict_batch_size)
if do_train:
  train_file = os.path.join(output_dir, "train.tf_record")
  run_classifier.file_based_convert_examples_to_features(
      train_examples, label_list, max_seq_length, tokenizer, train_file)
  tf.logging.info("***** Running training *****")
  tf.logging.info("  Num examples = %d", len(train_examples))
  tf.logging.info("  Batch size = %d", train_batch_size)
  tf.logging.info("  Num steps = %d", num_train_steps)
  train_input_fn = run_classifier.file_based_input_fn_builder(
      input_file=train_file,
      seq_length=max_seq_length,
      is_training=True,
      drop_remainder=True)
  estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
if do_eval:
  eval_examples = processor.get_test_examples(data_dir)
  num_actual_eval_examples = len(eval_examples)
  if use_tpu:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on. These do NOT count towards the metric (all tf.metrics
    # support a per-instance weight, and these get a weight of 0.0).
    while len(eval_examples) % eval_batch_size != 0:
      eval_examples.append(PaddingInputExample())

  eval_file = os.path.join(output_dir, "eval.tf_record")
  run_classifier.file_based_convert_examples_to_features(
      eval_examples, label_list, max_seq_length, tokenizer, eval_file)

  tf.logging.info("***** Running evaluation *****")
  tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                  len(eval_examples), num_actual_eval_examples,
                  len(eval_examples) - num_actual_eval_examples)
  tf.logging.info("  Batch size = %d", eval_batch_size)

  # This tells the estimator to run through the entire set.
  eval_steps = None
  # However, if running eval on the TPU, you will need to specify the
  # number of steps.
  if use_tpu:
    assert len(eval_examples) % eval_batch_size == 0
    eval_steps = int(len(eval_examples) // eval_batch_size)

  eval_drop_remainder = True if use_tpu else False
  eval_input_fn = run_classifier.file_based_input_fn_builder(
      input_file=eval_file,
      seq_length=max_seq_length,
      is_training=False,
      drop_remainder=eval_drop_remainder)

  result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

  output_eval_file = os.path.join(output_dir, "eval_results.txt")
  with tf.gfile.GFile(output_eval_file, "w") as writer:
    tf.logging.info("***** Eval results *****")
    for key in sorted(result.keys()):
      tf.logging.info("  %s = %s", key, str(result[key]))
      writer.write("%s = %s\n" % (key, str(result[key])))
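One note on the step counts: num_train_steps is recomputed from the dataset size inside the if do_train: block, so the earlier num_train_steps = 10000 assignment has no effect. A minimal sketch of the arithmetic, using a hypothetical dataset size of 40,000 reviews (not the real one):

# Hypothetical numbers, only to illustrate how the step counts are derived.
num_examples = 40000                       # assumed dataset size
num_train_steps = int(num_examples / train_batch_size * num_train_epochs)  # 25000
num_warmup_steps = int(num_train_steps * warmup_proportion)                # 2500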
I know this is a lot of code, but since I cannot pinpoint the error I wanted to show all of it.
Note that most of the logging output looks perfectly reasonable.
For example, a converted example:
INFO:tensorflow:tokens: [CLS] Ich habe schon viele Klavier ##kon ##zer ##te gehört , aber was Frau Martha Ar ##geri ##ch hier spielt lässt einem ge ##wis ##ser ##ma ##ßen den At ##em stock ##en . So geni ##al habe ich diese 2 Klavier ##kon ##zer ##te von Ra ##ch ##mani ##no ##ff und T ##sch ##aik ##ov ##sky noch nie gehört . Sie ent ##fes ##selt einen regel ##rechte ##n Feuer ##stu ##rm an Vir ##tu ##osi ##tät . [SEP]
INFO:tensorflow:input_ids: 101 21023 21404 16363 18602 48021 17423 14210 10216 16706 117 11566 10134 16783 26904 18484 68462 10269 13329 28508 25758 10745 46503 83648 12754 10369 20284 10140 11699 10451 20511 10136 119 12882 107282 10415 21404 12979 12750 123 48021 17423 14210 10216 10166 38571 10269 31124 10343 13820 10130 157 12044 106333 11024 16116 11230 11058 16706 119 11583 61047 58058 26063 10897 46578 55663 10115 68686 19987 19341 10151 106433 10991 20316 24308 119 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: 2 (id = 2)
Or the model being loaded from the checkpoint file:
INFO:tensorflow: name = output_weights:0, shape = (3, 768), *INIT_FROM_CKPT*
INFO:tensorflow: name = output_bias:0, shape = (3,), *INIT_FROM_CKPT*
But in the end, eval_accuracy always stays the same:
I0625 15:46:41.328946 eval_accuracy = 0.3338616
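Since 0.3338 is roughly 1/3, it looks as if the model ends up predicting a single class for everything. A minimal sketch of how the predicted class distribution could be inspected (assuming the stock model_fn from run_classifier, which returns a "probabilities" field in predict mode):

import numpy as np
from collections import Counter

# Hypothetical check, reusing the eval.tf_record written above.
predict_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=eval_file,
    seq_length=max_seq_length,
    is_training=False,
    drop_remainder=False)
predicted_classes = [
    int(np.argmax(prediction["probabilities"]))
    for prediction in estimator.predict(input_fn=predict_input_fn)
]
print(Counter(predicted_classes))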
The full repository can be found here: https://github.com/joroGER/bert/
And here is the gist of the notebook: https://colab.research.google.com/gist/joroGER/75c1c9c6383f0199bb54ce7b63d412d0/untitled4.ipynb