我有两个数据集:数据集 A 和数据集 B。我想使用pycrfsuite在数据集 A 上训练条件随机场 (CRF),然后在数据集 B 上训练 CRF。是否可以使用 pycrfsuite 来实现?
我不想同时在两个数据集上训练 CRF。
我知道如何使用 pycrfsuite 在一个数据集上训练 CRF(参见 https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb):
'''Tested with python 2.7 64-bit
Code from https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
sudo pip install nltk python-crfsuite scikit-learn
sudo python -m nltk.downloader conll2002
'''
from __future__ import print_function
from __future__ import division
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import time
# Print the installed scikit-learn version (see the version note in
# bio_classification_report's docstring).
print(sklearn.__version__)
# The return value is discarded; NOTE(review): looks like a notebook leftover,
# presumably kept to check that the corpus is available -- confirm.
nltk.corpus.conll2002.fileids()
# Materialize the 'esp.train' and 'esp.testb' IOB sentences as lists at
# import time; each sentence item is unpacked elsewhere in this file as
# (token, postag, label).
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
def _context_features(prefix, word, postag):
    """Return the features describing a neighbouring token.

    ``prefix`` is prepended verbatim to every feature name (``'-1:'`` for the
    previous token, ``'+1:'`` for the next one).
    """
    return [
        prefix + 'word.lower=' + word.lower(),
        prefix + 'word.istitle=%s' % word.istitle(),
        prefix + 'word.isupper=%s' % word.isupper(),
        prefix + 'postag=' + postag,
        prefix + 'postag[:2]=' + postag[:2],
    ]


def word2features(sent, i):
    """Build the list of string features for the token at index ``i``.

    Args:
        sent: Sequence of items where element 0 is the word and element 1 is
            its POS tag (any further elements are not read here).
        i: Index of the token inside ``sent``.

    Returns:
        A list of feature strings: features of the token itself, followed by
        features of the previous token (or ``'BOS'`` when ``i`` is the first
        position), followed by features of the next token (or ``'EOS'`` when
        ``i`` is the last position).
    """
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]

    # Left context, or a beginning-of-sentence marker.
    if i > 0:
        features.extend(_context_features('-1:', sent[i - 1][0], sent[i - 1][1]))
    else:
        features.append('BOS')

    # Right context, or an end-of-sentence marker.
    if i < len(sent) - 1:
        features.extend(_context_features('+1:', sent[i + 1][0], sent[i + 1][1]))
    else:
        features.append('EOS')

    return features
def sent2features(sent):
    """Return one feature list per token of ``sent``, in token order.

    Each entry is the result of ``word2features(sent, i)`` for the token at
    index ``i``; an empty sentence yields an empty list.
    """
    # The index (not the item) is needed because word2features looks at the
    # neighbouring positions of the sentence.
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    """Return the label (third element) of every item in ``sent``.

    Every item of ``sent`` must unpack into exactly three values
    ``(token, postag, label)``.
    """
    return [label for _token, _postag, label in sent]
def sent2tokens(sent):
    """Return the token (first element) of every item in ``sent``.

    Every item of ``sent`` must unpack into exactly three values
    ``(token, postag, label)``.
    """
    return [token for token, _postag, _label in sent]
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.

    It computes token-level metrics and discards "O" labels.
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Args:
        y_true: Iterable of label sequences (the reference labels).
        y_pred: Iterable of label sequences (the predicted labels).

    Returns:
        Whatever ``sklearn.metrics.classification_report`` returns for the
        flattened, binarized labels, restricted to the non-"O" tags.
    """
    lb = LabelBinarizer()
    # Flatten the per-sentence sequences; the binarizer is fitted on the
    # reference labels only and then reused for the predictions.
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # Order tags by the part after the first '-' and then by the part before
    # it, so tags sharing a suffix (e.g. B-LOC / I-LOC) end up adjacent.
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
def main():
    """Train a CRF on ``train_sents``, evaluate it on ``test_sents``, and print timings.

    Steps, in order:
      1. Extract features and labels for the module-level ``train_sents`` and
         ``test_sents``.
      2. Feed the training sequences to a ``pycrfsuite.Trainer`` and train,
         writing the model to ``conll2002-esp.crfsuite``.
      3. Open that model file with a ``pycrfsuite.Tagger``, tag the test set,
         and print a BIO classification report plus one example sentence.

    The elapsed wall-clock time of each phase is printed as well.
    """
    # Single source of truth for the model file so the trainer and the tagger
    # cannot drift apart.
    model_path = 'conll2002-esp.crfsuite'

    # --- Feature extraction -------------------------------------------------
    feature_extraction_start_time = time.time()
    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]
    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]
    feature_extraction_elapsed_time = time.time() - feature_extraction_start_time
    print('feature_extraction_elapsed_time: {0:.2f} seconds'.format(feature_extraction_elapsed_time))

    # --- Training -----------------------------------------------------------
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    })

    training_start_time = time.time()
    trainer.train(model_path)
    training_elapsed_time = time.time() - training_start_time
    print('training_elapsed_time: {0:.2f} seconds'.format(training_elapsed_time))
    # Number of logged training iterations and the record of the last one.
    print(len(trainer.logparser.iterations))
    print(trainer.logparser.iterations[-1])

    # --- Evaluation ---------------------------------------------------------
    test_start_time = time.time()
    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    y_pred = [tagger.tag(xseq) for xseq in X_test]
    print(bio_classification_report(y_test, y_pred))

    # Show predicted vs. reference labels for the first test sentence.
    example_sent = test_sents[0]
    print(' '.join(sent2tokens(example_sent)), end='\n\n')
    print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
    print("Correct: ", ' '.join(sent2labels(example_sent)))
    test_elapsed_training_time = time.time() - test_start_time
    print('test_elapsed_training_time: {0:.2f} seconds'.format(test_elapsed_training_time))
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    # Profiling alternative to the call above. NOTE(review): cProfile is not
    # among the imports visible in this file; add `import cProfile` first.
    # cProfile.run('main()')
我只是不知道如何在第二个数据集上训练它,因为 trainer.train() 会重置 CRF 的参数。