使用文档https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7和https://spacy.io/usage/processing-pipelines训练 NER spacy自定义训练模型测试用例数据集以在给定文本中准确找到货币。
示例数据集:
TRAIN_DATA = [('This is AFN currency', {'entities': [(8, 11, 'CUR')]}),
('I have EUR european currency', {'entities': [(7, 10, 'CUR')]}),
('let as have ALL money', {'entities': [(12, 15, 'CUR')]}),
('DZD is a dollar', {'entities': [(0, 3, 'CUR')]}),
('money USD united states', {'entities': [(6, 9, 'CUR')]})
]
通过将模型命名为“货币”成功地训练了模型。它对带有正确标签的训练数据集预测良好,但大多数情况下它预测带有错误标签的未经训练的文本数据。
输入测试行:'I have AZWSQTS lot LOT of Indian MZW currency USD INR'
输出:
AZWSQTS - CUR,LOT - CUR,MZW - CUR,美元 - CUR,印度卢比 - CUR
在这里,“AZWSQTS”和“LOT”不是一种货币,但它预测,这就是我遇到的问题。
完整代码:
from __future__ import unicode_literals, print_function
import random
from pathlib import Path
import spacy
from tqdm import tqdm
from spacy.training import Example
def spacy_train_model():
''' Sample traning dataset format'''
'''list of currency'''
currency_list = ['AFN', 'EUR', 'EUR', 'ALL', 'DZD', 'USD', 'EUR', 'AOA', 'XCD', 'XCD', 'ARS',
'AMD', 'AWG', 'SHP', 'AUD', 'EUR', 'AZN', '', 'BSD', 'BHD', 'BDT', 'BBD', 'BYN', 'EUR', 'BZD',
'XOF', 'BMD', 'BTN', 'BOB', 'USD', 'BAM', 'BWP', 'BRL', 'USD', 'USD', 'BND', 'BGN', 'XOF', 'BIF',
'CVE', 'KHR', 'XAF', 'CAD', 'USD', 'KYD', 'XAF', 'XAF', 'NZD', 'CLP', 'CNY', 'AUD', 'AUD', 'COP',
'KMF', 'CDF', 'XAF', 'none', 'CRC', 'XOF', 'HRK', 'CUP', 'ANG', 'EUR', 'CZK', '', 'DKK', 'DJF',
'XCD', 'DOP', '', 'USD', 'EGP', 'USD', 'XAF', 'ERN', 'EUR', 'SZL', 'ETB', '', 'FKP', 'FJD',
'EUR', 'EUR', 'EUR', 'XPF', '', 'XAF', 'GMD', 'GEL', 'EUR', 'GHS', 'GIP', 'EUR', 'DKK', 'XCD',
'EUR', 'USD', 'GTQ', 'GGP', 'GNF', 'XOF', 'GYD', '', 'HTG', 'HNL', 'HKD', 'HUF', 'ISK', 'INR',
'IDR', 'XDR', 'IRR', 'IQD', 'EUR', 'IMP', 'ILS', 'EUR', '', 'JMD', 'JPY', 'JEP', 'JOD',
'KZT', 'KES', 'AUD', 'EUR', 'KWD', 'KGS', '', 'LAK', 'EUR', 'LBP', 'LSL', 'LRD', 'LYD', 'CHF',
'EUR', 'EUR', '', 'MOP', 'MGA', 'MWK', 'MYR', 'MVR', 'XOF', 'EUR', 'USD', 'EUR', 'MRU', 'MUR',
'EUR', 'MXN', 'USD', 'MDL', 'EUR', 'MNT', 'EUR', 'XCD', 'MAD', 'MZN', 'MMK', '', 'NAD', 'AUD',
'NPR', 'EUR', 'XPF', 'NZD', 'NIO', 'XOF', 'NGN', 'NZD', 'AUD', 'USD', 'KPW', 'MKD', 'NOK',
'OMR','PKR', 'USD', 'ILS', 'USD', 'PGK', 'PYG', 'PEN', 'PHP', 'NZD', 'PLN', 'EUR', 'USD','QAR',
'EUR', 'RON', 'RUB', 'RWF', '', 'USD', 'EUR', 'SHP', 'XCD', 'XCD', 'EUR', 'EUR', 'XCD', 'WST',
'EUR', 'STN', 'SAR', 'XOF', 'RSD', 'SCR', 'SLL', 'SGD', 'USD', 'ANG', 'EUR', 'EUR', 'SBD', 'SOS',
'ZAR', 'GBP', 'KRW', 'SSP', 'EUR', 'LKR', 'SDG', 'SRD', 'NOK', 'SEK', 'CHF', 'SYP', '', 'TWD',
'TJS', 'TZS', 'THB', 'USD', 'XOF', 'NZD', 'TOP', 'TTD', 'GBP', 'TND', 'TRY', 'TMT', 'USD', 'AUD',
'UGX', 'UAH', 'AED', 'GBP', 'USD', 'UYU', 'USD', 'UZS', '', 'VUV', 'EUR', 'VES', 'VND', '',
'USD', 'XPF', 'YER', 'ZMW', 'USD']
TRAIN_DATA = [('This is AFN currency', {'entities': [(8, 11, 'CUR')]}),
('I have EUR europen currency', {'entities': [(7, 10, 'CUR')]}),
('let as have ALL money', {'entities': [(12, 15, 'CUR')]}),
('DZD is a dollar', {'entities': [(0, 3, 'CUR')]}),
('money USD united states', {'entities': [(6, 9, 'CUR')]})
]
# model = "en_core_web_lg"
model = None
output_dir=Path(r"D:\currency") # Path to save training model - create new empty directory
n_iter=100
#load the model
if model is not None:
nlp = spacy.load(model)
optimise = nlp.create_optimizer()
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank('en')
optimise = nlp.begin_training()
print("Created blank 'en' model")
#set up the pipeline
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe('ner', last=True)
else:
ner = nlp.get_pipe('ner')
for _, annotations in TRAIN_DATA:
for ent in annotations.get('entities'):
ner.add_label(ent[2])
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
optimizer = nlp.initialize()
# optimizer = optimise
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in tqdm(TRAIN_DATA):
doc = nlp.make_doc(text)
example = Example.from_dict(doc, annotations)
nlp.update(
[example],
drop=0.5,
sgd=optimizer,
losses=losses)
print(losses)
for text, _ in TRAIN_DATA:
doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
print("Saved model to", output_dir)
def test_model(text):
nlp = spacy.load(r'D:\currency')
for tex in text.split('\n'):
doc = nlp(tex)
for token in doc.ents:
print(token.text, token.label_)
spacy_train_model() #Training the model
test_model('text') #Testing the model