您可以按照本教程构建自己的分词器(tokenizer):https://www.tensorflow.org/text/guide/subwords_tokenizer
这与 Transformer 示例中构建 ted_hrlr_translate_pt_en_converter 分词器的方式完全相同,您只需针对自己的语言进行调整。
我已针对您的情况重写了代码,但尚未对其进行测试:
import collections
import logging
import os
import pathlib
import re
import string
import sys
import time
import numpy as np
#import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
# Load the WMT14 'de-en' translation dataset. With `as_supervised=True` each
# element is consumed below as a two-item tuple.
# NOTE(review): the tuple order is assumed to be (German, English), matching
# the 'de-en' config name -- confirm against the dataset's supervised keys.
examples, metadata = tfds.load(
    'wmt14_translate/de-en',
    with_info=True,
    as_supervised=True,
)
train_examples = examples['train']
val_examples = examples['validation']

# Preview a single batch of three sentence pairs: German first, a blank line,
# then the English counterparts.
for de_examples, en_examples in train_examples.batch(3).take(1):
    for de_sentence in de_examples.numpy():
        print(de_sentence.decode('utf-8'))
    print()
    for en_sentence in en_examples.numpy():
        print(en_sentence.decode('utf-8'))

# Single-language views of the training split; each vocabulary is learned
# from one of these.
train_en = train_examples.map(lambda de, en: en)
train_de = train_examples.map(lambda de, en: de)

bert_tokenizer_params = {'lower_case': True}
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = {
    # The target vocabulary size
    'vocab_size': 8000,
    # Reserved tokens that must be included in the vocabulary
    'reserved_tokens': reserved_tokens,
    # Arguments for `text.BertTokenizer`
    'bert_tokenizer_params': bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    'learn_params': {},
}

# Learn the German word-piece vocabulary from the German training sentences,
# fed in batches of 1000.
de_batches = train_de.batch(1000).prefetch(2)
de_vocab = bert_vocab.bert_vocab_from_dataset(de_batches, **bert_vocab_args)

# Print a few slices of the learned vocabulary for a quick visual check.
for window in (slice(None, 10), slice(100, 110),
               slice(1000, 1010), slice(-10, None)):
    print(de_vocab[window])
def write_vocab_file(filepath, vocab):
    """Write a vocabulary to a text file, one token per line.

    The file is always written as UTF-8 rather than with the platform's
    default encoding, so vocabularies containing non-ASCII tokens (for
    example German umlauts) are stored identically on every system.

    Args:
        filepath: Path of the file to create; an existing file is overwritten.
        vocab: Iterable of tokens. Each token is written with `print`, so it
            is emitted in its `str()` form followed by a newline.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        for token in vocab:
            print(token, file=f)
# Persist the German vocabulary so a tokenizer can be built from the file.
write_vocab_file('de_vocab.txt', de_vocab)

# Learn the English word-piece vocabulary with the same settings.
en_batches = train_en.batch(1000).prefetch(2)
en_vocab = bert_vocab.bert_vocab_from_dataset(en_batches, **bert_vocab_args)

# Print a few slices of the learned vocabulary for a quick visual check.
for window in (slice(None, 10), slice(100, 110),
               slice(1000, 1010), slice(-10, None)):
    print(en_vocab[window])

write_vocab_file('en_vocab.txt', en_vocab)

# One BERT word-piece tokenizer per language, each backed by its vocab file.
de_tokenizer = text.BertTokenizer('de_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)

# Tokenize the examples -> (batch, word, word-piece), then merge the word and
# word-piece axes -> (batch, tokens).
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2, -1)

for token_ids in token_batch.to_list():
    print(token_ids)

# Lookup each token id in the vocabulary.
txt_tokens = tf.gather(en_vocab, token_batch)
# Join with spaces.
# NOTE(review): the joined result is neither bound nor printed; presumably
# this line was carried over from a notebook cell that displayed it.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

# Reassemble whole words from the word-pieces and join them the same way
# (again, the joined result is discarded).
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

# Positions of the "[START]" and "[END]" entries within `reserved_tokens`.
reserved_tensor = tf.constant(reserved_tokens)
START = tf.argmax(reserved_tensor == "[START]")
END = tf.argmax(reserved_tensor == "[END]")
def add_start_end(ragged):
    """Surround every row of `ragged` with the START and END values.

    Args:
        ragged: A tensor exposing `bounding_shape()`, concatenated along
            axis 1. NOTE(review): presumably a (batch, tokens) RaggedTensor
            of token ids whose dtype matches START/END -- confirm at callers.

    Returns:
        The concatenation `[START column, ragged, END column]` along axis 1.
    """
    # One START / END entry per row: both columns have shape [rows, 1].
    num_rows = ragged.bounding_shape()[0]
    column_shape = [num_rows, 1]
    start_column = tf.fill(column_shape, START)
    end_column = tf.fill(column_shape, END)
    return tf.concat([start_column, ragged, end_column], axis=1)
# Detokenize the sample batch after wrapping each row with START/END.
framed_batch = add_start_end(token_batch)
words = en_tokenizer.detokenize(framed_batch)
# NOTE(review): the joined strings are not bound or printed; presumably a
# leftover from a notebook cell that displayed the value.
tf.strings.reduce_join(words, separator=' ', axis=-1)
def cleanup_text(reserved_tokens, token_txt):
    """Remove reserved tokens (except "[UNK]") and join the rest with spaces.

    Args:
        reserved_tokens: Iterable of reserved token strings.
        token_txt: String tensor of tokens; cells that fully match a reserved
            token other than "[UNK]" are removed.

    Returns:
        The remaining tokens joined with single spaces along the last axis.
    """
    # One alternation pattern covering every reserved token but "[UNK]";
    # each token is escaped so brackets are matched literally.
    drop_pattern = "|".join(
        re.escape(token) for token in reserved_tokens if token != "[UNK]"
    )
    is_reserved = tf.strings.regex_full_match(token_txt, drop_pattern)
    kept_tokens = tf.ragged.boolean_mask(token_txt, ~is_reserved)
    # Join them into strings.
    return tf.strings.reduce_join(kept_tokens, separator=' ', axis=-1)
# Round-trip the sample English batch: tokenize, flatten the word and
# word-piece axes, detokenize, then strip the reserved tokens.
token_batch = en_tokenizer.tokenize(en_examples)
token_batch = token_batch.merge_dims(-2, -1)
words = en_tokenizer.detokenize(token_batch)
# NOTE(review): the resulting array is not bound or printed; presumably a
# leftover from a notebook cell that displayed the value.
cleanup_text(reserved_tokens, words).numpy()
class CustomTokenizer(tf.Module):
    """Exportable wrapper around a `text.BertTokenizer` and its vocabulary.

    The constructor traces every public method once so that the resulting
    concrete functions are available when the module is saved.

    Args:
        reserved_tokens: List of reserved token strings; used by
            `detokenize` (via `cleanup_text`) and returned by
            `get_reserved_tokens`.
        vocab_path: Path to a vocabulary text file with one token per line.
    """

    def __init__(self, reserved_tokens, vocab_path):
        # Initialise the tf.Module base class (sets up its name/name scope).
        super().__init__()
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        # Decode explicitly as UTF-8 instead of the platform default so that
        # non-ASCII tokens are read correctly on every system.
        vocab = pathlib.Path(vocab_path).read_text(
            encoding='utf-8').splitlines()
        self.vocab = tf.Variable(vocab)

        ## Create the signatures for export:

        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string))

        # Include `detokenize` and `lookup` signatures for:
        #   * `Tensors` with shapes [tokens] and [batch, tokens]
        #   * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def tokenize(self, strings):
        """Tokenize `strings` and wrap each row with the START/END ids."""
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2, -1)
        enc = add_start_end(enc)
        return enc

    @tf.function
    def detokenize(self, tokenized):
        """Convert token ids back to text with reserved tokens removed."""
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)

    @tf.function
    def lookup(self, token_ids):
        """Return the vocabulary entry for each id in `token_ids`."""
        return tf.gather(self.vocab, token_ids)

    @tf.function
    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return tf.shape(self.vocab)[0]

    @tf.function
    def get_vocab_path(self):
        """Return the `tf.saved_model.Asset` wrapping the vocabulary file."""
        return self._vocab_path

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens as a constant string tensor."""
        return tf.constant(self._reserved_tokens)
# Bundle both tokenizers as attributes of one module and export them together.
tokenizers = tf.Module()
# The German tokenizer is exposed under `de`, matching the language it handles.
tokenizers.de = CustomTokenizer(reserved_tokens, 'de_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'en_vocab.txt')
# Backward-compatible alias: this script previously exported the German
# tokenizer under the name `pt`. Both names refer to the same object.
tokenizers.pt = tokenizers.de

model_name = 'ted_hrlr_translate_de_en_converter'
tf.saved_model.save(tokenizers, model_name)