Following up on my original question, here is how to convert those data to plain text and write them to TSV files.
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.embeddings import FlairEmbeddings, StackedEmbeddings
embeddings_f = FlairEmbeddings('pubmed-forward')
embeddings_b = FlairEmbeddings('pubmed-backward')
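## 'pubmed-forward' / 'pubmed-backward' are Flair's pretrained character-level
## language models trained on PubMed abstracts; they are downloaded on first use.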
sentence = Sentence('The RAS-MAPK signalling cascade serves as a central node in transducing signals from membrane receptors to the nucleus.')
tagger = SequenceTagger.load('ner')
tagger.predict(sentence)
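## (Optional) Inspect the NER predictions (the tagger is not otherwise used below);
## a minimal sketch using standard Flair Sentence methods:
# print(sentence.to_tagged_string())
# for entity in sentence.get_spans('ner'):
#     print(entity)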
embeddings_f.embed(sentence)
stacked_embeddings = StackedEmbeddings([
    embeddings_f,
    embeddings_b,
])
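## StackedEmbeddings concatenates the forward and backward Flair embeddings
## into a single vector per token.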
stacked_embeddings.embed(sentence)
# for token in sentence:
#     print(token)
#     print(token.embedding)
#     print(token.embedding.shape)
tokens = [token for token in sentence]
print(tokens)
'''
[Token: 1 The, Token: 2 RAS-MAPK, Token: 3 signalling, Token: 4 cascade, Token: 5 serves, Token: 6 as, Token: 7 a, Token: 8 central, Token: 9 node, Token: 10 in, Token: 11 transducing, Token: 12 signals, Token: 13 from, Token: 14 membrane, Token: 15 receptors, Token: 16 to, Token: 17 the, Token: 18 nucleus.]
'''
## https://www.geeksforgeeks.org/python-string-split/
tokens = [str(token).split()[2] for token in sentence]
print(tokens)
'''
['The', 'RAS-MAPK', 'signalling', 'cascade', 'serves', 'as', 'a', 'central', 'node', 'in', 'transducing', 'signals', 'from', 'membrane', 'receptors', 'to', 'the', 'nucleus.']
'''
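## Simpler alternative: Flair Token objects expose the raw string via .text,
## so the split() above is not strictly necessary:
tokens = [token.text for token in sentence]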
tensors = [token.embedding for token in sentence]
print(tensors)
'''
[tensor([ 0.0077, -0.0227, -0.0004, ..., 0.1377, -0.0003, 0.0028]),
tensor([-0.0007, -0.1601, -0.0274, ..., 0.1982, 0.0013, 0.0042]),
tensor([ 4.2534e-03, -3.1018e-01, -3.9660e-01, ..., 5.9336e-02, -9.4445e-05, 1.0025e-02]),
tensor([ 0.0026, -0.0087, -0.1398, ..., -0.0037, 0.0012, 0.0274]),
tensor([-0.0005, -0.0164, -0.0233, ..., -0.0013, 0.0039, 0.0004]),
tensor([ 3.8261e-03, -7.6409e-02, -1.8632e-02, ..., -2.8906e-03, -4.4556e-04, 5.6909e-05]),
tensor([ 0.0035, -0.0207, 0.1700, ..., -0.0193, 0.0017, 0.0006]),
tensor([ 0.0159, -0.4097, -0.0489, ..., 0.0743, 0.0005, 0.0012]),
tensor([ 9.7725e-03, -3.3817e-01, -2.2848e-02, ..., -6.6284e-02, 2.3646e-04, 1.0505e-02]),
tensor([ 0.0219, -0.0677, -0.0154, ..., 0.0102, 0.0066, 0.0016]),
tensor([ 0.0092, -0.0431, -0.0450, ..., 0.0060, 0.0002, 0.0005]),
tensor([ 0.0047, -0.2732, -0.0408, ..., 0.0136, 0.0005, 0.0072]),
tensor([ 0.0072, -0.0173, -0.0149, ..., -0.0013, -0.0004, 0.0056]),
tensor([ 0.0086, -0.1151, -0.0629, ..., 0.0043, 0.0050, 0.0016]),
tensor([ 7.6452e-03, -2.3825e-01, -1.5683e-02, ..., -5.4974e-04, -1.4646e-04, 6.6120e-03]),
tensor([ 0.0038, -0.0354, -0.1337, ..., 0.0060, -0.0004, 0.0102]),
tensor([ 0.0186, -0.0151, -0.0641, ..., 0.0188, 0.0391, 0.0069]),
tensor([ 0.0003, -0.0461, 0.0043, ..., -0.0126, -0.0004, 0.0142])]
'''
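## (Optional) Sanity check -- stack the per-token embeddings into a single
## (num_tokens x embedding_dim) tensor; the second dimension depends on the
## Flair models that were stacked:
import torch
embedding_matrix = torch.stack(tensors)
print(embedding_matrix.shape)  ## e.g. torch.Size([18, embedding_dim])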
# ----------------------------------------
## Write those data to TSV files.
## https://stackoverflow.com/a/29896136/1904943
import csv
metadata_f = 'metadata.tsv'
tensors_f = 'tensors.tsv'
with open(metadata_f, 'w', encoding='utf8', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
    for token in tokens:
        ## Assign the return value to a dummy variable ( _ ) to suppress the
        ## character count that writerow() echoes in an interactive session;
        ## note that writerow() expects a sequence: passing (token) rather than
        ## ([token]) writes each character of the string as a separate column.
        _ = tsv_writer.writerow([token])
## metadata.tsv :
'''
The
RAS-MAPK
signalling
cascade
serves
as
a
central
node
in
transducing
signals
from
membrane
receptors
to
the
nucleus.
'''
## Alternatively, write all tokens as a single (tab-separated) row:
with open(metadata_f, 'w', encoding='utf8', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
    _ = tsv_writer.writerow(tokens)
## metadata.tsv :
'''
The RAS-MAPK signalling cascade serves as a central node in transducing signals from membrane receptors to the nucleus.
'''
with open(tensors_f, 'w', encoding='utf8', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
    for token in sentence:
        embedding = token.embedding
        _ = tsv_writer.writerow(embedding.tolist())
## tensors.tsv (18 lines: one embedding per token in metadata.tsv):
## note: enormous output, even for this simple sentence.
'''
0.007691788021475077 -0.02268664352595806 -0.0004340760060586035 ...
'''
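# ----------------------------------------
## (Optional) Quick consistency check: tensors.tsv should contain one row per token.
with open(tensors_f, encoding='utf8') as tsv_file:
    n_rows = sum(1 for _ in tsv_file)
print(len(tokens), n_rows)  ## expect: 18 18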