我如何将文本转换为矩阵,就像 python 开发人员使用张量流文本预处理将文本转换为矩阵一样。
甚至为任何可以做到这一点但仍然找不到的实用程序尝试了“自然”、“丹诺夫”和“张量流”。
它是如何在 python (tfidf) 中完成的 来源:text_to_matrix
def texts_to_matrix(self, texts, mode='binary'):
"""Convert a list of texts to a Numpy matrix.
# Arguments
texts: list of strings.
mode: one of "binary", "count", "tfidf", "freq".
# Returns
A Numpy matrix.
"""
sequences = self.texts_to_sequences(texts)
return self.sequences_to_matrix(sequences, mode=mode)
def sequences_to_matrix(self, sequences, mode='binary'):
"""Converts a list of sequences into a Numpy matrix.
# Arguments
sequences: list of sequences
(a sequence is a list of integer word indices).
mode: one of "binary", "count", "tfidf", "freq"
# Returns
A Numpy matrix.
# Raises
ValueError: In case of invalid `mode` argument,
or if the Tokenizer requires to be fit to sample data.
"""
if not self.num_words:
if self.word_index:
num_words = len(self.word_index) + 1
else:
raise ValueError('Specify a dimension (`num_words` argument), '
'or fit on some text data first.')
else:
num_words = self.num_words
if mode == 'tfidf' and not self.document_count:
raise ValueError('Fit the Tokenizer on some data '
'before using tfidf mode.')
x = np.zeros((len(sequences), num_words))
for i, seq in enumerate(sequences):
if not seq:
continue
counts = defaultdict(int)
for j in seq:
if j >= num_words:
continue
counts[j] += 1
for j, c in list(counts.items()):
if mode == 'count':
x[i][j] = c
elif mode == 'freq':
x[i][j] = c / len(seq)
elif mode == 'binary':
x[i][j] = 1
elif mode == 'tfidf':
# Use weighting scheme 2 in
# https://en.wikipedia.org/wiki/Tf%E2%80%93idf
tf = 1 + np.log(c)
idf = np.log(1 + self.document_count /
(1 + self.index_docs.get(j, 0)))
x[i][j] = tf * idf
else:
raise ValueError('Unknown vectorization mode:', mode)
return x