我正在尝试使用 sklearn 的 CountVectorizer 训练文本分类器。问题是我的训练文档中有许多特定于文档的标记。例如,CountVectorizer.fit_transform 方法可以很好地处理常规的英语单词,但有些标记的格式符合正则表达式 '\w\d\d\w\w\d',例如“d84ke2”。按照目前的做法,fit_transform 方法只会把“d84ke2”按字面值原样当作一个特征使用。
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import Bunch

# Toy corpus: ordinary English words mixed with document-specific tokens
# (e.g. "j64ke2") that follow the pattern \w\d\d\w\w\d.
docs = [
    'the quick brown j64ke2 jumped over the lazy dogs r32kl4.',
    'an apple a day keeps the w35kf9 away',
    'you got the lions share of the e93mf9',
]

# Define target and target_names (one integer label per document).
target_names = ['zero', 'one', 'two']
target = np.array([0, 1, 2])

# Create message bunch.
doc_info = Bunch(data=docs, target=target, target_names=target_names)

# Vectorize training data.
# The vectorizer has to be fitted before `vocabulary_` exists or `transform`
# can be called, so learn the vocabulary and build the document-term count
# matrix in a single `fit_transform` step.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(doc_info.data)

# `vocabulary_` maps each term to its column index in the count matrix.
# The dict's own key order is not the column order, so sort the terms by
# index to get labels that line up with the matrix columns.
vocab = count_vect.vocabulary_
vocab_keys = sorted(vocab, key=vocab.get)

# Dense view of the sparse counts, labeled with the matching terms.
X = X_train_counts.toarray()
df = pd.DataFrame(X, columns=vocab_keys)