为了将特定规则应用于决策树分类器上的训练样本,我们需要使用 decision_path 方法decision_path(X[, check_input])
:
现在,在处理一个短文本分类模型时,我已经在文本特征的矢量化上流水线化了一个特征联合,并应用网格搜索来找到一个优化的模型,如下面的代码所示。
- 这使得我很难将 X 功能调用到
decision_path
方法中,因为我不断收到错误 - 此外,我最终会试图用文本特征而不是数字矢量化特征来说明路径......
data, target = df['doc_text'], target_column
data_train, data_test, target_train, target_test, indices_train, indices_test = train_test_split(
data, target, df.index, random_state=0)
#combine words an character grams features
vectorizer = FeatureUnion([
('word_vectorizer', TfidfVectorizer(
sublinear_tf=True,
min_df=2,
#strip_accents='unicode',
#encoding='latin-1'
analyzer='word',
#token_pattern=r'\w{1,}',
ngram_range=(2,5),
norm='l2')),
#dtype = np.float32, #da error
#max_features = 6000)),
('char_vectorizer', TfidfVectorizer(
sublinear_tf=True,
min_df=5,
stop_words='english',
strip_accents='unicode',
analyzer='char',
ngram_range=(2, 5),
norm='l2',
#dtype=np.float32,
max_features = 8000))
])
pipelinedt= Pipeline([
("tfidf", vectorizer),
("clfdt", DecisionTreeClassifier(criterion="entropy",max_depth=7)),
])
tree_para ={ "clfdt__max_depth": (7,25,100),
"clfdt__min_samples_leaf": (1,5,10),
"tfidf__word_vectorizer__max_df": (0.5, 0.75),
"tfidf__word_vectorizer__min_df": (2,),
"tfidf__char_vectorizer__max_df": (0.5, 0.75),
"tfidf__char_vectorizer__min_df": (3,),
"tfidf__char_vectorizer__use_idf": (True, False),
"tfidf__word_vectorizer__use_idf": (True, False),
"tfidf__word_vectorizer__ngram_range": ((1, 2), (2, 4), ),
"tfidf__char_vectorizer__ngram_range": ((4, 5),),
}
if __name__ == "__main__":
dt = GridSearchCV(pipelinedt, tree_para, cv=5, n_jobs=-1, verbose=1)
dt.fit(data_train, target_train)
best_clf= dt.best_estimator_
best_score= dt.best_score_
best_parameters = dt.best_estimator_.get_params()
print ("Best-clf es",best_clf)
print ("Best-score es",best_score)
print("Best Parameters: \n{}\n".format(dt.best_params_))
from sklearn import tree
feature = dt.best_estimator_.named_steps['clfdt'].tree_.feature
threshold = dt.best_estimator_.named_steps['clfdt'].tree_.threshold
node_indicator = dt.best_estimator_.named_steps['clfdt'].decision_path(pipelinedt.named_steps['tfidf'].get_feature_names_out())
sample_id = 6108
node_index = node_indicator.indices[
node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
]
print("Rules used to predict sample {id}:\n".format(id=sample_id))
for node_id in node_index:
# continue to the next node if it is a leaf node
if leaf_id[sample_id] == node_id:
continue
# check if value of the split feature for sample 0 is below threshold
if data_test[sample_id, feature[node_id]] <= threshold[node_id]:
threshold_sign = "<="
else:
threshold_sign = ">"
print(
"decision node {node} : (X_test[{sample}, {feature}] = {value}) "
"{inequality} {threshold})".format(
node=node_id,
sample=sample_id,
feature=feature[node_id],
value=data_test[sample_id, feature[node_id]],
inequality=threshold_sign,
threshold=threshold[node_id],
)
)
我尝试在许多替代方案上使用 node_indicator 变量但没有成功,因为我无法调用已矢量化和流水线化的功能。
实际上,在这项工作之后,我显然会对使用文本特征而不是矢量化特征打印出所选样本的决策树路径非常感兴趣。
任何人都可以帮助如何解决这个问题。我在上述试验中拼命尝试该方法的最后一个错误.get_feature_names_out()
是:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_13648/4239244359.py in <module>
9 threshold = dt.best_estimator_.named_steps['clfdt'].tree_.threshold
10
---> 11 node_indicator = dt.best_estimator_.named_steps['clfdt'].decision_path(pipelinedt.named_steps['tfidf'].get_feature_names_out()[2])
12 #node_indicator = dt.best_estimator_.named_steps['clfdt'].decision_path(pipelinedt.named_steps['tfidf'].fit_transform(data_test))
13 #pipe['tfid'].idf_
~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in decision_path(self, X, check_input)
542 indicates that the samples goes through the nodes.
543 """
--> 544 X = self._validate_X_predict(X, check_input)
545 return self.tree_.decision_path(X)
546
~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
431 """Validate the training data on predict (probabilities)."""
432 if check_input:
--> 433 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
434 if issparse(X) and (
435 X.indices.dtype != np.intc or X.indptr.dtype != np.intc
~\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
559 raise ValueError("Validation should be done on X, y or both.")
560 elif not no_val_X and no_val_y:
--> 561 X = check_array(X, **check_params)
562 out = X
563 elif no_val_X and not no_val_y:
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
736 array = array.astype(dtype, casting="unsafe", copy=False)
737 else:
--> 738 array = np.asarray(array, order=order, dtype=dtype)
739 except ComplexWarning as complex_warning:
740 raise ValueError(
ValueError: could not convert string to float: 'word_vectorizer__acpt is'