python - 具有矢量化和特征联合的决策树分类器管道的特定决策规则

Question

为了查看决策树分类器对某个样本实际应用的具体决策规则，我们需要使用 decision_path 方法：decision_path(X[, check_input])。

现在，在处理一个短文本分类模型时，我已经在文本特征的矢量化上流水线化了一个特征联合，并应用网格搜索来找到一个优化的模型，如下面的代码所示。

这使得我很难把特征矩阵 X 传给 decision_path 方法，因为我不断收到错误。
此外，我最终希望用文本特征（而不是向量化后的数值特征）来展示决策路径……

# Raw documents and their labels; `df` and `target_column` are defined
# outside this snippet.
data = df['doc_text']
target = target_column

# Split texts, labels and the original DataFrame index in lockstep so that
# each split row can still be traced back to its row in `df`.
(
    data_train,
    data_test,
    target_train,
    target_test,
    indices_train,
    indices_test,
) = train_test_split(data, target, df.index, random_state=0)

# Combine word-level and character-level TF-IDF n-gram features into a single
# feature matrix.
vectorizer = FeatureUnion(
    transformer_list=[
        (
            'word_vectorizer',
            TfidfVectorizer(
                analyzer='word',
                ngram_range=(2, 5),
                min_df=2,
                sublinear_tf=True,
                norm='l2',
                # Options that were tried and left disabled:
                #   strip_accents='unicode', encoding='latin-1',
                #   token_pattern=r'\w{1,}', max_features=6000,
                #   dtype=np.float32 (raised an error)
            ),
        ),
        (
            'char_vectorizer',
            TfidfVectorizer(
                analyzer='char',
                ngram_range=(2, 5),
                min_df=5,
                max_features=8000,
                strip_accents='unicode',
                # NOTE(review): scikit-learn documents stop_words as applying
                # only when analyzer == 'word' -- confirm whether this is
                # intended for a char analyzer.
                stop_words='english',
                sublinear_tf=True,
                norm='l2',
                # dtype=np.float32 left disabled here as well.
            ),
        ),
    ]
)

# Two-step pipeline: vectorize the raw text ("tfidf"), then fit a decision
# tree ("clfdt"). The step names are what the grid-search keys refer to.
pipelinedt = Pipeline(
    steps=[
        ("tfidf", vectorizer),
        ("clfdt", DecisionTreeClassifier(max_depth=7, criterion="entropy")),
    ]
)

# Hyper-parameter grid for GridSearchCV. Keys follow the
# "<step>__<sub-estimator>__<parameter>" naming of `pipelinedt`.
tree_para = {
    # decision tree
    "clfdt__max_depth": (7, 25, 100),
    "clfdt__min_samples_leaf": (1, 5, 10),
    # word-level TF-IDF
    "tfidf__word_vectorizer__max_df": (0.5, 0.75),
    "tfidf__word_vectorizer__min_df": (2,),
    "tfidf__word_vectorizer__use_idf": (True, False),
    "tfidf__word_vectorizer__ngram_range": ((1, 2), (2, 4)),
    # character-level TF-IDF
    "tfidf__char_vectorizer__max_df": (0.5, 0.75),
    "tfidf__char_vectorizer__min_df": (3,),
    "tfidf__char_vectorizer__use_idf": (True, False),
    "tfidf__char_vectorizer__ngram_range": ((4, 5),),
}

if __name__ == "__main__":
    # Grid-search the whole pipeline (vectorizer + tree) with 5-fold CV.
    dt = GridSearchCV(pipelinedt, tree_para, cv=5, n_jobs=-1, verbose=1)
    dt.fit(data_train, target_train)

    best_clf = dt.best_estimator_
    best_score = dt.best_score_
    best_parameters = dt.best_estimator_.get_params()

    print("Best-clf es", best_clf)
    print("Best-score es", best_score)
    print("Best Parameters: \n{}\n".format(dt.best_params_))

    from sklearn import tree

    # Use the steps of the *refit best pipeline*. `pipelinedt` is only the
    # template handed to GridSearchCV; the fitted copies live in
    # `dt.best_estimator_`.
    fitted_vectorizer = best_clf.named_steps['tfidf']
    fitted_tree = best_clf.named_steps['clfdt']

    # decision_path() needs the vectorized samples, not the feature names:
    # transform the raw test texts with the fitted vectorizer first.
    X_test = fitted_vectorizer.transform(data_test)
    # Column index -> n-gram name (prefixed with the vectorizer name), so the
    # rules can be printed with text features instead of column numbers.
    feature_names = fitted_vectorizer.get_feature_names_out()

    feature = fitted_tree.tree_.feature
    threshold = fitted_tree.tree_.threshold
    node_indicator = fitted_tree.decision_path(X_test)
    # Leaf reached by every test sample; used to skip the leaf node below.
    leaf_id = fitted_tree.apply(X_test)

    # Positional row of X_test (0 .. n_test-1), not a label of df.index;
    # the matching original label is indices_test[sample_id].
    sample_id = 6108
    if not 0 <= sample_id < X_test.shape[0]:
        raise IndexError(
            "sample_id {id} is out of range for a test set with {n} rows".format(
                id=sample_id, n=X_test.shape[0]
            )
        )

    # Ids of the nodes that this sample traverses, read from the CSR
    # indicator matrix returned by decision_path().
    node_index = node_indicator.indices[
        node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
    ]

    print("Rules used to predict sample {id}:\n".format(id=sample_id))
    for node_id in node_index:
        # continue to the next node if it is a leaf node
        if leaf_id[sample_id] == node_id:
            continue

        feature_idx = feature[node_id]
        # TF-IDF weight of the split feature for the selected sample.
        value = X_test[sample_id, feature_idx]

        # check if value of the split feature is below the node threshold
        threshold_sign = "<=" if value <= threshold[node_id] else ">"

        print(
            "decision node {node} : (X_test[{sample}, {feature}] = {value}) "
            "{inequality} {threshold})".format(
                node=node_id,
                sample=sample_id,
                # show the n-gram text rather than the numeric column id
                feature=repr(feature_names[feature_idx]),
                value=value,
                inequality=threshold_sign,
                threshold=threshold[node_id],
            )
        )

我尝试了多种方式来构造 node_indicator 变量，但都没有成功，因为我无法取得经过向量化和管道处理后的特征。

实际上，在这项工作之后，我显然会对使用文本特征而不是矢量化特征打印出所选样本的决策树路径非常感兴趣。

有人能帮忙解决这个问题吗？在上述尝试中，我最后一次使用 .get_feature_names_out() 方法时得到的错误是：

ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_13648/4239244359.py in <module>
      9 threshold = dt.best_estimator_.named_steps['clfdt'].tree_.threshold
     10 
---> 11 node_indicator = dt.best_estimator_.named_steps['clfdt'].decision_path(pipelinedt.named_steps['tfidf'].get_feature_names_out()[2])
     12 #node_indicator = dt.best_estimator_.named_steps['clfdt'].decision_path(pipelinedt.named_steps['tfidf'].fit_transform(data_test))
     13 #pipe['tfid'].idf_

~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in decision_path(self, X, check_input)
    542             indicates that the samples goes through the nodes.
    543         """
--> 544         X = self._validate_X_predict(X, check_input)
    545         return self.tree_.decision_path(X)
    546 

~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
    431         """Validate the training data on predict (probabilities)."""
    432         if check_input:
--> 433             X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
    434             if issparse(X) and (
    435                 X.indices.dtype != np.intc or X.indptr.dtype != np.intc

~\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    559             raise ValueError("Validation should be done on X, y or both.")
    560         elif not no_val_X and no_val_y:
--> 561             X = check_array(X, **check_params)
    562             out = X
    563         elif no_val_X and not no_val_y:

~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    736                     array = array.astype(dtype, casting="unsafe", copy=False)
    737                 else:
--> 738                     array = np.asarray(array, order=order, dtype=dtype)
    739             except ComplexWarning as complex_warning:
    740                 raise ValueError(

ValueError: could not convert string to float: 'word_vectorizer__acpt is'

python - 具有矢量化和特征联合的决策树分类器管道的特定决策规则

0 回答 0

Related

Reference