0

我想绘制决策树分类器的 ROC 曲线,但我的代码引发了 `KeyError: 'key of type tuple not found and not a MultiIndex'` 错误。

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the fake-news and the true-news datasets
df_fake = pd.read_csv("C:/Users/User/Downloads/Fake.csv")
df_true = pd.read_csv("C:/Users/User/Downloads/True.csv")

# Label the rows: fake = 1, true = 0
df_true["class"] = 0
df_fake["class"] = 1

# Stack both datasets into a single frame. ignore_index=True gives the result
# a fresh 0..N-1 index; without it each source frame keeps its own row labels,
# so the combined frame would contain duplicate index values.
df = pd.concat([df_fake, df_true], ignore_index=True)

# Sanity check
print(f'N rows={len(df)}, M columns={len(df.columns)}')
# Shows the first rows in a notebook/REPL; has no visible effect in a script.
df.head()

数据框

df.head()
title text subject date class
0 唐纳德特朗普发出令人尴尬的新年'... 唐纳德特朗普不能希望所有美国人...... 消息 2017 年 12 月 31 日 1
1 喝醉了吹牛的特朗普员工开始俄罗斯... 众议院情报委员会主席德文·努... 消息 2017 年 12 月 31 日 1
2 警长大卫克拉克成为网络笑话...... 周五,据透露,前密尔沃克... 消息 2017 年 12 月 31 日 1
3 特朗普如此着迷,他甚至有奥巴马的名字...... 圣诞节那天,唐纳德·特朗普宣布... 消息 2017 年 12 月 31 日 1
4 教皇弗朗西斯刚刚喊出了唐纳德特朗普... 教皇弗朗西斯使用他的年度圣诞节主题... 消息 2017 年 12 月 31 日 1
# Represent each title as a TF-IDF vector (vocabulary capped at 50 terms)
# NOTE(review): the vectorizer is fitted on the full corpus before the split,
# so vocabulary/IDF statistics also see the test titles.
X = TfidfVectorizer(stop_words='english', max_features=50).fit_transform(df['title'])
N, M = X.shape  # N = number of titles, M = number of TF-IDF features

# Target labels (1 = fake, 0 = true)
y = df["class"]

# Divide dataset into a separate training dataset (80%) and test dataset (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=1)

# Baseline classification: always predict the most frequent class.
# Fit on the training split and score on the held-out split so the baseline is
# measured the same way as the real model, and keep the result instead of
# discarding it.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
baseline_accuracy = dummy_clf.score(X_test, y_test)
print(f'Baseline accuracy={baseline_accuracy:.3f}')


def plot_roc(y_score):
    """Compute per-column and micro-average ROC AUC values for ``y_score``.

    Reads the module-level ``y_test`` and ``M``. Despite its name, the function
    does not draw anything; it returns ``roc_auc[1]``.

    NOTE: as written, this function raises ``KeyError: 'key of type tuple not
    found and not a MultiIndex'`` on the first loop iteration. The code is kept
    unchanged because it is the reproducer for the reported traceback.
    """
    fpr = {}
    tpr = {}
    roc_auc = {}
    # M comes from ``N, M = X.shape``, i.e. the number of TF-IDF feature
    # columns, not the number of classes.
    for i in range(M):
        # y_test is a pandas Series (df["class"] after the split), so the
        # 2-D style index ``[:, i]`` reaches Series.__getitem__ as a tuple key
        # and raises the KeyError. A 1-D score array would reject ``[:, i]``
        # as well.
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area over the flattened inputs
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Only the AUC stored under key 1 is returned; the "micro" entry is unused.
    return roc_auc[1]


# Decision tree using the "gini" criterion
dt_gini = DecisionTreeClassifier(criterion="gini", random_state=0)
dt_gini.fit(X_train, y_train)

# A ROC curve is built by sweeping a threshold over a continuous score, so use
# the predicted probability of the positive class rather than the hard 0/1
# labels returned by predict(). Columns of predict_proba follow
# dt_gini.classes_, so column 1 corresponds to class 1 ("fake").
dt_gini_score = dt_gini.predict_proba(X_test)[:, 1]
plot_roc(dt_gini_score)

错误回溯(Traceback)

> Traceback (most recent call last):   File
> "C:/Users/User/PycharmProjects/Applied ML/Mod4.py", line 62, in
> <module>
>     plot_roc(dt_gini_score)   File "C:/Users/User/PycharmProjects/Applied ML/Mod4.py", line 49, in
> plot_roc
>     fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])   File "C:\Users\User\PycharmProjects\Applied
> ML\venv\lib\site-packages\pandas\core\series.py", line 966, in
> __getitem__
>     return self._get_with(key)   File "C:\Users\User\PycharmProjects\Applied
> ML\venv\lib\site-packages\pandas\core\series.py", line 981, in
> _get_with
>     return self._get_values_tuple(key)   File "C:\Users\User\PycharmProjects\Applied
> ML\venv\lib\site-packages\pandas\core\series.py", line 1016, in
> _get_values_tuple
>     raise KeyError("key of type tuple not found and not a MultiIndex") KeyError: 'key of type tuple not found and not a MultiIndex'
> 
> Process finished with exit code 1
4

1 回答 1

1

我们可以像这样重写 `plot_roc` 函数(我用问题中提供的示例数据集进行了测试):

from sklearn import metrics
from matplotlib import pyplot as plt


def plot_roc(y_test, y_score):
    """Plot the ROC curve of a binary classifier and return its AUC.

    Parameters
    ----------
    y_test : array-like
        True binary labels, e.g. a pandas Series or a NumPy array.
    y_score : array-like
        One score per sample for the positive class (probability estimates or
        decision values give a full curve; hard 0/1 predictions give a single
        operating point).

    Returns
    -------
    float
        Area under the plotted ROC curve.
    """
    # Imported locally so the snippet works without touching the file's imports.
    import numpy as np

    # np.ravel accepts any array-like and returns a flat ndarray. Calling
    # .ravel() on a pandas Series directly is deprecated since pandas 2.2.
    y_true = np.ravel(y_test)
    scores = np.ravel(y_score)

    # Binary problem: a single curve is enough, no per-class bookkeeping needed.
    fpr, tpr, _ = metrics.roc_curve(y_true, scores)
    roc_auc = metrics.auc(fpr, tpr)

    lw = 2
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier, drawn for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return roc_auc

然后这样调用它:

# Draw the ROC curve for the decision-tree scores against the held-out labels.
plot_roc(y_test=y_test, y_score=dt_gini_score)

得到预期的图形。

于 2021-09-25T22:07:18.753 回答