我想绘制决策树分类器的 ROC 曲线。我的代码引发key of type tuple not found and not a MultiIndex
错误。
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the Fake news dataset
df_fake = pd.read_csv("C:/Users/User/Downloads/Fake.csv")
# Load the True news dataset
df_true = pd.read_csv("C:/Users/User/Downloads/True.csv")
# Set fake as 1 and true as 0
df_true["class"] = 0
df_fake["class"] = 1
# Concatenate true and fake datasets
df = pd.concat([df_fake, df_true])
# Sanity check
print(f'N rows={len(df)}, M columns={len(df.columns)}')
df.head()
数据框
df.head()
标题 | 文本 | 主题 | 日期 | 班级 | |
---|---|---|---|---|---|
0 | 唐纳德特朗普发出令人尴尬的新年'... | 唐纳德特朗普不能希望所有美国人...... | 消息 | 2017 年 12 月 31 日 | 1 |
1 | 喝醉了吹牛的特朗普员工开始俄罗斯... | 众议院情报委员会主席德文·努... | 消息 | 2017 年 12 月 31 日 | 1 |
2 | 警长大卫克拉克成为网络笑话...... | 周五,据透露,前密尔沃克... | 消息 | 2017 年 12 月 31 日 | 1 |
3 | 特朗普如此着迷,他甚至有奥巴马的名字...... | 圣诞节那天,唐纳德·特朗普宣布... | 消息 | 2017 年 12 月 31 日 | 1 |
4 | 教皇弗朗西斯刚刚喊出了唐纳德特朗普... | 教皇弗朗西斯使用他的年度圣诞节主题... | 消息 | 2017 年 12 月 31 日 | 1 |
# Extract title column in Tf-IDf
X = TfidfVectorizer(stop_words='english', max_features=50).fit_transform(df['title'])
N, M = X.shape
# Dataframe info
y = df["class"]
# Divide dataset into a separate training dataset (80%) and test dataset (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=1)
# Baseline classification
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, y)
dummy_clf.predict(X)
dummy_clf.score(X, y)
def plot_roc(y_score):
fpr = {}
tpr = {}
roc_auc = {}
for i in range(M):
fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
return roc_auc[1]
# Decision tree using the "gini" criterion
dt_gini = DecisionTreeClassifier(criterion="gini", random_state=0)
dt_gini_score = dt_gini.fit(X_train, y_train).predict(X_test)
plot_roc(dt_gini_score)
追溯
> Traceback (most recent call last): File
> "C:/Users/User/PycharmProjects/Applied ML/Mod4.py", line 62, in
> <module>
> plot_roc(dt_gini_score) File "C:/Users/User/PycharmProjects/Applied ML/Mod4.py", line 49, in
> plot_roc
> fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) File "C:\Users\User\PycharmProjects\Applied
> ML\venv\lib\site-packages\pandas\core\series.py", line 966, in
> __getitem__
> return self._get_with(key) File "C:\Users\User\PycharmProjects\Applied
> ML\venv\lib\site-packages\pandas\core\series.py", line 981, in
> _get_with
> return self._get_values_tuple(key) File "C:\Users\User\PycharmProjects\Applied
> ML\venv\lib\site-packages\pandas\core\series.py", line 1016, in
> _get_values_tuple
> raise KeyError("key of type tuple not found and not a MultiIndex") KeyError: 'key of type tuple not found and not a MultiIndex'
>
> Process finished with exit code 1