35

我正在做不同的文本分类实验。现在我需要计算每个任务的 AUC-ROC。对于二进制分类,我已经使用以下代码使其工作:

scaler = StandardScaler(with_mean=False)

enc = LabelEncoder()
y = enc.fit_transform(labels)

feat_sel = SelectKBest(mutual_info_classif, k=200)

clf = linear_model.LogisticRegression()

pipe = Pipeline([('vectorizer', DictVectorizer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('mutual_info', feat_sel),
                 ('logistregress', clf)])
y_pred = model_selection.cross_val_predict(pipe, instances, y, cv=10)
# instances is a list of dictionaries

#visualisation ROC-AUC

fpr, tpr, thresholds = roc_curve(y, y_pred)
auc = auc(fpr, tpr)
print('auc =', auc)

plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',
label='AUC = %0.2f'% auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

但现在我需要为多类分类任务做这件事。我在某处读到需要对标签进行二值化处理,但我真的不知道如何计算多类分类的 ROC。尖端?

4

4 回答 4

45

正如人们在评论中提到的,您必须使用OneVsAll方法将问题转换为二进制,因此您将拥有n_class许多 ROC 曲线。

一个简单的例子:

from sklearn.metrics import roc_curve, auc
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X, y = iris.data, iris.target

y = label_binarize(y, classes=[0,1,2])
n_classes = 3

# shuffle and split training and test sets
X_train, X_test, y_train, y_test =\
    train_test_split(X, y, test_size=0.33, random_state=0)

# classifier
clf = OneVsRestClassifier(LinearSVC(random_state=0))
y_score = clf.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot of a ROC curve for a specific class
for i in range(n_classes):
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

在此处输入图像描述在此处输入图像描述在此处输入图像描述

于 2017-07-26T19:04:55.383 回答
14

这对我有用,如果你想让它们在同一个情节上很好。它类似于@omdv 的答案,但可能更简洁一些。

def plot_multiclass_roc(clf, X_test, y_test, n_classes, figsize=(17, 6)):
    y_score = clf.decision_function(X_test)

    # structures
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    # calculate dummies once
    y_test_dummies = pd.get_dummies(y_test, drop_first=False).values
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_dummies[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # roc for each class
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    for i in range(n_classes):
        ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i))
    ax.legend(loc="best")
    ax.grid(alpha=.4)
    sns.despine()
    plt.show()

plot_multiclass_roc(full_pipeline, X_test, y_test, n_classes=16, figsize=(16, 10))
于 2019-12-13T16:57:53.880 回答
2

要绘制多类 ROC 使用label_binarize函数和以下代码。根据您的应用程序调整和更改代码。


使用 Iris 数据的示例:

import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from itertools import cycle

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=0))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

fpr = dict()
tpr = dict()
roc_auc = dict()
lw=2
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
colors = cycle(['blue', 'red', 'green'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()

在本例中,您可以打印y_score.

print(y_score)

array([[-3.58459897, -0.3117717 ,  1.78242707],
       [-2.15411929,  1.11394949, -2.393737  ],
       [ 1.89199335, -3.89592195, -6.29685764],
       .
       .
       .

在此处输入图像描述

于 2021-11-17T10:16:43.590 回答
0

试试这个方法。它对我有用,使用起来也很简单

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from itertools import cycle
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import OneHotEncoder

#One-hot encoder
y_valid=y_test.values.reshape(-1,1)
ypred=pred_final.reshape(-1,1)
y_valid = pd.DataFrame(y_test)
ypred=pd.DataFrame(pred_final)

onehotencoder = OneHotEncoder()
y_valid= onehotencoder.fit_transform(y_valid).toarray()
ypred = onehotencoder.fit_transform(ypred).toarray()

n_classes = ypred.shape[1]

# Plotting and estimation of FPR, TPR
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_valid[:, i], ypred[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
colors = cycle(['blue', 'green', 'red','darkorange','olive','purple','navy'])
for i, color in zip(range(n_classes), colors):
pyplot.plot(fpr[i], tpr[i], color=color, lw=1.5, label='ROC curve of class {0} 
(area = {1:0.2f})' ''.format(i+1, roc_auc[i]))
pyplot.plot([0, 1], [0, 1], 'k--', lw=1.5)
pyplot.xlim([-0.05, 1.0])
pyplot.ylim([0.0, 1.05])
pyplot.xlabel('False Positive Rate',fontsize=12, fontweight='bold')
pyplot.ylabel('True Positive Rate',fontsize=12, fontweight='bold')
pyplot.tick_params(labelsize=12)
pyplot.legend(loc="lower right")
ax = pyplot.axes()
pyplot.show()
于 2022-02-19T06:07:46.513 回答