有没有一种方法可以将 SMOTE 与 NaN 一起使用?
这是一个在存在 NaN 值的情况下尝试使用 SMOTE 的虚拟程序
# Imports
from collections import Counter
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import Imputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
# Load data
bc = load_breast_cancer()
X, y = bc.data, bc.target
# Initial number of samples per class
print('Number of samples for both classes: {} and {}.'.format(*Counter(y).values()))
# SMOTEd class distribution
print('Dataset has %s missing values.' % np.isnan(X).sum())
_, y_resampled = SMOTE().fit_sample(X, y)
print('Number of samples for both classes: {} and {}.'.format(*Counter(y_resampled).values()))
# Generate artificial missing values
X[X > 1.0] = np.nan
print('Dataset has %s missing values.' % np.isnan(X).sum())
#_, y_resampled = make_pipeline(Imputer(), SMOTE()).fit_sample(X, y)
sm = SMOTE(ratio = 'auto',k_neighbors = 5, n_jobs = -1)
smote_enn = SMOTEENN(smote = sm)
x_train_res, y_train_res = smote_enn.fit_sample(X, y)
print('Number of samples for both classes: {} and {}.'.format(*Counter(y_resampled).values()))
我得到以下输出/错误:
Number of samples for both classes: 212 and 357.
Dataset has 0 missing values.
Number of samples for both classes: 357 and 357.
Dataset has 6051 missing values.
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').