I am trying to fit distributions to my data using scipy.stats. The data sits in a single CSV file with multiple columns: col_1, col_2, col_3.

The problem is that the distribution fitting works on only one column at a time to identify the best-fitting distribution, as shown in the code below.

How can I get the distribution fit for all of the columns, e.g. fit distributions to col_1, col_2, and col_3?
```python
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy
import scipy.stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

# Load data and pick out individual columns
data_set = datasets.load_breast_cancer()

# Multiple columns of the CSV
col_1 = data_set.data[:, 0]
col_2 = data_set.data[:, 1]
col_3 = data_set.data[:, 2]

# Create an index array (x) for the data
x = np.arange(len(col_1))
size = len(col_1)

plt.hist(col_1)
plt.show()

# Standardise the data
sc = StandardScaler()
yy = col_1.reshape(-1, 1)
sc.fit(yy)
y_std = sc.transform(yy)
y_std = y_std.flatten()
del yy

dist_names = ['beta', 'expon', 'gamma', 'lognorm', 'norm',
              'pearson3', 'triang', 'uniform', 'weibull_min', 'weibull_max']

# Set up empty lists to store results
chi_square = []
p_values = []

# Set up 50 bins for the chi-square test
# Observed data will be approximately evenly distributed across all bins
percentile_bins = np.linspace(0, 100, 51)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = np.histogram(y_std, bins=percentile_cutoffs)
cum_observed_frequency = np.cumsum(observed_frequency)

# Loop through candidate distributions
for distribution in dist_names:
    # Set up distribution and get fitted distribution parameters
    dist = getattr(scipy.stats, distribution)
    param = dist.fit(y_std)

    # Obtain the KS test p-value, rounded to 5 decimal places
    p = scipy.stats.kstest(y_std, distribution, args=param)[1]
    p = np.around(p, 5)
    p_values.append(p)

    # Get expected counts in percentile bins
    # This is based on the cumulative distribution function (CDF)
    cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2],
                          loc=param[-2], scale=param[-1])
    expected_frequency = []
    for bin in range(len(percentile_bins) - 1):
        expected_cdf_area = cdf_fitted[bin + 1] - cdf_fitted[bin]
        expected_frequency.append(expected_cdf_area)

    # Calculate the chi-squared statistic
    expected_frequency = np.array(expected_frequency) * size
    cum_expected_frequency = np.cumsum(expected_frequency)
    ss = sum(((cum_expected_frequency - cum_observed_frequency) ** 2)
             / cum_observed_frequency)
    chi_square.append(ss)

# Collate results and sort by goodness of fit (best at top)
results = pd.DataFrame()
results['Distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results.sort_values(['chi_square'], inplace=True)

# Report results
print('\nDistributions sorted by goodness of fit:')
print('----------------------------------------')
print(results)
```
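For reference, here is a minimal sketch of one way the same percentile/chi-square procedure could be applied to each column in turn: the fitting loop above is wrapped in a helper function (the name `fit_distributions` is my own, not from any library) and called once per column. This is only an illustration of the idea, not a definitive answer.

```python
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.preprocessing import StandardScaler
from sklearn import datasets


def fit_distributions(y, dist_names, n_bins=50):
    """Hypothetical helper: chi-square and KS p-value for each candidate
    distribution, fitted to a single 1-D column of data."""
    size = len(y)
    # Standardise the column, as in the original code
    y_std = StandardScaler().fit_transform(y.reshape(-1, 1)).flatten()

    # Percentile bins so observed counts are roughly even across bins
    percentile_bins = np.linspace(0, 100, n_bins + 1)
    percentile_cutoffs = np.percentile(y_std, percentile_bins)
    observed_frequency, _ = np.histogram(y_std, bins=percentile_cutoffs)
    cum_observed_frequency = np.cumsum(observed_frequency)

    chi_square, p_values = [], []
    for distribution in dist_names:
        dist = getattr(scipy.stats, distribution)
        param = dist.fit(y_std)

        # KS test p-value, rounded to 5 decimal places
        p_values.append(np.around(
            scipy.stats.kstest(y_std, distribution, args=param)[1], 5))

        # Expected counts per bin from the fitted CDF
        cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2],
                              loc=param[-2], scale=param[-1])
        expected_frequency = np.diff(cdf_fitted) * size
        cum_expected_frequency = np.cumsum(expected_frequency)
        chi_square.append(sum(
            (cum_expected_frequency - cum_observed_frequency) ** 2
            / cum_observed_frequency))

    results = pd.DataFrame({'Distribution': dist_names,
                            'chi_square': chi_square,
                            'p_value': p_values})
    return results.sort_values('chi_square')


dist_names = ['beta', 'expon', 'gamma', 'lognorm', 'norm',
              'pearson3', 'triang', 'uniform', 'weibull_min', 'weibull_max']

data_set = datasets.load_breast_cancer()
columns = {'col_1': data_set.data[:, 0],
           'col_2': data_set.data[:, 1],
           'col_3': data_set.data[:, 2]}

# Run the same fitting routine once per column
for name, values in columns.items():
    print(f'\nDistributions sorted by goodness of fit for {name}:')
    print(fit_distributions(values, dist_names))
```

With a real CSV, the `columns` dict could instead be built from `pd.read_csv(...)`, iterating over whichever columns need fitting.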