作为对@unutbu 答案的补充,您还可以在 kstest 中提供测试分布的分布参数。假设我们有一些来自变量的样本(并将它们命名为 datax),我们想检查这些样本是否可能来自对数正态、均匀或正态。请注意,对于 scipy stats,为每个分布采用输入参数的方式会有所不同。现在,感谢 kstest 中的“args”(元组或序列),可以为您要测试的 scipy.stats 分布提供参数。
:) 我还添加了使用两个样本测试的选项,以防您想以任何一种方式进行:
import numpy as np
from math import sqrt
from scipy.stats import kstest, ks_2samp, lognorm
import scipy.stats
def KSSeveralDists(data,dists_and_args,samplesFromDists=100,twosampleKS=True):
returnable={}
for dist in dists_and_args:
try:
if twosampleKS:
try:
loc=dists_and_args[dist][0]
scale=dists_and_args[dist][1]
expression='scipy.stats.'+dist+'.rvs(loc=loc,scale=scale,size=samplesFromDists)'
sampledDist=eval(expression)
except:
sc=dists_and_args[dist][0]
loc=dists_and_args[dist][1]
scale=dists_and_args[dist][2]
expression='scipy.stats.'+dist+'.rvs(sc,loc=loc,scale=scale,size=samplesFromDists)'
sampledDist=eval(expression)
D,p=ks_2samp(data,sampledDist)
else:
D,p=kstest(data,dist,N=samplesFromDists,args=dists_and_args[dist])
except:
continue
returnable[dist]={'KS':D,'p-value':p}
return returnable
a=lambda m,std: m-std*sqrt(12.)/2.
b=lambda m,std: m+std*sqrt(12.)/2.
sz=2000
sc=0.5 #shape
datax=lognorm.rvs(sc,loc=0.,scale=1.,size=sz)
normalargs=(datax.mean(),datax.std())
#suppose these are the parameters you wanted to pass for each distribution
dists_and_args={'norm':normalargs,
'uniform':(a(*normalargs),b(*normalargs)),
'lognorm':[0.5,0.,1.]
}
print "two sample KS:"
print KSSeveralDists(datax,dists_and_args,samplesFromDists=sz,twosampleKS=True)
print "one sample KS:"
print KSSeveralDists(datax,dists_and_args,samplesFromDists=sz,twosampleKS=False)
它给出的输出如下:
two sample KS:
{'lognorm': {'KS': 0.023499999999999965, 'p-value': 0.63384188886455217}, 'norm': {'KS': 0.10600000000000004, 'p-value': 2.918766666723155e-10}, 'uniform': {'KS': 0.15300000000000002, 'p-value': 6.443660021191129e-21}}
one sample KS:
{'lognorm': {'KS': 0.01763415915126032, 'p-value': 0.56275820961065193}, 'norm': {'KS': 0.10792612430093562, 'p-value': 0.0}, 'uniform': {'KS': 0.14910036159697559, 'p-value': 0.0}}
Note: For the scipy.stats uniform distribution, a and b are taken as a=loc and b=loc + scale (see documentation).