我假设每个块(分层)至少有两个条目,并且当某个块的条目多于两个时,您希望两组的分配比例尽可能接近 80/20。最简单的方法似乎是先为所有行分配一个随机数,然后在每个分层内部根据该随机数的排名(百分位数)进行选择。假设文件 strat_sample.csv 中的数据如下:
Index_1,Index_2,Data_1,Data_2
0,0,0.614583182,0.677644482
0,0,0.321384981,0.598450854
0,0,0.303029607,0.300593782
0,0,0.646010758,0.612006715
0,0,0.484572883,0.30052535
0,1,0.010625416,0.118671475
0,1,0.428967984,0.23795173
0,1,0.523440618,0.457275922
0,1,0.379612652,0.337640868
0,1,0.338180659,0.206399031
1,0,0.079386,0.890939911
1,0,0.572864624,0.725615079
1,0,0.045891404,0.300128917
1,0,0.578792198,0.100698871
1,0,0.776485138,0.475135948
1,0,0.401850419,0.784835723
1,1,0.087660923,0.497299605
1,1,0.8460978,0.825774802
1,1,0.526015021,0.581905971
1,1,0.23324672,0.299475291
那么下面这段代码(使用 Pandas 数据结构)即可按要求完成分配:
import numpy as np
import random as rnd
import pandas as pd
#sample data strat_sample.csv, contents to follow
def TreatmentOneCount(n , *args):
    """Return how many members of a block of size ``n`` go to group 1.

    The count is chosen as close as possible to ``OptimalRatio * n`` while
    guaranteeing that both groups receive at least one member.

    Args:
        n: Number of members in the block.
        *args: ``args[0]`` is the optimal fraction of the block that should
            be assigned to group 1 (e.g. ``0.8``).

    Returns:
        An ``int`` in ``[1, n - 1]`` when ``n >= 2``. When ``n < 2`` a
        message is printed and ``nan`` is returned, because a split into two
        non-empty groups is not defined.
    """
    OptimalRatio = args[0]
    if n < 2:
        print("N too small, assignment not defined.")
        return float("nan")
    if n == 2:
        # Only one split keeps both groups non-empty.
        return 1

    # Two integers bracket the target: one below (floor) and one above
    # (ceil). Pick the nearer one; an exact .5 fraction rounds down.
    targetassigment = OptimalRatio * n
    lower = int(np.floor(targetassigment))
    if targetassigment - lower > 0.5:
        a = lower + 1
    else:
        a = lower

    # Clamp so neither group ends up empty (n > 2 here): a result of 0 is
    # raised to 1 and a result of n (or more) is lowered to n - 1.
    return min(max(a, 1), n - 1)
# Load the sample data; the first row holds the column names.
df = pd.read_csv('strat_sample.csv', sep=',', header=0)

# Columns that together identify a block (stratum).
block_keys = ['Index_1', 'Index_2']

# Assign a random number to each entry.
df['RandScore'] = np.random.uniform(0, 1, df.shape[0])
# DataFrame.sort(columns=...) was removed from pandas; sort_values is the
# replacement.
df = df.sort_values(by=block_keys + ['RandScore'])

# Within each block assign a rank based on the random number.
df['RandRank'] = df.groupby(block_keys)['RandScore'].rank()

# Store the member count of each block on every row of that block. Grouping
# on the real key columns avoids the collisions a concatenated string key
# can produce (e.g. "1" + "11" and "11" + "1" are both "111").
df['Counts'] = df.groupby(block_keys)['RandScore'].transform('count')

# Make the actual assignments: rows whose rank falls within the group-1
# quota of their block get 1, all remaining rows get 2.
group_one_quota = df['Counts'].apply(TreatmentOneCount, args=(0.8,))
df['Assignment'] = np.where(df['RandRank'] <= group_one_quota, 1, 2)

# Remove the helper columns; drop() returns a new frame, so rebind it.
df = df.drop(columns=['Counts', 'RandRank', 'RandScore'])