更新答案:
我只是将答案包装在一个函数中。请注意,这仅对 4 个级别有效。
colnames(Data)<-c("Individuals","Environments","YD") #removed spaces from names
myfun <- function(DF, samplefrom, samplelevels, sampletype, samplesize)
{
if(sampletype == "per1")
{
Env1 = sample(unique(DF[[samplefrom]]), samplesize)
Env2 <- Env3 <- Env4 <- Env1
}
if(sampletype == "per4")
{
Env1 = sample(unique(DF[[samplefrom]]), samplesize)
Env2 = sample(unique(DF[[samplefrom]])[!unique(DF[[samplefrom]]) %in% Env1], samplesize)
Env3 = sample(unique(DF[[samplefrom]])[!unique(DF[[samplefrom]]) %in% c(Env1, Env2)], samplesize)
Env4 = sample(unique(DF[[samplefrom]])[!unique(DF[[samplefrom]]) %in% c(Env1, Env2, Env3)], samplesize)
}
if(sampletype == "per2")
{
Env1 = sample(unique(DF[[samplefrom]]), samplesize)
Env2 <- Env1
Env3 = sample(unique(DF[[samplefrom]])[!unique(DF[[samplefrom]]) %in% Env1], samplesize)
Env4 <- Env3
}
ret = do.call(rbind, mapply(function(ind, env) {df <- Data[DF[[samplelevels]] == env,];
df[df[[samplefrom]] %in% ind,]},
env = as.list(sample(unique(DF[[samplelevels]]))), ind = list(Env1, Env2, Env3, Env4),
SIMPLIFY = F)) #in `env = ` added `sample` to select the environments
#in random order and assign them the individuals
return(ret)
}
myfun(Data, "Individuals", "Environments", "per1", 2)
# Individuals Environments YD
#21 13954 1 0.6658681
#345 457982 1 -1.1022770
#620 13954 2 -0.4888968
#944 457982 2 0.6026167
#1219 13954 4 -0.7183965
#1543 457982 4 0.4881141
#1818 13954 5 0.2660623
#2142 457982 5 -2.0626073
myfun(Data, "Individuals", "Environments", "per2", 2)
# Individuals Environments YD
#25 15292 1 -1.1272386
#248 373045 1 -0.6659416
#624 15292 2 -0.2362053
#847 373045 2 0.5778210
#1260 62150 4 1.2077921
#1654 1541043 4 1.1406084
#1859 62150 5 -0.3358584
#2253 1541043 5 0.3897426
myfun(Data, "Individuals", "Environments", "per4", 2)
# Individuals Environments YD
#106 85786 1 1.4480500
#567 3830162 1 -1.8052577
#1029 1301802 2 0.2737786
#1043 1410845 2 1.0617118
#1630 1302304 4 0.6673241
#1678 1766332 4 -0.0451913
#1871 65315 5 -0.0597450
#2336 2621166 5 2.5590801
更新 2一些评论
mapply
将函数顺序应用于多个参数。在这里,该函数有两个参数:ind
和env
。函数 1) 对数据帧进行子集化,env
并且 2) 对子集化数据帧进行子集化ind
。env
是一个环境,是先前在 中计算的个体 ( , ...)ind
的样本。要成为的函数的多个参数是:[1, 2, 3, 4] 和:[Env1, Env2, Env3, Env4]。依次获取and ,等等,并在列表中给出结果(必要的子集)。在数据框输出中加入列表。Env1
myfun
mapplied
env
ind
mapply
env = 1
ind = Env1
env = 2
ind = Env2
do.call(rbind,)
PS请注意,因为sample
is usedenv
可以是 [1, 2, 3, 4] 或 [2, 4, 3, 1] 或其他任何东西,所以函数(将mapplied
)参数的顺序组合不仅是env = 1
andind = Env1
而是env = 1 or 2 or 3 or 4
and ind = Env1
, and很快。
更新 3 和 4功能不同的 No 级别
No_different_samples
是您希望采集的不同样本的数量;我将其设置为默认数量samplelevels
(即每个级别都有不同的样本)。如果No_different_samples
不适合 No 级别(即,如果您想要来自具有 4 个级别的总体中的 3 个不同样本(作为您的Data
),它会引发错误;您必须选择 1 或 2 或4.
myfun2 <- function(DF, samplefrom, samplelevels,
No_different_samples = NULL, grouping = NULL, samplesize)
{
samp <- sample(unique(DF[[samplefrom]]))
levs <- unique(DF[[samplelevels]])
if(is.null(No_different_samples)) No_different_samples <- length(levs)
if(is.null(grouping)) grouping <- c(1, 1, 1, 1)
if(length(levs) %% No_different_samples) stop("an error message here")
if(length(samp) < No_different_samples * samplesize)
stop("can't take a sample this large from the population")
ls_diffr_samps <- vector("list", length = No_different_samples)
for(i in 1:No_different_samples)
{
ls_diffr_samps[[i]] <- samp[(i * samplesize - (samplesize - 1)) : (i * samplesize)]
}
list_samples <- rep(ls_diffr_samps, times = grouping)
ret = do.call(rbind, mapply(function(ind, env) {df <- DF[DF[[samplelevels]] == env,];
df[df[[samplefrom]] %in% ind,]},
env = as.list(sample(levs)), ind = list_samples,
SIMPLIFY = F))
return(ret)
}
myfun2(Data, "Individuals", "Environments", 1, 4, 2) #same sample for all
myfun2(Data, "Individuals", "Environments", 2, c(2, 2), 2) #same sample per 2
myfun2(Data, "Individuals", "Environments", 2, c(3, 1), 2) #same sample for 3
myfun2(Data, "Individuals", "Environments", 4, c(1, 1, 1, 1), 2) #different sample for all