正如 @db 几个小时前指出的那样,您面临的选择是:要么对现有数据进行重采样,要么假设它服从某种理论分布(例如用 `rnorm` 生成的正态分布)。很明显,您所模仿的那篇文章的作者选择了后者。新数据集的摘要清楚地显示出原始 `iris` 中没有的值,并且某些 setosa 的 Petal.Width 为 `NA`,因为在大样本中,必然会有测量值落到 0 以下。
下面是一段简单粗糙(quick and dirty)的代码,您应该可以根据自己的数据进行调整。
# Simulate a larger iris-like data set by drawing each measurement
# independently from a normal distribution whose mean and standard deviation
# are the per-species values observed in `iris`.
# Requires dplyr >= 1.1.0 (across(), if_all(), reframe()).
set.seed(2020)
library(dplyr)

n_per_species <- 1500 # simulated rows per species, before filtering
min_plausible <- 0.1 # draws at or below this value are discarded

testing <- iris %>%
  # One row per species holding <column>_mean and <column>_sd.
  group_by(Species) %>%
  summarise(
    across(Sepal.Length:Petal.Width, list(mean = mean, sd = sd)),
    .groups = "drop"
  ) %>%
  # Expand each species' parameter row into n_per_species simulated rows.
  # reframe() is the supported way to return several rows per group and
  # always yields an ungrouped result.
  group_by(Species) %>%
  reframe(
    Sepal.Length = rnorm(
      n_per_species,
      mean = Sepal.Length_mean,
      sd = Sepal.Length_sd
    ),
    Sepal.Width = rnorm(
      n_per_species,
      mean = Sepal.Width_mean,
      sd = Sepal.Width_sd
    ),
    Petal.Length = rnorm(
      n_per_species,
      mean = Petal.Length_mean,
      sd = Petal.Length_sd
    ),
    Petal.Width = rnorm(
      n_per_species,
      mean = Petal.Width_mean,
      sd = Petal.Width_sd
    )
  ) %>%
  # rnorm() is unbounded, so keep only rows in which every measurement is
  # above the threshold (eliminates ridiculously small or negative values).
  # Note this leaves the species with unequal row counts.
  filter(if_all(Sepal.Length:Petal.Width, ~ .x > min_plausible))
summary(testing)
#> Species Sepal.Length Sepal.Width Petal.Length
#> setosa :1368 Min. :3.784 Min. :1.719 Min. :0.8857
#> versicolor:1500 1st Qu.:5.168 1st Qu.:2.746 1st Qu.:1.6116
#> virginica :1500 Median :5.834 Median :3.014 Median :4.2998
#> Mean :5.855 Mean :3.047 Mean :3.8148
#> 3rd Qu.:6.443 3rd Qu.:3.322 3rd Qu.:5.2312
#> Max. :8.304 Max. :4.547 Max. :7.5825
#> Petal.Width
#> Min. :0.1001
#> 1st Qu.:0.3373
#> Median :1.3439
#> Mean :1.2332
#> 3rd Qu.:1.8460
#> Max. :3.0523
比我更熟练的人或许可以借助 `pivot_longer` 或自定义函数,避免重复调用 4 次 `rnorm`,把代码写得更好。至于如何找出不合理的值,以及论证 `rnorm` 为什么适合您的数据,就要由您自己来决定了。
下面补充一个更复杂的解决方案:使用 `MASS::mvrnorm` 来考虑 Remi 在他的回答中提到的变量之间的相关性。抱歉,我懒得去想更好的写法,这里只是简单粗暴地重复代码。
library(dplyr)
# Simulate a larger iris-like data set with MASS::mvrnorm(), which draws all
# four measurements jointly and so keeps the within-species correlations.

#' Simulate measurements for a single species.
#'
#' Uses that species' own mean vector and covariance matrix. (Passing the
#' pooled means, or reusing one species' draws for another, would make the
#' species indistinguishable.)
#'
#' @param species One value of `data$Species`.
#' @param n Number of rows to simulate.
#' @param data Data frame with a `Species` column; all other columns must be
#'   numeric measurements.
#' @return A data frame with `n` simulated rows and a `Species` column.
simulate_species <- function(species, n = 1500, data = iris) {
  measurements <- data[
    data$Species == species,
    names(data) != "Species",
    drop = FALSE
  ]
  simulated <- MASS::mvrnorm(
    n = n,
    mu = colMeans(measurements),
    Sigma = cov(measurements),
    # empirical = TRUE makes the sample mean and covariance match mu and
    # Sigma exactly rather than only in expectation.
    empirical = TRUE
  )
  data.frame(simulated, Species = species)
}

# Generate the samples species by species, in factor-level order.
set.seed(2020)
species_levels <- levels(iris$Species)
newiris <- do.call(
  rbind,
  lapply(species_levels, simulate_species, n = 1500)
)
# Return Species to a factor with the same level order as iris.
newiris$Species <- factor(newiris$Species, levels = species_levels)
summary(newiris)
# NOTE: printed output is not reproduced here; re-run to regenerate it.
# Because empirical = TRUE and every species contributes the same number of
# rows, the "Mean" row equals the one in summary(iris), and each species has
# 1500 rows.
# Summary of the built-in iris data (50 rows per species, per the output
# below); presumably shown for comparison with the simulated data set.
summary(iris)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width
#> Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
#> 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
#> Median :5.800 Median :3.000 Median :4.350 Median :1.300
#> Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
#> 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
#> Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#> Species
#> setosa :50
#> versicolor:50
#> virginica :50
#>
#>
#>