2

我有一个包含大量缺失值的数据集。我想通过取两列的最大值来消除其中一部分缺失值。我的数据集如下:

# Example data: 20 rows with two numeric columns drawn uniformly and scaled by 100.
df1 <- data.frame(
  id = 1:20,
  col1 = runif(20) * 100,
  col2 = runif(20) * 100
)

# Blank out a handful of entries per column; rows 7 and 18 end up missing in both.
df1$col1[c(1, 3, 7, 12, 18)] <- NA
df1$col2[c(2, 4, 7, 14, 18)] <- NA

> head(df1,10)
   id     col1      col2
1   1       NA 71.073951
2   2 20.51551        NA
3   3       NA 14.308692
4   4 82.31036        NA
5   5 15.89198 44.190152
6   6 82.72071 20.761311
7   7       NA        NA
8   8 87.80027  8.615998
9   9 26.98488 90.835827
10 10 14.61701  5.184187

经过多次反复试验,我发现要求出这两列的 `max` 并同时处理 `NA`,唯一的方法是嵌套 `ifelse` 语句:

# Start from an all-NA column; it is also the fallback of the innermost ifelse.
df1$col3 <- NA
df1$col3 <- with(df1, {
  miss1 <- is.na(col1)
  miss2 <- is.na(col2)
  ifelse(
    miss1 & miss2,
    NA,                                   # nothing observed in either column
    ifelse(
      !miss1 & miss2,
      col1,                               # only col1 observed
      ifelse(
        miss1 & !miss2,
        col2,                             # only col2 observed
        ifelse(
          !miss1 & !miss2,
          # both observed: plain row-wise max (no NA involved on these rows)
          apply(df1[, c("col1", "col2")], 1, max),
          col3
        )
      )
    )
  )
})

结果如下:

> df1
   id     col1      col2     col3
1   1       NA 71.073951 71.07395
2   2 20.51551        NA 20.51551
3   3       NA 14.308692 14.30869
4   4 82.31036        NA 82.31036
5   5 15.89198 44.190152 44.19015
6   6 82.72071 20.761311 82.72071
7   7       NA        NA       NA
8   8 87.80027  8.615998 87.80027
9   9 26.98488 90.835827 90.83583
10 10 14.61701  5.184187 14.61701

这种写法似乎过于复杂。有没有人有更好的解决方案?

4

3 回答 3

2

一步到位的解法,同时处理整行均为 `NA` 的情况以及由此产生的 `-Inf` 问题:

# Row-wise maximum of col1 and col2 that skips NAs. A row with no observed
# value yields NA explicitly, because max(..., na.rm = TRUE) on an all-NA row
# would warn and return -Inf. The condition is a scalar, so a plain if/else is
# used rather than the vectorised ifelse().
df1$col3 <- apply(
  df1[c("col1", "col2")],
  1,
  function(x) if (all(is.na(x))) NA_real_ else max(x, na.rm = TRUE)
)

或者改用 `pmax`:

# pmax() is already element-wise across its arguments, so no row-wise apply()
# is needed. With na.rm = TRUE an element is NA only when both inputs are NA.
df1$col3 <- pmax(df1$col1, df1$col2, na.rm = TRUE)

结果:

   id      col1      col2      col3
1   1        NA 18.614950 18.614950
2   2 13.492240        NA 13.492240
3   3        NA  3.430721  3.430721
4   4 51.825729        NA 51.825729
5   5 54.134056 61.749744 61.749744
6   6 14.953350 44.932748 44.932748
7   7        NA        NA        NA
于 2013-06-11T05:20:43.727 回答
2
# Work on a copy in which every NA is replaced by -Inf, so that max() always
# has a value to compare and never returns NA for a partially observed row.
df2 <- df1
df2[is.na(df2)] <- -Inf
df1$col3 <- apply(df2[, 2:3], 1, max)
# Restore NA only in col3, and only for rows where both inputs were missing.
# Replacing every -Inf in the whole data frame would also wipe genuine -Inf
# values from the source columns.
df1$col3[rowSums(!is.na(df1[, 2:3])) == 0] <- NA

> head(df1, 10)
   id     col1     col2     col3
1   1       NA 37.28201 37.28201
2   2 88.24088       NA 88.24088
3   3       NA 39.17717 39.17717
4   4 50.17916       NA 50.17916
5   5 98.85015 17.52801 98.85015
6   6 55.52681 19.26055 55.52681
7   7       NA       NA       NA
8   8 63.04393 39.31208 63.04393
9   9 61.32155 62.51975 62.51975
10 10  3.36093 57.22048 57.22048

如果您可以接受 `max()` 在整行均为 `NA` 时发出的警告,可以将其简化为:

# max(..., na.rm = TRUE) warns and returns -Inf for rows where every value is NA.
df1$col3 <- apply(df1[, 2:3], 1, max, na.rm = TRUE)
# Turn those -Inf results back into NA, touching only the new column so that
# any genuine -Inf elsewhere in df1 is left alone.
df1$col3[df1$col3 == -Inf] <- NA
于 2013-06-11T05:12:26.553 回答
2

matrixStats 包中的 `rowMaxs` 在这种情况下非常有用:

library(matrixStats)
# Row-wise maximum over the two value columns, skipping NAs.
df1$col3 <- rowMaxs(
  as.matrix(df1[c("col1", "col2")]),
  na.rm = TRUE
)

您仍然需要将结果中的非有限值(如 `-Inf`)替换为 `NA`:

# Mark every non-finite entry of col3 (Inf, -Inf, NaN) as NA.
is.na(df1$col3) <- !is.finite(df1$col3)
> df1
   id       col1        col2     col3
1   1         NA 96.28183765 96.28184
2   2 87.2118114          NA 87.21181
3   3         NA 32.29219511 32.29220
4   4 22.4360128          NA 22.43601
5   5 65.1566856 66.41860327 66.41860
6   6  3.2917126 98.97801816 98.97802
7   7         NA          NA       NA
8   8 54.0993429 75.50337298 75.50337
9   9 63.2100595  0.09996961 63.21006
10 10 17.7253830 15.14493935 17.72538
11 11 12.4315424 42.89584451 42.89584
12 12         NA 44.14509016 44.14509
13 13 75.6886849 89.15621126 89.15621
14 14 53.5786272          NA 53.57863
15 15  0.8187993 98.81274502 98.81275
16 16 23.9812847  1.02842648 23.98128
17 17 24.3978092 26.65967294 26.65967
18 18         NA          NA       NA
19 19 36.5956202 88.78191581 88.78192
20 20 27.3246589 70.89648896 70.89649
于 2013-06-11T15:12:51.870 回答