6

我有两个数据框。第一个(df1)包含所有感兴趣的列和行,但其中有缺失的观测值。另一个(df2)包含用于替换这些缺失观测值的值,并且只包含 df1 中至少存在一个 NA 的列和行。我想以某种方式合并这两个数据集,得到 desired.result。

这看起来是一个很容易解决的问题,但我一时毫无头绪。我没能让 merge 正常工作。也许可以写嵌套的 for 循环,但我还没有尝试。我也试过几次 aggregate。我有点不敢发这个问题,担心我的“R 资格证”会被吊销。如果这是重复问题,我很抱歉;我已经在本站和 Google 上相当仔细地搜索过。感谢任何建议。最好是只用基础 R(base R)的解决方案。

# Example data for the question.
# df1: the full table of counties and years, with some observations missing.
df1 <- read.table(header = TRUE, sep = "", text = "
  county year1 year2 year3
    aa     10    20   30
    bb      1    NA    3
    cc      5    10   NA
    dd    100    NA  200
")

# df2: replacement values; holds only the rows and columns of df1 that
# contain at least one NA.
df2 <- read.table(header = TRUE, sep = "", text = "
  county year2 year3
    bb      2   NA
    cc     NA   15
    dd    150   NA
")

# desired.result: df1 with every NA replaced by the matching df2 value.
desired.result <- read.table(header = TRUE, sep = "", text = "
  county year1 year2 year3
    aa     10    20   30
    bb      1     2    3
    cc      5    10   15
    dd    100   150  200
")
4

3 回答 3

9

aggregate可以这样做:

# Stack df1 and df2 with an outer merge, then collapse to one row per county.
# Each cell takes the first non-missing value found for that county/column.
# Unlike sum(..., na.rm = TRUE), this does not invent a 0 when the value is
# missing in both frames, and does not add values together when both frames
# supply one. The trailing [names(df1)] restores df1's column order, because
# merge() moves the shared columns to the front.
aggregate(. ~ county,
          data = merge(df1, df2, all = TRUE),  # Merged data, including NAs
          na.action = na.pass,                 # Keep rows with missing values...
          FUN = function(x) x[!is.na(x)][1]    # ...and take the first known value.
)[names(df1)]
##   county year1 year2 year3
## 1     aa    10    20    30
## 2     bb     1     2     3
## 3     cc     5    10    15
## 4     dd   100   150   200
于 2013-04-05T01:16:57.290 回答
2

这样做即可:

# Outer-join on county. Every other column present in both frames comes back
# twice: "<name>.x" (from df1) and "<name>.y" (from df2).
m <- merge(df1, df2, by = "county", all = TRUE)

# Value columns shared by both inputs; these are the ones merge() suffixed.
# Deriving them from the inputs avoids pattern-matching on ".x"/".y", which
# would also catch unrelated column names containing those substrings.
shared <- setdiff(intersect(names(df1), names(df2)), "county")

# drop = FALSE keeps these as data frames even when only one column is
# shared, and selecting by name guarantees dotx and doty are column-aligned.
dotx <- m[, paste0(shared, ".x"), drop = FALSE]
doty <- m[, paste0(shared, ".y"), drop = FALSE]

# Fill each missing df1 value with the df2 value from the same cell.
dotx[is.na(dotx)] <- doty[is.na(dotx)]

# Restore the original column names (safe even if a name contains a dot).
names(dotx) <- shared

# Unsuffixed columns (county and anything unique to one frame) + filled ones.
keep <- setdiff(names(m), c(paste0(shared, ".x"), paste0(shared, ".y")))
result <- cbind(m[, keep, drop = FALSE], dotx)

检查:

> result
  county year1 year2 year3
1     aa    10    20    30
2     bb     1     2     3
3     cc     5    10    15
4     dd   100   150   200
于 2013-04-05T00:43:20.673 回答
2

另一种选择是使用 reshape2,在长格式下处理:

library(reshape2)
## Reshape both frames to long format: one row per (county, variable).
## id.vars is given explicitly so melt() does not have to guess it.
df1.m <- melt(df1, id.vars = "county")
df2.m <- melt(df2, id.vars = "county")
## Find, for each df1 cell, the df2 row with the same county and variable.
## Matching on the key (not on row position) keeps the result correct when
## df2 is ordered differently from df1 or lacks some combinations.
key1 <- paste(df1.m$county, df1.m$variable, sep = "\t")
key2 <- paste(df2.m$county, df2.m$variable, sep = "\t")
pos <- match(key1, key2)
## Replace NA values only where df2 actually has a matching cell.
fill <- is.na(df1.m$value) & !is.na(pos)
df1.m$value[fill] <- df2.m$value[pos[fill]]
## Back to wide format.
dcast(data = df1.m, county ~ variable, value.var = "value")

  county year1 year2 year3
1     aa    10    20    30
2     bb     1     2     3
3     cc     5    10    15
4     dd   100   150   200
于 2013-04-05T01:18:11.517 回答