0

在我的数据框中,有一些行具有相同的 ID,但测试年份和年龄的值不同。我想折叠重复的行并为不同的值创建新列。

我是 R 的新手,并且已经为此苦苦挣扎了一段时间。

这是数据框:

>df
    id 项目 testyr1 testyr2 age1 age2
1 16S AS 2008 不适用 29 不适用
2 32S AS 2004 不适用 30 不适用
3 37S 不适用 2011 不适用 36
4 50S AS 2004 不适用 23 不适用
5 50S AS 1998 不适用 16 不适用
6 55S AS 2007 不适用 28 不适用

testyr1应该有最早的年份和testyr2最晚的年份。age1应该是小年和age2大年。

输出应该是:

      id 项目 testyr1 testyr2 age1 age2   
1 16S AS 2008 不适用 29 不适用  
2 32S AS 2004 不适用 30 不适用  
3 37S 不适用 2011 不适用 36  
4 50S AS 1998 2004 16 23  
6 55S AS 2007 不适用 28 不适用  

我试图写一个循环,但不知道如何结束它:

df.undup <- c()
df.undup <- c()    
for (i in 1:nrow(df)){   
  if i == i+1    
    df$testyr1 != NA {   

    testyr2 = max(testyr1)   
    testyr1 = min(testyr1)   
    nage2 = max(nage1)   
    nage1 = min(nage1)   
  }   
 else{   
    testyr2 = max(testyr2)   
    testyr1 = min(testyr2)   
    nage2 = max(nage2)   
    nage1 = min(nage2)   
  }   
}   

任何帮助将不胜感激。

4

2 回答 2

3
library(plyr)

data <- read.csv(textConnection("id,project,testyr1,testyr2,age1,age2
16S,AS,2008,NA,29,NA
32S,AS,2004,NA,30,NA
37S,AS,NA,2011,NA,36
50S,AS,2004,NA,23,NA
50S,AS,1998,NA,16,NA
55S,AS,2007,NA,28,NA"))


new_data <- ddply(data, .(id), function(x) {
  return(data.frame(id = unique(x$id), project = unique(x$project), 
    testyr1 = min(x$testyr1), 
    testyr2 = max(x$testyr2), age1= min(x$age1), age2 = max(x$age2)))
    })

> new_data

    id project testyr1 testyr2 age1 age2
1 16S      AS    2008      NA   29   NA
2 32S      AS    2004      NA   30   NA
3 37S      AS      NA    2011   NA   36
4 50S      AS    2004      NA   23   NA
5 50S      AS    1998      NA   16   NA
6 55S      AS    2007      NA   28   NA

# But your result example suggests you want the lowest 
# of testyr to be in testyr1 and the highest of the combined
# testyrs to be in testyr2. Same logic for ages.
# If so, the one below should work:

new_data <- ddply(data, .(id), function(x) {
    if(dim(x)[1]>1) {
    years <- c(x$testyr1, x$testyr2)
    ages <-  c(x$age1, x$age2)
    return(data.frame(id = unique(x$id), project = unique(x$project), 
        testyr1 = min(years, na.rm=T), testyr2 = max(years , na.rm=T), 
        age1= min(ages, na.rm=T), age2 = max(ages, na.rm=T)))   
    } else {
    return(data.frame(id = unique(x$id), project = unique(x$project), 
        testyr1 = x$testyr1, testyr2 = x$testyr2, 
        age1= x$age1, age2 = x$age2)) 
    }       
    })

> new_data
   id project testyr1 testyr2 age1 age2
1 16S      AS    2008      NA   29   NA
2 32S      AS    2004      NA   30   NA
3 37S      AS      NA    2011   NA   36
4 50S      AS    1998    2004   16   23
5 55S      AS    2007      NA   28   NA
于 2012-07-31T20:41:27.177 回答
0

真的怀疑这是最有效的方法,但我的大脑目前无法正常工作。

temp = names(which(table(df$id) > 1))
temp1 = vector("list")
for (i in 1:length(temp)) {
  temp1[[i]] = df[df$id == temp[i], ]
  temp1[[i]] = data.frame(temp1[[i]][1, 1:2], 
                     testyr1 = min(temp1[[i]]$testyr1), 
                     testyr2 = max(temp1[[i]]$testyr1), 
                     age1 = min(temp1[[i]]$age1), 
                     age2 = max(temp1[[i]]$age1))
}

rbind(df[-c(which(df$id %in% temp)), ], do.call(rbind, temp1))
#    id project testyr1 testyr2 age1 age2
# 1 16S      AS    2008      NA   29   NA
# 2 32S      AS    2004      NA   30   NA
# 3 37S      AS      NA    2011   NA   36
# 6 55S      AS    2007      NA   28   NA
# 4 50S      AS    1998    2004   16   23

### rm(i, temp, temp1) ### Cleanup the workspace
于 2012-07-31T20:46:08.327 回答