2

我有一个如下所示的数据框(.txt)[其中“dayX”是果蝇生存试验中的死亡日期,下面的数字是当天在该处理组合中死亡的果蝇数量;X 或 A 是一种处理,m 或 f 是另一种处理,第一个数字是品系(line),第二个数字是区组(block)]

    line    day1    day2    day3    day4    day5
1   Xm1.1   0   0   0   2   0
2   Xm1.2   0   0   1   0   0
3   Xm2.1   1   1   0   0   0
4   Xm2.2   0   0   0   3   1
5   Xf1.1   0   3   0   0   1
6   Xf1.2   0   0   1   0   0
7   Xf2.1   2   0   2   0   0
8   Xf2.2   1   0   1   0   0
9   Am1.1   0   0   0   0   2
10  Am1.2   0   0   1   0   0
11  Am2.1   0   2   0   0   1
12  Am2.2   0   2   0   0   0
13  Af1.1   3   0   0   1   0
14  Af1.2   0   1   3   0   0
15  Af2.1   0   0   0   1   0
16  Af2.2   1   0   0   0   0

并希望用 R 将其转换为如下形式:

    XA  mf  line    block   individual  age
1   X   m   1   1   1   4
2   X   m   1   1   2   4
3   X   m   1   2   1   3

等等...

生成的数据框中的“年龄”(age)取自个体的死亡日期,即上面数据框中记录该计数的那一列。例如,处理 Xm1.1 在第 4 天有两只果蝇死亡,因此 R 应创建两行:第一行包含从标签中提取出的信息,并标记为个体“1”;第二行信息相同,只是标记为个体“2”。如果同一处理中还有第三个个体在第 5 天死亡,则应再有第三行,与上述两行相同,只是“年龄”为“5”、个体为“3”。当转到下一个处理行(本例中为 Xm1.2)时,该处理中第一个死亡的个体重新标记为个体“1”(本例中它在第 3 天死亡)。我的示例中总共有 38 个个体死亡,因此我希望 R 构建一个 38*6 的数据框(不含标题行)。

有没有办法以自动的方式把我的数据框 [实际版本约为 50*640;X/A、m/f、品系 line(1:40)、区组 block(1-4)的每个唯一组合大约有 50 个个体,所以共约 32000 个个体死亡] 转换成 6*~32000 的最终数据框?

如果可以帮助您尝试解决方案,则可以使用此代码构建这两个示例数据框:

# Example input: one row per treatment label (X/A, m/f, line.block) and one
# column per day holding the number of flies that died on that day.
test <- data.frame(
  line = c(
    "Xm1.1", "Xm1.2", "Xm2.1", "Xm2.2",
    "Xf1.1", "Xf1.2", "Xf2.1", "Xf2.2",
    "Am1.1", "Am1.2", "Am2.1", "Am2.2",
    "Af1.1", "Af1.2", "Af2.1", "Af2.2"
  ),
  day1 = c(0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 3, 0, 0, 1),
  day2 = c(0, 0, 1, 0, 3, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 0),
  day3 = c(0, 1, 0, 0, 0, 1, 2, 1, 0, 1, 0, 0, 0, 3, 0, 0),
  day4 = c(2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0),
  day5 = c(0, 0, 0, 1, 1, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0),
  stringsAsFactors = FALSE
)

# Sample of the desired output layout: one row per dead fly. These are the
# first three target rows (two deaths in Xm1.1 on day 4, one in Xm1.2 on day 3).
test2 <- data.frame(
  XA = rep("X", 3),
  mf = rep("m", 3),
  line = c(1, 1, 1),
  block = c(1, 1, 2),
  individual = c(1, 2, 1),
  age = c(4, 4, 3),
  stringsAsFactors = FALSE
)



抱歉,这个虚拟数据集的构建过程很冗长——我正受睡眠不足和时差困扰,而且已有几个月没用 R 了。如果你在 R 中运行这些代码,应该能更清楚地看到我的目标。

-------------------------------------------------- ----------------------------------

作者:Rg255:目前停留在源自@Arun 的答案(我添加了strsplit (as.character(dt$line) , "" ))部分来解决一个错误)

# Asker's attempt, adapted from @Arun's answer: turn the per-day death counts
# into one row per individual. The console transcript of this run follows below.
df=read.table("C:\\Users\\...\\data.txt",header=T)
require(data.table)
# Preview the first 20 columns of the raw table.
head(df[1:20])
dt <- as.data.table(df)
# Per `line` label: flatten that row's count columns into `dd`, then emit one
# row per death. `sequence()` numbers deaths 1..k within each non-zero column,
# and `which()` returns the column position, so `age` is a column index.
dt <- dt[, {dd <- unlist(.SD, use.names = FALSE); 
            list(individual = sequence(dd[dd>0]), 
                 age = rep(which(dd>0), dd[dd>0])
            )}, by=line]
# Split every label into single characters and keep characters 1-3 and 5.
# NOTE(review): splitting on "" yields one element per character, so a label
# with a multi-digit line number (the real data has lines 1:40) is longer than
# five characters. rbind then recycles rows of unequal length (the warning in
# the transcript) and the fixed positions c(1:3,5) stop lining up with the
# fields, which is where the "NAs introduced by coercion" warning comes from.
out <- as.data.table(data.frame(do.call(rbind, strsplit(as.character(dt$line), ""))[, c(1:3,5)], stringsAsFactors=FALSE))
setnames(out, c("XA", "mf", "line", "block"))
# Convert the extracted line and block fields from character to numeric.
# NOTE(review): the transcript shows this multi-column `:=`(...) call being
# rejected; presumably the installed data.table predates that form - confirm.
out[, `:=`(line = as.numeric(line), block = as.numeric(block))]
# Attach the per-individual columns to the parsed label columns.
out <- cbind(out, dt[, list(individual, age)])

产生以下输出:

     > df=read.table("C:\\Users\\..\\data.txt",header=T)
        > require(data.table)
        > head(df[1:20])
           line Day4 Day6 Day8 Day10 Day12 Day14 Day16 Day18 Day20 Day22 Day24 Day26 Day28 Day30 Day32 Day34 Day36 Day38 Day40
        1 Xm1.1    0    0    0     0     0     0     0     0     0     0     0     0     0     1     0     0     1     4     2
        2 Xm2.1    0    0    0     0     0     0     0     0     0     2     0     0     0     1     2     1     0     2     0
        3 Xm3.1    0    0    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     2     1
        4 Xm4.1    0    0    0     0     0     0     0     0     0     0     0     0     1     1     0     1     2     3     8
        5 Xm5.1    0    0    0     0     0     0     0     0     0     0     0     0     0     2     2     3     3     3     6
        6 Xm6.1    0    0    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     1


> dt <- as.data.table(df)
        > dt <- dt[, {dd <- unlist(.SD, use.names = FALSE); 
        +             list(individual = sequence(dd[dd>0]), 
        +                  age = rep(which(dd>0), dd[dd>0])
        +             )}, by=line]
        > out <- as.data.table(data.frame(do.call(rbind, strsplit(as.character(dt$line), ""))[, c(1:3,5)], stringsAsFactors=FALSE))

        Warning message:
            In function (..., deparse.level = 1)  :
              number of columns of result is not a multiple of vector length (arg 1)


    > setnames(out, c("XA", "mf", "line", "block"))
        > out[, `:=`(line = as.numeric(line), block = as.numeric(block))]


    Error in `[.data.table`(out, , `:=`(line = as.numeric(line), block = as.numeric(block))) : 
          LHS of := must be a single column name, when with=TRUE. When with=FALSE the LHS may be a vector of column names or positions.
        In addition: Warning message:
        In eval(expr, envir, enclos) : NAs introduced by coercion


    > out <- cbind(out, dt[, list(individual, age)])
        > 
4

1 回答 1

0

这里有一个data.table解决方案。该line列必须具有唯一值。

# data.table solution: expand per-day death counts into one row per fly.
# The `line` column must hold unique values because it is the grouping key.
library(data.table)

df <- read.table("data.txt", header = TRUE, stringsAsFactors = FALSE)
dt <- as.data.table(df)

# Recover the day of death from the column labels ("day3" -> 3, "Day14" -> 14).
# Using the column position instead would report ages 1, 2, 3, ... even when
# the header reads Day4, Day6, Day8, ...
day_cols <- setdiff(names(dt), "line")
day_num <- if (all(grepl("[0-9]+$", day_cols))) {
  as.integer(sub("^.*[^0-9]", "", day_cols))
} else {
  # Labels without a numeric suffix: fall back to the column position.
  seq_along(day_cols)
}

# One output row per death. `individual` runs 1..n across all days of a
# treatment row (it must not restart on each day), and `age` is the day on
# which that individual died.
dt <- dt[, {
  deaths <- unlist(.SD, use.names = FALSE)
  died <- deaths > 0
  list(
    individual = seq_len(sum(deaths)),
    age = rep(day_num[died], deaths[died])
  )
}, by = line, .SDcols = day_cols]

# Split labels such as "Xm12.3" into treatment, sex, line and block. Validate
# first so that a malformed label fails loudly instead of being recycled.
label_re <- "^([[:alpha:]])([[:alpha:]])([0-9]+)\\.([0-9]+)$"
if (!all(grepl(label_re, dt$line))) {
  stop("`line` labels must look like \"Xm1.2\"", call. = FALSE)
}
out <- data.table(
  XA = sub(label_re, "\\1", dt$line),
  mf = sub(label_re, "\\2", dt$line),
  line = as.numeric(sub(label_re, "\\3", dt$line)),
  block = as.numeric(sub(label_re, "\\4", dt$line)),
  individual = dt$individual,
  age = dt$age
)

这适用于您的data.txt文件。

于 2013-06-11T13:35:59.597 回答