r - 用于处理基因型数据的 R 代码

Question

我有这个数据称为mydf.

我需要将列中的字母（DNA 字母）REF与( ) 匹配ALT，并将相应的数值粘贴在一起作为.colnames(x)"A","T","G","C""REF,ALT"

但是，我有一些行"snp:+[0-9]"和"flat$"列中TYPE。

现在对于"flat$"我想要的行：

ALT如果字母是唯一的，则将尽可能多"snp:+[0-9]"的相应"start"id 中的值相加ALT，包括扁线本身（请参阅用大括号括起来的脚本以获得一条扁线）
ALT再次将该值粘贴"REF,ALT"为（REF两者的值相同"snp:+[0-9]"且"flat$"具有相同的起始 ID）
得到结果中所示的输出。

我已经为一条扁平线做到了这一点，但我需要帮助制作该功能，flatcase以便它对所有扁平线都执行相同的操作。

我怎样才能做一个功能来做到这一点flatcase？

代码

normalCase <- function(x, ns) {
      ref.idx <- which(ns == "REF")
      ref.allele <- x[ref.idx]
      ref.count <- x[which(ns == ref.allele)]

      alt.idx <- which(ns == "ALT")
      alt.allele <- x[alt.idx]
      alt.count <- x[which(ns == alt.allele)]

      paste(ref.count, alt.count, sep=",")
    }



    flatcase??{

     g<-x[,"start"]=="chr16:2530921"& grepl("snp:+[0-9]",x[,"TYPE"])
     myt<-x[g,]
     x[g,"ALT"]
     unique(x[g,"ALT"])
     c<-unique(x[g,"ALT"])
     flat<-myt[grepl("flat$",myt[,"TYPE"]),]
     c<-unique(x[g,"ALT"])
    alt.count<- sum(as.numeric(flat[c]))
    }

    calculateAD <- function(x, mat, ns) {
      if (grepl("flat$", x[which(ns == 'TYPE')])) {
        flatCase(x, mat, ns)
      } else {
        normalCase(x, ns)
      }
    }


    bamAD <- function(x) {
       new.x <- cbind(x, apply(x, 1, calculateAD, x, colnames(x)))
      colnames(new.x)[ncol(new.x)] <- "bam.AD"
      new.x      
    }

我为 flatCase 尝试过的功能是：

flatCase <- function(x, mat, ns) {
  id.idx <- which(ns == 'start')
  type.idx <- which(ns == 'TYPE')
  ref.idx <- which(ns == 'REF')
  alt.idx <- which(ns == 'ALT')


  id <- x[id.idx]
  #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
  #m <- mat[mat[, id.idx] == id & mat[, type.idx] == "snp", ]
  m<-mat[grepl(id,mat[, id.idx]) & grepl("snp:+[0-9]",mat[, type.idx]),]
  #flat<-mat[grepl("flat$",mat[, type.idx]),]
  ref.allele <- x[ref.idx]
  ref.count<-x[which(ns == ref.allele)]


  alt.count <- sum(apply(m, 1, function(x) as.numeric(x[which(ns == x[alt.idx])])))
  paste(ref.count, alt.count, sep=",") 
}

我的

x <- as.matrix(read.csv(text="start,A,T,G,C,REF,ALT,TYPE 
chr20:5363934,95,29,14,59,C,T,snp
chr5:8529759,24,1,28,41,G,C,snp
chr14:9620689,65,49,41,96,T,G,snp
chr18:547375,94,1,51,67,G,C,snp
chr8:5952145,27,80,25,96,T,T,snp
chr14:8694382,68,94,26,30,A,A,snp
chr16:2530921,49,15,79,72,A,T,snp:2530921
chr16:2530921,49,15,79,72,A,G,snp:2530921
chr16:2530921,49,15,79,72,A,T,snp:2530921flat
chr16:2533924,42,13,19,52,G,T,snp:2533924flat
chr16:2543344,4,13,13,42,G,T,snp:2543344flat
chr16:2543344,4,23,13,42,G,A,snp:2543344
chr14:4214117,73,49,18,77,G,A,snp
chr4:7799768,36,28,1,16,C,A,snp
chr3:9141263,27,41,93,90,A,A,snp", stringsAsFactors=FALSE))

结果：

       start           A    T    G    C    REF ALT TYPE              bam.AD  
 [1,] "chr20:5363934" "95" "29" "14" "59" "C" "T" "snp"             "59,29" 
 [2,] "chr5:8529759"  "24" " 1" "28" "41" "G" "C" "snp"             "28,41" 
 [3,] "chr14:9620689" "65" "49" "41" "96" "T" "G" "snp"             "49,41" 
 [4,] "chr18:547375"  "94" " 1" "51" "67" "G" "C" "snp"             "51,67" 
 [5,] "chr8:5952145"  "27" "80" "25" "96" "T" "T" "snp"             "80,80" 
 [6,] "chr14:8694382" "68" "94" "26" "30" "A" "A" "snp"             "68,68" 
 [7,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921"     "49,15" 
 [8,] "chr16:2530921" "49" "15" "79" "72" "A" "G" "snp:2530921"     "49,79" 
 [9,] "chr16:2530921" "49" "15" "79" "72" "A" "T" "snp:2530921flat" "49,94"
[10,] "chr16:2533924" "42" "13" "19" "52" "G" "T" "snp:2533924flat" "19,13"  
[11,] "chr16:2543344" "42" "13" "13" "42" "G" "T" "snp:2543344flat" "13,55" 
[12,] "chr16:2543344" "42" "23" "13" "42" "G" "A" "snp:2543344"     "13,42" 
[13,] "chr14:4214117" "73" "49" "18" "77" "G" "A" "snp"             "18,73" 
[14,] "chr4:7799768"  "36" "28" " 1" "16" "C" "A" "snp"             "16,36" 
[15,] "chr3:9141263"  "27" "41" "93" "90" "A" "A" "snp"             "27,27"

score 2 · Accepted Answer

这是一种方法来完成这一切，矢量化。

首先，请注意，无论类型如何，REF 都是相同的。我们可以通过使用 REF 作为矩阵中的坐标来快速查找它，例如，第 1 行有 REF C，所以如果我们查找坐标 (1, "C")，我们会得到该行的 REF 值。

# the REFs are the same regardless of TYPE
rownames(x) <- 1:nrow(x)
ref <- x[cbind(1:nrow(x), x[, 'REF'])]

看看cbind(1:nrow(x), x[, 'REF'])：这只是一个坐标列表，(row number, REF)我们用它来查找 REF 编号。

然后我们对 ALT 做同样的事情：

alt <- x[cbind(1:nrow(x), x[, 'ALT'])]

但是，我们必须确保如果类型是“flat”，我们将所有其他 ALT 添加到“flat”行的 ALT（如您所说，只有唯一的）。

首先，找出哪些行是平的：

which.flat <- grep('flat$', x[, 'TYPE'])

接下来，对于每个扁平行，查找具有相同“开始”（x[, 'start'] == x[i, 'start']即位）的其他行的 ALT，并排除具有重复 ALT 的行（x[, 'ALT'] != x[i, 'ALT']即位）。这i是当前平线的索引。将它们全部添加到扁平线的 ALT 中。sapplyjust 为每条扁平线向量化了这一切。

# add the other alts to the alt of the 'flat' line.
alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
    function (i) {
        sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
             x[, 'ALT'] != x[i, 'ALT'] ]))
    })

现在我们只是粘贴在一起：

x <- cbind(x, bam.AD=paste(ref, alt, sep=','))

结果与您的结果相同，除了我认为您犯了错误的第 10 行 - 只有一行带有“chr16:2533924”，其 ALT 为“T”（值 13），bam.AD“19,13”也是如此（你有“19,42”，好像 ALT 是“A”，但不是）。

如果您必须在问题中坚持使用函数形式（非常缓慢且效率低下！），它与我所做的基本相同（因此为什么您可以在没有apply调用的情况下完成它并完全跳过循环）：

flatCase <- function(x, mat, ns) { # 获取平面行的 alt <- as.numeric(x[x['ALT']])

# get the other rows with the same 'start' and different 'ALT'
xx <- mat[mat[, 'start'] == x['start'] & mat[, 'ALT'] != x['ALT'], ,drop=F]
if (nrow(xx) > 0) {
  # grab all the alts as done before
  rownames(xx) <- 1:nrow(xx)
  alt <- alt + sum(as.numeric(xx[cbind(1:nrow(xx), xx[, 'ALT'])]))
 }

ref <- x[x['REF']]
return(paste(ref, alt, sep=','))
}

但是，如前所述，如果将其矢量化，则上面的整个代码将减少到几行，并且速度更快：

newBamAD <- function (x) {
    # the version above
    rownames(x) <- 1:nrow(x)
    ref <- x[cbind(1:nrow(x), x[, 'REF'])]
    alt <- x[cbind(1:nrow(x), x[, 'ALT'])]
    which.flat <- grep('flat$', x[, 'TYPE'])
    alt[which.flat] <- as.numeric(alt[which.flat]) + sapply(which.flat,
        function (i) {
            sum(as.numeric(alt[ x[, 'start'] == x[i, 'start'] &
                 x[, 'ALT'] != x[i, 'ALT'] ]))
        })
    cbind(x, bam.AD=paste(ref, alt, sep=','))
}

library(rbenchmark)
benchmark(
  bamAD=bamAD(x),
  newBamAD=newBamAD(x)
)
#       test replications elapsed relative user.self sys.self user.child sys.child
# 1    bamAD          100   0.082    3.905     0.072    0.004          0         0
# 2 newBamAD          100   0.021    1.000     0.020    0.000          0         0

矢量化版本几乎快 4 倍。

score 2 · Accepted Answer

另一种方法：

# create dataframe
mydf <- as.data.frame(x, stringsAsFactors=FALSE)
# create temporary values based on REF and ALT
mydf$REFval <- diag(as.matrix(mydf[, mydf$REF]))
mydf$ALTval <- diag(as.matrix(mydf[, mydf$ALT]))

在下一步中，您说“如果 ALT 字母是唯一的”对 ALT 求和，但没有指定如果 ALT 相同但值不同则使用哪个值。由于值相同，因此在您的示例数据集中并不重要，因此在下面的代码中，我假设要使用最后一个 ALT 值。

# sum up ALT values for all start ID
require(dplyr)
mydfs <- mydf %>% group_by(start, ALT) %>%
  summarize(ALTkeep=last(ALTval)) %>%  # assume keep last one if same ALT
  group_by(start) %>%
  summarize(ALTflat=sum(as.numeric(ALTkeep)))

# merge back into main dataframe
mydf <- left_join(mydf, mydfs)
# select ALT value for bam.AD depending on "flat$" in TYPE
mydf$bam.AD <- with(mydf,
  paste(REFval, ifelse(grepl("flat$", TYPE), ALTflat, ALTval), sep=","))

# optional clean up of temporary values
mydf <- mydf[, !(names(mydf) %in% c("REFval", "ALTval", "ALTflat"))]

你想要的输出

                                   start  A  T  G  C REF ALT            TYPE bam.AD
1                          chr20:5363934 95 29 14 59   C   T             snp  59,29
2                           chr5:8529759 24  1 28 41   G   C             snp  28,41
3                          chr14:9620689 65 49 41 96   T   G             snp  49,41
4                           chr18:547375 94  1 51 67   G   C             snp  51,67
5                           chr8:5952145 27 80 25 96   T   T             snp  80,80
6                          chr14:8694382 68 94 26 30   A   A             snp  68,68
7                          chr16:2530921 49 15 79 72   A   T     snp:2530921  49,15
8                          chr16:2530921 49 15 79 72   A   G     snp:2530921  49,79
9                          chr16:2530921 49 15 79 72   A   T snp:2530921flat  49,94
10                         chr16:2533924 42 13 19 52   G   T snp:2533924flat  19,13
11                         chr16:2543344  4 13 13 42   G   T snp:2543344flat  13,55
12                         chr16:2543344 42 23 13 42   G   A     snp:2543344  13,42
13                         chr14:4214117 73 49 18 77   G   A             snp  18,73
14                          chr4:7799768 36 28  1 16   C   A             snp  16,36
15                          chr3:9141263 27 41 93 90   A   A             snp  27,27

r - 用于处理基因型数据的 R 代码

2 回答 2

Related

Reference