# Copy every "txGN=" field of a row into the first empty (NA) cell of that
# row. The input D:/file.txt is whitespace-separated with a varying number
# of fields per line.

# Count the maximum number of columns in the file; col.names of that length
# together with fill = TRUE makes read.table() pad the shorter rows.
maxcol <- max(count.fields("D:/file.txt"))
x <- read.table("D:/file.txt", as.is = TRUE, fill = TRUE, col.names = 1:maxcol)

# Mark empty-string cells as missing so is.na() finds them below.
x[x == ""] <- NA

# (row, col) positions of all cells whose first five characters are "txGN=".
indices <- which(
  substr(as.matrix(x), start = 1, stop = 5) == "txGN=",
  arr.ind = TRUE
)

# Append an all-NA column so that a row without any padding still has a
# free cell to receive the copy.
x <- cbind(x, NA)

# seq_len() instead of 1:nrow(indices): when nothing matches, 1:0 would
# iterate over c(1, 0) and fail on indices[1, 1].
for (i in seq_len(nrow(indices))) {
  row_idx <- indices[i, 1]
  na1 <- which(is.na(x[row_idx, ]))[1] # first free cell in this row
  x[row_idx, na1] <- x[row_idx, indices[i, 2]]
}
x
X1 X2 X3 X4 X5 X6 X7 NA
1 chr1 880942 taPN=-1 taWT=3 txGN=SAMD11 txID=uc001abw FUNC=nonsyn txGN=SAMD11
2 chr1 894573 txDN=-3 txGN=NOC2L txID=uc003 intronic txGN=NOC2L <NA>
3 chr1 10626 txDN=-9 txID=uc2 txST=+ <NA> <NA> <NA>
# Optional: show the missing cells as empty strings instead of NA.
x <- replace(x, is.na(x), "")
编辑:
这是一个不在 R 中一次性创建完整数据框的版本(以降低内存需求),而是分块读取文件,并将每块的处理结果追加到一个新文件中:
# Chunked variant: read D:/file.txt stepsize lines at a time, copy each
# "txGN=" field into the first empty cell of its row, and write every
# processed chunk to D:/filenew.txt, so only one chunk is held as a data
# frame at any time.
maxcol <- max(count.fields("D:/file.txt"))
stepsize <- 50 # how many lines are read at once
k <- 0 # number of chunks processed so far

repeat {
  # nrows is only an upper bound, so the final, shorter chunk needs no
  # special case and the file does not have to be read once more just to
  # count its lines. The loop relies on read.table() returning a zero-row
  # data frame (col.names is supplied) once skip is past the end of the file.
  # NOTE(review): skip counts raw lines while nrows counts parsed rows, so
  # blank lines in the input would misalign the chunks - confirm the input
  # has none.
  x <- read.table("D:/file.txt", as.is = TRUE, fill = TRUE,
                  col.names = 1:maxcol, skip = k * stepsize,
                  nrows = stepsize)
  if (nrow(x) == 0) break # end loop when finished

  # Mark empty-string cells as missing so is.na() finds them below.
  x[x == ""] <- NA

  # (row, col) positions of all cells whose first five characters are "txGN=".
  indices <- which(
    substr(as.matrix(x), start = 1, stop = 5) == "txGN=",
    arr.ind = TRUE
  )

  # Append an all-NA column so that a full-width row still has a free cell.
  x <- cbind(x, NA)

  # seq_len() instead of 1:nrow(indices): a chunk without any "txGN=" field
  # would otherwise iterate over c(1, 0) and abort the whole run.
  for (i in seq_len(nrow(indices))) {
    row_idx <- indices[i, 1]
    na1 <- which(is.na(x[row_idx, ]))[1] # first free cell in this row
    x[row_idx, na1] <- x[row_idx, indices[i, 2]]
  }

  # Change sep and eol if needed. The first chunk overwrites the output file
  # (append = FALSE), so rerunning the script does not pile new results on
  # top of those from a previous run; later chunks are appended.
  write.table(x, file = "D:/filenew.txt", append = k > 0, quote = FALSE,
              sep = " ", eol = "\n", na = "", row.names = FALSE,
              col.names = FALSE)
  k <- k + 1
}

# Read the result back; it has one column more than the input.
read.table("D:/filenew.txt", as.is = TRUE, fill = TRUE,
           col.names = 1:(maxcol + 1))