我有一个 GFF 文件和一个 CSV 文件,内容如下:
# Example CSV data, parsed from inline text rather than from a file on disk.
# Columns: Sample, Name, Estimate, Std.Err, P.Adjust.
# The same Name can occur under more than one Sample
# (e.g. B005230.2.1 appears for both Sample_1 and Sample_2).
file.csv <- utils::read.table(
  header = TRUE,
  text = "Sample Name Estimate Std.Err P.Adjust
Sample_1 B005300.2.1 0.345930183 0.05662846 1.58E-06
Sample_1 B005230.2.1 0.048159129 0.013862871 0.019181546
Sample_1 B006450.2.1 -0.263951161 0.079297432 0.027327576
Sample_2 B005230.2.1 39.04308043 11.23861018 0.019181546
Sample_2 B006260.1.1 0.003968994 0.00063087 6.12E-07
Sample_2 B006170.2.1 0.117171563 0.024018888 0.000272761
Sample_3 B006450.2.1 0.012033053 0.003670908 0.030632664
Sample_3 B006980.1-c2.1 -0.007653796 0.002047582 0.009944649
Sample_3 B006980.1.1 -0.011369481 0.002871014 0.00539717"
)
# GFF GRanges, example data
# Literal reconstruction of a six-element GRanges object; the note below
# indicates it was produced with dput(head(GFF)).
# Evaluating it needs the GRanges/IRanges/Rle/Seqinfo/DFrame class
# definitions to be available (presumably via GenomicRanges -- confirm).
#
# What the literal values below encode:
#   * seqnames: all six ranges are on "Bch01" (factor code 1, run length 6).
#   * strand:   all six ranges are on "-" (factor code 2, run length 6).
#   * ranges:   four ranges starting at 21882 (width 126) and two starting
#               at 22697 (width 60).
#   * metadata columns: source, type, score, phase, ID, Parent, Name, Note,
#               ref_id, Dbxref, Ontology_term.
#   * type:     CDS, exon, gene, mRNA, CDS, exon (in row order).
#   * Name:     set only on the gene and mRNA rows ("B024400.1",
#               "B024400.1.1"); it is NA on the CDS and exon rows.
# NOTE(review): none of the Name values in this six-row sample occur in the
# example CSV above, so merging the two examples as-is yields no matches.
#dput(head(GFF))
GFF <- new("GRanges", seqnames = new("Rle", values = structure(1L, .Label = c("Bch01", "Bch02", "Bch03", "Bch04", "Bch05"), class = "factor"), lengths = 6L,
elementMetadata = NULL, metadata = list()), ranges = new("IRanges",
# Start positions and widths of the six ranges.
start = c(21882L, 21882L, 21882L, 21882L, 22697L, 22697L),
width = c(126L, 126L, 126L, 126L, 60L, 60L), NAMES = NULL,
elementType = "ANY", elementMetadata = NULL, metadata = list()),
# Strand: a single run of factor code 2 ("-") covering all six ranges.
strand = new("Rle", values = structure(2L, .Label = c("+",
"-", "*"), class = "factor"), lengths = 6L, elementMetadata = NULL,
metadata = list()), seqinfo = new("Seqinfo", seqnames = c("Bch01", "Bch02", "Bch03", "Bch04", "Bch05"), seqlengths = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), is_circular = c(NA, NA, NA, NA, NA), genome = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_)), elementMetadata = new("DFrame", rownames = NULL, nrows = 6L,
# Per-range metadata columns start here (one entry per range).
listData = list(source = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = "maker", class = "factor"), type = structure(c(1L,
2L, 3L, 4L, 1L, 2L), .Label = c("CDS", "exon", "gene",
"mRNA", "three_prime_UTR", "five_prime_UTR"), class = "factor"),
score = c(NA, NA, NA, 126, NA, NA), phase = c(0L,
NA, NA, NA, 0L, NA), ID = c("B024400.1.1:cds",
"B024400.1.1:exon:2", "B024400.1",
"B024400.1.1", "B008910.1.1:cds",
"B008910.1.1:exon:4"), Parent = new("CompressedCharacterList",
elementType = "character", elementMetadata = NULL,
metadata = list(), unlistData = c("B024400.1.1",
"B024400.1.1", "B024400.1",
"B008910.1.1", "B008910.1.1"
), partitioning = new("PartitioningByEnd", end = c(1L,
2L, 2L, 3L, 4L, 5L), NAMES = NULL, elementType = "ANY",
elementMetadata = NULL, metadata = list())),
# Name (the intended merge key) is NA except on the gene and mRNA rows.
Name = c(NA, NA, "B024400.1", "B024400.1.1",
NA, NA), Note = new("CompressedCharacterList", elementType = "character",
elementMetadata = NULL, metadata = list(), unlistData = c("Similar to B024400.1.1: LOW QUALITY:50S ribosomal protein L4, chloroplastic", "Similar to B024400.1.1: LOW QUALITY:50S ribosomal protein L4, chloroplastic"), partitioning = new("PartitioningByEnd", end = c(0L,
0L, 1L, 2L, 2L, 2L), NAMES = NULL, elementType = "ANY",
elementMetadata = NULL, metadata = list())),
ref_id = c(NA, NA, "B024400.1.1", "B024400.1.1",
NA, NA), Dbxref = new("CompressedCharacterList",
elementType = "character", elementMetadata = NULL,
metadata = list(), unlistData = character(0),
partitioning = new("PartitioningByEnd", end = c(0L,
0L, 0L, 0L, 0L, 0L), NAMES = NULL, elementType = "ANY",
elementMetadata = NULL, metadata = list())),
# Dbxref and Ontology_term are empty for every range in this sample.
Ontology_term = new("CompressedCharacterList", elementType = "character",
elementMetadata = NULL, metadata = list(), unlistData = character(0),
partitioning = new("PartitioningByEnd", end = c(0L,
0L, 0L, 0L, 0L, 0L), NAMES = NULL, elementType = "ANY",
elementMetadata = NULL, metadata = list()))),
elementType = "ANY", elementMetadata = NULL, metadata = list()),
elementType = "ANY", metadata = list())
我想按 `Name` 列合并这两个文件。我试过了:
# Import the annotation from disk (a GRanges object, as in the example data).
GFF <- rtracklayer::import("gene_models.gff")

# merge() joins data frames, so flatten the GRanges (coordinates plus
# metadata columns) into a plain data frame first.
gff_df <- as.data.frame(GFF)
stopifnot("Name" %in% names(gff_df))

# List-valued GFF attributes (e.g. Parent, Note, Dbxref) become list-columns;
# collapse each entry to a comma-separated string so the merged result is a
# flat table that can be filtered, printed and written out normally.
is_list_col <- vapply(gff_df, is.list, logical(1))
if (any(is_list_col)) {
  gff_df[is_list_col] <- lapply(
    gff_df[is_list_col],
    function(col) {
      vapply(col, function(el) paste(el, collapse = ","), character(1))
    }
  )
}

# Keep exactly one annotation row per Name:
#   * rows without a Name (NA on CDS/exon features) can never be a valid
#     match and are dropped;
#   * if a Name occurs on several features, only the first is kept so the
#     join cannot multiply CSV rows.
# NOTE(review): if the CSV names are transcript IDs, filtering on
# gff_df$type == "mRNA" instead may be the more precise choice -- confirm.
has_key <- !is.na(gff_df$Name)
gff_df <- gff_df[has_key & !duplicated(gff_df$Name), , drop = FALSE]

# Left join: every CSV row is kept exactly once. A Name measured in several
# samples simply receives the same annotation in each of its rows, and CSV
# rows without an annotation get NA in the GFF columns instead of vanishing.
merge_data <- merge(file.csv, gff_df, by = "Name", all.x = TRUE)
但是在 CSV 文件中,同一个 `Name` 值会出现在不同的 `Sample` 中,例如 `B005230.2.1` 同时出现在 `Sample_1` 和 `Sample_2` 中,而它在 GFF 文件中只出现一次。因此,合并后的结果是错乱的。如果有人能帮我解决这个问题,我将不胜感激。谢谢!