r - how to insert missing observations on a data frame

Question

I have a data that are observations over time. Unfortunately, some large gaps of time points are missing on a treatment. They are not coded as NA and if I make a plot out of them it becomes apparent.

My data frame looks like this. The number of samples per time points are irregular. (edit: sorry for not making the example reproducible)s

    structure(list(A = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 144L, 144L, 144L, 1809L, 1809L, 1809L, 
1809L, 1809L, 1809L, 1809L, 1809L, 1809L, 1809L, 1809L, 1809L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 2070L, 
2070L, 2070L, 2070L, 2070L, 2757L, 2757L, 2757L, 2909L, 2909L, 
2909L, 2909L, 2909L, 2909L, 2909L, 2909L, 2909L, 2909L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 2975L, 
2975L, 2975L, 2975L, 2975L), cond = structure(c(2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Con", 
"Si"), class = "factor"), T = c(416L, 417L, 418L, 419L, 420L, 
423L, 424L, 425L, 426L, 427L, 428L, 429L, 430L, 431L, 432L, 433L, 
434L, 435L, 436L, 437L, 438L, 439L, 440L, 441L, 442L, 443L, 444L, 
445L, 446L, 447L, 448L, 449L, 450L, 451L, 452L, 453L, 454L, 458L, 
503L, 504L, 505L, 506L, 507L, 508L, 509L, 510L, 511L, 512L, 513L, 
514L, 515L, 516L, 517L, 518L, 519L, 520L, 521L, 522L, 523L, 524L, 
525L, 526L, 527L, 528L, 272L, 276L, 277L, 350L, 351L, 352L, 353L, 
354L, 355L, 356L, 357L, 358L, 359L, 360L, 361L, 372L, 373L, 374L, 
375L, 376L, 377L, 378L, 379L, 380L, 381L, 382L, 383L, 384L, 385L, 
386L, 387L, 388L, 389L, 390L, 391L, 392L, 393L, 394L, 395L, 396L, 
397L, 398L, 399L, 400L, 401L, 437L, 438L, 439L, 440L, 441L, 442L, 
443L, 444L, 445L, 446L, 447L, 448L, 449L, 450L, 451L, 452L, 453L, 
454L, 455L, 493L, 494L, 495L, 382L, 383L, 384L, 385L, 386L, 387L, 
388L, 389L, 390L, 391L, 523L, 524L, 525L, 526L, 527L, 528L, 529L, 
530L, 531L, 532L, 533L, 534L, 535L, 536L, 537L, 538L, 539L, 540L, 
541L, 542L, 543L, 544L, 545L, 546L, 547L, 548L, 549L, 550L, 551L, 
552L, 553L, 554L, 555L, 556L, 557L, 582L, 583L, 584L, 585L, 586L, 
587L, 588L, 589L, 590L, 591L, 592L, 593L, 594L, 595L, 596L), 
    Vlog = c(1.199206203, 0.92297866, 0.74831703, 1.180533889, 
    0.846435768, 1.823185531, 1.775303408, 0.9253633, 1.562371106, 
    1.237695416, 1.310507835, 1.431774566, 2.259365243, 1.721204598, 
    0.976929098, 0.673510525, 1.194940048, 0.878373924, 1.399859784, 
    1.04183351, 0.362465228, 1.345074816, 0.839639722, 1.235884973, 
    0.946877821, 0.810708992, 0.620516467, 0.99590939, 0.446167467, 
    0.635246561, 0.508835353, 0.470349764, 0.505083592, 0.363685506, 
    0.841427562, 1.502579534, 1.503814969, 1.962735861, 1.190111689, 
    1.208627789, 1.212606926, 1.3052429, 1.19648953, 1.399151795, 
    1.359988717, 1.530933258, 1.324386434, 1.429685474, 1.550040003, 
    1.209836455, 0.976675012, 1.396991989, 1.309972472, 0.884831368, 
    0.940578242, 0.622109712, 0.196736781, 0, 1.861481047, 1.166587204, 
    1.154778081, 0.750716468, 0.822148942, 0.324409805, 0.810379036, 
    2.218975354, 0.837542999, 1.597505982, 1.34988859, 2.109471773, 
    1.408734988, 1.006914696, 1.680242618, 1.842263128, 2.19564511, 
    1.80944452, 1.194273373, 1.953931263, 1.943781916, 2.136530509, 
    2.174627732, 1.837702354, 1.744745221, 1.744745221, 2.065910366, 
    1.3644043, 1.935629046, 1.327947423, 1.703751191, 1.595793931, 
    2.32443327, 1.815054551, 1.381916487, 1.535930503, 1.762742848, 
    1.214377396, 1.745046639, 0, 0, 1.314421325, 2.12544409, 
    1.961225517, 1.722393773, 1.763882649, 2.246794342, 1.462888398, 
    0, 2.699085109, 0.982206846, 1.678694356, 1.339419526, 1.856762396, 
    1.604863093, 1.439867691, 1.210451327, 0.988645101, 1.581116604, 
    0.868888993, 1.385699365, 1.377180499, 1.584445411, 1.76153307, 
    1.153021042, 1.427814276, 1.867219352, 1.726781152, 2.045476901, 
    1.231462515, 1.282774459, 1.194170351, 1.423430455, 1.813916126, 
    1.697914719, 1.343711186, 1.619115871, 1.590854952, 1.165150441, 
    0.84551636, 0.925836885, 0.0009995, 0, 2.672041587, 1.630536406, 
    2.084775235, 0.879027692, 2.150052605, 1.171591247, 2.589254624, 
    1.09594206, 1.788420568, 0.879027692, 1.768910948, 1.544705476, 
    0.961905249, 2.03675983, 1.189770451, 2.125034005, 1.921180059, 
    1.587902512, 1.113485404, 1.826744807, 0.961905249, 1.423828826, 
    1.392463308, 1.355448604, 1.638531529, 1.158778559, 1.257058585, 
    1.641075408, 1.652573524, 1.435915015, 1.072776171, 1.240686858, 
    1.647779212, 1.089811169, 1.723723056, 2.094419336, 0.544066958, 
    0.894454037, 1.651688305, 1.153416081, 0.961905249, 2.457446983, 
    0.704322704, 1.544705476, 1.970925317, 1.402837317, 1.651688305, 
    1.358923164, 1.153416081, 2.056674373)), .Names = c("A", 
"cond", "T", "Vlog"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 
47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 
60L, 61L, 62L, 63L, 64L, 66L, 67L, 68L, 201L, 202L, 203L, 204L, 
205L, 206L, 207L, 208L, 209L, 210L, 211L, 212L, 213L, 214L, 215L, 
216L, 217L, 218L, 219L, 220L, 221L, 222L, 223L, 224L, 225L, 226L, 
227L, 228L, 229L, 230L, 231L, 232L, 233L, 234L, 235L, 236L, 237L, 
238L, 239L, 240L, 241L, 242L, 243L, 244L, 245L, 246L, 247L, 248L, 
249L, 250L, 251L, 252L, 253L, 254L, 255L, 256L, 257L, 258L, 259L, 
260L, 261L, 695L, 696L, 697L, 698L, 699L, 700L, 701L, 702L, 703L, 
704L, 705L, 706L, 707L, 708L, 709L, 710L, 711L, 712L, 713L, 714L, 
715L, 716L, 717L, 718L, 719L, 720L, 721L, 722L, 723L, 724L, 725L, 
726L, 727L, 728L, 729L, 730L, 731L, 732L, 733L, 734L, 735L, 736L, 
737L, 738L, 739L, 740L, 741L, 742L, 743L, 744L, 745L, 746L, 747L, 
748L, 749L, 750L, 751L, 752L, 753L, 754L, 755L, 756L, 757L), class = "data.frame")

Is there a way of spotting the missing time points and insert n rows to it? What I thought of is to check the missing time points by making a freq table for each time point per treatment and then inserting a row. This is doable with a short time series but not with a large one. I am not sure if someone could help do it a little bit easier? Thanks!

edit: T is sequential but the number of data per T varies. And I want to insert a number of rows for each T. Hope the edits made it clear. :)

score 2 · Accepted Answer

这在很大程度上取决于您希望解决方案的通用性。但是，如果您想要一个非通用的解决方案，您可以非常简单地执行 #1。在这里，我假设您将T其用作时间变量。

insert_miss <- function(df, time_val= "T", by= 1) {
  val <- get(time_val, envir= as.environment(df))
  val_range <- range(val)
  comp <- seq(val_range[1], val_range[2], by=by)
  which_miss <- comp[!comp %in% val]
  # generating a sample row depends a lot on your particular problem
  # also, specifically how to impute the missing values depends on your 
  # specific problem / domain
  ## here's one simple solution which is not generic
  row_samp <- df[1,]
  df2 <- do.call("rbind", replicate(length(which_miss), row_samp, simplify= FALSE))
  df2[[time_val]] <- which_miss
  others <- which(names(df2) != time_val)
  df2[, others] <- NA
  return(df2)
}

跑

insert_miss(<your_df>)
R> A cond   T Vlog
1 NA   NA 421   NA
2 NA   NA 422   NA

score 0 · Accepted Answer

假设您的数据框被调用ts.df并且 T 变量是连续的（因为它在每个数据点上增加一且仅增加一），您可以生成具有所有 T 值范围内的 data.frame 并将其外部连接到您现有的data.frame 自动填写 NA：

ids <- data.frame(T=seq(from=min(ts.df$T), to=max(ts.df$T)), A=0, cond="Si")
ts.df <- merge(ts.df, ids, all.y=TRUE)
ggplot(ts.df, aes(T, Vlog)) + geom_line() + geom_point()

这将为所有行的变量分配Si值并为变量分配值。第一个似乎是正确的，第二个与您的图表无关。cond0A

您可能需要按条件拆分整个 data.frame，运行上面的代码以修改一个条件的子集并再次加入 data.frames 以使其在当前ggplot()调用中正常工作，但是由于您尚未发布可重现的问题示例，因此我能做的只有这么多。

score 0 · Accepted Answer

您的示例数据与您发布的图表图像不匹配，但这是随机数据的答案

# random x-y series
set.seed(123)
dat <- data.frame(x=1:200,
                  y=cumsum(rnorm(200)))

# punch some holes
dat <- dat[-c(20:40, 90:120), ]

# for each point, find gap to next point
diff2next <- with(dat, x[-1] - x[-nrow(dat)])

# now find position of non consecutive points (i.e. where gap > 1)
holes_start <- which(diff2next > 1)
holes_end <- holes_start + 1 #(by definition the gap ends with the next point)

# that's it. here's a plot of the line and the identified holes
ggplot() + 
  geom_line(data=dat, aes(x, y)) + # the line
  geom_point(data=dat[c(holes_start, holes_end), ], 
             aes(x, y), color='red') # the hole start/ends

r - how to insert missing observations on a data frame

3 回答 3

跑

Related

Reference