0

我有一组 10 年(2009-2020)、间隔 30 分钟的气象数据集,但由于仪器电池故障,其中 2 年(2015-2017)夜间(约 17:00 至次日约 08:00)的数据存在缺失值。变量包括:气温、湿度、风、辐射。

  1. 有没有人有任何好主意来填充这些值?或者
  2. 如果我想在同一时间步(其余年份)使用每个变量的平均值填充它们,该怎么做?请问有什么想法吗?

我尝试使用 padr 包的 fill_by_function(fun = mean),但它用的是整个数据集的平均值,所有缺失处填入的都是同一个值。

下面我放了一个我的数据框结构的切片示例:

# Build a small example slice of the full data frame for the question.
# NOTE(review): slice() and %>% presumably come from dplyr, and pad() /
# fill_by_value() from the padr package mentioned above — confirm.
chhota_1 <- data %>%
  slice(89845:89900) %>%  # keep only rows 89845 through 89900
  pad() %>%  # NOTE(review): presumably inserts rows for missing time steps — confirm
  fill_by_value(na.pad=TRUE)  # NOTE(review): effect of `na.pad` here is unclear — verify against padr docs

数据结构,一目了然

4

1 回答 1

0

您可以使用 data.table 来完成(这只是众多可行方法之一):

library(data.table)

# Toy data set: timesteps 1-5 repeated for each of the years 2010-2013,
# with one standard-normal random draw per row (20 rows in total).
data <- data.table(
  timestep = rep(1:5, times = 4),
  year = rep(2010:2013, each = 5),
  value = rnorm(n = 20)
)

# Make one whole year missing: set every `value` in 2012 to NA, in place.
# NA_real_ is the double-typed NA, so the column stays numeric.
data[year == 2012, value := NA_real_]
> data
    timestep year      value
 1:        1 2010  0.7383885
 2:        2 2010  0.6547628
 3:        3 2010 -0.9825121
 4:        4 2010  0.2670737
 5:        5 2010  0.8688922
 6:        1 2011  0.1509628
 7:        2 2011  1.2482106
 8:        3 2011 -0.9492589
 9:        4 2011  1.2311409
10:        5 2011 -0.1162351
11:        1 2012         NA
12:        2 2012         NA
13:        3 2012         NA
14:        4 2012         NA
15:        5 2012         NA
16:        1 2013 -1.0179958
17:        2 2013  0.4368148
18:        3 2013  0.7547140
19:        4 2013 -0.4759922
20:        5 2013 -0.2393624

按该时间步长(其他年份)的平均值填充NA

# For every timestep group, compute the mean of the non-missing values and
# write it into the rows of that group where `value` is NA; rows that already
# have a value are left unchanged. The update is done in place via `:=`.
data[, value := {
  step_mean <- mean(value, na.rm = TRUE)
  replace(value, is.na(value), step_mean)
}, by = "timestep"]
> data
    timestep year       value
 1:        1 2010  0.73838849
 2:        2 2010  0.65476283
 3:        3 2010 -0.98251205
 4:        4 2010  0.26707371
 5:        5 2010  0.86889218
 6:        1 2011  0.15096278
 7:        2 2011  1.24821056
 8:        3 2011 -0.94925891
 9:        4 2011  1.23114088
10:        5 2011 -0.11623511
11:        1 2012 -0.04288152
12:        2 2012  0.77992939
13:        3 2012 -0.39235232
14:        4 2012  0.34074078
15:        5 2012  0.17109823
16:        1 2013 -1.01799582
17:        2 2013  0.43681478
18:        3 2013  0.75471400
19:        4 2013 -0.47599225
20:        5 2013 -0.23936237

编辑:

从提供的样本数据和评论中:

> library(lubridate)
> library(data.table)
> 
> data = fread("~/Downloads/test_data.csv")  # read the sample CSV from the given path
> 
> data[, timestamp := as.POSIXct(fast_strptime(timestamp, format = "%m/%d/%Y %H:%M"))]  # parse month/day/year hour:minute text and store it as POSIXct
> 
> data[, date := format(timestamp, "%m-%d")]  # month-day string, identical for the same calendar day in every year
> data[, year := format(timestamp, "%Y")]  # four-digit year as a string
> 
> data[is.na(value), ]
                 timestamp value     time  date year
    1: 2012-06-22 11:00:00    NA 11:00:00 06-22 2012
    2: 2015-02-22 18:00:00    NA 18:00:00 02-22 2015
    3: 2015-02-22 18:30:00    NA 18:30:00 02-22 2015
    4: 2015-02-22 19:00:00    NA 19:00:00 02-22 2015
    5: 2015-02-22 19:30:00    NA 19:30:00 02-22 2015
   ---                                              
16194: 2017-04-07 05:30:00    NA  5:30:00 04-07 2017
16195: 2017-04-07 06:30:00    NA  6:30:00 04-07 2017
16196: 2017-04-07 18:30:00    NA 18:30:00 04-07 2017
16197: 2017-04-07 23:00:00    NA 23:00:00 04-07 2017
16198: 2017-04-08 19:00:00    NA 19:00:00 04-08 2017
> 
> # Group rows sharing the same time of day and month-day, then replace each NA with that group's mean of the non-missing values (in place).
> data[, value := replace(value, is.na(value), mean(value, na.rm = TRUE)), by = c("time", "date")]
> data[is.na(value), ]
Empty data.table (0 rows and 5 cols): timestamp,value,time,date,year
于 2021-03-01T13:45:43.583 回答