r - 使用ggplot在R中的时间序列中着色预测间隔

Question

请参考数据的输出。您可以直接向下滚动到目标和问题陈述。也许您不需要数据，因为您之前可能遇到过这个问题。

调用所需的库

library(zoo)
library(ggplot2)
library(scales)
library(plotly)
library(ggthemes)
library(forecast)
library(plotly)
library(DescTools)

数据输入

dput(ridership.ts)
structure(c(1709L, 1621L, 1973L, 1812L, 1975L, 1862L, 1940L, 
2013L, 1596L, 1725L, 1676L, 1814L, 1615L, 1557L, 1891L, 1956L, 
1885L, 1623L, 1903L, 1997L, 1704L, 1810L, 1862L, 1875L, 1705L, 
1619L, 1837L, 1957L, 1917L, 1882L, 1933L, 1996L, 1673L, 1753L, 
1720L, 1734L, 1563L, 1574L, 1903L, 1834L, 1831L, 1776L, 1868L, 
1907L, 1686L, 1779L, 1776L, 1783L, 1548L, 1497L, 1798L, 1733L, 
1772L, 1761L, 1792L, 1875L, 1571L, 1647L, 1673L, 1657L, 1382L, 
1361L, 1559L, 1608L, 1697L, 1693L, 1836L, 1943L, 1551L, 1687L, 
1576L, 1700L, 1397L, 1372L, 1708L, 1655L, 1763L, 1776L, 1934L, 
2008L, 1616L, 1774L, 1732L, 1797L, 1570L, 1413L, 1755L, 1825L, 
1843L, 1826L, 1968L, 1922L, 1670L, 1791L, 1817L, 1847L, 1599L, 
1549L, 1832L, 1840L, 1846L, 1865L, 1966L, 1949L, 1607L, 1804L, 
1850L, 1836L, 1542L, 1617L, 1920L, 1971L, 1992L, 2010L, 2054L, 
2097L, 1824L, 1977L, 1981L, 2000L, 1683L, 1663L, 2008L, 2024L, 
2047L, 2073L, 2127L, 2203L, 1708L, 1951L, 1974L, 1985L, 1760L, 
1771L, 2020L, 2048L, 2069L, 1994L, 2075L, 2027L, 1734L, 1917L, 
1858L, 1996L, 1778L, 1749L, 2066L, 2099L, 2105L, 2130L, 2223L, 
2174L, 1931L, 2121L, 2076L, 2141L, 1832L, 1838L, 2132L), .Tsp = c(1991, 
2004.16666666667, 12), class = "ts")

创建 ts 对象的数据框以使用 ggplot

tsd = data.frame(time = as.Date(ridership.ts), 
                 value = as.matrix(ridership.ts))

建立线性模型

ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2))

向现有数据框 tsd 添加新列

tsd$linear_fit = as.matrix(ridership.lm$fitted.values)

定义验证和培训期的长度

nValid = 36 
nTrain = length(ridership.ts) - nValid

训练数据

train.ts = window(ridership.ts, 
                  start = c(1991, 1),
                  end = c(1991, nTrain))

验证数据

valid.ts = window(ridership.ts, 
                  start = c(1991, nTrain + 1), 
                  end = c(1991, nTrain + nValid))

建筑模型

ridership.lm = tslm(train.ts ~ trend + I(trend^2))

使用我们的构建模型进行预测

ridership.lm.pred = forecast(ridership.lm, h = nValid, level = 0)

为拟合的模型值制作数据框

tsd_train_model = data.frame(time = as.Date(train.ts), 
                             lm_fit_train = as.matrix(ridership.lm$fitted.values))

为绘图目的制作数据框

forecast_df = data.frame(time = as.Date(valid.ts), 
                         value = as.matrix(ridership.lm.pred$mean))

使用 ggplot 创建绘图

p1 = ggplot(data = tsd, 
            aes(x = time, y = value)) + 
  geom_line(color = 'blue') + 
  ylim(1300, 2300) + 
  geom_line(data = tsd_train_model, 
            aes(x = time, y = lm_fit_train), 
            color = 'red')

p2 = p1 + 
  geom_line(data = forecast_df, 
            aes(x = time, y = value), 
            col = 'red', linetype = 'dotted') + 
  scale_x_date(breaks = date_breaks('1 years'), 
               labels = date_format('%b-%y')) +
  geom_vline(xintercept = as.numeric(c(tsd_train_model[NROW(tsd_train_model), ]$time,  #last date of training period
                                       forecast_df[NROW(forecast_df), ]$time))) #last date of testing period 

p3 = p2 + 
  annotate('text', 
           x = c(tsd_train_model[NROW(tsd_train_model)/2, ]$time, 
                 forecast_df[NROW(forecast_df) / 2,]$time),
           y = 2250, 
           label = c('Training Period', 'Validation Period'))

目的：我想在预测线的两侧（图中红色虚线）添加5个百分位和95个百分位的预测误差，并对区域进行阴影处理。

我使用分位数来生成预测范围

q = quantile(ridership.lm.pred$residuals, c(.05, .95))

percentile_5 = as.numeric(q[1])
percentile_95 = as.numeric(q[2])

为预测数据添加 5 个百分位和 95 个百分位

yl = forecast_df$value + percentile_5 
ym =  forecast_df$value  + percentile_95

问题：如果我使用下面的命令，那么它不会在整个验证期内显示阴影区域。

p3 + geom_ribbon(data = forecast_df, 
                 aes(ymin = yl, 
                     ymax = ym), 
                 fill="gray30")

NROW(yl)
 [1]36

sum(is.na(yl))
[1] 0

NROW(ym)
[1] 36

sum(is.na(ym))
[1] 0

尝试过的事情：如果我用任何其他值替换 ymin 和 ymax 的值，例如如果我使用下面的命令，那么我会得到命令下方显示的图

p3 + geom_ribbon(data = forecast_df, 
                 aes(ymin = rep(1750,36), 
                     ymax = rep(2000,36), 
                     fill="gray30"))

我的问题：

谁能告诉我图 2 中输出背后的原因？为什么 R 给出如图 2 所示的奇怪输出？

谁能帮我用ggplot遮蔽整个区域？

score 3 · Accepted Answer

TLDR：ylim(1300, 2300) +从您的ggplot代码中删除该行。

当您使用scale_x_***()/ scale_y_***（或等效地xlim()/ ylim()）设置绘图的限制时，绘图将丢弃所有超出此范围的数据点。在需要 ymin 和 ymax 值的 geom_ribbon 的情况下，当与 ymax 对应的值被删除时（因为它们大于 2300），色带不能仅用 ymin 绘制，因此色带在此之前停止。

如果您真的只想为范围 (1300, 2300) 绘图，请coord_cartesian()改为在内部设置限制。这使绘图可以缩放到范围限制，而不会丢弃外部的数据点。有关更多信息，请参阅文档。

以下其他非必要建议：

为了在 ggplot 中绘图，我通常会尽量将所有内容保持在同一个数据框中，以利用美学映射中的公共变量。这是我的做法：

将所有内容组合到一个数据框中：

library(dplyr)
df <- left_join(tsd %>% select(time, value),
                rbind(tsd_train_model %>% 
                        rename(fit = lm_fit_train) %>%
                        mutate(status = "train"),
                      forecast_df %>%
                        rename(fit = value) %>%
                        mutate(status = "valid")))
df <- df %>%
  mutate(yl = ifelse(status == "valid", fit + percentile_5, NA),
         ym = ifelse(status == "valid", fit + percentile_95, NA))

> head(df)
        time value      fit status yl ym
1 1991-01-01  1709 1882.681  train NA NA
2 1991-02-01  1621 1876.546  train NA NA
3 1991-03-01  1973 1870.518  train NA NA
4 1991-04-01  1812 1864.597  train NA NA
5 1991-05-01  1975 1858.784  train NA NA
6 1991-06-01  1862 1853.078  train NA NA

> tail(df)
          time value      fit status       yl       ym
154 2003-10-01  2121 2190.490  valid 1934.914 2397.875
155 2003-11-01  2076 2200.756  valid 1945.179 2408.141
156 2003-12-01  2141 2211.129  valid 1955.553 2418.514
157 2004-01-01  1832 2221.609  valid 1966.033 2428.994
158 2004-02-01  1838 2232.197  valid 1976.620 2439.582
159 2004-03-01  2132 2242.891  valid 1987.315 2450.277

阴谋

ggplot(data = df,
       aes(x = time)) +

  # place the ribbon below all other geoms for easier viewing, & increase transparency
  geom_ribbon(aes(ymin = yl, ymax = ym), fill = "gray30", alpha = 0.2) +

  # original values
  geom_line(aes(y = value), color = "blue") +

  # fitted values (line type differs by training / validation)
  geom_line(aes(y = fit, linetype = status), color = "red") +

  # indicates validation range
  geom_vline(xintercept = c(min(df$time[df$status=="valid"]),
                            max(df$time[df$status=="valid"]))) +

  scale_x_date(breaks = scales::date_breaks('1 year'), 
               labels = scales::date_format('%b-%y')) +

  # hide legend for line type (comment this line out if you want to show it)
  scale_linetype(guide = F) + 

  # limits can be tweaked here
  coord_cartesian(ylim = c(1300, 2500)) +

  # plain white plot background for easier viewing
  theme_classic()

编辑：使图例更容易的替代解决方案：

# create long data frame where all values (original / training / validation) are
# in the same column
df2 <- rbind(tsd %>% select(time, value) %>%
               mutate(status = "original"),
             tsd_train_model %>% 
               rename(value = lm_fit_train) %>%
               mutate(status = "train"),
             forecast_df %>%
               mutate(status = "valid")) %>%
  mutate(yl = ifelse(status == "valid", value + percentile_5, NA),
         ym = ifelse(status == "valid", value + percentile_95, NA))

# in the scales for colour / line type, define the same labels in order to
# combine the two legends
ggplot(data = df2,
       aes(x = time)) +
  geom_ribbon(data = subset(df2, !is.na(yl)),
              aes(ymin = yl, ymax = ym, fill = "interval"), alpha = 0.2) +
  geom_line(aes(y = value, color = status, linetype = status)) +
  geom_vline(xintercept = c(min(df2$time[df$status=="valid"]),
                            max(df2$time[df$status=="valid"]))) +
  scale_x_date(breaks = scales::date_breaks('1 year'), 
               labels = scales::date_format('%b-%y')) +
  scale_color_manual(name = "",
                     values = c("original" = "blue",
                                "train" = "red",
                                "valid" = "red")) +
  scale_linetype_manual(name = "",
                 values = c("original" = "solid",
                            "train" = "solid",
                            "valid" = "longdash")) +
  scale_fill_manual(name = "",
                    values = c("interval" = "gray30")) +
  coord_cartesian(ylim = c(1300, 2500)) +
  theme_classic() +
  theme(legend.position = "bottom")

r - 使用ggplot在R中的时间序列中着色预测间隔

1 回答 1

Related

Reference