- Some basic forecasting methods
- Forecasting time series with complex seasonality
- Dealing with missing values and outliers
School of Economics and Management
Beihang University
http://yanfei.site
library(fpp2) library(ggplot2) library(forecast) autoplot(beer, main = 'Australian quarterly beer production')
autoplot(pigs, main = 'Number of pigs slaughtered in Victoria')
autoplot(dowjones, main = 'Dow−Jones index')
beer <- window(ausbeer, start = 1992, end = c(2005,4)) beer.arima <- auto.arima(beer) autoplot(forecast(beer.arima))
Average method: forecast of all future values is equal to mean of historical data.
Naïve method: forecasts equal to last observed value.
Seasonal naïve method: forecasts equal to last value from same season.
## average method beerfit.mean <- meanf(beer, h = 11) ## naive method beerfit.naive <- naive(beer, h = 11) ## seasonal naive method beerfit.snaive <- snaive(beer, h = 11) cols <- c("mean" = "#0000ee","naive" = "#ee0000","snaive" = "green") ## plot autoplot(beerfit.mean, PI = FALSE, main ='Forecasts for quarterly beer production') + geom_line(aes(x=time(beerfit.mean$mean),y=beerfit.mean$mean,colour='mean')) + geom_line(aes(x=time(beerfit.naive$mean),y=beerfit.naive$mean,colour='naive')) + geom_line(aes(x=time(beerfit.snaive$mean),y=beerfit.snaive$mean,colour='snaive')) + guides(fill=FALSE) + scale_colour_manual(name="Forecasts",values=cols)
## check whether the residuals are random checkresiduals(beerfit.mean)
## ## Ljung-Box test ## ## data: Residuals from Mean ## Q* = 148.53, df = 7, p-value < 2.2e-16 ## ## Model df: 1. Total lags used: 8
checkresiduals(beerfit.naive)
## ## Ljung-Box test ## ## data: Residuals from Naive method ## Q* = 130.18, df = 8, p-value < 2.2e-16 ## ## Model df: 0. Total lags used: 8
checkresiduals(beerfit.snaive)
## ## Ljung-Box test ## ## data: Residuals from Seasonal naive method ## Q* = 32.302, df = 8, p-value = 8.224e-05 ## ## Model df: 0. Total lags used: 8
## check accuracy of forecasting beer2 <- window(ausbeer, start=2006) accuracy(beerfit.mean, beer2)
## ME RMSE MAE MPE MAPE MASE ## Training set 8.121418e-15 44.17630 35.91135 -0.9510944 7.995509 2.444228 ## Test set -1.718344e+01 38.01454 33.77760 -4.7345524 8.169955 2.298999 ## ACF1 Theil's U ## Training set -0.12566970 NA ## Test set -0.08286364 0.7901651
accuracy(beerfit.naive, beer2)
## ME RMSE MAE MPE MAPE MASE ## Training set 0.7090909 66.60207 55.43636 -0.8987351 12.26632 3.773156 ## Test set -62.2727273 70.90647 63.90909 -15.5431822 15.87645 4.349833 ## ACF1 Theil's U ## Training set -0.25475212 NA ## Test set -0.08286364 1.428524
accuracy(beerfit.snaive, beer2)
## ME RMSE MAE MPE MAPE MASE ## Training set -1.846154 17.24261 14.69231 -0.4803931 3.401224 1.0000000 ## Test set -2.545455 12.96849 11.27273 -0.7530978 2.729847 0.7672537 ## ACF1 Theil's U ## Training set -0.3408329 NA ## Test set -0.1786912 0.22573
Time series can be decomposed into:
fit <- stl(elecequip, t.window=15, s.window="periodic", robust=TRUE) autoplot(fit)
fcast <- forecast(fit, method="naive", h=24) autoplot(fcast) + ylab("New orders index")
ARMA models: \(y_t = c + \phi_1 y_{t-1} + \cdots + \phi_p y_{t-p} + \theta_1 e_{t-1} + \cdots + \theta_1 e_{t-q}\).
ARIMA models: \((1-B)^dy_t\) follows an ARMA model.
fit <- auto.arima(internet) autoplot(forecast(fit))
checkresiduals(fit)
## ## Ljung-Box test ## ## data: Residuals from ARIMA(1,1,1) ## Q* = 7.8338, df = 8, p-value = 0.4499 ## ## Model df: 2. Total lags used: 10
fit <- auto.arima(h02) autoplot(forecast(fit))
checkresiduals(fit)
## ## Ljung-Box test ## ## data: Residuals from ARIMA(4,1,1)(0,1,2)[12] ## Q* = 26.081, df = 17, p-value = 0.07299 ## ## Model df: 7. Total lags used: 24
The general form of a multiple regression model is \[y_t = \beta_0 + \beta_1 x_{1,t} + \beta_2 x_{2,t} + \cdots + \beta_k x_{k,t} + \epsilon_t.\]
Assumptions:
autoplot(uschange, facet=TRUE)
fit.consMR <- tslm( Consumption ~ Income + Production + Unemployment + Savings, data=uschange) summary(fit.consMR)
## ## Call: ## tslm(formula = Consumption ~ Income + Production + Unemployment + ## Savings, data = uschange) ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.88296 -0.17638 -0.03679 0.15251 1.20553 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 0.26729 0.03721 7.184 1.68e-11 *** ## Income 0.71449 0.04219 16.934 < 2e-16 *** ## Production 0.04589 0.02588 1.773 0.0778 . ## Unemployment -0.20477 0.10550 -1.941 0.0538 . ## Savings -0.04527 0.00278 -16.287 < 2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.3286 on 182 degrees of freedom ## Multiple R-squared: 0.754, Adjusted R-squared: 0.7486 ## F-statistic: 139.5 on 4 and 182 DF, p-value: < 2.2e-16
autoplot(uschange[,'Consumption'], series="Data") + autolayer(fitted(fit.consMR), series="Fitted") + xlab("Year") + ylab("") + ggtitle("Percent change in US consumption expenditure") + guides(colour=guide_legend(title=" "))
checkresiduals(fit.consMR)
## ## Breusch-Godfrey test for serial correlation of order up to 8 ## ## data: Residuals from Linear regression model ## LM test = 14.874, df = 8, p-value = 0.06163
p1 <- autoplot(calls) + ylab("Call volume") + xlab("Weeks") + scale_x_continuous(breaks=seq(1,33,by=2)) p2 <- autoplot(window(calls, end=4)) + ylab("Call volume") + xlab("Weeks") + scale_x_continuous(minor_breaks = seq(1,4,by=0.2)) gridExtra::grid.arrange(p1,p2)
calls %>% mstl() %>% autoplot() + xlab("Week")
calls %>% stlf() %>% autoplot() + xlab("Week")
In our case, the seasonal periods are 169 and 845, so the Fourier terms are of the form
\[sin(\frac{2\pi kt}{169}), ~cos(\frac{2\pi kt}{169}), ~sin(\frac{2\pi kt}{845}), ~cos(\frac{2\pi kt}{845}).\] for \(k = 1, 2, \dots\).
calls <- subset(calls, start=length(calls)-2000) fit <- auto.arima(calls, seasonal=FALSE, lambda=0, xreg=fourier(calls, K=c(10,10))) fit %>% forecast(xreg=fourier(calls, K=c(10,10), h=2*169)) %>% autoplot(include=5*169) + ylab("Call volume") + xlab("Weeks")
We can replace the missing values with estimates.
gold2 <- na.interp(gold) autoplot(gold2, series="Interpolated") + autolayer(gold, series="Original") + scale_color_manual( values=c(`Interpolated`="red",`Original`="gray"))
tsoutliers()
function is designed to identify outliers, and to suggest potential replacement values.tsoutliers(gold)
## $index ## [1] 770 ## ## $replacements ## [1] 494.9
tsclean()
which identifies and replaces outliers, and also replaces missing values.gold %>% tsclean() %>% auto.arima() %>% forecast(h=50) %>% autoplot()