Logistics

Piazza

Moving average

?filter
?acf
set.seed(530476)
N = 16
w = rnorm(N)
x = filter(w, filter=rep(1/5,5))

some_ts = cbind(w, x)
some_ts
## Time Series:
## Start = 1 
## End = 16 
## Frequency = 1 
##              w           x
##  1  1.23631391          NA
##  2  0.78710744          NA
##  3  0.81519587  1.10301804
##  4  1.19440525  0.63245977
##  5  1.48206772  0.32931051
##  6 -1.11647744  0.14128862
##  7 -0.72863888 -0.27947937
##  8 -0.12491356 -0.14966540
##  9 -0.90943470  0.09717911
## 10  2.13113758  0.25192277
## 11  0.11774511  0.47017839
## 12  0.04507944  0.82286839
## 13  0.96636451  0.78492754
## 14  0.85401530  0.86675345
## 15  1.94143334          NA
## 16  0.52687464          NA
plot(some_ts)

x_acf = acf(x, na.action = na.omit)

str(acf(x, na.action = na.pass, plot=F))
## List of 6
##  $ acf   : num [1:13, 1, 1] 1 0.64 0.265 -0.133 -0.478 ...
##  $ type  : chr "correlation"
##  $ n.used: int 16
##  $ lag   : num [1:13, 1, 1] 0 1 2 3 4 5 6 7 8 9 ...
##  $ series: chr "x"
##  $ snames: NULL
##  - attr(*, "class")= chr "acf"
str(acf(x, na.action = na.omit, plot=F))
## List of 6
##  $ acf   : num [1:11, 1, 1] 1 0.64 0.265 -0.133 -0.478 ...
##  $ type  : chr "correlation"
##  $ n.used: int 12
##  $ lag   : num [1:11, 1, 1] 0 1 2 3 4 5 6 7 8 9 ...
##  $ series: chr "x"
##  $ snames: NULL
##  - attr(*, "class")= chr "acf"
data.frame(lag = x_acf$lag,
           acf = x_acf$acf)
##    lag         acf
## 1    0  1.00000000
## 2    1  0.64044331
## 3    2  0.26515841
## 4    3 -0.13337146
## 5    4 -0.47796442
## 6    5 -0.53302529
## 7    6 -0.43248404
## 8    7 -0.27296786
## 9    8 -0.02072622
## 10   9  0.15040616
## 11  10  0.16646444

By the way, could you tell what is the difference in those calls?

Transformation

Sedimentary deposits from one location in Massachusetts for 634 years, beginning nearly 12,000 years ago.

?astsa::varve
varve_more = cbind(astsa::varve, log(astsa::varve), diff(log(astsa::varve)))
plot(varve_more)

?log
?diff

Seasonality

Lets look at some built-in datasets in R.

library(help = "datasets")

For example, Quarterly Approval Ratings of US Presidents

presidents
##      Qtr1 Qtr2 Qtr3 Qtr4
## 1945   NA   87   82   75
## 1946   63   50   43   32
## 1947   35   60   54   55
## 1948   36   39   NA   NA
## 1949   69   57   57   51
## 1950   45   37   46   39
## 1951   36   24   32   23
## 1952   25   32   NA   32
## 1953   59   74   75   60
## 1954   71   61   71   57
## 1955   71   68   79   73
## 1956   76   71   67   75
## 1957   79   62   63   57
## 1958   60   49   48   52
## 1959   57   62   61   66
## 1960   71   62   61   57
## 1961   72   83   71   78
## 1962   79   71   62   74
## 1963   76   64   62   57
## 1964   80   73   69   69
## 1965   71   64   69   62
## 1966   63   46   56   44
## 1967   44   52   38   46
## 1968   36   49   35   44
## 1969   59   65   65   56
## 1970   66   53   61   52
## 1971   51   48   54   49
## 1972   49   61   NA   NA
## 1973   68   44   40   27
## 1974   28   25   24   24
plot(presidents, main="Quarterly Approval Ratings of US Presidents")

For example, Monthly Airline Passenger Numbers 1949-1960

AirPassengers
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949 112 118 132 129 121 135 148 148 136 119 104 118
## 1950 115 126 141 135 125 149 170 170 158 133 114 140
## 1951 145 150 178 163 172 178 199 199 184 162 146 166
## 1952 171 180 193 181 183 218 230 242 209 191 172 194
## 1953 196 196 236 235 229 243 264 272 237 211 180 201
## 1954 204 188 235 227 234 264 302 293 259 229 203 229
## 1955 242 233 267 269 270 315 364 347 312 274 237 278
## 1956 284 277 317 313 318 374 413 405 355 306 271 306
## 1957 315 301 356 348 355 422 465 467 404 347 305 336
## 1958 340 318 362 348 363 435 491 505 404 359 310 337
## 1959 360 342 406 396 420 472 548 559 463 407 362 405
## 1960 417 391 419 461 472 535 622 606 508 461 390 432
plot(cbind(AirPassengers, log10(AirPassengers)),
     main="Monthly Airline Passenger Numbers 1949-1960")

AirPassengers_df = data.frame(
  p = log10(AirPassengers),
  t = time(AirPassengers),
  m = as.factor(cycle(AirPassengers))
)
head(AirPassengers_df)
##          p        t m
## 1 2.049218 1949.000 1
## 2 2.071882 1949.083 2
## 3 2.120574 1949.167 3
## 4 2.110590 1949.250 4
## 5 2.082785 1949.333 5
## 6 2.130334 1949.417 6
makeDF = function(x) {
  df = data.frame(
    y = log10(x),
    t = time(x),
    s = as.factor(cycle(x))
  )
}
seaLS1 = function(x) {
  df = makeDF(x)
  lm(y ~ t - 1 + s, data = df)
}
seaLS2 = function(x) {
  df = makeDF(x)
  lm(y ~ t + s, data = df)
}
seaLS3 = function(x) {
  df = makeDF(x)
  lm(y ~ t + s, data = df, contrasts = list(s = contr.sum))
}
seaLS4 = function(x) {
  df = makeDF(x)
  lm(y ~ t, data = df)
}

mod1 = seaLS1(AirPassengers)
mod2 = seaLS2(AirPassengers)
mod3 = seaLS3(AirPassengers)
mod4 = seaLS4(AirPassengers)

Can you tell the difference between seaLS1, seaLS2, seaLS3 and seaLS4? (Try model.matrix(mod1) etc!)

summary(mod1)
## 
## Call:
## lm(formula = y ~ t - 1 + s, data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.067911 -0.017813  0.001597  0.019139  0.057468 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## t    5.247e-02  6.217e-04   84.40   <2e-16 ***
## s1  -1.002e+02  1.215e+00  -82.47   <2e-16 ***
## s2  -1.002e+02  1.215e+00  -82.47   <2e-16 ***
## s3  -1.002e+02  1.215e+00  -82.42   <2e-16 ***
## s4  -1.002e+02  1.215e+00  -82.43   <2e-16 ***
## s5  -1.002e+02  1.215e+00  -82.43   <2e-16 ***
## s6  -1.001e+02  1.215e+00  -82.38   <2e-16 ***
## s7  -1.001e+02  1.216e+00  -82.34   <2e-16 ***
## s8  -1.001e+02  1.216e+00  -82.34   <2e-16 ***
## s9  -1.002e+02  1.216e+00  -82.39   <2e-16 ***
## s10 -1.002e+02  1.216e+00  -82.43   <2e-16 ***
## s11 -1.003e+02  1.216e+00  -82.48   <2e-16 ***
## s12 -1.002e+02  1.216e+00  -82.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02576 on 131 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9999 
## F-statistic: 9.734e+04 on 13 and 131 DF,  p-value: < 2.2e-16
summary(mod2)
## 
## Call:
## lm(formula = y ~ t + s, data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.067911 -0.017813  0.001597  0.019139  0.057468 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.002e+02  1.215e+00 -82.467  < 2e-16 ***
## t            5.247e-02  6.217e-04  84.399  < 2e-16 ***
## s2          -9.578e-03  1.051e-02  -0.911  0.36400    
## s3           4.698e-02  1.052e-02   4.468 1.69e-05 ***
## s4           3.340e-02  1.052e-02   3.176  0.00186 ** 
## s5           3.237e-02  1.052e-02   3.078  0.00254 ** 
## s6           8.542e-02  1.052e-02   8.121 2.98e-13 ***
## s7           1.306e-01  1.052e-02  12.411  < 2e-16 ***
## s8           1.265e-01  1.052e-02  12.026  < 2e-16 ***
## s9           6.371e-02  1.052e-02   6.054 1.39e-08 ***
## s10          3.705e-03  1.052e-02   0.352  0.72537    
## s11         -5.871e-02  1.053e-02  -5.577 1.34e-07 ***
## s12         -9.260e-03  1.053e-02  -0.879  0.38082    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02576 on 131 degrees of freedom
## Multiple R-squared:  0.9835, Adjusted R-squared:  0.982 
## F-statistic: 649.4 on 12 and 131 DF,  p-value: < 2.2e-16
summary(mod3)
## 
## Call:
## lm(formula = y ~ t + s, data = df, contrasts = list(s = contr.sum))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.067911 -0.017813  0.001597  0.019139  0.057468 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.002e+02  1.215e+00 -82.419  < 2e-16 ***
## t            5.247e-02  6.217e-04  84.399  < 2e-16 ***
## s1          -3.709e-02  7.124e-03  -5.207 7.25e-07 ***
## s2          -4.667e-02  7.122e-03  -6.553 1.18e-09 ***
## s3           9.887e-03  7.121e-03   1.388 0.167355    
## s4          -3.693e-03  7.119e-03  -0.519 0.604821    
## s5          -4.724e-03  7.119e-03  -0.664 0.508153    
## s6           4.832e-02  7.118e-03   6.789 3.56e-10 ***
## s7           9.347e-02  7.118e-03  13.130  < 2e-16 ***
## s8           8.943e-02  7.119e-03  12.562  < 2e-16 ***
## s9           2.661e-02  7.119e-03   3.738 0.000276 ***
## s10         -3.339e-02  7.121e-03  -4.689 6.81e-06 ***
## s11         -9.580e-02  7.122e-03 -13.451  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02576 on 131 degrees of freedom
## Multiple R-squared:  0.9835, Adjusted R-squared:  0.982 
## F-statistic: 649.4 on 12 and 131 DF,  p-value: < 2.2e-16
summary(mod4)
## 
## Call:
## lm(formula = y ~ t, data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.134016 -0.045113 -0.007798  0.042291  0.128280 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -99.969307   2.839818  -35.20   <2e-16 ***
## t             0.052367   0.001453   36.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06038 on 142 degrees of freedom
## Multiple R-squared:  0.9015, Adjusted R-squared:  0.9008 
## F-statistic:  1300 on 1 and 142 DF,  p-value: < 2.2e-16
anova(mod1, mod4)
## Analysis of Variance Table
## 
## Model 1: y ~ t - 1 + s
## Model 2: y ~ t
##   Res.Df     RSS  Df Sum of Sq      F    Pr(>F)    
## 1    131 0.08690                                   
## 2    142 0.51774 -11  -0.43085 59.047 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_two_hists = function(x1, x2) {
  colvec = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5))
  rangevec = c(min(x1, x2), max(x1, x2))
  hist(x1,
       breaks=seq(rangevec[1], rangevec[2], length.out=20),
       col=colvec[1],
       xlim=rangevec,
       main="Residual",
       xlab="residual"
  )
  hist(x2,
       breaks=seq(rangevec[1], rangevec[2], length.out=20),
       col=colvec[2],
       xlim=rangevec,
       add=T
  )
  legend("topright",
         legend=c(deparse(substitute(x1)), deparse(substitute(x2))),
         col=colvec,
         pch=15)
}
plot_two_hists(resid(mod1), resid(mod4))

qqnorm(resid(mod1)); qqline(resid(mod1))

qqnorm(resid(mod4)); qqline(resid(mod4))

Look at the example of JohnsonJohnson

?JohnsonJohnson

The help of JohnsonJohnson has the example:

JJ <- log10(JohnsonJohnson)
plot(cbind(x=JohnsonJohnson, logx=JJ),
     main="Johnson & Johnson Quarterly Earnings")
fit <- StructTS(JJ, type = "BSM")
tsdiag(fit)
sm <- tsSmooth(fit)
plot(cbind(JJ, sm[, 1], sm[, 3]-0.5), plot.type = "single",
     col = c("black", "green", "blue"))
abline(h = -0.5, col = "grey60")

monthplot(fit)