Timeseries in R from RStudio Conference 2020
Now to take a look at GDP using Random Forest, eXtreme Gradient Boosting (XGBoost), and Keras. This will use first differences because of the trouble that Random Forest and XGBoost models have with out-of-sample values: tree-based models cannot predict outside the range of the target values they were trained on. An example of that will be at the bottom.
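As a quick illustration of that extrapolation problem, here is a minimal sketch on synthetic data (the object names Trend and TrendRF are purely illustrative): a random forest fit to a trending series cannot predict above the largest target value it saw in training.
# Minimal sketch: a tree-based model cannot extrapolate a trend
library(randomForest)
set.seed(1)
Trend = data.frame(x = 1:100, y = 1:100 + rnorm(100))
TrendRF = randomForest(y ~ x, data = Trend)
# Ask for predictions well beyond the training range of x
predict(TrendRF, newdata = data.frame(x = 101:110))
# The predictions plateau near max(Trend$y) instead of continuing upward,
# which is why the GDP models below are fit on first differences rather than levels.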
library(quantmod)
library(lubridate)
library(tidyverse)
library(plotly)
library(randomForest)
library(xgboost)
library(tsibble)
library(fable)
library(keras)
library(tensorflow)
quantmod::getSymbols(Symbols = "GDP",src="FRED")
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "GDP"
# quantmod returns xts objects, so extract the values into a plain data frame and build a matching quarterly date sequence
Periods = length(GDP)
DateStart = date(GDP[1])
DateEnd = date(GDP[nrow(GDP)])
DateSeries = seq(DateStart, by = "quarter", length.out = Periods)
GDPData = data.frame("GDP" = numeric(length(GDP)))
GDPData$GDP = as.numeric(GDP$GDP)
# Create differences and add lags
GDPData = GDPData %>%
mutate(
"Diff" = c(NA, diff(GDP))
,"Lag1Diff" = lag(Diff, 1)
,"Lag2Diff" = lag(Diff, 2)
,"Lag3Diff" = lag(Diff, 3)
,"Lag4Diff" = lag(Diff, 4)
,"Lag5Diff" = lag(Diff, 5)
)
# Split data into training and test sets
# Target is the current-quarter change in GDP; predictors are its five lags
TrainY = data.matrix(GDPData$Diff[7:(Periods-10)])
TestY = data.matrix(GDPData$Diff[(Periods-9):Periods])
TrainX = data.matrix(GDPData[7:(Periods-10), 3:ncol(GDPData)])
TestX = data.matrix(GDPData[(Periods-9):Periods, 3:ncol(GDPData)])
First run will be a Random Forest model.
# Create model
RFModel = randomForest(x = TrainX
,y = as.vector(TrainY)
,ntree = 2000
,mtry = 2
,nodesize = 25)
# Plot train-period fit (predict() with no newdata returns out-of-bag predictions)
RFCheck = predict(RFModel)
plot(x = DateSeries[7:(Periods-10)], y = GDPData$Diff[7:(Periods-10)])
lines(x = DateSeries[7:(Periods-10)], y = RFCheck, col = "red")
# Plot test fit
RFPredict = predict(RFModel, newdata = TestX)
plot(x = DateSeries[(Periods-9):Periods], y = GDPData$Diff[(Periods-9):Periods])
lines(x = DateSeries[(Periods-9):Periods], y = RFPredict, col = "blue")
# RMSE Test
RFRMSE = sum(sqrt((TestY - RFPredict)^2))
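As an aside, sum(sqrt((TestY - RFPredict)^2)) simplifies to the sum of absolute errors rather than a conventional root mean squared error; the same formula is reused for every model below, so the comparison is still like-for-like. If a true RMSE is preferred, a small helper could be used instead (a sketch, not what the reported numbers use):
# Conventional RMSE; not used for the values reported in the comparison below
RMSE = function(actual, predicted) sqrt(mean((actual - predicted)^2))
RMSE(TestY, RFPredict)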
Then an XGBoost model.
# Create model
XGBParam = list(
eta = .1
,gamma = .1
,min_child_weight = .01
,max_depth = 5
,subsample = 1
,colsample_bynode = 1
,tree_method = "exact"
,objective = "reg:squarederror"
)
XGBModel = xgb.train(data = xgb.DMatrix(TrainX, label = TrainY)
,nthread = 7
,nrounds = 2000
,watchlist = list(eval = xgb.DMatrix(TestX, label = TestY))
,early_stopping_rounds = 20
,params = XGBParam
,verbose = FALSE)
# Plot train fit
XGBCheck = predict(XGBModel, newdata = xgb.DMatrix(TrainX))
plot(x = DateSeries[7:(Periods-10)], y = GDPData$Diff[7:(Periods-10)])
lines(x = DateSeries[7:(Periods-10)], y = XGBCheck, col = "red")
# Plot test fit
XGBPredict = predict(XGBModel, newdata = xgb.DMatrix(TestX))
plot(x = DateSeries[(Periods-9):Periods], y = GDPData$Diff[(Periods-9):Periods])
lines(x = DateSeries[(Periods-9):Periods], y = XGBPredict, col = "blue")
# RMSE Test
XGBRMSE = sum(sqrt((TestY - XGBPredict)^2))
Then a simple dense (fully connected) neural network with Keras. (The variable names carry a CNN prefix, but these models use only dense layers, not convolutions.)
# Define model
model = keras_model_sequential() %>%
layer_dense(units=512
,input_shape=c(ncol(TrainX))
,activation="relu"
) %>%
layer_dropout(.2) %>%
layer_dense(units=1, activation = "linear")
# Define loss and optimization
model %>% compile(loss = 'mse'
,optimizer = optimizer_rmsprop()
,metrics = list("mean_squared_error")
)
# Fit model
history <- model %>%
fit(TrainX,
TrainY,
validation_split = .2,
batch_size = 32,
epochs = 100,
callbacks = list(
callback_early_stopping(patience = 20,
restore_best_weights = TRUE),
callback_reduce_lr_on_plateau(factor = 0.1,
patience = 5)
)
,verbose = FALSE
)
# Plot training and validation by epoch
plot(history)
## `geom_smooth()` using formula 'y ~ x'
# check how model performs against original data
CheckCNN = model %>% predict(TrainX)
plot(x = DateSeries[7:(Periods-10)], y = GDPData$Diff[7:(Periods-10)])
lines(x = DateSeries[7:(Periods-10)], y = CheckCNN, col = "red")
# Plot test fit
PredictCNN = model %>% predict(TestX)
plot(x = DateSeries[(Periods-9):Periods], y = GDPData$Diff[(Periods-9):Periods])
lines(x = DateSeries[(Periods-9):Periods], y = PredictCNN, col = "blue")
# RMSE Test
CNNRMSE = sum(sqrt((TestY - PredictCNN)^2))
Then a more complex dense neural network, with two hidden layers.
# Define model
model = keras_model_sequential() %>%
layer_dense(units=128
,input_shape=c(ncol(TrainX))
,activation="relu"
) %>%
layer_dropout(.2) %>%
layer_dense(units=64
,activation="relu"
) %>%
layer_dropout(.2) %>%
layer_dense(units=1, activation = "linear")
# Define loss and optimization
model %>% compile(loss = 'mse'
,optimizer = optimizer_rmsprop()
,metrics = list("mean_squared_error")
)
# Fit model
history <- model %>%
fit(TrainX,
TrainY,
validation_split = .1,
batch_size = 8,
epochs = 100,
callbacks = list(
callback_early_stopping(patience = 20,
restore_best_weights = TRUE),
callback_reduce_lr_on_plateau(factor = 0.01,
patience = 5)
)
,verbose = FALSE
)
# Plot training and validation by epoch
plot(history)
## `geom_smooth()` using formula 'y ~ x'
# check how model performs against original data
CheckCCNN = model %>% predict(TrainX)
plot(x = DateSeries[7:(Periods-10)], y = GDPData$Diff[7:(Periods-10)])
lines(x = DateSeries[7:(Periods-10)], y = CheckCCNN, col = "red")
# Plot test fit
PredictCCNN = model %>% predict(TestX)
plot(x = DateSeries[(Periods-9):Periods], y = GDPData$Diff[(Periods-9):Periods])
lines(x = DateSeries[(Periods-9):Periods], y = PredictCCNN, col = "blue")
# RMSE Test
CCNNRMSE = sum(sqrt((TestY - PredictCCNN)^2))
Then a deeper dense neural network, with four hidden layers and no dropout.
# Define model
model = keras_model_sequential() %>%
layer_dense(units=1024
,input_shape=c(ncol(TrainX))
,activation="relu"
) %>%
layer_dense(units=512
,activation="relu"
) %>%
layer_dense(units=256
,activation="relu"
) %>%
layer_dense(units=128
,activation="relu"
) %>%
layer_dense(units=1, activation = "linear")
# Define loss and optimization
model %>% compile(loss = 'mse'
,optimizer = optimizer_rmsprop()
,metrics = list("mean_squared_error")
)
# Fit model
history <- model %>%
fit(TrainX,
TrainY,
validation_split = .1,
batch_size = 8,
epochs = 100,
callbacks = list(
callback_early_stopping(patience = 20,
restore_best_weights = TRUE),
callback_reduce_lr_on_plateau(factor = 0.01,
patience = 5)
)
,verbose = FALSE
)
# Plot training and validation by epoch
plot(history)
## `geom_smooth()` using formula 'y ~ x'
# check how model performs against original data
CheckVCNN = model %>% predict(TrainX)
plot(x = DateSeries[7:(Periods-10)], y = GDPData$Diff[7:(Periods-10)])
lines(x = DateSeries[7:(Periods-10)], y = CheckVCNN, col = "red")
# Plot test fit
PredictVCNN = model %>% predict(TestX)
plot(x = DateSeries[(Periods-9):Periods], y = GDPData$Diff[(Periods-9):Periods])
lines(x = DateSeries[(Periods-9):Periods], y = PredictVCNN, col = "blue")
# RMSE Test
VCNNRMSE = sum(sqrt((TestY - PredictVCNN)^2))
And finally a Long Short-Term Memory (LSTM) model with Keras.
# Reshape the lag matrices into 3D arrays of (samples, timesteps, features) for the LSTM
TrainArray = array(data = TrainX, dim = c(nrow(TrainX),ncol(TrainX),1))
TestArray = array(data = TestX, dim = c(nrow(TestX),ncol(TestX),1))
# Define model
model = keras_model_sequential() %>%
layer_lstm(units=512
,input_shape=c(dim(TrainArray)[2],dim(TrainArray)[3])
,activation="relu"
,return_sequences = TRUE
) %>%
layer_dropout(.4) %>%
layer_lstm(units=512
,activation="relu"
) %>%
layer_dropout(.4) %>%
layer_dense(units=1, activation = "linear")
# Define loss and optimization
model %>% compile(loss = 'mse'
,optimizer = optimizer_rmsprop()
,metrics = list("mean_squared_error")
)
# Fit model
history <- model %>%
fit(TrainArray,
TrainY,
validation_split = .1,
batch_size = 32,
epochs = 100,
callbacks = list(
callback_early_stopping(patience = 20,
restore_best_weights = TRUE),
callback_reduce_lr_on_plateau(factor = 0.1,
patience = 5)
)
,verbose = FALSE
)
# Plot training and validation by epoch
plot(history)
## `geom_smooth()` using formula 'y ~ x'
# check how model performs against original data
CheckLSTM = model %>% predict(TrainArray)
plot(x = DateSeries[7:(Periods-10)], y = GDPData$Diff[7:(Periods-10)])
lines(x = DateSeries[7:(Periods-10)], y = CheckLSTM, col = "red")
# Plot test fit
PredictLSTM = model %>% predict(TestArray)
plot(x = DateSeries[(Periods-9):Periods], y = GDPData$Diff[(Periods-9):Periods])
lines(x = DateSeries[(Periods-9):Periods], y = PredictLSTM, col = "blue")
# RMSE Test
LSTMRMSE = sum(sqrt((TestY - PredictLSTM)^2))
Ending with a comparison of each model's test error, gathered into a single table after the individual values.
# Random Forest
RFRMSE
## [1] 553.1265
# eXtreme Gradient Boost
XGBRMSE
## [1] 56.03166
# Simple dense neural network
CNNRMSE
## [1] 33.81289
# Complex dense neural network
CCNNRMSE
## [1] 103.4713
# Very complex dense neural network
VCNNRMSE
## [1] 75.55692
# Long Short-Term Memory Neural Network
LSTMRMSE
## [1] 208.1026
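For a side-by-side view, the same values can be gathered into a single table and sorted (a small convenience sketch using the objects already created above):
# Collect the test errors into one comparison table, lowest error first
Comparison = data.frame(
  Model = c("Random Forest", "XGBoost", "Simple dense NN",
            "Complex dense NN", "Very complex dense NN", "LSTM")
  ,Error = c(RFRMSE, XGBRMSE, CNNRMSE, CCNNRMSE, VCNNRMSE, LSTMRMSE)
)
Comparison[order(Comparison$Error), ]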