R Code

The following generic function cv() has a default method that should work with most statistical models in R, and so is more general than the cv.glm() function in the boot package. The helper function getModelData() isn't completely bulletproof, but in most cases it should be able to retrieve the data to which the model was fit, making it unnecessary to specify the data argument to cv(). The mse() and BayesRule() functions provide cross-validation criteria for numeric and binary responses, respectively. cv.default() also supports parallel computation on computers with multiple processors or cores.
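
For orientation, once the functions below are defined a typical call looks like this (a preview of the examples worked later in this document; the choice of 10 folds is purely illustrative):

cv(lm(mpg ~ horsepower, data=ISLR::Auto), k=10) # 10-fold CV of a linear model with the default mse criterion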

getModelData <- function(model) {

  # returns a data frame with the data to which the model was fit

  # model: a statistical model object that responds to model.frame() and formula() 

  data1 <- data <- model.frame(model)

  vars <- all.vars(formula(model))

  if ("pi" %in% vars) {

    vars <- setdiff(vars, "pi")

    message("the symbol 'pi' is treated as a numeric constant in the model formula")

  }

  cols <- colnames(data)

  check <- vars %in% cols

  if (!(all(check))) {

    missing.cols <- !check

    data1 <- expand.model.frame(model, vars[missing.cols])

  }

  missing.cols <- !cols %in% colnames(data1)

  if (any(missing.cols)) {

    data1 <- cbind(data1, data[missing.cols])

  }

  cols <- colnames(data1)

  valid <- make.names(cols) == cols | grepl("^\\(.*\\)$", cols)

  data1[valid]

}
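
# A quick check of getModelData() (a toy model, not one used in the text; the object
# name m.test is arbitrary). With transformed terms in the formula, the function
# recovers the raw variables rather than the transformed model-frame columns:
m.test <- lm(mpg ~ log(horsepower) + I(weight/1000), data=ISLR::Auto)
head(getModelData(m.test)) # columns mpg, horsepower, and weight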



mse <- function(y, yhat){

  mean((y - yhat)^2)

}



BayesRule <- function(y, yhat){

  y <- as.numeric(y) - 1 # convert a two-level factor response to 0/1

  yhat <- round(yhat) # classify by rounding the predicted probability

  mean(y != yhat) # proportion in error

}
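
# A small numerical check of the two criteria (toy vectors, not data from the text).
# Note that BayesRule() assumes a two-level factor response and predicted probabilities:
mse(c(0, 2, 1, 3), c(0.5, 1.5, 1, 2))                                  # 0.375
BayesRule(factor(c("No", "Yes", "No", "Yes")), c(0.2, 0.7, 0.6, 0.4))  # 0.5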



# a generally more efficient update() method for "glm" objects

# that uses the coefficients from the overall fit for start values

update.glm <- function(object, ...){

  update.default(object, start=coef(object), ...)

}
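
# For example (a hypothetical check; this model and these data are not used in the text),
# dropping a case and refitting via update() now warm-starts the IWLS iterations at the
# full-sample coefficients:
m.logit <- glm(am ~ wt + hp, data=mtcars, family=binomial)
m.logit.1 <- update(m.logit, data=mtcars[-1, ]) # dispatches to update.glm(), refits with start=coef(m.logit)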



cv <- function(model, data, criterion, k, ...){

  # Cross Validation

  # Args:

  #   model: model object

  #   data: data frame to which the model was fit

  #   criterion: cross-validation criterion function

  #   k: perform k-fold cross-validation

  UseMethod("cv")

} 



cv.default <- function(model, data=getModelData(model), criterion=mse, k=nrow(data), 

                       predict.type="response", parallel=FALSE, 

                       ncores=parallel::detectCores(logical=FALSE), 

                       remove.missing=TRUE, ...){

  # Args:

  #   model: a model object that responds to model.frame(), update(), and predict() 

  #   data: data frame to which the model was fit (not usually necessary)

  #   criterion: cross-validation criterion function of form f(y.obs, y.fitted) 

  #              (default is mse)

  #   k: perform k-fold cross-validation (default is n-fold)

  #   predict.type: to be passed to type argument of predict() (default is "response")

  #   parallel: do computations in parallel? (default is FALSE)

  #   ncores: number of cores to use for parallel computations 

  #           (default is number of physical cores detected)

  #   remove.missing: use only complete cases in data (default is TRUE)

  #   ...: to match generic

  # Returns: a "cv" object with the cv criterion averaged across the folds,

  #          the bias-adjusted averaged cv criterion,

  #          the criterion applied to the model fit to the full data set,

  #          and the initial state of R's random number generator

  f <- function(i){

    # helper function to compute cv criterion for each fold

    indices.i <- indices[starts[i]:ends[i]]

    model.i <- update(model, data=data[ - indices.i, ])

    fit.i <- predict(model.i, newdata=data[indices.i, ], type=predict.type) # predictions for the omitted fold

    fit.o.i <- predict(model.i, newdata=data, type=predict.type) # predictions for all cases (for the bias adjustment)

    c(criterion(y[indices.i], fit.i), criterion(y, fit.o.i))

  }

  if (!exists(".Random.seed")) runif(1) # initialize the RNG if it hasn't been used yet

  save.seed <- .Random.seed

  y <- model.response(model.frame(model))

  if (remove.missing && any(incomplete <- !complete.cases(data))){

    data <- na.omit(data)

    warning(sum(incomplete), " cases removed due to missingness")

  }

  n <- nrow(data)  

  if (!is.numeric(k) || length(k) > 1 || k > n || k < 2 || k != round(k)){

    stop("k must be an integer between 2 and n")

  }

  nk <-  n %/% k # number of cases in each fold

  rem <- n %% k  # remainder

  folds <- rep(nk, k) + c(rep(1, rem), rep(0, k - rem)) # allocate remainder

  ends <- cumsum(folds) # end of each fold

  starts <- c(1, ends + 1)[-(k + 1)] # start of each fold

  indices <- sample(n, n) # permute cases

  if (parallel && ncores > 1){

    if (!require("doParallel")) stop("doParallel package is missing")

    cl <- makeCluster(ncores) 

    registerDoParallel(cl)

    result <- foreach(i = 1:k, .combine=rbind) %dopar% {

      f(i)

    }

    stopCluster(cl)

  } else {

    result <- matrix(0, k, 2)

    for (i in 1:k){

      result[i, ] <- f(i)

    }

  }

  cv <- weighted.mean(result[, 1], folds)

  cv.full <- criterion(y, predict(model, newdata=data, type=predict.type))

  adj.cv <- cv + cv.full - weighted.mean(result[, 2], folds)

  result <- list("CV crit" = cv, "adj CV crit" = adj.cv, "full crit" = cv.full, 

                 "k" = if (k == n) 0 else k, initial.random.seed=save.seed)

  class(result) <- "cv"

  result

}



# NB: there really should be a cv.lm() method that works more efficiently by 

#     updating the QR decomposition for the least-squares fit to the full data
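
# A minimal sketch of such a method (an illustration only, not part of the session that
# produced the output shown below, which uses cv.default() for lm fits as well): it covers
# only the default leave-one-out/mse case for unweighted fits via the standard hat-value
# (PRESS) shortcut rather than the QR-updating scheme suggested above, and otherwise
# falls through to cv.default().
cv.lm <- function(model, data=getModelData(model), criterion=mse,
                  k=nrow(data), ...){
  if (!identical(criterion, mse) || k != nrow(data) || !is.null(weights(model))){
    return(cv.default(model, data=data, criterion=criterion, k=k, ...))
  }
  y <- model.response(model.frame(model))
  h <- hatvalues(model)
  cv.loo <- mean((residuals(model)/(1 - h))^2) # LOO CV = mean squared deleted residual
  result <- list("CV crit" = cv.loo,
                 "adj CV crit" = NA, # bias adjustment not computed by this shortcut
                 "full crit" = mse(y, fitted(model)),
                 "k" = 0, initial.random.seed = NA)
  class(result) <- "cv"
  result
}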



print.cv <- function(x, ...){

  k <- x[["k"]]

  if (k == 0){

    cat("Leave-One-Out Cross-Validatiohn")

  } else {

    cat(k, "-Fold Cross Validation", sep="")

  }

  cat("\ncross-validation criterion =", x[["CV crit"]])

  cat("\nbias-adjusted cross-validation criterion =", x[["adj CV crit"]])

  cat("\nfull-sample criterion =", x[["full crit"]], "\n")

  invisible(x)

}

Applications to Linear Models

Here’s the regression from An Introduction to Statistical Learning worked with both cv.glm(), as in the text, and with cv():

library(ISLR)

library(car)
## Loading required package: carData
lm.fit <- lm(mpg ~ horsepower, data=Auto)

glm.fit <- glm(mpg ~ horsepower, data=Auto)

compareCoefs(lm.fit, glm.fit)
## Calls:

## 1: lm(formula = mpg ~ horsepower, data = Auto)

## 2: glm(formula = mpg ~ horsepower, data = Auto)

## 

##              Model 1  Model 2

## (Intercept)   39.936   39.936

## SE             0.717    0.717

##                              

## horsepower  -0.15784 -0.15784

## SE           0.00645  0.00645

## 
# leave-one-out (n-fold) cross validation

boot::cv.glm(Auto, glm.fit)$delta
## [1] 24.23151 24.23114
cv(glm.fit)
## Leave-One-Out Cross-Validation

## cross-validation criterion = 24.23151

## bias-adjusted cross-validation criterion = 24.23114

## full-sample criterion = 23.94366
cv(lm.fit)
## Leave-One-Out Cross-Validation

## cross-validation criterion = 24.23151

## bias-adjusted cross-validation criterion = 24.23114

## full-sample criterion = 23.94366
# 10-fold cross validation

# note: cv() and cv.glm() don't match exactly because they use

#       different algorithms to assign cases to the folds

set.seed(1234)

cv(glm.fit, k=10)
## 10-Fold Cross Validation

## cross-validation criterion = 24.3794

## bias-adjusted cross-validation criterion = 24.35646

## full-sample criterion = 23.94366
set.seed(1234)

boot::cv.glm(Auto, glm.fit, K=10)$delta
## [1] 24.43519 24.40883
# timing comparisons

library(microbenchmark)

microbenchmark(boot=boot::cv.glm(Auto, glm.fit), mine=cv(glm.fit),

               parallel=cv(glm.fit, parallel=TRUE), times=10)
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## Unit: milliseconds

##      expr       min        lq      mean    median        uq       max neval cld

##      boot  844.4091  880.7626  920.2713  886.9686  912.7904 1137.3773    10  a 

##      mine  807.6179  816.4587  857.6038  839.8327  904.4142  928.5349    10  a 

##  parallel 1958.8596 2025.5836 2222.0949 2206.8769 2382.2078 2729.5941    10   b

# parallel is slower when n is small

Parallel computation is advantageous for a larger problem:

set.seed(4321)

D <- data.frame(x=rnorm(1e4))

D$y <- 1 + 2*D$x + rnorm(1e4)

mod <- glm(y ~ x, data=D) # use glm() so that boot::cv.glm() can also be applied



# n-fold CV:

system.time(print(boot::cv.glm(D, mod)$delta)) # takes a while
## [1] 0.9971502 0.9971501
##    user  system elapsed 

##   89.75    0.02   90.28
system.time(print(cv(mod)))                    # takes a while
## Leave-One-Out Cross-Validation

## cross-validation criterion = 0.9971502

## bias-adjusted cross-validation criterion = 0.9971501

## full-sample criterion = 0.9967483
##    user  system elapsed 

##  103.14    0.01  103.44
system.time(print(cv(mod, parallel=TRUE)))     # possibly much faster (depending on no. of cores)
## Leave-One-Out Cross-Validation

## cross-validation criterion = 0.9971502

## bias-adjusted cross-validation criterion = 0.9971501

## full-sample criterion = 0.9967483
##    user  system elapsed 

##    3.86    0.49   19.06

Application to Logistic Regression

Here’s an example based on a logistic regression from the text:

m.caravan <- glm(Purchase ~ ., data=Caravan, family=binomial)   
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
set.seed(123)

system.time(print(cv(m.caravan, k=10, criterion=BayesRule)))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type

## == : prediction from a rank-deficient fit may be misleading



## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type

## == : prediction from a rank-deficient fit may be misleading
## 10-Fold Cross Validation

## cross-validation criterion = 0.06269323

## bias-adjusted cross-validation criterion = 0.06231543

## full-sample criterion = 0.05994504
##    user  system elapsed 

##    3.17    0.13    3.30
set.seed(123)

system.time(print(cv(m.caravan, k=10, criterion=BayesRule, parallel=TRUE)))
## 10-Fold Cross Validation

## cross-validation criterion = 0.06269323

## bias-adjusted cross-validation criterion = 0.06231543

## full-sample criterion = 0.05994504
##    user  system elapsed 

##    0.20    0.19    4.31
set.seed(123)

system.time(print(boot::cv.glm(Caravan, m.caravan, K=10, 

                               cost=function(y, yhat) mean(y != round(yhat)))$delta))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type

## == : prediction from a rank-deficient fit may be misleading



## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type

## == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred



## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## [1] 0.06183442 0.06143897
##    user  system elapsed 

##    7.33    0.14    7.49

The warnings reflect numerical problems in fitting the model, which has 85 predictors.
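
If the repeated warnings are a nuisance, the cross-validation run can be wrapped in suppressWarnings(); this only silences the messages and does not address the underlying numerical problems:

suppressWarnings(cv(m.caravan, k=10, criterion=BayesRule))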