R Code

The following generic function cv() has a default method that should work with most statistical models in R and is therefore more general than the cv.glm() function in the boot package. The helper function getModelData() isn't completely bullet-proof, but it should in most cases be able to retrieve the data to which a model was fit, making it unnecessary to specify the data argument to cv(). The mse() and BayesRule() functions provide cross-validation criteria for numeric and binary responses, respectively. cv.default() also provides for parallel computation on computers with multiple processors or cores; the examples below were run on a computer with two physical cores, pretty much the worst case for parallel computation. Finally, cv.lm() exploits the base-R lm.influence() function for much more efficient computation in the leave-one-out case, where naively refitting the model n times is wasteful.

getModelData <- function(model) {
    # returns a data frame with the data to which the model was fit
    # model: a statistical model object that responds to model.frame() and formula() 
    data1 <- data <- model.frame(model)
    vars <- all.vars(formula(model))
    if ("pi" %in% vars) {
        vars <- setdiff(vars, "pi")
        message("the symbol 'pi' is treated as a numeric constant in the model formula")
    }
    cols <- colnames(data)
    check <- vars %in% cols
    if (!(all(check))) {
        # recover variables that appear in the formula only within function
        # calls (e.g., the x in log(x)) and hence are absent from the model frame
        missing.cols <- !check
        data1 <- expand.model.frame(model, vars[missing.cols])
    }
    missing.cols <- !cols %in% colnames(data1)
    if (any(missing.cols)) {
        data1 <- cbind(data1, data[missing.cols])
    }
    # retain only syntactically valid column names, plus specials such as "(weights)"
    cols <- colnames(data1)
    valid <- make.names(cols) == cols | grepl("^\\(.*\\)$", cols)
    data1[valid]
}
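
As a quick illustrative check (using the mtcars data set supplied with R; the model m.demo is made up for this example), getModelData() should recover an untransformed predictor even when it appears in the model formula only inside a function call:

m.demo <- lm(mpg ~ log(hp) + wt, data=mtcars)
head(getModelData(m.demo)) # columns mpg, wt, and hp (recovered from log(hp))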

mse <- function(y, yhat){
    # mean-squared error of predictions yhat for a numeric response y
    mean((y - yhat)^2)
}

BayesRule <- function(y, yhat){
    # y: a factor with two levels; yhat: predicted probabilities of the second level
    y <- as.numeric(y) - 1 # convert the factor levels to 0/1
    yhat <- round(yhat)    # classify by the 0.5 probability threshold
    mean(y != yhat) # proportion in error
}
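
For example, applying BayesRule() to a toy factor response and made-up predicted probabilities:

y.demo <- factor(c("No", "No", "Yes"))
BayesRule(y.demo, c(0.2, 0.7, 0.9)) # 1/3: only the second case is misclassified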

# a generally more efficient update() method for "glm" objects that uses
# the coefficients from the overall fit as start values; the updated call
# is evaluated in the caller's environment so that objects referenced in
# the arguments (e.g., a local data set) are found
update.glm <- function(object, ...){
    call <- update.default(object, ..., evaluate=FALSE)
    call$start <- coef(object) # warm start from the full-data coefficients
    eval(call, parent.frame())
}
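
As an illustrative check of the warm start (again with the mtcars data; the small model is invented for the example), refitting a logistic regression via update() should take no more IRLS iterations than the original fit:

m.warm <- glm(am ~ wt, data=mtcars, family=binomial)
m.refit <- update(m.warm) # dispatches to update.glm(), starting at coef(m.warm)
c(m.warm$iter, m.refit$iter) # the warm-started refit should iterate no more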

cv <- function(model, data, criterion, k, seed, ...){
    # Cross Validation
    # Args:
    #   model: model object
    #   data: data frame to which the model was fit
    #   criterion: cross-validation criterion function
    #   k: perform k-fold cross-validation
    #   seed: for R's random-number generator (optional)

    UseMethod("cv")
} 

cv.default <- function(model, data=getModelData(model), criterion=mse, k=nrow(data), seed,
                       predict.type="response", parallel=FALSE, 
                       ncores=parallel::detectCores(logical=FALSE), 
                       remove.missing=TRUE, ...){
    # Args:
    #   model: a model object that responds to model.frame(), update(), and predict() 
    #   data: data frame to which the model was fit (not usually necessary)
    #   criterion: cross-validation criterion function of form f(y.obs, y.fitted) 
    #              (default is mse)
    #   k: perform k-fold cross-validation (default is n-fold)
    #   predict.type: to be passed to type argument of predict() (default is "response")
    #   parallel: do computations in parallel? (default is FALSE)
    #   ncores: number of cores to use for parallel computations 
    #           (default is number of physical cores detected)
    #   remove.missing: use only complete cases in data (default is TRUE)
    #   ...: to match generic
    # Returns: a "cv" object with the cv criterion averaged across the folds,
    #          the bias-adjusted averaged cv criterion,
    #          the criterion applied to the model fit to the full data set,
    #          and the initial state of R's random number generator
    f <- function(i){
        # helper function to compute the cv criterion for the ith fold
        indices.i <- indices[starts[i]:ends[i]] # cases in the ith fold
        model.i <- update(model, data=data[ - indices.i, ]) # refit without the fold
        fit.i <- predict(model.i, newdata=data[indices.i, ], type=predict.type)
        fit.o.i <- predict(model.i, newdata=data, type=predict.type)
        # criterion for the omitted fold and, for the bias adjustment,
        # for the fold model applied to the full data set
        c(criterion(y[indices.i], fit.i), criterion(y, fit.o.i))
    }
    if (missing(seed)) seed <- sample(1e6, 1)
    set.seed(seed)
    if (inherits(na.action(model), "exclude")){
        model <- update(model, na.action="na.omit")
    }
    y <- model.response(model.frame(model))
    if (remove.missing && any(incomplete <- !complete.cases(data))){
        data <- na.omit(data)
        warning(sum(incomplete), " cases removed due to missingness")
    }
    n <- nrow(data)  
    if (!is.numeric(k) || length(k) > 1 || k > n || k < 2 || k != round(k)){
        stop("k must be an integer between 2 and n")
    }
    nk <-  n %/% k # number of cases in each fold
    rem <- n %% k  # remainder
    folds <- rep(nk, k) + c(rep(1, rem), rep(0, k - rem)) # allocate remainder
    ends <- cumsum(folds) # end of each fold
    starts <- c(1, ends + 1)[-(k + 1)] # start of each fold
    indices <- sample(n, n) # permute cases
    if (parallel && ncores > 1){
        if (!require("doParallel")) stop("doParallel package is missing")
        cl <- makeCluster(ncores) 
        registerDoParallel(cl)
        result <- foreach(i = 1:k, .combine=rbind) %dopar% {
            f(i)
        }
        stopCluster(cl)
    } else {
        result <- matrix(0, k, 2)
        for (i in 1:k){
            result[i, ] <- f(i)
        }
    }
    cv <- weighted.mean(result[, 1], folds) # cv criterion averaged across the folds
    cv.full <- criterion(y, predict(model, newdata=data, type=predict.type))
    # bias adjustment: add the difference between the full-sample criterion and
    # the average criterion for the fold models applied to the full data set
    adj.cv <- cv + cv.full - weighted.mean(result[, 2], folds)
    result <- list("CV crit" = cv, "adj CV crit" = adj.cv, "full crit" = cv.full, 
                   "k" = if (k == n) 0 else k, initial.random.seed=seed,
                   "criterion" = deparse(substitute(criterion)))
    class(result) <- "cv"
    result
}
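
To make the fold bookkeeping in cv.default() concrete, here is the allocation it computes for, say, n = 10 cases and k = 3 folds:

n <- 10; k <- 3
nk <- n %/% k # 3 cases in each fold
rem <- n %% k # 1 case left over
(folds <- rep(nk, k) + c(rep(1, rem), rep(0, k - rem))) # 4 3 3
(ends <- cumsum(folds))                                 # 4 7 10
(starts <- c(1, ends + 1)[-(k + 1)])                    # 1 5 8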

# This lm method is about 2 orders of magnitude faster for the leave-one-out case

cv.lm <- function(model, data=getModelData(model), criterion=mse, k=nrow(data), seed,
                  remove.missing=TRUE, ...){
    # parallel used only if k < n
    if (inherits(na.action(model), "exclude")){
        model <- update(model, na.action="na.omit")
    }
    if (remove.missing && any(incomplete <- !complete.cases(data))){
        data <- na.omit(data)
        warning(sum(incomplete), " cases removed due to missingness")
    }
    n <- nrow(data)  
    if (k < n) NextMethod()
    else{
        b <- coef(model)
        p <- length(b)
        X <- model.matrix(model)
        y <- model.response(model.frame(model))
        yhat <- numeric(n)
        crit.o <- numeric(n)
        # leave-one-out coefficient vectors b(i), computed without refitting:
        # lm.influence()$coefficients holds the change in b when case i is deleted
        bi <- matrix(b, n, p, byrow=TRUE) - lm.influence(model)$coefficients
        for (i in 1:n){
            yhat[i] <- X[i, ] %*% bi[i, ] # deleted prediction for case i
            crit.o[i] <- criterion(y, X %*% bi[i, ]) # full-sample criterion at b(i)
        }
        cv <- criterion(y, yhat)
        cv.full <- criterion(y, na.omit(fitted(model)))
        adj.cv <- cv + cv.full - mean(crit.o)
        result <- list("CV crit" = cv, "adj CV crit" = adj.cv, "full crit" = cv.full, 
                       "k" = if (k == n) 0 else k, initial.random.seed=NA,
                       "criterion" = deparse(substitute(criterion)))
        class(result) <- "cv"
        result
    }
}
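
As an illustrative sanity check, the lm method should agree with the default method in the leave-one-out case, apart from floating-point error (the small model here is invented for the example):

m.loo <- lm(mpg ~ wt, data=mtcars)
all.equal(cv(m.loo)[["CV crit"]], cv.default(m.loo)[["CV crit"]]) # should be TRUE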

cv.glm <- function(model, data=getModelData(model), criterion=mse, k=nrow(data), seed, ...){
  # to prevent the lm method from being inherited
  if (missing(seed)) seed <- sample(1e6, 1)
  cv.default(model, data, criterion, k, seed, ...)
}

print.cv <- function(x, ...){
    k <- x[["k"]]
    if (k == 0){
        cat("Leave-One-Out Cross-Validation")
    } else {
        cat(k, "-Fold Cross Validation", sep="")
    }
    cat("\ncross-validation", x$criterion, " =", x[["CV crit"]])
    cat("\nbias-adjusted cross-validation", x$criterion, "=", x[["adj CV crit"]])
    cat("\nfull-sample", x$criterion, "=", x[["full crit"]], "\n")
    invisible(x)
}

Applications to Linear Models

Here’s the regression from An Introduction to Statistical Learning, cross-validated both with cv.glm() from the boot package, as in the text, and with our cv():

library(ISLR)
library(car)
## Loading required package: carData
lm.fit <- lm(mpg ~ horsepower, data=Auto)
glm.fit <- glm(mpg ~ horsepower, data=Auto)
compareCoefs(lm.fit, glm.fit)
## Calls:
## 1: lm(formula = mpg ~ horsepower, data = Auto)
## 2: glm(formula = mpg ~ horsepower, data = Auto)
## 
##              Model 1  Model 2
## (Intercept)   39.936   39.936
## SE             0.717    0.717
##                              
## horsepower  -0.15784 -0.15784
## SE           0.00645  0.00645
## 
# leave-one-out (n-fold) cross validation
boot::cv.glm(Auto, glm.fit)$delta
## [1] 24.23151 24.23114
cv(glm.fit)
## Leave-One-Out Cross-Validation
## cross-validation criterion  = 24.23151
## bias-adjusted cross-validation criterion = 24.23114
## full-sample criterion = 23.94366
cv(lm.fit)
## Leave-One-Out Cross-Validation
## cross-validation mse  = 24.23151
## bias-adjusted cross-validation mse = 24.23114
## full-sample mse = 23.94366
# 10-fold cross validation
# note: the results from cv() and boot::cv.glm() don't match exactly
#       because the two functions divide the cases into folds differently
cv(glm.fit, k=10, seed=1234)
## 10-Fold Cross Validation
## cross-validation criterion  = 24.3794
## bias-adjusted cross-validation criterion = 24.35646
## full-sample criterion = 23.94366
set.seed(1234)
boot::cv.glm(Auto, glm.fit, K=10)$delta
## [1] 24.43519 24.40883
# timing comparisons
library(microbenchmark)
microbenchmark(boot=boot::cv.glm(Auto, glm.fit), 
               default.method=cv(glm.fit),
               parallel=cv(glm.fit, parallel=TRUE),
               lm.method=cv(lm.fit),
               times=10)
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
## Unit: milliseconds
##            expr       min        lq       mean     median        uq       max neval  cld
##            boot  860.8506 1715.2260 1722.53867 1757.10305 2099.6193 2270.1166    10   c 
##  default.method  807.0266  831.4812 1039.97738  968.33425 1274.5198 1615.9819    10  b  
##        parallel 1355.9717 1944.0094 2185.50720 2252.57060 2333.9232 2774.5792    10    d
##       lm.method    6.4001    7.9085   21.47046   12.89635   14.7217  112.7871    10 a

Parallel computation can be advantageous for a larger problem (but for a linear model, the "lm" method is most efficient):

set.seed(4321)
D <- data.frame(x=rnorm(1e4))
D$y <- 1 + 2*D$x + rnorm(1e4)
mod.glm <- glm(y ~ x, data=D) # use glm() for boot.cv.glm()
mod.lm <- lm(y ~ x, data=D)

# n-fold CV: 
system.time(print(boot::cv.glm(D, mod.glm)$delta)) # takes a while
## [1] 0.9971502 0.9971501
##    user  system elapsed 
##  134.70    2.31  163.55
system.time(print(cv(mod.glm)))                    # takes a while
## Leave-One-Out Cross-Validation
## cross-validation criterion  = 0.9971502
## bias-adjusted cross-validation criterion = 0.9971501
## full-sample criterion = 0.9967483
##    user  system elapsed 
##  103.98    0.10  106.00
system.time(print(cv(mod.glm, parallel=TRUE)))     # possibly much faster (depending on no. of cores)
## Leave-One-Out Cross-Validation
## cross-validation criterion  = 0.9971502
## bias-adjusted cross-validation criterion = 0.9971501
## full-sample criterion = 0.9967483
##    user  system elapsed 
##    3.44    0.29   47.76
system.time(print(cv(mod.lm)))                     # lm method fastest
## Leave-One-Out Cross-Validation
## cross-validation mse  = 0.9971502
## bias-adjusted cross-validation mse = 0.9971501
## full-sample mse = 0.9967483
##    user  system elapsed 
##    1.32    0.08    1.44

Application to Logistic Regression

Here’s an example based on a logistic regression from the text:

m.caravan <- glm(Purchase ~ ., data=Caravan, family=binomial)   
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
system.time(print(cv(m.caravan, k=10, seed=123, criterion=BayesRule)))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## 10-Fold Cross Validation
## cross-validation criterion  = 0.06269323
## bias-adjusted cross-validation criterion = 0.06231543
## full-sample criterion = 0.05994504
##    user  system elapsed 
##    3.45    0.02    3.52
system.time(print(cv(m.caravan, k=10, seed=123, criterion=BayesRule, parallel=TRUE)))
## 10-Fold Cross Validation
## cross-validation criterion  = 0.06269323
## bias-adjusted cross-validation criterion = 0.06231543
## full-sample criterion = 0.05994504
##    user  system elapsed 
##    0.09    0.04    5.33
set.seed(123)
system.time(print(boot::cv.glm(Caravan, m.caravan, K=10, 
                               cost=function(y, yhat) mean(y != round(yhat)))$delta))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## [1] 0.06183442 0.06143897
##    user  system elapsed 
##    7.83    0.21    8.08

The warnings, issued repeatedly as the model is refit in each fold, reflect numerical problems in fitting the model, which has 85 predictors: fitted probabilities that are numerically 0 or 1 are a symptom of (quasi-)separation, and the model matrix for some of the fold fits is rank-deficient.
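
If the warnings are anticipated, the output can be decluttered with base R's suppressWarnings(), shown here purely as an illustration (doing so also hides warnings that may signal real problems):

suppressWarnings(cv(m.caravan, k=10, seed=123, criterion=BayesRule))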