#' Generate CV folds for nested cross-validation
#'
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
#'
#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data
#' @param inner_k Number of inner CV (parameter optimization) folds
#' @param vec Vector containing the true values of the classification
#' @param grid Parameter grid for optimization
#' @param seed integer used as seed for random number generation
#' @return A nested set of lists with row numbers
#' @export
#' @examples
#' cv_generator(outer_k, inner_k, dfm, class_type)
#################################################################################################
#################################### Generate CV folds ##########################################
#################################################################################################
cv_generator <- function ( outer_k , inner_k , vec , grid , seed ) {
### Generate inner folds for nested cv
inner_loop <- function ( i , folds , vec , inner_k , grid , seed ) {
# RNG needs to be set explicitly for each fold
set.seed ( seed , kind = " Mersenne-Twister" , normal.kind = " Inversion" )
inner_folds <- createFolds ( as.factor ( vec [ - folds [ [i ] ] ] ) , k = inner_k )
grid <- crossing ( grid , inner_fold = names ( inner_folds ) , outer_fold = names ( folds ) [i ] )
return ( list ( grid = grid , inner_folds = inner_folds , outer_fold = names ( folds ) [i ] ) )
}
### Generate outer folds for nested cv
generate_folds <- function ( outer_k , inner_k , vec , grid , seed ) {
set.seed ( seed , kind = " Mersenne-Twister" , normal.kind = " Inversion" )
if ( is.null ( outer_k ) ) { # If no outer_k, use all data to generate inner_k folds for parameter optimization
inner_folds <- createFolds ( as.factor ( vec ) , k = inner_k )
grid <- crossing ( grid , inner_fold = names ( inner_folds ) )
return ( list ( grid = grid ,
inner_folds = inner_folds ) )
} else if ( outer_k < 1 ) { # Create holdout validation for model performance estimation, with test set equal to outer_k
folds <- createDataPartition ( as.factor ( vec ) , p = outer_k )
} else { # Do full nested CV
folds <- createFolds ( as.factor ( vec ) , k = outer_k )
}
# Generate grid of hyperparameters for model optimization, and include inner folds row numbers
grid_folds <- lapply ( 1 : length ( folds ) ,
inner_loop ,
folds = folds ,
vec = vec ,
inner_k = inner_k ,
grid = grid ,
seed = seed )
# Extract grid dataframe from results
grid <- grid_folds %>% purrr :: map ( 1 ) %>% dplyr :: bind_rows ( )
# Extract row numbers for inner folds from results
inner_folds <- grid_folds %>% purrr :: map ( 2 )
# Extract the names of the inner folds from results
names ( inner_folds ) <- grid_folds %>% purrr :: map ( 3 ) %>% unlist ( .)
return ( list ( grid = grid ,
outer_folds = folds ,
inner_folds = inner_folds ) )
}
return ( generate_folds ( outer_k , inner_k = inner_k , vec = vec , grid = grid , seed = seed ) )
}