@ -4,8 +4,7 @@
#'
#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data
#' @param inner_k Number of inner CV (parameter optimization) folds
#' @param dfm DFM containing the labeled documents
#' @param class_type Name of the column in docvars containing the classification
#' @param vec Vector containing the true values of the classification
#' @param grid Parameter grid for optimization
#' @param seed integer used as seed for random number generation
#' @return A nested set of lists with row numbers
@ -15,36 +14,35 @@
#################################################################################################
#################################### Generate CV folds ##########################################
#################################################################################################
cv_generator <- function ( outer_k , inner_k , dfm, class_type , grid , seed ) {
cv_generator <- function ( outer_k , inner_k , vec , grid , seed ) {
### Generate inner folds for nested cv
inner_loop <- function ( i , folds , dfm, inner_k , class_type , grid , seed ) {
inner_loop <- function ( i , folds , vec, inner_k , grid , seed ) {
# RNG needs to be set explicitly for each fold
set.seed ( seed , kind = " Mersenne-Twister" , normal.kind = " Inversion" )
inner_folds <- createFolds ( as.factor ( docvars ( dfm [ - folds [ [i ] ] , ] , class_type ) ) , k = inner_k )
inner_folds <- createFolds ( as.factor ( vec [ - folds [ [i ] ] ] ) , k = inner_k )
grid <- crossing ( grid , inner_fold = names ( inner_folds ) , outer_fold = names ( folds ) [i ] )
return ( list ( grid = grid , inner_folds = inner_folds , outer_fold = names ( folds ) [i ] ) )
}
### Generate outer folds for nested cv
generate_folds <- function ( outer_k , inner_k , dfm, class_type , grid , seed ) {
generate_folds <- function ( outer_k , inner_k , vec , grid , seed ) {
set.seed ( seed , kind = " Mersenne-Twister" , normal.kind = " Inversion" )
if ( is.null ( outer_k ) ) { # If no outer_k, use all data to generate inner_k folds for parameter optimization
inner_folds <- createFolds ( as.factor ( docvars ( dfm , class_type ) ) , k = inner_k )
inner_folds <- createFolds ( as.factor ( vec ) , k = inner_k )
grid <- crossing ( grid , inner_fold = names ( inner_folds ) )
return ( list ( grid = grid ,
inner_folds = inner_folds ) )
} else if ( outer_k < 1 ) { # Create holdout validation for model performance estimation, with test set equal to outer_k
folds <- createDataPartition ( as.factor ( docvars ( dfm , class_type ) ) , p = outer_k )
folds <- createDataPartition ( as.factor ( vec ) , p = outer_k )
} else { # Do full nested CV
folds <- createFolds ( as.factor ( docvars ( dfm , class_type ) ) , k = outer_k )
folds <- createFolds ( as.factor ( vec ) , k = outer_k )
}
# Generate grid of hyperparameters for model optimization, and include inner folds row numbers
grid_folds <- lapply ( 1 : length ( folds ) ,
inner_loop ,
folds = folds ,
dfm = dfm ,
vec = vec ,
inner_k = inner_k ,
class_type = class_type ,
grid = grid ,
seed = seed )
@ -60,5 +58,5 @@ cv_generator <- function(outer_k, inner_k, dfm, class_type, grid, seed) {
outer_folds = folds ,
inner_folds = inner_folds ) )
}
return ( generate_folds ( outer_k , inner_k = inner_k , dfm = dfm , class_type = class_type , grid = grid , seed = seed ) )
return ( generate_folds ( outer_k , inner_k = inner_k , vec = vec , grid = grid , seed = seed ) )
}