#' Generate a classification model
#'
#' Generate a nested cross-validated classification model based on a dfm with class labels as docvars.
#' Currently only supports Naïve Bayes using quanteda's textmodel_nb.
#' Hyperparameter optimization is enabled through the grid parameter.
#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command.
#' For Naïve Bayes, the following parameters can be used:
#' - percentiles (cutoff point for tf-idf feature selection)
#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
#' @param outer_k Number of outer cross-validation folds (for performance estimation)
#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param we_vectors Matrix with word embedding vectors
#' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1
#' @return A list containing all relevant output
#' @export
#' @examples
#' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1)
#################################################################################################
#################################################################################################
#################################### Function to generate classification models #################
#################################################################################################
modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country,
                      grid, seed, model, we_vectors, cores = 1) {
  ## Generate list containing outer fold row numbers, inner fold row numbers,
  ## and the expanded grid for model building
  folds <- cv_generator(outer_k, inner_k = inner_k,
                        vec = docvars(dfm, class_type),
                        grid = grid, seed = seed)
  inner_grid <- folds$grid
  outer_folds <- folds$outer_folds
  inner_folds <- folds$inner_folds

  ## Create multithreaded work pool for future_lapply.
  ## NOTE: 'multiprocess' was deprecated and later removed from the {future}
  ## package; 'multisession' is the supported cross-platform replacement.
  ## plan() returns the previous plan, which we restore on exit so the
  ## function does not leak a global side effect.
  old_plan <- plan(strategy = multisession, workers = cores)
  on.exit(plan(old_plan), add = TRUE)

  ## Use the estimator function to build models for every parameter
  ## combination in the inner grid (inner CV loop)
  inner_cv_output <- future_lapply(seq_len(nrow(inner_grid)), estimator,
                                   grid = inner_grid,
                                   outer_folds = outer_folds,
                                   inner_folds = inner_folds,
                                   dfm = dfm,
                                   class_type = class_type,
                                   we_vectors = we_vectors,
                                   model = model) %>%
    # Generate model performance metrics for each grid row
    future_lapply(., metric_gen) %>%
    bind_rows(.)

  outer_grid <- inner_cv_output %>%
    # Group by outer folds, and by parameters used for model tuning
    group_by_at(c("outer_fold", colnames(grid))) %>%
    # Get mean values for all numeric (performance indicator) variables
    summarise_if(is.numeric, mean, na.rm = FALSE) %>%
    # Group the mean performance indicators by outer_fold
    group_by(outer_fold) %>%
    # Get for each outer_fold the row with the highest value of opt_measure
    slice(which.max((!!as.name(opt_measure)))) %>%
    # Keep only outer_fold plus the columns from the original parameter grid;
    # all_of() makes the external character vector selection explicit
    select(outer_fold, all_of(colnames(grid)))

  ## Use the estimator function to build optimum models for each outer_fold
  ## (outer CV loop, for unbiased performance estimation)
  outer_cv_output <- future_lapply(seq_len(nrow(outer_grid)), estimator,
                                   grid = outer_grid,
                                   outer_folds = outer_folds,
                                   inner_folds = NULL,
                                   dfm = dfm,
                                   class_type = class_type,
                                   we_vectors = we_vectors,
                                   model = model) %>%
    # Generate performance metrics for each row in outer_grid
    future_lapply(., metric_gen) %>%
    bind_rows(.)

  ## Create (inner) folds for parameter optimization on the entire dataset
  final_folds <- cv_generator(NULL, inner_k = inner_k,
                              vec = docvars(dfm, class_type),
                              grid = grid, seed = seed)
  final_grid <- final_folds$grid
  final_inner <- final_folds$inner_folds

  ## Use the estimator function to estimate the performance of each row
  ## in final_grid
  final_cv_output <- future_lapply(seq_len(nrow(final_grid)), estimator,
                                   grid = final_grid,
                                   outer_folds = NULL,
                                   inner_folds = final_inner,
                                   dfm = dfm,
                                   class_type = class_type,
                                   we_vectors = we_vectors,
                                   model = model) %>%
    # Generate performance metrics for each row in final_grid
    future_lapply(., metric_gen) %>%
    bind_rows(.)

  final_params <- final_cv_output %>%
    # Group final parameter optimization cv results by parameters used
    # for optimization
    group_by_at(colnames(grid)) %>%
    # Get mean performance metrics for each fold
    summarise_if(is.numeric, mean, na.rm = FALSE) %>%
    # Ungroup to allow for slicing
    ungroup() %>%
    # Select row with highest value of opt_measure
    slice(which.max((!!as.name(opt_measure)))) %>%
    # Keep only the columns that are present in the original parameter grid
    select(all_of(colnames(grid)))

  ## Estimate the final model using the optimum parameters in final_params.
  ## FIX: we_vectors is now passed here as well, consistent with every other
  ## estimator call above (it was previously omitted, so the final model was
  ## estimated without the word embedding vectors used during CV).
  model_final <- estimator(1, grid = final_params,
                           outer_folds = NULL, inner_folds = NULL,
                           dfm = dfm, class_type = class_type,
                           we_vectors = we_vectors, model = model)

  ## Create list with output variables
  output <- list(final_cv_output = final_cv_output,
                 final_params = final_params,
                 outer_cv_output = outer_cv_output,
                 model_final = model_final,
                 grid = grid,
                 seed = seed,
                 opt_measure = opt_measure,
                 model = model,
                 country = country,
                 class_type = class_type)

  ## Return output
  return(output)
}