diff --git a/.gitignore b/.gitignore index 807ea25..6fef2a0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .Rproj.user .Rhistory .RData +*.RData +*.Rds diff --git a/NAMESPACE b/NAMESPACE index 3bea41b..0878746 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,13 +4,19 @@ export(actor_fetcher) export(actorizer) export(bulk_writer) export(class_update) +export(cv_generator) export(dfm_gen) export(dupe_detect) export(elastic_update) export(elasticizer) +export(estimator) +export(feat_select) export(lemma_writer) export(merger) +export(metric_gen) export(modelizer) +export(modelizer_old) +export(out_parser) export(query_gen_actors) export(query_string) export(ud_update) diff --git a/R/cv_generator.R b/R/cv_generator.R new file mode 100644 index 0000000..2c8af81 --- /dev/null +++ b/R/cv_generator.R @@ -0,0 +1,64 @@ +#' Generate CV folds for nested cross-validation +#' +#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination +#' +#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data +#' @param inner_k Number of inner CV (parameter optimization) folds +#' @param dfm DFM containing the labeled documents +#' @param class_type Name of the column in docvars containing the classification +#' @param grid Parameter grid for optimization +#' @param seed integer used as seed for random number generation +#' @return A nested set of lists with row numbers +#' @export +#' @examples +#' cv_generator(outer_k, inner_k, dfm, class_type) +################################################################################################# +#################################### Generate CV folds ########################################## +################################################################################################# +cv_generator <- function(outer_k, inner_k, dfm, class_type, grid, seed) { + ### Generate inner folds for nested cv + inner_loop <- function(i, folds, dfm, inner_k, class_type, grid, seed) { + # RNG needs to be set explicitly for each fold + set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") + inner_folds <- createFolds(as.factor(docvars(dfm[-folds[[i]],], class_type)), k= inner_k) + grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i]) + return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i])) + } + + ### Generate outer folds for nested cv + generate_folds <- function(outer_k, inner_k, dfm, class_type, grid, seed){ + set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") + if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization + inner_folds <- createFolds(as.factor(docvars(dfm, class_type)), k= inner_k) + grid <- crossing(grid, inner_fold = names(inner_folds)) + return(list(grid = grid, + inner_folds = inner_folds)) + } else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k + folds <- createDataPartition(as.factor(docvars(dfm, class_type)), p=outer_k) + } else { # Do full nested CV + folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k) + } + # Generate grid of hyperparameters for model optimization, and include inner folds row numbers + grid_folds <- lapply(1:length(folds), + inner_loop, + folds = folds, + dfm = dfm, + inner_k = inner_k, + class_type = class_type, + grid = grid, + seed = seed) + + # Extract grid dataframe from results + grid <- grid_folds %>% purrr::map(1) %>% dplyr::bind_rows() + + # Extract row numbers for inner folds from results + inner_folds <- grid_folds %>% purrr::map(2) + + # Extract the names of the inner folds from results + names(inner_folds) <- grid_folds %>% purrr::map(3) %>% unlist(.) + return(list(grid = grid, + outer_folds = folds, + inner_folds = inner_folds)) + } + return(generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)) +} diff --git a/R/dfm_gen.R b/R/dfm_gen.R index f645f32..187cbfa 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -6,6 +6,7 @@ #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). #' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available) +#' @param tolower Boolean indicating whether dfm features should be lowercased #' @return A Quanteda dfm #' @export #' @examples @@ -18,7 +19,7 @@ # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack -dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1) { +dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1, tolower = T) { # Create subset with just ids, codes and text out <- out %>% select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field @@ -62,6 +63,6 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1) { } } dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>% - dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex") + dfm(tolower = tolower, stem = F, remove_punct = T, valuetype = "regex") return(dfm) } diff --git a/R/estimator.R b/R/estimator.R new file mode 100644 index 0000000..1242fa0 --- /dev/null +++ b/R/estimator.R @@ -0,0 +1,96 @@ +#' Generate models and get classifications on test sets +#' +#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination +#' +#' @param row Row number of current item in grid +#' @param grid Grid with model parameters and CV folds +#' @param outer_folds List with row numbers for outer folds +#' @param dfm DFM containing labeled documents +#' @param class_type Name of column in docvars() containing the classes +#' @param model Model to use (currently only nb) +#' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values. +#' @export +#' @examples +#' estimator(row, grid, outer_folds, dfm, class_type, model) +################################################################################################# +#################################### Generate CV folds ########################################## +################################################################################################# + +### Classification function +estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model) { + # Get parameters for current iteration + params <- grid[row,] + + # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance + if ("inner_fold" %in% colnames(params) && "outer_fold" %in% colnames(params)) { + dfm_train <- dfm[-outer_folds[[params$outer_fold]],] %>% + .[-inner_folds[[params$outer_fold]][[params$inner_fold]],] + dfm_test <- dfm[-outer_folds[[params$outer_fold]],] %>% + .[inner_folds[[params$outer_fold]][[params$inner_fold]],] + # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data + } else if ("outer_fold" %in% colnames(params)) { + dfm_train <- dfm[-outer_folds[[params$outer_fold]],] + dfm_test <- dfm[outer_folds[[params$outer_fold]],] + # If only inner folds, validate performance directly on inner folds + } else if ("inner_fold" %in% colnames(params)) { + dfm_train <- dfm[-inner_folds[[params$inner_fold]],] + dfm_test <- dfm[inner_folds[[params$inner_fold]],] + # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model + } else { + final <- T ### Indicate final modeling run on whole dataset + dfm_train <- dfm + } + ## Currently scheme_tf is not used explicitly + # if (model == 'nb') { + # scheme_tf <- 'count' # The 'old' way + # } else { + # scheme_tf <- 'prop' # The 'new' way + # } + + ### Getting features from training dataset + # Getting idf from training data, and using it to normalize both training and testing feature occurence + dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) + idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0) + dfm_train <- dfm_weight(dfm_train, weights = idf) + + # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select) + words <- unique(unlist(lapply(unique(docvars(dfm_train, class_type)), + feat_select, + dfm = dfm_train, + class_type = class_type, + percentile = params$percentiles, + measure = params$measures + ))) + dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T) + + + if (model == "nb") { + text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial") + } + if (model == "svm") { + text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon) + } + if (model == 'nnet') { + idC <- class.ind(as.factor(docvars(dfm_train, class_type))) + text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2) + } + ### Add more if statements for different models + + # If training on whole dataset, return final model, and idf values from dataset + if (exists("final")) { + return(list(text_model=text_model, idf=idf)) + } else { # Create a test set, and classify test items + dfm_test <- dfm_weight(dfm_test, weights = idf) + # Use force=T to keep only features present in both training and test set + pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T) + + return(data.frame( + tv = I(list(docvars(dfm_test, class_type))), # True values from test set + pred = I(list(pred)), # Predictions of test set + params, # Parameters used to generate classification model + text_model = I(list(text_model)), # The classification model + idf = I(list(idf)), # IDF of the training dataset used for model creation + stringsAsFactors = F + )) + } +} diff --git a/R/feat_select.R b/R/feat_select.R new file mode 100644 index 0000000..f411d9b --- /dev/null +++ b/R/feat_select.R @@ -0,0 +1,28 @@ +#' Select features using quanteda textstat_keyness +#' +#' Select features based on the textstat_keyness function and a percentile cutoff +#' Percentiles are based on absolute values i.e. both on words that are key and *not* key to the topic +#' +#' @param topic The topic to determine keywords for +#' @param dfm The input dfm +#' @param class_type Name of the column in docvars containing the classification +#' @param percentile Cutoff for the list of words that should be returned +#' @param measure Measure to use in determining keyness, default = chi2; see textstat_keyness for other options +#' @return A vector of words that are key to the topic +#' @export +#' @examples +#' feat_select(topic, dfm, class_type, percentile, measure="chi2") +################################################################################################# +#################################### Feature selection ########################################## +################################################################################################# + +feat_select <- function (topic, dfm, class_type, percentile, measure="chi2") { + # Use quanteda textstat_keyness to determine feature importance + keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>% + na.omit() + # Convert keyness values to absolute values, to take into account both positive and negative extremes + keyness[,2] <- abs(keyness[,2]) + # Keep only the words with an absolute keyness value falling in the top [percentile] percentile + keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature + return(keyness) +} diff --git a/R/metric_gen.R b/R/metric_gen.R new file mode 100644 index 0000000..579d74a --- /dev/null +++ b/R/metric_gen.R @@ -0,0 +1,47 @@ +#' Generate performance statistics for models +#' +#' Generate performance statistics for models, based on their predictions and the true values +#' +#' @param x A data frame containing at least the columns "pred" and "tv" +#' @return x, with additional columns for performance metrics +#' @export +#' @examples +#' metric_gen(x) +################################################################################################# +############################# Performance metric generation ##################################### +################################################################################################# + +metric_gen <- function(x) { + ### Fix for missing classes in multiclass classification + ### Sorting u for easier interpretation of confusion matrix + u <- as.character(sort(as.numeric(union(unlist(x$pred), unlist(x$tv))))) + # Create a crosstable with predictions and true values + class_table <- table(prediction = factor(unlist(x$pred), u), trueValues = factor(unlist(x$tv), u)) + + # When only two classes, set positive class explicitly as the class with the highest value + if (length(unique(u)) == 2) { + conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u)) + weighted_measures <- as.data.frame(conf_mat$byClass) + macro_measures <- as.data.frame(conf_mat$byClass) + } else { + # Create a confusion matrix + conf_mat <- confusionMatrix(class_table, mode = "everything") + # Set "positive" value to NA, because not applicable + conf_mat$positive <- NA + # Compute weighted performance measures + weighted_measures <- colSums(conf_mat$byClass * colSums(conf_mat$table))/sum(colSums(conf_mat$table)) + # Compute unweighted performance measures (divide by number of classes, each class equally important) + macro_measures <- colSums(conf_mat$byClass)/nrow(conf_mat$byClass) + # Replace NaN's by 0 when occurring + weighted_measures[is.nan(weighted_measures)] <- 0 + macro_measures[is.nan(macro_measures)] <- 0 + } + return(cbind(x, + as.data.frame(t(conf_mat$overall)), + 'weighted' = t(as.data.frame(weighted_measures)), + 'macro' = t(as.data.frame(macro_measures)), + pos_cat = conf_mat$positive, + conf_mat = I(list(conf_mat)) + ) + ) +} diff --git a/R/modelizer.R b/R/modelizer.R index a83e698..f3c0824 100644 --- a/R/modelizer.R +++ b/R/modelizer.R @@ -8,293 +8,122 @@ #' - percentiles (cutoff point for tf-idf feature selection) #' - measures (what measure to use for determining feature importance, see textstat_keyness for options) #' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars -#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds) -#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer) -#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid) -#' @param cores_feats Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner) -#' @param seed Integer to use as seed for random number generation, ensures replicability #' @param outer_k Number of outer cross-validation folds (for performance estimation) #' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection) -#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb) #' @param class_type Type of classification to model ("junk", "aggregate", or "codes") #' @param opt_measure Label of measure in confusion matrix to use as performance indicator #' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename) #' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search) -#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation +#' @param seed Integer to use as seed for random number generation, ensures replicability +#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb) +#' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1 +#' @return An .Rds file in the current working directory (getwd()) with a list containing all relevant output #' @export #' @examples -#' modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid) +#' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) ################################################################################################# #################################### Function to generate classification models ################# ################################################################################################# -modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) { - ### Functions ### - feat_select <- function (topic, dfm, class_type, percentile,measure) { - keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>% - na.omit() - keyness[,2] <- abs(keyness[,2]) - keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature - return(keyness) - } - ### Generate inner folds for nested cv - inner_loop <- function(fold, dfm, inner_k, class_type) { - # RNG needs to be set explicitly for each fold - set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") - ## Either createDataPartition for simple holdout parameter optimization - ## Or createFolds for proper inner CV for nested CV - # if (inner_k <= 1) { - # inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8) - # } else { - inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k= inner_k) - # } - return(c(outer_fold = list(fold),inner_folds)) - } - - ### Generate outer folds for nested cv - generate_folds <- function(outer_k, inner_k, dfm, class_type){ - set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") - folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k) - return(lapply(folds,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type)) - } - - ### Gets called for every parameter combination, and calls classifier for every inner cv fold - inner_cv <- function(row,grid,outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) { - print(str_c('params ',row)) - params <- grid[row,] - # For each inner fold, cross validate the specified parameters - res <- - bind_rows(mclapply(inner_folds, - classifier, - outer_fold = outer_fold, - params = params, - dfm = dfm, - class_type = class_type, - model = model, - cores_feats = cores_feats, - mc.cores = cores_inner - ) - ) - # print(res) - # print(res[1,1]) - # print('inner_cv') - return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))),params)) - } - - ### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid - outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) { - print('outer cv') - # If fold contains both inner folds and outer fold - if (length(fold) == inner_k + 1) { - inner_folds <- fold[-1] - outer_fold <- fold$outer_fold - # For each row in grid, cross-validate results - res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1), - inner_cv, - cores_feats= cores_feats, - grid = grid, - dfm = dfm, - class_type = class_type, - model = model, - outer_fold = outer_fold, - inner_folds = inner_folds, - cores_inner = cores_inner, - mc.cores = cores_grid) - ) - # print(res) - # print(res[1,1]) - # print('outer_cv') - # Determine optimum hyperparameters within outer fold training set - optimum <- res[which.max(res[,opt_measure]),] %>% - select(percentiles: ncol(.)) - # Validate performance of optimum hyperparameters on outer fold test set - return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats)) - } else { - # If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters - inner_folds <- fold - outer_fold <- NULL - res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1), - inner_cv, - cores_feats= cores_feats, - grid = grid, - dfm = dfm, - class_type = class_type, - model = model, - outer_fold = outer_fold, - inner_folds = inner_folds, - cores_inner = cores_inner, - mc.cores = cores_grid) - ) - # print(res) - # print(res[1,1]) - # print('line 126, final model parameter optimization') - return(res) - } - } - - # ### Custom tfidf function to allow same idf for different dfm's - # custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) { - # if (!nfeat(x) || !ndoc(x)) return(x) - # tfreq <- dfm_weight(x, scheme = scheme_tf, base = base) - # if (nfeat(x) != length(dfreq)) - # stop("missing some values in idf calculation") - # # get the document indexes - # j <- as(tfreq, "dgTMatrix")@j + 1 - # # replace just the non-zero values by product with idf - # x@x <- tfreq@x * dfreq[j] - # # record attributes - # - # ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2 - # - # # x@weightTf <- tfreq@weightTf - # # x@weightDf <- c(list(scheme = scheme_df, base = base), args) - # return(x) - # } - - ### Classification function - classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) { - # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance - if (length(inner_fold) > 0 && length(outer_fold) > 0) { - dfm_train <- dfm[-outer_fold,] %>% - .[-inner_fold,] - dfm_test <- dfm[-outer_fold,] %>% - .[inner_fold,] - # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data - } else if (length(outer_fold) > 0 ) { - dfm_train <- dfm[-outer_fold,] - dfm_test <- dfm[outer_fold,] - validation_cv <- T - # If only inner folds, validate performance directly on inner folds (is the same as above?) - } else if (length(inner_fold) > 0 ) { - dfm_train <- dfm[-inner_fold,] - dfm_test <- dfm[inner_fold,] - # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model - } else { - final <- T ### Indicate final modeling run on whole dataset - dfm_train <- dfm - } - if (model == 'nb') { - scheme_tf <- 'count' # The 'old' way - } else { - scheme_tf <- 'prop' # The 'new' way - } - ### Getting features from training dataset - # Getting idf from training data, and using it to normalize both training and testing feature occurence - dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) - idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0) - dfm_train <- dfm_weight(dfm_train, weights = idf) - # Added unique to filter out duplicate words, these are caused when there are multiple categories, and a words scores higher - # than the threshold on two or more of those categories - words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)), - feat_select, - dfm = dfm_train, - class_type = class_type, - percentile = params$percentiles, - measure = params$measures, - mc.cores = cores_feats - ))) - # dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq) - dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T) - - - if (model == "nb") { - text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial") - } - if (model == "svm") { - text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon) - } - if (model == 'nnet') { - idC <- class.ind(as.factor(docvars(dfm_train, class_type))) - text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2) - } - # if (model == 'neuralnet') { - # dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T) - # dfm_test <- custom_tfidf(dfm_test, scheme_tf = "prop", scheme_df = "inverse", base = 10, dfreq = dfreq[words]) - # - # idC <- class.ind(as.factor(docvars(dfm_train, class_type))) - # colnames(idC) <- NULL - # nn_train <- cbind(idC, dfm_train) - # n <- colnames(nn_train)[3:length(colnames(nn_train))] - # f <- as.formula(paste0("feat1 + feat2 ~ `", paste0(n, collapse = "` + `"),"`")) - # idC_out <- class.ind(as.factor(docvars(dfm_test, class_type))) - # colnames(idC_out) <- NULL - # nn_test <- cbind(idC_out, dfm_test) - # nn <- neuralnet(f,data=nn_train,hidden=3,linear.output=F,act.fct = 'logistic',lifesign = 'full', threshold = .005) - # pr.nn <- compute(nn,nn_test[,3:length(colnames(nn_test))]) - # class_table <- table(prediction = as.matrix(round(pr.nn$net.result[,2])), trueValues = as.matrix(idC_out[,2])) - # nn_conf_mat <- confusionMatrix(class_table, mode = "everything") - # } - ### Add more if statements for different models - if (exists("final")) { - return(list(text_model=text_model, idf=idf)) - } else { - ### Removing all features not in training set from test set and weighting the remaining features according to training idf - dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T) - dfm_test <- dfm_weight(dfm_test, weights = idf) - pred <- predict(text_model, newdata = dfm_test, type = 'class') - ### Fix for single-class 'predictions' in borderline situations - # if (length(unique(pred)) == 1 & class_type == 'junk') { - # if (unique(pred) == '0') { - # pred[1] <- '1' - # } else { - # pred[1] <- '0' - # } - # } - - ### Fix for missing classes in multiclass classification - u <- union(pred, docvars(dfm_test, class_type)) - - class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u)) - if (length(unique(u)) == 2) { - conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u)) - } else { - conf_mat <- confusionMatrix(class_table, mode = "everything") - conf_mat$positive <- NA - } - if (exists("validation_cv")) { - return(data.frame( - tv = docvars(dfm_test, class_type), - pred = pred, - params = params, - pos_cat = conf_mat$positive, - stringsAsFactors = F - )) - } - if (is.matrix(conf_mat$byClass) == T) { - return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params)) - } else { - return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(conf_mat$byClass)),params, pos_cat = conf_mat$positive)) - } - } - } - - ### If outer_k is 1, do a holdout training run, with only cross-validation for parameter optimization, else, do nested CV - ### If holdout, training/test distribution is the same as for inner CV - if (outer_k == 1) { - outer_fold <- createDataPartition(as.factor(docvars(dfm, class_type)), p=1-(1/inner_k)*(inner_k-1)) - folds <- lapply(outer_fold,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type) - } else { - ## Generate nested CV folds, based on number of inner and outer folds defined (see start of script) - folds <- generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type) - } - - ## Get performance of each outer fold validation, and add row with mean scores (This is the final performance indicator) - performance <- mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer) - - ## Set seed and generate folds for final hyperparameter optimization search (using CV) - set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") - folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k= inner_k)) - - ## Get the final hyperparameter performance for all value combinations in grid - params_final <- bind_rows(mclapply(folds_final, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer)) - - ## Select optimum final hyperparameters - optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>% - select(percentiles: ncol(.)) - - ## Estimate final model on whole dataset, using optimum final hyperparameters determined above - model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer))) - rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment()) - save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment()) - return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData')) +modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) { + ## Generate list containing outer folds row numbers, inner folds row numbers, and grid for model building + folds <- cv_generator(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed) + inner_grid <- folds$grid + outer_folds <- folds$outer_folds + inner_folds <- folds$inner_folds + + ## Create multithread work pool for future_lapply + plan(strategy = multiprocess, workers = cores) + + ## Use estimator function to build models for every parameter combination in grid + inner_cv_output <- future_lapply(1:nrow(inner_grid), estimator, + grid = inner_grid, + outer_folds = outer_folds, + inner_folds = inner_folds, + dfm = dfm, + class_type = class_type, + model = model) %>% + future_lapply(.,metric_gen) %>% # Generate model performance metrics for each grid row + bind_rows(.) + + + outer_grid <- inner_cv_output %>% + # Group by outer folds, and by parameters used for model tuning + group_by_at(c("outer_fold", colnames(grid))) %>% + # Get mean values for all numeric (performance indicator) variables + summarise_if(is.numeric, mean, na.rm = F) %>% + # Group the mean performance indicators by outer_fold + group_by(outer_fold) %>% + # Get for each outer_fold the row with the highest value of opt_measure + slice(which.max((!!as.name(opt_measure)))) %>% + # Select only the columns outer_fold, and the columns that are in the original parameter grid + select(outer_fold, colnames(grid)) + + # Use the estimator function to build optimum models for each outer_fold + outer_cv_output <- future_lapply(1:nrow(outer_grid), estimator, + grid = outer_grid, + outer_folds = outer_folds, + inner_folds = NULL, + dfm = dfm, + class_type = class_type, + model = model) %>% + future_lapply(., metric_gen) %>% # Generate performance metrics for each row in outer_grid + bind_rows(.) + + # Create (inner) folds for parameter optimization on the entire dataset + final_folds <- cv_generator(NULL,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed) + final_grid <- final_folds$grid + final_inner <- final_folds$inner_folds + + # Use the estimator function to estimate the performance of each row in final_grid + final_cv_output <- future_lapply(1:nrow(final_grid), estimator, + grid = final_grid, + outer_folds = NULL, + inner_folds = final_inner, + dfm = dfm, + class_type = class_type, + model = model) %>% + future_lapply(.,metric_gen) %>% # Generate performance metrics for each row in final_grid + bind_rows(.) + + + final_params <- final_cv_output %>% + # Group final parameter optimization cv results by parameters used for optimization + group_by_at(colnames(grid)) %>% + # Get mean performance metrics for each fold + summarise_if(is.numeric, mean, na.rm = F) %>% + # Ungroup to allow for slicing + ungroup() %>% + # Select row with highest value of opt_measure + slice(which.max((!!as.name(opt_measure)))) %>% + # Keep only the columns that are present in the original parameter grid + select(colnames(grid)) + # Use the estimator function to estimate the final model, using the optimum parameters provided in final_params + model_final <- estimator(1, + grid = final_params, + outer_folds = NULL, + inner_folds = NULL, + dfm = dfm, + class_type = class_type, + model = model) + # Create list with output variables + output <- list(final_cv_output = final_cv_output, + final_params = final_params, + outer_cv_output = outer_cv_output, + model_final = model_final, + grid = grid, + seed = seed, + opt_measure = opt_measure, + model = model, + country = country, + class_type = class_type) + + # Set name for output .Rds file + filename <- paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.Rds') + + # Save RDS file with output + saveRDS(output, file = filename) + + # Return ouput RDS filename + return(filename) } diff --git a/R/modelizer_old.R b/R/modelizer_old.R new file mode 100644 index 0000000..db7221d --- /dev/null +++ b/R/modelizer_old.R @@ -0,0 +1,301 @@ +#' Generate a classification model +#' +#' Generate a nested cross validated classification model based on a dfm with class labels as docvars +#' Currently only supports Naïve Bayes using quanteda's textmodel_nb +#' Hyperparemeter optimization is enabled through the grid parameter +#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command +#' For Naïve Bayes, the following parameters can be used: +#' - percentiles (cutoff point for tf-idf feature selection) +#' - measures (what measure to use for determining feature importance, see textstat_keyness for options) +#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars +#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds) +#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer) +#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid) +#' @param cores_feats Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner) +#' @param seed Integer to use as seed for random number generation, ensures replicability +#' @param outer_k Number of outer cross-validation folds (for performance estimation) +#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection) +#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb) +#' @param class_type Type of classification to model ("junk", "aggregate", or "codes") +#' @param opt_measure Label of measure in confusion matrix to use as performance indicator +#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename) +#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search) +#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation +#' @export +#' @examples +#' modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid) +################################################################################################# +#################################### Function to generate classification models ################# +################################################################################################# + +modelizer_old <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) { + ### Functions ### + feat_select <- function (topic, dfm, class_type, percentile,measure) { + keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>% + na.omit() + keyness[,2] <- abs(keyness[,2]) + keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature + return(keyness) + } + ### Generate inner folds for nested cv + inner_loop <- function(fold, dfm, inner_k, class_type) { + # RNG needs to be set explicitly for each fold + set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") + ## Either createDataPartition for simple holdout parameter optimization + ## Or createFolds for proper inner CV for nested CV + # if (inner_k <= 1) { + # inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8) + # } else { + inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k= inner_k) + # } + return(c(outer_fold = list(fold),inner_folds)) + } + + ### Generate outer folds for nested cv + generate_folds <- function(outer_k, inner_k, dfm, class_type){ + set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") + folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k) + return(lapply(folds,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type)) + } + + ### Gets called for every parameter combination, and calls classifier for every inner cv fold + inner_cv <- function(row,grid,outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) { + print(str_c('params ',row)) + params <- grid[row,] + # For each inner fold, cross validate the specified parameters + res <- + bind_rows(mclapply(inner_folds, + classifier, + outer_fold = outer_fold, + params = params, + dfm = dfm, + class_type = class_type, + model = model, + cores_feats = cores_feats, + mc.cores = cores_inner + ) + ) + # print(res) + # print(res[1,1]) + # print('inner_cv') + return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))),params)) + } + + ### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid + outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) { + print('outer cv') + # If fold contains both inner folds and outer fold + if (length(fold) == inner_k + 1) { + inner_folds <- fold[-1] + outer_fold <- fold$outer_fold + # For each row in grid, cross-validate results + res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1), + inner_cv, + cores_feats= cores_feats, + grid = grid, + dfm = dfm, + class_type = class_type, + model = model, + outer_fold = outer_fold, + inner_folds = inner_folds, + cores_inner = cores_inner, + mc.cores = cores_grid) + ) + # print(res) + # print(res[1,1]) + # print('outer_cv') + # Determine optimum hyperparameters within outer fold training set + optimum <- res[which.max(res[,opt_measure]),] %>% + select(percentiles: ncol(.)) + # Validate performance of optimum hyperparameters on outer fold test set + return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats)) + } else { + # If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters + inner_folds <- fold + outer_fold <- NULL + res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1), + inner_cv, + cores_feats= cores_feats, + grid = grid, + dfm = dfm, + class_type = class_type, + model = model, + outer_fold = outer_fold, + inner_folds = inner_folds, + cores_inner = cores_inner, + mc.cores = cores_grid) + ) + # print(res) + # print(res[1,1]) + # print('line 126, final model parameter optimization') + return(res) + } + } + + # ### Custom tfidf function to allow same idf for different dfm's + # custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) { + # if (!nfeat(x) || !ndoc(x)) return(x) + # tfreq <- dfm_weight(x, scheme = scheme_tf, base = base) + # if (nfeat(x) != length(dfreq)) + # stop("missing some values in idf calculation") + # # get the document indexes + # j <- as(tfreq, "dgTMatrix")@j + 1 + # # replace just the non-zero values by product with idf + # x@x <- tfreq@x * dfreq[j] + # # record attributes + # + # ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2 + # + # # x@weightTf <- tfreq@weightTf + # # x@weightDf <- c(list(scheme = scheme_df, base = base), args) + # return(x) + # } + + ### Classification function + classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) { + # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance + if (length(inner_fold) > 0 && length(outer_fold) > 0) { + dfm_train <- dfm[-outer_fold,] %>% + .[-inner_fold,] + dfm_test <- dfm[-outer_fold,] %>% + .[inner_fold,] + # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data + } else if (length(outer_fold) > 0 ) { + dfm_train <- dfm[-outer_fold,] + dfm_test <- dfm[outer_fold,] + validation_cv <- T + # If only inner folds, validate performance directly on inner folds (is the same as above?) + } else if (length(inner_fold) > 0 ) { + dfm_train <- dfm[-inner_fold,] + dfm_test <- dfm[inner_fold,] + # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model + } else { + final <- T ### Indicate final modeling run on whole dataset + dfm_train <- dfm + } + if (model == 'nb') { + scheme_tf <- 'count' # The 'old' way + } else { + scheme_tf <- 'prop' # The 'new' way + } + ### Getting features from training dataset + # Getting idf from training data, and using it to normalize both training and testing feature occurence + dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) + idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0) + dfm_train <- dfm_weight(dfm_train, weights = idf) + # Added unique to filter out duplicate words, these are caused when there are multiple categories, and a words scores higher + # than the threshold on two or more of those categories + words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)), + feat_select, + dfm = dfm_train, + class_type = class_type, + percentile = params$percentiles, + measure = params$measures, + mc.cores = cores_feats + ))) + # dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq) + dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T) + + + if (model == "nb") { + text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial") + } + if (model == "svm") { + text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon) + } + if (model == 'nnet') { + idC <- class.ind(as.factor(docvars(dfm_train, class_type))) + text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2) + } + # if (model == 'neuralnet') { + # dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T) + # dfm_test <- custom_tfidf(dfm_test, scheme_tf = "prop", scheme_df = "inverse", base = 10, dfreq = dfreq[words]) + # + # idC <- class.ind(as.factor(docvars(dfm_train, class_type))) + # colnames(idC) <- NULL + # nn_train <- cbind(idC, dfm_train) + # n <- colnames(nn_train)[3:length(colnames(nn_train))] + # f <- as.formula(paste0("feat1 + feat2 ~ `", paste0(n, collapse = "` + `"),"`")) + # idC_out <- class.ind(as.factor(docvars(dfm_test, class_type))) + # colnames(idC_out) <- NULL + # nn_test <- cbind(idC_out, dfm_test) + # nn <- neuralnet(f,data=nn_train,hidden=3,linear.output=F,act.fct = 'logistic',lifesign = 'full', threshold = .005) + # pr.nn <- compute(nn,nn_test[,3:length(colnames(nn_test))]) + # class_table <- table(prediction = as.matrix(round(pr.nn$net.result[,2])), trueValues = as.matrix(idC_out[,2])) + # nn_conf_mat <- confusionMatrix(class_table, mode = "everything") + # } + ### Add more if statements for different models + if (exists("final")) { + return(list(text_model=text_model, idf=idf)) + } else { + ### Removing all features not in training set from test set and weighting the remaining features according to training idf + dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T) + dfm_test <- dfm_weight(dfm_test, weights = idf) + pred <- predict(text_model, newdata = dfm_test, type = 'class') + ### Fix for single-class 'predictions' in borderline situations + # if (length(unique(pred)) == 1 & class_type == 'junk') { + # if (unique(pred) == '0') { + # pred[1] <- '1' + # } else { + # pred[1] <- '0' + # } + # } + + ### Fix for missing classes in multiclass classification + u <- union(pred, docvars(dfm_test, class_type)) + + class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u)) + if (length(unique(u)) == 2) { + conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u)) + } else { + conf_mat <- confusionMatrix(class_table, mode = "everything") + conf_mat$positive <- NA + } + if (exists("validation_cv")) { + return(data.frame( + tv = docvars(dfm_test, class_type), + pred = pred, + params = params, + pos_cat = conf_mat$positive, + stringsAsFactors = F + )) + } + if (is.matrix(conf_mat$byClass) == T) { + return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params)) + } else { + return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(conf_mat$byClass)),params, pos_cat = conf_mat$positive)) + } + } + } + + ### If outer_k is 1, do a holdout training run, with only cross-validation for parameter optimization, else, do nested CV + ### If holdout, training/test distribution is the same as for inner CV + if (outer_k < 1) { + set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") + outer_fold <- createDataPartition(as.factor(docvars(dfm, class_type)), p=outer_k) + folds <- lapply(outer_fold,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type) + } else { + ## Generate nested CV folds, based on number of inner and outer folds defined (see start of script) + folds <- generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type) + } + + ## Get performance of each outer fold validation, and add row with mean scores (This is the final performance indicator) + performance <- mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer) + + ## Set seed and generate folds for final hyperparameter optimization search (using CV) + set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") + folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k= inner_k)) + + ## Get the final hyperparameter performance for all value combinations in grid + params_final <- bind_rows(mclapply(folds_final, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer)) + + ## Select optimum final hyperparameters + optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>% + select(percentiles: ncol(.)) + + ## Estimate final model on whole dataset, using optimum final hyperparameters determined above + model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer))) + rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment()) + save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment()) + return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData')) +} diff --git a/R/out_parser.R b/R/out_parser.R index 5216e26..4d30fde 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -1,11 +1,12 @@ #' Parse raw text into a single field #' -#' Parse raw text into a single field +#' Parse raw text from the MaML database into a single field #' @param out The original output data frame #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code) #' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available) #' @return a parsed output data frame including the additional column 'merged', containing the merged text +#' @export #' @examples #' out_parser(out,field) diff --git a/man/class_update.Rd b/man/class_update.Rd index 5d7b26b..fe6c587 100644 --- a/man/class_update.Rd +++ b/man/class_update.Rd @@ -4,9 +4,9 @@ \alias{class_update} \title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information} \usage{ -class_update(out, localhost = T, model_final, dfm_words, varname, text, - words, clean, ver, - es_super = .rs.askForPassword("ElasticSearch WRITE")) +class_update(out, localhost = T, model_final, varname, text, words, + clean, ver, es_super = .rs.askForPassword("ElasticSearch WRITE"), + cores = 1) } \arguments{ \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} @@ -15,8 +15,6 @@ class_update(out, localhost = T, model_final, dfm_words, varname, text, \item{model_final}{The classification model (e.g. output from textstat_nb(), svm() or others)} -\item{dfm_words}{A dfm containing all the words and only the words used to generate the model (is used for subsetting)} - \item{varname}{String containing the variable name to use for the classification result, usually has the format computerCodes.varname} \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags} @@ -28,6 +26,8 @@ class_update(out, localhost = T, model_final, dfm_words, varname, text, \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')} \item{es_super}{Password for write access to ElasticSearch} + +\item{dfm_words}{A dfm containing all the words and only the words used to generate the model (is used for subsetting)} } \value{ As this is a nested function used within elasticizer, there is no return output diff --git a/man/cv_generator.Rd b/man/cv_generator.Rd new file mode 100644 index 0000000..aef2925 --- /dev/null +++ b/man/cv_generator.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cv_generator.R +\name{cv_generator} +\alias{cv_generator} +\title{Generate CV folds for nested cross-validation} +\usage{ +cv_generator(outer_k, inner_k, dfm, class_type, grid, seed) +} +\arguments{ +\item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data} + +\item{inner_k}{Number of inner CV (parameter optimization) folds} + +\item{dfm}{DFM containing the labeled documents} + +\item{class_type}{Name of the column in docvars containing the classification} + +\item{grid}{Parameter grid for optimization} + +\item{seed}{integer used as seed for random number generation} +} +\value{ +A nested set of lists with row numbers +} +\description{ +Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination +} +\examples{ +cv_generator(outer_k, inner_k, dfm, class_type) +} diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd index dfe2e17..28105e0 100644 --- a/man/dfm_gen.Rd +++ b/man/dfm_gen.Rd @@ -4,8 +4,8 @@ \alias{dfm_gen} \title{Generates dfm from ElasticSearch output} \usage{ -dfm_gen(out, words = "999", text = "lemmas", clean, - cores = detectCores()) +dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, + tolower = T) } \arguments{ \item{out}{The elasticizer-generated data frame} @@ -17,6 +17,8 @@ dfm_gen(out, words = "999", text = "lemmas", clean, \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).} \item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)} + +\item{tolower}{Boolean indicating whether dfm features should be lowercased} } \value{ A Quanteda dfm diff --git a/man/estimator.Rd b/man/estimator.Rd new file mode 100644 index 0000000..3dabd48 --- /dev/null +++ b/man/estimator.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimator.R +\name{estimator} +\alias{estimator} +\title{Generate models and get classifications on test sets} +\usage{ +estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model) +} +\arguments{ +\item{row}{Row number of current item in grid} + +\item{grid}{Grid with model parameters and CV folds} + +\item{outer_folds}{List with row numbers for outer folds} + +\item{dfm}{DFM containing labeled documents} + +\item{class_type}{Name of column in docvars() containing the classes} + +\item{model}{Model to use (currently only nb)} +} +\value{ +Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values. +} +\description{ +Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination +} +\examples{ +estimator(row, grid, outer_folds, dfm, class_type, model) +} diff --git a/man/feat_select.Rd b/man/feat_select.Rd new file mode 100644 index 0000000..dde878b --- /dev/null +++ b/man/feat_select.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feat_select.R +\name{feat_select} +\alias{feat_select} +\title{Select features using quanteda textstat_keyness} +\usage{ +feat_select(topic, dfm, class_type, percentile, measure = "chi2") +} +\arguments{ +\item{topic}{The topic to determine keywords for} + +\item{dfm}{The input dfm} + +\item{class_type}{Name of the column in docvars containing the classification} + +\item{percentile}{Cutoff for the list of words that should be returned} + +\item{measure}{Measure to use in determining keyness, default = chi2; see textstat_keyness for other options} +} +\value{ +A vector of words that are key to the topic +} +\description{ +Select features based on the textstat_keyness function and a percentile cutoff +Percentiles are based on absolute values i.e. both on words that are key and *not* key to the topic +} +\examples{ +feat_select(topic, dfm, class_type, percentile, measure="chi2") +} diff --git a/man/lemma_writer.Rd b/man/lemma_writer.Rd index 4cd8305..406fb96 100644 --- a/man/lemma_writer.Rd +++ b/man/lemma_writer.Rd @@ -4,7 +4,8 @@ \alias{lemma_writer} \title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings} \usage{ -lemma_writer(out, file, localhost = F, documents = F, cores = 1) +lemma_writer(out, file, localhost = F, documents = F, lemma = F, + cores = 1) } \arguments{ \item{out}{The elasticizer-generated data frame} @@ -15,6 +16,8 @@ lemma_writer(out, file, localhost = F, documents = F, cores = 1) \item{documents}{Indicate whether the writer should output to a single file, or individual documents} +\item{lemma}{Indicate whether document output should be lemmas or original document} + \item{cores}{Indicate the number of cores to use for parallel processing} } \value{ diff --git a/man/metric_gen.Rd b/man/metric_gen.Rd new file mode 100644 index 0000000..b603462 --- /dev/null +++ b/man/metric_gen.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metric_gen.R +\name{metric_gen} +\alias{metric_gen} +\title{Generate performance statistics for models} +\usage{ +metric_gen(x) +} +\arguments{ +\item{x}{A data frame containing at least the columns "pred" and "tv"} +} +\value{ +x, with additional columns for performance metrics +} +\description{ +Generate performance statistics for models, based on their predictions and the true values +} +\examples{ +metric_gen(x) +} diff --git a/man/modelizer.Rd b/man/modelizer.Rd index 5bd3697..32293aa 100644 --- a/man/modelizer.Rd +++ b/man/modelizer.Rd @@ -4,28 +4,16 @@ \alias{modelizer} \title{Generate a classification model} \usage{ -modelizer(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, - outer_k, inner_k, model, class_type, opt_measure, country, grid) +modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, + seed, model, cores = 1) } \arguments{ \item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars} -\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)} - -\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)} - -\item{cores_inner}{Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)} - -\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)} - -\item{seed}{Integer to use as seed for random number generation, ensures replicability} - \item{outer_k}{Number of outer cross-validation folds (for performance estimation)} \item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)} -\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)} - \item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")} \item{opt_measure}{Label of measure in confusion matrix to use as performance indicator} @@ -33,9 +21,15 @@ modelizer(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, \item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)} \item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)} + +\item{seed}{Integer to use as seed for random number generation, ensures replicability} + +\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)} + +\item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1} } \value{ -An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation +An .Rds file in the current working directory (getwd()) with a list containing all relevant output } \description{ Generate a nested cross validated classification model based on a dfm with class labels as docvars @@ -47,5 +41,5 @@ For Naïve Bayes, the following parameters can be used: - measures (what measure to use for determining feature importance, see textstat_keyness for options) } \examples{ -modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid) +modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) } diff --git a/man/modelizer_old.Rd b/man/modelizer_old.Rd new file mode 100644 index 0000000..6000ea7 --- /dev/null +++ b/man/modelizer_old.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/modelizer_old.R +\name{modelizer_old} +\alias{modelizer_old} +\title{Generate a classification model} +\usage{ +modelizer_old(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, + outer_k, inner_k, model, class_type, opt_measure, country, grid) +} +\arguments{ +\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars} + +\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)} + +\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)} + +\item{cores_inner}{Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)} + +\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)} + +\item{seed}{Integer to use as seed for random number generation, ensures replicability} + +\item{outer_k}{Number of outer cross-validation folds (for performance estimation)} + +\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)} + +\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)} + +\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")} + +\item{opt_measure}{Label of measure in confusion matrix to use as performance indicator} + +\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)} + +\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)} +} +\value{ +An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation +} +\description{ +Generate a nested cross validated classification model based on a dfm with class labels as docvars +Currently only supports Naïve Bayes using quanteda's textmodel_nb +Hyperparemeter optimization is enabled through the grid parameter +A grid should be generated from vectors with the labels as described for each model, using the crossing() command +For Naïve Bayes, the following parameters can be used: +- percentiles (cutoff point for tf-idf feature selection) +- measures (what measure to use for determining feature importance, see textstat_keyness for options) +} +\examples{ +modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid) +} diff --git a/man/out_parser.Rd b/man/out_parser.Rd index 666eabb..de20048 100644 --- a/man/out_parser.Rd +++ b/man/out_parser.Rd @@ -19,7 +19,7 @@ out_parser(out, field, clean = F, cores = 1) a parsed output data frame including the additional column 'merged', containing the merged text } \description{ -Parse raw text into a single field +Parse raw text from the MaML database into a single field } \examples{ out_parser(out,field)