diff --git a/DESCRIPTION b/DESCRIPTION
index 0f036aa..3dfe698 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -9,7 +9,9 @@ Depends: R (>= 3.3.1),
     parallel,
     tidyverse,
     quanteda,
-    httr
+    httr,
+    caret,
+    e1071
 License: Copyright Erik de Vries
 Encoding: UTF-8
 LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
index 72b479d..11e193e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,8 +1,10 @@
 # Generated by roxygen2: do not edit by hand
 
 export(bulk_writer)
+export(class_update)
 export(dfm_gen)
 export(dupe_detect)
 export(elastic_update)
 export(elasticizer)
 export(merger)
+export(modelizer)
diff --git a/R/class_update.R b/R/class_update.R
new file mode 100644
index 0000000..845526c
--- /dev/null
+++ b/R/class_update.R
@@ -0,0 +1,32 @@
+#' Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets); see the elasticizer documentation for more information
+#'
+#' Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets); see the elasticizer documentation for more information
+#' @param out Does not need to be defined explicitly (it is already parsed in the elasticizer function)
+#' @param model_final The classification model (e.g. output from textmodel_nb(), svm() or others)
+#' @param dfm_words A dfm containing all the words and only the words used to generate the model (used for subsetting)
+#' @param varname String containing the variable name to use for the classification result, usually in the format computerCodes.varname
+#' @param es_super Password for write access to Elasticsearch
+#' @return As this is a nested function used within elasticizer, there is no return output
+#' @export
+#' @examples
+#' elasticizer(query, src = T, es_pwd = es_pwd, update = class_update, model_final = model_final, dfm_words = dfm_words, varname = "computerCodes.varname", es_super = es_super)
+#################################################################################################
+#################################### Update any kind of classification ##########################
+#################################################################################################
+class_update <- function(out, model_final, dfm_words, varname, es_super) {
+  print('updating')
+  # Generate a dfm from the batch, keeping only the features the model was trained on
+  dfm <- dfm_gen(out, text = 'lemmas') %>%
+    dfm_keep(dfm_words, valuetype = "fixed", verbose = T)
+  pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm))
+  # Generate bulk update statements and send them to Elasticsearch
+  bulk <- apply(pred, 1, bulk_writer, varname = varname)
+  res <- elastic_update(bulk, es_super = es_super)
+  stop_for_status(res)
+  appData <- content(res, "parsed", "application/json")
+  if (appData$errors == T){
+    print(appData)
+    stop("Aborting, errors found during updating")
+  }
+  print("updated")
+  Sys.sleep(1)
+}
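The snippet below is an editorial usage sketch, not part of the diff: it wires a model produced by modelizer() (added further down) into class_update() via elasticizer(). `query`, `es_pwd`, `es_super` and `model_file` are placeholders, and `dfm_model` is assumed to be the feature-selected dfm the model was trained on, which the caller needs to keep or rebuild, since the package does not return it.

model_file <- "..."  # placeholder: path returned by modelizer()
load(model_file)     # restores model_final, among other objects
elasticizer(query, src = T, es_pwd = es_pwd, update = class_update,
            model_final = model_final, dfm_words = dfm_model,
            varname = "computerCodes.junk", es_super = es_super)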
diff --git a/R/elasticizer.R b/R/elasticizer.R
index 68fcda0..dd4e429 100644
--- a/R/elasticizer.R
+++ b/R/elasticizer.R
@@ -5,6 +5,7 @@
 #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
 #' @param index The name of the Elasticsearch index to search through
 #' @param update When set, indicates an update function to use on each batch of 1000 articles
+#' @param ... Parameters passed on to the update function
 #' @return A data frame containing all the search results
 #' @export
diff --git a/R/modelizer.R b/R/modelizer.R
new file mode 100644
index 0000000..10f072b
--- /dev/null
+++ b/R/modelizer.R
@@ -0,0 +1,224 @@
+#' Generate a classification model
+#'
+#' Generate a nested cross-validated classification model based on a dfm with class labels as docvars.
+#' Currently supports Naïve Bayes ("nb", using quanteda's textmodel_nb) and linear SVM ("svm", using e1071's svm).
+#' Hyperparameter optimization is enabled through the grid parameter.
+#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command.
+#' For Naïve Bayes, the following parameters can be used:
+#' - percentiles (cutoff point for tf-idf feature selection)
+#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
+#' For SVM, the grid should additionally provide gamma, cost and epsilon values.
+#' @param dfm A quanteda dfm used to train and evaluate the model; should contain the vector with class labels in docvars
+#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds)
+#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows, i.e. possible parameter combinations; multiplies with cores_outer)
+#' @param cores_inner Number of cores to use for the inner CV loop (cannot be more than the number of inner CV folds; multiplies with cores_outer and cores_grid)
+#' @param cores_feats Number of cores to use for feature selection (multiplies with cores_outer, cores_grid and cores_inner)
+#' @param seed Integer to use as seed for random number generation, ensures replicability
+#' @param outer_k Number of outer cross-validation folds (for performance estimation)
+#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
+#' @param model Classification algorithm to use ("nb" for Naïve Bayes using textmodel_nb, or "svm" for a linear SVM using e1071's svm)
+#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
+#' @param opt_measure Label of the measure in the confusion matrix to use as performance indicator
+#' @param country Two-letter country abbreviation of the country the model is estimated for (used for the filename)
+#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
+#' @return The path to an .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
+#' @export
+#' @examples
+#' modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
+#################################################################################################
+#################################### Function to generate classification models #################
+#################################################################################################
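+## Illustrative only (an editorial sketch, not original package code): a grid
+## for the "nb" model could be built from labeled vectors with tidyr::crossing(),
+## one row per hyperparameter combination, e.g.:
+##   grid <- crossing(percentiles = c(0.90, 0.95, 0.99),
+##                    measures = c("chi2", "lr"))
+## For "svm", gamma, cost and epsilon vectors would be crossed in as well.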
+
+modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
+  ### Functions ###
+  feat_select <- function (topic, dfm, class_type, percentile, measure) {
+    keyness <- textstat_keyness(dfm, measure = measure, docvars(dfm, class_type) == as.numeric(topic)) %>%
+      na.omit()
+    keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]), percentile))$feature
+    return(keyness)
+  }
+
+  ### Generate inner folds for nested cv
+  inner_loop <- function(fold, dfm, inner_k, class_type) {
+    # RNG needs to be set explicitly for each fold
+    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
+    ## Either createDataPartition for simple holdout parameter optimization
+    ## Or createFolds for proper inner CV for nested CV
+    # if (inner_k <= 1) {
+    #   inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
+    # } else {
+    inner_folds <- createFolds(as.factor(docvars(dfm[-fold], class_type)), k = inner_k)
+    # }
+    return(c(outer_fold = list(fold), inner_folds))
+  }
+
+  ### Generate outer folds for nested cv
+  generate_folds <- function(outer_k, inner_k, dfm, class_type){
+    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
+    folds <- createFolds(as.factor(docvars(dfm, class_type)), k = outer_k)
+    return(lapply(folds, inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type))
+  }
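+
+  ## The resulting structure bundles each outer fold with its inner folds, e.g.
+  ## for inner_k = 5: list(outer_fold = <indices>, Fold1 = <indices>, ...,
+  ## Fold5 = <indices>), which is what the length(fold) == inner_k + 1 check
+  ## in outer_cv() below relies on.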
+
+  ### Gets called for every parameter combination, and calls classifier for every inner cv fold
+  inner_cv <- function(row, grid, outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
+    params <- grid[row,]
+    # For each inner fold, cross-validate the specified parameters
+    res <- bind_rows(mclapply(inner_folds,
+                              classifier,
+                              outer_fold = outer_fold,
+                              params = params,
+                              dfm = dfm,
+                              class_type = class_type,
+                              model = model,
+                              cores_feats = cores_feats,
+                              mc.cores = cores_inner
+                              ))
+    # Return the mean performance over the inner folds, together with the parameters used
+    return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))), params))
+  }
+
+  ### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
+  outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
+    # If fold contains both inner folds and an outer fold
+    if (length(fold) == inner_k + 1) {
+      inner_folds <- fold[-1]
+      outer_fold <- fold$outer_fold
+      # For each row in grid, cross-validate results
+      res <- bind_rows(mclapply(seq(1, length(grid[[1]]), 1),
+                                inner_cv,
+                                cores_feats = cores_feats,
+                                grid = grid,
+                                dfm = dfm,
+                                class_type = class_type,
+                                model = model,
+                                outer_fold = outer_fold,
+                                inner_folds = inner_folds,
+                                cores_inner = cores_inner,
+                                mc.cores = cores_grid))
+      # Determine the optimal hyperparameters within the outer fold training set
+      optimum <- res[which.max(res[,opt_measure]),] %>%
+        select(percentiles:ncol(.))
+      # Validate the performance of the optimal hyperparameters on the outer fold test set
+      return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats))
+    } else {
+      # If there is no outer fold, go directly to parameter optimization using the inner folds, and return the performance of all hyperparameter combinations
+      inner_folds <- fold
+      outer_fold <- NULL
+      res <- bind_rows(mclapply(seq(1, length(grid[[1]]), 1),
+                                inner_cv,
+                                cores_feats = cores_feats,
+                                grid = grid,
+                                dfm = dfm,
+                                class_type = class_type,
+                                model = model,
+                                outer_fold = outer_fold,
+                                inner_folds = inner_folds,
+                                cores_inner = cores_inner,
+                                mc.cores = cores_grid))
+      return(res)
+    }
+  }
+
+  ### Custom tf-idf function to allow the same idf for different dfm's
+  custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
+    if (!nfeat(x) || !ndoc(x)) return(x)
+    tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
+    if (nfeat(x) != length(dfreq))
+      stop("missing some values in idf calculation")
+    # get the document indexes
+    j <- as(tfreq, "dgTMatrix")@j + 1
+    # replace just the non-zero values by their product with idf
+    x@x <- tfreq@x * dfreq[j]
+    # record attributes
+    x@weightTf <- tfreq@weightTf
+    x@weightDf <- list(scheme = scheme_df, base = base)
+    return(x)
+  }
+
+  ### Classification function
+  classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
+    # If both inner and outer folds are given, subset the dfm to the outer fold training set, then create train and test sets according to the inner fold, and evaluate performance
+    if (length(inner_fold) > 0 && length(outer_fold) > 0) {
+      dfm_train <- dfm[-outer_fold] %>%
+        .[-inner_fold]
+      dfm_test <- dfm[-outer_fold] %>%
+        .[inner_fold]
+    # If only outer folds, but no inner folds, validate the performance of the outer fold training data on the outer fold test data
+    } else if (length(outer_fold) > 0 ) {
+      dfm_train <- dfm[-outer_fold]
+      dfm_test <- dfm[outer_fold]
+    # If only inner folds, validate performance directly on the inner folds
+    } else if (length(inner_fold) > 0 ) {
+      dfm_train <- dfm[-inner_fold]
+      dfm_test <- dfm[inner_fold]
+    # If both inner and outer folds are NULL, set the training set to the whole dataset, estimate the model and return the final model
+    } else {
+      final <- T ### Indicate final modeling run on whole dataset
+      dfm_train <- dfm
+    }
+    ### Getting features from the training dataset
+    # Getting idf from training data, and using it to normalize both training and testing feature occurrence
+    dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
+    dfreq <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0, use.names = T)
+    dfm_train <- custom_tfidf(dfm_train, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq)
+    words <- unlist(mclapply(unique(docvars(dfm_train, class_type)),
+                             feat_select,
+                             dfm = dfm_train,
+                             class_type = class_type,
+                             percentile = params$percentiles,
+                             measure = params$measures,
+                             mc.cores = cores_feats
+                             ))
+    dfm_train <- dfm_keep(dfm_train, words, valuetype = "fixed", verbose = T)
+
+    if (model == "nb") {
+      text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
+    }
+    if (model == "svm") {
+      text_model <- svm(x = dfm_train, y = as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = "linear", gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
+    }
+    ### Add more if statements for different models
+    if (exists("final", inherits = FALSE)) {
+      return(text_model)
+    } else {
+      ### Remove all features not in the training set from the test set, and weight the remaining features according to the training idf
+      dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
+      dfm_test <- custom_tfidf(dfm_test, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq[words])
+      pred <- predict(text_model, newdata = dfm_test)
+      class_table <- table(prediction = pred, trueValues = docvars(dfm_test, class_type))
+      conf_mat <- confusionMatrix(class_table, mode = "everything")
+      # With more than two classes, byClass is a matrix (one row per class): average over the classes
+      if (is.matrix(conf_mat$byClass) == T) {
+        return(cbind(as.data.frame(t(conf_mat$overall)), as.data.frame(t(colMeans(conf_mat$byClass))), params))
+      } else {
+        return(cbind(as.data.frame(t(conf_mat$overall)), as.data.frame(t(conf_mat$byClass)), params, pos_cat = conf_mat$positive))
+      }
+    }
+  }
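+
+  ## Which grid columns classifier() consumes: params$percentiles and
+  ## params$measures for feature selection in all cases, and additionally
+  ## params$gamma, params$cost and params$epsilon when model == "svm"; the
+  ## grid therefore has to supply those columns for the chosen model.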
+
+  ## Generate nested CV folds, based on the number of inner and outer folds defined above
+  folds <- generate_folds(outer_k, inner_k = inner_k, dfm = dfm, class_type = class_type)
+
+  ## Get the performance of each outer fold validation, and add a row with mean scores (this is the final performance indicator)
+  performance <- bind_rows(mclapply(folds, outer_cv, grid = grid, dfm = dfm, class_type = class_type, model = model, cores_grid = cores_grid, cores_inner = cores_inner, cores_feats = cores_feats, mc.cores = cores_outer)) %>%
+    bind_rows(., colMeans(select(., 1:`Balanced Accuracy`)))
+
+  ## Set seed and generate folds for the final hyperparameter optimization search (using CV)
+  set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
+  folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k = inner_k))
+
+  ## Get the final hyperparameter performance for all value combinations in grid
+  params_final <- bind_rows(mclapply(folds_final, outer_cv, grid = grid, dfm = dfm, class_type = class_type, model = model, cores_grid = cores_grid, cores_inner = cores_inner, cores_feats = cores_feats, mc.cores = cores_outer))
+
+  ## Select the optimal final hyperparameters
+  optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>%
+    select(percentiles:ncol(.))
+
+  ## Estimate the final model on the whole dataset, using the optimal final hyperparameters determined above
+  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = detectCores())
+  ## Keep only the objects worth saving; generate the filename once, so that the saved file and the returned path are guaranteed to match
+  rm(list = setdiff(ls(), c("model_final", "optimum_final", "params_final", "performance", "grid", "folds", "folds_final", "country", "model", "class_type", "opt_measure")), envir = environment())
+  filename <- paste0(getwd(), '/', country, '_', model, '_', class_type, '_', opt_measure, '_', Sys.time(), '.RData')
+  save(list = setdiff(ls(all.names = TRUE), "filename"), file = filename, envir = environment())
+  return(filename)
+}
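Another editorial sketch, for reviewers rather than the package itself: a minimal end-to-end call of modelizer(), assuming `dfm` carries a "junk" docvar; all values are illustrative.

grid <- crossing(percentiles = c(0.90, 0.95, 0.99), measures = c("chi2"))
out_file <- modelizer(dfm, cores_outer = 3, cores_grid = 1, cores_inner = 1,
                      cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5,
                      model = "nb", class_type = "junk",
                      opt_measure = "Balanced Accuracy", country = "nl", grid = grid)
load(out_file)  # restores model_final, performance, params_final, optimum_final
performance     # one row per outer fold, plus a closing row of column means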
diff --git a/man/class_update.Rd b/man/class_update.Rd
new file mode 100644
index 0000000..12c5548
--- /dev/null
+++ b/man/class_update.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/class_update.R
+\name{class_update}
+\alias{class_update}
+\title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets); see the elasticizer documentation for more information}
+\usage{
+class_update(out, model_final, dfm_words, varname, es_super)
+}
+\arguments{
+\item{out}{Does not need to be defined explicitly (it is already parsed in the elasticizer function)}
+
+\item{model_final}{The classification model (e.g. output from textmodel_nb(), svm() or others)}
+
+\item{dfm_words}{A dfm containing all the words and only the words used to generate the model (used for subsetting)}
+
+\item{varname}{String containing the variable name to use for the classification result, usually in the format computerCodes.varname}
+
+\item{es_super}{Password for write access to Elasticsearch}
+}
+\value{
+As this is a nested function used within elasticizer, there is no return output
+}
+\description{
+Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets); see the elasticizer documentation for more information
+}
+\examples{
+elasticizer(query, src = T, es_pwd = es_pwd, update = class_update, model_final = model_final, dfm_words = dfm_words, varname = "computerCodes.varname", es_super = es_super)
+}
diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd
index c69975f..5ac31ad 100644
--- a/man/elasticizer.Rd
+++ b/man/elasticizer.Rd
@@ -16,6 +16,8 @@ elasticizer(query, src = T, index = "maml",
 \item{index}{The name of the Elasticsearch index to search through}
 
 \item{update}{When set, indicates an update function to use on each batch of 1000 articles}
+
+\item{...}{Parameters passed on to the update function}
 }
 \value{
 A data frame containing all the search results
diff --git a/man/modelizer.Rd b/man/modelizer.Rd
new file mode 100644
index 0000000..5bd3697
--- /dev/null
+++ b/man/modelizer.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/modelizer.R
+\name{modelizer}
+\alias{modelizer}
+\title{Generate a classification model}
+\usage{
+modelizer(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
+  outer_k, inner_k, model, class_type, opt_measure, country, grid)
+}
+\arguments{
+\item{dfm}{A quanteda dfm used to train and evaluate the model; should contain the vector with class labels in docvars}
+
+\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}
+
+\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows, i.e. possible parameter combinations; multiplies with cores_outer)}
+
+\item{cores_inner}{Number of cores to use for the inner CV loop (cannot be more than the number of inner CV folds; multiplies with cores_outer and cores_grid)}
+
+\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores_outer, cores_grid and cores_inner)}
+
+\item{seed}{Integer to use as seed for random number generation, ensures replicability}
+
+\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}
+
+\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}
+
+\item{model}{Classification algorithm to use ("nb" for Naïve Bayes using textmodel_nb, or "svm" for a linear SVM using e1071's svm)}
+
+\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}
+
+\item{opt_measure}{Label of the measure in the confusion matrix to use as performance indicator}
+
+\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for the filename)}
+
+\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
+}
+\value{
+The path to an .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
+}
+\description{
+Generate a nested cross-validated classification model based on a dfm with class labels as docvars.
+Currently supports Naïve Bayes ("nb", using quanteda's textmodel_nb) and linear SVM ("svm", using e1071's svm).
+Hyperparameter optimization is enabled through the grid parameter.
+A grid should be generated from vectors with the labels as described for each model, using the crossing() command.
+For Naïve Bayes, the following parameters can be used:
+- percentiles (cutoff point for tf-idf feature selection)
+- measures (what measure to use for determining feature importance, see textstat_keyness for options)
+For SVM, the grid should additionally provide gamma, cost and epsilon values.
+}
+\examples{
+modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
+}
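A closing note on the core parameters: they multiply rather than add, so the number of concurrent processes is roughly cores_outer * cores_grid * cores_inner * cores_feats. A hedged sketch for a 16-core machine (illustrative values; each parameter still has to respect the bounds documented above, and the grid is assumed to have at least two rows):

modelizer(dfm, cores_outer = 2, cores_grid = 2, cores_inner = 2, cores_feats = 2,
          seed = 42, outer_k = 3, inner_k = 5, model = "nb", class_type = "junk",
          opt_measure = "Balanced Accuracy", country = "nl",
          grid = grid)  # up to 2 * 2 * 2 * 2 = 16 workers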