Updated elasticizer docs, created modelizer and class_update functions

7 years ago · d203de0b2a
parent c815dc7f2b
commit d203de0b2a
8 changed files with 343 additions and 1 deletions
--- a/4
+++ b/4
@ -9,7 +9,9 @@ Depends: R (>= 3.3.1),
    parallel,
    tidyverse,
    quanteda,
-    httr
+    httr,
+    caret,
+    e1071
 License: Copyright Erik de Vries
 Encoding: UTF-8
 LazyData: true
--- a/2
+++ b/2
@ -1,8 +1,10 @@
 # Generated by roxygen2: do not edit by hand

 export(bulk_writer)
+export(class_update)
 export(dfm_gen)
 export(dupe_detect)
 export(elastic_update)
 export(elasticizer)
 export(merger)
+export(modelizer)
--- a/R/class_update.R
+++ b/R/class_update.R
@ -0,0 +1,32 @@
+#' Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information
+#'
+#' Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information
+#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
+#' @param model_final The classification model (e.g. output from textstat_nb(), svm() or others)
+#' @param dfm_words A dfm containing all the words and only the words used to generate the model (is used for subsetting)
+#' @param varname String containing the variable name to use for the classification result, usually has the format computerCodes.varname
+#' @param es_super Password for write access to ElasticSearch
+#' @return As this is a nested function used within elasticizer, there is no return output
+#' @export
+#' @examples
+#' elasticizer(query, src = T, es_pwd = es_pwd, update = class_update, model_final = model_final, dfm_words = dfm_words, varname = computerCodes.varname, es_super = es_super)
+#################################################################################################
+#################################### Update any kind of classification ##########################
+#################################################################################################
+class_update <- function(out, model_final, dfm_words, varname, es_super) {
+  print('updating')
+  dfm <- dfm_gen(out, text = 'lemmas') %>%
+    dfm_keep(dfm_words, valuetype="fixed", verbose=T)
+  pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm))
+  bulk <- apply(pred, 1, bulk_writer, varname = varname)
+  res <- elastic_update(bulk, es_super = es_super)
+  stop_for_status(res)
+  content(res, "parsed", "application/json")
+  appData <- content(res)
+  if (appData$errors == T){
+    print(appData)
+    stop("Aborting, errors found during updating")
+  }
+  print("updated")
+  Sys.sleep(1)
+}
--- a/R/elasticizer.R
+++ b/R/elasticizer.R
@ -5,6 +5,7 @@
 #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
 #' @param index The name of the Elasticsearch index to search through
 #' @param update When set, indicates an update function to use on each batch of 1000 articles
+#' @param ... Parameters passed on to the update function

 #' @return A data frame containing all the search results
 #' @export
--- a/R/modelizer.R
+++ b/R/modelizer.R
@ -0,0 +1,224 @@
+#' Generate a classification model
+#'
+#' Generate a nested cross validated classification model based on a dfm with class labels as docvars
+#' Currently only supports Naïve Bayes using quanteda's textmodel_nb
+#' Hyperparemeter optimization is enabled through the grid parameter
+#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command
+#' For Naïve Bayes, the following parameters can be used:
+#' - percentiles (cutoff point for tf-idf feature selection)
+#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
+#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
+#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds)
+#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)
+#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)
+#' @param cores_feats Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)
+#' @param seed Integer to use as seed for random number generation, ensures replicability
+#' @param outer_k Number of outer cross-validation folds (for performance estimation)
+#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
+#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
+#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
+#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
+#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
+#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
+#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
+#' @export
+#' @examples
+#' modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
+#################################################################################################
+#################################### Function to generate classification models #################
+#################################################################################################
+
+modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
+  ### Functions ###
+  feat_select <- function (topic, dfm, class_type, percentile,measure) {
+    keyness <- textstat_keyness(dfm, measure = measure, docvars(dfm, class_type) == as.numeric(topic)) %>%
+      na.omit()
+    keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
+    return(keyness)
+  }
+  ### Generate inner folds for nested cv
+  inner_loop <- function(fold, dfm, inner_k, class_type) {
+    # RNG needs to be set explicitly for each fold
+    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
+    ## Either createDataPartition for simple holdout parameter optimization
+    ## Or createFolds for proper inner CV for nested CV
+    # if (inner_k <= 1) {
+    #   inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
+    # } else {
+    inner_folds <- createFolds(as.factor(docvars(dfm[-fold], class_type)), k= inner_k)
+    # }
+    return(c(outer_fold = list(fold),inner_folds))
+  }
+
+  ### Generate outer folds for nested cv
+  generate_folds <- function(outer_k, inner_k, dfm, class_type){
+    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
+    folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k)
+    return(lapply(folds,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type))
+  }
+
+  ### Gets called for every parameter combination, and calls classifier for every inner cv fold
+  inner_cv <- function(row,grid,outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
+    params <- grid[row,]
+    # For each inner fold, cross validate the specified parameters
+    res <-
+      bind_rows(mclapply(inner_folds,
+                         classifier,
+                         outer_fold = outer_fold,
+                         params = params,
+                         dfm = dfm,
+                         class_type = class_type,
+                         model = model,
+                         cores_feats = cores_feats,
+                         mc.cores = cores_inner
+      )
+      )
+
+    return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))),params))
+  }
+
+  ### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
+  outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
+    # If fold contains both inner folds and outer fold
+    if (length(fold) == inner_k + 1) {
+      inner_folds <- fold[-1]
+      outer_fold <- fold$outer_fold
+      # For each row in grid, cross-validate results
+      res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
+                                inner_cv,
+                                cores_feats= cores_feats,
+                                grid = grid,
+                                dfm = dfm,
+                                class_type = class_type,
+                                model = model,
+                                outer_fold = outer_fold,
+                                inner_folds = inner_folds,
+                                cores_inner = cores_inner,
+                                mc.cores = cores_grid)
+      )
+      # Determine optimum hyperparameters within outer fold training set
+      optimum <- res[which.max(res[,opt_measure]),] %>%
+        select(percentiles: ncol(.))
+      # Validate performance of optimum hyperparameters on outer fold test set
+      return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats))
+    } else {
+      # If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters
+      inner_folds <- fold
+      outer_fold <- NULL
+      res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
+                                inner_cv,
+                                cores_feats= cores_feats,
+                                grid = grid,
+                                dfm = dfm,
+                                class_type = class_type,
+                                model = model,
+                                outer_fold = outer_fold,
+                                inner_folds = inner_folds,
+                                cores_inner = cores_inner,
+                                mc.cores = cores_grid)
+      )
+      return(res)
+    }
+  }
+
+  ### Custom tfidf function to allow same idf for different dfm's
+  custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
+    if (!nfeat(x) || !ndoc(x)) return(x)
+    tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
+    if (nfeat(x) != length(dfreq))
+      stop("missing some values in idf calculation")
+    # get the document indexes
+    j <- as(tfreq, "dgTMatrix")@j + 1
+    # replace just the non-zero values by product with idf
+    x@x <- tfreq@x * dfreq[j]
+    # record attributes
+    x@weightTf <- tfreq@weightTf
+    x@weightDf <- c(list(scheme = scheme_df, base = base), args)
+    return(x)
+  }
+
+  ### Classification function
+  classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
+    # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
+    if (length(inner_fold) > 0 && length(outer_fold) > 0)  {
+      dfm_train <- dfm[-outer_fold] %>%
+        .[-inner_fold]
+      dfm_test <- dfm[-outer_fold] %>%
+        .[inner_fold]
+      # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
+    } else if (length(outer_fold) > 0 ) {
+      dfm_train <- dfm[-outer_fold]
+      dfm_test <- dfm[outer_fold]
+      # If only inner folds, validate performance directly on inner folds (is the same as above?)
+    } else if (length(inner_fold) > 0 ) {
+      dfm_train <- dfm[-inner_fold]
+      dfm_test <- dfm[inner_fold]
+      # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
+    } else {
+      final <- T ### Indicate final modeling run on whole dataset
+      dfm_train <- dfm
+    }
+    ### Getting features from training dataset
+    # Getting idf from training data, and using it to normalize both training and testing feature occurence
+    dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
+    dfreq <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0, use.names=T)
+    dfm_train <- custom_tfidf(dfm_train, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq)
+    words <- unlist(mclapply(unique(docvars(dfm_train, class_type)),
+                             feat_select,
+                             dfm = dfm_train,
+                             class_type = class_type,
+                             percentile = params$percentiles,
+                             measure = params$measures,
+                             mc.cores = cores_feats
+    ))
+    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
+
+
+    if (model == "nb") {
+      text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
+    }
+    if (model == "svm") {
+      text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = "linear", gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
+    }
+    ### Add more if statements for different models
+    if (exists("final") == T) {
+      return(text_model)
+    } else {
+      ### Removing all features not in training set from test set and weighting the remaining features according to training idf
+      dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
+      dfm_test <- custom_tfidf(dfm_test, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq[words])
+      pred <- predict(text_model, newdata = dfm_test)
+      class_table <- table(prediction = pred, trueValues = docvars(dfm_test, class_type))
+      conf_mat <- confusionMatrix(class_table, mode = "everything")
+      if (is.matrix(conf_mat$byClass) == T) {
+        return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params))
+      } else {
+        return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(conf_mat$byClass)),params, pos_cat = conf_mat$positive))
+      }
+    }
+  }
+
+  ## Generate nested CV folds, based on number of inner and outer folds defined (see start of script)
+  folds <- generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type)
+
+  ## Get performance of each outer fold validation, and add row with mean scores (This is the final performance indicator)
+  performance <- bind_rows(mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer)) %>%
+    bind_rows(., colMeans(select(., 1:`Balanced Accuracy`)))
+
+  ## Set seed and generate folds for final hyperparameter optimization search (using CV)
+  set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
+  folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k= inner_k))
+
+  ## Get the final hyperparameter performance for all value combinations in grid
+  params_final <- bind_rows(mclapply(folds_final, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer))
+
+  ## Select optimum final hyperparameters
+  optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>%
+    select(percentiles: ncol(.))
+
+  ## Estimate final model on whole dataset, using optimum final hyperparameters determined above
+  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = detectCores())
+  rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment())
+  save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment())
+  return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'))
+}
--- a/man/class_update.Rd
+++ b/man/class_update.Rd
@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/class_update.R
+\name{class_update}
+\alias{class_update}
+\title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information}
+\usage{
+class_update(out, model_final, dfm_words, varname, es_super)
+}
+\arguments{
+\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
+
+\item{model_final}{The classification model (e.g. output from textstat_nb(), svm() or others)}
+
+\item{dfm_words}{A dfm containing all the words and only the words used to generate the model (is used for subsetting)}
+
+\item{varname}{String containing the variable name to use for the classification result, usually has the format computerCodes.varname}
+
+\item{es_super}{Password for write access to ElasticSearch}
+}
+\value{
+As this is a nested function used within elasticizer, there is no return output
+}
+\description{
+Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information
+}
+\examples{
+elasticizer(query, src = T, es_pwd = es_pwd, update = class_update, model_final = model_final, dfm_words = dfm_words, varname = computerCodes.varname, es_super = es_super)
+}
--- a/man/elasticizer.Rd
+++ b/man/elasticizer.Rd
@ -16,6 +16,8 @@ elasticizer(query, src = T, index = "maml",
 \item{index}{The name of the Elasticsearch index to search through}

 \item{update}{When set, indicates an update function to use on each batch of 1000 articles}
+
+\item{...}{Parameters passed on to the update function}
 }
 \value{
 A data frame containing all the search results
--- a/man/modelizer.Rd
+++ b/man/modelizer.Rd
@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/modelizer.R
+\name{modelizer}
+\alias{modelizer}
+\title{Generate a classification model}
+\usage{
+modelizer(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
+  outer_k, inner_k, model, class_type, opt_measure, country, grid)
+}
+\arguments{
+\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
+
+\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}
+
+\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)}
+
+\item{cores_inner}{Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)}
+
+\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)}
+
+\item{seed}{Integer to use as seed for random number generation, ensures replicability}
+
+\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}
+
+\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}
+
+\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
+
+\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}
+
+\item{opt_measure}{Label of measure in confusion matrix to use as performance indicator}
+
+\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)}
+
+\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
+}
+\value{
+An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
+}
+\description{
+Generate a nested cross validated classification model based on a dfm with class labels as docvars
+Currently only supports Naïve Bayes using quanteda's textmodel_nb
+Hyperparemeter optimization is enabled through the grid parameter
+A grid should be generated from vectors with the labels as described for each model, using the crossing() command
+For Naïve Bayes, the following parameters can be used:
+- percentiles (cutoff point for tf-idf feature selection)
+- measures (what measure to use for determining feature importance, see textstat_keyness for options)
+}
+\examples{
+modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
+}