From 4a0f2206fdfe9d4c94369e742f4e85c514e902a5 Mon Sep 17 00:00:00 2001
From: Erik de Vries
Date: Thu, 15 Oct 2020 16:53:49 +0200
Subject: [PATCH] removed multicore support, added parameters for dfm_gen

---
 R/class_update.R    |  2 +-
 R/dfm_gen.R         | 27 ++++++++++++++++++---------
 R/estimator.R       |  4 ++--
 R/modelizer.R       |  8 ++++++--
 man/class_update.Rd |  3 +--
 man/dfm_gen.Rd      | 14 +++++++++++++-
 6 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/R/class_update.R b/R/class_update.R
index 68d1294..df9296b 100644
--- a/R/class_update.R
+++ b/R/class_update.R
@@ -18,7 +18,7 @@
 #################################################################################################
 #################################### Update any kind of classification ##########################
 #################################################################################################
-class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) {
+class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
   print('updating')
   dfm <- dfm_gen(out, text = text, words = words, clean = clean)
   if (!is.null(model_final$idf)) {
diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index d086b0c..f74d1a6 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -6,6 +6,8 @@
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
 #' @param tolower Boolean indicating whether dfm features should be lowercased
+#' @param binary Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE
+#' @param ngrams Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@@ -18,7 +20,7 @@
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
 
 
-dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T, binary=F, ngrams=1) {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
@@ -40,15 +42,17 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
         .$codes == 3101 ~ 1,
         .$codes == 34 ~ 1,
         TRUE ~ 0
-      )
+      ),
+      aggregate = .$codes %>%
+        str_pad(4, side="right", pad="a") %>%
+        str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
+        .[,2] %>%
+        as.numeric(),
+      nondomestic = as.numeric(`_source.codes.nonDomestic`)
     ) %>%
-    mutate(aggregate = .$codes %>%
-             str_pad(4, side="right", pad="a") %>%
-             str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
-             .[,2] %>%
-             as.numeric()
+    mutate(
     )
-  vardoc <- out[,-seq(1,(length(names(out))-3),1)]
+  vardoc <- select(out, codes, junk, aggregate, nondomestic)
 } else {
   vardoc <- NULL
 }
@@ -62,6 +66,11 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
     }
   }
   dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
-    dfm(tolower = tolower, stem = F, remove_punct = T, valuetype = "regex")
+    tokens(remove_punct = T) %>%
+    tokens_ngrams(n = ngrams, skip = 0, concatenator = '_') %>%
+    dfm(tolower = tolower, stem = F, valuetype = "regex")
+  if (binary) {
+    dfm <- dfm_weight(dfm, scheme = 'boolean')
+  }
   return(dfm)
 }
diff --git a/R/estimator.R b/R/estimator.R
index d5d8344..ee46187 100644
--- a/R/estimator.R
+++ b/R/estimator.R
@@ -38,7 +38,7 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
     dfm_test <- dfm[inner_folds[[params$inner_fold]],]
   # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
   } else {
-    final <- T ### Indicate final modeling run on whole dataset
+    dfm_test <- NULL
     dfm_train <- dfm
   }
 
@@ -82,7 +82,7 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
   ### Add more if statements for different models
   # If training on whole dataset, return final model, and idf values from dataset
-  if (exists("final")) {
+  if (is.null(dfm_test)) {
     return(list(text_model=text_model, idf=idf))
   } else {
   # Create a test set, and classify test items
   # Use force=T to keep only features present in both training and test set
diff --git a/R/modelizer.R b/R/modelizer.R
index dec7b85..f042672 100644
--- a/R/modelizer.R
+++ b/R/modelizer.R
@@ -60,6 +60,8 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
     slice(which.max((!!as.name(opt_measure)))) %>%
     # Select only the columns outer_fold, and the columns that are in the original parameter grid
     select(outer_fold, colnames(grid))
+  ## Create multithread work pool for future_lapply
+  plan(strategy = multiprocess, workers = cores)
 
   # Use the estimator function to build optimum models for each outer_fold
   outer_cv_output <- future_lapply(1:nrow(outer_grid), estimator,
@@ -78,6 +80,9 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
   final_grid <- final_folds$grid
   final_inner <- final_folds$inner_folds
 
+  ## Create multithread work pool for future_lapply
+  plan(strategy = multiprocess, workers = cores)
+
   # Use the estimator function to estimate the performance of each row in final_grid
   final_cv_output <- future_lapply(1:nrow(final_grid), estimator,
                                    grid = final_grid,
@@ -111,8 +116,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
                            class_type = class_type,
                            model = model)
   # Create list with output variables
-  output <- list(final_cv_output = final_cv_output,
-                 final_params = final_params,
+  output <- list(final_params = final_params,
                  outer_cv_output = outer_cv_output,
                  model_final = model_final,
                  grid = grid,
diff --git a/man/class_update.Rd b/man/class_update.Rd
index b183f5a..e3e3ccc 100644
--- a/man/class_update.Rd
+++ b/man/class_update.Rd
@@ -13,8 +13,7 @@ class_update(
   words,
   clean,
   ver,
-  es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = 1
+  es_super = .rs.askForPassword("ElasticSearch WRITE")
 )
 }
 \arguments{
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
index 595d08b..408bba1 100644
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@@ -4,7 +4,15 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
+dfm_gen(
+  out,
+  words = "999",
+  text = "lemmas",
+  clean,
+  tolower = T,
+  binary = F,
+  ngrams = 1
+)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@@ -16,6 +24,10 @@ dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
 
 \item{tolower}{Boolean indicating whether dfm features should be lowercased}
+
+\item{binary}{Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE}
+
+\item{ngrams}{Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1}
 }
 \value{
 A Quanteda dfm
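
Reviewer note: for anyone trying out the new dfm_gen() arguments, the sketch below illustrates what binary and ngrams do, expressed as plain quanteda calls. It is not part of the patch; the sample text and object names are invented for demonstration.

library(quanteda)

txt <- c(doc1 = "The cat sat on the mat.", doc2 = "The dog sat.")
toks <- tokens(txt, remove_punct = TRUE)

## ngrams = 2: adjacent tokens are joined with '_' before the dfm is built,
## mirroring the tokens_ngrams() step added to dfm_gen()
toks_bi <- tokens_ngrams(toks, n = 2, skip = 0, concatenator = "_")
dfm_counts <- dfm(toks_bi, tolower = TRUE)

## binary = TRUE: term counts are collapsed to 0/1 presence indicators,
## mirroring the dfm_weight() step added to dfm_gen()
dfm_binary <- dfm_weight(dfm_counts, scheme = "boolean")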
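Reviewer note: the plan() calls added to modelizer() follow the standard future / future.apply pattern, sketched minimally below. slow_task and the worker count are invented for illustration; note also that newer releases of the future package deprecate the multiprocess strategy used in the patch in favour of multisession.

library(future.apply)  # attaches the future package, which provides plan()

slow_task <- function(i) {
  Sys.sleep(0.1)  # stand-in for one model estimation run
  i^2
}

## Set up a pool of background workers, then fan the iterations out over it
plan(strategy = multisession, workers = 2)
results <- future_lapply(1:4, slow_task)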