removed multicore support, added parameters for dfm_gen

5 years ago · 4a0f2206fd
parent 274c9179cb
commit 4a0f2206fd
6 changed files with 41 additions and 17 deletions
--- a/R/class_update.R
+++ b/R/class_update.R
@ -18,7 +18,7 @@
 #################################################################################################
 #################################### Update any kind of classification ##########################
 #################################################################################################
-class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) {
+class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
  print('updating')
  dfm <- dfm_gen(out, text = text, words = words, clean = clean)
  if (!is.null(model_final$idf)) {
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@ -6,6 +6,8 @@
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
 #' @param tolower Boolean indicating whether dfm features should be lowercased
 #' @param binary Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE
 #' @param ngrams Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@ -18,7 +20,7 @@
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
-dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T, binary=F, ngrams=1) {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
@ -40,15 +42,17 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
          .$codes == 3101 ~ 1,
          .$codes == 34 ~ 1,
          TRUE ~ 0
-        )
+        ),
-      ) %>%
+        aggregate = .$codes %>%
      mutate(aggregate = .$codes %>%
          str_pad(4, side="right", pad="a") %>%
          str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
          .[,2] %>%
-               as.numeric()
+          as.numeric(),
        nondomestic = as.numeric(`_source.codes.nonDomestic`)
      ) %>%
      mutate(
      )
-   vardoc <- out[,-seq(1,(length(names(out))-3),1)]
+   vardoc <- select(out, codes, junk, aggregate, nondomestic)
  } else {
    vardoc <- NULL
  }
@ -62,6 +66,11 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
    }
  }
  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
-    dfm(tolower = tolower, stem = F, remove_punct = T, valuetype = "regex")
+    tokens(remove_punct = T) %>%
    tokens_ngrams(n = ngrams, skip = 0, concatenator = '_') %>%
    dfm(tolower = tolower, stem = F, valuetype = "regex")
  if (binary) {
    dfm <- dfm_weight(dfm, scheme = 'boolean')
  }
  return(dfm)
 }
--- a/R/estimator.R
+++ b/R/estimator.R
@ -38,7 +38,7 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
    dfm_test <- dfm[inner_folds[[params$inner_fold]],]
    # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
  } else {
-    final <- T ### Indicate final modeling run on whole dataset
+    dfm_test <- NULL
    dfm_train <- dfm
  }
@ -82,7 +82,7 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
  ### Add more if statements for different models
  # If training on whole dataset, return final model, and idf values from dataset
-  if (exists("final")) {
+  if (is.null(dfm_test)) {
    return(list(text_model=text_model, idf=idf))
  } else { # Create a test set, and classify test items
    # Use force=T to keep only features present in both training and test set
--- a/R/modelizer.R
+++ b/R/modelizer.R
@ -60,6 +60,8 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
    slice(which.max((!!as.name(opt_measure)))) %>%
    # Select only the columns outer_fold, and the columns that are in the original parameter grid
    select(outer_fold, colnames(grid))
  ## Create multithread work pool for future_lapply
  plan(strategy = multiprocess, workers = cores)
  # Use the estimator function to build optimum models for each outer_fold
  outer_cv_output <- future_lapply(1:nrow(outer_grid), estimator,
@ -78,6 +80,9 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
  final_grid <- final_folds$grid
  final_inner <- final_folds$inner_folds
  ## Create multithread work pool for future_lapply
  plan(strategy = multiprocess, workers = cores)
  # Use the estimator function to estimate the performance of each row in final_grid
  final_cv_output <- future_lapply(1:nrow(final_grid), estimator,
                                   grid = final_grid,
@ -111,8 +116,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
                           class_type = class_type,
                           model = model)
  # Create list with output variables
-  output <- list(final_cv_output = final_cv_output,
+  output <- list(final_params = final_params,
                 final_params = final_params,
                 outer_cv_output = outer_cv_output,
                 model_final = model_final,
                 grid = grid,
--- a/man/class_update.Rd
+++ b/man/class_update.Rd
@ -13,8 +13,7 @@ class_update(
  words,
  clean,
  ver,
-  es_super = .rs.askForPassword("ElasticSearch WRITE"),
+  es_super = .rs.askForPassword("ElasticSearch WRITE")
  cores = 1
 )
 }
 \arguments{
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@ -4,7 +4,15 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
+dfm_gen(
  out,
  words = "999",
  text = "lemmas",
  clean,
  tolower = T,
  binary = F,
  ngrams = 1
 )
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@ -16,6 +24,10 @@ dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
 \item{tolower}{Boolean indicating whether dfm features should be lowercased}
 \item{binary}{Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE}
 \item{ngrams}{Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1}
 }
 \value{
 A Quanteda dfm