From 4a0f2206fdfe9d4c94369e742f4e85c514e902a5 Mon Sep 17 00:00:00 2001
From: Erik de Vries
Date: Thu, 15 Oct 2020 16:53:49 +0200
Subject: [PATCH] removed multicore support, added parameters for dfm_gen

---
 R/class_update.R    |  2 +-
 R/dfm_gen.R         | 27 ++++++++++++++++++---------
 R/estimator.R       |  4 ++--
 R/modelizer.R       |  8 ++++++--
 man/class_update.Rd |  3 +--
 man/dfm_gen.Rd      | 14 +++++++++++++-
 6 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/R/class_update.R b/R/class_update.R
index 68d1294..df9296b 100644
--- a/R/class_update.R
+++ b/R/class_update.R
@@ -18,7 +18,7 @@
 #################################################################################################
 #################################### Update any kind of classification ##########################
 #################################################################################################
-class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) {
+class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
   print('updating')
   dfm <- dfm_gen(out, text = text, words = words, clean = clean)
   if (!is.null(model_final$idf)) {
diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index d086b0c..f74d1a6 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -6,6 +6,8 @@
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
 #' @param tolower Boolean indicating whether dfm features should be lowercased
+#' @param binary Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE
+#' @param ngrams Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@@ -18,7 +20,7 @@
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
 
 
-dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T, binary=F, ngrams=1) {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
@@ -40,15 +42,17 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
         .$codes == 3101 ~ 1,
         .$codes == 34 ~ 1,
         TRUE ~ 0
-      )
+      ),
+      aggregate = .$codes %>%
+        str_pad(4, side="right", pad="a") %>%
+        str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
+        .[,2] %>%
+        as.numeric(),
+      nondomestic = as.numeric(`_source.codes.nonDomestic`)
     ) %>%
-    mutate(aggregate = .$codes %>%
-             str_pad(4, side="right", pad="a") %>%
-             str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
-             .[,2] %>%
-             as.numeric()
+    mutate(
     )
-  vardoc <- out[,-seq(1,(length(names(out))-3),1)]
+  vardoc <- select(out, codes, junk, aggregate, nondomestic)
 } else {
   vardoc <- NULL
 }
@@ -62,6 +66,11 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
     }
   }
   dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
-    dfm(tolower = tolower, stem = F, remove_punct = T, valuetype = "regex")
+    tokens(remove_punct = T) %>%
+    tokens_ngrams(n = ngrams, skip = 0, concatenator = '_') %>%
+    dfm(tolower = tolower, stem = F, valuetype = "regex")
+  if (binary) {
+    dfm <- dfm_weight(dfm, scheme = 'boolean')
+  }
   return(dfm)
 }
diff --git a/R/estimator.R b/R/estimator.R
index d5d8344..ee46187 100644
--- a/R/estimator.R
+++ b/R/estimator.R
@@ -38,7 +38,7 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
     dfm_test <- dfm[inner_folds[[params$inner_fold]],]
   # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
   } else {
-    final <- T ### Indicate final modeling run on whole dataset
+    dfm_test <- NULL
     dfm_train <- dfm
   }
 
@@ -82,7 +82,7 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
   ### Add more if statements for different models
   # If training on whole dataset, return final model, and idf values from dataset
-  if (exists("final")) {
+  if (is.null(dfm_test)) {
     return(list(text_model=text_model, idf=idf))
   } else {
   # Create a test set, and classify test items
   # Use force=T to keep only features present in both training and test set
diff --git a/R/modelizer.R b/R/modelizer.R
index dec7b85..f042672 100644
--- a/R/modelizer.R
+++ b/R/modelizer.R
@@ -60,6 +60,8 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
     slice(which.max((!!as.name(opt_measure)))) %>%
     # Select only the columns outer_fold, and the columns that are in the original parameter grid
     select(outer_fold, colnames(grid))
+  ## Create multithread work pool for future_lapply
+  plan(strategy = multiprocess, workers = cores)
 
   # Use the estimator function to build optimum models for each outer_fold
   outer_cv_output <- future_lapply(1:nrow(outer_grid), estimator,
@@ -78,6 +80,9 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
   final_grid <- final_folds$grid
   final_inner <- final_folds$inner_folds
 
+  ## Create multithread work pool for future_lapply
+  plan(strategy = multiprocess, workers = cores)
+
   # Use the estimator function to estimate the performance of each row in final_grid
   final_cv_output <- future_lapply(1:nrow(final_grid), estimator,
                                    grid = final_grid,
@@ -111,8 +116,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
                            class_type = class_type,
                            model = model)
   # Create list with output variables
-  output <- list(final_cv_output = final_cv_output,
-                 final_params = final_params,
+  output <- list(final_params = final_params,
                  outer_cv_output = outer_cv_output,
                  model_final = model_final,
                  grid = grid,
diff --git a/man/class_update.Rd b/man/class_update.Rd
index b183f5a..e3e3ccc 100644
--- a/man/class_update.Rd
+++ b/man/class_update.Rd
@@ -13,8 +13,7 @@ class_update(
   words,
   clean,
   ver,
-  es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = 1
+  es_super = .rs.askForPassword("ElasticSearch WRITE")
 )
 }
 \arguments{
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
index 595d08b..408bba1 100644
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@@ -4,7 +4,15 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
+dfm_gen(
+  out,
+  words = "999",
+  text = "lemmas",
+  clean,
+  tolower = T,
+  binary = F,
+  ngrams = 1
+)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@@ -16,6 +24,10 @@ dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
 
 \item{tolower}{Boolean indicating whether dfm features should be lowercased}
+
+\item{binary}{Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE}
+
+\item{ngrams}{Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1}
 }
 \value{
 A Quanteda dfm
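
Reviewer note: for anyone trying out the new dfm_gen() arguments, the sketch below illustrates what binary and ngrams do, expressed as plain quanteda calls. It is not part of the patch; the sample text and object names are invented for demonstration.

library(quanteda)

txt <- c(doc1 = "The cat sat on the mat.", doc2 = "The dog sat.")
toks <- tokens(txt, remove_punct = TRUE)

## ngrams = 2: adjacent tokens are joined with '_' before the dfm is built,
## mirroring the tokens_ngrams() step added to dfm_gen()
toks_bi <- tokens_ngrams(toks, n = 2, skip = 0, concatenator = "_")
dfm_counts <- dfm(toks_bi, tolower = TRUE)

## binary = TRUE: term counts are collapsed to 0/1 presence indicators,
## mirroring the dfm_weight() step added to dfm_gen()
dfm_binary <- dfm_weight(dfm_counts, scheme = "boolean")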
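Reviewer note: the plan() calls added to modelizer() follow the standard future / future.apply pattern, sketched minimally below. slow_task and the worker count are invented for illustration; note also that newer releases of the future package deprecate the multiprocess strategy used in the patch in favour of multisession.

library(future.apply)  # attaches the future package, which provides plan()

slow_task <- function(i) {
  Sys.sleep(0.1)  # stand-in for one model estimation run
  i^2
}

## Set up a pool of background workers, then fan the iterations out over it
plan(strategy = multisession, workers = 2)
results <- future_lapply(1:4, slow_task)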