removed multicore support, added parameters for dfm_gen

master
Erik de Vries 4 years ago
parent 274c9179cb
commit 4a0f2206fd

@ -18,7 +18,7 @@
################################################################################################# #################################################################################################
#################################### Update any kind of classification ########################## #################################### Update any kind of classification ##########################
################################################################################################# #################################################################################################
class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) { class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
print('updating') print('updating')
dfm <- dfm_gen(out, text = text, words = words, clean = clean) dfm <- dfm_gen(out, text = text, words = words, clean = clean)
if (!is.null(model_final$idf)) { if (!is.null(model_final$idf)) {

@ -6,6 +6,8 @@
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @param tolower Boolean indicating whether dfm features should be lowercased #' @param tolower Boolean indicating whether dfm features should be lowercased
#' @param binary Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE
#' @param ngrams Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1
#' @return A Quanteda dfm #' @return A Quanteda dfm
#' @export #' @export
#' @examples #' @examples
@ -18,7 +20,7 @@
# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) { dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T, binary=F, ngrams=1) {
# Create subset with just ids, codes and text # Create subset with just ids, codes and text
out <- out %>% out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
@ -40,15 +42,17 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
.$codes == 3101 ~ 1, .$codes == 3101 ~ 1,
.$codes == 34 ~ 1, .$codes == 34 ~ 1,
TRUE ~ 0 TRUE ~ 0
) ),
aggregate = .$codes %>%
str_pad(4, side="right", pad="a") %>%
str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
.[,2] %>%
as.numeric(),
nondomestic = as.numeric(`_source.codes.nonDomestic`)
) %>% ) %>%
mutate(aggregate = .$codes %>% mutate(
str_pad(4, side="right", pad="a") %>%
str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
.[,2] %>%
as.numeric()
) )
vardoc <- out[,-seq(1,(length(names(out))-3),1)] vardoc <- select(out, codes, junk, aggregate, nondomestic)
} else { } else {
vardoc <- NULL vardoc <- NULL
} }
@ -62,6 +66,11 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
} }
} }
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>% dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
dfm(tolower = tolower, stem = F, remove_punct = T, valuetype = "regex") tokens(remove_punct = T) %>%
tokens_ngrams(n = ngrams, skip = 0, concatenator = '_') %>%
dfm(tolower = tolower, stem = F, valuetype = "regex")
if (binary) {
dfm <- dfm_weight(dfm, scheme = 'boolean')
}
return(dfm) return(dfm)
} }

@ -38,7 +38,7 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
dfm_test <- dfm[inner_folds[[params$inner_fold]],] dfm_test <- dfm[inner_folds[[params$inner_fold]],]
# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
} else { } else {
final <- T ### Indicate final modeling run on whole dataset dfm_test <- NULL
dfm_train <- dfm dfm_train <- dfm
} }
@ -82,7 +82,7 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
### Add more if statements for different models ### Add more if statements for different models
# If training on whole dataset, return final model, and idf values from dataset # If training on whole dataset, return final model, and idf values from dataset
if (exists("final")) { if (is.null(dfm_test)) {
return(list(text_model=text_model, idf=idf)) return(list(text_model=text_model, idf=idf))
} else { # Create a test set, and classify test items } else { # Create a test set, and classify test items
# Use force=T to keep only features present in both training and test set # Use force=T to keep only features present in both training and test set

@ -60,6 +60,8 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
slice(which.max((!!as.name(opt_measure)))) %>% slice(which.max((!!as.name(opt_measure)))) %>%
# Select only the columns outer_fold, and the columns that are in the original parameter grid # Select only the columns outer_fold, and the columns that are in the original parameter grid
select(outer_fold, colnames(grid)) select(outer_fold, colnames(grid))
## Create multithread work pool for future_lapply
plan(strategy = multiprocess, workers = cores)
# Use the estimator function to build optimum models for each outer_fold # Use the estimator function to build optimum models for each outer_fold
outer_cv_output <- future_lapply(1:nrow(outer_grid), estimator, outer_cv_output <- future_lapply(1:nrow(outer_grid), estimator,
@ -78,6 +80,9 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
final_grid <- final_folds$grid final_grid <- final_folds$grid
final_inner <- final_folds$inner_folds final_inner <- final_folds$inner_folds
## Create multithread work pool for future_lapply
plan(strategy = multiprocess, workers = cores)
# Use the estimator function to estimate the performance of each row in final_grid # Use the estimator function to estimate the performance of each row in final_grid
final_cv_output <- future_lapply(1:nrow(final_grid), estimator, final_cv_output <- future_lapply(1:nrow(final_grid), estimator,
grid = final_grid, grid = final_grid,
@ -111,8 +116,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
class_type = class_type, class_type = class_type,
model = model) model = model)
# Create list with output variables # Create list with output variables
output <- list(final_cv_output = final_cv_output, output <- list(final_params = final_params,
final_params = final_params,
outer_cv_output = outer_cv_output, outer_cv_output = outer_cv_output,
model_final = model_final, model_final = model_final,
grid = grid, grid = grid,

@ -13,8 +13,7 @@ class_update(
words, words,
clean, clean,
ver, ver,
es_super = .rs.askForPassword("ElasticSearch WRITE"), es_super = .rs.askForPassword("ElasticSearch WRITE")
cores = 1
) )
} }
\arguments{ \arguments{

@ -4,7 +4,15 @@
\alias{dfm_gen} \alias{dfm_gen}
\title{Generates dfm from ElasticSearch output} \title{Generates dfm from ElasticSearch output}
\usage{ \usage{
dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T) dfm_gen(
out,
words = "999",
text = "lemmas",
clean,
tolower = T,
binary = F,
ngrams = 1
)
} }
\arguments{ \arguments{
\item{out}{The elasticizer-generated data frame} \item{out}{The elasticizer-generated data frame}
@ -16,6 +24,10 @@ dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
\item{tolower}{Boolean indicating whether dfm features should be lowercased} \item{tolower}{Boolean indicating whether dfm features should be lowercased}
\item{binary}{Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE}
\item{ngrams}{Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1}
} }
\value{ \value{
A Quanteda dfm A Quanteda dfm

Loading…
Cancel
Save