actorizer, dfm_gen, modelizer, out_parser: replaced all instances of detectCores() with a cores parameter (which defaults to detectCores())

master
Erik de Vries 5 years ago
parent ac37d836f5
commit 8051a81b66
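
The change follows a single pattern: every function that previously hard-wired detectCores() at its parallel call sites now takes a cores argument defaulting to detectCores(), so callers can cap parallelism while the old behaviour remains the default. A minimal sketch of the pattern (process_batch is illustrative, not package code):

library(parallel)

# Before: mc.cores was fixed to detectCores() inside the function body.
# After: the caller decides, with the old behaviour as the default.
process_batch <- function(items, cores = detectCores()) {
  mclapply(items, function(x) x^2, mc.cores = cores)
}

# Callers can now limit parallelism explicitly:
str(process_batch(1:8, cores = 2))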

@@ -9,11 +9,12 @@
 #' @param identifier String used to mark highlights. Should be a lowercase string
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
 #' @param es_super Password for write access to ElasticSearch
+#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return As this is a nested function used within elasticizer, there is no return output
 #' @export
 #' @examples
 #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
-actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
+actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = detectCores()) {
   ### Function to filter out false positives using regex
   exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex, pre_tags, post_tags, prefix, postfix) {
     min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
@@ -118,7 +119,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
     }
   }
-  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
+  out <- mamlr:::out_parser(out, field = 'highlight', clean = F, cores = cores)
   offsetter <- function(x, pre_tags, post_tags) {
     return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
   }
@@ -126,7 +127,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
   postfix[postfix==''] <- NA
   pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
   post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
-  out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = detectCores())
+  out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = cores)
   # ids <- fromJSON(ids)
   updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer,
@@ -138,7 +139,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
                                 pre_tags = pre_tags,
                                 post_tags_regex = post_tags_regex,
                                 post_tags = post_tags,
-                                mc.cores = detectCores()))
+                                mc.cores = cores))
   if (nrow(updates) == 0) {
     print("Nothing to update for this batch")
     return(NULL)
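
For context on the offsetter helper touched above: str_locate_all() finds the pre_tags positions in the tagged text, and offsetter subtracts the characters of the (i-1) earlier tag pairs from the i-th match, so the corrected start column indexes the highlighted token in the untagged text. A small illustration (tags chosen arbitrarily):

library(stringr)

pre <- "<em>"; post <- "</em>"
tagged <- "a <em>b</em> c <em>d</em>"
m <- str_locate_all(tagged, coll(pre))[[1]]      # tag starts at 3 and 16
m - ((row(m) - 1) * (nchar(pre) + nchar(post)))
# Corrected starts: 3 and 7 -- the positions of "b" and "d" in "a b c d"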

@@ -5,6 +5,7 @@
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
+#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@@ -17,16 +18,16 @@
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
-dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = detectCores()) {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
   if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
-    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = detectCores()))
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = cores))
   }
   if (text == "full") {
-    out <- mamlr:::out_parser(out, field = '_source' , clean = clean)
+    out <- mamlr:::out_parser(out, field = '_source' , clean = clean, cores = cores)
   }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
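
A hypothetical call showing the new argument in use (out stands in for an elasticizer result; the other values are illustrative):

dfm <- dfm_gen(out, words = '999', text = 'ud_upos', clean = FALSE, cores = 4)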

@@ -269,7 +269,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
     select(percentiles: ncol(.))
   ## Estimate final model on whole dataset, using optimum final hyperparameters determined above
-  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = detectCores())
+  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer)))
   rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment())
   save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment())
   return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'))
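
The final fit no longer grabs every core on the machine: the nested grid search splits the requested cores across outer folds, grid points, inner folds, and feature selection, so the single final model reuses the largest of those four allotments, presumably so it never exceeds what the caller requested. For illustration (hypothetical values):

cores_outer <- 2; cores_grid <- 2; cores_inner <- 2; cores_feats <- 4
max(c(cores_feats, cores_grid, cores_inner, cores_outer))  # 4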

@@ -4,6 +4,7 @@
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
+#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @examples
 #' out_parser(out,field)
@@ -11,7 +12,7 @@
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
-out_parser <- function(out, field, clean = F) {
+out_parser <- function(out, field, clean = F, cores = detectCores()) {
   fncols <- function(data, cname) {
     add <-cname[!cname%in%names(data)]
@@ -80,7 +81,7 @@ out_parser <- function(out, field, clean = F) {
     if (Sys.info()[['sysname']] == "Windows") {
       cores <- 1
     } else {
-      cores <- detectCores()
+      cores <- cores
     }
     out <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_parser, out = out, clean = clean, field = field, mc.cores = cores))
   }
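
Note the Windows guard above: mclapply() parallelises by forking, which is unavailable on Windows, so out_parser falls back to a single core there (after this commit the else branch is a no-op kept for symmetry). A minimal sketch of the same guard (safe_cores is illustrative, not part of the package):

library(parallel)

safe_cores <- function(cores = detectCores()) {
  # Forking is not supported on Windows, so force sequential execution there
  if (Sys.info()[["sysname"]] == "Windows") 1L else cores
}

unlist(mclapply(1:4, sqrt, mc.cores = safe_cores(2)))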

@@ -4,8 +4,8 @@
 \alias{actorizer}
 \title{Updater function for elasticizer: Conduct actor searches}
 \usage{
-actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super,
-  ver)
+actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags,
+  es_super, ver, cores = detectCores())
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
@@ -18,11 +18,13 @@ actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super,
 \item{postfix}{Regex containing postfixes that should be excluded from hits}

-\item{identifier}{String used to mark highlights. Should be a lowercase string}
-
 \item{es_super}{Password for write access to ElasticSearch}

 \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')}
+
+\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
+
+\item{identifier}{String used to mark highlights. Should be a lowercase string}
 }
 \value{
 As this is a nested function used within elasticizer, there is no return output

@@ -4,7 +4,8 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean)
+dfm_gen(out, words = "999", text = "lemmas", clean,
+  cores = detectCores())
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@@ -14,6 +15,8 @@ dfm_gen(out, words = "999", text = "lemmas", clean)
 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags}

 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
+
+\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
 }
 \value{
 A Quanteda dfm

@@ -6,7 +6,8 @@
 \usage{
 elasticizer(query, src = T, index = "maml",
   es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024,
-  max_batch = Inf, update = NULL, localhost = F, ...)
+  max_batch = Inf, time_scroll = "5m", update = NULL,
+  localhost = F, ...)
 }
 \arguments{
 \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@@ -21,6 +22,8 @@ elasticizer(query, src = T, index = "maml",
 \item{max_batch}{Maximum number of batches to retrieve}

+\item{time_scroll}{Time to keep the scroll instance open (defaults to 5m; with a maximum of 500 allowed instances, this permits at most 100 new scrolls per minute)}
+
 \item{update}{When set, indicates an update function to use on each batch of 1000 articles}

 \item{...}{Parameters passed on to the update function}
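
time_scroll is passed to Elasticsearch's scroll API as the keep-alive for the search context between batches. A hypothetical invocation (the query string is a placeholder, not a package default):

elasticizer(query = '{"query": {"match_all": {}}}',
            time_scroll = "2m",  # shorter keep-alive when batches process quickly
            localhost = TRUE)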

@@ -4,7 +4,7 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field, clean = F)
+out_parser(out, field, clean = F, cores = detectCores())
 }
 \arguments{
 \item{out}{The original output data frame}
@@ -12,6 +12,8 @@ out_parser(out, field, clean = F)
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}

 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)}
+
+\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text
