diff --git a/R/actorizer.R b/R/actorizer.R index 79d3f60..c7cf19b 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -9,11 +9,12 @@ #' @param identifier String used to mark highlights. Should be a lowercase string #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2') #' @param es_super Password for write access to ElasticSearch +#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available) #' @return As this is a nested function used within elasticizer, there is no return output #' @export #' @examples #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super) -actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) { +actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = detectCores()) { ### Function to filter out false positives using regex exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) { min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence @@ -118,7 +119,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t } } - out <- mamlr:::out_parser(out, field = 'highlight', clean = F) + out <- mamlr:::out_parser(out, field = 'highlight', clean = F, cores = cores) offsetter <- function(x, pre_tags, post_tags) { return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags)))) } @@ -126,7 +127,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t postfix[postfix==''] <- NA pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags) post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags) - out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = detectCores()) + out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = cores) # ids <- fromJSON(ids) updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, @@ -138,7 +139,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t pre_tags = pre_tags, post_tags_regex = post_tags_regex, post_tags = post_tags, - mc.cores = detectCores())) + mc.cores = cores)) if (nrow(updates) == 0) { print("Nothing to update for this batch") return(NULL) diff --git a/R/dfm_gen.R b/R/dfm_gen.R index 4974951..8f66735 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -5,6 +5,7 @@ #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). +#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available) #' @return A Quanteda dfm #' @export #' @examples @@ -17,16 +18,16 @@ # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack -dfm_gen <- function(out, words = '999', text = "lemmas", clean) { +dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = detectCores()) { # Create subset with just ids, codes and text out <- out %>% select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field fields <- length(names(out)) if (text == "lemmas" || text == 'ud' || text == 'ud_upos') { - out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = detectCores())) + out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = cores)) } if (text == "full") { - out <- mamlr:::out_parser(out, field = '_source' , clean = clean) + out <- mamlr:::out_parser(out, field = '_source' , clean = clean, cores = cores) } if ('_source.codes.majorTopic' %in% colnames(out)) { out <- out %>% diff --git a/R/modelizer.R b/R/modelizer.R index 4415a65..8bd43de 100644 --- a/R/modelizer.R +++ b/R/modelizer.R @@ -269,7 +269,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se select(percentiles: ncol(.)) ## Estimate final model on whole dataset, using optimum final hyperparameters determined above - model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = detectCores()) + model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer))) rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment()) save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment()) return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData')) diff --git a/R/out_parser.R b/R/out_parser.R index 9300306..5bfbb73 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -4,6 +4,7 @@ #' @param out The original output data frame #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code) +#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available) #' @return a parsed output data frame including the additional column 'merged', containing the merged text #' @examples #' out_parser(out,field) @@ -11,7 +12,7 @@ ################################################################################################# #################################### Parser function for output fields ########################## ################################################################################################# -out_parser <- function(out, field, clean = F) { +out_parser <- function(out, field, clean = F, cores = detectCores()) { fncols <- function(data, cname) { add <-cname[!cname%in%names(data)] @@ -80,7 +81,7 @@ out_parser <- function(out, field, clean = F) { if (Sys.info()[['sysname']] == "Windows") { cores <- 1 } else { - cores <- detectCores() + cores <- cores } out <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_parser, out = out, clean = clean, field = field, mc.cores = cores)) } diff --git a/man/actorizer.Rd b/man/actorizer.Rd index c2cc681..1345941 100644 --- a/man/actorizer.Rd +++ b/man/actorizer.Rd @@ -4,8 +4,8 @@ \alias{actorizer} \title{Updater function for elasticizer: Conduct actor searches} \usage{ -actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super, - ver) +actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, + es_super, ver, cores = detectCores()) } \arguments{ \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} @@ -18,11 +18,13 @@ actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super, \item{postfix}{Regex containing postfixes that should be excluded from hits} -\item{identifier}{String used to mark highlights. Should be a lowercase string} - \item{es_super}{Password for write access to ElasticSearch} \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')} + +\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)} + +\item{identifier}{String used to mark highlights. Should be a lowercase string} } \value{ As this is a nested function used within elasticizer, there is no return output diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd index 5ed3679..dfe2e17 100644 --- a/man/dfm_gen.Rd +++ b/man/dfm_gen.Rd @@ -4,7 +4,8 @@ \alias{dfm_gen} \title{Generates dfm from ElasticSearch output} \usage{ -dfm_gen(out, words = "999", text = "lemmas", clean) +dfm_gen(out, words = "999", text = "lemmas", clean, + cores = detectCores()) } \arguments{ \item{out}{The elasticizer-generated data frame} @@ -14,6 +15,8 @@ dfm_gen(out, words = "999", text = "lemmas", clean) \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).} + +\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)} } \value{ A Quanteda dfm diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd index ada666c..8dd3b67 100644 --- a/man/elasticizer.Rd +++ b/man/elasticizer.Rd @@ -6,7 +6,8 @@ \usage{ elasticizer(query, src = T, index = "maml", es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, - max_batch = Inf, update = NULL, localhost = F, ...) + max_batch = Inf, time_scroll = "5m", update = NULL, + localhost = F, ...) } \arguments{ \item{query}{A JSON-formatted query in the Elasticsearch query DSL} @@ -21,6 +22,8 @@ elasticizer(query, src = T, index = "maml", \item{max_batch}{Maximum number batches to retrieve} +\item{time_scroll}{Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)} + \item{update}{When set, indicates an update function to use on each batch of 1000 articles} \item{...}{Parameters passed on to the update function} diff --git a/man/out_parser.Rd b/man/out_parser.Rd index cb13609..eb8cae7 100644 --- a/man/out_parser.Rd +++ b/man/out_parser.Rd @@ -4,7 +4,7 @@ \alias{out_parser} \title{Parse raw text into a single field} \usage{ -out_parser(out, field, clean = F) +out_parser(out, field, clean = F, cores = detectCores()) } \arguments{ \item{out}{The original output data frame} @@ -12,6 +12,8 @@ out_parser(out, field, clean = F) \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)} + +\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)} } \value{ a parsed output data frame including the additional column 'merged', containing the merged text