actorizer, dfm_gen, modelizer, out_parser: replaced all instances of detectCores() with a cores parameter (which defaults to detectCores())

master
Erik de Vries 5 years ago
parent ac37d836f5
commit 8051a81b66
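
The change follows a single pattern: every function that previously hard-wired detectCores() at its parallel call sites now takes a cores argument defaulting to detectCores(), so callers can cap parallelism while the old behaviour remains the default. A minimal sketch of the pattern (process_batch is illustrative, not package code):

library(parallel)

# Before: mc.cores was fixed to detectCores() inside the function body.
# After: the caller decides, with the old behaviour as the default.
process_batch <- function(items, cores = detectCores()) {
  mclapply(items, function(x) x^2, mc.cores = cores)
}

# Callers can now limit parallelism explicitly:
str(process_batch(1:8, cores = 2))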

@@ -9,11 +9,12 @@
 #' @param identifier String used to mark highlights. Should be a lowercase string
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
 #' @param es_super Password for write access to ElasticSearch
+#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return As this is a nested function used within elasticizer, there is no return output
 #' @export
 #' @examples
 #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
-actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
+actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = detectCores()) {
   ### Function to filter out false positives using regex
   exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex, pre_tags, post_tags, prefix, postfix) {
     min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
@@ -118,7 +119,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
     }
   }
-  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
+  out <- mamlr:::out_parser(out, field = 'highlight', clean = F, cores = cores)
   offsetter <- function(x, pre_tags, post_tags) {
     return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
   }
@@ -126,7 +127,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
   postfix[postfix==''] <- NA
   pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
   post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
-  out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = detectCores())
+  out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = cores)
   # ids <- fromJSON(ids)
   updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer,
@@ -138,7 +139,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
                                 pre_tags = pre_tags,
                                 post_tags_regex = post_tags_regex,
                                 post_tags = post_tags,
-                                mc.cores = detectCores()))
+                                mc.cores = cores))
   if (nrow(updates) == 0) {
     print("Nothing to update for this batch")
     return(NULL)
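
For context on the offsetter helper touched above: str_locate_all() finds the pre_tags positions in the tagged text, and offsetter subtracts the characters of the (i-1) earlier tag pairs from the i-th match, so the corrected start column indexes the highlighted token in the untagged text. A small illustration (tags chosen arbitrarily):

library(stringr)

pre <- "<em>"; post <- "</em>"
tagged <- "a <em>b</em> c <em>d</em>"
m <- str_locate_all(tagged, coll(pre))[[1]]      # tag starts at 3 and 16
m - ((row(m) - 1) * (nchar(pre) + nchar(post)))
# Corrected starts: 3 and 7 -- the positions of "b" and "d" in "a b c d"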

@@ -5,6 +5,7 @@
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
+#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@@ -17,16 +18,16 @@
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
-dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = detectCores()) {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
   if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
-    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = detectCores()))
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = cores))
   }
   if (text == "full") {
-    out <- mamlr:::out_parser(out, field = '_source' , clean = clean)
+    out <- mamlr:::out_parser(out, field = '_source' , clean = clean, cores = cores)
   }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
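
A hypothetical call showing the new argument in use (out stands in for an elasticizer result; the other values are illustrative):

dfm <- dfm_gen(out, words = '999', text = 'ud_upos', clean = FALSE, cores = 4)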

@@ -269,7 +269,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
     select(percentiles: ncol(.))
   ## Estimate final model on whole dataset, using optimum final hyperparameters determined above
-  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = detectCores())
+  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer)))
   rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment())
   save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment())
   return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'))
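
The final fit no longer grabs every core on the machine: the nested grid search splits the requested cores across outer folds, grid points, inner folds, and feature selection, so the single final model reuses the largest of those four allotments, presumably so it never exceeds what the caller requested. For illustration (hypothetical values):

cores_outer <- 2; cores_grid <- 2; cores_inner <- 2; cores_feats <- 4
max(c(cores_feats, cores_grid, cores_inner, cores_outer))  # 4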

@@ -4,6 +4,7 @@
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
+#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @examples
 #' out_parser(out,field)
@@ -11,7 +12,7 @@
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
-out_parser <- function(out, field, clean = F) {
+out_parser <- function(out, field, clean = F, cores = detectCores()) {
   fncols <- function(data, cname) {
     add <-cname[!cname%in%names(data)]
@@ -80,7 +81,7 @@ out_parser <- function(out, field, clean = F) {
     if (Sys.info()[['sysname']] == "Windows") {
       cores <- 1
     } else {
-      cores <- detectCores()
+      cores <- cores
     }
     out <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_parser, out = out, clean = clean, field = field, mc.cores = cores))
   }
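
Note the Windows guard above: mclapply() parallelises by forking, which is unavailable on Windows, so out_parser falls back to a single core there (after this commit the else branch is a no-op kept for symmetry). A minimal sketch of the same guard (safe_cores is illustrative, not part of the package):

library(parallel)

safe_cores <- function(cores = detectCores()) {
  # Forking is not supported on Windows, so force sequential execution there
  if (Sys.info()[["sysname"]] == "Windows") 1L else cores
}

unlist(mclapply(1:4, sqrt, mc.cores = safe_cores(2)))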

@@ -4,8 +4,8 @@
 \alias{actorizer}
 \title{Updater function for elasticizer: Conduct actor searches}
 \usage{
-actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super,
-  ver)
+actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags,
+  es_super, ver, cores = detectCores())
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
@@ -18,11 +18,13 @@ actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super,
 \item{postfix}{Regex containing postfixes that should be excluded from hits}

-\item{identifier}{String used to mark highlights. Should be a lowercase string}
-
 \item{es_super}{Password for write access to ElasticSearch}

 \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')}
+
+\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
+
+\item{identifier}{String used to mark highlights. Should be a lowercase string}
 }
 \value{
 As this is a nested function used within elasticizer, there is no return output

@@ -4,7 +4,8 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean)
+dfm_gen(out, words = "999", text = "lemmas", clean,
+  cores = detectCores())
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@@ -14,6 +15,8 @@ dfm_gen(out, words = "999", text = "lemmas", clean)
 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags}

 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
+
+\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
 }
 \value{
 A Quanteda dfm

@@ -6,7 +6,8 @@
 \usage{
 elasticizer(query, src = T, index = "maml",
   es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024,
-  max_batch = Inf, update = NULL, localhost = F, ...)
+  max_batch = Inf, time_scroll = "5m", update = NULL,
+  localhost = F, ...)
 }
 \arguments{
 \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@@ -21,6 +22,8 @@ elasticizer(query, src = T, index = "maml",
 \item{max_batch}{Maximum number of batches to retrieve}

+\item{time_scroll}{Time to keep the scroll instance open (defaults to 5m; with a maximum of 500 allowed instances, this permits at most 100 new scrolls per minute)}
+
 \item{update}{When set, indicates an update function to use on each batch of 1000 articles}

 \item{...}{Parameters passed on to the update function}
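
time_scroll is passed to Elasticsearch's scroll API as the keep-alive for the search context between batches. A hypothetical invocation (the query string is a placeholder, not a package default):

elasticizer(query = '{"query": {"match_all": {}}}',
            time_scroll = "2m",  # shorter keep-alive when batches process quickly
            localhost = TRUE)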

@@ -4,7 +4,7 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field, clean = F)
+out_parser(out, field, clean = F, cores = detectCores())
 }
 \arguments{
 \item{out}{The original output data frame}
@@ -12,6 +12,8 @@ out_parser(out, field, clean = F)
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}

 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)}
+
+\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text
