diff --git a/R/elasticizer.R b/R/elasticizer.R
index 0e353d7..69a8116 100644
--- a/R/elasticizer.R
+++ b/R/elasticizer.R
@@ -8,6 +8,7 @@
 #' @param batch_size Batch size
 #' @param max_batch Maximum number batches to retrieve
 #' @param time_scroll Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)
+#' @param dump Boolean indicating whether the data frames should be returned, or dumped as .Rds files
 #' @param update When set, indicates an update function to use on each batch of 1000 articles
 #' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
 #' @param ... Parameters passed on to the update function
@@ -19,7 +20,7 @@
 #################################################################################################
 #################################### Get data from ElasticSearch ################################
 #################################################################################################
-elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, time_scroll = "5m", update = NULL, localhost = F, ...){
+elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, time_scroll = "5m", dump = F, update = NULL, localhost = F, ...){
   retries <- 10 ### Number of retries on error
   sleep <- 30 ### Number of seconds between retries
   httr::set_config(httr::config(http_version = 0))
@@ -141,6 +142,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
     if (length(update) > 0) {
       scroll_clear(conn = conn, x = json$`_scroll_id`)
       return("Done updating")
+    } else if (dump) {
+      scroll_clear(conn = conn, x = json$`_scroll_id`)
+      saveRDS(out, file = paste0('df_raw', as.numeric(as.POSIXct(Sys.time())), '.Rds'))
     } else {
       scroll_clear(conn = conn, x = json$`_scroll_id`)
       return(out)
diff --git a/R/out_parser.R b/R/out_parser.R
index 4d30fde..ebc6066 100644
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -4,7 +4,6 @@
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
-#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @export
 #' @examples
@@ -13,8 +12,7 @@
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
-out_parser <- function(out, field, clean = F, cores = 1) {
-  plan(multiprocess, workers = cores)
+out_parser <- function(out, field, clean = F) {
 
   fncols <- function(data, cname) {
     add <-cname[!cname%in%names(data)]
@@ -80,5 +78,5 @@ out_parser <- function(out, field, clean = F, cores = 1) {
       {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . }
     return(doc)
   }
-  out <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), par_parser, out = out, clean = clean, field = field))
+  return(par_parser(1:nrow(out), out = out, clean = clean, field = field))
 }
diff --git a/R/sentencizer.R b/R/sentencizer.R
index 85747d7..bd60f65 100644
--- a/R/sentencizer.R
+++ b/R/sentencizer.R
@@ -2,7 +2,7 @@
 #'
 #' Generate sentence-level dataset with sentiment and actor presence
 #' @param out Data frame produced by elasticizer
-#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.
+#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.
 #' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
 #' @return No return value, data per batch is saved in an RDS file
 #' @export
diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd
index 6fcba75..9e28fb5 100644
--- a/man/elasticizer.Rd
+++ b/man/elasticizer.Rd
@@ -12,6 +12,7 @@ elasticizer(
   batch_size = 1024,
   max_batch = Inf,
   time_scroll = "5m",
+  dump = F,
   update = NULL,
   localhost = F,
   ...
@@ -32,6 +33,8 @@ elasticizer(
 
 \item{time_scroll}{Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)}
 
+\item{dump}{Boolean indicating whether the data frames should be returned, or dumped as .Rds files}
+
 \item{update}{When set, indicates an update function to use on each batch of 1000 articles}
 
 \item{...}{Parameters passed on to the update function}
diff --git a/man/out_parser.Rd b/man/out_parser.Rd
index de20048..3c28cea 100644
--- a/man/out_parser.Rd
+++ b/man/out_parser.Rd
@@ -4,7 +4,7 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field, clean = F, cores = 1)
+out_parser(out, field, clean = F)
 }
 \arguments{
 \item{out}{The original output data frame}
@@ -12,8 +12,6 @@ out_parser(out, field, clean = F, cores = 1)
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
 
 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)}
-
-\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text
diff --git a/man/sentencizer.Rd b/man/sentencizer.Rd
index d4d58ae..3d40d5f 100644
--- a/man/sentencizer.Rd
+++ b/man/sentencizer.Rd
@@ -9,7 +9,7 @@ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
 \arguments{
 \item{out}{Data frame produced by elasticizer}
 
-\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.}
+\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.}
 
 \item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
 }
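A minimal sketch of the new `dump` path, assuming a placeholder match-all query (the default `es_pwd` prompts via RStudio); the file-name pattern follows the `paste0('df_raw', ..., '.Rds')` call above:

```r
# Illustrative only: pull documents, but write the accumulated data frame to a
# timestamped .Rds file in the working directory instead of returning it.
elasticizer(
  query = '{"query": {"match_all": {}}}',  # placeholder query
  index = 'maml',
  dump = TRUE
)

# Locate and restore the dump afterwards; the pattern matches the
# timestamped file name produced by saveRDS() above.
dump_file <- list.files(pattern = '^df_raw[0-9.]+\\.Rds$')
df_raw <- readRDS(dump_file[1])
```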
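With the `cores` argument gone, `out_parser()` parses all rows in a single call. Callers that still want parallelism can shard the input themselves; a sketch using base `parallel` (forking, so not available on Windows), assuming only that `out` is an elasticizer result whose rows are parsed independently:

```r
library(parallel)
library(dplyr)

# Split the elasticizer output into ~1000-row chunks, parse each chunk on its
# own forked worker, and reassemble the rows in their original order.
chunks <- split(out, ceiling(seq_len(nrow(out)) / 1000))
parsed <- bind_rows(
  mclapply(chunks, out_parser, field = '_source', clean = FALSE, mc.cores = 4)
)
```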
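The reworded `sent_dict` contract is easiest to see with two toy dictionaries; the words and values below are invented for illustration:

```r
# Dictionary keyed on lemma_upos pairs ("lem_u"), with real word values in "prox".
sent_dict_valued <- data.frame(
  lem_u = c("good_ADJ", "terrible_ADJ", "increase_VERB"),
  prox = c(0.8, -0.9, 0.1)
)

# Dictionary keyed on bare lemmas, without values: "prox" is 1 for every word,
# so each match counts equally.
sent_dict_flat <- data.frame(
  lemma = c("good", "terrible", "increase"),
  prox = c(1, 1, 1)
)

# Either form can then be passed to sentencizer():
# sentencizer(out, sent_dict = sent_dict_valued)
```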