elasticizer: added an option to dump data frames to .Rds files

out_parser: changed to single-core processing, as it proved faster
sentencizer: corrected documentation for the sent_dict parameter
parent aa6587b204
commit 5d99ec9509

@@ -8,6 +8,7 @@
 #' @param batch_size Batch size
 #' @param max_batch Maximum number of batches to retrieve
 #' @param time_scroll Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)
+#' @param dump Boolean indicating whether the data frames should be returned, or dumped as .Rds files
 #' @param update When set, indicates an update function to use on each batch of 1000 articles
 #' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
 #' @param ... Parameters passed on to the update function
@@ -19,7 +20,7 @@
 #################################################################################################
 #################################### Get data from ElasticSearch ################################
 #################################################################################################
-elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, time_scroll = "5m", update = NULL, localhost = F, ...){
+elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, time_scroll = "5m", dump = F, update = NULL, localhost = F, ...){
   retries <- 10 ### Number of retries on error
   sleep <- 30 ### Number of seconds between retries
   httr::set_config(httr::config(http_version = 0))
@@ -141,6 +142,8 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
   if (length(update) > 0) {
     scroll_clear(conn = conn, x = json$`_scroll_id`)
     return("Done updating")
+  } else if (dump) {
+    saveRDS(out, file = paste0('df_raw', as.numeric(as.POSIXct(Sys.time())), '.Rds'))
   } else {
     scroll_clear(conn = conn, x = json$`_scroll_id`)
     return(out)
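
For orientation, a minimal sketch of the new dump option; the query object q is a placeholder, and the filename shown is only an example of the timestamped names produced by the saveRDS call above:

# Hypothetical call: write the retrieved data to a timestamped .Rds file
# in the working directory instead of returning it
elasticizer(query = q, index = 'maml', dump = T)

# Read a dump back in later (example filename)
df_raw <- readRDS('df_raw1555588766.Rds')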

@@ -4,7 +4,6 @@
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
-#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @export
 #' @examples
@@ -13,8 +12,7 @@
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
-out_parser <- function(out, field, clean = F, cores = 1) {
-  plan(multiprocess, workers = cores)
+out_parser <- function(out, field, clean = F) {
   fncols <- function(data, cname) {
     add <- cname[!cname %in% names(data)]
@@ -80,5 +78,5 @@ out_parser <- function(out, field, clean = F, cores = 1) {
     {if (clean == T) str_replace_all(., "\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . }
     return(doc)
   }
-  out <- bind_rows(future_lapply(seq(1, length(out[[1]]), 1), par_parser, out = out, clean = clean, field = field))
+  return(par_parser(1:nrow(out), out = out, clean = clean, field = field))
 }
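
A quick sketch of the simplified single-core call, assuming out is a data frame returned by elasticizer:

# The cores argument is gone; par_parser now processes all rows in one call
parsed <- out_parser(out = out, field = '_source', clean = F)
parsed$merged  # the merged text column added by the parser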

@@ -2,7 +2,7 @@
 #'
 #' Generate sentence-level dataset with sentiment and actor presence
 #' @param out Data frame produced by elasticizer
-#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.
+#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.
 #' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
 #' @return No return value, data per batch is saved in an RDS file
 #' @export
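
To make the corrected sent_dict description concrete, a hypothetical dictionary using lemma_upos pairs (words and values invented for illustration):

sent_dict <- data.frame(
  lem_u = c('good_ADJ', 'crisis_NOUN'),  # lemma_upos pairs
  prox = c(1, -1)  # word values; use 1 for all words if there are no values
)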

@@ -12,6 +12,7 @@ elasticizer(
   batch_size = 1024,
   max_batch = Inf,
   time_scroll = "5m",
+  dump = F,
   update = NULL,
   localhost = F,
   ...
@@ -32,6 +33,8 @@ elasticizer(
 \item{time_scroll}{Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)}
+\item{dump}{Boolean indicating whether the data frames should be returned, or dumped as .Rds files}
 \item{update}{When set, indicates an update function to use on each batch of 1000 articles}
 \item{...}{Parameters passed on to the update function}

@@ -4,7 +4,7 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field, clean = F, cores = 1)
+out_parser(out, field, clean = F)
 }
 \arguments{
 \item{out}{The original output data frame}
@@ -12,8 +12,6 @@ out_parser(out, field, clean = F, cores = 1)
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)}
-\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text

@@ -9,7 +9,7 @@ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
 \arguments{
 \item{out}{Data frame produced by elasticizer}
-\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.}
+\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.}
 \item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
 }
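
Putting the pieces together, a hedged sketch of running sentencizer per batch through elasticizer's update hook; q and sent_dict are assumed to exist, and the exact call pattern is inferred from the documented parameters:

# update is applied to each batch, with ... passed on to the update function
elasticizer(query = q, update = sentencizer, sent_dict = sent_dict)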
