elasticizer: added option to dump data frames to .Rds files

out_parser: changed to single-core processing, which proved faster than the parallel implementation
sentencizer: corrected documentation for sent_dict parameter
parent aa6587b204
commit 5d99ec9509

@@ -8,6 +8,7 @@
 #' @param batch_size Batch size
 #' @param max_batch Maximum number of batches to retrieve
 #' @param time_scroll Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)
+#' @param dump Boolean indicating whether the data frames should be returned, or dumped as .Rds files
 #' @param update When set, indicates an update function to use on each batch of 1000 articles
 #' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
 #' @param ... Parameters passed on to the update function
@@ -19,7 +20,7 @@
 #################################################################################################
 #################################### Get data from ElasticSearch ################################
 #################################################################################################
-elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, time_scroll = "5m", update = NULL, localhost = F, ...){
+elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, time_scroll = "5m", dump = F, update = NULL, localhost = F, ...){
   retries <- 10 ### Number of retries on error
   sleep <- 30 ### Number of seconds between retries
   httr::set_config(httr::config(http_version = 0))
@@ -141,6 +142,8 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
   if (length(update) > 0) {
     scroll_clear(conn = conn, x = json$`_scroll_id`)
     return("Done updating")
+  } else if (dump) {
+    saveRDS(out, file = paste0('df_raw', as.numeric(as.POSIXct(Sys.time())), '.Rds'))
   } else {
     scroll_clear(conn = conn, x = json$`_scroll_id`)
     return(out)
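
A minimal usage sketch of the new dump option (the query string below is an invented placeholder; with dump = T the results are written as timestamped df_raw<epoch>.Rds files in the working directory instead of being returned):

# Sketch: retrieve documents and dump them to disk rather than returning them
query <- '{"query": {"match_all": {}}}'   # placeholder query, assumed for illustration
elasticizer(query = query, index = 'maml', dump = T)
# Dumped data can later be read back with readRDS(), e.g.:
# df <- readRDS('df_raw1568901234.Rds')   # illustrative filename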

@@ -4,7 +4,6 @@
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
-#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @export
 #' @examples
@@ -13,8 +12,7 @@
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
-out_parser <- function(out, field, clean = F, cores = 1) {
-  plan(multiprocess, workers = cores)
+out_parser <- function(out, field, clean = F) {
   fncols <- function(data, cname) {
     add <- cname[!cname %in% names(data)]
@@ -80,5 +78,5 @@ out_parser <- function(out, field, clean = F, cores = 1) {
     {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . }
     return(doc)
   }
-  out <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), par_parser, out = out, clean = clean, field = field))
+  return(par_parser(1:nrow(out), out = out, clean = clean, field = field))
 }
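
With the parallel code removed, out_parser is now a plain single-core call and no longer takes a cores argument. A minimal sketch (out_df is an assumed stand-in for an elasticizer result):

# Sketch: merge the '_source' text fields of each document into a single 'merged' column
parsed <- out_parser(out_df, field = '_source', clean = T)
head(parsed$merged)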

@@ -2,7 +2,7 @@
 #'
 #' Generate sentence-level dataset with sentiment and actor presence
 #' @param out Data frame produced by elasticizer
-#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.
+#' @param sent_dict Optional data frame containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.
 #' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
 #' @return No return value, data per batch is saved in an RDS file
 #' @export
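
For reference, a minimal sketch of a sent_dict matching the documented shape (the words and values are invented for illustration, and out is an assumed elasticizer result; when no real word values are available, prox is simply 1 for every entry):

# Sketch: lemma_upos-keyed sentiment dictionary with illustrative values
sent_dict <- data.frame(
  lem_u = c('good_ADJ', 'bad_ADJ', 'crisis_NOUN'),
  prox = c(1, -1, -1)
)
sentencizer(out, sent_dict = sent_dict)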

@@ -12,6 +12,7 @@ elasticizer(
   batch_size = 1024,
   max_batch = Inf,
   time_scroll = "5m",
+  dump = F,
   update = NULL,
   localhost = F,
   ...
@@ -32,6 +33,8 @@ elasticizer(
 \item{time_scroll}{Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)}
+\item{dump}{Boolean indicating whether the data frames should be returned, or dumped as .Rds files}
 \item{update}{When set, indicates an update function to use on each batch of 1000 articles}
 \item{...}{Parameters passed on to the update function}

@@ -4,7 +4,7 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field, clean = F, cores = 1)
+out_parser(out, field, clean = F)
 }
 \arguments{
 \item{out}{The original output data frame}
@@ -12,8 +12,6 @@ out_parser(out, field, clean = F, cores = 1)
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)}
-\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text

@@ -9,7 +9,7 @@ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
 \arguments{
 \item{out}{Data frame produced by elasticizer}
-\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.}
+\item{sent_dict}{Optional data frame containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.}
 \item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
 }
