diff --git a/NAMESPACE b/NAMESPACE
index ca7d071..5150447 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(actor_aggregation)
+export(actor_fetcher)
 export(actorizer)
 export(aggregator)
 export(aggregator_elastic)
diff --git a/R/actor_fetcher.R b/R/actor_fetcher.R
new file mode 100644
index 0000000..b3d21a6
--- /dev/null
+++ b/R/actor_fetcher.R
@@ -0,0 +1,104 @@
+#' Generate actor data frames (with sentiment) from database
+#'
+#' Generate actor data frames (with sentiment) from database
+#' @param out Data frame produced by elasticizer
+#' @param sent_dict Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)
+#' @param cores Number of threads to use for parallel processing
+#' @param localhost Currently not used by this function
+#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
+#' @return No return value, data per batch is saved in an RDS file
+#' @export
+#' @examples
+#' \dontrun{
+#' actor_fetcher(out, sent_dict = NULL, cores = 1)
+#' }
+#################################################################################################
+#################################### Aggregate actor results ################################
+#################################################################################################
+actor_fetcher <- function(out, sent_dict = NULL, cores = 1, localhost = NULL, validation = FALSE) {
+  ### Use `cores` workers for this call and restore the previous plan on exit
+  old_plan <- plan(future::multisession, workers = cores)
+  on.exit(plan(old_plan), add = TRUE)
+
+  ### Functions
+  ### Collapse sentence-level scores into one row of totals and ratios.
+  ### `sent` has to come first: later arguments overwrite sent_sum and words.
+  sent_totals <- function(ud_sent) {
+    summarise(ud_sent,
+      sent = sum(sent_sum)/sum(words),
+      sent_sum = sum(sent_sum),
+      sent_words = sum(sent_words),
+      words = sum(words),
+      arousal = sum(sent_words)/sum(words)
+    )
+  }
+
+  ### Calculate sentiment scores for each actor-document
+  sent_scorer <- function(row, out_row, ud_sent) {
+    ### Aggregated sentiment per actor (over all sentences containing actor)
+    actor <- filter(ud_sent, sentence_id %in% unlist(out_row[row,]$sentence_id)) %>%
+      sent_totals()
+    cbind(out_row[row,], data.frame(actor = actor))
+  }
+
+  ### Build the actor rows for one document (one row of `out`)
+  par_sent <- function(row, out, sent_dict = NULL) {
+    out_row <- out[row,]
+    ### Generating actor dataframe, unnest by actorsDetail
+    if (!is.null(sent_dict)) {
+      ud_sent <- out_row$`_source.ud`[[1]] %>%
+        select(-one_of('exists')) %>%
+        unnest() %>%
+        filter(upos != 'PUNCT') %>% # For getting proper word counts
+        mutate(V1 = str_c(lemma,'_',upos)) %>%
+        left_join(sent_dict, by = 'V1') %>%
+        ### Setting binary sentiment as unit of analysis
+        mutate(V2 = V3) %>%
+        group_by(sentence_id) %>%
+        mutate(
+          V2 = case_when(
+            is.na(V2) ~ 0,
+            TRUE ~ V2
+          )
+        ) %>%
+        summarise(sent_sum = sum(V2),
+                  words = length(lemma),
+                  sent_words = length(na.omit(V3))) %>%
+        mutate(
+          sent = sent_sum/words,
+          arousal = sent_words/words
+        )
+      out_row <- select(out_row, -`_source.ud`) %>%
+        unnest(`_source.computerCodes.actorsDetail`, .preserve = colnames(.))
+      ### Aggregated sentiment per article (over all sentences in article)
+      text_sent <- sent_totals(ud_sent)
+      out_row <- bind_rows(lapply(seq_len(nrow(out_row)), sent_scorer, out_row = out_row, ud_sent = ud_sent)) %>%
+        cbind(., text = text_sent)
+      if (validation) {
+        ### Sentiment of the sentence referenced by the first coded sentence id
+        codes_sent <- filter(ud_sent, sentence_id == out_row$`_source.codes.sentence.id`[1]) %>%
+          select(-sentence_id)
+        out_row <- cbind(out_row, codes = codes_sent)
+      }
+    } else {
+      ### No pipe here, so `.` is undefined: name the data frame explicitly
+      out_row <- unnest(out_row, `_source.computerCodes.actorsDetail`, .preserve = colnames(out_row))
+    }
+    out_row %>%
+      mutate(
+        year = strftime(`_source.publication_date`, format = '%Y'),
+        yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
+        yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
+        yearweek = strftime(`_source.publication_date`, format = "%Y%V")
+      ) %>%
+      select(-`_source.computerCodes.actorsDetail`,
+             -`_score`,
+             -`_index`,
+             -`_type`)
+  }
+
+  ### Process every row of `out` and save the batch under an epoch-stamped file name
+  batch <- bind_rows(future_lapply(seq_len(nrow(out)), par_sent, out = out, sent_dict = sent_dict))
+  saveRDS(batch, file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
+  invisible(NULL)
+}
diff --git a/man/actor_aggregation.Rd b/man/actor_aggregation.Rd
index 33e40db..8d60aff 100644
--- a/man/actor_aggregation.Rd
+++ b/man/actor_aggregation.Rd
@@ -5,7 +5,7 @@
 \title{Generate aggregated actor measures from raw data}
 \usage{
 actor_aggregation(row, actors, es_pwd, localhost,
-  default_operator = "OR")
+  default_operator = "OR", sent_dict = NULL, cores = detectCores())
 }
 \arguments{
 \item{row}{The row of the actors data frame used for aggregation}
diff --git a/man/actor_fetcher.Rd b/man/actor_fetcher.Rd
new file mode 100644
index 0000000..5c7417b
--- /dev/null
+++ b/man/actor_fetcher.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/actor_fetcher.R
+\name{actor_fetcher}
+\alias{actor_fetcher}
+\title{Generate actor data frames (with sentiment) from database}
+\usage{
+actor_fetcher(out, sent_dict = NULL, cores = 1, localhost = NULL,
+  validation = FALSE)
+}
+\arguments{
+\item{out}{Data frame produced by elasticizer}
+
+\item{sent_dict}{Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)}
+
+\item{cores}{Number of threads to use for parallel processing}
+
+\item{localhost}{Currently not used by this function}
+
+\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
+}
+\value{
+No return value, data per batch is saved in an RDS file
+}
+\description{
+Generate actor data frames (with sentiment) from database
+}
+\examples{
+\dontrun{
+actor_fetcher(out, sent_dict = NULL, cores = 1)
+}
+}
diff --git a/man/aggregator_elastic.Rd b/man/aggregator_elastic.Rd
index 89a2f44..547b1dc 100644
--- a/man/aggregator_elastic.Rd
+++ b/man/aggregator_elastic.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/actor_aggregation_db.R
+% Please edit documentation in R/aggregator_elastic.R
 \name{aggregator_elastic}
 \alias{aggregator_elastic}
 \title{Generate and store aggregate actor measures to elasticsearch}
 \usage{
-aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
+aggregator_elastic(out, localhost = F, actorids, ver, es_super)
 }
 \arguments{
 \item{out}{The output provided by elasticizer()}
@@ -15,8 +15,6 @@ aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
 
 \item{ver}{String indicating the version of the update}
 
-\item{cores}{Numeric value indicating the number of cores to use for processing}
-
 \item{es_super}{Write password for ES}
 }
 \value{
@@ -26,5 +24,5 @@ Return value is based on output of elastic_update()
 Generate and store aggregate actor measures to elasticsearch
 }
 \examples{
-aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
+aggregator_elastic(out, localhost = F, actorids, ver, es_super)
 }