actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents

6 years ago · 6f5ace8c52
parent edd4b785a5
commit 6f5ace8c52
5 changed files with 127 additions and 6 deletions
--- a/1
+++ b/1
@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 export(actor_aggregation)
 export(actor_fetcher)
 export(actorizer)
 export(aggregator)
 export(aggregator_elastic)
--- a/R/actor_fetcher.R
+++ b/R/actor_fetcher.R
@ -0,0 +1,95 @@
 #' Generate actor data frames (with sentiment) from database
 #'
 #' Generate actor data frames (with sentiment) from database
 #' @param out Data frame produced by elasticizer
 #' @param sent_dict Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)
 #' @param cores Number of threads to use for parallel processing
 #' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
 #' @return No return value, data per batch is saved in an RDS file
 #' @export
 #' @examples
 #' actor_fetcher(out, sent_dict = NULL, cores = 1)
 #################################################################################################
 #################################### Aggregate actor results ################################
 #################################################################################################
 actor_fetcher <- function(out, sent_dict = NULL, cores = 1, localhost = NULL, validation = F) {
  plan(multiprocess, workers = cores)
  ### Functions
  ### Calculate sentiment scores for each actor-document
  sent_scorer <- function(row, out_row, ud_sent) {
    ### Contains sentiment per sentence for actor
    actor_tone <- filter(ud_sent, sentence_id %in% unlist(out_row[row,]$sentence_id))
    ### Aggregated sentiment per actor (over all sentences containing actor)
    actor <- summarise(actor_tone,
                       sent = sum(sent_sum)/sum(words),
                       sent_sum = sum(sent_sum),
                       sent_words = sum(sent_words),
                       words = sum(words),
                       arousal = sum(sent_words)/sum(words)
    )
    return(cbind(out_row[row,],data.frame(actor = actor)))
  }
  par_sent <- function(row, out, sent_dict = NULL) {
    out_row <- out[row,]
    ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
    if (is.null(sent_dict) == F) {
      ud_sent <- out_row$`_source.ud`[[1]] %>%
        select(-one_of('exists')) %>%
        unnest() %>%
        filter(upos != 'PUNCT') %>% # For getting proper word counts
        mutate(V1 = str_c(lemma,'_',upos)) %>%
        left_join(sent_dict, by = 'V1') %>%
        ### Setting binary sentiment as unit of analysis
        mutate(V2 = V3) %>%
        group_by(sentence_id) %>%
        mutate(
          V2 = case_when(
            is.na(V2) == T ~ 0,
            TRUE ~ V2
          )
        ) %>%
        summarise(sent_sum = sum(V2),
                  words = length(lemma),
                  sent_words = length(na.omit(V3))) %>%
        mutate(
          sent = sent_sum/words,
          arousal = sent_words/words
        )
      out_row <- select(out_row, -`_source.ud`) %>%
        unnest(`_source.computerCodes.actorsDetail`, .preserve = colnames(.))
      ### Aggregated sentiment per article (over all sentences in article)
      text_sent <- summarise(ud_sent,
                             sent = sum(sent_sum)/sum(words),
                             sent_sum = sum(sent_sum),
                             sent_words = sum(sent_words),
                             words = sum(words),
                             arousal = sum(sent_words)/sum(words)
      )
      out_row <- bind_rows(lapply(seq(1,nrow(out_row),1),sent_scorer, out_row = out_row, ud_sent = ud_sent)) %>%
        cbind(., text = text_sent)
      if (validation == T) {
        codes_sent <- filter(ud_sent, sentence_id == out_row$`_source.codes.sentence.id`[1]) %>%
          select(-sentence_id)
        out_row <- cbind(out_row, codes = codes_sent)
      }
    } else {
      out_row <- unnest(out_row, `_source.computerCodes.actorsDetail`, .preserve = colnames(.))
    }
    out_row <- out_row %>%
      mutate(
        year = strftime(`_source.publication_date`, format = '%Y'),
        yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
        yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
        yearweek = strftime(`_source.publication_date`, format = "%Y%V")
      ) %>%
      select(-`_source.computerCodes.actorsDetail`,
             -`_score`,
             -`_index`,
             -`_type`)
    return(out_row)
  }
  saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
  return()
 }
--- a/man/actor_aggregation.Rd
+++ b/man/actor_aggregation.Rd
@ -5,7 +5,7 @@
 \title{Generate aggregated actor measures from raw data}
 \usage{
 actor_aggregation(row, actors, es_pwd, localhost,
-  default_operator = "OR")
+  default_operator = "OR", sent_dict = NULL, cores = detectCores())
 }
 \arguments{
 \item{row}{The row of the actors data frame used for aggregation}
--- a/man/actor_fetcher.Rd
+++ b/man/actor_fetcher.Rd
@ -0,0 +1,27 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/actor_fetcher.R
 \name{actor_fetcher}
 \alias{actor_fetcher}
 \title{Generate actor data frames (with sentiment) from database}
 \usage{
 actor_fetcher(out, sent_dict = NULL, cores = 1, localhost = NULL,
  validation = F)
 }
 \arguments{
 \item{out}{Data frame produced by elasticizer}
 \item{sent_dict}{Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)}
 \item{cores}{Number of threads to use for parallel processing}
 \item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
 }
 \value{
 No return value, data per batch is saved in an RDS file
 }
 \description{
 Generate actor data frames (with sentiment) from database
 }
 \examples{
 actor_fetcher(out, sent_dict = NULL, cores = 1)
 }
--- a/man/aggregator_elastic.Rd
+++ b/man/aggregator_elastic.Rd
@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/actor_aggregation_db.R
+% Please edit documentation in R/aggregator_elastic.R
 \name{aggregator_elastic}
 \alias{aggregator_elastic}
 \title{Generate and store aggregate actor measures to elasticsearch}
 \usage{
-aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
+aggregator_elastic(out, localhost = F, actorids, ver, es_super)
 }
 \arguments{
 \item{out}{The output provided by elasticizer()}
@ -15,8 +15,6 @@ aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
 \item{ver}{String indicating the version of the update}
 \item{cores}{Numeric value indicating the number of cores to use for processing}
 \item{es_super}{Write password for ES}
 }
 \value{
@ -26,5 +24,5 @@ Return value is based on output of elastic_update()
 Generate and store aggregate actor measures to elasticsearch
 }
 \examples{
-aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
+aggregator_elastic(out, localhost = F, actorids, ver, es_super)
 }