mamlr/R/actor_fetcher.R

#' Generate actor data frames (with sentiment) from database
#'
#' Generate actor data frames (with sentiment) from database
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.
#' @param actor_ids Optional vector containing the actor ids to be collected
#' @param cores Number of threads to use for parallel processing
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' actor_fetcher(out, sent_dict = NULL, cores = 1)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, localhost = NULL, validation = F) {
  plan(multiprocess, workers = cores)
  ### Functions
  ### Calculate sentiment scores for each actor-document
  sent_scorer <- function(row, out_row, ud_sent) {
    ### Contains sentiment per sentence for actor
    actor_tone <- filter(ud_sent, sentence_id %in% unlist(out_row[row,]$sentence_id))

    ### Aggregated sentiment per actor (over all sentences containing actor)
    actor <- summarise(actor_tone,
                       sent = sum(sent_sum)/sum(words),
                       sent_sum = sum(sent_sum),
                       sent_words = sum(sent_words),
                       words = sum(words),
                       arousal = sum(sent_words)/sum(words)
    )
    return(cbind(out_row[row,],data.frame(actor = actor)))
  }

  aggregator <- function (pid, dupe_df) {
    ### Party ids excluding actors
    p_ids <- c(str_c(pid,'_f'),str_c(pid,'_s'))
    ### Party ids including actors
    p_ids_a <- c(p_ids,str_c(pid,'_a'))
    summarizer <- function (p_ids, dupe_df, merged_id) {
        id <- dupe_df$`_id`[[1]]
        dupe_df <- dupe_df %>%
          filter(ids %in% p_ids)
        if (nrow(dupe_df) > 0) {
          return(
            dupe_df %>% summarise(
              `_id` = first(`_id`),
              `_source.doctype` = first(`_source.doctype`),
              `_source.publication_date` = first(`_source.publication_date`),
              prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
              sentence_id = list(sort(unique(unlist(sentence_id)))),
              rel_first = list(max(unlist(rel_first))),
              ids = merged_id,
              occ = list(length(unique(unlist(sentence_id)))),
              first = list(min(unlist(sentence_id))),
              actor_start = list(sort(unique(unlist(actor_start)))),
              actor_end = list(sort(unique(unlist(actor_end)))),
              sentence_start = list(sort(unique(unlist(sentence_start)))),
              sentence_end = list(sort(unique(unlist(sentence_end))))
            )
          )
        } else {
          print(paste0('id:',id))
          return(NULL)
        }
    }
    party <- summarizer(p_ids, dupe_df, str_c(pid,'_mfs'))
    party_actor <- summarizer(p_ids_a, dupe_df, str_c(pid,'_mfsa'))
    return(bind_rows(party,party_actor))
  }

  par_sent <- function(row, out, sent_dict = NULL) {
    out_row <- out[row,]
    ### Generating sentence-level sentiment scores from ud
    if (is.null(sent_dict) == F) {
      ud_sent <- out_row$`_source.ud`[[1]] %>%
        select(-one_of('exists')) %>%
        unnest() %>%
        filter(upos != 'PUNCT') # For getting proper word counts
        if ("lem_u" %in% colnames(sent_dict)) {
          ud_sent <- ud_sent %>%
            mutate(lem_u = str_c(lemma,'_',upos)) %>%
            left_join(sent_dict, by = 'lem_u')
        } else if ("lemma" %in% colnames(sent_dict)) {
          ud_sent <- ud_sent %>%
            left_join(sent_dict, by = 'lemma') %>%
            mutate(lem_u = lemma)
        }
      ud_sent <- ud_sent %>%
        group_by(sentence_id) %>%
        mutate(
          prox = case_when(
            is.na(prox) == T ~ 0,
            TRUE ~ prox
          )
        ) %>%
        summarise(sent_sum = sum(prox),
                  words = length(lemma),
                  sent_words = sum(prox != 0),
                  sent_lemmas = list(lem_u[prox != 0])) %>%
        mutate(
          sent = sent_sum/words,
          arousal = sent_words/words
        )
      out_row <- select(out_row, -`_source.ud`)
    }

    if (validation == T) {
      codes_sent <- filter(ud_sent, sentence_id == out_row$`_source.codes.sentence.id`[1])
      return(cbind(out_row, codes = codes_sent))
    }

    ### Unnest out_row to individual actor ids
    out_row <- out_row %>%
      unnest(`_source.computerCodes.actorsDetail`) %>%
      mutate(ids_list = ids) %>%
      unnest(ids) %>%
      mutate(
        pids = str_sub(ids, start = 1, end = -3)
      )

    if (!is.null(actor_ids)) {
      out_row <- filter(out_row, ids %in% actor_ids )
    }

    ### Get list of party ids occuring more than once in the document
    pids_table <- table(out_row$pids)
    dupe_pids <- names(pids_table[pids_table > 1])%>%
      str_subset(pattern = fixed('P_'))
    single_pids <- names(pids_table[pids_table <= 1]) %>%
      str_subset(pattern = fixed('P_'))
    ### Data frame containing only duplicate party ids
    dupe_df <- out_row %>%
      filter(pids %in% dupe_pids)
    ### Data frame containing only single party ids
    single_df <- out_row %>%
      filter(pids %in% single_pids)

    ### Data frame for single occurrence mfsa
    single_party_actor <- single_df %>%
      mutate(
        ids = str_c(pids,'_mfsa')
      )
    ### Data frame for single occurence mfs
    single_party <- single_df %>%
      filter(!endsWith(ids, '_a')) %>%
      mutate(
        ids = str_c(pids,'_mfs')
      )
    out_row <- out_row %>%
      filter(startsWith(ids,'A_')) %>%
      bind_rows(., single_party, single_party_actor)
    ### For each of the party ids in the list above, aggregate to _mfs and _mfsa
    if (length(dupe_pids) > 0) {
      aggregate <- bind_rows(lapply(dupe_pids, aggregator, dupe_df = dupe_df))
      out_row <- bind_rows(out_row, aggregate)
    }

    ### Generating sentiment scores for article and actors
    if (is.null(sent_dict) == F) {
      ### Aggregated sentiment per article (over all sentences in article)
      text_sent <- summarise(ud_sent,
                             sent = sum(sent_sum)/sum(words),
                             sent_sum = sum(sent_sum),
                             sent_words = sum(sent_words),
                             words = sum(words),
                             arousal = sum(sent_words)/sum(words)
      )
      out_row <- bind_rows(lapply(seq(1,nrow(out_row),1),sent_scorer, out_row = out_row, ud_sent = ud_sent)) %>%
        cbind(., text = text_sent)
    }
    out_row <- out_row %>%
      mutate(
        year = strftime(`_source.publication_date`, format = '%Y'),
        yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
        yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
        yearweek = strftime(`_source.publication_date`, format = "%Y%V")
      ) %>%
      select(#-`_source.computerCodes.actorsDetail`,
             -`_score`,
             -`_index`,
             -`_type`,
             -pids)
    return(out_row)
  }
  saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
  return()
}
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`#' Generate actor data frames (with sentiment) from database`
			`#'`
			`#' Generate actor data frames (with sentiment) from database`
			`#' @param out Data frame produced by elasticizer`
actor_fetcher: added option for using dictionaries with just lemmas, besides the option of using lemma_upos dictionaries 4 years ago			`#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.`
actor_fetcher: Allow generation of actor df containing only specified actor ids and aggregations 4 years ago			`#' @param actor_ids Optional vector containing the actor ids to be collected`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`#' @param cores Number of threads to use for parallel processing`
			`#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring`
			`#' @return No return value, data per batch is saved in an RDS file`
			`#' @export`
			`#' @examples`
			`#' actor_fetcher(out, sent_dict = NULL, cores = 1)`
			`#################################################################################################`
			`#################################### Aggregate actor results ################################`
			`#################################################################################################`
actor_fetcher: Allow generation of actor df containing only specified actor ids and aggregations 4 years ago			`actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, localhost = NULL, validation = F) {`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`plan(multiprocess, workers = cores)`
			`### Functions`
			`### Calculate sentiment scores for each actor-document`
			`sent_scorer <- function(row, out_row, ud_sent) {`
			`### Contains sentiment per sentence for actor`
			`actor_tone <- filter(ud_sent, sentence_id %in% unlist(out_row[row,]$sentence_id))`

			`### Aggregated sentiment per actor (over all sentences containing actor)`
			`actor <- summarise(actor_tone,`
			`sent = sum(sent_sum)/sum(words),`
			`sent_sum = sum(sent_sum),`
			`sent_words = sum(sent_words),`
			`words = sum(words),`
			`arousal = sum(sent_words)/sum(words)`
			`)`
			`return(cbind(out_row[row,],data.frame(actor = actor)))`
			`}`

actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`aggregator <- function (pid, dupe_df) {`
			`### Party ids excluding actors`
			`p_ids <- c(str_c(pid,'_f'),str_c(pid,'_s'))`
			`### Party ids including actors`
			`p_ids_a <- c(p_ids,str_c(pid,'_a'))`
actor_fetcher: added handling of exception where all actorsids related to a party are individual actors 5 years ago			`summarizer <- function (p_ids, dupe_df, merged_id) {`
actor_fetcher: quick fix 5 years ago			id <- dupe_df$`_id`[[1]]
actor_fetcher: added handling of exception where all actorsids related to a party are individual actors 5 years ago			`dupe_df <- dupe_df %>%`
			`filter(ids %in% p_ids)`
			`if (nrow(dupe_df) > 0) {`
			`return(`
			`dupe_df %>% summarise(`
			`_id` = first(`_id`),
			`_source.doctype` = first(`_source.doctype`),
			`_source.publication_date` = first(`_source.publication_date`),
			`prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),`
			`sentence_id = list(sort(unique(unlist(sentence_id)))),`
			`rel_first = list(max(unlist(rel_first))),`
			`ids = merged_id,`
			`occ = list(length(unique(unlist(sentence_id)))),`
			`first = list(min(unlist(sentence_id))),`
			`actor_start = list(sort(unique(unlist(actor_start)))),`
			`actor_end = list(sort(unique(unlist(actor_end)))),`
			`sentence_start = list(sort(unique(unlist(sentence_start)))),`
			`sentence_end = list(sort(unique(unlist(sentence_end))))`
			`)`
actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`)`
actor_fetcher: added handling of exception where all actorsids related to a party are individual actors 5 years ago			`} else {`
actor_fetcher: added minimum verbosity to identify cases in which an actor is present without a party mention 5 years ago			`print(paste0('id:',id))`
actor_fetcher: added handling of exception where all actorsids related to a party are individual actors 5 years ago			`return(NULL)`
			`}`
actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`}`
			`party <- summarizer(p_ids, dupe_df, str_c(pid,'_mfs'))`
			`party_actor <- summarizer(p_ids_a, dupe_df, str_c(pid,'_mfsa'))`
			`return(bind_rows(party,party_actor))`
			`}`

actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`par_sent <- function(row, out, sent_dict = NULL) {`
			`out_row <- out[row,]`
actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`### Generating sentence-level sentiment scores from ud`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`if (is.null(sent_dict) == F) {`
			ud_sent <- out_row$`_source.ud`[[1]] %>%
			`select(-one_of('exists')) %>%`
			`unnest() %>%`
actor_fetcher: small fixes 4 years ago			`filter(upos != 'PUNCT') # For getting proper word counts`
actor_fetcher: added option for using dictionaries with just lemmas, besides the option of using lemma_upos dictionaries 4 years ago			`if ("lem_u" %in% colnames(sent_dict)) {`
			`ud_sent <- ud_sent %>%`
			`mutate(lem_u = str_c(lemma,'_',upos)) %>%`
			`left_join(sent_dict, by = 'lem_u')`
			`} else if ("lemma" %in% colnames(sent_dict)) {`
			`ud_sent <- ud_sent %>%`
actor_fetcher: small fixes 4 years ago			`left_join(sent_dict, by = 'lemma') %>%`
			`mutate(lem_u = lemma)`
actor_fetcher: added option for using dictionaries with just lemmas, besides the option of using lemma_upos dictionaries 4 years ago			`}`
			`ud_sent <- ud_sent %>%`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`group_by(sentence_id) %>%`
			`mutate(`
actor_fetcher: added lemma output when validating, to detect most problematic lemmas 5 years ago			`prox = case_when(`
			`is.na(prox) == T ~ 0,`
			`TRUE ~ prox`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`)`
			`) %>%`
actor_fetcher: added lemma output when validating, to detect most problematic lemmas 5 years ago			`summarise(sent_sum = sum(prox),`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`words = length(lemma),`
actor_fetcher: added lemma output when validating, to detect most problematic lemmas 5 years ago			`sent_words = sum(prox != 0),`
			`sent_lemmas = list(lem_u[prox != 0])) %>%`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`mutate(`
			`sent = sent_sum/words,`
			`arousal = sent_words/words`
			`)`
actor_fetch: fixed error where source.ud would not exist 5 years ago			out_row <- select(out_row, -`_source.ud`)
actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`}`
actor_aggregation,aggregator,aggregator_elastic: moved out of package directory to Old actor_fetcher: moved sentiment validation code block 5 years ago
			`if (validation == T) {`
			codes_sent <- filter(ud_sent, sentence_id == out_row$`_source.codes.sentence.id`[1])
			`return(cbind(out_row, codes = codes_sent))`
			`}`

actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`### Unnest out_row to individual actor ids`
actor_fetch: fixed error where source.ud would not exist 5 years ago			`out_row <- out_row %>%`
actor_fetcher: Updated to tidyr 1.0.0, no longer using preserve, slightly different approach to keeping ids_list, and not removing actorsDetail anymore because it does not exist 5 years ago			unnest(`_source.computerCodes.actorsDetail`) %>%
			`mutate(ids_list = ids) %>%`
			`unnest(ids) %>%`
actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`mutate(`
			`pids = str_sub(ids, start = 1, end = -3)`
			`)`

actor_fetcher: Allow generation of actor df containing only specified actor ids and aggregations 4 years ago			`if (!is.null(actor_ids)) {`
actor_fetcher: small fixes 4 years ago			`out_row <- filter(out_row, ids %in% actor_ids )`
actor_fetcher: Allow generation of actor df containing only specified actor ids and aggregations 4 years ago			`}`

actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`### Get list of party ids occuring more than once in the document`
			`pids_table <- table(out_row$pids)`
actor_fetcher: added minimum verbosity to identify cases in which an actor is present without a party mention 5 years ago			`dupe_pids <- names(pids_table[pids_table > 1])%>%`
			`str_subset(pattern = fixed('P_'))`
actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			`single_pids <- names(pids_table[pids_table <= 1]) %>%`
			`str_subset(pattern = fixed('P_'))`
			`### Data frame containing only duplicate party ids`
			`dupe_df <- out_row %>%`
			`filter(pids %in% dupe_pids)`
			`### Data frame containing only single party ids`
			`single_df <- out_row %>%`
			`filter(pids %in% single_pids)`

			`### Data frame for single occurrence mfsa`
			`single_party_actor <- single_df %>%`
			`mutate(`
			`ids = str_c(pids,'_mfsa')`
			`)`
			`### Data frame for single occurence mfs`
			`single_party <- single_df %>%`
			`filter(!endsWith(ids, '_a')) %>%`
			`mutate(`
			`ids = str_c(pids,'_mfs')`
			`)`
			`out_row <- out_row %>%`
			`filter(startsWith(ids,'A_')) %>%`
			`bind_rows(., single_party, single_party_actor)`
			`### For each of the party ids in the list above, aggregate to _mfs and _mfsa`
			`if (length(dupe_pids) > 0) {`
			`aggregate <- bind_rows(lapply(dupe_pids, aggregator, dupe_df = dupe_df))`
			`out_row <- bind_rows(out_row, aggregate)`
			`}`

			`### Generating sentiment scores for article and actors`
			`if (is.null(sent_dict) == F) {`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`### Aggregated sentiment per article (over all sentences in article)`
			`text_sent <- summarise(ud_sent,`
			`sent = sum(sent_sum)/sum(words),`
			`sent_sum = sum(sent_sum),`
			`sent_words = sum(sent_words),`
			`words = sum(words),`
			`arousal = sum(sent_words)/sum(words)`
			`)`
			`out_row <- bind_rows(lapply(seq(1,nrow(out_row),1),sent_scorer, out_row = out_row, ud_sent = ud_sent)) %>%`
			`cbind(., text = text_sent)`
			`}`
			`out_row <- out_row %>%`
			`mutate(`
			year = strftime(`_source.publication_date`, format = '%Y'),
			yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
			yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
			yearweek = strftime(`_source.publication_date`, format = "%Y%V")
			`) %>%`
actor_fetcher: Updated to tidyr 1.0.0, no longer using preserve, slightly different approach to keeping ids_list, and not removing actorsDetail anymore because it does not exist 5 years ago			select(#-`_source.computerCodes.actorsDetail`,
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			-`_score`,
			-`_index`,
actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way 5 years ago			-`_type`,
			`-pids)`
actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents 5 years ago			`return(out_row)`
			`}`
			`saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))`
			`return()`
			`}`