actor_merger: overhaul to include cutoffs at sentence level as intended, also included options to generate sentiment for text only (don't provide actors_meta or actor_groups)

5 years ago · 18d47762d2
parent 74909ca3a0
commit 18d47762d2
1 changed files with 112 additions and 58 deletions
--- a/R/actor_merger.R
+++ b/R/actor_merger.R
@ -4,6 +4,8 @@
 #' @param df Data frame with actor ids, produced by sentencizer
 #' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
 #' @param actor_groups Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
+#' @param pos_cutoff Optional value above which sentence-level sentiment scores should be considered "positive"
+#' @param neg_cutoff Optional value below which sentence-level sentiment scores should be considered "negative"
 #' @return When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list
 #' @export
 #' @examples
@ -16,7 +18,7 @@
 ### some individual actors, where the partyId of an individual actor doesn't match an actual
 ### partyId in the actor dataset

-actor_merger <- function(df, actors_meta, actor_groups = NULL) {
+actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff = NULL, neg_cutoff = NULL) {
  grouper <- function(id2, df) {
    if ('P_1206_a' %in% id2) {
      id2 <- c('P_212_a','P_1771_a',id2)
@ -30,6 +32,7 @@ actor_merger <- function(df, actors_meta, actor_groups = NULL) {
    return(df[ids %in% id2,] %>%
             .[!duplicated(.,by = c('id','sentence_id')),.(
               actor.sent = sum(sent_sum)/sum(words),
+               actor.sent_binary = sum(sent_binary_weighted)/sum(words),
               actor.sent_sum = sum(sent_sum),
               actor.sent_words = sum(sent_words),
               actor.words = sum(words),
@ -50,8 +53,6 @@ actor_merger <- function(df, actors_meta, actor_groups = NULL) {
    id = as.factor(`_id`)
  ), .SDcols = !c('_source.doctype','_source.publication_date','_id')]

-  text_sent <- df[,.SD, .SDcols = c('id', 'doctype',grep('text\\.',names(df), value = T))]
-
  ## Create bogus variables if sentiment is not scored
  if(!"sent_sum" %in% colnames(df)) {
    df <- df[,.(
@ -64,61 +65,50 @@ actor_merger <- function(df, actors_meta, actor_groups = NULL) {
  ## Unnest to sentence level
  df <- df[,lapply(.SD, unlist, recursive=F),
           .SDcols = c('sentence_id', 'sent_sum', 'words', 'sent_words','ids'),
-           by = list(id,publication_date)]
-
-
-
-  text_noactors <- df[lengths(ids) == 0L,
-                      .(noactor.sent = sum(sent_sum)/sum(words),
-                        noactor.sent_sum = sum(sent_sum),
-                        noactor.sent_words = sum(sent_words),
-                        noactor.words = sum(words),
-                        noactor.arousal = sum(sent_words)/sum(words),
-                        noactor.first = first(sentence_id),
-                        noactor.occ = .N
-                        ), by = list(id)]
-
-
+           by = list(id,publication_date,doctype)]
+  df <- df[,.(
+    (.SD),
+    sent = sent_sum/words
+  )][,.(
+    (.SD),
+    sent_binary = case_when(
+      sent > pos_cutoff ~ 1,
+      sent == 0 ~ 0,
+      sent >= neg_cutoff & sent <= pos_cutoff ~ 0,
+      TRUE ~ -1
+    )
+  )][,.(
+    (.SD),
+    sent_binary_weighted = sent_binary*words
+  )]

+  text_sent <- df[,
+                      .(text.sent = sum(sent_sum)/sum(words),
+                        text.sent_binary = sum(sent_binary_weighted)/sum(words),
+                        text.sent_sum = sum(sent_sum),
+                        text.sent_words = sum(sent_words),
+                        text.words = sum(words),
+                        text.arousal = sum(sent_words)/sum(words),
+                        text.sentences = .N,
+                        doctype = first(doctype),
+                        publication_date = first(publication_date)
+                      ), by = list(id)]
+#
+#   test2 <- text_sent %>% mutate(
+#     sent_bin = case_when(
+#       text.sent > pos_cutoff ~ 1,
+#       text.sent == 0 ~ 0,
+#       text.sent >= neg_cutoff & text.sent <= pos_cutoff ~ 0,
+#       TRUE ~ -1
+#     ),
+#     sent_bin_weighted = case_when(
+#       text.sent_binary > 0 ~ 1,
+#       text.sent_binary < 0 ~ -1,
+#       TRUE ~ 0
+#     )
+#   )

-  all <- df[lengths(ids) > 0L,
-            .(actor.sent = sum(sent_sum)/sum(words),
-              actor.sent_sum = sum(sent_sum),
-              actor.sent_words = sum(sent_words),
-              actor.words = sum(words),
-              actor.arousal = sum(sent_words)/sum(words),
-              actor.first = first(sentence_id),
-              actor.occ = .N,
-              publication_date = first(publication_date),
-              ids = 'all'), by = list(id)]
-
-  all_ind <- df[str_detect(ids, 'A_'),
-            .(actor.sent = sum(sent_sum)/sum(words),
-              actor.sent_sum = sum(sent_sum),
-              actor.sent_words = sum(sent_words),
-              actor.words = sum(words),
-              actor.arousal = sum(sent_words)/sum(words),
-              actor.first = first(sentence_id),
-              actor.occ = .N,
-              publication_date = first(publication_date),
-              ids = 'ind'), by = list(id)]
-
-  all_par <- df[str_detect(ids, '_f|_s'),
-            .(actor.sent = sum(sent_sum)/sum(words),
-              actor.sent_sum = sum(sent_sum),
-              actor.sent_words = sum(sent_words),
-              actor.words = sum(words),
-              actor.arousal = sum(sent_words)/sum(words),
-              actor.first = first(sentence_id),
-              actor.occ = .N,
-              publication_date = first(publication_date),
-              ids = 'par'), by = list(id)]
-
-  ## Unnest to actor level
-  df <- df[,.(ids = as.character(unlist(ids))),
-                  by = list(id,publication_date,sentence_id, sent_sum, words, sent_words)]
-
-  ## Create aggregations according to list of actorId vectors in ids
+    ## Create aggregations according to list of actorId vectors in ids
  if(!is.null(actor_groups)) {
    output <- lapply(actor_groups,grouper, df = df) %>%
      rbindlist(.) %>%
@ -132,10 +122,63 @@ actor_merger <- function(df, actors_meta, actor_groups = NULL) {
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(output)
-  } else {
+  } else if(!is.null(actors_meta)) {
+    text_noactors <- df[lengths(ids) == 0L,
+                        .(noactor.sent = sum(sent_sum)/sum(words),
+                          noactor.sent_binary = sum(sent_binary_weighted)/sum(words),
+                          noactor.sent_sum = sum(sent_sum),
+                          noactor.sent_words = sum(sent_words),
+                          noactor.words = sum(words),
+                          noactor.arousal = sum(sent_words)/sum(words),
+                          noactor.first = first(sentence_id),
+                          noactor.occ = .N
+                        ), by = list(id)]
+
+    all <- df[lengths(ids) > 0L,
+              .(actor.sent = sum(sent_sum)/sum(words),
+                actor.sent_binary = sum(sent_binary_weighted)/sum(words),
+                actor.sent_sum = sum(sent_sum),
+                actor.sent_words = sum(sent_words),
+                actor.words = sum(words),
+                actor.arousal = sum(sent_words)/sum(words),
+                actor.first = first(sentence_id),
+                actor.occ = .N,
+                publication_date = first(publication_date),
+                ids = 'all'), by = list(id)]
+
+    all_ind <- df[str_detect(ids, 'A_'),
+                  .(actor.sent = sum(sent_sum)/sum(words),
+                    actor.sent_binary = sum(sent_binary_weighted)/sum(words),
+                    actor.sent_sum = sum(sent_sum),
+                    actor.sent_words = sum(sent_words),
+                    actor.words = sum(words),
+                    actor.arousal = sum(sent_words)/sum(words),
+                    actor.first = first(sentence_id),
+                    actor.occ = .N,
+                    publication_date = first(publication_date),
+                    ids = 'ind'), by = list(id)]
+
+    all_par <- df[str_detect(ids, '_f|_s'),
+                  .(actor.sent = sum(sent_sum)/sum(words),
+                    actor.sent_binary = sum(sent_binary_weighted)/sum(words),
+                    actor.sent_sum = sum(sent_sum),
+                    actor.sent_words = sum(sent_words),
+                    actor.words = sum(words),
+                    actor.arousal = sum(sent_words)/sum(words),
+                    actor.first = first(sentence_id),
+                    actor.occ = .N,
+                    publication_date = first(publication_date),
+                    ids = 'par'), by = list(id)]
+
+    ## Unnest to actor level
+    df <- df[,.(ids = as.character(unlist(ids))),
+             by = list(id,publication_date,sentence_id, sent_sum, words, sent_words,sent_binary_weighted)]
+
+
    ## Create aggregate measures for individual actors
    actors <- df[str_starts(ids, 'A_'),
                 .(actor.sent = sum(sent_sum)/sum(words),
+                   actor.sent_binary = sum(sent_binary_weighted)/sum(words),
                   actor.sent_sum = sum(sent_sum),
                   actor.sent_words = sum(sent_words),
                   actor.words = sum(words),
@ -177,6 +220,7 @@ actor_merger <- function(df, actors_meta, actor_groups = NULL) {
      (.SD)
    ), .SDcols = -c('ids')][,.(
      actor.sent = sum(sent_sum)/sum(words),
+      actor.sent_binary = sum(sent_binary_weighted)/sum(words),
      actor.sent_sum = sum(sent_sum),
      actor.sent_words = sum(sent_words),
      actor.words = sum(words),
@ -193,6 +237,7 @@ actor_merger <- function(df, actors_meta, actor_groups = NULL) {
      (.SD)
    ),.SDcols = -c('ids')][,.(
      actor.sent = sum(sent_sum)/sum(words),
+      actor.sent_binary = sum(sent_binary_weighted)/sum(words),
      actor.sent_sum = sum(sent_sum),
      actor.sent_words = sum(sent_words),
      actor.words = sum(words),
@ -205,7 +250,7 @@ actor_merger <- function(df, actors_meta, actor_groups = NULL) {

    ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
    df <- bind_rows(actors, parties, parties_actors, all, all_ind, all_par) %>%
-      left_join(.,text_sent, by="id") %>%
+      left_join(.,text_sent, by=c("id","publication_date")) %>%
      left_join(.,text_noactors, by="id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
@ -218,6 +263,15 @@ actor_merger <- function(df, actors_meta, actor_groups = NULL) {
      ungroup() %>%
      select(-contains('Search'),-starts_with('not'))
    return(df)
+  } else {
+    df <- text_sent %>%
+      mutate(
+        year = strftime(publication_date, format = '%Y'),
+        yearmonth = strftime(publication_date, format = '%Y%m'),
+        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
+        yearweek = strftime(publication_date, format = "%Y%V")
+      ) %>%
+      ungroup()
  }
 }