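actor_merger: compute arousal as sum(abs(sent_binary_weighted))/sum(words) instead of sum(sent_words)/sum(words), base *.sent on sent_binary_weighted, and remove the uninformative *.sent_binary and *.sent_sum variables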

5 years ago · 955f034e6a
parent 3cdb68b196
commit 955f034e6a
1 changed files with 18 additions and 50 deletions
--- a/R/actor_merger.R
+++ b/R/actor_merger.R
@ -31,12 +31,10 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
    }
    return(df[ids %in% id2,] %>%
             .[!duplicated(.,by = c('id','sentence_id')),.(
-               actor.sent = sum(sent_sum)/sum(words),
-               actor.sent_binary = sum(sent_binary_weighted)/sum(words),
-               actor.sent_sum = sum(sent_sum),
+               actor.sent = sum(sent_binary_weighted)/sum(words),
               actor.sent_words = sum(sent_words),
               actor.words = sum(words),
-               actor.arousal = sum(sent_words)/sum(words),
+               actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
               actor.first = first(sentence_id),
               actor.occ = .N,
               publication_date = first(publication_date),
@ -83,30 +81,14 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
  )]

  text_sent <- df[,
-                      .(text.sent = sum(sent_sum)/sum(words),
-                        text.sent_binary = sum(sent_binary_weighted)/sum(words),
-                        text.sent_sum = sum(sent_sum),
+                      .(text.sent = sum(sent_binary_weighted)/sum(words),
                        text.sent_words = sum(sent_words),
                        text.words = sum(words),
-                        text.arousal = sum(sent_words)/sum(words),
+                        text.arousal = sum(abs(sent_binary_weighted))/sum(words),
                        text.sentences = .N,
                        doctype = first(doctype),
                        publication_date = first(publication_date)
                      ), by = list(id)]
-#
-#   test2 <- text_sent %>% mutate(
-#     sent_bin = case_when(
-#       text.sent > pos_cutoff ~ 1,
-#       text.sent == 0 ~ 0,
-#       text.sent >= neg_cutoff & text.sent <= pos_cutoff ~ 0,
-#       TRUE ~ -1
-#     ),
-#     sent_bin_weighted = case_when(
-#       text.sent_binary > 0 ~ 1,
-#       text.sent_binary < 0 ~ -1,
-#       TRUE ~ 0
-#     )
-#   )

    ## Create aggregations according to list of actorId vectors in ids
  if(!is.null(actor_groups)) {
@ -124,47 +106,39 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
    return(output)
  } else if(!is.null(actors_meta)) {
    text_noactors <- df[lengths(ids) == 0L,
-                        .(noactor.sent = sum(sent_sum)/sum(words),
-                          noactor.sent_binary = sum(sent_binary_weighted)/sum(words),
-                          noactor.sent_sum = sum(sent_sum),
+                        .(noactor.sent = sum(sent_binary_weighted)/sum(words),
                          noactor.sent_words = sum(sent_words),
                          noactor.words = sum(words),
-                          noactor.arousal = sum(sent_words)/sum(words),
+                          noactor.arousal = sum(abs(sent_binary_weighted))/sum(words),
                          noactor.first = first(sentence_id),
                          noactor.occ = .N
                        ), by = list(id)]

    all <- df[lengths(ids) > 0L,
-              .(actor.sent = sum(sent_sum)/sum(words),
-                actor.sent_binary = sum(sent_binary_weighted)/sum(words),
-                actor.sent_sum = sum(sent_sum),
+              .(actor.sent = sum(sent_binary_weighted)/sum(words),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
-                actor.arousal = sum(sent_words)/sum(words),
+                actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
                actor.first = first(sentence_id),
                actor.occ = .N,
                publication_date = first(publication_date),
                ids = 'all'), by = list(id)]

    all_ind <- df[str_detect(ids, 'A_'),
-                  .(actor.sent = sum(sent_sum)/sum(words),
-                    actor.sent_binary = sum(sent_binary_weighted)/sum(words),
-                    actor.sent_sum = sum(sent_sum),
+                  .(actor.sent = sum(sent_binary_weighted)/sum(words),
                    actor.sent_words = sum(sent_words),
                    actor.words = sum(words),
-                    actor.arousal = sum(sent_words)/sum(words),
+                    actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
                    actor.first = first(sentence_id),
                    actor.occ = .N,
                    publication_date = first(publication_date),
                    ids = 'ind'), by = list(id)]

    all_par <- df[str_detect(ids, '_f|_s'),
-                  .(actor.sent = sum(sent_sum)/sum(words),
-                    actor.sent_binary = sum(sent_binary_weighted)/sum(words),
-                    actor.sent_sum = sum(sent_sum),
+                  .(actor.sent = sum(sent_binary_weighted)/sum(words),
                    actor.sent_words = sum(sent_words),
                    actor.words = sum(words),
-                    actor.arousal = sum(sent_words)/sum(words),
+                    actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
                    actor.first = first(sentence_id),
                    actor.occ = .N,
                    publication_date = first(publication_date),
@ -177,12 +151,10 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff

    ## Create aggregate measures for individual actors
    actors <- df[str_starts(ids, 'A_'),
-                 .(actor.sent = sum(sent_sum)/sum(words),
-                   actor.sent_binary = sum(sent_binary_weighted)/sum(words),
-                   actor.sent_sum = sum(sent_sum),
+                 .(actor.sent = sum(sent_binary_weighted)/sum(words),
                   actor.sent_words = sum(sent_words),
                   actor.words = sum(words),
-                   actor.arousal = sum(sent_words)/sum(words),
+                   actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
                   actor.first = first(sentence_id),
                   actor.occ = .N,
                   publication_date = first(publication_date)), by = list(id, ids)]
@ -219,12 +191,10 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
                      TRUE ~ ids),
      (.SD)
    ), .SDcols = -c('ids')][,.(
-      actor.sent = sum(sent_sum)/sum(words),
-      actor.sent_binary = sum(sent_binary_weighted)/sum(words),
-      actor.sent_sum = sum(sent_sum),
+      actor.sent = sum(sent_binary_weighted)/sum(words),
      actor.sent_words = sum(sent_words),
      actor.words = sum(words),
-      actor.arousal = sum(sent_words)/sum(words),
+      actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
      actor.first = first(sentence_id),
      actor.occ = .N,
      publication_date = first(publication_date)
@ -236,12 +206,10 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
      ids = str_sub(ids, start = 1, end = -3),
      (.SD)
    ),.SDcols = -c('ids')][,.(
-      actor.sent = sum(sent_sum)/sum(words),
-      actor.sent_binary = sum(sent_binary_weighted)/sum(words),
-      actor.sent_sum = sum(sent_sum),
+      actor.sent = sum(sent_binary_weighted)/sum(words),
      actor.sent_words = sum(sent_words),
      actor.words = sum(words),
-      actor.arousal = sum(sent_words)/sum(words),
+      actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
      actor.first = first(sentence_id),
      actor.occ = .N,
      publication_date = first(publication_date)