From 9ccfd2952e8d28138d8f5765935b2a7f8abb532c Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 25 May 2020 15:48:46 +0200
Subject: [PATCH] sentencizer: minor updates

---
 R/sentencizer.R | 86 +++++++++++++++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 31 deletions(-)

diff --git a/R/sentencizer.R b/R/sentencizer.R
index 4960d89..ea7be7d 100644
--- a/R/sentencizer.R
+++ b/R/sentencizer.R
@@ -3,10 +3,11 @@
 #' Generate actor data frames (with sentiment) from database
 #' @param out Data frame produced by elasticizer
 #' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.
+#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
 #' @return No return value, data per batch is saved in an RDS file
 #' @export
 #' @examples
-#' sentencizer(out, sent_dict = NULL)
+#' sentencizer(out, sent_dict = NULL, validation = F)
 #################################################################################################
 #################################### Aggregate actor results ################################
 #################################################################################################
@@ -14,14 +15,14 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
   par_sent <- function(row, out, sent_dict = NULL) {
     out <- out[row,]
     metadata <- out %>%
-      select(`_id`,`_source.publication_date`, `_source.doctype`)
+      select(`_id`,contains("_source"),-contains("computerCodes.actors"),-contains("ud"))
     ud_sent <- out %>%
       select(`_id`,`_source.ud`) %>%
       unnest(cols = colnames(.)) %>%
       select(-one_of('exists')) %>%
       unnest(cols = colnames(.)) %>%
       filter(upos != 'PUNCT')
-    if (is.null(sent_dict) == F) {
+    if (!is.null(sent_dict)) {
       if ("lem_u" %in% colnames(sent_dict)) {
         ud_sent <- ud_sent %>%
           mutate(lem_u = str_c(lemma,'_',upos)) %>%
@@ -48,40 +49,63 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
           arousal = sent_words/words
         )
     } else {
-      ud_sent <- ud_sent %>% group_by(sentence_id) %>% summarise()
+      ud_sent <- ud_sent %>% group_by(`_id`,sentence_id) %>% summarise()
     }
 
     out <- select(out, -`_source.ud`)
 
+    if (validation == T) {
+      codes_sent <- ud_sent %>%
+        left_join(.,out, by='_id') %>%
+        rowwise() %>%
+        filter(sentence_id == `_source.codes.sentence.id`)
+      return(codes_sent)
+    }
+
     ### Unnest out_row to individual actor ids
-    out <- out %>%
-      unnest(`_source.computerCodes.actorsDetail`) %>%
-      mutate(ids_list = ids) %>%
-      unnest(ids) %>%
-      unnest(sentence_id) %>%
-      group_by(`_id`,sentence_id) %>%
-      summarise(
-        ids = list(ids)
-      ) %>%
-      left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
-      group_by(`_id`)
-    text_sent <- out %>%
-      summarise(
-        text.sent_sum = sum(sent_sum),
-        text.words = sum(words),
-        text.sent_words = sum(sent_words),
-        text.sent_lemmas = I(list(unlist(sent_lemmas))),
-        text.sentences = n()
-      ) %>%
-      mutate(
-        text.sent = text.sent_sum/text.words,
-        text.arousal = text.sent_words/text.words
-      )
+    if("_source.computerCodes.actorsDetail2" %in% colnames(out)) {
+      out <- out %>%
+        unnest(`_source.computerCodes.actorsDetail`) %>%
+        # mutate(ids_list = ids) %>%
+        unnest(ids) %>%
+        unnest(sentence_id) %>%
+        group_by(`_id`,sentence_id) %>%
+        summarise(
+          ids = list(ids)
+        )
+    } else {
+      out <- out %>%
+        group_by(`_id`) %>%
+        summarise() %>%
+        mutate(sentence_id = 1)
+    }
+
-    out <- out %>%
-      summarise_all(list) %>%
-      left_join(.,text_sent,by='_id') %>%
-      left_join(.,metadata,by='_id')
+    out <- out %>%
+      left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
+      group_by(`_id`)
+    if(!is.null(sent_dict)) {
+      text_sent <- out %>%
+        summarise(
+          text.sent_sum = sum(sent_sum),
+          text.words = sum(words),
+          text.sent_words = sum(sent_words),
+          text.sent_lemmas = I(list(unlist(sent_lemmas))),
+          text.sentences = n()
+        ) %>%
+        mutate(
+          text.sent = text.sent_sum/text.words,
+          text.arousal = text.sent_words/text.words
+        )
+      out <- out %>%
+        summarise_all(list) %>%
+        left_join(.,text_sent,by='_id') %>%
+        left_join(.,metadata,by='_id')
+    } else {
+      out <- out %>%
+        summarise_all(list) %>%
+        left_join(.,metadata,by='_id')
+    }
 
     return(out)
   }
   saveRDS(par_sent(1:nrow(out),out = out, sent_dict=sent_dict), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
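
For reference, a minimal usage sketch of the patched call signature (illustrative only, not part of the patch). It assumes the package exporting sentencizer() is attached and that `out` was produced by elasticizer(); the sent_dict entries below are invented purely to show the documented "lem_u"/"prox" layout.

  # Illustrative sketch -- dictionary entries are made-up examples
  library(tibble)
  sent_dict <- tibble(
    lem_u = c("good_ADJ", "bad_ADJ", "increase_VERB"),  # lemma_upos pairs
    prox  = c(1, -1, 0)                                 # word values, 0 = not applicable
  )
  # Writes df_out<timestamp>.Rds to the working directory; nothing is returned
  sentencizer(out, sent_dict = sent_dict, validation = F)

With validation = T, the patched par_sent() instead returns only the sentences whose sentence_id matches `_source.codes.sentence.id`, so the saved RDS contains sentence-level scores that can be checked against human coding.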