sentencizer: added new function for sentiment coding and actor collection

5 years ago · 98325bde8f
parent 7f958bbc11
commit 98325bde8f
4 changed files with 118 additions and 1 deletions
--- a/2
+++ b/2
@ -20,4 +20,4 @@ Depends: R (>= 3.3.1),
 License: Copyright Erik de Vries
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
--- a/1
+++ b/1
@ -20,4 +20,5 @@ export(out_parser)
 export(preproc)
 export(query_gen_actors)
 export(query_string)
+export(sentencizer)
 export(ud_update)
--- a/R/sentencizer.R
+++ b/R/sentencizer.R
@ -0,0 +1,94 @@
+#' Generate actor data frames (with sentiment) from database
+#'
+#' Aggregates the parsed tokens of every document to sentence level,
+#' optionally scores each sentence with a sentiment dictionary, attaches the
+#' ids of the actors found in each sentence, and saves the resulting data frame
+#' (one row per document) as an RDS file in the current working directory.
+#' @param out Data frame produced by elasticizer
+#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable. When \code{NULL}, no sentiment columns are generated.
+#' @param localhost Currently unused.
+#' @param validation Currently unused.
+#' @return No return value (invisible \code{NULL}), data per batch is saved in an RDS file
+#' @export
+#' @examples
+#' \dontrun{
+#' sentencizer(out, sent_dict = NULL)
+#' }
+#################################################################################################
+#################################### Aggregate actor results ################################
+#################################################################################################
+sentencizer <- function(out,
+                        sent_dict = NULL,
+                        localhost = NULL,
+                        validation = FALSE) {
+  # Worker that processes the selected rows of `out`; kept as a separate
+  # function so it can be mapped over chunks of rows (see the commented-out
+  # parallel code at the end).
+  par_sent <- function(row, out, sent_dict = NULL) {
+    has_dict <- !is.null(sent_dict)
+    out <- out[row, ]
+
+    # Document-level metadata, re-attached to the result at the very end
+    metadata <- out %>%
+      select(`_id`, `_source.publication_date`, `_source.doctype`)
+
+    # One row per token, punctuation removed
+    ud_sent <- out %>%
+      select(`_id`, `_source.ud`) %>%
+      unnest(cols = colnames(.)) %>%
+      select(-one_of('exists')) %>%
+      unnest(cols = colnames(.)) %>%
+      filter(upos != 'PUNCT')
+
+    if (has_dict) {
+      # Attach dictionary values, matching on lemma_upos pairs when the
+      # dictionary provides them and on bare lemmas otherwise
+      if ("lem_u" %in% colnames(sent_dict)) {
+        ud_sent <- ud_sent %>%
+          mutate(lem_u = str_c(lemma, '_', upos)) %>%
+          left_join(sent_dict, by = 'lem_u')
+      } else if ("lemma" %in% colnames(sent_dict)) {
+        ud_sent <- ud_sent %>%
+          left_join(sent_dict, by = 'lemma') %>%
+          mutate(lem_u = lemma)
+      } else {
+        stop(
+          "`sent_dict` must contain either a \"lem_u\" or a \"lemma\" column",
+          call. = FALSE
+        )
+      }
+      # Sentence-level sentiment: tokens without a dictionary entry count as 0
+      ud_sent <- ud_sent %>%
+        group_by(`_id`, sentence_id) %>%
+        mutate(
+          prox = case_when(
+            is.na(prox) ~ 0,
+            TRUE ~ prox
+          )
+        ) %>%
+        summarise(sent_sum = sum(prox),
+                  words = length(lemma),
+                  sent_words = sum(prox != 0),
+                  sent_lemmas = list(lem_u[prox != 0])) %>%
+        mutate(
+          sent = sent_sum / words,
+          arousal = sent_words / words
+        )
+    } else {
+      # No dictionary: keep one row per sentence. `_id` has to be retained
+      # because the actor join below matches on both `_id` and `sentence_id`.
+      ud_sent <- ud_sent %>%
+        group_by(`_id`, sentence_id) %>%
+        summarise()
+    }
+    out <- select(out, -`_source.ud`)
+
+    ### Unnest out_row to individual actor ids
+    out <- out %>%
+      unnest(`_source.computerCodes.actorsDetail`) %>%
+      mutate(ids_list = ids) %>%
+      unnest(ids) %>%
+      unnest(sentence_id) %>%
+      group_by(`_id`, sentence_id) %>%
+      summarise(
+        ids = list(ids)
+      ) %>%
+      left_join(ud_sent, ., by = c('_id', 'sentence_id')) %>%
+      group_by(`_id`)
+
+    # Collapse to one row per document; sentence-level columns become lists
+    doc_level <- out %>%
+      summarise_all(list)
+
+    # Text-level sentiment can only be computed when the sentence-level
+    # sentiment columns exist, i.e. when a dictionary was supplied
+    if (has_dict) {
+      text_sent <- out %>%
+        summarise(
+          text.sent_sum = sum(sent_sum),
+          text.words = sum(words),
+          text.sent_words = sum(sent_words),
+          text.sent_lemmas = I(list(unlist(sent_lemmas))),
+          text.sentences = n()
+        ) %>%
+        mutate(
+          text.sent = text.sent_sum / text.words,
+          text.arousal = text.sent_words / text.words
+        )
+      doc_level <- left_join(doc_level, text_sent, by = '_id')
+    }
+
+    left_join(doc_level, metadata, by = '_id')
+  }
+
+  # seq_len() instead of 1:nrow(out), which would yield c(1, 0) for empty input
+  result <- par_sent(seq_len(nrow(out)), out = out, sent_dict = sent_dict)
+  # File name carries the current epoch time so that batches do not overwrite
+  # each other
+  saveRDS(
+    result,
+    file = paste0('df_out', as.numeric(as.POSIXct(Sys.time())), '.Rds')
+  )
+  invisible(NULL)
+  ### Keeping the option for parallel computation
+  # microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out),out = out, sent_dict=sent_dict), times = 1)
+  # plan(multiprocess, workers = cores)
+  # chunks <- split(1:nrow(out), sort(1:nrow(out)%%cores))
+  # microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks,par_sent, out=out, sent_dict=sent_dict)), times = 1)
+}
--- a/man/sentencizer.Rd
+++ b/man/sentencizer.Rd
@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sentencizer.R
+\name{sentencizer}
+\alias{sentencizer}
+\title{Generate actor data frames (with sentiment) from database}
+\usage{
+sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
+}
+\arguments{
+\item{out}{Data frame produced by elasticizer}
+
+\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.}
+
+\item{localhost}{Currently unused.}
+
+\item{validation}{Currently unused.}
+}
+\value{
+No return value, data per batch is saved in an RDS file
+}
+\description{
+Generate actor data frames (with sentiment) from database
+}
+\examples{
+sentencizer(out, sent_dict = NULL)
+}