diff --git a/DESCRIPTION b/DESCRIPTION
index b1679d8..a4f0d7d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -20,4 +20,4 @@ Depends: R (>= 3.3.1),
 License: Copyright Erik de Vries
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
diff --git a/NAMESPACE b/NAMESPACE
index c7b163c..196f93f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -20,4 +20,5 @@ export(out_parser)
 export(preproc)
 export(query_gen_actors)
 export(query_string)
+export(sentencizer)
 export(ud_update)
diff --git a/R/sentencizer.R b/R/sentencizer.R
new file mode 100644
index 0000000..4960d89
--- /dev/null
+++ b/R/sentencizer.R
@@ -0,0 +1,94 @@
+#' Generate actor data frames (with sentiment) from database
+#'
+#' Generate actor data frames (with sentiment) from database
+#' @param out Data frame produced by elasticizer
+#' @param sent_dict Optional data frame containing the sentiment dictionary. Words should be in the "lem_u" column when they are lemma_upos pairs, or in the "lemma" column when they are plain lemmas. The "prox" column should contain the sentiment value for each word, or 0 if not applicable.
+#' @return No return value, data per batch is saved in an RDS file
+#' @export
+#' @examples
+#' sentencizer(out, sent_dict = NULL)
+#################################################################################################
+#################################### Aggregate actor results ################################
+#################################################################################################
+sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) {
+  par_sent <- function(row, out, sent_dict = NULL) {
+    out <- out[row,]
+    metadata <- out %>%
+      select(`_id`,`_source.publication_date`, `_source.doctype`)
+    ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
+      unnest(cols = colnames(.)) %>%
+      select(-one_of('exists')) %>%
+      unnest(cols = colnames(.)) %>%
+      filter(upos != 'PUNCT')
+
+    if (is.null(sent_dict) == F) {
+      if ("lem_u" %in% colnames(sent_dict)) {
+        ud_sent <- ud_sent %>%
+          mutate(lem_u = str_c(lemma,'_',upos)) %>%
+          left_join(sent_dict, by = 'lem_u')
+      } else if ("lemma" %in% colnames(sent_dict)) {
+        ud_sent <- ud_sent %>%
+          left_join(sent_dict, by = 'lemma') %>%
+          mutate(lem_u = lemma)
+      }
+      ud_sent <- ud_sent %>%
+        group_by(`_id`,sentence_id) %>%
+        mutate(
+          prox = case_when(
+            is.na(prox) == T ~ 0,
+            TRUE ~ prox
+          )
+        ) %>%
+        summarise(sent_sum = sum(prox),
+                  words = length(lemma),
+                  sent_words = sum(prox != 0),
+                  sent_lemmas = list(lem_u[prox != 0])) %>%
+        mutate(
+          sent = sent_sum/words,
+          arousal = sent_words/words
+        )
+    } else {
+      ud_sent <- ud_sent %>% group_by(`_id`,sentence_id) %>% summarise() # keep `_id` so the join on c('_id','sentence_id') below works
+    }
+    out <- select(out, -`_source.ud`)
+
+    ### Unnest out to individual actor ids per sentence
+    out <- out %>%
+      unnest(`_source.computerCodes.actorsDetail`) %>%
+      mutate(ids_list = ids) %>%
+      unnest(ids) %>%
+      unnest(sentence_id) %>%
+      group_by(`_id`,sentence_id) %>%
+      summarise(
+        ids = list(ids)
+      ) %>%
+      left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
+      group_by(`_id`)
+
+    text_sent <- out %>%
+      summarise(
+        text.sent_sum = sum(sent_sum),
+        text.words = sum(words),
+        text.sent_words = sum(sent_words),
+        text.sent_lemmas = I(list(unlist(sent_lemmas))),
+        text.sentences = n()
+      ) %>%
+      mutate(
+        text.sent = text.sent_sum/text.words,
+        text.arousal = text.sent_words/text.words
+      )
+
+    out <- out %>%
+      summarise_all(list) %>%
+      left_join(.,text_sent,by='_id') %>%
+      left_join(.,metadata,by='_id')
+    return(out)
+  }
+  saveRDS(par_sent(1:nrow(out),out = out, sent_dict=sent_dict), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
+  return()
+  ### Keeping the option for parallel computation
+  # microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out),out = out, sent_dict=sent_dict), times = 1)
+  # plan(multiprocess, workers = cores)
+  # chunks <- split(1:nrow(out), sort(1:nrow(out)%%cores))
+  # microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks,par_sent, out=out, sent_dict=sent_dict)), times = 1)
+}
diff --git a/man/sentencizer.Rd b/man/sentencizer.Rd
new file mode 100644
index 0000000..51b9128
--- /dev/null
+++ b/man/sentencizer.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sentencizer.R
+\name{sentencizer}
+\alias{sentencizer}
+\title{Generate actor data frames (with sentiment) from database}
+\usage{
+sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
+}
+\arguments{
+\item{out}{Data frame produced by elasticizer}
+
+\item{sent_dict}{Optional data frame containing the sentiment dictionary. Words should be in the "lem_u" column when they are lemma_upos pairs, or in the "lemma" column when they are plain lemmas. The "prox" column should contain the sentiment value for each word, or 0 if not applicable.}
+}
+\value{
+No return value, data per batch is saved in an RDS file
+}
+\description{
+Generate actor data frames (with sentiment) from database
+}
+\examples{
+sentencizer(out, sent_dict = NULL)
+}
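A minimal usage sketch, assuming the package and its tidyverse dependencies are loaded, that `out` is a batch data frame returned by elasticizer (containing the `_source.ud` and `_source.computerCodes.actorsDetail` columns the function unnests), and using a made-up two-word lemma_upos dictionary; the dictionary columns follow the sent_dict documentation above, and the result is written to a timestamped df_out*.Rds file in the working directory rather than returned:

# Hypothetical two-entry dictionary for illustration: "lem_u" holds lemma_upos pairs
# (matching str_c(lemma, '_', upos) inside sentencizer) and "prox" the word values
sent_dict <- data.frame(
  lem_u = c("goed_ADJ", "slecht_ADJ"),
  prox  = c(1, -1),
  stringsAsFactors = FALSE
)

# out <- elasticizer(...)                  # batch of parsed documents, as in @param out
sentencizer(out, sent_dict = sent_dict)    # writes df_out<timestamp>.Rds, returns NULL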