diff --git a/R/sentencizer.R b/R/sentencizer.R index 4b4ecf7..76d0c84 100644 --- a/R/sentencizer.R +++ b/R/sentencizer.R @@ -1,6 +1,6 @@ -#' Generate actor data frames (with sentiment) from database +#' Generate sentence-level dataset with sentiment and actor presence #' -#' Generate actor data frames (with sentiment) from database +#' Generate sentence-level dataset with sentiment and actor presence #' @param out Data frame produced by elasticizer #' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable. #' @param validation Boolean indicating whether human validation should be performed on sentiment scoring @@ -9,29 +9,38 @@ #' @examples #' sentencizer(out, sent_dict = NULL, validation = F) ################################################################################################# -#################################### Aggregate actor results ################################ +#################################### Generate sentence-level dataset############################# ################################################################################################# sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) { + ## Despite the function name, parallel processing is not used, because it is slower par_sent <- function(row, out, sent_dict = NULL) { out <- out[row,] + ## Create df with article metadata (fields that are included in the elasticizer function) metadata <- out %>% select(`_id`,contains("_source"),-contains("computerCodes.actors"),-contains("ud")) + + ## Unnest documents into individual words ud_sent <- out %>% select(`_id`,`_source.ud`) %>% unnest(cols = colnames(.)) %>% select(-one_of('exists')) %>% unnest(cols = colnames(.)) %>% filter(upos != 'PUNCT') + ## If there is a 
dictionary, apply it if (!is.null(sent_dict)) { + ## If the dictionary contains the column lem_u, assume lemma_upos format if ("lem_u" %in% colnames(sent_dict)) { ud_sent <- ud_sent %>% mutate(lem_u = str_c(lemma,'_',upos)) %>% left_join(sent_dict, by = 'lem_u') + ## If the dictionary contains the column lemma, assume simple lemma format } else if ("lemma" %in% colnames(sent_dict)) { ud_sent <- ud_sent %>% left_join(sent_dict, by = 'lemma') %>% mutate(lem_u = lemma) } + + ## Group by sentences, and generate dictionary scores per sentence ud_sent <- ud_sent %>% group_by(`_id`,sentence_id) %>% mutate( @@ -48,11 +57,15 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) sent = sent_sum/words, arousal = sent_words/words ) + ## If there is no dictionary, create an "empty" ud_sent, with just sentence ids } else { ud_sent <- ud_sent %>% group_by(`_id`,sentence_id) %>% summarise() } + + ## Remove ud output from source before further processing out <- select(out, -`_source.ud`) + ## If dictionary validation, return just the sentences that have been hand-coded if (validation == T) { codes_sent <- ud_sent %>% left_join(.,out, by='_id') %>% @@ -61,9 +74,9 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) return(codes_sent) } - ### Unnest out_row to individual actor ids - if("_source.computerCodes.actorsDetail" %in% colnames(out)) { + + ## If actor details in source, create vector of actor ids for each sentence out <- out %>% unnest(`_source.computerCodes.actorsDetail`) %>% # mutate(ids_list = ids) %>% @@ -74,16 +87,19 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) ids = list(ids) ) } else { + ## If no actor details, keep one row per article and add a bogus sentence_id out <- out %>% group_by(`_id`) %>% summarise() %>% mutate(sentence_id = 1) } - + ## Combine ud_sent with the source dataset out <- out %>% left_join(ud_sent,.,by = c('_id','sentence_id')) %>% group_by(`_id`) + + 
## If there is a sent_dict, generate sentiment scores on article level if(!is.null(sent_dict)) { text_sent <- out %>% summarise( @@ -102,6 +118,7 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) left_join(.,text_sent,by='_id') %>% left_join(.,metadata,by='_id') } else { + ## If no sent_dict, summarise all and join with metadata (see top) out <- out %>% summarise_all(list) %>% left_join(.,metadata,by='_id')