sentencizer: commented code

master
Your Name 4 years ago
parent ec8afc4990
commit 4e867214dd

@ -1,6 +1,6 @@
#' Generate actor data frames (with sentiment) from database #' Generate sentence-level dataset with sentiment and actor presence
#' #'
#' Generate actor data frames (with sentiment) from database #' Generate sentence-level dataset with sentiment and actor presence
#' @param out Data frame produced by elasticizer #' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable. #' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring #' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
@ -9,29 +9,38 @@
#' @examples #' @examples
#' sentencizer(out, sent_dict = NULL, validation = F) #' sentencizer(out, sent_dict = NULL, validation = F)
################################################################################################# #################################################################################################
#################################### Aggregate actor results ################################ #################################### Generate sentence-level dataset #############################
################################################################################################# #################################################################################################
sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) { sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) {
## Despite the function name, parallel processing is not used, because it is slower
par_sent <- function(row, out, sent_dict = NULL) { par_sent <- function(row, out, sent_dict = NULL) {
out <- out[row,] out <- out[row,]
## Create df with article metadata (fields that are included in the elasticizer function)
metadata <- out %>% metadata <- out %>%
select(`_id`,contains("_source"),-contains("computerCodes.actors"),-contains("ud")) select(`_id`,contains("_source"),-contains("computerCodes.actors"),-contains("ud"))
## Unnest documents into individual words
ud_sent <- out %>% select(`_id`,`_source.ud`) %>% ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
unnest(cols = colnames(.)) %>% unnest(cols = colnames(.)) %>%
select(-one_of('exists')) %>% select(-one_of('exists')) %>%
unnest(cols = colnames(.)) %>% unnest(cols = colnames(.)) %>%
filter(upos != 'PUNCT') filter(upos != 'PUNCT')
## If there is a dictionary, apply it
if (!is.null(sent_dict)) { if (!is.null(sent_dict)) {
## If the dictionary contains the column lem_u, assume lemma_upos format
if ("lem_u" %in% colnames(sent_dict)) { if ("lem_u" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>% ud_sent <- ud_sent %>%
mutate(lem_u = str_c(lemma,'_',upos)) %>% mutate(lem_u = str_c(lemma,'_',upos)) %>%
left_join(sent_dict, by = 'lem_u') left_join(sent_dict, by = 'lem_u')
## If the dictionary contains the column lemma, assume simple lemma format
} else if ("lemma" %in% colnames(sent_dict)) { } else if ("lemma" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>% ud_sent <- ud_sent %>%
left_join(sent_dict, by = 'lemma') %>% left_join(sent_dict, by = 'lemma') %>%
mutate(lem_u = lemma) mutate(lem_u = lemma)
} }
## Group by sentences, and generate dictionary scores per sentence
ud_sent <- ud_sent %>% ud_sent <- ud_sent %>%
group_by(`_id`,sentence_id) %>% group_by(`_id`,sentence_id) %>%
mutate( mutate(
@ -48,11 +57,15 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
sent = sent_sum/words, sent = sent_sum/words,
arousal = sent_words/words arousal = sent_words/words
) )
## If there is no dictionary, create an "empty" ud_sent, with just sentence ids
} else { } else {
ud_sent <- ud_sent %>% group_by(`_id`,sentence_id) %>% summarise() ud_sent <- ud_sent %>% group_by(`_id`,sentence_id) %>% summarise()
} }
## Remove ud output from source before further processing
out <- select(out, -`_source.ud`) out <- select(out, -`_source.ud`)
## If dictionary validation, return just the sentences that have been hand-coded
if (validation == T) { if (validation == T) {
codes_sent <- ud_sent %>% codes_sent <- ud_sent %>%
left_join(.,out, by='_id') %>% left_join(.,out, by='_id') %>%
@ -61,9 +74,9 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
return(codes_sent) return(codes_sent)
} }
### Unnest out_row to individual actor ids
if("_source.computerCodes.actorsDetail" %in% colnames(out)) { if("_source.computerCodes.actorsDetail" %in% colnames(out)) {
## If actor details in source, create vector of actor ids for each sentence
out <- out %>% out <- out %>%
unnest(`_source.computerCodes.actorsDetail`) %>% unnest(`_source.computerCodes.actorsDetail`) %>%
# mutate(ids_list = ids) %>% # mutate(ids_list = ids) %>%
@ -74,16 +87,19 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
ids = list(ids) ids = list(ids)
) )
} else { } else {
## If no actor details, keep one row per article and add a bogus sentence_id
out <- out %>% out <- out %>%
group_by(`_id`) %>% group_by(`_id`) %>%
summarise() %>% summarise() %>%
mutate(sentence_id = 1) mutate(sentence_id = 1)
} }
## Combine ud_sent with the source dataset
out <- out %>% out <- out %>%
left_join(ud_sent,.,by = c('_id','sentence_id')) %>% left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
group_by(`_id`) group_by(`_id`)
## If there is a sent_dict, generate sentiment scores on article level
if(!is.null(sent_dict)) { if(!is.null(sent_dict)) {
text_sent <- out %>% text_sent <- out %>%
summarise( summarise(
@ -102,6 +118,7 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
left_join(.,text_sent,by='_id') %>% left_join(.,text_sent,by='_id') %>%
left_join(.,metadata,by='_id') left_join(.,metadata,by='_id')
} else { } else {
## If no sent_dict, summarise all and join with metadata (see top)
out <- out %>% out <- out %>%
summarise_all(list) %>% summarise_all(list) %>%
left_join(.,metadata,by='_id') left_join(.,metadata,by='_id')

Loading…
Cancel
Save