sentencizer: minor updates

master
Your Name 5 years ago
parent 98325bde8f
commit 9ccfd2952e

@ -3,10 +3,11 @@
#' Generate actor data frames (with sentiment) from database #' Generate actor data frames (with sentiment) from database
#' @param out Data frame produced by elasticizer #' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable. #' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file #' @return No return value, data per batch is saved in an RDS file
#' @export #' @export
#' @examples #' @examples
#' sentencizer(out, sent_dict = NULL) #' sentencizer(out, sent_dict = NULL, validation = F)
################################################################################################# #################################################################################################
#################################### Aggregate actor results ################################ #################################### Aggregate actor results ################################
################################################################################################# #################################################################################################
@ -14,14 +15,14 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
par_sent <- function(row, out, sent_dict = NULL) { par_sent <- function(row, out, sent_dict = NULL) {
out <- out[row,] out <- out[row,]
metadata <- out %>% metadata <- out %>%
select(`_id`,`_source.publication_date`, `_source.doctype`) select(`_id`,contains("_source"),-contains("computerCodes.actors"),-contains("ud"))
ud_sent <- out %>% select(`_id`,`_source.ud`) %>% ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
unnest(cols = colnames(.)) %>% unnest(cols = colnames(.)) %>%
select(-one_of('exists')) %>% select(-one_of('exists')) %>%
unnest(cols = colnames(.)) %>% unnest(cols = colnames(.)) %>%
filter(upos != 'PUNCT') filter(upos != 'PUNCT')
if (is.null(sent_dict) == F) { if (!is.null(sent_dict)) {
if ("lem_u" %in% colnames(sent_dict)) { if ("lem_u" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>% ud_sent <- ud_sent %>%
mutate(lem_u = str_c(lemma,'_',upos)) %>% mutate(lem_u = str_c(lemma,'_',upos)) %>%
@ -48,40 +49,63 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
arousal = sent_words/words arousal = sent_words/words
) )
} else { } else {
ud_sent <- ud_sent %>% group_by(sentence_id) %>% summarise() ud_sent <- ud_sent %>% group_by(`_id`,sentence_id) %>% summarise()
} }
out <- select(out, -`_source.ud`) out <- select(out, -`_source.ud`)
if (validation == T) {
codes_sent <- ud_sent %>%
left_join(.,out, by='_id') %>%
rowwise() %>%
filter(sentence_id == `_source.codes.sentence.id`)
return(codes_sent)
}
### Unnest out_row to individual actor ids ### Unnest out_row to individual actor ids
out <- out %>%
unnest(`_source.computerCodes.actorsDetail`) %>%
mutate(ids_list = ids) %>%
unnest(ids) %>%
unnest(sentence_id) %>%
group_by(`_id`,sentence_id) %>%
summarise(
ids = list(ids)
) %>%
left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
group_by(`_id`)
text_sent <- out %>% if("_source.computerCodes.actorsDetail2" %in% colnames(out)) {
summarise( out <- out %>%
text.sent_sum = sum(sent_sum), unnest(`_source.computerCodes.actorsDetail`) %>%
text.words = sum(words), # mutate(ids_list = ids) %>%
text.sent_words = sum(sent_words), unnest(ids) %>%
text.sent_lemmas = I(list(unlist(sent_lemmas))), unnest(sentence_id) %>%
text.sentences = n() group_by(`_id`,sentence_id) %>%
) %>% summarise(
mutate( ids = list(ids)
text.sent = text.sent_sum/text.words, )
text.arousal = text.sent_words/text.words } else {
) out <- out %>%
group_by(`_id`) %>%
summarise() %>%
mutate(sentence_id = 1)
}
out <- out %>% out <- out %>%
summarise_all(list) %>% left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
left_join(.,text_sent,by='_id') %>% group_by(`_id`)
left_join(.,metadata,by='_id') if(!is.null(sent_dict)) {
text_sent <- out %>%
summarise(
text.sent_sum = sum(sent_sum),
text.words = sum(words),
text.sent_words = sum(sent_words),
text.sent_lemmas = I(list(unlist(sent_lemmas))),
text.sentences = n()
) %>%
mutate(
text.sent = text.sent_sum/text.words,
text.arousal = text.sent_words/text.words
)
out <- out %>%
summarise_all(list) %>%
left_join(.,text_sent,by='_id') %>%
left_join(.,metadata,by='_id')
} else {
out <- out %>%
summarise_all(list) %>%
left_join(.,metadata,by='_id')
}
return(out) return(out)
} }
saveRDS(par_sent(1:nrow(out),out = out, sent_dict=sent_dict), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds')) saveRDS(par_sent(1:nrow(out),out = out, sent_dict=sent_dict), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))

Loading…
Cancel
Save