@ -3,10 +3,11 @@
#' Generate actor data frames (with sentiment) from database
#' Generate actor data frames (with sentiment) from database
#' @param out Data frame produced by elasticizer
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @export
#' @examples
#' @examples
#' sentencizer(out, sent_dict = NULL )
#' sentencizer(out, sent_dict = NULL , validation = F )
#################################################################################################
#################################################################################################
#################################### Aggregate actor results ################################
#################################### Aggregate actor results ################################
#################################################################################################
#################################################################################################
@ -14,14 +15,14 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
par_sent <- function ( row , out , sent_dict = NULL ) {
par_sent <- function ( row , out , sent_dict = NULL ) {
out <- out [row , ]
out <- out [row , ]
metadata <- out %>%
metadata <- out %>%
select ( `_id` , `_source.publication_date` , `_source.doctype` )
select ( `_id` , contains ( " _source" ) , - contains ( " computerCodes.actors" ) , - contains ( " ud" ) )
ud_sent <- out %>% select ( `_id` , `_source.ud` ) %>%
ud_sent <- out %>% select ( `_id` , `_source.ud` ) %>%
unnest ( cols = colnames ( .) ) %>%
unnest ( cols = colnames ( .) ) %>%
select ( - one_of ( ' exists' ) ) %>%
select ( - one_of ( ' exists' ) ) %>%
unnest ( cols = colnames ( .) ) %>%
unnest ( cols = colnames ( .) ) %>%
filter ( upos != ' PUNCT' )
filter ( upos != ' PUNCT' )
if ( is.null ( sent_dict ) == F ) {
if ( ! is.null ( sent_dict ) ) {
if ( " lem_u" %in% colnames ( sent_dict ) ) {
if ( " lem_u" %in% colnames ( sent_dict ) ) {
ud_sent <- ud_sent %>%
ud_sent <- ud_sent %>%
mutate ( lem_u = str_c ( lemma , ' _' , upos ) ) %>%
mutate ( lem_u = str_c ( lemma , ' _' , upos ) ) %>%
@ -48,40 +49,63 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
arousal = sent_words / words
arousal = sent_words / words
)
)
} else {
} else {
ud_sent <- ud_sent %>% group_by ( sentence_id) %>% summarise ( )
ud_sent <- ud_sent %>% group_by ( `_id`, sentence_id) %>% summarise ( )
}
}
out <- select ( out , - `_source.ud` )
out <- select ( out , - `_source.ud` )
if ( validation == T ) {
codes_sent <- ud_sent %>%
left_join ( .,out , by = ' _id' ) %>%
rowwise ( ) %>%
filter ( sentence_id == `_source.codes.sentence.id` )
return ( codes_sent )
}
### Unnest out_row to individual actor ids
### Unnest out_row to individual actor ids
out <- out %>%
unnest ( `_source.computerCodes.actorsDetail` ) %>%
mutate ( ids_list = ids ) %>%
unnest ( ids ) %>%
unnest ( sentence_id ) %>%
group_by ( `_id` , sentence_id ) %>%
summarise (
ids = list ( ids )
) %>%
left_join ( ud_sent , .,by = c ( ' _id' , ' sentence_id' ) ) %>%
group_by ( `_id` )
text_sent <- out %>%
if ( " _source.computerCodes.actorsDetail2" %in% colnames ( out ) ) {
summarise (
out <- out %>%
text.sent_sum = sum ( sent_sum ) ,
unnest ( `_source.computerCodes.actorsDetail` ) %>%
text.words = sum ( words ) ,
# mutate(ids_list = ids) %>%
text.sent_words = sum ( sent_words ) ,
unnest ( ids ) %>%
text.sent_lemmas = I ( list ( unlist ( sent_lemmas ) ) ) ,
unnest ( sentence_id ) %>%
text.sentences = n ( )
group_by ( `_id` , sentence_id ) %>%
) %>%
summarise (
mutate (
ids = list ( ids )
text.sent = text.sent_sum / text.words ,
)
text.arousal = text.sent_words / text.words
} else {
)
out <- out %>%
group_by ( `_id` ) %>%
summarise ( ) %>%
mutate ( sentence_id = 1 )
}
out <- out %>%
out <- out %>%
summarise_all ( list ) %>%
left_join ( ud_sent , .,by = c ( ' _id' , ' sentence_id' ) ) %>%
left_join ( .,text_sent , by = ' _id' ) %>%
group_by ( `_id` )
left_join ( .,metadata , by = ' _id' )
if ( ! is.null ( sent_dict ) ) {
text_sent <- out %>%
summarise (
text.sent_sum = sum ( sent_sum ) ,
text.words = sum ( words ) ,
text.sent_words = sum ( sent_words ) ,
text.sent_lemmas = I ( list ( unlist ( sent_lemmas ) ) ) ,
text.sentences = n ( )
) %>%
mutate (
text.sent = text.sent_sum / text.words ,
text.arousal = text.sent_words / text.words
)
out <- out %>%
summarise_all ( list ) %>%
left_join ( .,text_sent , by = ' _id' ) %>%
left_join ( .,metadata , by = ' _id' )
} else {
out <- out %>%
summarise_all ( list ) %>%
left_join ( .,metadata , by = ' _id' )
}
return ( out )
return ( out )
}
}
saveRDS ( par_sent ( 1 : nrow ( out ) , out = out , sent_dict = sent_dict ) , file = paste0 ( ' df_out' , as.numeric ( as.POSIXct ( Sys.time ( ) ) ) , ' .Rds' ) )
saveRDS ( par_sent ( 1 : nrow ( out ) , out = out , sent_dict = sent_dict ) , file = paste0 ( ' df_out' , as.numeric ( as.POSIXct ( Sys.time ( ) ) ) , ' .Rds' ) )