@ -3,7 +3,7 @@
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param id s Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @param actor_group s Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @return When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list
#' @return When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list
#' @export
#' @export
#' @examples
#' @examples
@ -12,49 +12,91 @@
#################################### Generate actor-article dataset #############################
#################################### Generate actor-article dataset #############################
#################################################################################################
#################################################################################################
actor_merger <- function ( df , actors_meta , ids = NULL ) {
### NOTE: The exceptions for various partyId_a ids has been implemented because of an error with
grouper <- function ( id , df ) {
### some individual actors, where the partyId of an individual actor doesn't match an actual
return ( df %>%
### partyId in the actor dataset
rowwise ( ) %>%
filter ( length ( intersect ( id , ids ) ) > 0 ) %>%
actor_merger <- function ( df , actors_meta , actor_groups = NULL ) {
group_by ( `_id` ) %>%
grouper <- function ( id2 , df ) {
summarise ( actor.sent = sum ( sent_sum ) / sum ( words ) ,
if ( ' P_1206_a' %in% id2 ) {
actor.sent_sum = sum ( sent_sum ) ,
id2 <- c ( ' P_212_a' , ' P_1771_a' , id2 )
actor.sent_words = sum ( sent_words ) ,
}
actor.words = sum ( words ) ,
if ( ' P_1605_a' %in% id2 ) {
actor.arousal = sum ( sent_words ) / sum ( words ) ,
id2 <- c ( ' P_1606_a' , id2 )
actor.first = first ( sentence_id ) ,
}
actor.occ = n ( ) ,
if ( ' P_1629_a' %in% id2 ) {
publication_date = as.Date ( first ( `_source.publication_date` ) ) ,
id2 <- c ( str_c ( ' P_' , as.character ( 1630 : 1647 ) , ' _a' ) , id2 )
doctype = first ( `_source.doctype` ) ) %>%
}
mutate (
return ( df [ids %in% id2 , ] %>%
ids = str_c ( id , collapse = ' -' )
.[ ! duplicated ( .,by = c ( ' id' , ' sentence_id' ) ) , .(
)
actor.sent = sum ( sent_sum ) / sum ( words ) ,
)
actor.sent_sum = sum ( sent_sum ) ,
actor.sent_words = sum ( sent_words ) ,
actor.words = sum ( words ) ,
actor.arousal = sum ( sent_words ) / sum ( words ) ,
actor.first = first ( sentence_id ) ,
actor.occ = .N ,
publication_date = first ( publication_date ) ,
ids = str_c ( id2 , collapse = ' -' )
) , by = c ( ' id' ) ]
)
}
}
## Remove some of the metadata from the source df
## Remove some of the metadata from the source df
text_sent <- df %>%
df <- data.table ( df ) [ , .(
select ( `_id` , starts_with ( " text." ) , - ends_with ( " sent_lemmas" ) )
( .SD ) ,
df <- df %>%
doctype = as.factor ( `_source.doctype` ) ,
ungroup ( ) %>%
publication_date = as.Date ( `_source.publication_date` ) ,
select ( - ends_with ( " sent_lemmas" ) , - starts_with ( " text." ) ) %>%
id = as.factor ( `_id` )
unnest ( cols = colnames ( .) ) ## Unnest to sentence level
) , .SDcols = ! c ( ' _source.doctype' , ' _source.publication_date' , ' _id' ) ]
text_sent <- df [ , .SD , .SDcols = c ( ' id' , ' doctype' , grep ( ' text\\.' , names ( df ) , value = T ) ) ]
## Unnest to sentence level
df <- df [ , lapply ( .SD , unlist , recursive = F ) ,
.SDcols = c ( ' sentence_id' , ' sent_sum' , ' words' , ' sent_words' , ' ids' ) ,
by = list ( id , publication_date ) ]
## Create bogus variables if sentiment is not scored
## Create bogus variables if sentiment is not scored
if ( ! " sent_sum" %in% colnames ( df ) ) {
if ( ! " sent_sum" %in% colnames ( df ) ) {
df <- df %>%
df <- df [ , .(
mutate (
( .SD ) ,
sent_words = 0 ,
sent_words = 0 ,
sent_sum = 0 ,
sent_sum = 0
)
) , .SDcols = - c ( ' sent_words' , ' sent_sum' ) ]
}
}
text_noactors <- df [lengths ( ids ) == 0L ,
.(noactor.sent = sum ( sent_sum ) / sum ( words ) ,
noactor.sent_sum = sum ( sent_sum ) ,
noactor.sent_words = sum ( sent_words ) ,
noactor.words = sum ( words ) ,
noactor.arousal = sum ( sent_words ) / sum ( words ) ,
noactor.first = first ( sentence_id ) ,
noactor.occ = .N ) , by = list ( id ) ]
all <- df [lengths ( ids ) > 0L ,
.(actor.sent = sum ( sent_sum ) / sum ( words ) ,
actor.sent_sum = sum ( sent_sum ) ,
actor.sent_words = sum ( sent_words ) ,
actor.words = sum ( words ) ,
actor.arousal = sum ( sent_words ) / sum ( words ) ,
actor.first = first ( sentence_id ) ,
actor.occ = .N ,
ids = ' all' ) , by = list ( id ) ]
## Unnest to actor level
df <- df [ , .(ids = as.character ( unlist ( ids ) ) ) ,
by = list ( id , publication_date , sentence_id , sent_sum , words , sent_words ) ]
## Create aggregations according to list of actorId vectors in ids
## Create aggregations according to list of actorId vectors in ids
if ( ! is.null ( ids ) ) {
if ( ! is.null ( actor_group s) ) {
output <- lapply ( ids , grouper , df = df ) %>%
output <- lapply ( actor_group s, grouper , df = df ) %>%
bind_rows ( .) %>%
rbindlist ( .) %>%
left_join ( text_sent , by = " _id" ) %>%
left_join ( text_sent , by = " id") %>%
mutate (
mutate (
actor.prom = actor.occ / text.sentences ,
actor.prom = actor.occ / text.sentences ,
actor.rel_first = 1 - ( actor.first / text.sentences ) ,
actor.rel_first = 1 - ( actor.first / text.sentences ) ,
@ -65,127 +107,78 @@ actor_merger <- function(df, actors_meta, ids = NULL) {
)
)
return ( output )
return ( output )
} else {
} else {
text_noactors <- df %>%
rowwise ( ) %>%
filter ( is.null ( unlist ( ids ) ) ) %>%
group_by ( `_id` ) %>%
summarise ( noactor.sent = sum ( sent_sum ) / sum ( words ) ,
noactor.sent_sum = sum ( sent_sum ) ,
noactor.sent_words = sum ( sent_words ) ,
noactor.words = sum ( words ) ,
noactor.arousal = sum ( sent_words ) / sum ( words ) ,
noactor.first = first ( sentence_id ) ,
noactor.occ = n ( ) ,
publication_date = as.Date ( first ( `_source.publication_date` ) ) ,
doctype = first ( `_source.doctype` ) ) %>%
select ( `_id` , starts_with ( ' noactor.' ) )
all <- df %>%
rowwise ( ) %>%
filter ( ! is.null ( unlist ( ids ) ) ) %>%
group_by ( `_id` ) %>%
summarise ( actor.sent = sum ( sent_sum ) / sum ( words ) ,
actor.sent_sum = sum ( sent_sum ) ,
actor.sent_words = sum ( sent_words ) ,
actor.words = sum ( words ) ,
actor.arousal = sum ( sent_words ) / sum ( words ) ,
actor.first = first ( sentence_id ) ,
actor.occ = n ( ) ,
publication_date = as.Date ( first ( `_source.publication_date` ) ) ,
doctype = first ( `_source.doctype` ) ) %>%
mutate (
ids = " all"
)
df <- df %>%
unnest ( cols = ids ) %>% ## Unnest to actor level
mutate (
`_source.publication_date` = as.Date ( `_source.publication_date` )
)
## Create aggregate measures for individual actors
## Create aggregate measures for individual actors
actors <- df %>%
actors <- df [str_starts ( ids , ' A_' ) ,
filter ( str_starts ( ids , " A_" ) ) %>%
.(actor.sent = sum ( sent_sum ) / sum ( words ) ,
group_by ( `_id` , ids ) %>%
actor.sent_sum = sum ( sent_sum ) ,
summarise ( actor.sent = sum ( sent_sum ) / sum ( words ) ,
actor.sent_words = sum ( sent_words ) ,
actor.sent_sum = sum ( sent_sum ) ,
actor.words = sum ( words ) ,
actor.sent_words = sum ( sent_words ) ,
actor.arousal = sum ( sent_words ) / sum ( words ) ,
actor.words = sum ( words ) ,
actor.first = first ( sentence_id ) ,
actor.arousal = sum ( sent_words ) / sum ( words ) ,
actor.occ = .N ,
actor.first = first ( sentence_id ) ,
publication_date = first ( publication_date ) ) , by = list ( id , ids ) ]
actor.occ = n ( ) ,
publication_date = first ( `_source.publication_date` ) ,
doctype = first ( `_source.doctype` )
)
## Create actor metadata dataframe per active date (one row per day per actor)
## Create actor metadata dataframe per active date (one row per day per actor)
colnames ( actors_meta ) <- str_replace ( colnames ( actors_meta ) , ' _source.' , ' ' )
colnames ( actors_meta ) <- str_replace ( colnames ( actors_meta ) , ' _source.' , ' ' )
actors_meta <- actors_meta %>%
actors_meta <- actors_meta [ ,
mutate (
.( ( .SD ) ,
startDate = as.Date ( startDate ) ,
startDate = as.Date ( startDate ) ,
endDate = as.Date ( endDate ) ,
endDate = as.Date ( endDate ) ,
ids = actorId
ids = ifelse ( actorId != ' ' , actorId , partyId )
) %>%
) , .SDcols = - c ( ' _id' , ' startDate' , ' endDate' , ' _index' , ' _type' , ' _score' )
select ( - `_id` )
]
party_meta <- actors_meta %>%
actors <- actors_meta [actors ,
filter ( `function` == ' Party' ) %>%
c ( ' x.startDate' , ' x.endDate' , colnames ( actors ) , ' lastName' , ' firstName' , ' function.' , ' gender' , ' yearOfBirth' , ' parlPeriod' , ' partyId' , ' ministerName' , ' ministryId' , ' actorId' , ' startDate' , ' endDate' ) ,
mutate (
on = .(ids = ids , startDate <= publication_date , endDate >= publication_date ) ,
ids = partyId
allow.cartesian = T ,
)
mult = ' all' ,
actors <- as.data.table ( actors_meta ) [as.data.table ( actors ) ,
with = F ] [ , .(
c ( ' x.startDate' , ' x.endDate' , colnames ( actors ) , ' lastName' , ' firstName' , ' function' , ' gender' , ' yearOfBirth' , ' parlPeriod' , ' partyId' , ' ministerName' , ' ministryId' , ' actorId' , ' startDate' , ' endDate' ) ,
startDate = x.startDate ,
on = .(ids = ids , startDate <= publication_date , endDate >= publication_date ) ,
endDate = x.endDate ,
allow.cartesian = T ,
( .SD )
mult = ' all' ,
) , .SDcols = - c ( ' x.startDate' , ' x.endDate' , ' startDate' , ' endDate' ) ]
with = F ] %>%
mutate ( startDate = x.startDate ,
endDate = x.endDate ) %>%
select ( - starts_with ( ' x.' ) )
## Generate party-actor aggregations (mfsa)
## Generate party-actor aggregations (mfsa)
parties_actors <- df %>%
# identical(as.data.frame(setcolorder(setorderv(parties_actors,c('id','ids')), colnames(parties_actors_dp))),as.data.frame(parties_actors_dp))
filter ( str_starts ( ids , " P_" ) ) %>%
mutate (
parties_actors <- df [str_starts ( ids , ' P_' ) , .(
ids = str_sub ( ids , start = 1 , end = -3 )
ids = str_sub ( ids , start = 1 , end = -3 ) ,
) %>%
( .SD )
group_by ( `_id` , ids ) %>%
) , .SDcols = - c ( ' ids' ) ] [ , .(
summarise ( actor.sent = sum ( sent_sum ) / sum ( words ) ,
ids = case_when ( ids == ' P_212' ~ ' P_1206' ,
actor.sent_sum = sum ( sent_sum ) ,
ids == ' P_1771' ~ ' P_1206' ,
actor.sent_words = sum ( sent_words ) ,
ids == ' P_1606' ~ ' P_1605' ,
actor.words = sum ( words ) ,
ids %in% str_c ( ' P_' , as.character ( 1630 : 1647 ) ) ~ ' P_1629' ,
actor.arousal = sum ( sent_words ) / sum ( words ) ,
TRUE ~ ids ) ,
actor.first = first ( sentence_id ) ,
( .SD )
actor.occ = n ( ) ,
) , .SDcols = - c ( ' ids' ) ] [ , .(
publication_date = first ( `_source.publication_date` ) ,
actor.sent = sum ( sent_sum ) / sum ( words ) ,
doctype = first ( `_source.doctype` ) ) %>%
actor.sent_sum = sum ( sent_sum ) ,
left_join ( ., party_meta , actors_meta , by = c ( ' ids' ) ) %>%
actor.sent_words = sum ( sent_words ) ,
mutate (
actor.words = sum ( words ) ,
ids = str_c ( ids , " _mfsa" )
actor.arousal = sum ( sent_words ) / sum ( words ) ,
)
actor.first = first ( sentence_id ) ,
actor.occ = .N
) , by = c ( ' id' , ' ids' ) ]
parties_actors <- actors_meta [parties_actors , on = c ( ' ids' ) ] [ ! is.na ( id ) , .(ids = str_c ( ids , " _mfsa" ) , ( .SD ) ) , .SDcols = - c ( ' ids' ) ]
## Generate party aggregations (mfs)
## Generate party aggregations (mfs)
parties <- df %>%
parties <- df [str_ends ( ids , ' _f' ) | str_ends ( ids , ' _s' ) , .(
filter ( str_ends ( ids , " _f" ) | str_ends ( ids , " _s" ) ) %>%
ids = str_sub ( ids , start = 1 , end = -3 ) ,
mutate (
( .SD )
ids = str_sub ( ids , start = 1 , end = -3 )
) , .SDcols = - c ( ' ids' ) ] [ , .(
) %>%
actor.sent = sum ( sent_sum ) / sum ( words ) ,
group_by ( `_id` , ids ) %>%
actor.sent_sum = sum ( sent_sum ) ,
summarise ( actor.sent = sum ( sent_sum ) / sum ( words ) ,
actor.sent_words = sum ( sent_words ) ,
actor.sent_sum = sum ( sent_sum ) ,
actor.words = sum ( words ) ,
actor.sent_words = sum ( sent_words ) ,
actor.arousal = sum ( sent_words ) / sum ( words ) ,
actor.words = sum ( words ) ,
actor.first = first ( sentence_id ) ,
actor.arousal = sum ( sent_words ) / sum ( words ) ,
actor.occ = .N
actor.first = first ( sentence_id ) ,
) , by = c ( ' id' , ' ids' ) ]
actor.occ = n ( ) ,
parties <- actors_meta [parties , on = c ( ' ids' ) ] [ ! is.na ( id ) , .(ids = str_c ( ids , " _mfs" ) , ( .SD ) ) , .SDcols = - c ( ' ids' ) ]
publication_date = first ( `_source.publication_date` ) ,
doctype = first ( `_source.doctype` ) ) %>%
left_join ( ., party_meta , actors_meta , by = c ( ' ids' ) ) %>%
mutate (
ids = str_c ( ids , " _mfs" )
)
## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
df <- bind_rows ( actors , parties , parties_actors , all ) %>%
df <- bind_rows ( actors , parties , parties_actors , all ) %>%
left_join ( .,text_sent , by = " _ id") %>%
left_join ( .,text_sent , by = " id" ) %>%
left_join ( .,text_noactors , by = " _ id") %>%
left_join ( .,text_noactors , by = " id" ) %>%
mutate (
mutate (
actor.prom = actor.occ / text.sentences ,
actor.prom = actor.occ / text.sentences ,
actor.rel_first = 1 - ( actor.first / text.sentences ) ,
actor.rel_first = 1 - ( actor.first / text.sentences ) ,
@ -195,7 +188,7 @@ actor_merger <- function(df, actors_meta, ids = NULL) {
yearweek = strftime ( publication_date , format = " %Y%V" )
yearweek = strftime ( publication_date , format = " %Y%V" )
) %>%
) %>%
ungroup ( ) %>%
ungroup ( ) %>%
select ( - contains ( ' Search' ) , - starts_with ( ' not' ) , - `_index` , - `_type` , - `_score` )
select ( - contains ( ' Search' ) , - starts_with ( ' not' ) )
return ( df )
return ( df )
}
}
}
}