actor_merger: total rewrite based on data.table for performance reasons. Added some exceptions due to non-existing partyIds that some individual actors have in the actor database

master
Your Name 4 years ago
parent 2c8a88f9a0
commit b7f1afddd1

@ -3,7 +3,7 @@
#' Aggregate sentence-level dataset containing actors (from sentencizer()) #' Aggregate sentence-level dataset containing actors (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer #' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors") #' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param ids Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties) #' @param actor_groups Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @return When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list #' @return When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list
#' @export #' @export
#' @examples #' @examples
@ -12,180 +12,173 @@
#################################### Generate actor-article dataset ############################# #################################### Generate actor-article dataset #############################
################################################################################################# #################################################################################################
actor_merger <- function(df, actors_meta, ids = NULL) { ### NOTE: The exceptions for various partyId_a ids has been implemented because of an error with
grouper <- function(id, df) { ### some individual actors, where the partyId of an individual actor doesn't match an actual
return(df %>% ### partyId in the actor dataset
rowwise() %>%
filter(length(intersect(id,ids)) > 0) %>% actor_merger <- function(df, actors_meta, actor_groups = NULL) {
group_by(`_id`) %>% grouper <- function(id2, df) {
summarise(actor.sent = sum(sent_sum)/sum(words), if ('P_1206_a' %in% id2) {
id2 <- c('P_212_a','P_1771_a',id2)
}
if ('P_1605_a' %in% id2) {
id2 <- c('P_1606_a', id2)
}
if ('P_1629_a' %in% id2) {
id2 <- c(str_c('P_',as.character(1630:1647),'_a'), id2)
}
return(df[ids %in% id2,] %>%
.[!duplicated(.,by = c('id','sentence_id')),.(
actor.sent = sum(sent_sum)/sum(words),
actor.sent_sum = sum(sent_sum), actor.sent_sum = sum(sent_sum),
actor.sent_words = sum(sent_words), actor.sent_words = sum(sent_words),
actor.words = sum(words), actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words), actor.arousal = sum(sent_words)/sum(words),
actor.first = first(sentence_id), actor.first = first(sentence_id),
actor.occ = n(), actor.occ = .N,
publication_date = as.Date(first(`_source.publication_date`)), publication_date = first(publication_date),
doctype = first(`_source.doctype`)) %>% ids = str_c(id2, collapse = '-')
mutate( ), by = c('id')]
ids = str_c(id, collapse = '-')
)
) )
} }
## Remove some of the metadata from the source df ## Remove some of the metadata from the source df
text_sent <- df %>% df <- data.table(df)[,.(
select(`_id`,starts_with("text."),-ends_with("sent_lemmas")) (.SD),
df <- df %>% doctype = as.factor(`_source.doctype`),
ungroup() %>% publication_date = as.Date(`_source.publication_date`),
select(-ends_with("sent_lemmas"),-starts_with("text.")) %>% id = as.factor(`_id`)
unnest(cols = colnames(.)) ## Unnest to sentence level ), .SDcols = !c('_source.doctype','_source.publication_date','_id')]
text_sent <- df[,.SD, .SDcols = c('id', 'doctype',grep('text\\.',names(df), value = T))]
## Unnest to sentence level
df <- df[,lapply(.SD, unlist, recursive=F),
.SDcols = c('sentence_id', 'sent_sum', 'words', 'sent_words','ids'),
by = list(id,publication_date)]
## Create bogus variables if sentiment is not scored ## Create bogus variables if sentiment is not scored
if(!"sent_sum" %in% colnames(df)) { if(!"sent_sum" %in% colnames(df)) {
df <- df %>% df <- df[,.(
mutate( (.SD),
sent_words = 0, sent_words = 0,
sent_sum = 0, sent_sum = 0
) ),.SDcols = -c('sent_words','sent_sum')]
} }
## Create aggregations according to list of actorId vectors in ids text_noactors <- df[lengths(ids) == 0L,
if(!is.null(ids)) { .(noactor.sent = sum(sent_sum)/sum(words),
output <- lapply(ids,grouper, df = df) %>%
bind_rows(.) %>%
left_join(text_sent, by="_id") %>%
mutate(
actor.prom = actor.occ/text.sentences,
actor.rel_first = 1-(actor.first/text.sentences),
year = strftime(publication_date, format = '%Y'),
yearmonth = strftime(publication_date, format = '%Y%m'),
yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V")
)
return(output)
} else {
text_noactors <- df %>%
rowwise() %>%
filter(is.null(unlist(ids))) %>%
group_by(`_id`) %>%
summarise(noactor.sent = sum(sent_sum)/sum(words),
noactor.sent_sum = sum(sent_sum), noactor.sent_sum = sum(sent_sum),
noactor.sent_words = sum(sent_words), noactor.sent_words = sum(sent_words),
noactor.words = sum(words), noactor.words = sum(words),
noactor.arousal = sum(sent_words)/sum(words), noactor.arousal = sum(sent_words)/sum(words),
noactor.first = first(sentence_id), noactor.first = first(sentence_id),
noactor.occ = n(), noactor.occ = .N), by = list(id)]
publication_date = as.Date(first(`_source.publication_date`)),
doctype = first(`_source.doctype`)) %>%
select(`_id`,starts_with('noactor.'))
all <- df %>% all <- df[lengths(ids) > 0L,
rowwise() %>% .(actor.sent = sum(sent_sum)/sum(words),
filter(!is.null(unlist(ids))) %>%
group_by(`_id`) %>%
summarise(actor.sent = sum(sent_sum)/sum(words),
actor.sent_sum = sum(sent_sum), actor.sent_sum = sum(sent_sum),
actor.sent_words = sum(sent_words), actor.sent_words = sum(sent_words),
actor.words = sum(words), actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words), actor.arousal = sum(sent_words)/sum(words),
actor.first = first(sentence_id), actor.first = first(sentence_id),
actor.occ = n(), actor.occ = .N,
publication_date = as.Date(first(`_source.publication_date`)), ids = 'all'), by = list(id)]
doctype = first(`_source.doctype`)) %>%
mutate( ## Unnest to actor level
ids = "all" df <- df[,.(ids = as.character(unlist(ids))),
) by = list(id,publication_date,sentence_id, sent_sum, words, sent_words)]
df <- df %>% ## Create aggregations according to list of actorId vectors in ids
unnest(cols = ids) %>% ## Unnest to actor level if(!is.null(actor_groups)) {
output <- lapply(actor_groups,grouper, df = df) %>%
rbindlist(.) %>%
left_join(text_sent, by="id") %>%
mutate( mutate(
`_source.publication_date` = as.Date(`_source.publication_date`) actor.prom = actor.occ/text.sentences,
actor.rel_first = 1-(actor.first/text.sentences),
year = strftime(publication_date, format = '%Y'),
yearmonth = strftime(publication_date, format = '%Y%m'),
yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V")
) )
return(output)
} else {
## Create aggregate measures for individual actors ## Create aggregate measures for individual actors
actors <- df %>% actors <- df[str_starts(ids, 'A_'),
filter(str_starts(ids,"A_")) %>% .(actor.sent = sum(sent_sum)/sum(words),
group_by(`_id`,ids) %>%
summarise(actor.sent = sum(sent_sum)/sum(words),
actor.sent_sum = sum(sent_sum), actor.sent_sum = sum(sent_sum),
actor.sent_words = sum(sent_words), actor.sent_words = sum(sent_words),
actor.words = sum(words), actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words), actor.arousal = sum(sent_words)/sum(words),
actor.first = first(sentence_id), actor.first = first(sentence_id),
actor.occ = n(), actor.occ = .N,
publication_date = first(`_source.publication_date`), publication_date = first(publication_date)), by = list(id, ids)]
doctype = first(`_source.doctype`)
)
## Create actor metadata dataframe per active date (one row per day per actor) ## Create actor metadata dataframe per active date (one row per day per actor)
colnames(actors_meta) <- str_replace(colnames(actors_meta),'_source.','') colnames(actors_meta) <- str_replace(colnames(actors_meta),'_source.','')
actors_meta <- actors_meta %>% actors_meta <- actors_meta[,
mutate( .((.SD),
startDate = as.Date(startDate), startDate = as.Date(startDate),
endDate = as.Date(endDate), endDate = as.Date(endDate),
ids = actorId ids = ifelse(actorId != '', actorId, partyId)
) %>% ), .SDcols = -c('_id','startDate','endDate','_index','_type','_score')
select(-`_id`) ]
party_meta <- actors_meta %>% actors <- actors_meta[actors,
filter(`function` == 'Party') %>% c('x.startDate','x.endDate',colnames(actors), 'lastName','firstName','function.','gender','yearOfBirth','parlPeriod','partyId','ministerName','ministryId','actorId','startDate','endDate'),
mutate(
ids = partyId
)
actors <- as.data.table(actors_meta)[as.data.table(actors),
c('x.startDate','x.endDate',colnames(actors), 'lastName','firstName','function','gender','yearOfBirth','parlPeriod','partyId','ministerName','ministryId','actorId','startDate','endDate'),
on =.(ids = ids, startDate <= publication_date, endDate >= publication_date), on =.(ids = ids, startDate <= publication_date, endDate >= publication_date),
allow.cartesian = T, allow.cartesian = T,
mult = 'all', mult = 'all',
with = F] %>% with = F][,.(
mutate(startDate = x.startDate, startDate = x.startDate,
endDate = x.endDate) %>% endDate = x.endDate,
select(-starts_with('x.')) (.SD)
), .SDcols = -c('x.startDate', 'x.endDate','startDate','endDate')]
## Generate party-actor aggregations (mfsa) ## Generate party-actor aggregations (mfsa)
parties_actors <- df %>% # identical(as.data.frame(setcolorder(setorderv(parties_actors,c('id','ids')), colnames(parties_actors_dp))),as.data.frame(parties_actors_dp))
filter(str_starts(ids,"P_")) %>%
mutate( parties_actors <- df[str_starts(ids,'P_'),.(
ids = str_sub(ids, start = 1, end = -3) ids = str_sub(ids, start = 1, end = -3),
) %>% (.SD)
group_by(`_id`,ids) %>% ),.SDcols = -c('ids')][, .(
summarise(actor.sent = sum(sent_sum)/sum(words), ids = case_when(ids == 'P_212' ~ 'P_1206',
ids == 'P_1771' ~ 'P_1206',
ids == 'P_1606' ~ 'P_1605',
ids %in% str_c('P_',as.character(1630:1647)) ~ 'P_1629',
TRUE ~ ids),
(.SD)
), .SDcols = -c('ids')][,.(
actor.sent = sum(sent_sum)/sum(words),
actor.sent_sum = sum(sent_sum), actor.sent_sum = sum(sent_sum),
actor.sent_words = sum(sent_words), actor.sent_words = sum(sent_words),
actor.words = sum(words), actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words), actor.arousal = sum(sent_words)/sum(words),
actor.first = first(sentence_id), actor.first = first(sentence_id),
actor.occ = n(), actor.occ = .N
publication_date = first(`_source.publication_date`), ), by = c('id','ids')]
doctype = first(`_source.doctype`)) %>% parties_actors <- actors_meta[parties_actors, on = c('ids')][!is.na(id),.(ids = str_c(ids,"_mfsa"), (.SD)), .SDcols = -c('ids')]
left_join(., party_meta, actors_meta, by=c('ids')) %>%
mutate(
ids = str_c(ids,"_mfsa")
)
## Generate party aggregations (mfs) ## Generate party aggregations (mfs)
parties <- df %>% parties <- df[str_ends(ids,'_f') | str_ends(ids,'_s'),.(
filter(str_ends(ids,"_f") | str_ends(ids,"_s")) %>% ids = str_sub(ids, start = 1, end = -3),
mutate( (.SD)
ids = str_sub(ids, start = 1, end = -3) ),.SDcols = -c('ids')][,.(
) %>% actor.sent = sum(sent_sum)/sum(words),
group_by(`_id`,ids) %>%
summarise(actor.sent = sum(sent_sum)/sum(words),
actor.sent_sum = sum(sent_sum), actor.sent_sum = sum(sent_sum),
actor.sent_words = sum(sent_words), actor.sent_words = sum(sent_words),
actor.words = sum(words), actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words), actor.arousal = sum(sent_words)/sum(words),
actor.first = first(sentence_id), actor.first = first(sentence_id),
actor.occ = n(), actor.occ = .N
publication_date = first(`_source.publication_date`), ), by = c('id','ids')]
doctype = first(`_source.doctype`)) %>% parties <- actors_meta[parties, on = c('ids')][!is.na(id),.(ids = str_c(ids,"_mfs"), (.SD)), .SDcols = -c('ids')]
left_join(., party_meta, actors_meta, by=c('ids')) %>%
mutate(
ids = str_c(ids,"_mfs")
)
## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
df <- bind_rows(actors, parties, parties_actors, all) %>% df <- bind_rows(actors, parties, parties_actors, all) %>%
left_join(.,text_sent, by="_id") %>% left_join(.,text_sent, by="id") %>%
left_join(.,text_noactors, by="_id") %>% left_join(.,text_noactors, by="id") %>%
mutate( mutate(
actor.prom = actor.occ/text.sentences, actor.prom = actor.occ/text.sentences,
actor.rel_first = 1-(actor.first/text.sentences), actor.rel_first = 1-(actor.first/text.sentences),
@ -195,7 +188,7 @@ actor_merger <- function(df, actors_meta, ids = NULL) {
yearweek = strftime(publication_date, format = "%Y%V") yearweek = strftime(publication_date, format = "%Y%V")
) %>% ) %>%
ungroup() %>% ungroup() %>%
select(-contains('Search'),-starts_with('not'), -`_index`, -`_type`, -`_score`) select(-contains('Search'),-starts_with('not'))
return(df) return(df)
} }
} }

@ -4,14 +4,14 @@
\alias{actor_merger} \alias{actor_merger}
\title{Aggregate sentence-level dataset containing actors (from sentencizer())} \title{Aggregate sentence-level dataset containing actors (from sentencizer())}
\usage{ \usage{
actor_merger(df, actors_meta, ids = NULL) actor_merger(df, actors_meta, actor_groups = NULL)
} }
\arguments{ \arguments{
\item{df}{Data frame with actor ids, produced by sentencizer} \item{df}{Data frame with actor ids, produced by sentencizer}
\item{actors_meta}{Data frame containing actor metadata obtained using elasticizer(index="actors")} \item{actors_meta}{Data frame containing actor metadata obtained using elasticizer(index="actors")}
\item{ids}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)} \item{actor_groups}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}
} }
\value{ \value{
When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list

Loading…
Cancel
Save