#' Aggregate sentence-level dataset containing actors (from sentencizer())
#'
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer()
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param ids Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @return When ids is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids is supplied, returns aggregations for each vector in the list
#' @export
#' @examples
#' actor_merger(df, actors_meta, ids = NULL)
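#'
#' # Illustrative sketch only: the ids below ("A_123", "A_456", "P_10_f", "P_10_s")
#' # are hypothetical; pass a list of actor-id vectors to aggregate custom groups
#' # (e.g. merging several parties into one bloc).
#' \dontrun{
#' actor_merger(df, actors_meta,
#'              ids = list(c("A_123", "A_456"), c("P_10_f", "P_10_s")))
#' }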
#################################################################################################
#################################### Generate actor-article dataset ############################
#################################################################################################

actor_merger <- function(df, actors_meta, ids = NULL) {
  grouper <- function(id, df) {
    return(df %>%
             rowwise() %>%
             filter(length(intersect(id, ids)) > 0) %>%
             group_by(`_id`) %>%
             summarise(actor.sent = sum(sent_sum)/sum(words),
                       actor.sent_sum = sum(sent_sum),
                       actor.sent_words = sum(sent_words),
                       actor.words = sum(words),
                       actor.arousal = sum(sent_words)/sum(words),
                       actor.first = first(sentence_id),
                       actor.occ = n(),
                       publication_date = as.Date(first(`_source.publication_date`)),
                       doctype = first(`_source.doctype`)) %>%
             mutate(
               ids = str_c(id, collapse = '-')
             )
    )
  }

  ## Remove some of the metadata from the source df
  text_sent <- df %>%
    select(`_id`, starts_with("text."), -ends_with("sent_lemmas"))
  df <- df %>%
    ungroup() %>%
    select(-ends_with("sent_lemmas"), -starts_with("text.")) %>%
    unnest(cols = colnames(.)) ## Unnest to sentence level

  ## Create bogus variables if sentiment is not scored
  if (!"sent_sum" %in% colnames(df)) {
    df <- df %>%
      mutate(
        sent_words = 0,
        sent_sum = 0
      )
  }

  ## Create aggregations according to list of actorId vectors in ids
  if (!is.null(ids)) {
    output <- lapply(ids, grouper, df = df) %>%
      bind_rows(.) %>%
      left_join(text_sent, by = "_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1 - (actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(output)
  } else {
    all <- df %>%
      rowwise() %>%
      filter(!is.null(unlist(ids))) %>%
      group_by(`_id`) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = as.Date(first(`_source.publication_date`)),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = "all"
      )
    df <- df %>%
      unnest(cols = ids) %>% ## Unnest to actor level
      mutate(
        `_source.publication_date` = as.Date(`_source.publication_date`)
      )

    ## Create aggregate measures for individual actors
    actors <- df %>%
      filter(str_starts(ids, "A_")) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`))

    ## Create actor metadata dataframe per active date (one row per day per actor)
    colnames(actors_meta) <- str_replace(colnames(actors_meta), '_source.', '')
    actors_meta_bydate <- actors_meta %>%
      mutate(
        startDate = as.Date(startDate),
        endDate = as.Date(endDate)
      ) %>%
      select(lastName, firstName, `function`, gender, yearOfBirth, parlPeriod,
             partyId, ministerName, ministryId, actorId, startDate, endDate) %>%
      rowwise() %>%
      mutate(
        publication_date = list(seq(from = startDate, to = endDate, by = "day")),
        ids = actorId
      ) %>%
      unnest(cols = publication_date)

    ## Join the actor metadata with the article data by actor id and date
    actors <- actors %>%
      left_join(., actors_meta_bydate, by = c("ids", "publication_date"))
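    ## Note on id formats (inferred from the filters below; verify against the data):
    ## individual actors appear as "A_..." ids, while party mentions appear as
    ## "P_..." ids carrying a two-character suffix (e.g. "_f", "_s") that is
    ## stripped before aggregating; "_mfsa" covers all "P_" ids, "_mfs" only
    ## those ending in "_f" or "_s".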
    ## Generate party-actor aggregations (mfsa)
    parties_actors <- df %>%
      filter(str_starts(ids, "P_")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids, "_mfsa")
      )

    ## Generate party aggregations (mfs)
    parties <- df %>%
      filter(str_ends(ids, "_f") | str_ends(ids, "_s")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids, "_mfs")
      )

    ## Join all aggregations into a single data frame, compute derived
    ## actor-level measures, and add date dummies
    df <- bind_rows(actors, parties, parties_actors, all) %>%
      left_join(text_sent, by = "_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1 - (actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      ) %>%
      ungroup()
    return(df)
  }
}
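
## Illustrative downstream use (sketch only, not part of the function above):
## assuming "df" holds sentencizer() output and "actors_meta" comes from
## elasticizer(index = "actors"), the returned actor-article data frame can be
## aggregated further, e.g. mean sentiment per aggregation id per month:
# out <- actor_merger(df, actors_meta)
# out %>%
#   group_by(ids, yearmonth) %>%
#   summarise(mean_sent = mean(actor.sent, na.rm = TRUE))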