diff --git a/R/actor_aggregation.R b/R/actor_aggregation.R index 3a036bb..9051299 100644 --- a/R/actor_aggregation.R +++ b/R/actor_aggregation.R @@ -22,29 +22,82 @@ ################################################################################################# #################################### Aggregate actor results ################################ ################################################################################################# -actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') { +actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR', sent_dict = NULL, cores = detectCores()) { ### Functions aggregator <- function (id, duplicates) { - article <- filter(duplicates, `_id` == id) %>% - unnest(sentence_id, .preserve = colnames(.)) + df <- duplicates %>% + filter(`_id` == id) %>% + group_by(`_id`) %>% + summarise( + `_source.doctype` = first(`_source.doctype`), + `_source.publication_date` = first(`_source.publication_date`), + actor_end = list(sort(unique(unlist(actor_end)))), + prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])), + sentence_id = list(sort(unique(unlist(sentence_id)))), + rel_first = list(max(unlist(rel_first))), + sentence_end = list(sort(unique(unlist(sentence_end)))), + actor_start = list(sort(unique(unlist(actor_start)))), + ids = list(sort(unique(unlist(ids)))), + sentence_start = list(sort(unique(unlist(sentence_start)))), + occ = list(length(unique(unlist(sentence_id)))), + first = list(min(unlist(sentence_id))) + ) + return(df) + } + ### Calculate sentiment scores for each actor-document + par_sent <- function(row, out, sent_dict) { + out_row <- out[row,] + ### Contains sentiment per sentence for whole article + sentiment_ud <- out_row$`_source.ud`[[1]] %>% + select(-one_of('exists')) %>% + unnest() %>% + filter(upos != 'PUNCT') %>% # For getting proper word counts + mutate(V1 = str_c(lemma,'_',upos)) %>% + left_join(sent_dict, by = 'V1') %>% + ### Setting binary sentiment as unit of analysis + mutate(V2 = V3) %>% + group_by(sentence_id) %>% + mutate( + V2 = case_when( + is.na(V2) == T ~ 0, + TRUE ~ V2 + ) + ) %>% + summarise(sent_sum = sum(V2), + words = length(lemma), + sent_words = length(na.omit(V3))) %>% + mutate( + sent = sent_sum/words, + arousal = sent_words/words + ) + out_row <- select(out_row, -`_source.ud`) + ### Contains sentiment per sentence for actor + actor_tone <- filter(sentiment_ud, sentence_id %in% unlist(out_row$sentence_id)) - occ <- length(unlist(unique(article$sentence_id1))) - sentence_count <- round(article$occ[[1]]/article$prom[[1]]) - prom <- occ/sentence_count - rel_first <- 1-(min(article$sentence_id1)/sentence_count) - return(bind_cols(as.list(article[1,1:6]), # Sentence id, start and end position for actor sentences - data.frame(occ = I(list(occ)), # Number of sentences in which actor occurs - prom = I(list(prom)), # Relative prominence of actor in article (number of occurences/total # sentences) - rel_first = I(list(rel_first)), # Relative position of first occurence at sentence level - first = I(list(min(article$sentence_id1))) # First sentence in which actor is mentioned - ) + ### Aggregated sentiment per actor (over all sentences containing actor) + actor <- summarise(actor_tone, + sent = sum(sent_sum)/sum(words), + sent_sum = sum(sent_sum), + sent_words = sum(sent_words), + words = sum(words), + arousal = sum(sent_words)/sum(words) ) + + ### Aggregated sentiment per article (over all sentences in article) + text <- summarise(sentiment_ud, + sent = sum(sent_sum)/sum(words), + sent_sum = sum(sent_sum), + sent_words = sum(sent_words), + words = sum(words), + arousal = sum(sent_words)/sum(words) ) + return(cbind(out_row,data.frame(actor = actor, text = text))) } + ### Creating aggregate measuers at daily, weekly, monthly and yearly level - grouper <- function(level, actor_df, actorids) { - by_newspaper <- actor_df %>% + grouper <- function(level, out, actorids, sent = F) { + by_newspaper <- out %>% mutate( sentence_count = round(unlist(occ)/unlist(prom)) ) %>% @@ -57,9 +110,9 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = sentence_count = sum(sentence_count), articles = length(`_id`), level = level - ) - - aggregate <- actor_df %>% + ) %>% + ungroup() + aggregate <- out %>% mutate( sentence_count = round(unlist(occ)/unlist(prom)) ) %>% @@ -73,12 +126,64 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = articles = length(`_id`), `_source.doctype` = 'agg', level = level - ) + ) %>% + ungroup() + + if(sent == T) { + by_newspaper_sent <- out %>% + group_by_at(vars(level, `_source.doctype`)) %>% + summarise( + actor.sent = mean(actor.sent), + actor.sent_sum = sum(actor.sent_sum), + actor.sent_words = sum(actor.sent_words), + actor.words = sum(actor.words), + actor.arousal = mean(actor.arousal), + text.sent = mean(text.sent), + text.sent_sum = sum(text.sent_sum), + text.sent_words = sum(text.sent_words), + text.words = sum(text.words), + text.arousal = mean(text.arousal) + ) %>% + ungroup() %>% + select(-level,-`_source.doctype`) + aggregate_sent <- out %>% + group_by_at(vars(level)) %>% + summarise( + actor.sent = mean(actor.sent), + actor.sent_sum = sum(actor.sent_sum), + actor.sent_words = sum(actor.sent_words), + actor.words = sum(actor.words), + actor.arousal = mean(actor.arousal), + text.sent = mean(text.sent), + text.sent_sum = sum(text.sent_sum), + text.sent_words = sum(text.sent_words), + text.words = sum(text.words), + text.arousal = mean(text.arousal) + ) %>% + ungroup() %>% + select(-level) + aggregate <- bind_cols(aggregate,aggregate_sent) + by_newspaper <- bind_cols(by_newspaper, by_newspaper_sent) + } output <- bind_rows(by_newspaper, aggregate) %>% - bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),]) + bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),]) %>% + select( + -`_index`, + -`_type`, + -`_score`, + -`_id`, + -contains('Search'), + -contains('not') + ) + colnames(output) <- gsub("_source.",'', colnames(output)) return(output) } -########################################################################################### + ########################################################################################### + if (is.null(sent_dict) == F) { + fields <- c('ud','computerCodes.actorsDetail', 'doctype', 'publication_date') + } else { + fields <- c('computerCodes.actorsDetail', 'doctype', 'publication_date') + } actor <- actors[row,] if (actor$`_source.function` == "Party"){ years = seq(2000,2019,1) @@ -101,43 +206,66 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) { if (year > 0) { - query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31] && computerCodes.junk:0') + query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31]') } else { - query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,'] && computerCodes.junk:0') + query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,']') + } + + ### Temporary exception for UK missing junk coding + if (actor$`_source.country` != 'uk') { + query <- paste0(query,' && computerCodes.junk:0') } + out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query), - fields = c('computerCodes.actorsDetail', 'doctype', 'publication_date'), default_operator = default_operator), + fields = fields, default_operator = default_operator), localhost = localhost, es_pwd = es_pwd) if (length(out$`_id`) > 0 ) { - actor_df <- out + if (is.null(sent_dict) == F) { + out_ud <- out %>% select(`_id`,`_source.ud`) + out <- out %>% select(-`_source.ud`) + } ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids. - actor_df <- actor_df %>% - unnest() %>% + out <- out %>% + unnest(`_source.computerCodes.actorsDetail`, .preserve = colnames(.)) %>% unnest(ids, .preserve = colnames(.)) %>% - filter(ids1 %in% actorids) %>% - select(-ends_with('start')) %>% - select(-ends_with('end')) %>% - select(-starts_with('ids')) + filter(ids1 %in% actorids) # %>% + # select(-ends_with('start')) %>% + # select(-ends_with('end')) %>% + # select(-starts_with('ids')) ### Only if there are more rows than articles, recalculate - if (length(unique(actor_df$`_id`)) != length(actor_df$`_id`)) { - duplicates <- actor_df[(duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = T)),] - actor_single <- actor_df[!(duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = T)),] + if (length(unique(out$`_id`)) != length(out$`_id`)) { + duplicates <- out[(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),] + actor_single <- out[!(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),] art_id <- unique(duplicates$`_id`) - dupe_merged <- bind_rows(lapply(art_id, aggregator, duplicates = duplicates)) - actor_df <- bind_rows(dupe_merged, actor_single) + dupe_merged <- bind_rows(mclapply(art_id, aggregator, duplicates = duplicates, mc.cores = cores)) + out <- bind_rows(dupe_merged, actor_single) } + + if (is.null(sent_dict) == F) { + out <- left_join(out, out_ud, by = '_id') + out <- bind_rows(mclapply(seq(1,nrow(out),1),par_sent, out = out, sent_dict = sent_dict, mc.cores = cores)) + } + ### Creating date grouping variables - actor_df <- actor_df %>% + out <- out %>% mutate( year = strftime(`_source.publication_date`, format = '%Y'), - yearmonth = strftime(actor_df$`_source.publication_date`, format = '%Y%m'), - yearmonthday = strftime(actor_df$`_source.publication_date`, format = '%Y%m%d'), - yearweek = strftime(actor_df$`_source.publication_date`, format = "%Y%V") + yearmonth = strftime(out$`_source.publication_date`, format = '%Y%m'), + yearmonthday = strftime(out$`_source.publication_date`, format = '%Y%m%d'), + yearweek = strftime(out$`_source.publication_date`, format = "%Y%V") + ) %>% + select( + -`_score`, + -`_index`, + -`_type`, + -`_score`, + -`_source.computerCodes.actorsDetail`, + -ids1 ) levels <- c('year','yearmonth','yearmonthday','yearweek') - aggregate_data <- bind_rows(lapply(levels, grouper, actor_df = actor_df, actorids = actorids)) + aggregate_data <- bind_rows(lapply(levels, grouper, out = out, actorids = actorids, sent = !is.null(sent_dict))) return(aggregate_data) } else { return() @@ -147,4 +275,3 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = print(paste0('Done with ',row,'/',nrow(actors),' actors')) return() } -