### Notes: # Do you want to search for either one OR other actorid, or both occuring in the same document? # Do you want to keep only the occurences of the actorids you are searching for, or all actor occurences in the hits? # Search by actorId, then aggregate by month # When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly # Develop query generator for specific actors (ie combine actorId with start and end dates) #' Generate aggregated actor measures from raw data #' #' Generate aggregated actor measures from raw data #' @param row The row of the actors data frame used for aggregation #' @param actors The data frame containing actor data #' @param es_pwd The password for read access to ES #' @param localhost Boolean indicating if the script is running locally or not #' @param default_operator String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR #' @return No return value, data per actor is saved in an RDS file #' @export #' @examples #' actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR') ################################################################################################# #################################### Aggregate actor results ################################ ################################################################################################# actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') { ### Functions aggregator <- function (id, duplicates) { article <- filter(duplicates, `_id` == id) %>% unnest(sentence_id, .preserve = colnames(.)) occ <- length(unlist(unique(article$sentence_id1))) sentence_count <- round(article$occ[[1]]/article$prom[[1]]) prom <- occ/sentence_count rel_first <- 1-(min(article$sentence_id1)/sentence_count) return(bind_cols(as.list(article[1,1:6]), # Sentence id, start and end position for actor sentences data.frame(occ = I(list(occ)), # Number of sentences in which actor occurs prom = I(list(prom)), # Relative prominence of actor in article (number of occurences/total # sentences) rel_first = I(list(rel_first)), # Relative position of first occurence at sentence level first = I(list(min(article$sentence_id1))) # First sentence in which actor is mentioned ) ) ) } ### Creating aggregate measuers at daily, weekly, monthly and yearly level grouper <- function(level, actor_df, actorids) { by_newspaper <- actor_df %>% group_by_at(vars(level, `_source.doctype`)) %>% summarise( occ = mean(unlist(occ)), prom = mean(unlist(prom)), rel_first = mean(unlist(rel_first)), first = mean(unlist(first)), articles = length(`_id`), level = level ) aggregate <- actor_df %>% group_by_at(vars(level)) %>% summarise( occ = mean(unlist(occ)), prom = mean(unlist(prom)), rel_first = mean(unlist(rel_first)), first = mean(unlist(first)), articles = length(`_id`), `_source.doctype` = 'agg', level = level ) output <- bind_rows(by_newspaper, aggregate) %>% bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),]) return(output) } ########################################################################################### actor <- actors[row,] if (actor$`_source.function` == "Party"){ years = seq(2000,2019,1) } else { years = c(0) } if (actor$`_source.function` == 'Party' && actor$party_only == T) { actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f')) } else if (actor$`_source.function` == 'Party') { actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'), paste0(actor$`_source.partyId`,'_a')) actor$party_only <- F } else { actorids <- actor$`_source.actorId` actor$party_only <- NULL } actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) { if (year > 0) { query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31] && computerCodes.junk:0') } else { query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,'] && computerCodes.junk:0') } out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query), fields = c('computerCodes.actorsDetail', 'doctype', 'publication_date'), default_operator = default_operator), localhost = localhost, es_pwd = es_pwd) if (length(out$`_id`) > 0 ) { actor_df <- out ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids. actor_df <- actor_df %>% unnest() %>% unnest(ids, .preserve = colnames(.)) %>% filter(ids1 %in% actorids) %>% select(-ends_with('start')) %>% select(-ends_with('end')) %>% select(-starts_with('ids')) ### Only if there are more rows than articles, recalculate if (length(unique(actor_df$`_id`)) != length(actor_df$`_id`)) { duplicates <- actor_df[(duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = T)),] actor_single <- actor_df[!(duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = T)),] art_id <- unique(duplicates$`_id`) dupe_merged <- bind_rows(lapply(art_id, aggregator, duplicates = duplicates)) actor_df <- bind_rows(dupe_merged, actor_single) } ### Creating date grouping variables actor_df <- actor_df %>% mutate( year = strftime(`_source.publication_date`, format = '%Y'), yearmonth = strftime(actor_df$`_source.publication_date`, format = '%Y%m'), yearmonthday = strftime(actor_df$`_source.publication_date`, format = '%Y%m%d'), yearweek = strftime(actor_df$`_source.publication_date`, format = "%Y%V") ) levels <- c('year','yearmonth','yearmonthday','yearweek') aggregate_data <- bind_rows(lapply(levels, grouper, actor_df = actor_df, actorids = actorids)) return(aggregate_data) } else { return() } } saveRDS(bind_rows(lapply(years, actor_aggregator, query, actor, actorids, default_operator, localhost, es_pwd)), file = paste0(actor$`_source.country`,'_',paste0(actorids,collapse = ''),'.Rds')) print(paste0('Done with ',row,'/',nrow(actors),' actors')) return() }