From e3b26c0be375ff8c825e60cd75fa639bccafe1aa Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Sat, 11 May 2019 17:49:53 +0200 Subject: [PATCH] actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level query_string: Added default_operator parameter, to define whether whitespaces should be interpreted as AND or OR, defaults to AND --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/actor_aggregation.R | 137 +++++++++++++++++++++++++++++++++++++++ man/actor_aggregation.Rd | 29 +++++++++ man/query_string.Rd | 2 +- 5 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 R/actor_aggregation.R create mode 100644 man/actor_aggregation.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 662a849..78f6e0f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,4 +18,4 @@ Depends: R (>= 3.3.1), License: Copyright Erik de Vries Encoding: UTF-8 LazyData: true -RoxygenNote: 6.1.0 +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index bf6270c..3362e08 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(actor_aggregation) export(actorizer) export(bulk_writer) export(class_update) diff --git a/R/actor_aggregation.R b/R/actor_aggregation.R new file mode 100644 index 0000000..a9e20d5 --- /dev/null +++ b/R/actor_aggregation.R @@ -0,0 +1,137 @@ +### Notes: +# Do you want to search for either one OR other actorid, or both occurring in the same document? +# Do you want to keep only the occurrences of the actorids you are searching for, or all actor occurrences in the hits? 
+# Search by actorId, then aggregate by month
+# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly
+# Develop query generator for specific actors (ie combine actorId with start and end dates)
+
+#' Generate aggregated actor measures from raw data
+#'
+#' Generate aggregated actor measures from raw data
+#' @param row The row of the actors data frame used for aggregation
+#' @param actors The data frame containing actor data
+#' @param es_pwd The password for read access to ES
+#' @param localhost Boolean indicating if the script is running locally or not
+#' @param default_operator String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR
+#' @return No return value, data per actor is saved in an RDS file
+#' @export
+#' @examples
+#' actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
+actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') {
+  actor <- actors[row,]
+  is_party <- actor$`_source.function` == 'Party'
+  # Parties are queried one calendar year at a time (2000-2019); any other actor gets a
+  # single pass, flagged by year 0, that uses the actor's own start and end dates
+  years <- if (is_party) seq(2000,2019,1) else c(0)
+
+  # Wrapping the comparison in isTRUE() keeps a missing or NA party_only flag from
+  # aborting the run; such party rows fall through to the branch that also uses '_a' ids
+  if (is_party && isTRUE(actor$party_only == TRUE)) {
+    actorids <- paste0(actor$`_source.partyId`, c('_s','_f'))
+  } else if (is_party) {
+    actorids <- paste0(actor$`_source.partyId`, c('_s','_f','_a'))
+    actor$party_only <- FALSE
+  } else {
+    actorids <- actor$`_source.actorId`
+    actor$party_only <- NULL
+  }
+
+  # Runs the query for one element of `years` and returns the aggregated measures, or
+  # NULL when the query has no hits
+  actor_aggregator <- function(year, actor, actorids, default_operator, localhost = FALSE, es_pwd) {
+    # Collapses all rows of one article into a single row with recalculated measures
+    aggregator <- function (id, duplicates) {
+      article <- filter(duplicates, `_id` == id) %>%
+        unnest(sentence_id, .preserve = colnames(.))
+
+      occ <- length(unlist(unique(article$sentence_id1)))
+      sentence_count <- round(article$occ[[1]]/article$prom[[1]])
+      prom <- occ/sentence_count
+      first_sentence <- min(article$sentence_id1)
+      rel_first <- 1-(first_sentence/sentence_count)
+      bind_cols(as.list(article[1,1:6]), # Sentence id, start and end position for actor sentences
+        data.frame(occ = I(list(occ)), # Number of sentences in which actor occurs
+          prom = I(list(prom)), # Relative prominence of actor in article (number of occurrences/total # sentences)
+          rel_first = I(list(rel_first)), # Relative position of first occurrence at sentence level
+          first = I(list(first_sentence)) # First sentence in which actor is mentioned
+        )
+      )
+    }
+
+    if (year > 0) {
+      dates <- paste0(year,'-01-01 TO ',year,'-12-31')
+    } else {
+      dates <- paste0(actor$`_source.startDate`,' TO ',actor$`_source.endDate`)
+    }
+    query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',dates,']')
+
+    out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query),
+                                    fields = c('computerCodes.actorsDetail', 'doctype', 'publication_date'), default_operator = default_operator),
+                       localhost = localhost,
+                       es_pwd = es_pwd)
+    if (length(out$`_id`) == 0) {
+      return(NULL)
+    }
+
+    ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
+    actor_df <- out %>%
+      unnest() %>%
+      unnest(ids, .preserve = colnames(.)) %>%
+      filter(ids1 %in% actorids) %>%
+      select(-ends_with('start')) %>%
+      select(-ends_with('end')) %>%
+      select(-starts_with('ids'))
+
+    ### Only if there are more rows than articles, recalculate
+    if (anyDuplicated(actor_df$`_id`) > 0) {
+      is_dupe <- duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = TRUE)
+      duplicates <- actor_df[is_dupe,]
+      dupe_merged <- bind_rows(lapply(unique(duplicates$`_id`), aggregator, duplicates = duplicates))
+      actor_df <- bind_rows(dupe_merged, actor_df[!is_dupe,])
+    }
+
+    ### Creating date grouping variables
+    actor_df <- actor_df %>%
+      mutate(
+        year = strftime(`_source.publication_date`, format = '%Y'),
+        yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
+        yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
+        # %V is the ISO 8601 week, so it is paired with the ISO week-based year (%G);
+        # with %Y the last days of December can be labelled as week 01 of the old year
+        yearweek = strftime(`_source.publication_date`, format = '%G%V')
+      )
+
+    ### Creating aggregate measures at daily, weekly, monthly and yearly level
+    grouper <- function(level) {
+      # Both groupings are summarised the same way, so the summary is defined once
+      summarise_measures <- function(grouped) {
+        summarise(grouped,
+          occ = mean(unlist(occ)),
+          prom = mean(unlist(prom)),
+          rel_first = mean(unlist(rel_first)),
+          first = mean(unlist(first)),
+          articles = length(`_id`),
+          level = level
+        )
+      }
+      by_newspaper <- actor_df %>% group_by_at(vars(level, `_source.doctype`)) %>%
+        summarise_measures()
+      # Pooled over all doctypes, labelled 'agg'
+      aggregate <- actor_df %>% group_by_at(vars(level)) %>%
+        summarise_measures() %>%
+        mutate(`_source.doctype` = 'agg')
+      # Repeat the actor row so every aggregate row carries the actor's columns
+      actor_rows <- bind_rows(actor)
+      bind_rows(by_newspaper, aggregate) %>%
+        bind_cols(.,actor_rows[rep(seq_len(nrow(actor_rows)), each=nrow(.)),])
+    }
+    bind_rows(lapply(c('year','yearmonth','yearmonthday','yearweek'), grouper))
+  }
+
+  # One result per element of `years`; relies on bind_rows() skipping the NULL of an empty query
+  results <- lapply(years, actor_aggregator, actor, actorids, default_operator, localhost, es_pwd)
+  saveRDS(bind_rows(results), file = paste0(actor$`_source.country`,'_',paste0(actorids,collapse = ''),'.Rds'))
+  print(paste0('Done with ',row,'/',nrow(actors),' actors'))
+  invisible(NULL)
+}
+
diff --git a/man/actor_aggregation.Rd b/man/actor_aggregation.Rd
new file mode 100644
index 0000000..33e40db
--- /dev/null
+++ b/man/actor_aggregation.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/actor_aggregation.R
+\name{actor_aggregation}
+\alias{actor_aggregation}
+\title{Generate aggregated actor measures from raw data}
+\usage{
+actor_aggregation(row, actors, es_pwd, localhost,
+  default_operator = "OR")
+}
+\arguments{
+\item{row}{The row of the actors data frame used for aggregation}
+
+\item{actors}{The data frame containing actor data}
+
+\item{es_pwd}{The password for read access to ES}
+
+\item{localhost}{Boolean indicating if the script is running locally or not}
+
+\item{default_operator}{String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR}
+}
+\value{
+No return value, data per actor is saved in an RDS file
+}
+\description{
+Generate aggregated actor measures from raw data
+}
+\examples{
+actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
+}
diff --git a/man/query_string.Rd b/man/query_string.Rd
index 0c03e2e..1ebcdd7 100644
--- a/man/query_string.Rd
+++ b/man/query_string.Rd
@@ -4,7 +4,7 @@
 \alias{query_string}
 \title{Generate a query string query for ElasticSearch}
 \usage{
-query_string(query, fields = F, random = F)
+query_string(query, fields = F, random = F, default_operator = "AND")
 }
 \arguments{
 \item{query}{Query string in ElasticSearch query string format}