From e3b26c0be375ff8c825e60cd75fa639bccafe1aa Mon Sep 17 00:00:00 2001
From: Erik de Vries <erik@devries.pm>
Date: Sat, 11 May 2019 17:49:53 +0200
Subject: [PATCH] actor_aggregation: Added function to generate aggregate actor
 measures at daily, weekly, monthly and yearly level query_string: Added
 default_operator parameter, to define whether whitespaces should be
 interpreted as AND or OR, defaults to AND

---
 DESCRIPTION              |   2 +-
 NAMESPACE                |   1 +
 R/actor_aggregation.R    | 137 +++++++++++++++++++++++++++++++++++++++
 man/actor_aggregation.Rd |  29 +++++++++
 man/query_string.Rd      |   2 +-
 5 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 R/actor_aggregation.R
 create mode 100644 man/actor_aggregation.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 662a849..78f6e0f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -18,4 +18,4 @@ Depends: R (>= 3.3.1),
 License: Copyright Erik de Vries
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.1.0
+RoxygenNote: 6.1.1
diff --git a/NAMESPACE b/NAMESPACE
index bf6270c..3362e08 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(actor_aggregation)
 export(actorizer)
 export(bulk_writer)
 export(class_update)
diff --git a/R/actor_aggregation.R b/R/actor_aggregation.R
new file mode 100644
index 0000000..a9e20d5
--- /dev/null
+++ b/R/actor_aggregation.R
@@ -0,0 +1,137 @@
+### Notes:
+# Do you want to search for either one OR other actorid, or both occuring in the same document?
+# Do you want to keep only the occurences of the actorids you are searching for, or all actor occurences in the hits?
+# Search by actorId, then aggregate by month
+# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly
+# Develop query generator for specific actors (ie combine actorId with start and end dates)
+
+
+
+#' Generate aggregated actor measures from raw data
+#'
+#' Generate aggregated actor measures from raw data
+#' @param row The row of the actors data frame used for aggregation
+#' @param actors The data frame containing actor data
+#' @param es_pwd The password for read access to ES
+#' @param localhost Boolean indicating if the script is running locally or not
+#' @param default_operator String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR
+#' @return No return value, data per actor is saved in an RDS file
+#' @export
+#' @examples
+#' actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
+#################################################################################################
+#################################### Aggregate actor results ################################
+#################################################################################################
+actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') {
+  actor <- actors[row,]
+  if (actor$`_source.function` == "Party"){
+    years = seq(2000,2019,1)
+  } else {
+    years = c(0)
+  }
+
+  if (actor$`_source.function` == 'Party' && actor$party_only == T) {
+    actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'))
+  } else if (actor$`_source.function` == 'Party') {
+    actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'), paste0(actor$`_source.partyId`,'_a'))
+    actor$party_only <- F
+  } else {
+    actorids <- actor$`_source.actorId`
+    actor$party_only <- NULL
+  }
+
+  actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {
+    ### Functions
+    aggregator <- function (id, duplicates) {
+      article <- filter(duplicates, `_id` == id) %>%
+        unnest(sentence_id, .preserve = colnames(.))
+
+      occ <- length(unlist(unique(article$sentence_id1)))
+      sentence_count <- round(article$occ[[1]]/article$prom[[1]])
+      prom <- occ/sentence_count
+      rel_first <- 1-(min(article$sentence_id1)/sentence_count)
+      return(bind_cols(as.list(article[1,1:6]), # Sentence id, start and end position for actor sentences
+                       data.frame(occ = I(list(occ)), # Number of sentences in which actor occurs
+                                  prom = I(list(prom)), # Relative prominence of actor in article (number of occurences/total # sentences)
+                                  rel_first = I(list(rel_first)), # Relative position of first occurence at sentence level
+                                  first = I(list(min(article$sentence_id1))) # First sentence in which actor is mentioned
+                       )
+      )
+      )
+    }
+    if (year > 0) {
+      query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31]')
+    } else {
+      query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,']')
+    }
+
+    out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query),
+                                    fields = c('computerCodes.actorsDetail', 'doctype', 'publication_date'), default_operator = default_operator),
+                       localhost = localhost,
+                       es_pwd = es_pwd)
+    if (length(out$`_id`) > 0 ) {
+      ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
+      actor_df <- out %>%
+        unnest() %>%
+        unnest(ids, .preserve = colnames(.)) %>%
+        filter(ids1 %in% actorids) %>%
+        select(-ends_with('start')) %>%
+        select(-ends_with('end')) %>%
+        select(-starts_with('ids'))
+
+      ### Only if there are more rows than articles, recalculate
+      if (length(unique(actor_df$`_id`)) != length(actor_df$`_id`)) {
+        duplicates <- actor_df[(duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = T)),]
+        actor_single <- actor_df[!(duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = T)),]
+        art_id <- unique(duplicates$`_id`)
+        dupe_merged <- bind_rows(lapply(art_id, aggregator, duplicates = duplicates))
+        actor_df <- bind_rows(dupe_merged, actor_single)
+      }
+
+      ### Creating date grouping variables
+      actor_df <- actor_df %>%
+        mutate(
+          year = strftime(`_source.publication_date`, format = '%Y'),
+          yearmonth = strftime(actor_df$`_source.publication_date`, format = '%Y%m'),
+          yearmonthday = strftime(actor_df$`_source.publication_date`, format = '%Y%m%d'),
+          yearweek = strftime(actor_df$`_source.publication_date`, format = "%Y%V")
+        )
+      ### Creating aggregate measuers at daily, weekly, monthly and yearly level
+      grouper <- function(level) {
+        by_newspaper <- actor_df %>% group_by_at(vars(level, `_source.doctype`)) %>%
+          summarise(
+            occ = mean(unlist(occ)),
+            prom = mean(unlist(prom)),
+            rel_first = mean(unlist(rel_first)),
+            first = mean(unlist(first)),
+            articles = length(`_id`),
+            level = level
+          )
+
+        aggregate <- actor_df %>% group_by_at(vars(level)) %>%
+          summarise(
+            occ = mean(unlist(occ)),
+            prom = mean(unlist(prom)),
+            rel_first = mean(unlist(rel_first)),
+            first = mean(unlist(first)),
+            articles = length(`_id`),
+            `_source.doctype` = 'agg',
+            level = level
+          )
+        output <- bind_rows(by_newspaper, aggregate) %>%
+          bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),])
+        return(output)
+      }
+      levels <- c('year','yearmonth','yearmonthday','yearweek')
+      aggregate_data <- bind_rows(lapply(levels, grouper))
+      return(aggregate_data)
+    } else {
+      return()
+    }
+  }
+
+  saveRDS(bind_rows(lapply(years, actor_aggregator, query, actor, actorids, default_operator, localhost, es_pwd)), file = paste0(actor$`_source.country`,'_',paste0(actorids,collapse = ''),'.Rds'))
+  print(paste0('Done with ',row,'/',nrow(actors),' actors'))
+  return()
+}
+
diff --git a/man/actor_aggregation.Rd b/man/actor_aggregation.Rd
new file mode 100644
index 0000000..33e40db
--- /dev/null
+++ b/man/actor_aggregation.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/actor_aggregation.R
+\name{actor_aggregation}
+\alias{actor_aggregation}
+\title{Generate aggregated actor measures from raw data}
+\usage{
+actor_aggregation(row, actors, es_pwd, localhost,
+  default_operator = "OR")
+}
+\arguments{
+\item{row}{The row of the actors data frame used for aggregation}
+
+\item{actors}{The data frame containing actor data}
+
+\item{es_pwd}{The password for read access to ES}
+
+\item{localhost}{Boolean indicating if the script is running locally or not}
+
+\item{default_operator}{String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR}
+}
+\value{
+No return value, data per actor is saved in an RDS file
+}
+\description{
+Generate aggregated actor measures from raw data
+}
+\examples{
+actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
+}
diff --git a/man/query_string.Rd b/man/query_string.Rd
index 0c03e2e..1ebcdd7 100644
--- a/man/query_string.Rd
+++ b/man/query_string.Rd
@@ -4,7 +4,7 @@
 \alias{query_string}
 \title{Generate a query string query for ElasticSearch}
 \usage{
-query_string(query, fields = F, random = F)
+query_string(query, fields = F, random = F, default_operator = "AND")
 }
 \arguments{
 \item{query}{Query string in ElasticSearch query string format}