# mamlr/R/actor_aggregation.R

### Notes:
# Do you want to search for either one OR the other actorId, or for both occurring in the same document?
# Do you want to keep only the occurrences of the actorIds you are searching for, or all actor occurrences in the hits?
# Search by actorId, then aggregate by month
# When actorId starts with P_, define which hits you want to get (short, full, actor); if more than one, aggregate properly
# Develop a query generator for specific actors (i.e. combine actorId with start and end dates)


#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
#' Generate aggregated actor measures from raw data
#'
#' Queries Elasticsearch for the articles of one actor (one row of `actors`)
#' that are flagged `computerCodes.junk:0`, recomputes the per-article actor
#' measures for articles that match more than one of the actor's ids, and then
#' averages those measures per year, month, day and ISO week. Averages are
#' produced per `_source.doctype` and over all doctypes (rows where
#' `_source.doctype` is `"agg"`).
#'
#' Actors whose `_source.function` is `"Party"` are queried one calendar year
#' at a time for 2000-2019; all other actors are queried once, for the period
#' `_source.startDate` to `_source.endDate`.
#'
#' @param row The row of the actors data frame used for aggregation
#' @param actors The data frame containing actor data
#' @param es_pwd The password for read access to ES
#' @param localhost Boolean indicating if the script is running locally or not
#' @param default_operator String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR
#' @return `NULL`, invisibly. The aggregated data for the actor is saved in the
#'   working directory as `<country>_<actorids>.Rds`.
#' @export
#' @examples
#' \dontrun{
#' actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
#' }
actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') {
  ### Functions

  # Collapse all rows belonging to one article (one row per matched actor id)
  # into a single row with recomputed measures.
  aggregator <- function(id, duplicates) {
    article <- filter(duplicates, `_id` == id) %>%
      unnest(sentence_id, .preserve = colnames(.))

    first_sentence <- min(article$sentence_id1)
    occ <- length(unlist(unique(article$sentence_id1)))
    # occ / prom of a single row gives back the number of sentences in the article
    sentence_count <- round(article$occ[[1]] / article$prom[[1]])
    prom <- occ / sentence_count
    rel_first <- 1 - (first_sentence / sentence_count)

    bind_cols(
      # First six columns of the article's first row are carried over unchanged.
      # NOTE(review): positional selection; confirm it still matches the column order.
      as.list(article[1, 1:6]),
      data.frame(
        occ = I(list(occ)), # Number of sentences in which actor occurs
        prom = I(list(prom)), # Relative prominence of actor in article (number of occurrences/total # sentences)
        rel_first = I(list(rel_first)), # Relative position of first occurrence at sentence level
        first = I(list(first_sentence)) # First sentence in which actor is mentioned
      )
    )
  }

  # Average the actor measures within each group of an already grouped data frame.
  summarise_measures <- function(grouped_df, level) {
    summarise(
      grouped_df,
      occ = mean(unlist(occ)),
      prom = mean(unlist(prom)),
      rel_first = mean(unlist(rel_first)),
      first = mean(unlist(first)),
      articles = length(`_id`),
      level = level
    )
  }

  ### Creating aggregate measures at daily, weekly, monthly and yearly level
  # `level` is the name of the date grouping column; `actor` is the actor row
  # whose columns are appended to every output row.
  grouper <- function(level, actor_df, actor) {
    by_newspaper <- actor_df %>%
      group_by_at(vars(level, `_source.doctype`)) %>%
      summarise_measures(level)

    overall <- actor_df %>%
      group_by_at(vars(level)) %>%
      summarise_measures(level) %>%
      mutate(`_source.doctype` = 'agg')

    # Drop the grouping left over from summarise() so the saved data is a plain table
    combined <- ungroup(bind_rows(by_newspaper, overall))
    actor_row <- bind_rows(actor)
    actor_cols <- actor_row[rep(seq_len(nrow(actor_row)), each = nrow(combined)), ]
    bind_cols(combined, actor_cols)
  }

  # Retrieve and aggregate the articles of one actor for one year. A `year` of
  # 0 means: use the actor's own start and end date instead of a calendar year.
  # Returns NULL when the query yields no articles.
  actor_aggregator <- function(year, actor, actorids, default_operator, localhost = FALSE, es_pwd) {
    if (year > 0) {
      date_range <- paste0(year, '-01-01 TO ', year, '-12-31')
    } else {
      date_range <- paste0(actor$`_source.startDate`, ' TO ', actor$`_source.endDate`)
    }
    query <- paste0(
      'country:', actor$`_source.country`,
      ' && computerCodes.actors:(', paste(actorids, collapse = ' '), ')',
      ' && publication_date:[', date_range, ']',
      ' && computerCodes.junk:0'
    )
    out <- elasticizer(
      query_string(
        query,
        fields = c('computerCodes.actorsDetail', 'doctype', 'publication_date'),
        default_operator = default_operator
      ),
      localhost = localhost,
      es_pwd = es_pwd
    )
    if (length(out$`_id`) == 0) {
      return(NULL)
    }

    ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
    actor_df <- out %>%
      unnest() %>%
      unnest(ids, .preserve = colnames(.)) %>%
      filter(ids1 %in% actorids) %>%
      select(-ends_with('start')) %>%
      select(-ends_with('end')) %>%
      select(-starts_with('ids'))

    ### Only if there are more rows than articles, recalculate
    is_dupe <- duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = TRUE)
    if (any(is_dupe)) {
      duplicates <- actor_df[is_dupe, ]
      actor_single <- actor_df[!is_dupe, ]
      art_id <- unique(duplicates$`_id`)
      dupe_merged <- bind_rows(lapply(art_id, aggregator, duplicates = duplicates))
      actor_df <- bind_rows(dupe_merged, actor_single)
    }

    ### Creating date grouping variables
    actor_df <- actor_df %>%
      mutate(
        year = strftime(`_source.publication_date`, format = '%Y'),
        yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
        yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
        # %V is the ISO 8601 week, so it has to be paired with the ISO week-based
        # year (%G). With %Y, 2018-12-31 would be labelled "201801" instead of "201901".
        yearweek = strftime(`_source.publication_date`, format = '%G%V')
      )
    levels <- c('year', 'yearmonth', 'yearmonthday', 'yearweek')
    bind_rows(lapply(levels, grouper, actor_df = actor_df, actor = actor))
  }

  ###########################################################################################
  actor <- actors[row, ]
  is_party <- actor$`_source.function` == 'Party'
  years <- if (is_party) seq(2000, 2019, 1) else c(0)

  # isTRUE() keeps the condition well-defined when party_only is NA or absent
  if (is_party && isTRUE(actor$party_only == TRUE)) {
    actorids <- paste0(actor$`_source.partyId`, c('_s', '_f'))
  } else if (is_party) {
    actorids <- paste0(actor$`_source.partyId`, c('_s', '_f', '_a'))
    actor$party_only <- FALSE
  } else {
    actorids <- actor$`_source.actorId`
    actor$party_only <- NULL
  }

  per_year <- lapply(years, function(year) {
    actor_aggregator(year, actor, actorids, default_operator, localhost, es_pwd)
  })
  saveRDS(
    bind_rows(per_year),
    file = paste0(actor$`_source.country`, '_', paste0(actorids, collapse = ''), '.Rds')
  )
  print(paste0('Done with ', row, '/', nrow(actors), ' actors'))
  invisible(NULL)
}
actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level query_string: Added default_operator parameter, to define whether whitespaces should be interpreted as AND or OR, defaults to AND 5 years ago			`### Notes:`
			`# Do you want to search for either one OR other actorid, or both occuring in the same document?`
			`# Do you want to keep only the occurences of the actorids you are searching for, or all actor occurences in the hits?`
			`# Search by actorId, then aggregate by month`
			`# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly`
			`# Develop query generator for specific actors (ie combine actorId with start and end dates)`



			`#' Generate aggregated actor measures from raw data`
			`#'`
			`#' Generate aggregated actor measures from raw data`
			`#' @param row The row of the actors data frame used for aggregation`
			`#' @param actors The data frame containing actor data`
			`#' @param es_pwd The password for read access to ES`
			`#' @param localhost Boolean indicating if the script is running locally or not`
			`#' @param default_operator String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR`
			`#' @return No return value, data per actor is saved in an RDS file`
			`#' @export`
			`#' @examples`
			`#' actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')`
			`#################################################################################################`
			`#################################### Aggregate actor results ################################`
			`#################################################################################################`
			`actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') {`
actor_aggregation: small fixes to code 5 years ago			`### Functions`
			`aggregator <- function (id, duplicates) {`
			article <- filter(duplicates, `_id` == id) %>%
			`unnest(sentence_id, .preserve = colnames(.))`

			`occ <- length(unlist(unique(article$sentence_id1)))`
			`sentence_count <- round(article$occ[[1]]/article$prom[[1]])`
			`prom <- occ/sentence_count`
			`rel_first <- 1-(min(article$sentence_id1)/sentence_count)`
			`return(bind_cols(as.list(article[1,1:6]), # Sentence id, start and end position for actor sentences`
			`data.frame(occ = I(list(occ)), # Number of sentences in which actor occurs`
			`prom = I(list(prom)), # Relative prominence of actor in article (number of occurences/total # sentences)`
			`rel_first = I(list(rel_first)), # Relative position of first occurence at sentence level`
			`first = I(list(min(article$sentence_id1))) # First sentence in which actor is mentioned`
			`)`
			`)`
			`)`
			`}`

			`### Creating aggregate measuers at daily, weekly, monthly and yearly level`
			`grouper <- function(level, actor_df, actorids) {`
			by_newspaper <- actor_df %>% group_by_at(vars(level, `_source.doctype`)) %>%
			`summarise(`
			`occ = mean(unlist(occ)),`
			`prom = mean(unlist(prom)),`
			`rel_first = mean(unlist(rel_first)),`
			`first = mean(unlist(first)),`
			articles = length(`_id`),
			`level = level`
			`)`

			`aggregate <- actor_df %>% group_by_at(vars(level)) %>%`
			`summarise(`
			`occ = mean(unlist(occ)),`
			`prom = mean(unlist(prom)),`
			`rel_first = mean(unlist(rel_first)),`
			`first = mean(unlist(first)),`
			articles = length(`_id`),
			`_source.doctype` = 'agg',
			`level = level`
			`)`
			`output <- bind_rows(by_newspaper, aggregate) %>%`
			`bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),])`
			`return(output)`
			`}`
			`###########################################################################################`
actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level query_string: Added default_operator parameter, to define whether whitespaces should be interpreted as AND or OR, defaults to AND 5 years ago			`actor <- actors[row,]`
			if (actor$`_source.function` == "Party"){
			`years = seq(2000,2019,1)`
			`} else {`
			`years = c(0)`
			`}`

			if (actor$`_source.function` == 'Party' && actor$party_only == T) {
			actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'))
			} else if (actor$`_source.function` == 'Party') {
			actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'), paste0(actor$`_source.partyId`,'_a'))
			`actor$party_only <- F`
			`} else {`
			actorids <- actor$`_source.actorId`
			`actor$party_only <- NULL`
			`}`

			`actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {`
			`if (year > 0) {`
actor_aggregation: only aggregate scores on non-junk articles 5 years ago			`query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31] && computerCodes.junk:0')`
actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level query_string: Added default_operator parameter, to define whether whitespaces should be interpreted as AND or OR, defaults to AND 5 years ago			`} else {`
actor_aggregation: only aggregate scores on non-junk articles 5 years ago			query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,'] && computerCodes.junk:0')
actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level query_string: Added default_operator parameter, to define whether whitespaces should be interpreted as AND or OR, defaults to AND 5 years ago			`}`
			out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query),
			`fields = c('computerCodes.actorsDetail', 'doctype', 'publication_date'), default_operator = default_operator),`
			`localhost = localhost,`
			`es_pwd = es_pwd)`
			if (length(out$`_id`) > 0 ) {
actor_aggregation: small fixes to code 5 years ago			`actor_df <- out`
actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level query_string: Added default_operator parameter, to define whether whitespaces should be interpreted as AND or OR, defaults to AND 5 years ago			`### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.`
actor_aggregation: small fixes to code 5 years ago			`actor_df <- actor_df %>%`
actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level query_string: Added default_operator parameter, to define whether whitespaces should be interpreted as AND or OR, defaults to AND 5 years ago			`unnest() %>%`
			`unnest(ids, .preserve = colnames(.)) %>%`
			`filter(ids1 %in% actorids) %>%`
			`select(-ends_with('start')) %>%`
			`select(-ends_with('end')) %>%`
			`select(-starts_with('ids'))`

			`### Only if there are more rows than articles, recalculate`
			if (length(unique(actor_df$`_id`)) != length(actor_df$`_id`)) {
			duplicates <- actor_df[(duplicated(actor_df$`_id`) \| duplicated(actor_df$`_id`, fromLast = T)),]
			actor_single <- actor_df[!(duplicated(actor_df$`_id`) \| duplicated(actor_df$`_id`, fromLast = T)),]
			art_id <- unique(duplicates$`_id`)
			`dupe_merged <- bind_rows(lapply(art_id, aggregator, duplicates = duplicates))`
			`actor_df <- bind_rows(dupe_merged, actor_single)`
			`}`
			`### Creating date grouping variables`
			`actor_df <- actor_df %>%`
			`mutate(`
			year = strftime(`_source.publication_date`, format = '%Y'),
			yearmonth = strftime(actor_df$`_source.publication_date`, format = '%Y%m'),
			yearmonthday = strftime(actor_df$`_source.publication_date`, format = '%Y%m%d'),
			yearweek = strftime(actor_df$`_source.publication_date`, format = "%Y%V")
			`)`
			`levels <- c('year','yearmonth','yearmonthday','yearweek')`
actor_aggregation: small fixes to code 5 years ago			`aggregate_data <- bind_rows(lapply(levels, grouper, actor_df = actor_df, actorids = actorids))`
actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level query_string: Added default_operator parameter, to define whether whitespaces should be interpreted as AND or OR, defaults to AND 5 years ago			`return(aggregate_data)`
			`} else {`
			`return()`
			`}`
			`}`

			saveRDS(bind_rows(lapply(years, actor_aggregator, query, actor, actorids, default_operator, localhost, es_pwd)), file = paste0(actor$`_source.country`,'_',paste0(actorids,collapse = ''),'.Rds'))
			`print(paste0('Done with ',row,'/',nrow(actors),' actors'))`
			`return()`
			`}`