added aggregator and aggregator_elastic functions for aggregating and storing article level actor aggregations

master
Erik de Vries 6 years ago
parent 2281d11a68
commit 0d81d6fc7a

@ -0,0 +1,39 @@
#' Aggregator function, to aggregate actor results
#'
#' Aggregator function, to aggregate actor results
#' @param id Article id of the article for which actor aggregation should be done
#' @param actor_df The dataframe containing the actor data
#' @param merge_id The actorid that should be assigned to the merged result
#' @return A dataframe with the merged results
#' @export
#' @examples
#' aggregator(id, actor_df, merge_id)
aggregator <- function (id, actor_df, merge_id) {
article <- filter(actor_df, `_id` == id) %>%
unnest(sentence_id, .preserve = colnames(.))
occ <- length(unlist(unique(article$sentence_id1)))
sentence_count <- round(article$occ[[1]]/article$prom[[1]])
prom <- occ/sentence_count
rel_first <- 1-(min(article$sentence_id1)/sentence_count)
actor_start <- sort(unique(unlist(article$actor_start)))
actor_end <- sort(unique(unlist(article$actor_end)))
sentence_start <- sort(unique(unlist(article$sentence_start)))
sentence_end <- sort(unique(unlist(article$sentence_end)))
sentence_id <- sort(unique(unlist(article$sentence_id)))
return(data.frame(doc_id = first(article$`_id`),
sentence_id = I(list(as.integer(sentence_id))),
sentence_start = I(list(sentence_start)),
sentence_end = I(list(sentence_end)),
actor_start = I(list(actor_start)), # List of actor ud token start positions
actor_end = I(list(actor_end)), # List of actor ud token end positions
occ = occ, # Number of sentences in which actor occurs
prom = prom, # Relative prominence of actor in article (number of occurences/total # sentences)
rel_first = rel_first, # Relative position of first occurence at sentence level
first = min(article$sentence_id1), # First sentence in which actor is mentioned
ids = merge_id, # List of actor ids
stringsAsFactors = F
)
)
}

@ -0,0 +1,64 @@
### Notes:
# Do you want to search for either one OR other actorid, or both occuring in the same document?
# Do you want to keep only the occurences of the actorids you are searching for, or all actor occurences in the hits?
# Search by actorId, then aggregate by month
# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly
# Develop query generator for specific actors (ie combine actorId with start and end dates)
#' Generate and store aggregate actor measures to elasticsearch
#'
#' Generate and store aggregate actor measures to elasticsearch
#' @param out The output provided by elasticizer()
#' @param localhost Boolean indicating if the script should run locally, or remote
#' @param es_super Write password for ES
#' @param actorids List of actorids used in the search, should be the same as the actorids used for elasticizer()
#' @param ver String indicating the version of the update
#' @param cores Numeric value indicating the number of cores to use for processing
#' @return Return value is based on output of elastic_update()
#' @export
#' @examples
#' aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
aggregator_elastic <- function(out, localhost = F, actorids, ver, cores, es_super) {
### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
actor_df <- out %>%
unnest() %>%
unnest(ids, .preserve = colnames(.)) %>%
filter(ids1 %in% actorids)
agg_party_actors <- bind_rows(mclapply(unique(actor_df$`_id`),
aggregator,
actor_df = actor_df,
merge_id = paste0(actor$`_source.partyId`,'_mfsa'),
mc.cores = cores))
party <- actor_df %>%
filter(!endsWith(ids1, '_a'))
agg_party <- bind_rows(mclapply(unique(party$`_id`),
aggregator,
actor_df = party,
merge_id = paste0(actor$`_source.partyId`,'_mfs'),
mc.cores = cores))
actors_only <- actor_df %>%
filter(endsWith(ids1, '_a'))
agg_actors <- bind_rows(mclapply(unique(actors_only$`_id`),
aggregator,
actor_df = actors_only,
merge_id = paste0(actor$`_source.partyId`,'_ma'),
mc.cores = cores))
df_out <- bind_rows(agg_party_actors, agg_party, agg_actors)
doc_ids <- df_out$doc_id
df_out <- df_out %>%
select(-1) %>%
split(as.factor(doc_ids))
df_out <- data.frame(doc_id = names(df_out), list = I(df_out))
bulk <- apply(df_out, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
return(elastic_update(bulk, es_super = es_super, localhost = localhost))
}

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/aggregator.R
\name{aggregator}
\alias{aggregator}
\title{Aggregator function, to aggregate actor results}
\usage{
aggregator(id, actor_df, merge_id)
}
\arguments{
\item{id}{Article id of the article for which actor aggregation should be done}
\item{actor_df}{The dataframe containing the actor data}
\item{merge_id}{The actorid that should be assigned to the merged result}
}
\value{
A dataframe with the merged results
}
\description{
Aggregator function, to aggregate actor results
}
\examples{
aggregator(id, actor_df, merge_id)
}

@ -0,0 +1,30 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actor_aggregation_db.R
\name{aggregator_elastic}
\alias{aggregator_elastic}
\title{Generate and store aggregate actor measures to elasticsearch}
\usage{
aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
}
\arguments{
\item{out}{The output provided by elasticizer()}
\item{localhost}{Boolean indicating if the script should run locally, or remote}
\item{actorids}{List of actorids used in the search, should be the same as the actorids used for elasticizer()}
\item{ver}{String indicating the version of the update}
\item{cores}{Numeric value indicating the number of cores to use for processing}
\item{es_super}{Write password for ES}
}
\value{
Return value is based on output of elastic_update()
}
\description{
Generate and store aggregate actor measures to elasticsearch
}
\examples{
aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
}
Loading…
Cancel
Save