added aggregator and aggregator_elastic functions for aggregating and storing article level actor aggregations
parent
2281d11a68
commit
0d81d6fc7a
@ -0,0 +1,39 @@
|
||||
#' Aggregate actor results for a single article
#'
#' Merges all rows of `actor_df` that belong to one article into a single-row
#' data frame. The position and sentence list columns are pooled into sorted
#' vectors of unique values, and the occurrence measures are recomputed from
#' the pooled sentence ids.
#'
#' @param id Article id of the article for which actor aggregation should be done
#' @param actor_df The dataframe containing the actor data. The columns read
#'   here are `_id`, `sentence_id`, `sentence_start`, `sentence_end`,
#'   `actor_start`, `actor_end`, `occ` and `prom`.
#' @param merge_id The actorid that should be assigned to the merged result
#' @return A dataframe with the merged results
#' @export
#' @examples
#' \dontrun{
#' aggregator(id, actor_df, merge_id)
#' }
aggregator <- function(id, actor_df, merge_id) {
  # Base subsetting instead of filter(): a column called `id` in actor_df can
  # not shadow the `id` argument this way. Rows with a missing `_id` are
  # excluded, as filter() would do.
  doc_ids <- actor_df[["_id"]]
  article <- actor_df[!is.na(doc_ids) & doc_ids == id, , drop = FALSE]

  # The previous implementation unnested `sentence_id`, which removed rows
  # whose sentence id list was empty. Drop them explicitly so that the "first
  # row" used below and the pooled positions stay the same.
  article <- article[lengths(article[["sentence_id"]]) > 0L, , drop = FALSE]
  if (nrow(article) == 0L) {
    stop("No actor rows with sentence ids found for article id: ", id,
         call. = FALSE)
  }

  # Distinct sentences in which any of the merged actors occurs
  mentioned <- unique(unlist(article[["sentence_id"]]))
  occ <- length(mentioned)
  first_sentence <- min(mentioned)

  # Recover the sentence count of the article from the first row, using
  # prom = occ / sentence_count (see the column comments below).
  sentence_count <- round(article[["occ"]][[1]] / article[["prom"]][[1]])
  prom <- occ / sentence_count
  rel_first <- 1 - (first_sentence / sentence_count)

  # Pool one list column over all rows into a sorted vector of unique values
  pool <- function(column) {
    sort(unique(unlist(article[[column]])))
  }

  data.frame(
    doc_id = article[["_id"]][[1]],
    sentence_id = I(list(as.integer(pool("sentence_id")))),
    sentence_start = I(list(pool("sentence_start"))),
    sentence_end = I(list(pool("sentence_end"))),
    actor_start = I(list(pool("actor_start"))), # List of actor ud token start positions
    actor_end = I(list(pool("actor_end"))),     # List of actor ud token end positions
    occ = occ,              # Number of sentences in which actor occurs
    prom = prom,            # Relative prominence of actor in article (number of occurrences/total # sentences)
    rel_first = rel_first,  # Relative position of first occurrence at sentence level
    first = first_sentence, # First sentence in which actor is mentioned
    ids = merge_id,         # Actor id assigned to the merged result
    stringsAsFactors = FALSE
  )
}
|
@ -0,0 +1,64 @@
|
||||
### Notes:
|
||||
# Do you want to search for either one OR other actorid, or both occurring in the same document?
|
||||
# Do you want to keep only the occurrences of the actorids you are searching for, or all actor occurrences in the hits?
|
||||
# Search by actorId, then aggregate by month
|
||||
# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly
|
||||
# Develop query generator for specific actors (ie combine actorId with start and end dates)
|
||||
|
||||
|
||||
|
||||
#' Generate and store aggregate actor measures to elasticsearch
#'
#' Unnests the actor details in `out`, keeps the rows whose actor id is in
#' `actorids`, aggregates them per article with `aggregator()` in three passes
#' (all matching ids, ids not ending in `_a`, ids ending in `_a`), and passes
#' the result through `bulk_writer()` to `elastic_update()`.
#'
#' @param out The output provided by elasticizer()
#' @param localhost Boolean indicating if the script should run locally, or remote
#' @param es_super Write password for ES
#' @param actorids List of actorids used in the search, should be the same as the actorids used for elasticizer()
#' @param ver String indicating the version of the update
#' @param cores Numeric value indicating the number of cores to use for processing
#' @return Return value is based on output of elastic_update()
#' @export
#' @examples
#' \dontrun{
#' aggregator_elastic(out, localhost = FALSE, actorids, ver, cores, es_super)
#' }
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
aggregator_elastic <- function(out, localhost = FALSE, actorids, ver, cores, es_super) {
  ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
  # NOTE(review): the `ids1` column used below is not created explicitly; it
  # presumably comes from unnest() renaming the unnested copy of `ids` while
  # `.preserve` keeps the original list column (legacy tidyr behaviour).
  # TODO confirm against the tidyr version this package is pinned to.
  actor_df <- out %>%
    unnest() %>%
    unnest(ids, .preserve = colnames(.)) %>%
    filter(ids1 %in% actorids)

  # Run aggregator() once per distinct article id in `rows` and stack the
  # one-row results. `suffix` marks which kind of merge the rows represent.
  # NOTE(review): `actor` is neither an argument nor a local of
  # aggregator_elastic(); it is looked up in an enclosing/global environment
  # when the merge id is evaluated. Confirm that callers define it, or pass
  # the party id in explicitly.
  aggregate_articles <- function(rows, suffix) {
    bind_rows(mclapply(
      unique(rows$`_id`),
      aggregator,
      actor_df = rows,
      merge_id = paste0(actor$`_source.partyId`, suffix),
      mc.cores = cores
    ))
  }

  # All matching ids
  agg_party_actors <- aggregate_articles(actor_df, "_mfsa")
  # Only ids that do not end in "_a"
  agg_party <- aggregate_articles(filter(actor_df, !endsWith(ids1, "_a")), "_mfs")
  # Only ids that end in "_a"
  agg_actors <- aggregate_articles(filter(actor_df, endsWith(ids1, "_a")), "_ma")

  df_out <- bind_rows(agg_party_actors, agg_party, agg_actors)
  doc_ids <- df_out$doc_id

  # Drop the first column (doc_id, as returned by aggregator()) and group the
  # remaining columns into one data frame per article.
  df_out <- df_out %>%
    select(-1) %>%
    split(as.factor(doc_ids))
  df_out <- data.frame(doc_id = names(df_out), list = I(df_out))

  bulk <- apply(df_out, 1, bulk_writer, varname = "actorsDetail", type = "add", ver = ver)
  elastic_update(bulk, es_super = es_super, localhost = localhost)
}
|
||||
|
||||
|
@ -0,0 +1,24 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/aggregator.R
|
||||
\name{aggregator}
|
||||
\alias{aggregator}
|
||||
\title{Aggregator function, to aggregate actor results}
|
||||
\usage{
|
||||
aggregator(id, actor_df, merge_id)
|
||||
}
|
||||
\arguments{
|
||||
\item{id}{Article id of the article for which actor aggregation should be done}
|
||||
|
||||
\item{actor_df}{The dataframe containing the actor data}
|
||||
|
||||
\item{merge_id}{The actorid that should be assigned to the merged result}
|
||||
}
|
||||
\value{
|
||||
A dataframe with the merged results
|
||||
}
|
||||
\description{
|
||||
Aggregator function, to aggregate actor results
|
||||
}
|
||||
\examples{
|
||||
aggregator(id, actor_df, merge_id)
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/actor_aggregation_db.R
|
||||
\name{aggregator_elastic}
|
||||
\alias{aggregator_elastic}
|
||||
\title{Generate and store aggregate actor measures to elasticsearch}
|
||||
\usage{
|
||||
aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
|
||||
}
|
||||
\arguments{
|
||||
\item{out}{The output provided by elasticizer()}
|
||||
|
||||
\item{localhost}{Boolean indicating if the script should run locally, or remote}
|
||||
|
||||
\item{actorids}{List of actorids used in the search, should be the same as the actorids used for elasticizer()}
|
||||
|
||||
\item{ver}{String indicating the version of the update}
|
||||
|
||||
\item{cores}{Numeric value indicating the number of cores to use for processing}
|
||||
|
||||
\item{es_super}{Write password for ES}
|
||||
}
|
||||
\value{
|
||||
Return value is based on output of elastic_update()
|
||||
}
|
||||
\description{
|
||||
Generate and store aggregate actor measures to elasticsearch
|
||||
}
|
||||
\examples{
|
||||
aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
|
||||
}
|
Loading…
Reference in new issue