actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents

master
Erik de Vries 6 years ago
parent edd4b785a5
commit 6f5ace8c52

@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand # Generated by roxygen2: do not edit by hand
export(actor_aggregation) export(actor_aggregation)
export(actor_fetcher)
export(actorizer) export(actorizer)
export(aggregator) export(aggregator)
export(aggregator_elastic) export(aggregator_elastic)

@ -0,0 +1,95 @@
#' Generate actor data frames (with sentiment) from database
#'
#' Generate actor data frames (with sentiment) from database
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)
#' @param cores Number of threads to use for parallel processing
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' actor_fetcher(out, sent_dict = NULL, cores = 1)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
actor_fetcher <- function(out, sent_dict = NULL, cores = 1, localhost = NULL, validation = F) {
plan(multiprocess, workers = cores)
### Functions
### Calculate sentiment scores for each actor-document
sent_scorer <- function(row, out_row, ud_sent) {
### Contains sentiment per sentence for actor
actor_tone <- filter(ud_sent, sentence_id %in% unlist(out_row[row,]$sentence_id))
### Aggregated sentiment per actor (over all sentences containing actor)
actor <- summarise(actor_tone,
sent = sum(sent_sum)/sum(words),
sent_sum = sum(sent_sum),
sent_words = sum(sent_words),
words = sum(words),
arousal = sum(sent_words)/sum(words)
)
return(cbind(out_row[row,],data.frame(actor = actor)))
}
par_sent <- function(row, out, sent_dict = NULL) {
out_row <- out[row,]
### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
if (is.null(sent_dict) == F) {
ud_sent <- out_row$`_source.ud`[[1]] %>%
select(-one_of('exists')) %>%
unnest() %>%
filter(upos != 'PUNCT') %>% # For getting proper word counts
mutate(V1 = str_c(lemma,'_',upos)) %>%
left_join(sent_dict, by = 'V1') %>%
### Setting binary sentiment as unit of analysis
mutate(V2 = V3) %>%
group_by(sentence_id) %>%
mutate(
V2 = case_when(
is.na(V2) == T ~ 0,
TRUE ~ V2
)
) %>%
summarise(sent_sum = sum(V2),
words = length(lemma),
sent_words = length(na.omit(V3))) %>%
mutate(
sent = sent_sum/words,
arousal = sent_words/words
)
out_row <- select(out_row, -`_source.ud`) %>%
unnest(`_source.computerCodes.actorsDetail`, .preserve = colnames(.))
### Aggregated sentiment per article (over all sentences in article)
text_sent <- summarise(ud_sent,
sent = sum(sent_sum)/sum(words),
sent_sum = sum(sent_sum),
sent_words = sum(sent_words),
words = sum(words),
arousal = sum(sent_words)/sum(words)
)
out_row <- bind_rows(lapply(seq(1,nrow(out_row),1),sent_scorer, out_row = out_row, ud_sent = ud_sent)) %>%
cbind(., text = text_sent)
if (validation == T) {
codes_sent <- filter(ud_sent, sentence_id == out_row$`_source.codes.sentence.id`[1]) %>%
select(-sentence_id)
out_row <- cbind(out_row, codes = codes_sent)
}
} else {
out_row <- unnest(out_row, `_source.computerCodes.actorsDetail`, .preserve = colnames(.))
}
out_row <- out_row %>%
mutate(
year = strftime(`_source.publication_date`, format = '%Y'),
yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
yearweek = strftime(`_source.publication_date`, format = "%Y%V")
) %>%
select(-`_source.computerCodes.actorsDetail`,
-`_score`,
-`_index`,
-`_type`)
return(out_row)
}
saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
return()
}

@ -5,7 +5,7 @@
\title{Generate aggregated actor measures from raw data} \title{Generate aggregated actor measures from raw data}
\usage{ \usage{
actor_aggregation(row, actors, es_pwd, localhost, actor_aggregation(row, actors, es_pwd, localhost,
default_operator = "OR") default_operator = "OR", sent_dict = NULL, cores = detectCores())
} }
\arguments{ \arguments{
\item{row}{The row of the actors data frame used for aggregation} \item{row}{The row of the actors data frame used for aggregation}

@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actor_fetcher.R
\name{actor_fetcher}
\alias{actor_fetcher}
\title{Generate actor data frames (with sentiment) from database}
\usage{
actor_fetcher(out, sent_dict = NULL, cores = 1, localhost = NULL,
validation = F)
}
\arguments{
\item{out}{Data frame produced by elasticizer}
\item{sent_dict}{Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)}
\item{cores}{Number of threads to use for parallel processing}
\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
}
\value{
No return value, data per batch is saved in an RDS file
}
\description{
Generate actor data frames (with sentiment) from database
}
\examples{
actor_fetcher(out, sent_dict = NULL, cores = 1)
}

@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actor_aggregation_db.R % Please edit documentation in R/aggregator_elastic.R
\name{aggregator_elastic} \name{aggregator_elastic}
\alias{aggregator_elastic} \alias{aggregator_elastic}
\title{Generate and store aggregate actor measures to elasticsearch} \title{Generate and store aggregate actor measures to elasticsearch}
\usage{ \usage{
aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super) aggregator_elastic(out, localhost = F, actorids, ver, es_super)
} }
\arguments{ \arguments{
\item{out}{The output provided by elasticizer()} \item{out}{The output provided by elasticizer()}
@ -15,8 +15,6 @@ aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super)
\item{ver}{String indicating the version of the update} \item{ver}{String indicating the version of the update}
\item{cores}{Numeric value indicating the number of cores to use for processing}
\item{es_super}{Write password for ES} \item{es_super}{Write password for ES}
} }
\value{ \value{
@ -26,5 +24,5 @@ Return value is based on output of elastic_update()
Generate and store aggregate actor measures to elasticsearch Generate and store aggregate actor measures to elasticsearch
} }
\examples{ \examples{
aggregator_elastic(out, localhost = F, actorids, ver, cores, es_super) aggregator_elastic(out, localhost = F, actorids, ver, es_super)
} }

Loading…
Cancel
Save