actor_merger: added function for generating actor-document data frames

actor_fetcher: removed from package
other: major update to documentation
master
Your Name 4 years ago
parent 4e867214dd
commit f022312485

NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand
export(actor_fetcher)
export(actor_merger)
export(actorizer)
export(bulk_writer)
export(class_update)

R/actor_merger.R
@@ -0,0 +1,178 @@
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#'
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param ids Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @return When ids is NULL, returns an actor-article dataset covering individual actors, party aggregations, party-actor aggregations, and overall actor sentiment (regardless of which specific actors occur). When ids is provided, returns aggregations for each vector in the list
#' @export
#' @examples
#' actor_merger(df, actors_meta, ids = NULL)
#################################################################################################
#################################### Generate actor-article dataset #############################
#################################################################################################
actor_merger <- function(df, actors_meta, ids = NULL) {
  ## Helper: aggregate all sentences mentioning at least one of the actor ids in `id`
  grouper <- function(id, df) {
    return(df %>%
             rowwise() %>%
             filter(length(intersect(id, ids)) > 0) %>%
             group_by(`_id`) %>%
             summarise(actor.sent = sum(sent_sum)/sum(words),
                       actor.sent_sum = sum(sent_sum),
                       actor.sent_words = sum(sent_words),
                       actor.words = sum(words),
                       actor.arousal = sum(sent_words)/sum(words),
                       actor.first = first(sentence_id),
                       actor.occ = n(),
                       publication_date = as.Date(first(`_source.publication_date`)),
                       doctype = first(`_source.doctype`)) %>%
             mutate(
               ids = str_c(id, collapse = '-')
             )
    )
  }
  ## Remove some of the metadata from the source df
  text_sent <- df %>%
    select(`_id`, starts_with("text."), -ends_with("sent_lemmas"))
  df <- df %>%
    select(-ends_with("sent_lemmas"), -starts_with("text.")) %>%
    unnest(cols = colnames(.)) ## Unnest to sentence level
  ## Create bogus variables if sentiment is not scored
  if (!"sent_sum" %in% colnames(df)) {
    df <- df %>%
      mutate(
        sent_words = 0,
        sent_sum = 0
      )
  }
  ## Create aggregations according to list of actorId vectors in ids
  if (!is.null(ids)) {
    output <- lapply(ids, grouper, df = df) %>%
      bind_rows(.) %>%
      left_join(text_sent, by = "_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(output)
  } else {
    ## Article-level aggregation over all sentences containing at least one actor
    all <- df %>%
      rowwise() %>%
      filter(!is.null(unlist(ids))) %>%
      group_by(`_id`) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = as.Date(first(`_source.publication_date`)),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = "all"
      )
    df <- df %>%
      unnest(cols = ids) %>% ## Unnest to actor level
      mutate(
        `_source.publication_date` = as.Date(`_source.publication_date`)
      )
    ## Create aggregate measures for individual actors
    actors <- df %>%
      filter(str_starts(ids, "A_")) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)
      )
    ## Create actor metadata dataframe per active date (one row per day per actor)
    colnames(actors_meta) <- str_replace(colnames(actors_meta), '_source.', '')
    actors_meta <- actors_meta[-1128,] ## Drop one known-bad metadata row (hardcoded index)
    actors_meta_bydate <- actors_meta %>%
      mutate(
        startDate = as.Date(startDate),
        endDate = as.Date(endDate)
      ) %>%
      select(
        lastName, firstName, `function`, gender, yearOfBirth, parlPeriod,
        partyId, ministerName, ministryId, actorId, startDate, endDate
      ) %>%
      rowwise() %>%
      mutate(
        publication_date = list(seq(from = startDate, to = endDate, by = "day")),
        ids = actorId
      ) %>%
      unnest(cols = publication_date)
    ## Join the actor metadata with the article data by actor id and date
    actors <- actors %>%
      left_join(., actors_meta_bydate, by = c("ids", "publication_date"))
    ## Generate party-actor aggregations (mfsa)
    parties_actors <- df %>%
      filter(str_starts(ids, "P_")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids, "_mfsa")
      )
    ## Generate party aggregations (mfs)
    parties <- df %>%
      filter(str_ends(ids, "_f") | str_ends(ids, "_s")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids, "_mfs")
      )
    ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
    df <- bind_rows(actors, parties, parties_actors, all) %>%
      left_join(text_sent, by = "_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(df)
  }
}

man/actor_fetcher.Rd
@@ -4,13 +4,21 @@
\alias{actor_fetcher}
\title{Generate actor data frames (with sentiment) from database}
\usage{
actor_fetcher(out, sent_dict = NULL, cores = 1, localhost = NULL,
validation = F)
actor_fetcher(
out,
sent_dict = NULL,
actor_ids = NULL,
cores = 1,
localhost = NULL,
validation = F
)
}
\arguments{
\item{out}{Data frame produced by elasticizer}
\item{sent_dict}{Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)}
\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.}
\item{actor_ids}{Optional vector containing the actor ids to be collected}
\item{cores}{Number of threads to use for parallel processing}
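As a sketch of the expected sent_dict shape (the words and values here are invented; real dictionaries use either the lem_u or the lemma column, per the description above):

sent_dict <- data.frame(
  lem_u = c("goed_ADJ", "slecht_ADJ"),  # lemma_upos pairs; use a "lemma" column for bare lemmas instead
  prox = c(1, -1),                      # word values, or NA if not applicable
  stringsAsFactors = FALSE
)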

man/actor_merger.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actor_merger.R
\name{actor_merger}
\alias{actor_merger}
\title{Aggregate sentence-level dataset containing actors (from sentencizer())}
\usage{
actor_merger(df, actors_meta, ids = NULL)
}
\arguments{
\item{df}{Data frame with actor ids, produced by sentencizer}
\item{actors_meta}{Data frame containing actor metadata obtained using elasticizer(index="actors")}
\item{ids}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}
}
\value{
When ids is NULL, returns an actor-article dataset covering individual actors, party aggregations, party-actor aggregations, and overall actor sentiment (regardless of which specific actors occur). When ids is provided, returns aggregations for each vector in the list
}
\description{
Aggregate sentence-level dataset containing actors (from sentencizer())
}
\examples{
actor_merger(df, actors_meta, ids = NULL)
}
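For orientation, a minimal usage sketch (df and actors_meta come from sentencizer() and elasticizer(index="actors") as documented above; the actor ids are hypothetical):

## Full breakdown: individual actors, party (_mfs) and party-actor (_mfsa) aggregations, plus "all"
out_full <- actor_merger(df, actors_meta, ids = NULL)
## Custom aggregation: each vector of actor ids in the list is merged into one aggregate
out_merged <- actor_merger(df, actors_meta, ids = list(c("A_101", "A_102")))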

man/actorizer.Rd
@@ -4,8 +4,18 @@
\alias{actorizer}
\title{Updater function for elasticizer: Conduct actor searches}
\usage{
actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags,
es_super, ver, cores = 1)
actorizer(
out,
localhost = F,
ids,
prefix,
postfix,
pre_tags,
post_tags,
es_super,
ver,
cores = 1
)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

man/class_update.Rd
@@ -4,9 +4,18 @@
\alias{class_update}
\title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets); see the elasticizer documentation for more information}
\usage{
class_update(out, localhost = T, model_final, varname, text, words,
clean, ver, es_super = .rs.askForPassword("ElasticSearch WRITE"),
cores = 1)
class_update(
out,
localhost = T,
model_final,
varname,
text,
words,
clean,
ver,
es_super = .rs.askForPassword("ElasticSearch WRITE"),
cores = 1
)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

man/cv_generator.Rd
@@ -4,16 +4,14 @@
\alias{cv_generator}
\title{Generate CV folds for nested cross-validation}
\usage{
cv_generator(outer_k, inner_k, dfm, class_type, grid, seed)
cv_generator(outer_k, inner_k, vec, grid, seed)
}
\arguments{
\item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1, holdout sampling is used, with outer_k being the proportion of test data}
\item{inner_k}{Number of inner CV (parameter optimization) folds}
\item{dfm}{DFM containing the labeled documents}
\item{class_type}{Name of the column in docvars containing the classification}
\item{vec}{Vector containing the true values of the classification}
\item{grid}{Parameter grid for optimization}
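A hedged sketch of a call under the new signature (the grid columns and fold counts are illustrative, not prescribed by the package):

grid <- expand.grid(percentiles = c(0.90, 0.95), measures = "chi2", stringsAsFactors = FALSE)
folds <- cv_generator(outer_k = 5, inner_k = 3, vec = true_classes, grid = grid, seed = 42)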

man/dfm_gen.Rd
@@ -4,8 +4,7 @@
\alias{dfm_gen}
\title{Generates dfm from ElasticSearch output}
\usage{
dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1,
tolower = T)
dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)
}
\arguments{
\item{out}{The elasticizer-generated data frame}

man/dupe_detect.Rd
@@ -4,8 +4,17 @@
\alias{dupe_detect}
\title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
\usage{
dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
words, localhost = T, ver)
dupe_detect(
row,
grid,
cutoff_lower,
cutoff_upper = 1,
es_pwd,
es_super,
words,
localhost = T,
ver
)
}
\arguments{
\item{row}{Row of grid to parse}

man/elasticizer.Rd
@@ -4,10 +4,18 @@
\alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024,
max_batch = Inf, time_scroll = "5m", update = NULL,
localhost = F, ...)
elasticizer(
query,
src = T,
index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"),
batch_size = 1024,
max_batch = Inf,
time_scroll = "5m",
update = NULL,
localhost = F,
...
)
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}
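For illustration, the query is a JSON string in the Elasticsearch query DSL (the field and value here are hypothetical):

query <- '{"query": {"match": {"doctype": "nrc"}}}'
out <- elasticizer(query, index = "maml", localhost = T)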

man/estimator.Rd
@@ -4,7 +4,16 @@
\alias{estimator}
\title{Generate models and get classifications on test sets}
\usage{
estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
estimator(
row,
grid,
outer_folds,
inner_folds,
dfm,
class_type,
model,
we_vectors
)
}
\arguments{
\item{row}{Row number of current item in grid}
@@ -18,6 +27,8 @@ estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
\item{class_type}{Name of column in docvars() containing the classes}
\item{model}{Model to use (currently only nb)}
\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
Depending on mode: if folds are included, returns the true and predicted classes for the test set, together with the parameters, the model, and the model idf. When no folds are supplied, returns the final model and idf values.

man/lemma_writer.Rd
@@ -4,8 +4,7 @@
\alias{lemma_writer}
\title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
\usage{
lemma_writer(out, file, localhost = F, documents = F, lemma = F,
cores = 1)
lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
}
\arguments{
\item{out}{The elasticizer-generated data frame}

man/modelizer.Rd
@@ -4,8 +4,19 @@
\alias{modelizer}
\title{Generate a classification model}
\usage{
modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
seed, model, cores = 1)
modelizer(
dfm,
outer_k,
inner_k,
class_type,
opt_measure,
country,
grid,
seed,
model,
we_vectors,
cores = 1
)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
@@ -26,10 +37,12 @@ modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
\item{we_vectors}{Matrix with word embedding vectors}
\item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1}
}
\value{
An .Rds file in the current working directory (getwd()) with a list containing all relevant output
A list containing all relevant output
}
\description{
Generate a nested cross-validated classification model based on a dfm with class labels as docvars
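A sketch of a call under the updated signature (argument values are illustrative; passing we_vectors = NULL when no word embeddings are used is an assumption, not documented behavior):

model <- modelizer(dfm, outer_k = 5, inner_k = 3, class_type = "topic",
                   opt_measure = "accuracy", country = "nl", grid = grid,
                   seed = 42, model = "nb", we_vectors = NULL, cores = 4)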

man/modelizer_old.Rd
@@ -4,8 +4,21 @@
\alias{modelizer_old}
\title{Generate a classification model}
\usage{
modelizer_old(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
outer_k, inner_k, model, class_type, opt_measure, country, grid)
modelizer_old(
dfm,
cores_outer,
cores_grid,
cores_inner,
cores_feats,
seed,
outer_k,
inner_k,
model,
class_type,
opt_measure,
country,
grid
)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}

man/preproc.Rd
@@ -4,7 +4,7 @@
\alias{preproc}
\title{Preprocess dfm data for use in modeling procedure}
\usage{
preproc(dfm_train, dfm_test = NULL, params)
preproc(dfm_train, dfm_test = NULL, params, we_vectors)
}
\arguments{
\item{dfm_train}{Training dfm}
@@ -12,6 +12,8 @@ preproc(dfm_train, dfm_test = NULL, params)
\item{dfm_test}{Testing dfm if applicable, otherwise NULL}
\item{params}{Row from grid with parameter optimization}
\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
List with dfm_train and dfm_test, processed according to parameters in params

man/sentencizer.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/sentencizer.R
\name{sentencizer}
\alias{sentencizer}
\title{Generate actor data frames (with sentiment) from database}
\title{Generate sentence-level dataset with sentiment and actor presence}
\usage{
sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
}
@@ -10,13 +10,15 @@ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
\item{out}{Data frame produced by elasticizer}
\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.}
\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
}
\value{
No return value; data for each batch is saved to an RDS file
}
\description{
Generate actor data frames (with sentiment) from database
Generate sentence-level dataset with sentiment and actor presence
}
\examples{
sentencizer(out, sent_dict = NULL)
sentencizer(out, sent_dict = NULL, validation = F)
}
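Because sentencizer() saves per-batch results to RDS files instead of returning them, downstream use presumably reads those files back in; a hedged sketch (the file name is hypothetical):

sentencizer(out, sent_dict = sent_dict, validation = F)  # writes batch data to RDS
df <- readRDS("sent_batch_1.Rds")                        # read a batch back, e.g. for actor_merger()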

man/ud_update.Rd
@@ -4,9 +4,14 @@
\alias{ud_update}
\title{Elasticizer update function: generate UDpipe output from base text}
\usage{
ud_update(out, localhost = T, udmodel,
ud_update(
out,
localhost = T,
udmodel,
es_super = .rs.askForPassword("ElasticSearch WRITE"),
cores = detectCores(), ver)
cores = detectCores(),
ver
)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
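Since out is parsed by elasticizer itself, ud_update is presumably passed through elasticizer's update parameter; a hedged sketch (the query, model file name, and ver value are hypothetical):

udmodel <- udpipe::udpipe_load_model("dutch-alpino-ud.udpipe")
elasticizer(query, update = ud_update, udmodel = udmodel, localhost = T, ver = "1")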
