actor_merger: added function for generating actor-document data frames

actor_fetcher: removed from package
other: major update to documentation
master
Your Name 4 years ago
parent 4e867214dd
commit f022312485

NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(actor_fetcher)
+export(actor_merger)
 export(actorizer)
 export(bulk_writer)
 export(class_update)

R/actor_merger.R
@@ -0,0 +1,178 @@
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#'
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param ids Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @return When ids is NULL, returns an actor-article dataset containing individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids is given, returns one aggregation per vector in the list
#' @export
#' @examples
#' actor_merger(df, actors_meta, ids = NULL)
#################################################################################################
#################################### Generate actor-article dataset #############################
#################################################################################################
actor_merger <- function(df, actors_meta, ids = NULL) {
  ## Helper: keep only sentences mentioning at least one id from the vector
  ## `id`, then aggregate them to one row of actor scores per article
  grouper <- function(id, df) {
    return(df %>%
      rowwise() %>%
      filter(length(intersect(id, ids)) > 0) %>%
      group_by(`_id`) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = as.Date(first(`_source.publication_date`)),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(id, collapse = '-')
      )
    )
  }
  ## Split off the article-level text metadata from the source df
  text_sent <- df %>%
    select(`_id`, starts_with("text."), -ends_with("sent_lemmas"))
  df <- df %>%
    select(-ends_with("sent_lemmas"), -starts_with("text.")) %>%
    unnest(cols = colnames(.)) ## Unnest to sentence level
  ## Create placeholder sentiment variables if sentiment is not scored
  if (!"sent_sum" %in% colnames(df)) {
    df <- df %>%
      mutate(
        sent_words = 0,
        sent_sum = 0
      )
  }
  ## Create aggregations according to list of actorId vectors in ids
  if (!is.null(ids)) {
    output <- lapply(ids, grouper, df = df) %>%
      bind_rows(.) %>%
      left_join(text_sent, by = "_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(output)
  } else {
    ## Overall actor sentiment per article, regardless of which actors occur
    ## (sentences without any actor ids are dropped first)
    all <- df %>%
      rowwise() %>%
      filter(!is.null(unlist(ids))) %>%
      group_by(`_id`) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = as.Date(first(`_source.publication_date`)),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = "all"
      )
    df <- df %>%
      unnest(cols = ids) %>% ## Unnest to actor level
      mutate(
        `_source.publication_date` = as.Date(`_source.publication_date`)
      )
    ## Create aggregate measures for individual actors
    actors <- df %>%
      filter(str_starts(ids, "A_")) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)
      )
    ## Create actor metadata dataframe per active date (one row per day per actor)
    colnames(actors_meta) <- str_replace(colnames(actors_meta), '_source.', '')
    ## NB: hardcoded removal of a single metadata row, presumably a known-bad
    ## entry in this specific actors index
    actors_meta <- actors_meta[-1128, ]
    actors_meta_bydate <- actors_meta %>%
      mutate(
        startDate = as.Date(startDate),
        endDate = as.Date(endDate)
      ) %>%
      select(
        lastName, firstName, `function`, gender, yearOfBirth, parlPeriod,
        partyId, ministerName, ministryId, actorId, startDate, endDate
      ) %>%
      rowwise() %>%
      mutate(
        publication_date = list(seq(from = startDate, to = endDate, by = "day")),
        ids = actorId
      ) %>%
      unnest(cols = publication_date)
    ## Join the actor metadata with the article data by actor id and date
    actors <- actors %>%
      left_join(., actors_meta_bydate, by = c("ids", "publication_date"))
    ## Generate party-actor aggregations (mfsa)
    parties_actors <- df %>%
      filter(str_starts(ids, "P_")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids, "_mfsa")
      )
    ## Generate party aggregations (mfs)
    parties <- df %>%
      filter(str_ends(ids, "_f") | str_ends(ids, "_s")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids, "_mfs")
      )
    ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
    df <- bind_rows(actors, parties, parties_actors, all) %>%
      left_join(text_sent, by = "_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(df)
  }
}
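For illustration, a minimal usage sketch; df and actors_meta are assumed to come from sentencizer() and elasticizer(index="actors") as documented above, and the actor id values are made up:

## Hedged example: merge two hypothetical actors into one aggregate and
## keep a third as its own aggregate
left_ids <- c("A_123", "A_456")
pm_id <- c("A_789")
merged <- actor_merger(df, actors_meta, ids = list(left_ids, pm_id))

## Default behaviour (ids = NULL): individual actors, party (mfs),
## party-actor (mfsa) and overall ("all") aggregations in one data frame
full <- actor_merger(df, actors_meta)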

man/actor_fetcher.Rd
@@ -4,13 +4,21 @@
 \alias{actor_fetcher}
 \title{Generate actor data frames (with sentiment) from database}
 \usage{
-actor_fetcher(out, sent_dict = NULL, cores = 1, localhost = NULL,
-  validation = F)
+actor_fetcher(
+  out,
+  sent_dict = NULL,
+  actor_ids = NULL,
+  cores = 1,
+  localhost = NULL,
+  validation = F
+)
 }
 \arguments{
 \item{out}{Data frame produced by elasticizer}

-\item{sent_dict}{Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)}
+\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.}
+
+\item{actor_ids}{Optional vector containing the actor ids to be collected}

 \item{cores}{Number of threads to use for parallel processing}

man/actor_merger.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actor_merger.R
\name{actor_merger}
\alias{actor_merger}
\title{Aggregate sentence-level dataset containing actors (from sentencizer())}
\usage{
actor_merger(df, actors_meta, ids = NULL)
}
\arguments{
\item{df}{Data frame with actor ids, produced by sentencizer}

\item{actors_meta}{Data frame containing actor metadata obtained using elasticizer(index="actors")}

\item{ids}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}
}
\value{
When ids is NULL, returns an actor-article dataset containing individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids is given, returns one aggregation per vector in the list
}
\description{
Aggregate sentence-level dataset containing actors (from sentencizer())
}
\examples{
actor_merger(df, actors_meta, ids = NULL)
}

man/actorizer.Rd
@@ -4,8 +4,18 @@
 \alias{actorizer}
 \title{Updater function for elasticizer: Conduct actor searches}
 \usage{
-actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags,
-  es_super, ver, cores = 1)
+actorizer(
+  out,
+  localhost = F,
+  ids,
+  prefix,
+  postfix,
+  pre_tags,
+  post_tags,
+  es_super,
+  ver,
+  cores = 1
+)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

man/class_update.Rd
@@ -4,9 +4,18 @@
 \alias{class_update}
 \title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information}
 \usage{
-class_update(out, localhost = T, model_final, varname, text, words,
-  clean, ver, es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = 1)
+class_update(
+  out,
+  localhost = T,
+  model_final,
+  varname,
+  text,
+  words,
+  clean,
+  ver,
+  es_super = .rs.askForPassword("ElasticSearch WRITE"),
+  cores = 1
+)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

man/cv_generator.Rd
@@ -4,16 +4,14 @@
 \alias{cv_generator}
 \title{Generate CV folds for nested cross-validation}
 \usage{
-cv_generator(outer_k, inner_k, dfm, class_type, grid, seed)
+cv_generator(outer_k, inner_k, vec, grid, seed)
 }
 \arguments{
 \item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data}

 \item{inner_k}{Number of inner CV (parameter optimization) folds}

-\item{dfm}{DFM containing the labeled documents}
-
-\item{class_type}{Name of the column in docvars containing the classification}
+\item{vec}{Vector containing the true values of the classification}

 \item{grid}{Parameter grid for optimization}
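A hedged sketch of the two sampling modes described above; labels and param_grid are hypothetical objects standing in for the class vector and parameter grid:

## Nested CV: 5 outer folds for performance estimation, 3 inner folds
folds <- cv_generator(outer_k = 5, inner_k = 3, vec = labels,
                      grid = param_grid, seed = 42)
## Holdout variant: outer_k < 1 is read as the share of test data
## (here 20%), with 3-fold inner CV on the remainder
holdout <- cv_generator(outer_k = 0.2, inner_k = 3, vec = labels,
                        grid = param_grid, seed = 42)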

man/dfm_gen.Rd
@@ -4,8 +4,7 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1,
-  tolower = T)
+dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}

man/dupe_detect.Rd
@@ -4,8 +4,17 @@
 \alias{dupe_detect}
 \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
 \usage{
-dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
-  words, localhost = T, ver)
+dupe_detect(
+  row,
+  grid,
+  cutoff_lower,
+  cutoff_upper = 1,
+  es_pwd,
+  es_super,
+  words,
+  localhost = T,
+  ver
+)
 }
 \arguments{
 \item{row}{Row of grid to parse}

man/elasticizer.Rd
@@ -4,10 +4,18 @@
 \alias{elasticizer}
 \title{Generate a data frame out of unparsed Elasticsearch JSON}
 \usage{
-elasticizer(query, src = T, index = "maml",
-  es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024,
-  max_batch = Inf, time_scroll = "5m", update = NULL,
-  localhost = F, ...)
+elasticizer(
+  query,
+  src = T,
+  index = "maml",
+  es_pwd = .rs.askForPassword("Elasticsearch READ"),
+  batch_size = 1024,
+  max_batch = Inf,
+  time_scroll = "5m",
+  update = NULL,
+  localhost = F,
+  ...
+)
 }
 \arguments{
 \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
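A minimal sketch of such a query: a match_all search in the Elasticsearch query DSL against the default "maml" index on a local instance, fetched in batches of 1024:

## Hedged example: fetch all documents from the default index
query <- '{"query": {"match_all": {}}}'
out <- elasticizer(query, index = "maml", localhost = T)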

man/estimator.Rd
@@ -4,7 +4,16 @@
 \alias{estimator}
 \title{Generate models and get classifications on test sets}
 \usage{
-estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
+estimator(
+  row,
+  grid,
+  outer_folds,
+  inner_folds,
+  dfm,
+  class_type,
+  model,
+  we_vectors
+)
 }
 \arguments{
 \item{row}{Row number of current item in grid}
@@ -18,6 +27,8 @@ estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
 \item{class_type}{Name of column in docvars() containing the classes}

 \item{model}{Model to use (currently only nb)}
+
+\item{we_vectors}{Matrix with word embedding vectors}
 }
 \value{
 Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.

man/lemma_writer.Rd
@@ -4,8 +4,7 @@
 \alias{lemma_writer}
 \title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
 \usage{
-lemma_writer(out, file, localhost = F, documents = F, lemma = F,
-  cores = 1)
+lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}

man/modelizer.Rd
@@ -4,8 +4,19 @@
 \alias{modelizer}
 \title{Generate a classification model}
 \usage{
-modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
-  seed, model, cores = 1)
+modelizer(
+  dfm,
+  outer_k,
+  inner_k,
+  class_type,
+  opt_measure,
+  country,
+  grid,
+  seed,
+  model,
+  we_vectors,
+  cores = 1
+)
 }
 \arguments{
 \item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
@@ -26,10 +37,12 @@ modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
 \item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
+
+\item{we_vectors}{Matrix with word embedding vectors}

 \item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1}
 }
 \value{
-An .Rds file in the current working directory (getwd()) with a list containing all relevant output
+A list containing all relevant output
 }
 \description{
 Generate a nested cross validated classification model based on a dfm with class labels as docvars
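One plausible way to build the we_vectors matrix, assuming word embeddings in the plain-text GloVe format (one word followed by its vector per line); the file name is hypothetical:

## Hedged sketch: read GloVe-style vectors into a matrix with words as rownames
glove <- read.table("vectors.txt", sep = " ", row.names = 1,
                    quote = "", comment.char = "")
we_vectors <- as.matrix(glove)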

man/modelizer_old.Rd
@@ -4,8 +4,21 @@
 \alias{modelizer_old}
 \title{Generate a classification model}
 \usage{
-modelizer_old(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
-  outer_k, inner_k, model, class_type, opt_measure, country, grid)
+modelizer_old(
+  dfm,
+  cores_outer,
+  cores_grid,
+  cores_inner,
+  cores_feats,
+  seed,
+  outer_k,
+  inner_k,
+  model,
+  class_type,
+  opt_measure,
+  country,
+  grid
+)
 }
 \arguments{
 \item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}

man/preproc.Rd
@@ -4,7 +4,7 @@
 \alias{preproc}
 \title{Preprocess dfm data for use in modeling procedure}
 \usage{
-preproc(dfm_train, dfm_test = NULL, params)
+preproc(dfm_train, dfm_test = NULL, params, we_vectors)
 }
 \arguments{
 \item{dfm_train}{Training dfm}
@@ -12,6 +12,8 @@ preproc(dfm_train, dfm_test = NULL, params)
 \item{dfm_test}{Testing dfm if applicable, otherwise NULL}

 \item{params}{Row from grid with parameter optimization}
+
+\item{we_vectors}{Matrix with word embedding vectors}
 }
 \value{
 List with dfm_train and dfm_test, processed according to parameters in params

man/sentencizer.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/sentencizer.R
 \name{sentencizer}
 \alias{sentencizer}
-\title{Generate actor data frames (with sentiment) from database}
+\title{Generate sentence-level dataset with sentiment and actor presence}
 \usage{
 sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
 }
@@ -10,13 +10,15 @@ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
 \item{out}{Data frame produced by elasticizer}

 \item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.}
+
+\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
 }
 \value{
 No return value, data per batch is saved in an RDS file
 }
 \description{
-Generate actor data frames (with sentiment) from database
+Generate sentence-level dataset with sentiment and actor presence
 }
 \examples{
-sentencizer(out, sent_dict = NULL)
+sentencizer(out, sent_dict = NULL, validation = F)
 }
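A hedged sketch of the sent_dict format described above, using lemma_upos pairs in the "lem_u" column; the dictionary entries are made up:

## Hypothetical two-word dictionary; "prox" holds the word values
sent_dict <- data.frame(
  lem_u = c("goed_ADJ", "slecht_ADJ"),
  prox = c(1, -1)
)
sentencizer(out, sent_dict = sent_dict, validation = F)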

man/ud_update.Rd
@@ -4,9 +4,14 @@
 \alias{ud_update}
 \title{Elasticizer update function: generate UDpipe output from base text}
 \usage{
-ud_update(out, localhost = T, udmodel,
-  es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = detectCores(), ver)
+ud_update(
+  out,
+  localhost = T,
+  udmodel,
+  es_super = .rs.askForPassword("ElasticSearch WRITE"),
+  cores = detectCores(),
+  ver
+)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
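For the udmodel argument, a sketch assuming the udpipe package is used to obtain the model; the language choice is hypothetical:

## Download and load a UDpipe model once, then pass it as `udmodel`
library(udpipe)
dl <- udpipe_download_model(language = "dutch")
udmodel <- udpipe_load_model(file = dl$file_model)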
