diff --git a/NAMESPACE b/NAMESPACE index 196f93f..afddc4b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(actor_fetcher) +export(actor_merger) export(actorizer) export(bulk_writer) export(class_update) diff --git a/R/actor_fetcher.R b/Old/actor_fetcher.R similarity index 80% rename from R/actor_fetcher.R rename to Old/actor_fetcher.R index 226d745..e7979fe 100644 --- a/R/actor_fetcher.R +++ b/Old/actor_fetcher.R @@ -38,31 +38,31 @@ actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, lo ### Party ids including actors p_ids_a <- c(p_ids,str_c(pid,'_a')) summarizer <- function (p_ids, dupe_df, merged_id) { - id <- dupe_df$`_id`[[1]] - dupe_df <- dupe_df %>% - filter(ids %in% p_ids) - if (nrow(dupe_df) > 0) { - return( - dupe_df %>% summarise( - `_id` = first(`_id`), - `_source.doctype` = first(`_source.doctype`), - `_source.publication_date` = first(`_source.publication_date`), - prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])), - sentence_id = list(sort(unique(unlist(sentence_id)))), - rel_first = list(max(unlist(rel_first))), - ids = merged_id, - occ = list(length(unique(unlist(sentence_id)))), - first = list(min(unlist(sentence_id))), - actor_start = list(sort(unique(unlist(actor_start)))), - actor_end = list(sort(unique(unlist(actor_end)))), - sentence_start = list(sort(unique(unlist(sentence_start)))), - sentence_end = list(sort(unique(unlist(sentence_end)))) - ) + id <- dupe_df$`_id`[[1]] + dupe_df <- dupe_df %>% + filter(ids %in% p_ids) + if (nrow(dupe_df) > 0) { + return( + dupe_df %>% summarise( + `_id` = first(`_id`), + `_source.doctype` = first(`_source.doctype`), + `_source.publication_date` = first(`_source.publication_date`), + prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])), + sentence_id = list(sort(unique(unlist(sentence_id)))), + rel_first = list(max(unlist(rel_first))), + ids = merged_id, + occ = list(length(unique(unlist(sentence_id)))), + first = list(min(unlist(sentence_id))), + actor_start = list(sort(unique(unlist(actor_start)))), + actor_end = list(sort(unique(unlist(actor_end)))), + sentence_start = list(sort(unique(unlist(sentence_start)))), + sentence_end = list(sort(unique(unlist(sentence_end)))) ) - } else { - print(paste0('id:',id)) - return(NULL) - } + ) + } else { + print(paste0('id:',id)) + return(NULL) + } } party <- summarizer(p_ids, dupe_df, str_c(pid,'_mfs')) party_actor <- summarizer(p_ids_a, dupe_df, str_c(pid,'_mfsa')) @@ -77,15 +77,15 @@ actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, lo select(-one_of('exists')) %>% unnest() %>% filter(upos != 'PUNCT') # For getting proper word counts - if ("lem_u" %in% colnames(sent_dict)) { - ud_sent <- ud_sent %>% - mutate(lem_u = str_c(lemma,'_',upos)) %>% - left_join(sent_dict, by = 'lem_u') - } else if ("lemma" %in% colnames(sent_dict)) { - ud_sent <- ud_sent %>% - left_join(sent_dict, by = 'lemma') %>% - mutate(lem_u = lemma) - } + if ("lem_u" %in% colnames(sent_dict)) { + ud_sent <- ud_sent %>% + mutate(lem_u = str_c(lemma,'_',upos)) %>% + left_join(sent_dict, by = 'lem_u') + } else if ("lemma" %in% colnames(sent_dict)) { + ud_sent <- ud_sent %>% + left_join(sent_dict, by = 'lemma') %>% + mutate(lem_u = lemma) + } ud_sent <- ud_sent %>% group_by(sentence_id) %>% mutate( @@ -177,10 +177,10 @@ actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, lo yearweek = strftime(`_source.publication_date`, format = "%Y%V") ) %>% 
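+      # Drop Elasticsearch metadata columns (_score, _index, _type) and the pids helper column;
+      # the _source.computerCodes.actorsDetail drop is deliberately left commented out above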
select(#-`_source.computerCodes.actorsDetail`, - -`_score`, - -`_index`, - -`_type`, - -pids) + -`_score`, + -`_index`, + -`_type`, + -pids) return(out_row) } saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds')) diff --git a/R/actor_merger.R b/R/actor_merger.R new file mode 100644 index 0000000..be1ca06 --- /dev/null +++ b/R/actor_merger.R @@ -0,0 +1,178 @@ +#' Aggregate sentence-level dataset containing actors (from sentencizer()) +#' +#' Aggregate sentence-level dataset containing actors (from sentencizer()) +#' @param df Data frame with actor ids, produced by sentencizer +#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors") +#' @param ids Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties) +#' @return When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list +#' @export +#' @examples +#' actor_merger(df, actors_meta, ids = NULL) +################################################################################################# +#################################### Generate actor-article dataset ############################# +################################################################################################# + +actor_merger <- function(df, actors_meta, ids = NULL) { + grouper <- function(id, df) { + return(df %>% + rowwise() %>% + filter(length(intersect(id,ids)) > 0) %>% + group_by(`_id`) %>% + summarise(actor.sent = sum(sent_sum)/sum(words), + actor.sent_sum = sum(sent_sum), + actor.sent_words = sum(sent_words), + actor.words = sum(words), + actor.arousal = sum(sent_words)/sum(words), + actor.first = first(sentence_id), + actor.occ = n(), + publication_date = as.Date(first(`_source.publication_date`)), + doctype = first(`_source.doctype`)) %>% + mutate( + ids = str_c(id, collapse = '-') + ) + ) + } + + ## Remove some of the metadata from the source df + text_sent <- df %>% + select(`_id`,starts_with("text."),-ends_with("sent_lemmas")) + df <- df %>% + select(-ends_with("sent_lemmas"),-starts_with("text.")) %>% + unnest(cols = colnames(.)) ## Unnest to sentence level + + ## Create bogus variables if sentiment is not scored + if(!"sent_sum" %in% colnames(df)) { + df <- df %>% + mutate( + sent_words = 0, + sent_sum = 0, + ) + } + ## Create aggregations according to list of actorId vectors in ids + if(!is.null(ids)) { + output <- lapply(ids,grouper, df = df) %>% + bind_rows(.) 
%>% + left_join(text_sent, by="_id") %>% + mutate( + actor.prom = actor.occ/text.sentences, + actor.rel_first = 1-(actor.first/text.sentences), + year = strftime(publication_date, format = '%Y'), + yearmonth = strftime(publication_date, format = '%Y%m'), + yearmonthday = strftime(publication_date, format = '%Y%m%d'), + yearweek = strftime(publication_date, format = "%Y%V") + ) + return(output) + } else { + all <- df %>% + rowwise() %>% + filter(!is.null(unlist(ids))) %>% + group_by(`_id`) %>% + summarise(actor.sent = sum(sent_sum)/sum(words), + actor.sent_sum = sum(sent_sum), + actor.sent_words = sum(sent_words), + actor.words = sum(words), + actor.arousal = sum(sent_words)/sum(words), + actor.first = first(sentence_id), + actor.occ = n(), + publication_date = as.Date(first(`_source.publication_date`)), + doctype = first(`_source.doctype`)) %>% + mutate( + ids = "all" + ) + + df <- df %>% + unnest(cols = ids) %>% ## Unnest to actor level + mutate( + `_source.publication_date` = as.Date(`_source.publication_date`) + ) + ## Create aggregate measures for individual actors + actors <- df %>% + filter(str_starts(ids,"A_")) %>% + group_by(`_id`,ids) %>% + summarise(actor.sent = sum(sent_sum)/sum(words), + actor.sent_sum = sum(sent_sum), + actor.sent_words = sum(sent_words), + actor.words = sum(words), + actor.arousal = sum(sent_words)/sum(words), + actor.first = first(sentence_id), + actor.occ = n(), + publication_date = first(`_source.publication_date`), + doctype = first(`_source.doctype`) + ) + ## Create actor metadata dataframe per active date (one row per day per actor) + colnames(actors_meta) <- str_replace(colnames(actors_meta),'_source.','') + actors_meta <- actors_meta[-1128,] + actors_meta_bydate <- actors_meta %>% + mutate( + startDate = as.Date(startDate), + endDate = as.Date(endDate) + ) %>% + select( + lastName,firstName,`function`,gender,yearOfBirth,parlPeriod,partyId,ministerName,ministryId,actorId,startDate,endDate + ) %>% + rowwise() %>% + mutate( + publication_date = list(seq(from=startDate, to=endDate,by="day")), + ids = actorId + ) %>% + unnest(cols=publication_date) + + ## Join the actor metadata with the article data by actor id and date + actors <- actors %>% + left_join(.,actors_meta_bydate, by=c("ids","publication_date")) + + ## Generate party-actor aggregations (mfsa) + parties_actors <- df %>% + filter(str_starts(ids,"P_")) %>% + mutate( + ids = str_sub(ids, start = 1, end = -3) + ) %>% + group_by(`_id`,ids) %>% + summarise(actor.sent = sum(sent_sum)/sum(words), + actor.sent_sum = sum(sent_sum), + actor.sent_words = sum(sent_words), + actor.words = sum(words), + actor.arousal = sum(sent_words)/sum(words), + actor.first = first(sentence_id), + actor.occ = n(), + publication_date = first(`_source.publication_date`), + doctype = first(`_source.doctype`)) %>% + mutate( + ids = str_c(ids,"_mfsa") + ) + + ## Generate party aggregations (mfs) + parties <- df %>% + filter(str_ends(ids,"_f") | str_ends(ids,"_s")) %>% + mutate( + ids = str_sub(ids, start = 1, end = -3) + ) %>% + group_by(`_id`,ids) %>% + summarise(actor.sent = sum(sent_sum)/sum(words), + actor.sent_sum = sum(sent_sum), + actor.sent_words = sum(sent_words), + actor.words = sum(words), + actor.arousal = sum(sent_words)/sum(words), + actor.first = first(sentence_id), + actor.occ = n(), + publication_date = first(`_source.publication_date`), + doctype = first(`_source.doctype`)) %>% + mutate( + ids = str_c(ids,"_mfs") + ) + + ## Join all aggregations into a single data frame, compute derived actor-level measures, 
and add date dummies + df <- bind_rows(actors, parties, parties_actors, all) %>% + left_join(text_sent, by="_id") %>% + mutate( + actor.prom = actor.occ/text.sentences, + actor.rel_first = 1-(actor.first/text.sentences), + year = strftime(publication_date, format = '%Y'), + yearmonth = strftime(publication_date, format = '%Y%m'), + yearmonthday = strftime(publication_date, format = '%Y%m%d'), + yearweek = strftime(publication_date, format = "%Y%V") + ) + return(df) + } +} + diff --git a/man/actor_fetcher.Rd b/man/actor_fetcher.Rd index 5c7417b..edff272 100644 --- a/man/actor_fetcher.Rd +++ b/man/actor_fetcher.Rd @@ -4,13 +4,21 @@ \alias{actor_fetcher} \title{Generate actor data frames (with sentiment) from database} \usage{ -actor_fetcher(out, sent_dict = NULL, cores = 1, localhost = NULL, - validation = F) +actor_fetcher( + out, + sent_dict = NULL, + actor_ids = NULL, + cores = 1, + localhost = NULL, + validation = F +) } \arguments{ \item{out}{Data frame produced by elasticizer} -\item{sent_dict}{Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)} +\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.} + +\item{actor_ids}{Optional vector containing the actor ids to be collected} \item{cores}{Number of threads to use for parallel processing} diff --git a/man/actor_merger.Rd b/man/actor_merger.Rd new file mode 100644 index 0000000..a014ef9 --- /dev/null +++ b/man/actor_merger.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/actor_merger.R +\name{actor_merger} +\alias{actor_merger} +\title{Aggregate sentence-level dataset containing actors (from sentencizer())} +\usage{ +actor_merger(df, actors_meta, ids = NULL) +} +\arguments{ +\item{df}{Data frame with actor ids, produced by sentencizer} + +\item{actors_meta}{Data frame containing actor metadata obtained using elasticizer(index="actors")} + +\item{ids}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)} +} +\value{ +When no ids, returns actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids, returns aggregations for each vector in list +} +\description{ +Aggregate sentence-level dataset containing actors (from sentencizer()) +} +\examples{ +actor_merger(df, actors_meta, ids = NULL) +} diff --git a/man/actorizer.Rd b/man/actorizer.Rd index a12d4f9..2845f88 100644 --- a/man/actorizer.Rd +++ b/man/actorizer.Rd @@ -4,8 +4,18 @@ \alias{actorizer} \title{Updater function for elasticizer: Conduct actor searches} \usage{ -actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, - es_super, ver, cores = 1) +actorizer( + out, + localhost = F, + ids, + prefix, + postfix, + pre_tags, + post_tags, + es_super, + ver, + cores = 1 +) } \arguments{ \item{out}{Does not need to be defined explicitly! 
(is already parsed in the elasticizer function)} diff --git a/man/class_update.Rd b/man/class_update.Rd index fe6c587..b183f5a 100644 --- a/man/class_update.Rd +++ b/man/class_update.Rd @@ -4,9 +4,18 @@ \alias{class_update} \title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information} \usage{ -class_update(out, localhost = T, model_final, varname, text, words, - clean, ver, es_super = .rs.askForPassword("ElasticSearch WRITE"), - cores = 1) +class_update( + out, + localhost = T, + model_final, + varname, + text, + words, + clean, + ver, + es_super = .rs.askForPassword("ElasticSearch WRITE"), + cores = 1 +) } \arguments{ \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} diff --git a/man/cv_generator.Rd b/man/cv_generator.Rd index aef2925..4d36947 100644 --- a/man/cv_generator.Rd +++ b/man/cv_generator.Rd @@ -4,16 +4,14 @@ \alias{cv_generator} \title{Generate CV folds for nested cross-validation} \usage{ -cv_generator(outer_k, inner_k, dfm, class_type, grid, seed) +cv_generator(outer_k, inner_k, vec, grid, seed) } \arguments{ \item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data} \item{inner_k}{Number of inner CV (parameter optimization) folds} -\item{dfm}{DFM containing the labeled documents} - -\item{class_type}{Name of the column in docvars containing the classification} +\item{vec}{Vector containing the true values of the classification} \item{grid}{Parameter grid for optimization} diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd index 28105e0..93ecaf9 100644 --- a/man/dfm_gen.Rd +++ b/man/dfm_gen.Rd @@ -4,8 +4,7 @@ \alias{dfm_gen} \title{Generates dfm from ElasticSearch output} \usage{ -dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, - tolower = T) +dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T) } \arguments{ \item{out}{The elasticizer-generated data frame} diff --git a/man/dupe_detect.Rd b/man/dupe_detect.Rd index 3fa6859..decc16a 100644 --- a/man/dupe_detect.Rd +++ b/man/dupe_detect.Rd @@ -4,8 +4,17 @@ \alias{dupe_detect} \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]} \usage{ -dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, - words, localhost = T, ver) +dupe_detect( + row, + grid, + cutoff_lower, + cutoff_upper = 1, + es_pwd, + es_super, + words, + localhost = T, + ver +) } \arguments{ \item{row}{Row of grid to parse} diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd index 8dd3b67..6fcba75 100644 --- a/man/elasticizer.Rd +++ b/man/elasticizer.Rd @@ -4,10 +4,18 @@ \alias{elasticizer} \title{Generate a data frame out of unparsed Elasticsearch JSON} \usage{ -elasticizer(query, src = T, index = "maml", - es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, - max_batch = Inf, time_scroll = "5m", update = NULL, - localhost = F, ...) +elasticizer( + query, + src = T, + index = "maml", + es_pwd = .rs.askForPassword("Elasticsearch READ"), + batch_size = 1024, + max_batch = Inf, + time_scroll = "5m", + update = NULL, + localhost = F, + ... 
+) } \arguments{ \item{query}{A JSON-formatted query in the Elasticsearch query DSL} diff --git a/man/estimator.Rd b/man/estimator.Rd index 3dabd48..abbb17f 100644 --- a/man/estimator.Rd +++ b/man/estimator.Rd @@ -4,7 +4,16 @@ \alias{estimator} \title{Generate models and get classifications on test sets} \usage{ -estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model) +estimator( + row, + grid, + outer_folds, + inner_folds, + dfm, + class_type, + model, + we_vectors +) } \arguments{ \item{row}{Row number of current item in grid} @@ -18,6 +27,8 @@ estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model) \item{class_type}{Name of column in docvars() containing the classes} \item{model}{Model to use (currently only nb)} + +\item{we_vectors}{Matrix with word embedding vectors} } \value{ Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values. diff --git a/man/lemma_writer.Rd b/man/lemma_writer.Rd index 406fb96..4402006 100644 --- a/man/lemma_writer.Rd +++ b/man/lemma_writer.Rd @@ -4,8 +4,7 @@ \alias{lemma_writer} \title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings} \usage{ -lemma_writer(out, file, localhost = F, documents = F, lemma = F, - cores = 1) +lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1) } \arguments{ \item{out}{The elasticizer-generated data frame} diff --git a/man/modelizer.Rd b/man/modelizer.Rd index 32293aa..8f106e1 100644 --- a/man/modelizer.Rd +++ b/man/modelizer.Rd @@ -4,8 +4,19 @@ \alias{modelizer} \title{Generate a classification model} \usage{ -modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, - seed, model, cores = 1) +modelizer( + dfm, + outer_k, + inner_k, + class_type, + opt_measure, + country, + grid, + seed, + model, + we_vectors, + cores = 1 +) } \arguments{ \item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars} @@ -26,10 +37,12 @@ modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, \item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)} +\item{we_vectors}{Matrix with word embedding vectors} + \item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1} } \value{ -An .Rds file in the current working directory (getwd()) with a list containing all relevant output +A list containing all relevant output } \description{ Generate a nested cross validated classification model based on a dfm with class labels as docvars diff --git a/man/modelizer_old.Rd b/man/modelizer_old.Rd index 6000ea7..a88b46c 100644 --- a/man/modelizer_old.Rd +++ b/man/modelizer_old.Rd @@ -4,8 +4,21 @@ \alias{modelizer_old} \title{Generate a classification model} \usage{ -modelizer_old(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, - outer_k, inner_k, model, class_type, opt_measure, country, grid) +modelizer_old( + dfm, + cores_outer, + cores_grid, + cores_inner, + cores_feats, + seed, + outer_k, + inner_k, + model, + class_type, + opt_measure, + country, + grid +) } \arguments{ \item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars} diff --git a/man/preproc.Rd b/man/preproc.Rd index ab30c92..7dbac14 100644 --- a/man/preproc.Rd +++ b/man/preproc.Rd @@ -4,7 +4,7 @@ \alias{preproc} \title{Preprocess dfm 
data for use in modeling procedure} \usage{ -preproc(dfm_train, dfm_test = NULL, params) +preproc(dfm_train, dfm_test = NULL, params, we_vectors) } \arguments{ \item{dfm_train}{Training dfm} @@ -12,6 +12,8 @@ preproc(dfm_train, dfm_test = NULL, params) \item{dfm_test}{Testing dfm if applicable, otherwise NULL} \item{params}{Row from grid with parameter optimization} + +\item{we_vectors}{Matrix with word embedding vectors} } \value{ List with dfm_train and dfm_test, processed according to parameters in params diff --git a/man/sentencizer.Rd b/man/sentencizer.Rd index 51b9128..d4d58ae 100644 --- a/man/sentencizer.Rd +++ b/man/sentencizer.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/sentencizer.R \name{sentencizer} \alias{sentencizer} -\title{Generate actor data frames (with sentiment) from database} +\title{Generate sentence-level dataset with sentiment and actor presence} \usage{ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F) } @@ -10,13 +10,15 @@ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F) \item{out}{Data frame produced by elasticizer} \item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.} + +\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring} } \value{ No return value, data per batch is saved in an RDS file } \description{ -Generate actor data frames (with sentiment) from database +Generate sentence-level dataset with sentiment and actor presence } \examples{ -sentencizer(out, sent_dict = NULL) +sentencizer(out, sent_dict = NULL, validation = F) } diff --git a/man/ud_update.Rd b/man/ud_update.Rd index 0a80d2f..f636857 100644 --- a/man/ud_update.Rd +++ b/man/ud_update.Rd @@ -4,9 +4,14 @@ \alias{ud_update} \title{Elasticizer update function: generate UDpipe output from base text} \usage{ -ud_update(out, localhost = T, udmodel, +ud_update( + out, + localhost = T, + udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), - cores = detectCores(), ver) + cores = detectCores(), + ver +) } \arguments{ \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
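Usage note (editor's addition, not part of the patch): a minimal sketch of how the newly exported actor_merger() is intended to be called, based on the roxygen documentation in R/actor_merger.R above. The input objects and the actor/party ids below are hypothetical; per the docs, df is produced by sentencizer() and actors_meta by elasticizer(index = "actors").

    # Assumes the package exporting actor_merger() is loaded.

    # Default call (ids = NULL): returns one row per article-actor pair, covering
    # individual actors ("A_..."), party aggregations ("..._mfs"), party-actor
    # aggregations ("..._mfsa"), and overall actor sentiment (ids = "all").
    actor_df <- actor_merger(df, actors_meta)

    # Custom merges: each vector in the list yields one aggregate per article,
    # labeled with the ids pasted together by '-' (see grouper() in the source).
    # "P_01", "P_02", "A_0001" and "A_0002" are hypothetical ids.
    blocs <- list(c("P_01", "P_02"), c("A_0001", "A_0002"))
    bloc_df <- actor_merger(df, actors_meta, ids = blocs)

Note that in the ids branch the function never touches actors_meta, so the custom aggregations carry no actor metadata columns; only the default (ids = NULL) output joins actors_meta_bydate by actor id and publication date.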