actor_merger: added function for generating actor-document data frames

actor_fetcher: removed from package
other: major update to documentation
master
Your Name 5 years ago
parent 4e867214dd
commit f022312485

NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 export(actor_fetcher)
+export(actor_merger)
 export(actorizer)
 export(bulk_writer)
 export(class_update)

R/actor_merger.R
@@ -0,0 +1,178 @@
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#'
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param ids Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @return When ids is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations, and overall actor sentiment (regardless of specific actors). When ids is provided, returns aggregations for each vector in the list.
#' @import dplyr tidyr stringr
#' @export
#' @examples
#' actor_merger(df, actors_meta, ids = NULL)
#################################################################################################
#################################### Generate actor-article dataset #############################
#################################################################################################
actor_merger <- function(df, actors_meta, ids = NULL) {
  ## Helper: aggregate actor mentions to the article level for one vector of actor ids
  grouper <- function(id, df) {
    return(df %>%
             rowwise() %>%
             filter(length(intersect(id, ids)) > 0) %>% # Keep sentences mentioning any of the target ids
             group_by(`_id`) %>%
             summarise(actor.sent = sum(sent_sum)/sum(words),
                       actor.sent_sum = sum(sent_sum),
                       actor.sent_words = sum(sent_words),
                       actor.words = sum(words),
                       actor.arousal = sum(sent_words)/sum(words),
                       actor.first = first(sentence_id),
                       actor.occ = n(),
                       publication_date = as.Date(first(`_source.publication_date`)),
                       doctype = first(`_source.doctype`)) %>%
             mutate(
               ids = str_c(id, collapse = '-')
             )
    )
  }
  ## Split off the text-level metadata from the source df
  text_sent <- df %>%
    select(`_id`, starts_with("text."), -ends_with("sent_lemmas"))
  df <- df %>%
    select(-ends_with("sent_lemmas"), -starts_with("text.")) %>%
    unnest(cols = colnames(.)) ## Unnest to sentence level
  ## Create placeholder variables if sentiment is not scored
  if (!"sent_sum" %in% colnames(df)) {
    df <- df %>%
      mutate(
        sent_words = 0,
        sent_sum = 0
      )
  }
  ## Create aggregations according to the list of actorId vectors in ids
  if (!is.null(ids)) {
    output <- lapply(ids, grouper, df = df) %>%
      bind_rows(.) %>%
      left_join(text_sent, by = "_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(output)
  } else {
    ## Overall actor sentiment per article; `ids` here is the sentence-level list
    ## column (data masking), so only sentences mentioning at least one actor are kept
    all <- df %>%
      rowwise() %>%
      filter(!is.null(unlist(ids))) %>%
      group_by(`_id`) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = as.Date(first(`_source.publication_date`)),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = "all"
      )
    df <- df %>%
      unnest(cols = ids) %>% ## Unnest to actor level
      mutate(
        `_source.publication_date` = as.Date(`_source.publication_date`)
      )
    ## Create aggregate measures for individual actors
    actors <- df %>%
      filter(str_starts(ids, "A_")) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)
      )
    ## Create actor metadata dataframe per active date (one row per day per actor)
    colnames(actors_meta) <- str_replace(colnames(actors_meta), '_source.', '')
    actors_meta <- actors_meta[-1128,] # Hardcoded drop of row 1128 (presumably a known faulty entry)
    actors_meta_bydate <- actors_meta %>%
      mutate(
        startDate = as.Date(startDate),
        endDate = as.Date(endDate)
      ) %>%
      select(
        lastName, firstName, `function`, gender, yearOfBirth, parlPeriod, partyId, ministerName, ministryId, actorId, startDate, endDate
      ) %>%
      rowwise() %>%
      mutate(
        publication_date = list(seq(from = startDate, to = endDate, by = "day")),
        ids = actorId
      ) %>%
      unnest(cols = publication_date)
    ## Join the actor metadata with the article data by actor id and date
    actors <- actors %>%
      left_join(., actors_meta_bydate, by = c("ids", "publication_date"))
    ## Generate party-actor aggregations (mfsa)
    parties_actors <- df %>%
      filter(str_starts(ids, "P_")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3) # Strip the two-character suffix (e.g. _f/_s)
      ) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids, "_mfsa")
      )
    ## Generate party aggregations (mfs)
    parties <- df %>%
      filter(str_ends(ids, "_f") | str_ends(ids, "_s")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`, ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids, "_mfs")
      )
    ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
    df <- bind_rows(actors, parties, parties_actors, all) %>%
      left_join(text_sent, by = "_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(df)
  }
}
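
A minimal usage sketch of the new function, assuming df was produced by sentencizer() and actors_meta by elasticizer(index = "actors") (this snippet is not part of the commit):

## Minimal usage sketch, assuming df from sentencizer() and
## actors_meta from elasticizer(index = "actors")
library(dplyr)
library(tidyr)
library(stringr)
out <- actor_merger(df, actors_meta)
## out holds one row per article per aggregation level: individual actors
## ("A_..."), parties ("..._mfs"), party-actor groups ("..._mfsa"), and
## overall actor sentiment ("all")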

man/actor_fetcher.Rd
@@ -4,13 +4,21 @@
 \alias{actor_fetcher}
 \title{Generate actor data frames (with sentiment) from database}
 \usage{
-actor_fetcher(out, sent_dict = NULL, cores = 1, localhost = NULL,
-  validation = F)
+actor_fetcher(
+  out,
+  sent_dict = NULL,
+  actor_ids = NULL,
+  cores = 1,
+  localhost = NULL,
+  validation = F
+)
 }
 \arguments{
 \item{out}{Data frame produced by elasticizer}
-\item{sent_dict}{Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)}
+\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.}
+\item{actor_ids}{Optional vector containing the actor ids to be collected}
 \item{cores}{Number of threads to use for parallel processing}
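
For illustration, a sent_dict matching the description above could be constructed as follows (the words and values are hypothetical, not shipped with the package):

sent_dict <- data.frame(
  lem_u = c("goed_ADJ", "slecht_ADJ"), # lemma_upos pairs
  lemma = c("goed", "slecht"),         # bare lemmas
  prox  = c(1, -1)                     # word sentiment values, or NA if not applicable
)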

man/actor_merger.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actor_merger.R
\name{actor_merger}
\alias{actor_merger}
\title{Aggregate sentence-level dataset containing actors (from sentencizer())}
\usage{
actor_merger(df, actors_meta, ids = NULL)
}
\arguments{
\item{df}{Data frame with actor ids, produced by sentencizer}
\item{actors_meta}{Data frame containing actor metadata obtained using elasticizer(index="actors")}
\item{ids}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}
}
\value{
When ids is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations, and overall actor sentiment (regardless of specific actors). When ids is provided, returns aggregations for each vector in the list.
}
\description{
Aggregate sentence-level dataset containing actors (from sentencizer())
}
\examples{
actor_merger(df, actors_meta, ids = NULL)
}
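
A sketch of the ids argument: each vector in the list becomes one merged aggregate (the actor id values below are hypothetical):

ids <- list(
  c("A_101", "A_102", "A_103"), # e.g. all actors of left-wing parties
  c("A_201", "A_202")
)
actor_merger(df, actors_meta, ids = ids)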

man/actorizer.Rd
@@ -4,8 +4,18 @@
 \alias{actorizer}
 \title{Updater function for elasticizer: Conduct actor searches}
 \usage{
-actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags,
-  es_super, ver, cores = 1)
+actorizer(
+  out,
+  localhost = F,
+  ids,
+  prefix,
+  postfix,
+  pre_tags,
+  post_tags,
+  es_super,
+  ver,
+  cores = 1
+)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

man/class_update.Rd
@@ -4,9 +4,18 @@
 \alias{class_update}
 \title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information}
 \usage{
-class_update(out, localhost = T, model_final, varname, text, words,
-  clean, ver, es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = 1)
+class_update(
+  out,
+  localhost = T,
+  model_final,
+  varname,
+  text,
+  words,
+  clean,
+  ver,
+  es_super = .rs.askForPassword("ElasticSearch WRITE"),
+  cores = 1
+)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

man/cv_generator.Rd
@@ -4,16 +4,14 @@
 \alias{cv_generator}
 \title{Generate CV folds for nested cross-validation}
 \usage{
-cv_generator(outer_k, inner_k, dfm, class_type, grid, seed)
+cv_generator(outer_k, inner_k, vec, grid, seed)
 }
 \arguments{
 \item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1, holdout sampling is used, with outer_k being the proportion of test data}
 \item{inner_k}{Number of inner CV (parameter optimization) folds}
-\item{dfm}{DFM containing the labeled documents}
-\item{class_type}{Name of the column in docvars containing the classification}
+\item{vec}{Vector containing the true values of the classification}
 \item{grid}{Parameter grid for optimization}
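
As a conceptual sketch of the outer_k semantics (not the package's internal code): with outer_k >= 2 the data are split into outer_k folds, while outer_k < 1 yields a single holdout split of that proportion.

## Conceptual sketch of the outer_k semantics; `vec` is an assumed label vector
set.seed(42)
vec <- sample(c("policy", "other"), 100, replace = TRUE)
outer_k <- 0.2 # < 1, so holdout sampling with 20% test data
test_idx  <- sample(seq_along(vec), size = round(outer_k * length(vec)))
train_idx <- setdiff(seq_along(vec), test_idx)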

man/dfm_gen.Rd
@@ -4,8 +4,7 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1,
-  tolower = T)
+dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}

man/dupe_detect.Rd
@@ -4,8 +4,17 @@
 \alias{dupe_detect}
 \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
 \usage{
-dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
-  words, localhost = T, ver)
+dupe_detect(
+  row,
+  grid,
+  cutoff_lower,
+  cutoff_upper = 1,
+  es_pwd,
+  es_super,
+  words,
+  localhost = T,
+  ver
+)
 }
 \arguments{
 \item{row}{Row of grid to parse}
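
The cosine-similarity criterion in the title can be illustrated with a small conceptual sketch (dupe_detect itself applies this to document term vectors):

## Conceptual illustration of the cosine-similarity criterion
cosine_sim <- function(a, b) sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
cosine_sim(c(1, 2, 0), c(1, 2, 1)) # ~0.91, flagged as duplicate if above the cutoff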

man/elasticizer.Rd
@@ -4,10 +4,18 @@
 \alias{elasticizer}
 \title{Generate a data frame out of unparsed Elasticsearch JSON}
 \usage{
-elasticizer(query, src = T, index = "maml",
-  es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024,
-  max_batch = Inf, time_scroll = "5m", update = NULL,
-  localhost = F, ...)
+elasticizer(
+  query,
+  src = T,
+  index = "maml",
+  es_pwd = .rs.askForPassword("Elasticsearch READ"),
+  batch_size = 1024,
+  max_batch = Inf,
+  time_scroll = "5m",
+  update = NULL,
+  localhost = F,
+  ...
+)
 }
 \arguments{
 \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
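
A minimal example of such a query string (the field name and value are hypothetical):

## Hypothetical query in the Elasticsearch query DSL
query <- '{"query": {"match": {"doctype": "newspaper"}}}'
out <- elasticizer(query) # prompts for the READ password by default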

man/estimator.Rd
@@ -4,7 +4,16 @@
 \alias{estimator}
 \title{Generate models and get classifications on test sets}
 \usage{
-estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
+estimator(
+  row,
+  grid,
+  outer_folds,
+  inner_folds,
+  dfm,
+  class_type,
+  model,
+  we_vectors
+)
 }
 \arguments{
 \item{row}{Row number of current item in grid}
@@ -18,6 +27,8 @@ estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
 \item{class_type}{Name of column in docvars() containing the classes}
 \item{model}{Model to use (currently only nb)}
+\item{we_vectors}{Matrix with word embedding vectors}
 }
 \value{
 Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.

man/lemma_writer.Rd
@@ -4,8 +4,7 @@
 \alias{lemma_writer}
 \title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
 \usage{
-lemma_writer(out, file, localhost = F, documents = F, lemma = F,
-  cores = 1)
+lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}

man/modelizer.Rd
@@ -4,8 +4,19 @@
 \alias{modelizer}
 \title{Generate a classification model}
 \usage{
-modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
-  seed, model, cores = 1)
+modelizer(
+  dfm,
+  outer_k,
+  inner_k,
+  class_type,
+  opt_measure,
+  country,
+  grid,
+  seed,
+  model,
+  we_vectors,
+  cores = 1
+)
 }
 \arguments{
 \item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
@@ -26,10 +37,12 @@ modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
 \item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
+\item{we_vectors}{Matrix with word embedding vectors}
 \item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1}
 }
 \value{
-An .Rds file in the current working directory (getwd()) with a list containing all relevant output
+A list containing all relevant output
 }
 \description{
 Generate a nested cross validated classification model based on a dfm with class labels as docvars
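
A hedged call sketch (all argument values below are hypothetical, and dfm and grid are assumed to exist):

## Hedged call sketch; values are hypothetical, dfm and grid assumed to exist
model_out <- modelizer(dfm, outer_k = 5, inner_k = 3, class_type = "class",
                       opt_measure = "f1", country = "nl", grid = grid,
                       seed = 42, model = "nb", we_vectors = NULL, cores = 4)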

man/modelizer_old.Rd
@@ -4,8 +4,21 @@
 \alias{modelizer_old}
 \title{Generate a classification model}
 \usage{
-modelizer_old(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
-  outer_k, inner_k, model, class_type, opt_measure, country, grid)
+modelizer_old(
+  dfm,
+  cores_outer,
+  cores_grid,
+  cores_inner,
+  cores_feats,
+  seed,
+  outer_k,
+  inner_k,
+  model,
+  class_type,
+  opt_measure,
+  country,
+  grid
+)
 }
 \arguments{
 \item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}

man/preproc.Rd
@@ -4,7 +4,7 @@
 \alias{preproc}
 \title{Preprocess dfm data for use in modeling procedure}
 \usage{
-preproc(dfm_train, dfm_test = NULL, params)
+preproc(dfm_train, dfm_test = NULL, params, we_vectors)
 }
 \arguments{
 \item{dfm_train}{Training dfm}
@@ -12,6 +12,8 @@ preproc(dfm_train, dfm_test = NULL, params)
 \item{dfm_test}{Testing dfm if applicable, otherwise NULL}
 \item{params}{Row from grid with parameter optimization}
+\item{we_vectors}{Matrix with word embedding vectors}
 }
 \value{
 List with dfm_train and dfm_test, processed according to parameters in params
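
One plausible reading of the we_vectors parameter is a feature-to-embedding projection; a conceptual sketch under that assumption (not the package's verified implementation):

## Conceptual sketch: document embeddings as dfm %*% word-vector matrix,
## aligned on shared features (assumption, not the package's verified code)
library(quanteda)
shared  <- intersect(featnames(dfm_train), rownames(we_vectors))
doc_emb <- as.matrix(dfm_train[, shared]) %*% we_vectors[shared, ]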

man/sentencizer.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/sentencizer.R
 \name{sentencizer}
 \alias{sentencizer}
-\title{Generate actor data frames (with sentiment) from database}
+\title{Generate sentence-level dataset with sentiment and actor presence}
 \usage{
 sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
 }
@@ -10,13 +10,15 @@ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
 \item{out}{Data frame produced by elasticizer}
 \item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.}
+\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
 }
 \value{
 No return value, data per batch is saved in an RDS file
 }
 \description{
-Generate actor data frames (with sentiment) from database
+Generate sentence-level dataset with sentiment and actor presence
 }
 \examples{
-sentencizer(out, sent_dict = NULL)
+sentencizer(out, sent_dict = NULL, validation = F)
 }

man/ud_update.Rd
@@ -4,9 +4,14 @@
 \alias{ud_update}
 \title{Elasticizer update function: generate UDpipe output from base text}
 \usage{
-ud_update(out, localhost = T, udmodel,
-  es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = detectCores(), ver)
+ud_update(
+  out,
+  localhost = T,
+  udmodel,
+  es_super = .rs.askForPassword("ElasticSearch WRITE"),
+  cores = detectCores(),
+  ver
+)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
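
The underlying annotation step can be sketched with the udpipe package (the model language here is an assumption):

## Sketch of the underlying UDpipe step; the model language is an assumption
library(udpipe)
dl      <- udpipe_download_model(language = "dutch")
udmodel <- udpipe_load_model(dl$file_model)
ann     <- as.data.frame(udpipe_annotate(udmodel, x = "Een korte voorbeeldzin."))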
