actor_merger: added function for generating actor-document data frames

actor_fetcher: removed from package
other: major update to documentation
master
Your Name 5 years ago
parent 4e867214dd
commit f022312485

@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand
export(actor_fetcher)
+export(actor_merger)
export(actorizer)
export(bulk_writer)
export(class_update)

@@ -38,31 +38,31 @@ actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, lo
### Party ids including actors
p_ids_a <- c(p_ids,str_c(pid,'_a'))
summarizer <- function (p_ids, dupe_df, merged_id) {
  id <- dupe_df$`_id`[[1]]
  dupe_df <- dupe_df %>%
    filter(ids %in% p_ids)
  if (nrow(dupe_df) > 0) {
    return(
      dupe_df %>% summarise(
        `_id` = first(`_id`),
        `_source.doctype` = first(`_source.doctype`),
        `_source.publication_date` = first(`_source.publication_date`),
        prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
        sentence_id = list(sort(unique(unlist(sentence_id)))),
        rel_first = list(max(unlist(rel_first))),
        ids = merged_id,
        occ = list(length(unique(unlist(sentence_id)))),
        first = list(min(unlist(sentence_id))),
        actor_start = list(sort(unique(unlist(actor_start)))),
        actor_end = list(sort(unique(unlist(actor_end)))),
        sentence_start = list(sort(unique(unlist(sentence_start)))),
        sentence_end = list(sort(unique(unlist(sentence_end))))
      )
    )
  } else {
    print(paste0('id:',id))
    return(NULL)
  }
}
party <- summarizer(p_ids, dupe_df, str_c(pid,'_mfs'))
party_actor <- summarizer(p_ids_a, dupe_df, str_c(pid,'_mfsa'))
@@ -77,15 +77,15 @@ actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, lo
select(-one_of('exists')) %>%
unnest() %>%
filter(upos != 'PUNCT') # For getting proper word counts
if ("lem_u" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>%
mutate(lem_u = str_c(lemma,'_',upos)) %>%
left_join(sent_dict, by = 'lem_u')
} else if ("lemma" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>%
left_join(sent_dict, by = 'lemma') %>%
mutate(lem_u = lemma)
}
if ("lem_u" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>%
mutate(lem_u = str_c(lemma,'_',upos)) %>%
left_join(sent_dict, by = 'lem_u')
} else if ("lemma" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>%
left_join(sent_dict, by = 'lemma') %>%
mutate(lem_u = lemma)
}
ud_sent <- ud_sent %>%
group_by(sentence_id) %>%
mutate(
@@ -177,10 +177,10 @@ actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, lo
yearweek = strftime(`_source.publication_date`, format = "%Y%V")
) %>%
select(#-`_source.computerCodes.actorsDetail`,
-`_score`,
-`_index`,
-`_type`,
-pids)
return(out_row)
}
saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
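
For reference, the dictionary join in the hunk above dispatches on whichever column sent_dict provides, matching the format described in the updated actor_fetcher.Rd below. A minimal sketch of the two accepted dictionary shapes and a hypothetical helper join_sent_dict() that mirrors the dispatch (the example lemmas and prox values are invented):

library(dplyr)
library(stringr)

## Shape 1: lemma_upos pairs in a "lem_u" column (invented values)
sent_dict_u <- tibble(lem_u = c("good_ADJ", "bad_ADJ"), prox = c(1, -1))
## Shape 2: plain lemmas in a "lemma" column (invented values)
sent_dict_l <- tibble(lemma = c("good", "bad"), prox = c(1, -1))

join_sent_dict <- function(ud_sent, sent_dict) {
  if ("lem_u" %in% colnames(sent_dict)) {
    ud_sent %>%
      mutate(lem_u = str_c(lemma, '_', upos)) %>%
      left_join(sent_dict, by = 'lem_u')
  } else if ("lemma" %in% colnames(sent_dict)) {
    ud_sent %>%
      left_join(sent_dict, by = 'lemma') %>%
      mutate(lem_u = lemma)
  } else {
    ud_sent ## no usable dictionary column; return unchanged
  }
}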

@@ -0,0 +1,178 @@
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#'
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param ids Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @return When ids is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids is provided, returns aggregations for each vector in the list
#' @export
#' @examples
#' actor_merger(df, actors_meta, ids = NULL)
#################################################################################################
#################################### Generate actor-article dataset #############################
#################################################################################################
actor_merger <- function(df, actors_meta, ids = NULL) {
  grouper <- function(id, df) {
    return(df %>%
      rowwise() %>%
      filter(length(intersect(id,ids)) > 0) %>%
      group_by(`_id`) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = as.Date(first(`_source.publication_date`)),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(id, collapse = '-')
      )
    )
  }
  ## Remove some of the metadata from the source df
  text_sent <- df %>%
    select(`_id`,starts_with("text."),-ends_with("sent_lemmas"))
  df <- df %>%
    select(-ends_with("sent_lemmas"),-starts_with("text.")) %>%
    unnest(cols = colnames(.)) ## Unnest to sentence level
  ## Create bogus variables if sentiment is not scored
  if(!"sent_sum" %in% colnames(df)) {
    df <- df %>%
      mutate(
        sent_words = 0,
        sent_sum = 0
      )
  }
  ## Create aggregations according to list of actorId vectors in ids
  if(!is.null(ids)) {
    output <- lapply(ids,grouper, df = df) %>%
      bind_rows(.) %>%
      left_join(text_sent, by="_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(output)
  } else {
    ## Aggregate over all actor mentions per article, regardless of specific actor
    all <- df %>%
      rowwise() %>%
      filter(!is.null(unlist(ids))) %>%
      group_by(`_id`) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = as.Date(first(`_source.publication_date`)),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = "all"
      )
    df <- df %>%
      unnest(cols = ids) %>% ## Unnest to actor level
      mutate(
        `_source.publication_date` = as.Date(`_source.publication_date`)
      )
    ## Create aggregate measures for individual actors
    actors <- df %>%
      filter(str_starts(ids,"A_")) %>%
      group_by(`_id`,ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)
      )
    ## Create actor metadata dataframe per active date (one row per day per actor)
    colnames(actors_meta) <- str_replace(colnames(actors_meta),'_source.','')
    actors_meta <- actors_meta[-1128,] ## Drop one specific metadata row by position (hardcoded)
    actors_meta_bydate <- actors_meta %>%
      mutate(
        startDate = as.Date(startDate),
        endDate = as.Date(endDate)
      ) %>%
      select(
        lastName,firstName,`function`,gender,yearOfBirth,parlPeriod,partyId,ministerName,ministryId,actorId,startDate,endDate
      ) %>%
      rowwise() %>%
      mutate(
        publication_date = list(seq(from=startDate, to=endDate,by="day")),
        ids = actorId
      ) %>%
      unnest(cols=publication_date)
    ## Join the actor metadata with the article data by actor id and date
    actors <- actors %>%
      left_join(.,actors_meta_bydate, by=c("ids","publication_date"))
    ## Generate party-actor aggregations (mfsa)
    parties_actors <- df %>%
      filter(str_starts(ids,"P_")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`,ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids,"_mfsa")
      )
    ## Generate party aggregations (mfs)
    parties <- df %>%
      filter(str_ends(ids,"_f") | str_ends(ids,"_s")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`,ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids,"_mfs")
      )
    ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
    df <- bind_rows(actors, parties, parties_actors, all) %>%
      left_join(text_sent, by="_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(df)
  }
}
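
A usage sketch for the new actor_merger() (the ids below are invented; the "A_"/"P_" prefixes follow the str_starts() filters above). In the output, actor.prom is actor.occ/text.sentences and actor.rel_first is 1-(actor.first/text.sentences), per the final mutate():

## df: sentence-level data from sentencizer()
## actors_meta: from elasticizer(index = "actors")

## Default mode: individual actors (A_*), party (_mfs) and party-actor (_mfsa)
## aggregations, plus an "all" row per article
out <- actor_merger(df, actors_meta)

## Merged mode: one aggregation per vector of actor ids (invented ids)
out_merged <- actor_merger(df, actors_meta,
                           ids = list(c("P_01", "P_02"),    # hypothetical party bloc
                                      c("A_123", "A_456"))) # hypothetical actor group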

@@ -4,13 +4,21 @@
\alias{actor_fetcher}
\title{Generate actor data frames (with sentiment) from database}
\usage{
-actor_fetcher(out, sent_dict = NULL, cores = 1, localhost = NULL,
-  validation = F)
+actor_fetcher(
+  out,
+  sent_dict = NULL,
+  actor_ids = NULL,
+  cores = 1,
+  localhost = NULL,
+  validation = F
+)
}
\arguments{
\item{out}{Data frame produced by elasticizer}
-\item{sent_dict}{Optional dataframe containing the sentiment dictionary (see sentiment paper scripts for details on format)}
+\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.}
+\item{actor_ids}{Optional vector containing the actor ids to be collected}
\item{cores}{Number of threads to use for parallel processing}

@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actor_merger.R
\name{actor_merger}
\alias{actor_merger}
\title{Aggregate sentence-level dataset containing actors (from sentencizer())}
\usage{
actor_merger(df, actors_meta, ids = NULL)
}
\arguments{
\item{df}{Data frame with actor ids, produced by sentencizer}
\item{actors_meta}{Data frame containing actor metadata obtained using elasticizer(index="actors")}
\item{ids}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}
}
\value{
When ids is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids is provided, returns aggregations for each vector in the list
}
\description{
Aggregate sentence-level dataset containing actors (from sentencizer())
}
\examples{
actor_merger(df, actors_meta, ids = NULL)
}
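
One detail of actor_merger() worth illustrating is the metadata join: each actor's active period is expanded to one row per day, so article data can be joined by actor id and publication date. A toy sketch of that pattern (invented actors and dates):

library(dplyr)
library(tidyr)

meta <- tibble(actorId   = c("A_1", "A_2"),
               startDate = as.Date(c("2012-01-01", "2012-01-03")),
               endDate   = as.Date(c("2012-01-03", "2012-01-04")))

meta_bydate <- meta %>%
  rowwise() %>%
  mutate(publication_date = list(seq(from = startDate, to = endDate, by = "day"))) %>%
  unnest(cols = publication_date)

## meta_bydate now holds one row per actor per active day, ready to be
## left_join()ed to article rows by actor id and date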

@@ -4,8 +4,18 @@
\alias{actorizer}
\title{Updater function for elasticizer: Conduct actor searches}
\usage{
-actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags,
-  es_super, ver, cores = 1)
+actorizer(
+  out,
+  localhost = F,
+  ids,
+  prefix,
+  postfix,
+  pre_tags,
+  post_tags,
+  es_super,
+  ver,
+  cores = 1
+)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

@@ -4,9 +4,18 @@
\alias{class_update}
\title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information}
\usage{
-class_update(out, localhost = T, model_final, varname, text, words,
-  clean, ver, es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = 1)
+class_update(
+  out,
+  localhost = T,
+  model_final,
+  varname,
+  text,
+  words,
+  clean,
+  ver,
+  es_super = .rs.askForPassword("ElasticSearch WRITE"),
+  cores = 1
+)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

@@ -4,16 +4,14 @@
\alias{cv_generator}
\title{Generate CV folds for nested cross-validation}
\usage{
-cv_generator(outer_k, inner_k, dfm, class_type, grid, seed)
+cv_generator(outer_k, inner_k, vec, grid, seed)
}
\arguments{
\item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data}
\item{inner_k}{Number of inner CV (parameter optimization) folds}
-\item{dfm}{DFM containing the labeled documents}
-\item{class_type}{Name of the column in docvars containing the classification}
+\item{vec}{Vector containing the true values of the classification}
\item{grid}{Parameter grid for optimization}
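
Given the outer_k semantics above, fold-based and holdout setups go through the same call; a hypothetical invocation, assuming labels and param_grid exist:

## 10-fold outer CV with 5-fold inner CV
folds <- cv_generator(outer_k = 10, inner_k = 5, vec = labels, grid = param_grid, seed = 42)
## Holdout sampling: 20% of the data reserved as test set
folds_ho <- cv_generator(outer_k = 0.2, inner_k = 5, vec = labels, grid = param_grid, seed = 42)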

@@ -4,8 +4,7 @@
\alias{dfm_gen}
\title{Generates dfm from ElasticSearch output}
\usage{
-dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1,
-  tolower = T)
+dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)
}
\arguments{
\item{out}{The elasticizer-generated data frame}

@@ -4,8 +4,17 @@
\alias{dupe_detect}
\title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
\usage{
-dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
-  words, localhost = T, ver)
+dupe_detect(
+  row,
+  grid,
+  cutoff_lower,
+  cutoff_upper = 1,
+  es_pwd,
+  es_super,
+  words,
+  localhost = T,
+  ver
+)
}
\arguments{
\item{row}{Row of grid to parse}

@@ -4,10 +4,18 @@
\alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
-elasticizer(query, src = T, index = "maml",
-  es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024,
-  max_batch = Inf, time_scroll = "5m", update = NULL,
-  localhost = F, ...)
+elasticizer(
+  query,
+  src = T,
+  index = "maml",
+  es_pwd = .rs.askForPassword("Elasticsearch READ"),
+  batch_size = 1024,
+  max_batch = Inf,
+  time_scroll = "5m",
+  update = NULL,
+  localhost = F,
+  ...
+)
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}

@@ -4,7 +4,16 @@
\alias{estimator}
\title{Generate models and get classifications on test sets}
\usage{
-estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
+estimator(
+  row,
+  grid,
+  outer_folds,
+  inner_folds,
+  dfm,
+  class_type,
+  model,
+  we_vectors
+)
}
\arguments{
\item{row}{Row number of current item in grid}
@@ -18,6 +27,8 @@ estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
\item{class_type}{Name of column in docvars() containing the classes}
\item{model}{Model to use (currently only nb)}
+\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.

@@ -4,8 +4,7 @@
\alias{lemma_writer}
\title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
\usage{
-lemma_writer(out, file, localhost = F, documents = F, lemma = F,
-  cores = 1)
+lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
}
\arguments{
\item{out}{The elasticizer-generated data frame}

@@ -4,8 +4,19 @@
\alias{modelizer}
\title{Generate a classification model}
\usage{
-modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
-  seed, model, cores = 1)
+modelizer(
+  dfm,
+  outer_k,
+  inner_k,
+  class_type,
+  opt_measure,
+  country,
+  grid,
+  seed,
+  model,
+  we_vectors,
+  cores = 1
+)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
@@ -26,10 +37,12 @@ modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
+\item{we_vectors}{Matrix with word embedding vectors}
\item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1}
}
\value{
-An .Rds file in the current working directory (getwd()) with a list containing all relevant output
+A list containing all relevant output
}
\description{
Generate a nested cross validated classification model based on a dfm with class labels as docvars

@@ -4,8 +4,21 @@
\alias{modelizer_old}
\title{Generate a classification model}
\usage{
-modelizer_old(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
-  outer_k, inner_k, model, class_type, opt_measure, country, grid)
+modelizer_old(
+  dfm,
+  cores_outer,
+  cores_grid,
+  cores_inner,
+  cores_feats,
+  seed,
+  outer_k,
+  inner_k,
+  model,
+  class_type,
+  opt_measure,
+  country,
+  grid
+)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}

@@ -4,7 +4,7 @@
\alias{preproc}
\title{Preprocess dfm data for use in modeling procedure}
\usage{
-preproc(dfm_train, dfm_test = NULL, params)
+preproc(dfm_train, dfm_test = NULL, params, we_vectors)
}
\arguments{
\item{dfm_train}{Training dfm}
@@ -12,6 +12,8 @@ preproc(dfm_train, dfm_test = NULL, params)
\item{dfm_test}{Testing dfm if applicable, otherwise NULL}
\item{params}{Row from grid with parameter optimization}
+\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
List with dfm_train and dfm_test, processed according to parameters in params

@@ -2,7 +2,7 @@
% Please edit documentation in R/sentencizer.R
\name{sentencizer}
\alias{sentencizer}
-\title{Generate actor data frames (with sentiment) from database}
+\title{Generate sentence-level dataset with sentiment and actor presence}
\usage{
sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
}
@@ -10,13 +10,15 @@ sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
\item{out}{Data frame produced by elasticizer}
\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.}
\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
}
\value{
No return value, data per batch is saved in an RDS file
}
\description{
-Generate actor data frames (with sentiment) from database
+Generate sentence-level dataset with sentiment and actor presence
}
\examples{
-sentencizer(out, sent_dict = NULL)
+sentencizer(out, sent_dict = NULL, validation = F)
}

@@ -4,9 +4,14 @@
\alias{ud_update}
\title{Elasticizer update function: generate UDpipe output from base text}
\usage{
-ud_update(out, localhost = T, udmodel,
+ud_update(
+  out,
+  localhost = T,
+  udmodel,
es_super = .rs.askForPassword("ElasticSearch WRITE"),
-cores = detectCores(), ver)
+  cores = detectCores(),
+  ver
+)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
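
As the actorizer and class_update titles above note, these updater functions are meant to be passed to elasticizer() through its update parameter, which forwards the remaining arguments. A hypothetical invocation for ud_update (query, udmodel and ver are assumed to be defined):

## Annotate all documents matched by query with UDpipe output
elasticizer(query = query,
            index = "maml",
            update = ud_update,
            localhost = T,
            udmodel = udmodel, ## a loaded udpipe language model (assumed)
            es_super = .rs.askForPassword("ElasticSearch WRITE"),
            cores = 4,
            ver = ver) ## annotation version identifier (assumed)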
