diff --git a/NAMESPACE b/NAMESPACE index 5150447..3bea41b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,7 @@ # Generated by roxygen2: do not edit by hand -export(actor_aggregation) export(actor_fetcher) export(actorizer) -export(aggregator) -export(aggregator_elastic) export(bulk_writer) export(class_update) export(dfm_gen) diff --git a/R/lemma_writer.R b/R/lemma_writer.R index d4daf45..6c6368c 100644 --- a/R/lemma_writer.R +++ b/R/lemma_writer.R @@ -2,7 +2,9 @@ #' #' Generates text output files (without punctuation) for external applications, such as GloVe embeddings #' @param out The elasticizer-generated data frame -#' @param file The file to write the output to (including path, when required) +#' @param file The file to write the output to (including path, when required). When documents = T, provide path including trailing / +#' @param documents Indicate whether the writer should output to a single file, or individual documents +#' @param cores Indicate the number of cores to use for parallel processing #' @param localhost Unused, but defaults to FALSE #' @return A Quanteda dfm #' @export @@ -14,8 +16,18 @@ #################################### Lemma text file generator ############################# ################################################################################################# -lemma_writer <- function(out, file, localhost = F) { - out <- unnest(out,`_source.ud`) - lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_') - cat(lemma, file = file, append = T) +lemma_writer <- function(out, file, localhost = F, documents = F, cores = 1) { + plan(multiprocess, workers = cores) + par_writer <- function(row, out) { + cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F) + } + if (documents == F) { + out <- unnest(out,`_source.ud`) + lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_') + cat(lemma, file = file, append = T) + } + if (documents == T) { + out <- out_parser(out, field = '_source', clean = F, cores = cores) + future_lapply(1:nrow(out), par_writer, out = out) + } } diff --git a/man/actor_aggregation.Rd b/man/actor_aggregation.Rd deleted file mode 100644 index 8d60aff..0000000 --- a/man/actor_aggregation.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/actor_aggregation.R -\name{actor_aggregation} -\alias{actor_aggregation} -\title{Generate aggregated actor measures from raw data} -\usage{ -actor_aggregation(row, actors, es_pwd, localhost, - default_operator = "OR", sent_dict = NULL, cores = detectCores()) -} -\arguments{ -\item{row}{The row of the actors data frame used for aggregation} - -\item{actors}{The data frame containing actor data} - -\item{es_pwd}{The password for read access to ES} - -\item{localhost}{Boolean indicating if the script is running locally or not} - -\item{default_operator}{String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR} -} -\value{ -No return value, data per actor is saved in an RDS file -} -\description{ -Generate aggregated actor measures from raw data -} -\examples{ -actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR') -} diff --git a/man/actorizer.Rd b/man/actorizer.Rd index 1345941..a12d4f9 100644 --- a/man/actorizer.Rd +++ b/man/actorizer.Rd @@ -5,7 +5,7 @@ \title{Updater function for elasticizer: Conduct actor searches} \usage{ actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, - es_super, ver, cores = detectCores()) + es_super, ver, cores = 1) } \arguments{ \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} diff --git a/man/aggregator.Rd b/man/aggregator.Rd deleted file mode 100644 index 22841ef..0000000 --- a/man/aggregator.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aggregator.R -\name{aggregator} -\alias{aggregator} -\title{Aggregator function, to aggregate actor results} -\usage{ -aggregator(id, actor_df, merge_id) -} -\arguments{ -\item{id}{Article id of the article for which actor aggregation should be done} - -\item{actor_df}{The dataframe containing the actor data} - -\item{merge_id}{The actorid that should be assigned to the merged result} -} -\value{ -A dataframe with the merged results -} -\description{ -Aggregator function, to aggregate actor results -} -\examples{ -aggregator(id, actor_df, merge_id) -} diff --git a/man/aggregator_elastic.Rd b/man/aggregator_elastic.Rd deleted file mode 100644 index 547b1dc..0000000 --- a/man/aggregator_elastic.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aggregator_elastic.R -\name{aggregator_elastic} -\alias{aggregator_elastic} -\title{Generate and store aggregate actor measures to elasticsearch} -\usage{ -aggregator_elastic(out, localhost = F, actorids, ver, es_super) -} -\arguments{ -\item{out}{The output provided by elasticizer()} - -\item{localhost}{Boolean indicating if the script should run locally, or remote} - -\item{actorids}{List of actorids used in the search, should be the same as the actorids used for elasticizer()} - -\item{ver}{String indicating the version of the update} - -\item{es_super}{Write password for ES} -} -\value{ -Return value is based on output of elastic_update() -} -\description{ -Generate and store aggregate actor measures to elasticsearch -} -\examples{ -aggregator_elastic(out, localhost = F, actorids, ver, es_super) -} diff --git a/man/lemma_writer.Rd b/man/lemma_writer.Rd index bc2a099..4cd8305 100644 --- a/man/lemma_writer.Rd +++ b/man/lemma_writer.Rd @@ -4,14 +4,18 @@ \alias{lemma_writer} \title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings} \usage{ -lemma_writer(out, file, localhost = F) +lemma_writer(out, file, localhost = F, documents = F, cores = 1) } \arguments{ \item{out}{The elasticizer-generated data frame} -\item{file}{The file to write the output to (including path, when required)} +\item{file}{The file to write the output to (including path, when required). When documents = T, provide path including trailing /} \item{localhost}{Unused, but defaults to FALSE} + +\item{documents}{Indicate whether the writer should output to a single file, or individual documents} + +\item{cores}{Indicate the number of cores to use for parallel processing} } \value{ A Quanteda dfm diff --git a/man/out_parser.Rd b/man/out_parser.Rd index eb8cae7..666eabb 100644 --- a/man/out_parser.Rd +++ b/man/out_parser.Rd @@ -4,7 +4,7 @@ \alias{out_parser} \title{Parse raw text into a single field} \usage{ -out_parser(out, field, clean = F, cores = detectCores()) +out_parser(out, field, clean = F, cores = 1) } \arguments{ \item{out}{The original output data frame}