lemma_writer: updated to provide support for writing raw documents to individual files using UTF-8 encoding

Branch: master
Author: Erik de Vries, 5 years ago
Parent: 115297f597
Commit: 889e7e92af

@@ -1,10 +1,7 @@
 # Generated by roxygen2: do not edit by hand
-export(actor_aggregation)
 export(actor_fetcher)
 export(actorizer)
-export(aggregator)
-export(aggregator_elastic)
 export(bulk_writer)
 export(class_update)
 export(dfm_gen)

@@ -2,7 +2,9 @@
 #'
 #' Generates text output files (without punctuation) for external applications, such as GloVe embeddings
 #' @param out The elasticizer-generated data frame
-#' @param file The file to write the output to (including path, when required)
+#' @param file The file to write the output to (including path, when required). When documents = T, provide path including trailing /
+#' @param documents Indicate whether the writer should output to a single file, or individual documents
+#' @param cores Indicate the number of cores to use for parallel processing
 #' @param localhost Unused, but defaults to FALSE
 #' @return A Quanteda dfm
 #' @export
@@ -14,8 +16,18 @@
 #################################### Lemma text file generator #############################
 #################################################################################################
-lemma_writer <- function(out, file, localhost = F) {
+lemma_writer <- function(out, file, localhost = F, documents = F, cores = 1) {
+  plan(multiprocess, workers = cores)
+  par_writer <- function(row, out) {
+    cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
+  }
+  if (documents == F) {
   out <- unnest(out,`_source.ud`)
   lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_')
   cat(lemma, file = file, append = T)
+  }
+  if (documents == T) {
+    out <- out_parser(out, field = '_source', clean = F, cores = cores)
+    future_lapply(1:nrow(out), par_writer, out = out)
+  }
 }
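
Note: the added documents = T branch above writes each document's merged text to its own UTF-8 encoded .txt file, named after the document _id, in parallel. Below is a minimal standalone sketch of that pattern; the docs data frame, its columns, and the output directory are illustrative assumptions rather than real elasticizer() output, and multisession is used here since multiprocess is deprecated in newer versions of the future package.

library(future)
library(future.apply)

# Illustrative stand-in for an elasticizer-generated data frame (assumption)
docs <- data.frame(
  `_id`  = c("doc1", "doc2"),
  merged = c("First example document.", "Second example document."),
  check.names = FALSE, stringsAsFactors = FALSE
)
out_dir <- paste0(tempdir(), "/")  # stand-in for the trailing-slash path passed as `file`

plan(multisession, workers = 2)    # `multiprocess` is deprecated in current future releases

# Write one UTF-8 .txt file per row, named after the document id
write_doc <- function(row, docs, out_dir) {
  cat(iconv(docs[row, ]$merged, to = "UTF-8"),
      file = paste0(out_dir, docs[row, ]$`_id`, ".txt"),
      append = FALSE)
}
invisible(future_lapply(seq_len(nrow(docs)), write_doc, docs = docs, out_dir = out_dir))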

@@ -1,29 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/actor_aggregation.R
-\name{actor_aggregation}
-\alias{actor_aggregation}
-\title{Generate aggregated actor measures from raw data}
-\usage{
-actor_aggregation(row, actors, es_pwd, localhost,
-  default_operator = "OR", sent_dict = NULL, cores = detectCores())
-}
-\arguments{
-\item{row}{The row of the actors data frame used for aggregation}
-\item{actors}{The data frame containing actor data}
-\item{es_pwd}{The password for read access to ES}
-\item{localhost}{Boolean indicating if the script is running locally or not}
-\item{default_operator}{String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR}
-}
-\value{
-No return value, data per actor is saved in an RDS file
-}
-\description{
-Generate aggregated actor measures from raw data
-}
-\examples{
-actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
-}

@@ -5,7 +5,7 @@
 \title{Updater function for elasticizer: Conduct actor searches}
 \usage{
 actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags,
-  es_super, ver, cores = detectCores())
+  es_super, ver, cores = 1)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

@@ -1,24 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/aggregator.R
-\name{aggregator}
-\alias{aggregator}
-\title{Aggregator function, to aggregate actor results}
-\usage{
-aggregator(id, actor_df, merge_id)
-}
-\arguments{
-\item{id}{Article id of the article for which actor aggregation should be done}
-\item{actor_df}{The dataframe containing the actor data}
-\item{merge_id}{The actorid that should be assigned to the merged result}
-}
-\value{
-A dataframe with the merged results
-}
-\description{
-Aggregator function, to aggregate actor results
-}
-\examples{
-aggregator(id, actor_df, merge_id)
-}

@@ -1,28 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/aggregator_elastic.R
-\name{aggregator_elastic}
-\alias{aggregator_elastic}
-\title{Generate and store aggregate actor measures to elasticsearch}
-\usage{
-aggregator_elastic(out, localhost = F, actorids, ver, es_super)
-}
-\arguments{
-\item{out}{The output provided by elasticizer()}
-\item{localhost}{Boolean indicating if the script should run locally, or remote}
-\item{actorids}{List of actorids used in the search, should be the same as the actorids used for elasticizer()}
-\item{ver}{String indicating the version of the update}
-\item{es_super}{Write password for ES}
-}
-\value{
-Return value is based on output of elastic_update()
-}
-\description{
-Generate and store aggregate actor measures to elasticsearch
-}
-\examples{
-aggregator_elastic(out, localhost = F, actorids, ver, es_super)
-}

@@ -4,14 +4,18 @@
 \alias{lemma_writer}
 \title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
 \usage{
-lemma_writer(out, file, localhost = F)
+lemma_writer(out, file, localhost = F, documents = F, cores = 1)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
-\item{file}{The file to write the output to (including path, when required)}
+\item{file}{The file to write the output to (including path, when required). When documents = T, provide path including trailing /}
 \item{localhost}{Unused, but defaults to FALSE}
+\item{documents}{Indicate whether the writer should output to a single file, or individual documents}
+\item{cores}{Indicate the number of cores to use for parallel processing}
 }
 \value{
 A Quanteda dfm
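
As documented above, when documents = F the writer instead appends space-separated lemma_UPOS pairs (punctuation removed) to a single output file. A small self-contained illustration of that output format follows, using a made-up ud data frame in place of the unnested _source.ud annotations and a logical filter rather than the -which() subset used in the package.

library(stringr)

# Made-up universal dependencies annotations (assumption, not real package output)
ud <- data.frame(
  lemma = c("the", "quick", "fox", "."),
  upos  = c("DET", "ADJ", "NOUN", "PUNCT"),
  stringsAsFactors = FALSE
)

keep  <- ud$upos != "PUNCT"                # drop punctuation tokens
lemma <- str_c(ud$lemma[keep], ud$upos[keep], sep = "_")

out_file <- tempfile(fileext = ".txt")
cat(lemma, file = out_file, append = TRUE) # out_file now contains: the_DET quick_ADJ fox_NOUN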

@@ -4,7 +4,7 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field, clean = F, cores = detectCores())
+out_parser(out, field, clean = F, cores = 1)
 }
 \arguments{
 \item{out}{The original output data frame}
