From a83ee5dfd0a9dce77f7e81ba20c5e01f5edd7228 Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Sun, 1 Sep 2019 13:13:08 +0200 Subject: [PATCH] lemma_writer: update to write lemma instead of full document text --- R/lemma_writer.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/R/lemma_writer.R b/R/lemma_writer.R index 6c6368c..0ff6dd2 100644 --- a/R/lemma_writer.R +++ b/R/lemma_writer.R @@ -4,6 +4,7 @@ #' @param out The elasticizer-generated data frame #' @param file The file to write the output to (including path, when required). When documents = T, provide path including trailing / #' @param documents Indicate whether the writer should output to a single file, or individual documents +#' @param lemma Indicate whether document output should be lemmas or original document #' @param cores Indicate the number of cores to use for parallel processing #' @param localhost Unused, but defaults to FALSE #' @return A Quanteda dfm @@ -16,10 +17,14 @@ #################################### Lemma text file generator ############################# ################################################################################################# -lemma_writer <- function(out, file, localhost = F, documents = F, cores = 1) { +lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1) { plan(multiprocess, workers = cores) - par_writer <- function(row, out) { - cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F) + par_writer <- function(row, out, lemma) { + if (lemma == T) { + cat(iconv(unnest(out[row,],`_source.ud`)$lemma, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F) + } else { + cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F) + } } if (documents == F) { out <- unnest(out,`_source.ud`) @@ -28,6 +33,6 @@ lemma_writer <- function(out, file, localhost = F, documents = F, cores = 1) { } if (documents == T) { out <- out_parser(out, field = '_source', clean = F, cores = cores) - future_lapply(1:nrow(out), par_writer, out = out) + future_lapply(1:nrow(out), par_writer, out = out, lemma = lemma) } }