From 4fd9222a2de6a38dd0c9eb06c09bd432d82ca1f7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 24 Aug 2020 15:50:10 +0200 Subject: [PATCH] lemma_writer: updated to write metadata csv when dumping documents in ud format out_parser: fix for generating empty columns --- R/lemma_writer.R | 13 +++++++++++-- R/out_parser.R | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/R/lemma_writer.R b/R/lemma_writer.R index 92a887f..0705007 100644 --- a/R/lemma_writer.R +++ b/R/lemma_writer.R @@ -17,7 +17,7 @@ #################################### Lemma text file generator ############################# ################################################################################################# -lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1) { +lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1, meta_file = NULL) { plan(multiprocess, workers = cores) par_writer <- function(row, out, lemma) { if (lemma == T) { @@ -26,13 +26,22 @@ lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cor cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F) } } + if (documents == F) { out <- unnest(out,`_source.ud`) lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_') cat(lemma, file = file, append = T) } if (documents == T) { - out <- out_parser(out, field = '_source', clean = F, cores = cores) + if (lemma == F) { + out <- out_parser(out, field = '_source', clean = F) + } else { + if (!is.null(meta_file)) { + meta <- select(out, -`_source.ud`) + write.table(meta, str_c(file,meta_file), sep = ",", col.names = !file.exists(str_c(file,meta_file)), append = T) + } + } future_lapply(1:nrow(out), par_writer, out = out, lemma = lemma) } + } diff --git a/R/out_parser.R b/R/out_parser.R index 3f9580f..0ad69a4 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -20,7 +20,7 @@ out_parser <- function(out, field, clean = F) { data } - out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser')) + out <- fncols(data.table(out), c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser')) par_parser <- function(row, out, field, clean) { doc <- out[row,] if (field == 'highlight') {