From 4fd9222a2de6a38dd0c9eb06c09bd432d82ca1f7 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Mon, 24 Aug 2020 15:50:10 +0200
Subject: [PATCH] lemma_writer: updated to write metadata CSV when dumping
 documents in UD format; out_parser: fix for generating empty columns

---
 R/lemma_writer.R | 13 +++++++++++--
 R/out_parser.R   |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/R/lemma_writer.R b/R/lemma_writer.R
index 92a887f..0705007 100644
--- a/R/lemma_writer.R
+++ b/R/lemma_writer.R
@@ -17,7 +17,7 @@
 #################################### Lemma text file generator #############################
 #################################################################################################
 
-lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1) {
+lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1, meta_file = NULL) {
   plan(multiprocess, workers = cores)
   par_writer <- function(row, out, lemma) {
     if (lemma == T) {
@@ -26,13 +26,22 @@ lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cor
       cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
     }
   }
+
   if (documents == F) {
     out <- unnest(out,`_source.ud`)
     lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_')
     cat(lemma, file = file, append = T)
   }
   if (documents == T) {
-    out <- out_parser(out, field = '_source', clean = F, cores = cores)
+    if (lemma == F) {
+      out <- out_parser(out, field = '_source', clean = F)
+    } else {
+      if (!is.null(meta_file)) {
+        meta <- select(out, -`_source.ud`)
+        write.table(meta, str_c(file,meta_file), sep = ",", col.names = !file.exists(str_c(file,meta_file)), append = T)
+      }
+    }
     future_lapply(1:nrow(out), par_writer, out = out, lemma = lemma)
   }
+
 }
diff --git a/R/out_parser.R b/R/out_parser.R
index 3f9580f..0ad69a4 100644
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -20,7 +20,7 @@ out_parser <- function(out, field, clean = F) {
     data
   }
 
-  out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
+  out <- fncols(data.table(out), c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
   par_parser <- function(row, out, field, clean) {
     doc <- out[row,]
     if (field == 'highlight') {