lemma_writer: updated to write a metadata CSV when dumping documents in UD format

out_parser: fix for generating empty columns
master
Your Name 4 years ago
parent 955f034e6a
commit 4fd9222a2d

@ -17,7 +17,7 @@
#################################### Lemma text file generator ############################# #################################### Lemma text file generator #############################
################################################################################################# #################################################################################################
lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1) { lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1, meta_file = NULL) {
plan(multiprocess, workers = cores) plan(multiprocess, workers = cores)
par_writer <- function(row, out, lemma) { par_writer <- function(row, out, lemma) {
if (lemma == T) { if (lemma == T) {
@ -26,13 +26,22 @@ lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cor
cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F) cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
} }
} }
if (documents == F) { if (documents == F) {
out <- unnest(out,`_source.ud`) out <- unnest(out,`_source.ud`)
lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_') lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_')
cat(lemma, file = file, append = T) cat(lemma, file = file, append = T)
} }
if (documents == T) { if (documents == T) {
out <- out_parser(out, field = '_source', clean = F, cores = cores) if (lemma == F) {
out <- out_parser(out, field = '_source', clean = F)
} else {
if (!is.null(meta_file)) {
meta <- select(out, -`_source.ud`)
write.table(meta, str_c(file,meta_file), sep = ",", col.names = !file.exists(str_c(file,meta_file)), append = T)
}
}
future_lapply(1:nrow(out), par_writer, out = out, lemma = lemma) future_lapply(1:nrow(out), par_writer, out = out, lemma = lemma)
} }
} }

@ -20,7 +20,7 @@ out_parser <- function(out, field, clean = F) {
data data
} }
out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser')) out <- fncols(data.table(out), c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
par_parser <- function(row, out, field, clean) { par_parser <- function(row, out, field, clean) {
doc <- out[row,] doc <- out[row,]
if (field == 'highlight') { if (field == 'highlight') {

Loading…
Cancel
Save