ud_update: fixed merged output field to always contain an (extra) dot (period) at the end of the document

master
Erik de Vries 6 years ago
parent c32c9e5ad3
commit 37df81b8ff

@ -42,12 +42,14 @@ ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword
out$`_source.preteaser`,
out$`_source.teaser`,
out$`_source.text`,
'.',
sep = ". ") %>%
# Remove HTML tags, collapse repeated periods, and squeeze consecutive whitespace
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>%
str_replace_all("\\s+"," ")
# out <- filter(out, nchar(merged) > 1)
par_proc <- function(row, out, udmodel) {
doc <- out[row,]
ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%

Loading…
Cancel
Save