From 34531b0da8fd2f2a9888abca064d36499e3f0238 Mon Sep 17 00:00:00 2001
From: Erik de Vries
Date: Fri, 11 Jan 2019 13:59:19 +0100
Subject: [PATCH] out_parser: added option to clean output using regex to
 remove numbers and non-words

dfm_gen, ud_update: updated functions to make use of out_parser cleaning
option
merger: updated regex for cleaning lemmatized output
---
 R/dfm_gen.R       | 6 ++++--
 R/merger.R        | 2 +-
 R/out_parser.R    | 4 +++-
 R/ud_update.R     | 2 +-
 man/dfm_gen.Rd    | 4 +++-
 man/out_parser.Rd | 4 +++-
 6 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index 14f7efe..91e6e46 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -4,6 +4,7 @@
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@@ -16,16 +17,17 @@
 #  filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
-dfm_gen <- function(out, words = '999', text = "lemmas") {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
   if (text == "lemmas" || text == 'ud') {
     out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
+  }
   if (text == "full") {
-    out <- out_parser(out, field = '_source')
+    out <- out_parser(out, field = '_source' , clean = clean)
   }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
diff --git a/R/merger.R b/R/merger.R
index 50b0f83..a8f8706 100644
--- a/R/merger.R
+++ b/R/merger.R
@@ -25,7 +25,7 @@ merger <- function(row, out, text) {
   # Replacing $-marked punctuation with their regular forms
   lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
     ### Removing numbers and non-words containing numbers
-    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
+    str_replace_all("\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") %>%
     # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
     paste0(.,". 
 ")
   return(lemmas)
diff --git a/R/out_parser.R b/R/out_parser.R
index c7fe464..369e114 100644
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -3,6 +3,7 @@
 #' Parse raw text into a single field
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @examples
 #' out_parser(out,field)
@@ -10,7 +11,7 @@
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
-out_parser <- function(out, field) {
+out_parser <- function(out, field, clean = F) {
   fncols <- function(data, cname) {
     add <-cname[!cname%in%names(data)]
@@ -62,6 +63,7 @@ out_parser <- function(out, field) {
   ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
   # Remove html tags, and multiple consequent whitespaces
   out$merged <- out$merged %>%
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") else . } %>%
     str_replace_all("<.{0,20}?>", " ") %>%
     str_replace_all('(\\. ){2,}', '. 
 ') %>%
     str_replace_all('([!?.])\\.','\\1') %>%
diff --git a/R/ud_update.R b/R/ud_update.R
index 047308a..0d8e5c4 100644
--- a/R/ud_update.R
+++ b/R/ud_update.R
@@ -20,7 +20,7 @@
 # }
 ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
-  out <- out_parser(out, field = '_source')
+  out <- out_parser(out, field = '_source', clean = F)
   par_proc <- function(row, out, udmodel) {
     doc <- out[row,]
     ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
index 1e3b66b..1ef1ea3 100644
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@@ -4,7 +4,7 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas")
+dfm_gen(out, words = "999", text = "lemmas", clean)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@@ -12,6 +12,8 @@ dfm_gen(out, words = "999", text = "lemmas")
 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
 
 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
+
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. 
+Lemmatized output is always cleaned!}
 }
 \value{
 A Quanteda dfm
diff --git a/man/out_parser.Rd b/man/out_parser.Rd
index bc4e71d..a67a904 100644
--- a/man/out_parser.Rd
+++ b/man/out_parser.Rd
@@ -4,12 +4,14 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field)
+out_parser(out, field, clean = F)
 }
 \arguments{
 \item{out}{The original output data frame}
 
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
+
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text