From 522c872dbafbf1a15635eab2951bd4a315ad7ecd Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Mon, 4 Mar 2019 14:21:04 +0100 Subject: [PATCH] out_parser: moved cleaning regex to end of pipeline, to prevent collisions with other (mandatory) regex cleaning --- R/out_parser.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/out_parser.R b/R/out_parser.R index fbf8966..7d0b684 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -68,10 +68,10 @@ out_parser <- function(out, field, clean = F) { ### Old regex, used for duplicate detection: # \\S*?[0-9@#$%]+[^\\s!?.,;:]* out$merged <- out$merged %>% - {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>% str_replace_all("<.{0,20}?>", " ") %>% str_replace_all('(\\. ){2,}', '. ') %>% str_replace_all('([!?.])\\.','\\1') %>% - str_replace_all("\\s+"," ") + str_replace_all("\\s+"," ") %>% + {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } return(out) }