out_parser: moved cleaning regex to end of pipeline, to prevent collisions with other (mandatory) regex cleaning

master
Erik de Vries 6 years ago
parent 5b9793cd8c
commit 522c872dba

@ -68,10 +68,10 @@ out_parser <- function(out, field, clean = F) {
### Old regex, used for duplicate detection:
# \\S*?[0-9@#$%]+[^\\s!?.,;:]*
out$merged <- out$merged %>%
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>%
str_replace_all("\\s+"," ")
str_replace_all("\\s+"," ") %>%
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . }
return(out)
}

Loading…
Cancel
Save