From 34531b0da8fd2f2a9888abca064d36499e3f0238 Mon Sep 17 00:00:00 2001
From: Erik de Vries
Date: Fri, 11 Jan 2019 13:59:19 +0100
Subject: [PATCH] out_parser: added option to clean output using regex to
 remove numbers and non-words

dfm_gen, ud_update: updated functions to make use of out_parser cleaning
option
merger: updated regex for cleaning lemmatized output
---
 R/dfm_gen.R       | 6 ++++--
 R/merger.R        | 2 +-
 R/out_parser.R    | 4 +++-
 R/ud_update.R     | 2 +-
 man/dfm_gen.Rd    | 4 +++-
 man/out_parser.Rd | 4 +++-
 6 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index 14f7efe..91e6e46 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -4,6 +4,7 @@
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@@ -16,16 +17,17 @@
 #  filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
-dfm_gen <- function(out, words = '999', text = "lemmas") {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
   if (text == "lemmas" || text == 'ud') {
     out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
+  }
   if (text == "full") {
-    out <- out_parser(out, field = '_source')
+    out <- out_parser(out, field = '_source' , clean = clean)
   }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
diff --git a/R/merger.R b/R/merger.R
index 50b0f83..a8f8706 100644
--- a/R/merger.R
+++ b/R/merger.R
@@ -25,7 +25,7 @@ merger <- function(row, out, text) {
   # Replacing $-marked punctuation with their regular forms
   lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
     ### Removing numbers and non-words containing numbers
-    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
+    str_replace_all("\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") %>%
     # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
     paste0(.,". 
 ")
   return(lemmas)
diff --git a/R/out_parser.R b/R/out_parser.R
index c7fe464..369e114 100644
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -3,6 +3,7 @@
 #' Parse raw text into a single field
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @examples
 #' out_parser(out,field)
@@ -10,7 +11,7 @@
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
-out_parser <- function(out, field) {
+out_parser <- function(out, field, clean = F) {
   fncols <- function(data, cname) {
     add <-cname[!cname%in%names(data)]
@@ -62,6 +63,7 @@ out_parser <- function(out, field) {
   ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
   # Remove html tags, and multiple consequent whitespaces
   out$merged <- out$merged %>%
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") else . } %>%
     str_replace_all("<.{0,20}?>", " ") %>%
     str_replace_all('(\\. ){2,}', '. 
 ') %>%
     str_replace_all('([!?.])\\.','\\1') %>%
diff --git a/R/ud_update.R b/R/ud_update.R
index 047308a..0d8e5c4 100644
--- a/R/ud_update.R
+++ b/R/ud_update.R
@@ -20,7 +20,7 @@
 # }
 ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
-  out <- out_parser(out, field = '_source')
+  out <- out_parser(out, field = '_source', clean = F)
   par_proc <- function(row, out, udmodel) {
     doc <- out[row,]
     ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
index 1e3b66b..1ef1ea3 100644
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@@ -4,7 +4,7 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas")
+dfm_gen(out, words = "999", text = "lemmas", clean)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@@ -12,6 +12,8 @@ dfm_gen(out, words = "999", text = "lemmas")
 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
 
 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
+
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. 
+Lemmatized output is always cleaned!}
 }
 \value{
 A Quanteda dfm
diff --git a/man/out_parser.Rd b/man/out_parser.Rd
index bc4e71d..a67a904 100644
--- a/man/out_parser.Rd
+++ b/man/out_parser.Rd
@@ -4,12 +4,14 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field)
+out_parser(out, field, clean = F)
 }
 \arguments{
 \item{out}{The original output data frame}
 
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
+
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text