out_parser: added option to clean output using regex to remove numbers and non-words

dfm_gen, ud_update: updated functions to make use of the out_parser cleaning option; merger: updated regex for cleaning lemmatized output
7 years ago · 34531b0da8
parent 5851c56369
commit 34531b0da8
6 changed files with 15 additions and 7 deletions
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@ -4,6 +4,7 @@
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@ -16,16 +17,17 @@

 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

-dfm_gen <- function(out, words = '999', text = "lemmas") {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  fields <- length(names(out))
  if (text == "lemmas" || text == 'ud') {
    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
+
  }
  if (text == "full") {
-    out <- out_parser(out, field = '_source')
+    out <- out_parser(out, field = '_source' , clean = clean)
  }
  if ('_source.codes.majorTopic' %in% colnames(out)) {
    out <- out %>%
--- a/R/merger.R
+++ b/R/merger.R
@ -25,7 +25,7 @@ merger <- function(row, out, text) {
  # Replacing $-marked punctuation with their regular forms
  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
    ### Removing numbers and non-words containing numbers
-    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
+    str_replace_all("\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") %>%
    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
    paste0(.,". ")
  return(lemmas)
--- a/R/out_parser.R
+++ b/R/out_parser.R
@ -3,6 +3,7 @@
 #' Parse raw text into a single field
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @examples
 #' out_parser(out,field)
@ -10,7 +11,7 @@
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
-out_parser <- function(out, field) {
+out_parser <- function(out, field, clean = F) {
  fncols <- function(data, cname) {
    add <-cname[!cname%in%names(data)]

@ -62,6 +63,7 @@ out_parser <- function(out, field) {
  ### Use correct punctuation, by inserting a '. ' at the end of every text field, then removing any duplicate occurrences
  # Remove html tags, and multiple consecutive whitespace characters
  out$merged <- out$merged %>%
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "")  else . } %>%
    str_replace_all("<.{0,20}?>", " ") %>%
    str_replace_all('(\\. ){2,}', '. ') %>%
    str_replace_all('([!?.])\\.','\\1') %>%
--- a/R/ud_update.R
+++ b/R/ud_update.R
@ -20,7 +20,7 @@
 # }

 ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
-  out <- out_parser(out, field = '_source')
+  out <- out_parser(out, field = '_source', clean = F)
  par_proc <- function(row, out, udmodel) {
    doc <- out[row,]
    ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@ -4,7 +4,7 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas")
+dfm_gen(out, words = "999", text = "lemmas", clean)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@ -12,6 +12,8 @@ dfm_gen(out, words = "999", text = "lemmas")
 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}

 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
+
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!}
 }
 \value{
 A Quanteda dfm
--- a/man/out_parser.Rd
+++ b/man/out_parser.Rd
@ -4,12 +4,14 @@
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
-out_parser(out, field)
+out_parser(out, field, clean = F)
 }
 \arguments{
 \item{out}{The original output data frame}

 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
+
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text