From dc4daf9de46ae72f8bfe9e70a1e77009cd74d5cf Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Tue, 23 Oct 2018 10:27:21 +0200 Subject: [PATCH] Added line to replace multiple whitespace characters in full text by a single regular whitespace --- R/dfm_gen.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/dfm_gen.R b/R/dfm_gen.R index 2cc0e75..5ae1602 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -4,7 +4,7 @@ #' @param out The elasticizer-generated data frame #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param text String indicating whether the "merged" field will contain the "full" text, or "lemmas" -#' @return A Quanteda dfm +#' @return A Quanteda dfm #' @export #' @examples #' dfm_gen(out, words = '999') @@ -32,7 +32,8 @@ dfm_gen <- function(out,words = '999', text = c("lemmas","full")) { str_replace_na(out$`_source.text`, replacement = " "), sep = " ") %>% # Remove html tags - str_replace_all("<.*?>", " ") + str_replace_all("<.*?>", " ") %>% + str_replace_all("\\s+"," ") } # out$codes <- out$`_source.codes.majorTopic` %>% out <- out %>% @@ -58,4 +59,4 @@ dfm_gen <- function(out,words = '999', text = c("lemmas","full")) { dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[,-seq(1,(length(names(out))-3),1)]) %>% dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1) return(dfm) -} \ No newline at end of file +}