dfm_gen, merger: Added option for generating lemma_upos hybrids for merged field

merger: Added custom clean option (sometimes not cleaning is preferred, even with lemmas) merger, out_parser: Updated regex for filtering out non-words to also include email addresses (containing both @ and .)
7 years ago · ce5f812252
parent 386ac42aee
commit ce5f812252
5 changed files with 32 additions and 14 deletions
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@ -3,8 +3,8 @@
 #' Generates dfm from ElasticSearch output
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
-#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
-#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!
+#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@ -22,9 +22,8 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  fields <- length(names(out))
-  if (text == "lemmas" || text == 'ud') {
-    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
-
+  if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = detectCores()))
  }
  if (text == "full") {
    out <- out_parser(out, field = '_source' , clean = clean)
@ -57,8 +56,11 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
  if (words != "999") {
    ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary
    # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
-    out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>%
-                                          str_extract('.*[.?!]'))
+    out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' '))
+
+    if(text != 'ud_upos') {
+      out$merged <- str_extract(out$merged,'.*[.?!]')
+    }
  }
  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
    dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
--- a/R/merger.R
+++ b/R/merger.R
@ -5,6 +5,7 @@
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param out The elasticizer-generated data frame
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
 #' @return A documentified string of lemmas, one document at a time
 #' @export
 #' @examples
@ -13,7 +14,7 @@
 #################################### Reconstructing documents from lemmas########################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, out, text) {
+merger <- function(row, out, text, clean) {
  df <- out[row,]
  # Mergin lemmas into single string
  if (text == 'lemmas') {
@ -22,10 +23,20 @@ merger <- function(row, out, text) {
  if (text == 'ud') {
    lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
  }
+  if (text == 'ud_upos') {
+    df <- unnest(df,`_source.ud`)
+    lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is not taken into account, as it is already filtered out, see above
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "")  else . }
+    return(lemmas)
+  }
  # Replacing $-marked punctuation with their regular forms
  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
-    ### Removing numbers and non-words containing numbers
-    str_replace_all("\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") %>%
+    # Regex removes all words consisting of or containing numbers, @#$%
+    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
+    # Regex also used in out_parser
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . } %>%
    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
    paste0(.,". ")
  return(lemmas)
--- a/R/out_parser.R
+++ b/R/out_parser.R
@ -62,8 +62,11 @@ out_parser <- function(out, field, clean = F) {

  ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
  # Remove html tags, and multiple consequent whitespaces
+  # Regex removes all words consisting of or containing numbers, @#$%
+  # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
+  # Regex also used in merger function
  out$merged <- out$merged %>%
-    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "")  else . } %>%
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . } %>%
    str_replace_all("<.{0,20}?>", " ") %>%
    str_replace_all('(\\. ){2,}', '. ') %>%
    str_replace_all('([!?.])\\.','\\1') %>%
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@ -11,9 +11,9 @@ dfm_gen(out, words = "999", text = "lemmas", clean)

 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}

-\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
+\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags}

-\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!}
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
 }
 \value{
 A Quanteda dfm
--- a/man/merger.Rd
+++ b/man/merger.Rd
@ -4,7 +4,7 @@
 \alias{merger}
 \title{Merges list of lemmas back into a pseudo-document}
 \usage{
-merger(row, out, text)
+merger(row, out, text, clean)
 }
 \arguments{
 \item{row}{A row number form the Elasticizer-generated data frame}
@ -13,6 +13,8 @@ merger(row, out, text)

 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}

+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
+
 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
 }
 \value{