diff --git a/R/dfm_gen.R b/R/dfm_gen.R index 1eb9bd9..3b9ff2e 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -3,8 +3,8 @@ #' Generates dfm from ElasticSearch output #' @param out The elasticizer-generated data frame #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document -#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" -#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned! +#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags +#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). #' @return A Quanteda dfm #' @export #' @examples @@ -22,9 +22,8 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) { out <- out %>% select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field fields <- length(names(out)) - if (text == "lemmas" || text == 'ud') { - out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores())) - + if (text == "lemmas" || text == 'ud' || text == 'ud_upos') { + out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = detectCores())) } if (text == "full") { out <- out_parser(out, field = '_source' , clean = clean) @@ -57,8 +56,11 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) { if (words != "999") { ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?")) - out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>% - str_extract('.*[.?!]')) + out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ')) + + if(text != 'ud_upos') { + out$merged <- str_extract(out$merged,'.*[.?!]') + } } dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>% dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1) diff --git a/R/merger.R b/R/merger.R index a8f8706..440c462 100644 --- a/R/merger.R +++ b/R/merger.R @@ -5,6 +5,7 @@ #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param out The elasticizer-generated data frame #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" +#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). #' @return A documentified string of lemmas, one document at a time #' @export #' @examples @@ -13,7 +14,7 @@ #################################### Reconstructing documents from lemmas######################## ################################################################################################# ## Only merging lemmas for now, feature selection has no impact on junk classification -merger <- function(row, out, text) { +merger <- function(row, out, text, clean) { df <- out[row,] # Mergin lemmas into single string if (text == 'lemmas') { @@ -22,10 +23,20 @@ merger <- function(row, out, text) { if (text == 'ud') { lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ') } + if (text == 'ud_upos') { + df <- unnest(df,`_source.ud`) + lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>% + # Regex removes all words consisting of or containing numbers, @#$% + # Punctuation is not taken into account, as it is already filtered out, see above + {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . } + return(lemmas) + } # Replacing $-marked punctuation with their regular forms lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>% - ### Removing numbers and non-words containing numbers - str_replace_all("\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") %>% + # Regex removes all words consisting of or containing numbers, @#$% + # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above + # Regex also used in out_parser + {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>% # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". " paste0(.,". ") return(lemmas) diff --git a/R/out_parser.R b/R/out_parser.R index eee720b..a93e3ad 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -62,8 +62,11 @@ out_parser <- function(out, field, clean = F) { ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences # Remove html tags, and multiple consequent whitespaces + # Regex removes all words consisting of or containing numbers, @#$% + # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above + # Regex also used in merger function out$merged <- out$merged %>% - {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") else . } %>% + {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>% str_replace_all("<.{0,20}?>", " ") %>% str_replace_all('(\\. ){2,}', '. ') %>% str_replace_all('([!?.])\\.','\\1') %>% diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd index 4bed478..5ed3679 100644 --- a/man/dfm_gen.Rd +++ b/man/dfm_gen.Rd @@ -11,9 +11,9 @@ dfm_gen(out, words = "999", text = "lemmas", clean) \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} -\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} +\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags} -\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!} +\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).} } \value{ A Quanteda dfm diff --git a/man/merger.Rd b/man/merger.Rd index 4efe147..85f280a 100644 --- a/man/merger.Rd +++ b/man/merger.Rd @@ -4,7 +4,7 @@ \alias{merger} \title{Merges list of lemmas back into a pseudo-document} \usage{ -merger(row, out, text) +merger(row, out, text, clean) } \arguments{ \item{row}{A row number form the Elasticizer-generated data frame} @@ -13,6 +13,8 @@ merger(row, out, text) \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} +\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).} + \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} } \value{