dfm_gen, merger: Added option for generating lemma_upos hybrids for merged field

merger: Added custom clean option (sometimes not cleaning is preferred, even with lemmas)
merger, out_parser: Updated regex for filtering out non-words to also include email addresses (containing both @ and .)
master
Erik de Vries 6 years ago
parent 386ac42aee
commit ce5f812252

@ -3,8 +3,8 @@
#' Generates dfm from ElasticSearch output #' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame #' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned! #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A Quanteda dfm #' @return A Quanteda dfm
#' @export #' @export
#' @examples #' @examples
@ -22,9 +22,8 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
out <- out %>% out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
fields <- length(names(out)) fields <- length(names(out))
if (text == "lemmas" || text == 'ud') { if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores())) out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = detectCores()))
} }
if (text == "full") { if (text == "full") {
out <- out_parser(out, field = '_source' , clean = clean) out <- out_parser(out, field = '_source' , clean = clean)
@ -57,8 +56,11 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
if (words != "999") { if (words != "999") {
### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary
# out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?")) # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>% out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' '))
str_extract('.*[.?!]'))
if(text != 'ud_upos') {
out$merged <- str_extract(out$merged,'.*[.?!]')
}
} }
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>% dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1) dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)

@ -5,6 +5,7 @@
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame #' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A documentified string of lemmas, one document at a time #' @return A documentified string of lemmas, one document at a time
#' @export #' @export
#' @examples #' @examples
@ -13,7 +14,7 @@
#################################### Reconstructing documents from lemmas######################## #################################### Reconstructing documents from lemmas########################
################################################################################################# #################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification ## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(row, out, text) { merger <- function(row, out, text, clean) {
df <- out[row,] df <- out[row,]
# Mergin lemmas into single string # Mergin lemmas into single string
if (text == 'lemmas') { if (text == 'lemmas') {
@ -22,10 +23,20 @@ merger <- function(row, out, text) {
if (text == 'ud') { if (text == 'ud') {
lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ') lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
} }
if (text == 'ud_upos') {
df <- unnest(df,`_source.ud`)
lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is not taken into account, as it is already filtered out, see above
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . }
return(lemmas)
}
# Replacing $-marked punctuation with their regular forms # Replacing $-marked punctuation with their regular forms
lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>% lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
### Removing numbers and non-words containing numbers # Regex removes all words consisting of or containing numbers, @#$%
str_replace_all("\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") %>% # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# Regex also used in out_parser
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
# Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". " # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
paste0(.,". ") paste0(.,". ")
return(lemmas) return(lemmas)

@ -62,8 +62,11 @@ out_parser <- function(out, field, clean = F) {
### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
# Remove html tags, and multiple consequent whitespaces # Remove html tags, and multiple consequent whitespaces
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# Regex also used in merger function
out$merged <- out$merged %>% out$merged <- out$merged %>%
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") else . } %>% {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
str_replace_all("<.{0,20}?>", " ") %>% str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>% str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>% str_replace_all('([!?.])\\.','\\1') %>%

@ -11,9 +11,9 @@ dfm_gen(out, words = "999", text = "lemmas", clean)
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
} }
\value{ \value{
A Quanteda dfm A Quanteda dfm

@ -4,7 +4,7 @@
\alias{merger} \alias{merger}
\title{Merges list of lemmas back into a pseudo-document} \title{Merges list of lemmas back into a pseudo-document}
\usage{ \usage{
merger(row, out, text) merger(row, out, text, clean)
} }
\arguments{ \arguments{
\item{row}{A row number form the Elasticizer-generated data frame} \item{row}{A row number form the Elasticizer-generated data frame}
@ -13,6 +13,8 @@ merger(row, out, text)
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
} }
\value{ \value{

Loading…
Cancel
Save