#' Merges list of lemmas back into a pseudo-document
#'
#' Collapses the UDPipe lemmas (and, optionally, their UPOS tags) of each document
#' in an elasticizer-generated data frame into a single string per document.
#' @param out The elasticizer-generated data frame
#' @param text String indicating what the "merged" field should contain: "ud" merges lemmas only, "ud_upos" merges lemma_UPOS pairs. The old-style "full" and "lemmas" options are deprecated and not handled by this function.
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching a regex (see code)
#' @return A data frame with one row per document, containing the document _id and the merged string of lemmas
#' @import dplyr tidyr stringr
#' @export
#' @examples
#' merger(out, text, clean)
#################################################################################################
#################################### Reconstructing documents from lemmas #######################
#################################################################################################

## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(out, text, clean) {
  df <- unnest(out, cols = '_source.ud') %>%
    select(`_id`, lemma, upos) %>%
    unnest(cols = c('lemma', 'upos')) %>%
    # This filter is new; the old merger function lacked it, with the following consequences:
    # 1: when using ud, the string "NA" would appear in place of the faulty lemma
    # 2: when using ud_upos, the entire article would become NA, because str_c() returns NA when any value is NA
    filter(!is.na(lemma)) %>%
    group_by(`_id`)
  if (text == 'ud_upos') {
    df <- df %>%
      filter(upos != 'PUNCT') %>%
      mutate(
        lem_u = str_c(lemma, upos, sep = "_")
      ) %>%
      summarise(
        merged = str_c(lem_u, collapse = ' ')
      ) %>%
      # Regex removes all words consisting of or containing numbers, @#$%
      # Punctuation is not taken into account, as it has already been filtered out (see above)
      {if (clean == TRUE) mutate(., merged = str_replace_all(merged, "\\S*?[0-9@#$%]+[^\\s]*", "")) else .}
  }
  if (text == 'ud') {
    df <- df %>%
      summarise(
        merged = str_c(lemma, collapse = ' ')
      ) %>%
      mutate(
        merged = str_replace_all(merged, " \\$(.+?)", "\\1")
      ) %>%
      # Regex removes all words consisting of or containing numbers, @#$%
      # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
      # Regex also used in out_parser
      {if (clean == TRUE) mutate(., merged = str_replace_all(merged, "\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")) else .} %>%
      # Append an extra ". " so that strings containing fewer than 150 words that do not end on ". " still do
      mutate(merged = paste0(merged, '. '))
  }
  return(df)
}
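
## Illustrative usage sketch (kept commented out so it does not run at package load).
## It only assumes the minimal shape this function expects from the elasticizer output:
## an `_id` column and a `_source.ud` list column holding, per document, a data frame
## with `lemma` and `upos` list columns. The document ids and token values below are
## made up for illustration.
# library(dplyr)
# library(tidyr)
# library(stringr)
#
# toy <- tibble(
#   `_id` = c('doc1', 'doc2'),
#   `_source.ud` = list(
#     tibble(lemma = list(c('economy', 'grow', '3')),
#            upos  = list(c('NOUN', 'VERB', 'NUM'))),
#     tibble(lemma = list(c('parliament', 'vote', NA)),
#            upos  = list(c('NOUN', 'VERB', 'PUNCT')))
#   )
# )
#
# merger(toy, text = 'ud_upos', clean = TRUE)
# # Expected: one row per `_id` with a `merged` column, e.g. "economy_NOUN grow_VERB "
# # for doc1 (the numeric token "3_NUM" is stripped because clean = TRUE).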