#' Generate a Quanteda dfm from ElasticSearch output
#'
#' Generates a document-feature matrix (dfm) from the output of an
#' ElasticSearch query, attaching document-level metadata as docvars.
#' @param out The elasticizer-generated data frame
#' @param words String indicating the maximum number of words to keep from each document; '999' keeps the whole document
#' @return A Quanteda dfm
#' @export
#' @examples
#' \dontrun{
#' dfm_gen(out, words = '999')
#' }
#' @import dplyr
#' @import stringr
#' @import quanteda
#' @importFrom parallel mclapply detectCores
#################################################################################################
#################################### DFM generator #############################################
#################################################################################################

# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

dfm_gen <- function(out, words = '999') {
  # Create a subset with just ids, codes and text:
  # keep only the id and anything belonging to the source field
  out <- out %>%
    select(`_id`, matches("_source.*"))
  fields <- length(names(out))
  # Merge each document's text fields into a single string, truncated to
  # `words` words, using the package's merger() helper (defined elsewhere),
  # in parallel across all available cores
  out$merged <- unlist(mclapply(seq(1, length(out[[1]]), 1),
                                merger, words = words, out = out,
                                mc.cores = detectCores()))
  # out$codes <- out$`_source.codes.majorTopic` %>%
  out <- out %>%
    # Set codes to NA for uncoded documents (timeSpent == -1),
    # otherwise use the major topic code
    mutate(codes = case_when(
      .$`_source.codes.timeSpent` == -1 ~ NA_character_,
      TRUE ~ .$`_source.codes.majorTopic`
    )) %>%
    # Flag topic codes 2301, 3101 and 34 as junk categories
    mutate(junk = case_when(
      .$codes == 2301 ~ 1,
      .$codes == 3101 ~ 1,
      .$codes == 34 ~ 1,
      .$`_source.codes.timeSpent` == -1 ~ NA_real_,
      TRUE ~ 0
    )) %>%
    # Derive the aggregate (major) topic: right-pad codes to four characters
    # and extract the leading one- or two-digit topic number
    mutate(aggregate = .$codes %>%
             str_pad(4, side = "right", pad = "a") %>%
             str_match("([0-9]{1,2})?[0a][1-9a]") %>%
             .[, 2] %>%
             as.numeric())
  # Build the corpus with the last three columns (codes, junk, aggregate)
  # as docvars, then generate a lowercased, unstemmed, punctuation-free
  # unigram dfm
  dfm <- corpus(out$merged,
                docnames = out$`_id`,
                docvars = out[, -seq(1, (length(names(out)) - 3), 1)]) %>%
    dfm(tolower = TRUE, stem = FALSE, remove_punct = TRUE,
        valuetype = "regex", ngrams = 1)
  return(dfm)
}
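
# A minimal usage sketch (illustration only, not part of the package):
# this assumes elasticizer() returns a data frame with an `_id` column and
# flattened `_source.*` fields, and that a query object `q` has been built
# elsewhere; both `q` and the exact elasticizer() signature are hypothetical.
#
# out <- elasticizer(q)
# dfm_full  <- dfm_gen(out, words = '999')  # keep whole documents
# dfm_short <- dfm_gen(out, words = '100')  # truncate to first 100 words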