#' Generates dfm from ElasticSearch output #' #' Generates dfm from ElasticSearch output #' @param out The elasticizer-generated data frame #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). #' @param tolower Boolean indicating whether dfm features should be lowercased #' @param binary Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE #' @param ngrams Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1 #' @return A Quanteda dfm #' @export #' @examples #' dfm_gen(out, words = '999') ################################################################################################# #################################### DFM generator ############################# ################################################################################################# # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T, binary=F, ngrams=1) { # Create subset with just ids, codes and text out <- out %>% select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field fields <- length(names(out)) if (text == "lemmas" || text == 'ud' || text == 'ud_upos') { out <- left_join(out, merger(out, text=text, clean=clean), by = "_id") } if (text == "full") { out <- mamlr:::out_parser(out, field = '_source' , clean = clean) } if ('_source.codes.majorTopic' %in% colnames(out)) { out <- out %>% mutate(codes = `_source.codes.majorTopic`) %>% mutate(junk = case_when( .$codes == 92 ~ 1, .$codes == 91 ~ 1, .$codes == 93 ~ 1, .$codes == 2301 ~ 1, .$codes == 3101 ~ 1, .$codes == 34 ~ 1, TRUE ~ 0 ), aggregate = .$codes %>% str_pad(4, side="right", pad="a") %>% str_match("([0-9]{1,2})?[0|a][1-9|a]") %>% .[,2] %>% as.numeric(), nondomestic = as.numeric(`_source.codes.nonDomestic`) ) %>% mutate( ) vardoc <- select(out, codes, junk, aggregate, nondomestic) } else { vardoc <- NULL } if (words != "999") { ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?")) out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ')) if(text != 'ud_upos') { out$merged <- str_extract(out$merged,'.*[.?!]') } } dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>% tokens(remove_punct = T) %>% tokens_ngrams(n = ngrams, skip = 0, concatenator = '_') %>% dfm(tolower = tolower, stem = F, valuetype = "regex") if (binary) { dfm <- dfm_weight(dfm, scheme = 'boolean') } return(dfm) }