mamlr/R/dfm_gen.R

#' Generates dfm from ElasticSearch output
#'
#' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A Quanteda dfm
#' @export
#' @examples
#' dfm_gen(out, words = '999')


#################################################################################################
#################################### DFM generator #############################
#################################################################################################

# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  fields <- length(names(out))
  if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = detectCores()))
  }
  if (text == "full") {
    out <- mamlr:::out_parser(out, field = '_source' , clean = clean)
  }
  if ('_source.codes.majorTopic' %in% colnames(out)) {
    out <- out %>%
      mutate(codes = case_when(
        .$`_source.codes.timeSpent` == -1 ~ NA_character_,
        TRUE ~ .$`_source.codes.majorTopic`
      )
      ) %>%
      mutate(junk = case_when(
        .$codes == 2301 ~ 1,
        .$codes == 3101 ~ 1,
        .$codes == 34 ~ 1,
        .$`_source.codes.timeSpent` == -1 ~ NA_real_,
        TRUE ~ 0
      )
      ) %>%
      mutate(aggregate = .$codes %>%
               str_pad(4, side="right", pad="a") %>%
               str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
               .[,2] %>%
               as.numeric()
      )
   vardoc <- out[,-seq(1,(length(names(out))-3),1)]
  } else {
    vardoc <- NULL
  }
  if (words != "999") {
    ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary
    # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
    out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' '))

    if(text != 'ud_upos') {
      out$merged <- str_extract(out$merged,'.*[.?!]')
    }
  }
  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
    dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
  return(dfm)
}