mamlr/R/dfm_gen.R

#' Generate a quanteda dfm from ElasticSearch output
#'
#' Builds a `merged` text field for every document in `out`, optionally derives
#' coding-based document variables, and converts the result into a
#' document-feature matrix.
#'
#' @param out The elasticizer-generated data frame. Only the `_id` column and
#'   columns matching `_source.*` are used.
#' @param words String indicating the number of words to keep from each
#'   document (maximum document length), '999' indicates the whole document.
#'   It is passed on to `merger()` and is only used when `text = "lemmas"`.
#' @param text String indicating whether the "merged" field will contain the
#'   "full" text (title, subtitle, preteaser, teaser and text pasted together,
#'   with html tags stripped and whitespace collapsed), or "lemmas" (the
#'   default, built per document by `merger()`). Any other value is an error.
#' @return A Quanteda dfm with `_id` as document names. When `out` contains
#'   `_source.codes.majorTopic`, the docvars `codes`, `junk` and `aggregate`
#'   are attached; otherwise no docvars are set.
#' @export
#' @examples
#' \dontrun{
#' dfm_gen(out, words = '999')
#' dfm_gen(out, text = "full")
#' }
dfm_gen <- function(out, words = '999', text = "lemmas") {
  # Fail early: with an unknown `text` no `merged` column would be created and
  # the failure would only surface later, inside corpus().
  if (!is.character(text) || length(text) != 1L ||
      !text %in% c("lemmas", "full")) {
    stop("`text` must be either \"lemmas\" or \"full\"", call. = FALSE)
  }

  # Keep only the id and anything belonging to the source field
  out <- out %>%
    select(`_id`, matches("_source.*"))

  if (text == "lemmas") {
    # detectCores() is documented to possibly return NA; fall back to 1 core.
    cores <- max(1L, detectCores(), na.rm = TRUE)
    # seq_len() (unlike seq(1, n, 1)) is safe when `out` has zero rows.
    merged <- unlist(mclapply(
      seq_len(nrow(out)),
      merger,
      words = words,
      out = out,
      mc.cores = cores
    ))
    # unlist() of an empty list is NULL, which would silently skip the column.
    out$merged <- if (is.null(merged)) character(0) else merged
  } else {
    out$merged <- str_c(
      str_replace_na(out$`_source.title`, replacement = " "),
      str_replace_na(out$`_source.subtitle`, replacement = " "),
      str_replace_na(out$`_source.preteaser`, replacement = " "),
      str_replace_na(out$`_source.teaser`, replacement = " "),
      str_replace_na(out$`_source.text`, replacement = " "),
      sep = " "
    ) %>%
      # Remove html tags
      str_replace_all("<.*?>", " ") %>%
      # Collapse runs of whitespace into a single regular space
      str_replace_all("\\s+", " ")
  }

  if ("_source.codes.majorTopic" %in% colnames(out)) {
    # Documents with timeSpent == -1 get NA codes instead of being filtered
    # out (an earlier note in this file calls excluding them the "Norwegian
    # summer sample hack").
    out <- out %>%
      mutate(
        codes = case_when(
          `_source.codes.timeSpent` == -1 ~ NA_character_,
          TRUE ~ `_source.codes.majorTopic`
        ),
        # 1 for the codes 2301, 3101 and 34, NA for timeSpent == -1, else 0
        junk = case_when(
          codes == 2301 ~ 1,
          codes == 3101 ~ 1,
          codes == 34 ~ 1,
          `_source.codes.timeSpent` == -1 ~ NA_real_,
          TRUE ~ 0
        ),
        # Right-pad the code to 4 characters with "a" and extract the leading
        # one or two digits as the aggregate code.
        # NOTE(review): the `|` inside the character classes is matched
        # literally, not as alternation - confirm this is intended.
        aggregate = codes %>%
          str_pad(4, side = "right", pad = "a") %>%
          str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
          .[, 2] %>%
          as.numeric()
      )
    # Select the derived columns by name rather than by position
    vardoc <- out[, c("codes", "junk", "aggregate")]
  } else {
    vardoc <- NULL
  }

  corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
    dfm(
      tolower = TRUE,
      stem = FALSE,
      remove_punct = TRUE,
      valuetype = "regex",
      ngrams = 1
    )
}
First release of mamlr package 6 years ago			`#' Generates dfm from ElasticSearch output`
			`#'`
			`#' Generates dfm from ElasticSearch output`
			`#' @param out The elasticizer-generated data frame`
			`#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document`
Added option for fulltext vs lemmas merged field 6 years ago			`#' @param text String indicating whether the "merged" field will contain the "full" text, or "lemmas"`
Added line to replace multiple whitespace characters in full text by a single regular whitespace 6 years ago			`#' @return A Quanteda dfm`
First release of mamlr package 6 years ago			`#' @export`
			`#' @examples`
			`#' dfm_gen(out, words = '999')`


			`#################################################################################################`
			`#################################### DFM generator #############################`
			`#################################################################################################`

			# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

Set default to "lemmas" for dfm_gen 6 years ago			`dfm_gen <- function(out,words = '999', text = "lemmas") {`
First release of mamlr package 6 years ago			`# Create subset with just ids, codes and text`
			`out <- out %>%`
			select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
			`fields <- length(names(out))`
Added option for fulltext vs lemmas merged field 6 years ago			`if (text == "lemmas") {`
			`out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))`
			`}`
			`if (text == "full") {`
			out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
			str_replace_na(out$`_source.subtitle`, replacement = " "),
			str_replace_na(out$`_source.preteaser`, replacement = " "),
			str_replace_na(out$`_source.teaser`, replacement = " "),
			str_replace_na(out$`_source.text`, replacement = " "),
			`sep = " ") %>%`
			`# Remove html tags`
Added line to replace multiple whitespace characters in full text by a single regular whitespace 6 years ago			`str_replace_all("<.*?>", " ") %>%`
			`str_replace_all("\\s+"," ")`
Added option for fulltext vs lemmas merged field 6 years ago			`}`
Updated dfm_gen to only create derivative codes if majorTopic actually exists, and set docvars to NULL when no majorTopic codes 6 years ago			`if ('_source.codes.majorTopic' %in% colnames(out)) {`
			`out <- out %>%`
			`mutate(codes = case_when(`
			.$`_source.codes.timeSpent` == -1 ~ NA_character_,
			TRUE ~ .$`_source.codes.majorTopic`
			`)`
			`) %>%`
			`mutate(junk = case_when(`
			`.$codes == 2301 ~ 1,`
			`.$codes == 3101 ~ 1,`
			`.$codes == 34 ~ 1,`
			.$`_source.codes.timeSpent` == -1 ~ NA_real_,
			`TRUE ~ 0`
			`)`
			`) %>%`
			`mutate(aggregate = .$codes %>%`
			`str_pad(4, side="right", pad="a") %>%`
			`str_match("([0-9]{1,2})?[0\|a][1-9\|a]") %>%`
			`.[,2] %>%`
			`as.numeric()`
			`)`
			`vardoc <- out[,-seq(1,(length(names(out))-3),1)]`
			`} else {`
			`vardoc <- NULL`
			`}`
			dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
First release of mamlr package 6 years ago			`dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)`
			`return(dfm)`
Added line to replace multiple whitespace characters in full text by a single regular whitespace 6 years ago			`}`