You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mamlr/R/dfm_gen.R

67 lines
2.7 KiB

#' Generates dfm from ElasticSearch output
#'
#' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, or "lemmas"
#' @return A Quanteda dfm
#' @export
#' @examples
#' dfm_gen(out, words = '999')
#################################################################################################
#################################### DFM generator #############################
#################################################################################################
# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out,words = '999', text = "lemmas") {
# Create subset with just ids, codes and text
out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
fields <- length(names(out))
if (text == "lemmas") {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
}
if (text == "full") {
out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
str_replace_na(out$`_source.subtitle`, replacement = " "),
str_replace_na(out$`_source.preteaser`, replacement = " "),
str_replace_na(out$`_source.teaser`, replacement = " "),
str_replace_na(out$`_source.text`, replacement = " "),
sep = " ") %>%
# Remove html tags
str_replace_all("<.*?>", " ") %>%
str_replace_all("\\s+"," ")
}
if ('_source.codes.majorTopic' %in% colnames(out)) {
out <- out %>%
mutate(codes = as.numeric(case_when(
.$`_source.codes.timeSpent` == -1 ~ NA_character_,
TRUE ~ .$`_source.codes.majorTopic`
))
) %>%
mutate(junk = case_when(
.$codes == 2301 ~ 1,
.$codes == 3101 ~ 1,
.$codes == 34 ~ 1,
.$`_source.codes.timeSpent` == -1 ~ NA_real_,
TRUE ~ 0
)
) %>%
mutate(aggregate = .$codes %>%
str_pad(4, side="right", pad="a") %>%
str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
.[,2] %>%
as.numeric()
)
vardoc <- out[,-seq(1,(length(names(out))-3),1)]
} else {
vardoc <- NULL
}
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
return(dfm)
}