From 311838b34bf01ffe4d612bea46547737c115828a Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Tue, 23 Oct 2018 10:40:28 +0200 Subject: [PATCH] Updated dfm_gen to only create derivative codes if majorTopic actually exists, and set docvars to NULL when no majorTopic codes --- R/dfm_gen.R | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/R/dfm_gen.R b/R/dfm_gen.R index 5ae1602..be336fb 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -35,28 +35,32 @@ dfm_gen <- function(out,words = '999', text = c("lemmas","full")) { str_replace_all("<.*?>", " ") %>% str_replace_all("\\s+"," ") } - # out$codes <- out$`_source.codes.majorTopic` %>% - out <- out %>% - mutate(codes = case_when( - .$`_source.codes.timeSpent` == -1 ~ NA_character_, - TRUE ~ .$`_source.codes.majorTopic` - ) - ) %>% - mutate(junk = case_when( - .$codes == 2301 ~ 1, - .$codes == 3101 ~ 1, - .$codes == 34 ~ 1, - .$`_source.codes.timeSpent` == -1 ~ NA_real_, - TRUE ~ 0 - ) - ) %>% - mutate(aggregate = .$codes %>% - str_pad(4, side="right", pad="a") %>% - str_match("([0-9]{1,2})?[0|a][1-9|a]") %>% - .[,2] %>% - as.numeric() - ) - dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[,-seq(1,(length(names(out))-3),1)]) %>% + if ('_source.codes.majorTopic' %in% colnames(out)) { + out <- out %>% + mutate(codes = case_when( + .$`_source.codes.timeSpent` == -1 ~ NA_character_, + TRUE ~ .$`_source.codes.majorTopic` + ) + ) %>% + mutate(junk = case_when( + .$codes == 2301 ~ 1, + .$codes == 3101 ~ 1, + .$codes == 34 ~ 1, + .$`_source.codes.timeSpent` == -1 ~ NA_real_, + TRUE ~ 0 + ) + ) %>% + mutate(aggregate = .$codes %>% + str_pad(4, side="right", pad="a") %>% + str_match("([0-9]{1,2})?[0|a][1-9|a]") %>% + .[,2] %>% + as.numeric() + ) + vardoc <- out[,-seq(1,(length(names(out))-3),1)] + } else { + vardoc <- NULL + } + dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>% dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1) return(dfm) }