diff --git a/R/dfm_gen.R b/R/dfm_gen.R index bc603e7..53b689e 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -35,18 +35,12 @@ dfm_gen <- function(out, words = '999', text = "lemmas") { str_replace_all("<.*?>", " ") %>% str_replace_all("\\s+"," ") } - if (words != "999") { - ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary - # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?")) - out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>% - str_extract('.*[.?!]')) - } if ('_source.codes.majorTopic' %in% colnames(out)) { out <- out %>% - mutate(codes = as.numeric(case_when( + mutate(codes = case_when( .$`_source.codes.timeSpent` == -1 ~ NA_character_, TRUE ~ .$`_source.codes.majorTopic` - )) + ) ) %>% mutate(junk = case_when( .$codes == 2301 ~ 1, @@ -66,6 +60,12 @@ dfm_gen <- function(out, words = '999', text = "lemmas") { } else { vardoc <- NULL } + if (words != "999") { + ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary + # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?")) + out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>% + str_extract('.*[.?!]')) + } dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>% dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1) return(dfm)