|
|
@ -35,28 +35,32 @@ dfm_gen <- function(out,words = '999', text = c("lemmas","full")) {
|
|
|
|
str_replace_all("<.*?>", " ") %>%
|
|
|
|
str_replace_all("<.*?>", " ") %>%
|
|
|
|
str_replace_all("\\s+"," ")
|
|
|
|
str_replace_all("\\s+"," ")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
# out$codes <- out$`_source.codes.majorTopic` %>%
|
|
|
|
if ('_source.codes.majorTopic' %in% colnames(out)) {
|
|
|
|
out <- out %>%
|
|
|
|
out <- out %>%
|
|
|
|
mutate(codes = case_when(
|
|
|
|
mutate(codes = case_when(
|
|
|
|
.$`_source.codes.timeSpent` == -1 ~ NA_character_,
|
|
|
|
.$`_source.codes.timeSpent` == -1 ~ NA_character_,
|
|
|
|
TRUE ~ .$`_source.codes.majorTopic`
|
|
|
|
TRUE ~ .$`_source.codes.majorTopic`
|
|
|
|
)
|
|
|
|
)
|
|
|
|
) %>%
|
|
|
|
) %>%
|
|
|
|
mutate(junk = case_when(
|
|
|
|
mutate(junk = case_when(
|
|
|
|
.$codes == 2301 ~ 1,
|
|
|
|
.$codes == 2301 ~ 1,
|
|
|
|
.$codes == 3101 ~ 1,
|
|
|
|
.$codes == 3101 ~ 1,
|
|
|
|
.$codes == 34 ~ 1,
|
|
|
|
.$codes == 34 ~ 1,
|
|
|
|
.$`_source.codes.timeSpent` == -1 ~ NA_real_,
|
|
|
|
.$`_source.codes.timeSpent` == -1 ~ NA_real_,
|
|
|
|
TRUE ~ 0
|
|
|
|
TRUE ~ 0
|
|
|
|
)
|
|
|
|
)
|
|
|
|
) %>%
|
|
|
|
) %>%
|
|
|
|
mutate(aggregate = .$codes %>%
|
|
|
|
mutate(aggregate = .$codes %>%
|
|
|
|
str_pad(4, side="right", pad="a") %>%
|
|
|
|
str_pad(4, side="right", pad="a") %>%
|
|
|
|
str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
|
|
|
|
str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
|
|
|
|
.[,2] %>%
|
|
|
|
.[,2] %>%
|
|
|
|
as.numeric()
|
|
|
|
as.numeric()
|
|
|
|
)
|
|
|
|
)
|
|
|
|
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[,-seq(1,(length(names(out))-3),1)]) %>%
|
|
|
|
vardoc <- out[,-seq(1,(length(names(out))-3),1)]
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
vardoc <- NULL
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
|
|
|
|
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
|
|
|
|
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
|
|
|
|
return(dfm)
|
|
|
|
return(dfm)
|
|
|
|
}
|
|
|
|
}
|
|
|
|