You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
72 lines
3.0 KiB
72 lines
3.0 KiB
#' Preprocess dfm data for use in modeling procedure
|
|
#'
|
|
#' Process dfm according to parameters provided in params
|
|
#'
|
|
#' @param dfm_train Training dfm
|
|
#' @param dfm_test Testing dfm if applicable, otherwise NULL
|
|
#' @param params Row from grid with parameter optimization
|
|
#' @param we_vectors Matrix with word embedding vectors
|
|
#' @return List with dfm_train and dfm_test, processed according to parameters in params
|
|
#' @export
|
|
#' @examples
|
|
#' preproc(dfm_train, dfm_test = NULL, params)
|
|
#################################################################################################
|
|
#################################### Preprocess data ############################################
|
|
#################################################################################################
|
|
|
|
|
|
### CURRENTLY UNUSED!!!###
|
|
|
|
preproc <- function(dfm_train, dfm_test = NULL, params, we_vectors) {
|
|
# Remove non-existing features from training dfm
|
|
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
|
|
if (params$tfidf) {
|
|
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
|
|
dfm_train <- dfm_weight(dfm_train, weights = idf)
|
|
if (!is.null(dfm_test)) {
|
|
dfm_test <- dfm_weight(dfm_test, weights = idf)
|
|
}
|
|
} else {
|
|
idf <- NULL
|
|
}
|
|
|
|
if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
|
|
|
|
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
|
|
words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
|
|
feat_select,
|
|
dfm = dfm_train,
|
|
class_type = params$class_type,
|
|
percentile = params$feat_percentiles,
|
|
measure = params$feat_measures
|
|
)))
|
|
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
|
|
}
|
|
|
|
if (!is.null(we_vectors)) {
|
|
shared_dict <- sort(intersect(dfm_train@Dimnames$features,we_vectors$V1))
|
|
if (!is.null(dfm_test)) {
|
|
shared_dict <- sort(intersect(dfm_test@Dimnames$features,shared_dict))
|
|
dfm_test <- dfm_keep(dfm_test, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
|
|
.[, sort(colnames(.))]
|
|
}
|
|
dfm_train <- dfm_keep(dfm_train, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
|
|
.[, sort(colnames(.))]
|
|
we_matrix <- filter(we_vectors, V1 %in% shared_dict) %>%
|
|
arrange(V1) %>%
|
|
as.data.table(.) %>%
|
|
.[,2:ncol(.), with = F] %>%
|
|
as.matrix(.)
|
|
|
|
dfm_train_we_sum <- dfm_train %*% we_matrix
|
|
# dfm_train_we_mean <- dfm_train_we_sum / as.vector(rowSums(dfm_train))
|
|
|
|
if (!is.null(dfm_test)) {
|
|
dfm_test_we_sum <- dfm_test %*% we_matrix
|
|
# dfm_test_we_mean <- dfm_test_we_sum / as.vector(rowSums(dfm_test))
|
|
}
|
|
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf, dfm_train_we = dfm_train_we_sum, dfm_test_we = dfm_test_we_sum))
|
|
}
|
|
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf))
|
|
}
|