You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mamlr/R/preproc.R

43 lines
1.9 KiB

#' Preprocess dfm data for use in modeling procedure
#'
#' Process the training (and optionally testing) dfm according to the
#' parameters provided in \code{params}:
#' \enumerate{
#'   \item Features with a total frequency of zero are trimmed from the
#'     training dfm.
#'   \item If \code{params$tfidf} is true, inverse document frequencies are
#'     computed on the training dfm only and used as feature weights for the
#'     training dfm and, when supplied, the testing dfm.
#'   \item If \code{params} has both a \code{feat_percentiles} and a
#'     \code{feat_measures} column, the training dfm is reduced to the union
#'     of the features returned by \code{feat_select} for each class.
#' }
#'
#' @param dfm_train Training dfm
#' @param dfm_test Testing dfm if applicable, otherwise NULL
#' @param params Row from grid with parameter optimization. The columns
#'   \code{tfidf} and (when feature selection is used) \code{class_type} are
#'   read; \code{feat_percentiles} and \code{feat_measures} are optional.
#' @return List with \code{dfm_train} and \code{dfm_test}, processed according
#'   to parameters in params, and \code{idf}, the inverse document frequencies
#'   used for weighting (\code{NULL} when \code{params$tfidf} is false)
#' @export
#' @examples
#' \dontrun{
#' preproc(dfm_train, dfm_test = NULL, params)
#' }
#################################################################################################
#################################### Preprocess data ############################################
#################################################################################################
preproc <- function(dfm_train, dfm_test = NULL, params) {
  # Remove non-existing features (zero total frequency) from training dfm
  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)

  # Exact-name lookup: `$` on a data frame row would partially match names.
  use_tfidf <- params[["tfidf"]]
  if (length(use_tfidf) != 1L || is.na(use_tfidf)) {
    stop("`params$tfidf` must be a single non-missing value", call. = FALSE)
  }

  if (use_tfidf) {
    # The idf weights come from the training data only, so the testing dfm is
    # weighted with exactly the same values as the training dfm.
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_weight(dfm_test, weights = idf)
    }
  } else {
    idf <- NULL
  }

  if ("feat_percentiles" %in% colnames(params) && "feat_measures" %in% colnames(params)) {
    class_type <- params[["class_type"]]
    # Keeping unique words that are important to one or more categories
    # (see textstat_keyness and feat_select)
    words <- unique(unlist(lapply(
      unique(docvars(dfm_train, class_type)),
      feat_select,
      dfm = dfm_train,
      class_type = class_type,
      percentile = params[["feat_percentiles"]],
      measure = params[["feat_measures"]]
    )))
    # NOTE(review): only the training dfm is reduced to the selected features;
    # dfm_test keeps its full feature set here. Presumably features are
    # aligned downstream before prediction -- confirm against callers.
    dfm_train <- dfm_keep(dfm_train, words, valuetype = "fixed", verbose = FALSE)
  }

  list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf)
}