separated data preprocessing routines

class_update: check if there are idf values associated with model, before applying weights
estimator: make use of preproc() function for data preprocessing
preproc: function containing all logic with regards to text data preprocessing and weighting
master
Your Name 5 years ago
parent a3b6e19646
commit 9eae486a80

@ -17,6 +17,7 @@ export(metric_gen)
export(modelizer) export(modelizer)
export(modelizer_old) export(modelizer_old)
export(out_parser) export(out_parser)
export(preproc)
export(query_gen_actors) export(query_gen_actors)
export(query_string) export(query_string)
export(ud_update) export(ud_update)

@ -20,8 +20,10 @@
################################################################################################# #################################################################################################
class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) { class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) {
print('updating') print('updating')
dfm <- dfm_gen(out, text = text, words = words, clean = clean, cores = cores) %>% dfm <- dfm_gen(out, text = text, words = words, clean = clean, cores = cores)
dfm_weight(weights = model_final$idf) if (!is.null(model_final$idf)) {
dfm <- dfm_weight(dfm, weights = model_final$idf)
}
pred <- data.frame(id = out$`_id`, pred = predict(model_final$text_model, newdata = dfm, type = "class", force = T)) pred <- data.frame(id = out$`_id`, pred = predict(model_final$text_model, newdata = dfm, type = "class", force = T))
bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set', ver = ver) bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set', ver = ver)
res <- elastic_update(bulk, es_super = es_super, localhost = localhost) res <- elastic_update(bulk, es_super = es_super, localhost = localhost)

@ -13,7 +13,7 @@
#' @examples #' @examples
#' estimator(row, grid, outer_folds, dfm, class_type, model) #' estimator(row, grid, outer_folds, dfm, class_type, model)
################################################################################################# #################################################################################################
#################################### Generate CV folds ########################################## #################################### Generate models ############################################
################################################################################################# #################################################################################################
### Classification function ### Classification function
@ -40,29 +40,16 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
final <- T ### Indicate final modeling run on whole dataset final <- T ### Indicate final modeling run on whole dataset
dfm_train <- dfm dfm_train <- dfm
} }
## Currently scheme_tf is not used explicitly
# if (model == 'nb') {
# scheme_tf <- 'count' # The 'old' way
# } else {
# scheme_tf <- 'prop' # The 'new' way
# }
### Getting features from training dataset
# Getting idf from training data, and using it to normalize both training and testing feature occurence
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
words <- unique(unlist(lapply(unique(docvars(dfm_train, class_type)),
feat_select,
dfm = dfm_train,
class_type = class_type,
percentile = params$percentiles,
measure = params$measures
)))
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
if (exists("final")) {
preproc_dfm <- preproc(dfm_train, NULL, params)
dfm_train <- preproc_dfm$dfm_train
} else {
preproc_dfm <- preproc(dfm_train, dfm_test, params)
dfm_train <- preproc_dfm$dfm_train
dfm_test <- preproc_dfm$dfm_test
}
idf <- preproc_dfm$idf
if (model == "nb") { if (model == "nb") {
text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial") text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
@ -80,7 +67,6 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
if (exists("final")) { if (exists("final")) {
return(list(text_model=text_model, idf=idf)) return(list(text_model=text_model, idf=idf))
} else { # Create a test set, and classify test items } else { # Create a test set, and classify test items
dfm_test <- dfm_weight(dfm_test, weights = idf)
# Use force=T to keep only features present in both training and test set # Use force=T to keep only features present in both training and test set
pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T) pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T)

@ -0,0 +1,42 @@
#' Preprocess dfm data for use in modeling procedure
#'
#' Process dfm according to parameters provided in params
#'
#' @param dfm_train Training dfm
#' @param dfm_test Testing dfm if applicable, otherwise NULL
#' @param params Row from grid with parameter optimization
#' @return List with dfm_train and dfm_test, processed according to parameters in params
#' @export
#' @examples
#' preproc(dfm_train, dfm_test = NULL, params)
#################################################################################################
#################################### Preprocess data ############################################
#################################################################################################
preproc <- function(dfm_train, dfm_test = NULL, params) {
# Remove non-existing features from training dfm
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
if (params$tfidf) {
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
if (!is.null(dfm_test)) {
dfm_test <- dfm_weight(dfm_test, weights = idf)
}
} else {
idf <- NULL
}
if ("feat_percentiles" %in% colnames(params) && "feat_measures" %in% colnames(params)) {
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
feat_select,
dfm = dfm_train,
class_type = params$class_type,
percentile = params$feat_percentiles,
measure = params$feat_measures
)))
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
}
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf))
}

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preproc.R
\name{preproc}
\alias{preproc}
\title{Preprocess dfm data for use in modeling procedure}
\usage{
preproc(dfm_train, dfm_test = NULL, params)
}
\arguments{
\item{dfm_train}{Training dfm}
\item{dfm_test}{Testing dfm if applicable, otherwise NULL}
\item{params}{Row from grid with parameter optimization}
}
\value{
List with dfm_train and dfm_test, processed according to parameters in params
}
\description{
Process dfm according to parameters provided in params
}
\examples{
preproc(dfm_train, dfm_test = NULL, params)
}
Loading…
Cancel
Save