diff --git a/NAMESPACE b/NAMESPACE index 0878746..c7b163c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(metric_gen) export(modelizer) export(modelizer_old) export(out_parser) +export(preproc) export(query_gen_actors) export(query_string) export(ud_update) diff --git a/R/class_update.R b/R/class_update.R index 8e51472..596c1bb 100644 --- a/R/class_update.R +++ b/R/class_update.R @@ -20,8 +20,10 @@ ################################################################################################# class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) { print('updating') - dfm <- dfm_gen(out, text = text, words = words, clean = clean, cores = cores) %>% - dfm_weight(weights = model_final$idf) + dfm <- dfm_gen(out, text = text, words = words, clean = clean, cores = cores) + if (!is.null(model_final$idf)) { + dfm <- dfm_weight(dfm, weights = model_final$idf) + } pred <- data.frame(id = out$`_id`, pred = predict(model_final$text_model, newdata = dfm, type = "class", force = T)) bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set', ver = ver) res <- elastic_update(bulk, es_super = es_super, localhost = localhost) diff --git a/R/estimator.R b/R/estimator.R index 1242fa0..63ef19d 100644 --- a/R/estimator.R +++ b/R/estimator.R @@ -13,7 +13,7 @@ #' @examples #' estimator(row, grid, outer_folds, dfm, class_type, model) ################################################################################################# -#################################### Generate CV folds ########################################## +#################################### Generate models ############################################ ################################################################################################# ### Classification function @@ -40,29 +40,16 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod final <- T ### Indicate final modeling run on whole dataset dfm_train <- dfm } - ## Currently scheme_tf is not used explicitly - # if (model == 'nb') { - # scheme_tf <- 'count' # The 'old' way - # } else { - # scheme_tf <- 'prop' # The 'new' way - # } - - ### Getting features from training dataset - # Getting idf from training data, and using it to normalize both training and testing feature occurence - dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) - idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0) - dfm_train <- dfm_weight(dfm_train, weights = idf) - - # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select) - words <- unique(unlist(lapply(unique(docvars(dfm_train, class_type)), - feat_select, - dfm = dfm_train, - class_type = class_type, - percentile = params$percentiles, - measure = params$measures - ))) - dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T) + if (exists("final")) { + preproc_dfm <- preproc(dfm_train, NULL, params) + dfm_train <- preproc_dfm$dfm_train + } else { + preproc_dfm <- preproc(dfm_train, dfm_test, params) + dfm_train <- preproc_dfm$dfm_train + dfm_test <- preproc_dfm$dfm_test + } + idf <- preproc_dfm$idf if (model == "nb") { text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial") @@ -80,7 +67,6 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod if (exists("final")) { return(list(text_model=text_model, idf=idf)) } else { # Create a test set, and classify test items - dfm_test <- dfm_weight(dfm_test, weights = idf) # Use force=T to keep only features present in both training and test set pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T) diff --git a/R/preproc.R b/R/preproc.R new file mode 100644 index 0000000..c497bff --- /dev/null +++ b/R/preproc.R @@ -0,0 +1,42 @@ +#' Preprocess dfm data for use in modeling procedure +#' +#' Process dfm according to parameters provided in params +#' +#' @param dfm_train Training dfm +#' @param dfm_test Testing dfm if applicable, otherwise NULL +#' @param params Row from grid with parameter optimization +#' @return List with dfm_train and dfm_test, processed according to parameters in params +#' @export +#' @examples +#' preproc(dfm_train, dfm_test = NULL, params) +################################################################################################# +#################################### Preprocess data ############################################ +################################################################################################# +preproc <- function(dfm_train, dfm_test = NULL, params) { + # Remove non-existing features from training dfm + dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) + if (params$tfidf) { + idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0) + dfm_train <- dfm_weight(dfm_train, weights = idf) + if (!is.null(dfm_test)) { + dfm_test <- dfm_weight(dfm_test, weights = idf) + } + } else { + idf <- NULL + } + + if ("feat_percentiles" %in% colnames(params) && "feat_measures" %in% colnames(params)) { + + # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select) + words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)), + feat_select, + dfm = dfm_train, + class_type = params$class_type, + percentile = params$feat_percentiles, + measure = params$feat_measures + ))) + dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F) + } + + return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf)) +} diff --git a/man/preproc.Rd b/man/preproc.Rd new file mode 100644 index 0000000..ab30c92 --- /dev/null +++ b/man/preproc.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preproc.R +\name{preproc} +\alias{preproc} +\title{Preprocess dfm data for use in modeling procedure} +\usage{ +preproc(dfm_train, dfm_test = NULL, params) +} +\arguments{ +\item{dfm_train}{Training dfm} + +\item{dfm_test}{Testing dfm if applicable, otherwise NULL} + +\item{params}{Row from grid with parameter optimization} +} +\value{ +List with dfm_train and dfm_test, processed according to parameters in params +} +\description{ +Process dfm according to parameters provided in params +} +\examples{ +preproc(dfm_train, dfm_test = NULL, params) +}