estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R

5 years ago · 5de4e1488c
parent 77eb51a1bf
commit 5de4e1488c
3 changed files with 72 additions and 27 deletions
--- a/R/estimator.R
+++ b/R/estimator.R
@ -8,6 +8,7 @@
 #' @param dfm DFM containing labeled documents
 #' @param class_type Name of column in docvars() containing the classes
 #' @param model Model to use (currently only nb)
 #' @param we_vectors Matrix with word embedding vectors
 #' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.
 #' @export
 #' @examples
@ -17,7 +18,7 @@
 #################################################################################################
 ### Classification function
-estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model) {
+estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) {
  # Get parameters for current iteration
  params <- grid[row,]
@ -41,26 +42,43 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
    dfm_train <- dfm
  }
-  if (exists("final")) {
+  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
-    preproc_dfm <- preproc(dfm_train, NULL, params)
+  if (params$tfidf) {
-    dfm_train <- preproc_dfm$dfm_train
+    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_weight(dfm_test, weights = idf)
    }
  } else {
-    preproc_dfm <- preproc(dfm_train, dfm_test, params)
+    idf <- NULL
-    dfm_train <- preproc_dfm$dfm_train
+  }
-    dfm_test <- preproc_dfm$dfm_test
+
  if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
    # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
    words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
                                  feat_select,
                                  dfm = dfm_train,
                                  class_type = params$class_type,
                                  percentile = params$feat_percentiles,
                                  measure = params$feat_measures
    )))
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F)
    }
  }
  idf <- preproc_dfm$idf
  if (model == "nb") {
    text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
  }
  if (model == "svm") {
-    text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
+    text_model <- svm(x=as.matrix(train_data), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
  }
  if (model == 'nnet') {
    idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
    text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
  }
  # if (model == 'nnet') {
  #   idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
  #   text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
  # }
  ### Add more if statements for different models
  # If training on whole dataset, return final model, and idf values from dataset
--- a/R/modelizer.R
+++ b/R/modelizer.R
@ -16,8 +16,9 @@
 #' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
 #' @param seed Integer to use as seed for random number generation, ensures replicability
 #' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
 #' @param we_vectors Matrix with word embedding vectors
 #' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1
-#' @return An .Rds file in the current working directory (getwd()) with a list containing all relevant output
+#' @return A list containing all relevant output
 #' @export
 #' @examples
 #' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1)
@ -25,9 +26,9 @@
 #################################### Function to generate classification models #################
 #################################################################################################
-modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) {
+modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, we_vectors, cores = 1) {
  ## Generate list containing outer folds row numbers, inner folds row numbers, and grid for model building
-  folds <- cv_generator(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)
+  folds <- cv_generator(outer_k,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed)
  inner_grid <- folds$grid
  outer_folds <- folds$outer_folds
  inner_folds <- folds$inner_folds
@ -42,6 +43,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
                                   inner_folds = inner_folds,
                                   dfm = dfm,
                                   class_type = class_type,
                                   we_vectors = we_vectors,
                                   model = model) %>%
    future_lapply(.,metric_gen) %>% # Generate model performance metrics for each grid row
    bind_rows(.)
@ -66,12 +68,13 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
                                   inner_folds = NULL,
                                   dfm = dfm,
                                   class_type = class_type,
                                   we_vectors = we_vectors,
                                   model = model) %>%
    future_lapply(., metric_gen) %>% # Generate performance metrics for each row in outer_grid
    bind_rows(.)
  # Create (inner) folds for parameter optimization on the entire dataset
-  final_folds <- cv_generator(NULL,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)
+  final_folds <- cv_generator(NULL,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed)
  final_grid <- final_folds$grid
  final_inner <- final_folds$inner_folds
@ -82,6 +85,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
                                   inner_folds = final_inner,
                                   dfm = dfm,
                                   class_type = class_type,
                                   we_vectors = we_vectors,
                                   model = model) %>%
    future_lapply(.,metric_gen) %>% # Generate performance metrics for each row in final_grid
    bind_rows(.)
@ -118,12 +122,6 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
                 country = country,
                 class_type = class_type)
-  # Set name for output .Rds file
+  # Return ouput
-  filename <- paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.Rds')
+  return(output)
  # Save RDS file with output
  saveRDS(output, file = filename)
  # Return ouput RDS filename
  return(filename)
 }
--- a/R/preproc.R
+++ b/R/preproc.R
@ -5,6 +5,7 @@
 #' @param dfm_train Training dfm
 #' @param dfm_test Testing dfm if applicable, otherwise NULL
 #' @param params Row from grid with parameter optimization
 #' @param we_vectors Matrix with word embedding vectors
 #' @return List with dfm_train and dfm_test, processed according to parameters in params
 #' @export
 #' @examples
@ -12,7 +13,11 @@
 #################################################################################################
 #################################### Preprocess data ############################################
 #################################################################################################
-preproc <- function(dfm_train, dfm_test = NULL, params) {
+
 ### CURRENTLY UNUSED!!!###
 preproc <- function(dfm_train, dfm_test = NULL, params, we_vectors) {
  # Remove non-existing features from training dfm
  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
  if (params$tfidf) {
@ -25,7 +30,7 @@ preproc <- function(dfm_train, dfm_test = NULL, params) {
    idf <- NULL
  }
-  if ("feat_percentiles" %in% colnames(params) && "feat_measures" %in% colnames(params)) {
+  if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
    # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
    words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
@ -38,5 +43,29 @@ preproc <- function(dfm_train, dfm_test = NULL, params) {
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
  }
  if (!is.null(we_vectors)) {
    shared_dict <- sort(intersect(dfm_train@Dimnames$features,we_vectors$V1))
    if (!is.null(dfm_test)) {
      shared_dict <- sort(intersect(dfm_test@Dimnames$features,shared_dict))
      dfm_test <- dfm_keep(dfm_test, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
        .[, sort(colnames(.))]
    }
    dfm_train <- dfm_keep(dfm_train, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
      .[, sort(colnames(.))]
    we_matrix <- filter(we_vectors, V1 %in% shared_dict) %>%
      arrange(V1) %>%
      as.data.table(.) %>%
      .[,2:ncol(.), with = F] %>%
      as.matrix(.)
    dfm_train_we_sum <- dfm_train %*% we_matrix
    # dfm_train_we_mean <- dfm_train_we_sum / as.vector(rowSums(dfm_train))
    if (!is.null(dfm_test)) {
      dfm_test_we_sum <- dfm_test %*% we_matrix
      # dfm_test_we_mean <- dfm_test_we_sum / as.vector(rowSums(dfm_test))
    }
    return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf, dfm_train_we = dfm_train_we_sum, dfm_test_we = dfm_test_we_sum))
  }
  return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf))
 }