mamlr/R/estimator.R

#' Generate models and get classifications on test sets
#'
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
#'
#' @param row Row number of current item in grid
#' @param grid Grid with model parameters and CV folds
#' @param outer_folds List with row numbers for outer folds
#' @param dfm DFM containing labeled documents
#' @param class_type Name of column in docvars() containing the classes
#' @param model Model to use (currently only nb)
#' @param we_vectors Matrix with word embedding vectors
#' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.
#' @export
#' @examples
#' estimator(row, grid, outer_folds, dfm, class_type, model)
#################################################################################################
#################################### Generate models ############################################
#################################################################################################

### Classification function
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) {
  # Get parameters for current iteration
  params <- grid[row,]

  # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
  if ("inner_fold" %in% colnames(params) && "outer_fold" %in% colnames(params))  {
    dfm_train <- dfm[-outer_folds[[params$outer_fold]],] %>%
      .[-inner_folds[[params$outer_fold]][[params$inner_fold]],]
    dfm_test <- dfm[-outer_folds[[params$outer_fold]],] %>%
      .[inner_folds[[params$outer_fold]][[params$inner_fold]],]
    # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
  } else if ("outer_fold" %in% colnames(params)) {
    dfm_train <- dfm[-outer_folds[[params$outer_fold]],]
    dfm_test <- dfm[outer_folds[[params$outer_fold]],]
    # If only inner folds, validate performance directly on inner folds
  } else if ("inner_fold" %in% colnames(params)) {
    dfm_train <- dfm[-inner_folds[[params$inner_fold]],]
    dfm_test <- dfm[inner_folds[[params$inner_fold]],]
    # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
  } else {
    dfm_test <- NULL
    dfm_train <- dfm
  }

  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
  if (params$tfidf) {
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_weight(dfm_test, weights = idf)
    }
  } else {
    idf <- NULL
  }

  if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {

    # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
    words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
                                  feat_select,
                                  dfm = dfm_train,
                                  class_type = params$class_type,
                                  percentile = params$feat_percentiles,
                                  measure = params$feat_measures
    )))
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F)
    }
  }

  if (model == "nb") {
    text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
  }
  if (model == "svm") {
    text_model <- svm(x=as.matrix(train_data), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
  }
  # if (model == 'nnet') {
  #   idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
  #   text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
  # }
  ### Add more if statements for different models

  # If training on whole dataset, return final model, and idf values from dataset
  if (is.null(dfm_test)) {
    return(list(text_model=text_model, idf=idf))
  } else { # Create a test set, and classify test items
    # Use force=T to keep only features present in both training and test set
    pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T)

    return(data.frame(
      tv = I(list(docvars(dfm_test, class_type))), # True values from test set
      pred = I(list(pred)), # Predictions of test set
      params, # Parameters used to generate classification model
      text_model = I(list(text_model)), # The classification model
      idf = I(list(idf)), # IDF of the training dataset used for model creation
      stringsAsFactors = F
    ))
  }
}
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago			`#' Generate models and get classifications on test sets`
			`#'`
			`#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination`
			`#'`
			`#' @param row Row number of current item in grid`
			`#' @param grid Grid with model parameters and CV folds`
			`#' @param outer_folds List with row numbers for outer folds`
			`#' @param dfm DFM containing labeled documents`
			`#' @param class_type Name of column in docvars() containing the classes`
			`#' @param model Model to use (currently only nb)`
estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R 4 years ago			`#' @param we_vectors Matrix with word embedding vectors`
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago			`#' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.`
			`#' @export`
			`#' @examples`
			`#' estimator(row, grid, outer_folds, dfm, class_type, model)`
			`#################################################################################################`
separated data preprocessing routines class_update: check if there are idf values associated with model, before applying weights estimator: make use of preproc() function for data preprocessing preproc: function containing all logic with regards to text data preprocessing and weighting 5 years ago			`#################################### Generate models ############################################`
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago			`#################################################################################################`

			`### Classification function`
estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R 4 years ago			`estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) {`
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago			`# Get parameters for current iteration`
			`params <- grid[row,]`

			`# If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance`
			`if ("inner_fold" %in% colnames(params) && "outer_fold" %in% colnames(params)) {`
			`dfm_train <- dfm[-outer_folds[[params$outer_fold]],] %>%`
			`.[-inner_folds[[params$outer_fold]][[params$inner_fold]],]`
			`dfm_test <- dfm[-outer_folds[[params$outer_fold]],] %>%`
			`.[inner_folds[[params$outer_fold]][[params$inner_fold]],]`
			`# If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data`
			`} else if ("outer_fold" %in% colnames(params)) {`
			`dfm_train <- dfm[-outer_folds[[params$outer_fold]],]`
			`dfm_test <- dfm[outer_folds[[params$outer_fold]],]`
			`# If only inner folds, validate performance directly on inner folds`
			`} else if ("inner_fold" %in% colnames(params)) {`
			`dfm_train <- dfm[-inner_folds[[params$inner_fold]],]`
			`dfm_test <- dfm[inner_folds[[params$inner_fold]],]`
			`# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model`
			`} else {`
removed multicore support, added parameters for dfm_gen 4 years ago			`dfm_test <- NULL`
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago			`dfm_train <- dfm`
			`}`

estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R 4 years ago			`dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)`
			`if (params$tfidf) {`
			`idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)`
			`dfm_train <- dfm_weight(dfm_train, weights = idf)`
			`if (!is.null(dfm_test)) {`
			`dfm_test <- dfm_weight(dfm_test, weights = idf)`
			`}`
separated data preprocessing routines class_update: check if there are idf values associated with model, before applying weights estimator: make use of preproc() function for data preprocessing preproc: function containing all logic with regards to text data preprocessing and weighting 5 years ago			`} else {`
estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R 4 years ago			`idf <- NULL`
			`}`

			`if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {`

			`# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)`
			`words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),`
			`feat_select,`
			`dfm = dfm_train,`
			`class_type = params$class_type,`
			`percentile = params$feat_percentiles,`
			`measure = params$feat_measures`
			`)))`
			`dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)`
			`if (!is.null(dfm_test)) {`
			`dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F)`
			`}`
separated data preprocessing routines class_update: check if there are idf values associated with model, before applying weights estimator: make use of preproc() function for data preprocessing preproc: function containing all logic with regards to text data preprocessing and weighting 5 years ago			`}`
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago
			`if (model == "nb") {`
			`text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")`
			`}`
			`if (model == "svm") {`
estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R 4 years ago			`text_model <- svm(x=as.matrix(train_data), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)`
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago			`}`
estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R 4 years ago			`# if (model == 'nnet') {`
			`# idC <- class.ind(as.factor(docvars(dfm_train, class_type)))`
			`# text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size(length(dfm_train@Dimnames$features)+1)+(params$size2)+2)`
			`# }`
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago			`### Add more if statements for different models`

			`# If training on whole dataset, return final model, and idf values from dataset`
removed multicore support, added parameters for dfm_gen 4 years ago			`if (is.null(dfm_test)) {`
revised modeling pipeline: cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exported 5 years ago			`return(list(text_model=text_model, idf=idf))`
			`} else { # Create a test set, and classify test items`
			`# Use force=T to keep only features present in both training and test set`
			`pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T)`

			`return(data.frame(`
			`tv = I(list(docvars(dfm_test, class_type))), # True values from test set`
			`pred = I(list(pred)), # Predictions of test set`
			`params, # Parameters used to generate classification model`
			`text_model = I(list(text_model)), # The classification model`
			`idf = I(list(idf)), # IDF of the training dataset used for model creation`
			`stringsAsFactors = F`
			`))`
			`}`
			`}`