cv_generator: generate folds for nested cv dfm_gen: added optional lowercasing parameter estimator: estimate model and performance based on parameters feat_select: select features based on textstat_keyness metric_gen: convert output from estimator to model performance metrics modelizer: updated for new pipeline modelizer_old: old model pipeline out_parser: now correctly exportedmaster
parent
e76a914dd2
commit
a3b6e19646
@ -1,3 +1,5 @@
|
||||
.Rproj.user
|
||||
.Rhistory
|
||||
.RData
|
||||
*.RData
|
||||
*.Rds
|
||||
|
@ -0,0 +1,64 @@
|
||||
#' Generate CV folds for nested cross-validation
|
||||
#'
|
||||
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
|
||||
#'
|
||||
#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data
|
||||
#' @param inner_k Number of inner CV (parameter optimization) folds
|
||||
#' @param dfm DFM containing the labeled documents
|
||||
#' @param class_type Name of the column in docvars containing the classification
|
||||
#' @param grid Parameter grid for optimization
|
||||
#' @param seed integer used as seed for random number generation
|
||||
#' @return A nested set of lists with row numbers
|
||||
#' @export
|
||||
#' @examples
|
||||
#' cv_generator(outer_k, inner_k, dfm, class_type)
|
||||
#################################################################################################
|
||||
#################################### Generate CV folds ##########################################
|
||||
#################################################################################################
|
||||
cv_generator <- function(outer_k, inner_k, dfm, class_type, grid, seed) {
|
||||
### Generate inner folds for nested cv
|
||||
inner_loop <- function(i, folds, dfm, inner_k, class_type, grid, seed) {
|
||||
# RNG needs to be set explicitly for each fold
|
||||
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
|
||||
inner_folds <- createFolds(as.factor(docvars(dfm[-folds[[i]],], class_type)), k= inner_k)
|
||||
grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i])
|
||||
return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i]))
|
||||
}
|
||||
|
||||
### Generate outer folds for nested cv
|
||||
generate_folds <- function(outer_k, inner_k, dfm, class_type, grid, seed){
|
||||
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
|
||||
if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization
|
||||
inner_folds <- createFolds(as.factor(docvars(dfm, class_type)), k= inner_k)
|
||||
grid <- crossing(grid, inner_fold = names(inner_folds))
|
||||
return(list(grid = grid,
|
||||
inner_folds = inner_folds))
|
||||
} else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k
|
||||
folds <- createDataPartition(as.factor(docvars(dfm, class_type)), p=outer_k)
|
||||
} else { # Do full nested CV
|
||||
folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k)
|
||||
}
|
||||
# Generate grid of hyperparameters for model optimization, and include inner folds row numbers
|
||||
grid_folds <- lapply(1:length(folds),
|
||||
inner_loop,
|
||||
folds = folds,
|
||||
dfm = dfm,
|
||||
inner_k = inner_k,
|
||||
class_type = class_type,
|
||||
grid = grid,
|
||||
seed = seed)
|
||||
|
||||
# Extract grid dataframe from results
|
||||
grid <- grid_folds %>% purrr::map(1) %>% dplyr::bind_rows()
|
||||
|
||||
# Extract row numbers for inner folds from results
|
||||
inner_folds <- grid_folds %>% purrr::map(2)
|
||||
|
||||
# Extract the names of the inner folds from results
|
||||
names(inner_folds) <- grid_folds %>% purrr::map(3) %>% unlist(.)
|
||||
return(list(grid = grid,
|
||||
outer_folds = folds,
|
||||
inner_folds = inner_folds))
|
||||
}
|
||||
return(generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed))
|
||||
}
|
@ -0,0 +1,96 @@
|
||||
#' Generate models and get classifications on test sets
|
||||
#'
|
||||
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
|
||||
#'
|
||||
#' @param row Row number of current item in grid
|
||||
#' @param grid Grid with model parameters and CV folds
|
||||
#' @param outer_folds List with row numbers for outer folds
|
||||
#' @param dfm DFM containing labeled documents
|
||||
#' @param class_type Name of column in docvars() containing the classes
|
||||
#' @param model Model to use (currently only nb)
|
||||
#' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.
|
||||
#' @export
|
||||
#' @examples
|
||||
#' estimator(row, grid, outer_folds, dfm, class_type, model)
|
||||
#################################################################################################
|
||||
#################################### Generate CV folds ##########################################
|
||||
#################################################################################################
|
||||
|
||||
### Classification function
|
||||
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model) {
|
||||
# Get parameters for current iteration
|
||||
params <- grid[row,]
|
||||
|
||||
# If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
|
||||
if ("inner_fold" %in% colnames(params) && "outer_fold" %in% colnames(params)) {
|
||||
dfm_train <- dfm[-outer_folds[[params$outer_fold]],] %>%
|
||||
.[-inner_folds[[params$outer_fold]][[params$inner_fold]],]
|
||||
dfm_test <- dfm[-outer_folds[[params$outer_fold]],] %>%
|
||||
.[inner_folds[[params$outer_fold]][[params$inner_fold]],]
|
||||
# If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
|
||||
} else if ("outer_fold" %in% colnames(params)) {
|
||||
dfm_train <- dfm[-outer_folds[[params$outer_fold]],]
|
||||
dfm_test <- dfm[outer_folds[[params$outer_fold]],]
|
||||
# If only inner folds, validate performance directly on inner folds
|
||||
} else if ("inner_fold" %in% colnames(params)) {
|
||||
dfm_train <- dfm[-inner_folds[[params$inner_fold]],]
|
||||
dfm_test <- dfm[inner_folds[[params$inner_fold]],]
|
||||
# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
|
||||
} else {
|
||||
final <- T ### Indicate final modeling run on whole dataset
|
||||
dfm_train <- dfm
|
||||
}
|
||||
## Currently scheme_tf is not used explicitly
|
||||
# if (model == 'nb') {
|
||||
# scheme_tf <- 'count' # The 'old' way
|
||||
# } else {
|
||||
# scheme_tf <- 'prop' # The 'new' way
|
||||
# }
|
||||
|
||||
### Getting features from training dataset
|
||||
# Getting idf from training data, and using it to normalize both training and testing feature occurence
|
||||
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
|
||||
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
|
||||
dfm_train <- dfm_weight(dfm_train, weights = idf)
|
||||
|
||||
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
|
||||
words <- unique(unlist(lapply(unique(docvars(dfm_train, class_type)),
|
||||
feat_select,
|
||||
dfm = dfm_train,
|
||||
class_type = class_type,
|
||||
percentile = params$percentiles,
|
||||
measure = params$measures
|
||||
)))
|
||||
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
|
||||
|
||||
|
||||
if (model == "nb") {
|
||||
text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
|
||||
}
|
||||
if (model == "svm") {
|
||||
text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
|
||||
}
|
||||
if (model == 'nnet') {
|
||||
idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
|
||||
text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
|
||||
}
|
||||
### Add more if statements for different models
|
||||
|
||||
# If training on whole dataset, return final model, and idf values from dataset
|
||||
if (exists("final")) {
|
||||
return(list(text_model=text_model, idf=idf))
|
||||
} else { # Create a test set, and classify test items
|
||||
dfm_test <- dfm_weight(dfm_test, weights = idf)
|
||||
# Use force=T to keep only features present in both training and test set
|
||||
pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T)
|
||||
|
||||
return(data.frame(
|
||||
tv = I(list(docvars(dfm_test, class_type))), # True values from test set
|
||||
pred = I(list(pred)), # Predictions of test set
|
||||
params, # Parameters used to generate classification model
|
||||
text_model = I(list(text_model)), # The classification model
|
||||
idf = I(list(idf)), # IDF of the training dataset used for model creation
|
||||
stringsAsFactors = F
|
||||
))
|
||||
}
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
#' Select features using quanteda textstat_keyness
|
||||
#'
|
||||
#' Select features based on the textstat_keyness function and a percentile cutoff
|
||||
#' Percentiles are based on absolute values i.e. both on words that are key and *not* key to the topic
|
||||
#'
|
||||
#' @param topic The topic to determine keywords for
|
||||
#' @param dfm The input dfm
|
||||
#' @param class_type Name of the column in docvars containing the classification
|
||||
#' @param percentile Cutoff for the list of words that should be returned
|
||||
#' @param measure Measure to use in determining keyness, default = chi2; see textstat_keyness for other options
|
||||
#' @return A vector of words that are key to the topic
|
||||
#' @export
|
||||
#' @examples
|
||||
#' feat_select(topic, dfm, class_type, percentile, measure="chi2")
|
||||
#################################################################################################
|
||||
#################################### Feature selection ##########################################
|
||||
#################################################################################################
|
||||
|
||||
feat_select <- function (topic, dfm, class_type, percentile, measure="chi2") {
|
||||
# Use quanteda textstat_keyness to determine feature importance
|
||||
keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
|
||||
na.omit()
|
||||
# Convert keyness values to absolute values, to take into account both positive and negative extremes
|
||||
keyness[,2] <- abs(keyness[,2])
|
||||
# Keep only the words with an absolute keyness value falling in the top [percentile] percentile
|
||||
keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
|
||||
return(keyness)
|
||||
}
|
@ -0,0 +1,47 @@
|
||||
#' Generate performance statistics for models
|
||||
#'
|
||||
#' Generate performance statistics for models, based on their predictions and the true values
|
||||
#'
|
||||
#' @param x A data frame containing at least the columns "pred" and "tv"
|
||||
#' @return x, with additional columns for performance metrics
|
||||
#' @export
|
||||
#' @examples
|
||||
#' metric_gen(x)
|
||||
#################################################################################################
|
||||
############################# Performance metric generation #####################################
|
||||
#################################################################################################
|
||||
|
||||
metric_gen <- function(x) {
|
||||
### Fix for missing classes in multiclass classification
|
||||
### Sorting u for easier interpretation of confusion matrix
|
||||
u <- as.character(sort(as.numeric(union(unlist(x$pred), unlist(x$tv)))))
|
||||
# Create a crosstable with predictions and true values
|
||||
class_table <- table(prediction = factor(unlist(x$pred), u), trueValues = factor(unlist(x$tv), u))
|
||||
|
||||
# When only two classes, set positive class explicitly as the class with the highest value
|
||||
if (length(unique(u)) == 2) {
|
||||
conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
|
||||
weighted_measures <- as.data.frame(conf_mat$byClass)
|
||||
macro_measures <- as.data.frame(conf_mat$byClass)
|
||||
} else {
|
||||
# Create a confusion matrix
|
||||
conf_mat <- confusionMatrix(class_table, mode = "everything")
|
||||
# Set "positive" value to NA, because not applicable
|
||||
conf_mat$positive <- NA
|
||||
# Compute weighted performance measures
|
||||
weighted_measures <- colSums(conf_mat$byClass * colSums(conf_mat$table))/sum(colSums(conf_mat$table))
|
||||
# Compute unweighted performance measures (divide by number of classes, each class equally important)
|
||||
macro_measures <- colSums(conf_mat$byClass)/nrow(conf_mat$byClass)
|
||||
# Replace NaN's by 0 when occurring
|
||||
weighted_measures[is.nan(weighted_measures)] <- 0
|
||||
macro_measures[is.nan(macro_measures)] <- 0
|
||||
}
|
||||
return(cbind(x,
|
||||
as.data.frame(t(conf_mat$overall)),
|
||||
'weighted' = t(as.data.frame(weighted_measures)),
|
||||
'macro' = t(as.data.frame(macro_measures)),
|
||||
pos_cat = conf_mat$positive,
|
||||
conf_mat = I(list(conf_mat))
|
||||
)
|
||||
)
|
||||
}
|
@ -0,0 +1,301 @@
|
||||
#' Generate a classification model
|
||||
#'
|
||||
#' Generate a nested cross validated classification model based on a dfm with class labels as docvars
|
||||
#' Currently only supports Naïve Bayes using quanteda's textmodel_nb
|
||||
#' Hyperparemeter optimization is enabled through the grid parameter
|
||||
#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command
|
||||
#' For Naïve Bayes, the following parameters can be used:
|
||||
#' - percentiles (cutoff point for tf-idf feature selection)
|
||||
#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
|
||||
#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
|
||||
#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds)
|
||||
#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)
|
||||
#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)
|
||||
#' @param cores_feats Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)
|
||||
#' @param seed Integer to use as seed for random number generation, ensures replicability
|
||||
#' @param outer_k Number of outer cross-validation folds (for performance estimation)
|
||||
#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
|
||||
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
|
||||
#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
|
||||
#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
|
||||
#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
|
||||
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
|
||||
#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
|
||||
#' @export
|
||||
#' @examples
|
||||
#' modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
|
||||
#################################################################################################
|
||||
#################################### Function to generate classification models #################
|
||||
#################################################################################################
|
||||
|
||||
modelizer_old <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
|
||||
### Functions ###
|
||||
feat_select <- function (topic, dfm, class_type, percentile,measure) {
|
||||
keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
|
||||
na.omit()
|
||||
keyness[,2] <- abs(keyness[,2])
|
||||
keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
|
||||
return(keyness)
|
||||
}
|
||||
### Generate inner folds for nested cv
|
||||
inner_loop <- function(fold, dfm, inner_k, class_type) {
|
||||
# RNG needs to be set explicitly for each fold
|
||||
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
|
||||
## Either createDataPartition for simple holdout parameter optimization
|
||||
## Or createFolds for proper inner CV for nested CV
|
||||
# if (inner_k <= 1) {
|
||||
# inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
|
||||
# } else {
|
||||
inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k= inner_k)
|
||||
# }
|
||||
return(c(outer_fold = list(fold),inner_folds))
|
||||
}
|
||||
|
||||
### Generate outer folds for nested cv
|
||||
generate_folds <- function(outer_k, inner_k, dfm, class_type){
|
||||
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
|
||||
folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k)
|
||||
return(lapply(folds,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type))
|
||||
}
|
||||
|
||||
### Gets called for every parameter combination, and calls classifier for every inner cv fold
|
||||
inner_cv <- function(row,grid,outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
|
||||
print(str_c('params ',row))
|
||||
params <- grid[row,]
|
||||
# For each inner fold, cross validate the specified parameters
|
||||
res <-
|
||||
bind_rows(mclapply(inner_folds,
|
||||
classifier,
|
||||
outer_fold = outer_fold,
|
||||
params = params,
|
||||
dfm = dfm,
|
||||
class_type = class_type,
|
||||
model = model,
|
||||
cores_feats = cores_feats,
|
||||
mc.cores = cores_inner
|
||||
)
|
||||
)
|
||||
# print(res)
|
||||
# print(res[1,1])
|
||||
# print('inner_cv')
|
||||
return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))),params))
|
||||
}
|
||||
|
||||
### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
|
||||
outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
|
||||
print('outer cv')
|
||||
# If fold contains both inner folds and outer fold
|
||||
if (length(fold) == inner_k + 1) {
|
||||
inner_folds <- fold[-1]
|
||||
outer_fold <- fold$outer_fold
|
||||
# For each row in grid, cross-validate results
|
||||
res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
|
||||
inner_cv,
|
||||
cores_feats= cores_feats,
|
||||
grid = grid,
|
||||
dfm = dfm,
|
||||
class_type = class_type,
|
||||
model = model,
|
||||
outer_fold = outer_fold,
|
||||
inner_folds = inner_folds,
|
||||
cores_inner = cores_inner,
|
||||
mc.cores = cores_grid)
|
||||
)
|
||||
# print(res)
|
||||
# print(res[1,1])
|
||||
# print('outer_cv')
|
||||
# Determine optimum hyperparameters within outer fold training set
|
||||
optimum <- res[which.max(res[,opt_measure]),] %>%
|
||||
select(percentiles: ncol(.))
|
||||
# Validate performance of optimum hyperparameters on outer fold test set
|
||||
return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats))
|
||||
} else {
|
||||
# If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters
|
||||
inner_folds <- fold
|
||||
outer_fold <- NULL
|
||||
res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
|
||||
inner_cv,
|
||||
cores_feats= cores_feats,
|
||||
grid = grid,
|
||||
dfm = dfm,
|
||||
class_type = class_type,
|
||||
model = model,
|
||||
outer_fold = outer_fold,
|
||||
inner_folds = inner_folds,
|
||||
cores_inner = cores_inner,
|
||||
mc.cores = cores_grid)
|
||||
)
|
||||
# print(res)
|
||||
# print(res[1,1])
|
||||
# print('line 126, final model parameter optimization')
|
||||
return(res)
|
||||
}
|
||||
}
|
||||
|
||||
# ### Custom tfidf function to allow same idf for different dfm's
|
||||
# custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
|
||||
# if (!nfeat(x) || !ndoc(x)) return(x)
|
||||
# tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
|
||||
# if (nfeat(x) != length(dfreq))
|
||||
# stop("missing some values in idf calculation")
|
||||
# # get the document indexes
|
||||
# j <- as(tfreq, "dgTMatrix")@j + 1
|
||||
# # replace just the non-zero values by product with idf
|
||||
# x@x <- tfreq@x * dfreq[j]
|
||||
# # record attributes
|
||||
#
|
||||
# ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2
|
||||
#
|
||||
# # x@weightTf <- tfreq@weightTf
|
||||
# # x@weightDf <- c(list(scheme = scheme_df, base = base), args)
|
||||
# return(x)
|
||||
# }
|
||||
|
||||
### Classification function
|
||||
classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
|
||||
# If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
|
||||
if (length(inner_fold) > 0 && length(outer_fold) > 0) {
|
||||
dfm_train <- dfm[-outer_fold,] %>%
|
||||
.[-inner_fold,]
|
||||
dfm_test <- dfm[-outer_fold,] %>%
|
||||
.[inner_fold,]
|
||||
# If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
|
||||
} else if (length(outer_fold) > 0 ) {
|
||||
dfm_train <- dfm[-outer_fold,]
|
||||
dfm_test <- dfm[outer_fold,]
|
||||
validation_cv <- T
|
||||
# If only inner folds, validate performance directly on inner folds (is the same as above?)
|
||||
} else if (length(inner_fold) > 0 ) {
|
||||
dfm_train <- dfm[-inner_fold,]
|
||||
dfm_test <- dfm[inner_fold,]
|
||||
# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
|
||||
} else {
|
||||
final <- T ### Indicate final modeling run on whole dataset
|
||||
dfm_train <- dfm
|
||||
}
|
||||
if (model == 'nb') {
|
||||
scheme_tf <- 'count' # The 'old' way
|
||||
} else {
|
||||
scheme_tf <- 'prop' # The 'new' way
|
||||
}
|
||||
### Getting features from training dataset
|
||||
# Getting idf from training data, and using it to normalize both training and testing feature occurence
|
||||
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
|
||||
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
|
||||
dfm_train <- dfm_weight(dfm_train, weights = idf)
|
||||
# Added unique to filter out duplicate words, these are caused when there are multiple categories, and a words scores higher
|
||||
# than the threshold on two or more of those categories
|
||||
words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
|
||||
feat_select,
|
||||
dfm = dfm_train,
|
||||
class_type = class_type,
|
||||
percentile = params$percentiles,
|
||||
measure = params$measures,
|
||||
mc.cores = cores_feats
|
||||
)))
|
||||
# dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
|
||||
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
|
||||
|
||||
|
||||
if (model == "nb") {
|
||||
text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
|
||||
}
|
||||
if (model == "svm") {
|
||||
text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
|
||||
}
|
||||
if (model == 'nnet') {
|
||||
idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
|
||||
text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
|
||||
}
|
||||
# if (model == 'neuralnet') {
|
||||
# dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
|
||||
# dfm_test <- custom_tfidf(dfm_test, scheme_tf = "prop", scheme_df = "inverse", base = 10, dfreq = dfreq[words])
|
||||
#
|
||||
# idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
|
||||
# colnames(idC) <- NULL
|
||||
# nn_train <- cbind(idC, dfm_train)
|
||||
# n <- colnames(nn_train)[3:length(colnames(nn_train))]
|
||||
# f <- as.formula(paste0("feat1 + feat2 ~ `", paste0(n, collapse = "` + `"),"`"))
|
||||
# idC_out <- class.ind(as.factor(docvars(dfm_test, class_type)))
|
||||
# colnames(idC_out) <- NULL
|
||||
# nn_test <- cbind(idC_out, dfm_test)
|
||||
# nn <- neuralnet(f,data=nn_train,hidden=3,linear.output=F,act.fct = 'logistic',lifesign = 'full', threshold = .005)
|
||||
# pr.nn <- compute(nn,nn_test[,3:length(colnames(nn_test))])
|
||||
# class_table <- table(prediction = as.matrix(round(pr.nn$net.result[,2])), trueValues = as.matrix(idC_out[,2]))
|
||||
# nn_conf_mat <- confusionMatrix(class_table, mode = "everything")
|
||||
# }
|
||||
### Add more if statements for different models
|
||||
if (exists("final")) {
|
||||
return(list(text_model=text_model, idf=idf))
|
||||
} else {
|
||||
### Removing all features not in training set from test set and weighting the remaining features according to training idf
|
||||
dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
|
||||
dfm_test <- dfm_weight(dfm_test, weights = idf)
|
||||
pred <- predict(text_model, newdata = dfm_test, type = 'class')
|
||||
### Fix for single-class 'predictions' in borderline situations
|
||||
# if (length(unique(pred)) == 1 & class_type == 'junk') {
|
||||
# if (unique(pred) == '0') {
|
||||
# pred[1] <- '1'
|
||||
# } else {
|
||||
# pred[1] <- '0'
|
||||
# }
|
||||
# }
|
||||
|
||||
### Fix for missing classes in multiclass classification
|
||||
u <- union(pred, docvars(dfm_test, class_type))
|
||||
|
||||
class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u))
|
||||
if (length(unique(u)) == 2) {
|
||||
conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
|
||||
} else {
|
||||
conf_mat <- confusionMatrix(class_table, mode = "everything")
|
||||
conf_mat$positive <- NA
|
||||
}
|
||||
if (exists("validation_cv")) {
|
||||
return(data.frame(
|
||||
tv = docvars(dfm_test, class_type),
|
||||
pred = pred,
|
||||
params = params,
|
||||
pos_cat = conf_mat$positive,
|
||||
stringsAsFactors = F
|
||||
))
|
||||
}
|
||||
if (is.matrix(conf_mat$byClass) == T) {
|
||||
return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params))
|
||||
} else {
|
||||
return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(conf_mat$byClass)),params, pos_cat = conf_mat$positive))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
### If outer_k is 1, do a holdout training run, with only cross-validation for parameter optimization, else, do nested CV
|
||||
### If holdout, training/test distribution is the same as for inner CV
|
||||
if (outer_k < 1) {
|
||||
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
|
||||
outer_fold <- createDataPartition(as.factor(docvars(dfm, class_type)), p=outer_k)
|
||||
folds <- lapply(outer_fold,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type)
|
||||
} else {
|
||||
## Generate nested CV folds, based on number of inner and outer folds defined (see start of script)
|
||||
folds <- generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type)
|
||||
}
|
||||
|
||||
## Get performance of each outer fold validation, and add row with mean scores (This is the final performance indicator)
|
||||
performance <- mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer)
|
||||
|
||||
## Set seed and generate folds for final hyperparameter optimization search (using CV)
|
||||
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
|
||||
folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k= inner_k))
|
||||
|
||||
## Get the final hyperparameter performance for all value combinations in grid
|
||||
params_final <- bind_rows(mclapply(folds_final, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer))
|
||||
|
||||
## Select optimum final hyperparameters
|
||||
optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>%
|
||||
select(percentiles: ncol(.))
|
||||
|
||||
## Estimate final model on whole dataset, using optimum final hyperparameters determined above
|
||||
model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer)))
|
||||
rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment())
|
||||
save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment())
|
||||
return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'))
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/cv_generator.R
|
||||
\name{cv_generator}
|
||||
\alias{cv_generator}
|
||||
\title{Generate CV folds for nested cross-validation}
|
||||
\usage{
|
||||
cv_generator(outer_k, inner_k, dfm, class_type, grid, seed)
|
||||
}
|
||||
\arguments{
|
||||
\item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data}
|
||||
|
||||
\item{inner_k}{Number of inner CV (parameter optimization) folds}
|
||||
|
||||
\item{dfm}{DFM containing the labeled documents}
|
||||
|
||||
\item{class_type}{Name of the column in docvars containing the classification}
|
||||
|
||||
\item{grid}{Parameter grid for optimization}
|
||||
|
||||
\item{seed}{integer used as seed for random number generation}
|
||||
}
|
||||
\value{
|
||||
A nested set of lists with row numbers
|
||||
}
|
||||
\description{
|
||||
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
|
||||
}
|
||||
\examples{
|
||||
cv_generator(outer_k, inner_k, dfm, class_type)
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/estimator.R
|
||||
\name{estimator}
|
||||
\alias{estimator}
|
||||
\title{Generate models and get classifications on test sets}
|
||||
\usage{
|
||||
estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
|
||||
}
|
||||
\arguments{
|
||||
\item{row}{Row number of current item in grid}
|
||||
|
||||
\item{grid}{Grid with model parameters and CV folds}
|
||||
|
||||
\item{outer_folds}{List with row numbers for outer folds}
|
||||
|
||||
\item{dfm}{DFM containing labeled documents}
|
||||
|
||||
\item{class_type}{Name of column in docvars() containing the classes}
|
||||
|
||||
\item{model}{Model to use (currently only nb)}
|
||||
}
|
||||
\value{
|
||||
Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.
|
||||
}
|
||||
\description{
|
||||
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
|
||||
}
|
||||
\examples{
|
||||
estimator(row, grid, outer_folds, dfm, class_type, model)
|
||||
}
|
@ -0,0 +1,29 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/feat_select.R
|
||||
\name{feat_select}
|
||||
\alias{feat_select}
|
||||
\title{Select features using quanteda textstat_keyness}
|
||||
\usage{
|
||||
feat_select(topic, dfm, class_type, percentile, measure = "chi2")
|
||||
}
|
||||
\arguments{
|
||||
\item{topic}{The topic to determine keywords for}
|
||||
|
||||
\item{dfm}{The input dfm}
|
||||
|
||||
\item{class_type}{Name of the column in docvars containing the classification}
|
||||
|
||||
\item{percentile}{Cutoff for the list of words that should be returned}
|
||||
|
||||
\item{measure}{Measure to use in determining keyness, default = chi2; see textstat_keyness for other options}
|
||||
}
|
||||
\value{
|
||||
A vector of words that are key to the topic
|
||||
}
|
||||
\description{
|
||||
Select features based on the textstat_keyness function and a percentile cutoff
|
||||
Percentiles are based on absolute values i.e. both on words that are key and *not* key to the topic
|
||||
}
|
||||
\examples{
|
||||
feat_select(topic, dfm, class_type, percentile, measure="chi2")
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/metric_gen.R
|
||||
\name{metric_gen}
|
||||
\alias{metric_gen}
|
||||
\title{Generate performance statistics for models}
|
||||
\usage{
|
||||
metric_gen(x)
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{A data frame containing at least the columns "pred" and "tv"}
|
||||
}
|
||||
\value{
|
||||
x, with additional columns for performance metrics
|
||||
}
|
||||
\description{
|
||||
Generate performance statistics for models, based on their predictions and the true values
|
||||
}
|
||||
\examples{
|
||||
metric_gen(x)
|
||||
}
|
@ -0,0 +1,51 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/modelizer_old.R
|
||||
\name{modelizer_old}
|
||||
\alias{modelizer_old}
|
||||
\title{Generate a classification model}
|
||||
\usage{
|
||||
modelizer_old(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
|
||||
outer_k, inner_k, model, class_type, opt_measure, country, grid)
|
||||
}
|
||||
\arguments{
|
||||
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
|
||||
|
||||
\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}
|
||||
|
||||
\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)}
|
||||
|
||||
\item{cores_inner}{Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)}
|
||||
|
||||
\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)}
|
||||
|
||||
\item{seed}{Integer to use as seed for random number generation, ensures replicability}
|
||||
|
||||
\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}
|
||||
|
||||
\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}
|
||||
|
||||
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
|
||||
|
||||
\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}
|
||||
|
||||
\item{opt_measure}{Label of measure in confusion matrix to use as performance indicator}
|
||||
|
||||
\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)}
|
||||
|
||||
\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
|
||||
}
|
||||
\value{
|
||||
An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
|
||||
}
|
||||
\description{
|
||||
Generate a nested cross validated classification model based on a dfm with class labels as docvars
|
||||
Currently only supports Naïve Bayes using quanteda's textmodel_nb
|
||||
Hyperparemeter optimization is enabled through the grid parameter
|
||||
A grid should be generated from vectors with the labels as described for each model, using the crossing() command
|
||||
For Naïve Bayes, the following parameters can be used:
|
||||
- percentiles (cutoff point for tf-idf feature selection)
|
||||
- measures (what measure to use for determining feature importance, see textstat_keyness for options)
|
||||
}
|
||||
\examples{
|
||||
modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
|
||||
}
|
Loading…
Reference in new issue