revised modeling pipeline:

cv_generator: generate folds for nested cv
dfm_gen: added optional lowercasing parameter
estimator: estimate model and performance based on parameters
feat_select: select features based on textstat_keyness
metric_gen: convert output from estimator to model performance metrics
modelizer: updated for new pipeline
modelizer_old: old model pipeline
out_parser: now correctly exported
master
Your Name 5 years ago
parent e76a914dd2
commit a3b6e19646

.gitignore

@ -1,3 +1,5 @@
.Rproj.user
.Rhistory
.RData
*.RData
*.Rds

NAMESPACE

@ -4,13 +4,19 @@ export(actor_fetcher)
export(actorizer)
export(bulk_writer)
export(class_update)
export(cv_generator)
export(dfm_gen)
export(dupe_detect)
export(elastic_update)
export(elasticizer)
export(estimator)
export(feat_select)
export(lemma_writer)
export(merger)
export(metric_gen)
export(modelizer)
export(modelizer_old)
export(out_parser)
export(query_gen_actors)
export(query_string)
export(ud_update)

R/cv_generator.R

@ -0,0 +1,64 @@
#' Generate CV folds for nested cross-validation
#'
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
#'
#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1, holdout sampling is used, with outer_k being the proportion of the data used as test set
#' @param inner_k Number of inner CV (parameter optimization) folds
#' @param dfm DFM containing the labeled documents
#' @param class_type Name of the column in docvars containing the classification
#' @param grid Parameter grid for optimization
#' @param seed Integer used as seed for random number generation
#' @return A nested set of lists with row numbers
#' @export
#' @examples
#' cv_generator(outer_k, inner_k, dfm, class_type, grid, seed)
#################################################################################################
#################################### Generate CV folds ##########################################
#################################################################################################
cv_generator <- function(outer_k, inner_k, dfm, class_type, grid, seed) {
  ### Generate inner folds for nested cv
  inner_loop <- function(i, folds, dfm, inner_k, class_type, grid, seed) {
    # RNG needs to be set explicitly for each fold
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    inner_folds <- createFolds(as.factor(docvars(dfm[-folds[[i]],], class_type)), k = inner_k)
    grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i])
    return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i]))
  }
  ### Generate outer folds for nested cv
  generate_folds <- function(outer_k, inner_k, dfm, class_type, grid, seed) {
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization
      inner_folds <- createFolds(as.factor(docvars(dfm, class_type)), k = inner_k)
      grid <- crossing(grid, inner_fold = names(inner_folds))
      return(list(grid = grid,
                  inner_folds = inner_folds))
    } else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k
      folds <- createDataPartition(as.factor(docvars(dfm, class_type)), p = outer_k)
    } else { # Do full nested CV
      folds <- createFolds(as.factor(docvars(dfm, class_type)), k = outer_k)
    }
    # Generate grid of hyperparameters for model optimization, and include inner folds row numbers
    grid_folds <- lapply(1:length(folds),
                         inner_loop,
                         folds = folds,
                         dfm = dfm,
                         inner_k = inner_k,
                         class_type = class_type,
                         grid = grid,
                         seed = seed)
    # Extract grid dataframe from results
    grid <- grid_folds %>% purrr::map(1) %>% dplyr::bind_rows()
    # Extract row numbers for inner folds from results
    inner_folds <- grid_folds %>% purrr::map(2)
    # Extract the names of the inner folds from results
    names(inner_folds) <- grid_folds %>% purrr::map(3) %>% unlist(.)
    return(list(grid = grid,
                outer_folds = folds,
                inner_folds = inner_folds))
  }
  return(generate_folds(outer_k, inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed))
}
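A minimal usage sketch (illustrative, not part of the commit): dfm_labeled is a hypothetical labeled dfm (e.g. produced by dfm_gen()), and the package dependencies caret, tidyr, dplyr and quanteda are assumed to be attached.

library(caret)     # createFolds, createDataPartition
library(tidyr)     # crossing
library(dplyr)     # %>% pipe
library(quanteda)  # docvars
grid <- crossing(percentiles = c(0.90, 0.95), measures = "chi2")
folds <- cv_generator(outer_k = 3, inner_k = 5, dfm = dfm_labeled,
                      class_type = "junk", grid = grid, seed = 42)
nrow(folds$grid)  # 2 parameter combinations x 3 outer folds x 5 inner folds = 30 model runs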

R/dfm_gen.R

@ -6,6 +6,7 @@
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
#' @param tolower Boolean indicating whether dfm features should be lowercased
#' @return A Quanteda dfm
#' @export
#' @examples
@ -18,7 +19,7 @@
#      filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1, tolower = T) {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
@ -62,6 +63,6 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1) {
    }
  }
  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
    dfm(tolower = tolower, stem = F, remove_punct = T, valuetype = "regex")
  return(dfm)
}
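A toy sketch of what the new tolower switch controls, using the quanteda 1.x dfm() API matching the call above:

library(quanteda)
corp <- corpus(c(d1 = "The EU suspended the Talks"))
featnames(dfm(corp, tolower = TRUE))   # "the" "eu" "suspended" "talks"
featnames(dfm(corp, tolower = FALSE))  # case preserved: "The" and "the" stay distinct features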

R/estimator.R

@ -0,0 +1,96 @@
#' Generate models and get classifications on test sets
#'
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
#'
#' @param row Row number of current item in grid
#' @param grid Grid with model parameters and CV folds
#' @param outer_folds List with row numbers for outer folds
#' @param inner_folds List with row numbers for inner folds
#' @param dfm DFM containing labeled documents
#' @param class_type Name of column in docvars() containing the classes
#' @param model Model to use ("nb", "svm" or "nnet")
#' @return Dependent on mode: if folds are included, returns true and predicted classes of the test set, together with the parameters, model and model idf. When no folds are given, returns the final model and idf values.
#' @export
#' @examples
#' estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
#################################################################################################
###################################### Model estimation #########################################
#################################################################################################
### Classification function
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model) {
  # Get parameters for current iteration
  params <- grid[row,]
  # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
  if ("inner_fold" %in% colnames(params) && "outer_fold" %in% colnames(params)) {
    dfm_train <- dfm[-outer_folds[[params$outer_fold]],] %>%
      .[-inner_folds[[params$outer_fold]][[params$inner_fold]],]
    dfm_test <- dfm[-outer_folds[[params$outer_fold]],] %>%
      .[inner_folds[[params$outer_fold]][[params$inner_fold]],]
  # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
  } else if ("outer_fold" %in% colnames(params)) {
    dfm_train <- dfm[-outer_folds[[params$outer_fold]],]
    dfm_test <- dfm[outer_folds[[params$outer_fold]],]
  # If only inner folds, validate performance directly on inner folds
  } else if ("inner_fold" %in% colnames(params)) {
    dfm_train <- dfm[-inner_folds[[params$inner_fold]],]
    dfm_test <- dfm[inner_folds[[params$inner_fold]],]
  # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
  } else {
    final <- T ### Indicate final modeling run on whole dataset
    dfm_train <- dfm
  }
  ## Currently scheme_tf is not used explicitly
  # if (model == 'nb') {
  #   scheme_tf <- 'count' # The 'old' way
  # } else {
  #   scheme_tf <- 'prop' # The 'new' way
  # }
  ### Getting features from training dataset
  # Getting idf from training data, and using it to normalize both training and testing feature occurrence
  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
  idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
  dfm_train <- dfm_weight(dfm_train, weights = idf)
  # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
  words <- unique(unlist(lapply(unique(docvars(dfm_train, class_type)),
                                feat_select,
                                dfm = dfm_train,
                                class_type = class_type,
                                percentile = params$percentiles,
                                measure = params$measures
                                )))
  dfm_train <- dfm_keep(dfm_train, words, valuetype = "fixed", verbose = T)
  if (model == "nb") {
    text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
  }
  if (model == "svm") {
    text_model <- svm(x = dfm_train, y = as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
  }
  if (model == 'nnet') {
    idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
    text_model <- nnet(dfm_train, idC, decay = params$decay, size = params$size, maxit = params$maxit, softmax = T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
  }
  ### Add more if statements for different models
  # If training on whole dataset, return final model, and idf values from dataset
  if (exists("final")) {
    return(list(text_model = text_model, idf = idf))
  } else { # Create a test set, and classify test items
    dfm_test <- dfm_weight(dfm_test, weights = idf)
    # Use force=T to keep only features present in both training and test set
    pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T)
    return(data.frame(
      tv = I(list(docvars(dfm_test, class_type))), # True values from test set
      pred = I(list(pred)), # Predictions of test set
      params, # Parameters used to generate classification model
      text_model = I(list(text_model)), # The classification model
      idf = I(list(idf)), # IDF of the training dataset used for model creation
      stringsAsFactors = F
    ))
  }
}
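A hypothetical single-row call, reusing the folds list and grid produced by cv_generator() and the hypothetical dfm_labeled from the earlier sketch:

res <- estimator(row = 1, grid = folds$grid,
                 outer_folds = folds$outer_folds, inner_folds = folds$inner_folds,
                 dfm = dfm_labeled, class_type = "junk", model = "nb")
res$tv[[1]]    # true classes of this inner-fold test set
res$pred[[1]]  # predicted classes for the same documents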

R/feat_select.R

@ -0,0 +1,28 @@
#' Select features using quanteda textstat_keyness
#'
#' Select features based on the textstat_keyness function and a percentile cutoff
#' Percentiles are based on absolute values, i.e. both on words that are key and *not* key to the topic
#'
#' @param topic The topic to determine keywords for
#' @param dfm The input dfm
#' @param class_type Name of the column in docvars containing the classification
#' @param percentile Cutoff for the list of words that should be returned
#' @param measure Measure to use in determining keyness, default = chi2; see textstat_keyness for other options
#' @return A vector of words that are key to the topic
#' @export
#' @examples
#' feat_select(topic, dfm, class_type, percentile, measure = "chi2")
#################################################################################################
#################################### Feature selection ##########################################
#################################################################################################
feat_select <- function (topic, dfm, class_type, percentile, measure = "chi2") {
  # Use quanteda textstat_keyness to determine feature importance
  keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
    na.omit()
  # Convert keyness values to absolute values, to take into account both positive and negative extremes
  keyness[,2] <- abs(keyness[,2])
  # Keep only the words with an absolute keyness value falling in the top [percentile] percentile
  keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]), percentile))$feature
  return(keyness)
}
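A self-contained toy sketch (hypothetical data; quanteda 1.x and dplyr assumed attached) showing the percentile cutoff: with percentile = 0.5, only words whose absolute chi2 keyness exceeds the median are returned.

library(quanteda)
library(dplyr)  # filter() used inside feat_select
texts <- c("economy growth market", "economy tax budget",
           "football match goal", "match referee goal")
toy <- dfm(corpus(texts, docvars = data.frame(junk = c(0, 0, 1, 1))))
feat_select(topic = 1, dfm = toy, class_type = "junk", percentile = 0.5)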

R/metric_gen.R

@ -0,0 +1,47 @@
#' Generate performance statistics for models
#'
#' Generate performance statistics for models, based on their predictions and the true values
#'
#' @param x A data frame containing at least the columns "pred" and "tv"
#' @return x, with additional columns for performance metrics
#' @export
#' @examples
#' metric_gen(x)
#################################################################################################
############################# Performance metric generation #####################################
#################################################################################################
metric_gen <- function(x) {
  ### Fix for missing classes in multiclass classification
  ### Sorting u for easier interpretation of confusion matrix
  u <- as.character(sort(as.numeric(union(unlist(x$pred), unlist(x$tv)))))
  # Create a crosstable with predictions and true values
  class_table <- table(prediction = factor(unlist(x$pred), u), trueValues = factor(unlist(x$tv), u))
  # When only two classes, set positive class explicitly as the class with the highest value
  if (length(unique(u)) == 2) {
    conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
    weighted_measures <- as.data.frame(conf_mat$byClass)
    macro_measures <- as.data.frame(conf_mat$byClass)
  } else {
    # Create a confusion matrix
    conf_mat <- confusionMatrix(class_table, mode = "everything")
    # Set "positive" value to NA, because not applicable
    conf_mat$positive <- NA
    # Compute weighted performance measures
    weighted_measures <- colSums(conf_mat$byClass * colSums(conf_mat$table))/sum(colSums(conf_mat$table))
    # Compute unweighted performance measures (divide by number of classes, each class equally important)
    macro_measures <- colSums(conf_mat$byClass)/nrow(conf_mat$byClass)
    # Replace NaN's by 0 when occurring
    weighted_measures[is.nan(weighted_measures)] <- 0
    macro_measures[is.nan(macro_measures)] <- 0
  }
  return(cbind(x,
               as.data.frame(t(conf_mat$overall)),
               'weighted' = t(as.data.frame(weighted_measures)),
               'macro' = t(as.data.frame(macro_measures)),
               pos_cat = conf_mat$positive,
               conf_mat = I(list(conf_mat))
               )
         )
}
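A minimal worked example, assuming caret is attached; the single row mimics the list-columns produced by estimator():

library(caret)  # confusionMatrix
x <- data.frame(tv = I(list(c(0, 1, 1, 0))), pred = I(list(c(0, 1, 0, 0))))
metrics <- metric_gen(x)
metrics$Accuracy  # 0.75: three of the four test documents were classified correctly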

R/modelizer.R

@ -8,293 +8,122 @@
#' - percentiles (cutoff point for tf-idf feature selection)
#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
#' @param outer_k Number of outer cross-validation folds (for performance estimation)
#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1
#' @return An .Rds file in the current working directory (getwd()) with a list containing all relevant output
#' @export
#' @examples
#' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1)
#################################################################################################
#################################### Function to generate classification models #################
#################################################################################################
modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) {
  ## Generate list containing outer folds row numbers, inner folds row numbers, and grid for model building
  folds <- cv_generator(outer_k, inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)
  inner_grid <- folds$grid
  outer_folds <- folds$outer_folds
  inner_folds <- folds$inner_folds

  ## Create multithread work pool for future_lapply
  plan(strategy = multiprocess, workers = cores)

  ## Use estimator function to build models for every parameter combination in grid
  inner_cv_output <- future_lapply(1:nrow(inner_grid), estimator,
                                   grid = inner_grid,
                                   outer_folds = outer_folds,
                                   inner_folds = inner_folds,
                                   dfm = dfm,
                                   class_type = class_type,
                                   model = model) %>%
    future_lapply(., metric_gen) %>% # Generate model performance metrics for each grid row
    bind_rows(.)

  outer_grid <- inner_cv_output %>%
    # Group by outer folds, and by parameters used for model tuning
    group_by_at(c("outer_fold", colnames(grid))) %>%
    # Get mean values for all numeric (performance indicator) variables
    summarise_if(is.numeric, mean, na.rm = F) %>%
    # Group the mean performance indicators by outer_fold
    group_by(outer_fold) %>%
    # Get for each outer_fold the row with the highest value of opt_measure
    slice(which.max((!!as.name(opt_measure)))) %>%
    # Select only the columns outer_fold, and the columns that are in the original parameter grid
    select(outer_fold, colnames(grid))

  # Use the estimator function to build optimum models for each outer_fold
  outer_cv_output <- future_lapply(1:nrow(outer_grid), estimator,
                                   grid = outer_grid,
                                   outer_folds = outer_folds,
                                   inner_folds = NULL,
                                   dfm = dfm,
                                   class_type = class_type,
                                   model = model) %>%
    future_lapply(., metric_gen) %>% # Generate performance metrics for each row in outer_grid
    bind_rows(.)

  # Create (inner) folds for parameter optimization on the entire dataset
  final_folds <- cv_generator(NULL, inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)
  final_grid <- final_folds$grid
  final_inner <- final_folds$inner_folds

  # Use the estimator function to estimate the performance of each row in final_grid
  final_cv_output <- future_lapply(1:nrow(final_grid), estimator,
                                   grid = final_grid,
                                   outer_folds = NULL,
                                   inner_folds = final_inner,
                                   dfm = dfm,
                                   class_type = class_type,
                                   model = model) %>%
    future_lapply(., metric_gen) %>% # Generate performance metrics for each row in final_grid
    bind_rows(.)

  final_params <- final_cv_output %>%
    # Group final parameter optimization cv results by parameters used for optimization
    group_by_at(colnames(grid)) %>%
    # Get mean performance metrics for each fold
    summarise_if(is.numeric, mean, na.rm = F) %>%
    # Ungroup to allow for slicing
    ungroup() %>%
    # Select row with highest value of opt_measure
    slice(which.max((!!as.name(opt_measure)))) %>%
    # Keep only the columns that are present in the original parameter grid
    select(colnames(grid))

  # Use the estimator function to estimate the final model, using the optimum parameters provided in final_params
  model_final <- estimator(1,
                           grid = final_params,
                           outer_folds = NULL,
                           inner_folds = NULL,
                           dfm = dfm,
                           class_type = class_type,
                           model = model)

  # Create list with output variables
  output <- list(final_cv_output = final_cv_output,
                 final_params = final_params,
                 outer_cv_output = outer_cv_output,
                 model_final = model_final,
                 grid = grid,
                 seed = seed,
                 opt_measure = opt_measure,
                 model = model,
                 country = country,
                 class_type = class_type)

  # Set name for output .Rds file
  filename <- paste0(getwd(), '/', country, '_', model, '_', class_type, '_', opt_measure, '_', Sys.time(), '.Rds')
  # Save RDS file with output
  saveRDS(output, file = filename)

  # Return output .Rds filename
  return(filename)
}
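A hypothetical end-to-end run of the revised pipeline: dfm_labeled is assumed, and opt_measure must name a numeric metric column produced by metric_gen() (e.g. "Accuracy" from caret's overall statistics).

grid <- tidyr::crossing(percentiles = c(0.90, 0.95), measures = "chi2")
out_file <- modelizer(dfm_labeled, outer_k = 3, inner_k = 5, class_type = "junk",
                      opt_measure = "Accuracy", country = "no", grid = grid,
                      seed = 42, model = "nb", cores = 4)
results <- readRDS(out_file)
results$final_params     # optimum hyperparameters chosen on the full dataset
results$outer_cv_output  # held-out performance of the tuned model in each outer fold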

R/modelizer_old.R

@ -0,0 +1,301 @@
#' Generate a classification model
#'
#' Generate a nested cross validated classification model based on a dfm with class labels as docvars
#' Currently only supports Naïve Bayes using quanteda's textmodel_nb
#' Hyperparameter optimization is enabled through the grid parameter
#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command
#' For Naïve Bayes, the following parameters can be used:
#' - percentiles (cutoff point for tf-idf feature selection)
#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds)
#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)
#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)
#' @param cores_feats Number of cores to use for feature selection (multiplies with cores_outer, cores_grid and cores_inner)
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param outer_k Number of outer cross-validation folds (for performance estimation)
#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
#' @export
#' @examples
#' modelizer_old(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
#################################################################################################
#################################### Function to generate classification models #################
#################################################################################################
modelizer_old <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
  ### Functions ###
  feat_select <- function (topic, dfm, class_type, percentile, measure) {
    keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
      na.omit()
    keyness[,2] <- abs(keyness[,2])
    keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]), percentile))$feature
    return(keyness)
  }
  ### Generate inner folds for nested cv
  inner_loop <- function(fold, dfm, inner_k, class_type) {
    # RNG needs to be set explicitly for each fold
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    ## Either createDataPartition for simple holdout parameter optimization
    ## Or createFolds for proper inner CV for nested CV
    # if (inner_k <= 1) {
    #   inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
    # } else {
    inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k = inner_k)
    # }
    return(c(outer_fold = list(fold), inner_folds))
  }
  ### Generate outer folds for nested cv
  generate_folds <- function(outer_k, inner_k, dfm, class_type){
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    folds <- createFolds(as.factor(docvars(dfm, class_type)), k = outer_k)
    return(lapply(folds, inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type))
  }
  ### Gets called for every parameter combination, and calls classifier for every inner cv fold
  inner_cv <- function(row, grid, outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
    print(str_c('params ', row))
    params <- grid[row,]
    # For each inner fold, cross validate the specified parameters
    res <-
      bind_rows(mclapply(inner_folds,
                         classifier,
                         outer_fold = outer_fold,
                         params = params,
                         dfm = dfm,
                         class_type = class_type,
                         model = model,
                         cores_feats = cores_feats,
                         mc.cores = cores_inner
                         )
                )
    # print(res)
    # print(res[1,1])
    # print('inner_cv')
    return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))), params))
  }
  ### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
  outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
    print('outer cv')
    # If fold contains both inner folds and outer fold
    if (length(fold) == inner_k + 1) {
      inner_folds <- fold[-1]
      outer_fold <- fold$outer_fold
      # For each row in grid, cross-validate results
      res <- bind_rows(mclapply(seq(1, length(grid[[1]]), 1),
                                inner_cv,
                                cores_feats = cores_feats,
                                grid = grid,
                                dfm = dfm,
                                class_type = class_type,
                                model = model,
                                outer_fold = outer_fold,
                                inner_folds = inner_folds,
                                cores_inner = cores_inner,
                                mc.cores = cores_grid)
                       )
      # print(res)
      # print(res[1,1])
      # print('outer_cv')
      # Determine optimum hyperparameters within outer fold training set
      optimum <- res[which.max(res[,opt_measure]),] %>%
        select(percentiles: ncol(.))
      # Validate performance of optimum hyperparameters on outer fold test set
      return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats))
    } else {
      # If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters
      inner_folds <- fold
      outer_fold <- NULL
      res <- bind_rows(mclapply(seq(1, length(grid[[1]]), 1),
                                inner_cv,
                                cores_feats = cores_feats,
                                grid = grid,
                                dfm = dfm,
                                class_type = class_type,
                                model = model,
                                outer_fold = outer_fold,
                                inner_folds = inner_folds,
                                cores_inner = cores_inner,
                                mc.cores = cores_grid)
                       )
      # print(res)
      # print(res[1,1])
      # print('line 126, final model parameter optimization')
      return(res)
    }
  }
  # ### Custom tfidf function to allow same idf for different dfm's
  # custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
  #   if (!nfeat(x) || !ndoc(x)) return(x)
  #   tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
  #   if (nfeat(x) != length(dfreq))
  #     stop("missing some values in idf calculation")
  #   # get the document indexes
  #   j <- as(tfreq, "dgTMatrix")@j + 1
  #   # replace just the non-zero values by product with idf
  #   x@x <- tfreq@x * dfreq[j]
  #   # record attributes
  #
  #   ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2
  #
  #   # x@weightTf <- tfreq@weightTf
  #   # x@weightDf <- c(list(scheme = scheme_df, base = base), args)
  #   return(x)
  # }
  ### Classification function
  classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
    # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
    if (length(inner_fold) > 0 && length(outer_fold) > 0) {
      dfm_train <- dfm[-outer_fold,] %>%
        .[-inner_fold,]
      dfm_test <- dfm[-outer_fold,] %>%
        .[inner_fold,]
    # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
    } else if (length(outer_fold) > 0) {
      dfm_train <- dfm[-outer_fold,]
      dfm_test <- dfm[outer_fold,]
      validation_cv <- T
    # If only inner folds, validate performance directly on inner folds (is the same as above?)
    } else if (length(inner_fold) > 0) {
      dfm_train <- dfm[-inner_fold,]
      dfm_test <- dfm[inner_fold,]
    # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
    } else {
      final <- T ### Indicate final modeling run on whole dataset
      dfm_train <- dfm
    }
    if (model == 'nb') {
      scheme_tf <- 'count' # The 'old' way
    } else {
      scheme_tf <- 'prop' # The 'new' way
    }
    ### Getting features from training dataset
    # Getting idf from training data, and using it to normalize both training and testing feature occurrence
    dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    # Added unique to filter out duplicate words, these are caused when there are multiple categories, and a word scores higher
    # than the threshold on two or more of those categories
    words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
                                    feat_select,
                                    dfm = dfm_train,
                                    class_type = class_type,
                                    percentile = params$percentiles,
                                    measure = params$measures,
                                    mc.cores = cores_feats
                                    )))
    # dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
    dfm_train <- dfm_keep(dfm_train, words, valuetype = "fixed", verbose = T)
    if (model == "nb") {
      text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
    }
    if (model == "svm") {
      text_model <- svm(x = dfm_train, y = as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
    }
    if (model == 'nnet') {
      idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
      text_model <- nnet(dfm_train, idC, decay = params$decay, size = params$size, maxit = params$maxit, softmax = T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
    }
    # if (model == 'neuralnet') {
    #   dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
    #   dfm_test <- custom_tfidf(dfm_test, scheme_tf = "prop", scheme_df = "inverse", base = 10, dfreq = dfreq[words])
    #
    #   idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
    #   colnames(idC) <- NULL
    #   nn_train <- cbind(idC, dfm_train)
    #   n <- colnames(nn_train)[3:length(colnames(nn_train))]
    #   f <- as.formula(paste0("feat1 + feat2 ~ `", paste0(n, collapse = "` + `"), "`"))
    #   idC_out <- class.ind(as.factor(docvars(dfm_test, class_type)))
    #   colnames(idC_out) <- NULL
    #   nn_test <- cbind(idC_out, dfm_test)
    #   nn <- neuralnet(f, data = nn_train, hidden = 3, linear.output = F, act.fct = 'logistic', lifesign = 'full', threshold = .005)
    #   pr.nn <- compute(nn, nn_test[,3:length(colnames(nn_test))])
    #   class_table <- table(prediction = as.matrix(round(pr.nn$net.result[,2])), trueValues = as.matrix(idC_out[,2]))
    #   nn_conf_mat <- confusionMatrix(class_table, mode = "everything")
    # }
    ### Add more if statements for different models
    if (exists("final")) {
      return(list(text_model = text_model, idf = idf))
    } else {
      ### Removing all features not in training set from test set and weighting the remaining features according to training idf
      dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
      dfm_test <- dfm_weight(dfm_test, weights = idf)
      pred <- predict(text_model, newdata = dfm_test, type = 'class')
      ### Fix for single-class 'predictions' in borderline situations
      # if (length(unique(pred)) == 1 & class_type == 'junk') {
      #   if (unique(pred) == '0') {
      #     pred[1] <- '1'
      #   } else {
      #     pred[1] <- '0'
      #   }
      # }
      ### Fix for missing classes in multiclass classification
      u <- union(pred, docvars(dfm_test, class_type))
      class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u))
      if (length(unique(u)) == 2) {
        conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
      } else {
        conf_mat <- confusionMatrix(class_table, mode = "everything")
        conf_mat$positive <- NA
      }
      if (exists("validation_cv")) {
        return(data.frame(
          tv = docvars(dfm_test, class_type),
          pred = pred,
          params = params,
          pos_cat = conf_mat$positive,
          stringsAsFactors = F
        ))
      }
      if (is.matrix(conf_mat$byClass) == T) {
        return(cbind(as.data.frame(t(conf_mat$overall)), as.data.frame(t(colMeans(conf_mat$byClass))), params))
      } else {
        return(cbind(as.data.frame(t(conf_mat$overall)), as.data.frame(t(conf_mat$byClass)), params, pos_cat = conf_mat$positive))
      }
    }
  }
  ### If outer_k is smaller than 1, do a holdout training run, with only cross-validation for parameter optimization, else do nested CV
  ### If holdout, training/test distribution is the same as for inner CV
  if (outer_k < 1) {
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    outer_fold <- createDataPartition(as.factor(docvars(dfm, class_type)), p = outer_k)
    folds <- lapply(outer_fold, inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type)
  } else {
    ## Generate nested CV folds, based on number of inner and outer folds defined (see start of script)
    folds <- generate_folds(outer_k, inner_k = inner_k, dfm = dfm, class_type = class_type)
  }
  ## Get performance of each outer fold validation, and add row with mean scores (This is the final performance indicator)
  performance <- mclapply(folds, outer_cv, grid = grid, dfm = dfm, class_type = class_type, model = model, cores_grid = cores_grid, cores_inner = cores_inner, cores_feats = cores_feats, mc.cores = cores_outer)
  ## Set seed and generate folds for final hyperparameter optimization search (using CV)
  set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
  folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k = inner_k))
  ## Get the final hyperparameter performance for all value combinations in grid
  params_final <- bind_rows(mclapply(folds_final, outer_cv, grid = grid, dfm = dfm, class_type = class_type, model = model, cores_grid = cores_grid, cores_inner = cores_inner, cores_feats = cores_feats, mc.cores = cores_outer))
  ## Select optimum final hyperparameters
  optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>%
    select(percentiles: ncol(.))
  ## Estimate final model on whole dataset, using optimum final hyperparameters determined above
  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats, cores_grid, cores_inner, cores_outer)))
  rm(list = setdiff(ls(), c("model_final", "optimum_final", "params_final", "performance", "grid", "folds", "folds_final", "country", "model", "class_type", "opt_measure")), envir = environment())
  save(list = ls(all.names = TRUE), file = paste0(getwd(), '/', country, '_', model, '_', class_type, '_', opt_measure, '_', Sys.time(), '.RData'), envir = environment())
  return(paste0(getwd(), '/', country, '_', model, '_', class_type, '_', opt_measure, '_', Sys.time(), '.RData'))
}

R/out_parser.R

@ -1,11 +1,12 @@
#' Parse raw text into a single field
#'
#' Parse raw text from the MaML database into a single field
#' @param out The original output data frame
#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
#' @return a parsed output data frame including the additional column 'merged', containing the merged text
#' @export
#' @examples
#' out_parser(out,field)

man/class_update.Rd

@ -4,9 +4,9 @@
\alias{class_update}
\title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information}
\usage{
class_update(out, localhost = T, model_final, varname, text, words,
  clean, ver, es_super = .rs.askForPassword("ElasticSearch WRITE"),
  cores = 1)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
@ -15,8 +15,6 @@ class_update(out, localhost = T, model_final, dfm_words, varname, text,
\item{model_final}{The classification model (e.g. output from textstat_nb(), svm() or others)}
\item{varname}{String containing the variable name to use for the classification result, usually has the format computerCodes.varname}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags}
@ -28,6 +26,8 @@ class_update(out, localhost = T, model_final, dfm_words, varname, text,
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
\item{es_super}{Password for write access to ElasticSearch}
\item{dfm_words}{A dfm containing all the words and only the words used to generate the model (is used for subsetting)}
}
\value{
As this is a nested function used within elasticizer, there is no return output

man/cv_generator.Rd

@ -0,0 +1,30 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cv_generator.R
\name{cv_generator}
\alias{cv_generator}
\title{Generate CV folds for nested cross-validation}
\usage{
cv_generator(outer_k, inner_k, dfm, class_type, grid, seed)
}
\arguments{
\item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1, holdout sampling is used, with outer_k being the proportion of the data used as test set}
\item{inner_k}{Number of inner CV (parameter optimization) folds}
\item{dfm}{DFM containing the labeled documents}
\item{class_type}{Name of the column in docvars containing the classification}
\item{grid}{Parameter grid for optimization}
\item{seed}{Integer used as seed for random number generation}
}
\value{
A nested set of lists with row numbers
}
\description{
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
}
\examples{
cv_generator(outer_k, inner_k, dfm, class_type, grid, seed)
}

man/dfm_gen.Rd

@ -4,8 +4,8 @@
\alias{dfm_gen}
\title{Generates dfm from ElasticSearch output}
\usage{
dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1,
  tolower = T)
}
\arguments{
\item{out}{The elasticizer-generated data frame}
@ -17,6 +17,8 @@ dfm_gen(out, words = "999", text = "lemmas", clean,
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)}
\item{tolower}{Boolean indicating whether dfm features should be lowercased}
}
\value{
A Quanteda dfm

@ -0,0 +1,30 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimator.R
\name{estimator}
\alias{estimator}
\title{Generate models and get classifications on test sets}
\usage{
estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
}
\arguments{
\item{row}{Row number of current item in grid}
\item{grid}{Grid with model parameters and CV folds}
\item{outer_folds}{List with row numbers for outer folds}
\item{dfm}{DFM containing labeled documents}
\item{class_type}{Name of column in docvars() containing the classes}
\item{model}{Model to use (currently only nb)}
}
\value{
Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.
}
\description{
Estimates a model for a single grid row (a combination of outer fold, inner fold and parameter values) and generates classifications for the corresponding test set
}
\examples{
estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model)
}
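A sketch of how estimator could be mapped over the grid (assuming folds generated by cv_generator as above; the element names folds$outer_folds and folds$inner_folds are assumptions, and plain lapply stands in for the parallel future_lapply mentioned under modelizer):

# One model per grid row; each row pairs a parameter set with an outer/inner fold
results <- lapply(seq_len(nrow(folds$grid)), estimator,
                  grid = folds$grid,
                  outer_folds = folds$outer_folds,
                  inner_folds = folds$inner_folds,
                  dfm = dfm, class_type = "junk", model = "nb")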

@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/feat_select.R
\name{feat_select}
\alias{feat_select}
\title{Select features using quanteda textstat_keyness}
\usage{
feat_select(topic, dfm, class_type, percentile, measure = "chi2")
}
\arguments{
\item{topic}{The topic to determine keywords for}
\item{dfm}{The input dfm}
\item{class_type}{Name of the column in docvars containing the classification}
\item{percentile}{Cutoff for the list of words that should be returned}
\item{measure}{Measure to use in determining keyness, default = chi2; see textstat_keyness for other options}
}
\value{
A vector of words that are key to the topic
}
\description{
Select features based on the textstat_keyness function and a percentile cutoff
Percentiles are based on absolute values, i.e. both on words that are key and on words that are *not* key to the topic
}
\examples{
feat_select(topic, dfm, class_type, percentile, measure="chi2")
}
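An illustrative call (assuming the topic is one of the class labels; the percentile value is arbitrary):

# Keep words whose absolute chi2 keyness for 'junk' lies above the 90th percentile
keywords <- feat_select(topic = "junk", dfm = dfm, class_type = "junk",
                        percentile = 0.90, measure = "chi2")
dfm_sub <- dfm_select(dfm, pattern = keywords, selection = "keep") # subset the dfm to the selected features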

@ -4,7 +4,8 @@
\alias{lemma_writer} \alias{lemma_writer}
\title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings} \title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
\usage{ \usage{
lemma_writer(out, file, localhost = F, documents = F, cores = 1) lemma_writer(out, file, localhost = F, documents = F, lemma = F,
cores = 1)
} }
\arguments{ \arguments{
\item{out}{The elasticizer-generated data frame} \item{out}{The elasticizer-generated data frame}
@ -15,6 +16,8 @@ lemma_writer(out, file, localhost = F, documents = F, cores = 1)
\item{documents}{Indicate whether the writer should output to a single file, or individual documents} \item{documents}{Indicate whether the writer should output to a single file, or individual documents}
\item{lemma}{Indicate whether the document output should contain lemmas or the original text}
\item{cores}{Indicate the number of cores to use for parallel processing} \item{cores}{Indicate the number of cores to use for parallel processing}
} }
\value{ \value{

@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/metric_gen.R
\name{metric_gen}
\alias{metric_gen}
\title{Generate performance statistics for models}
\usage{
metric_gen(x)
}
\arguments{
\item{x}{A data frame containing at least the columns "pred" and "tv"}
}
\value{
x, with additional columns for performance metrics
}
\description{
Generate performance statistics for models, based on their predictions and the true values
}
\examples{
metric_gen(x)
}
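A minimal sketch with toy data (only the documented columns 'pred' and 'tv' are assumed; which metric columns get appended is determined by metric_gen itself):

# Two-class toy example of predicted vs. true values
x <- data.frame(pred = c("junk", "junk", "other", "other"),
                tv = c("junk", "other", "other", "junk"))
metrics <- metric_gen(x) # x with performance-metric columns appended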

@ -4,28 +4,16 @@
\alias{modelizer} \alias{modelizer}
\title{Generate a classification model} \title{Generate a classification model}
\usage{ \usage{
modelizer(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid,
outer_k, inner_k, model, class_type, opt_measure, country, grid) seed, model, cores = 1)
} }
\arguments{ \arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars} \item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}
\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)}
\item{cores_inner}{Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)}
\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)}
\item{seed}{Integer to use as seed for random number generation, ensures replicability}
\item{outer_k}{Number of outer cross-validation folds (for performance estimation)} \item{outer_k}{Number of outer cross-validation folds (for performance estimation)}
\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)} \item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")} \item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}
\item{opt_measure}{Label of measure in confusion matrix to use as performance indicator} \item{opt_measure}{Label of measure in confusion matrix to use as performance indicator}
@ -33,9 +21,15 @@ modelizer(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)} \item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)}
\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)} \item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
\item{seed}{Integer to use as seed for random number generation, ensures replicability}
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
\item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1}
} }
\value{ \value{
An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation An .Rds file in the current working directory (getwd()) containing a list with the final model, performance estimates and the parameters used for grid search and cross-validation
} }
\description{ \description{
Generate a nested cross validated classification model based on a dfm with class labels as docvars Generate a nested cross validated classification model based on a dfm with class labels as docvars
@ -47,5 +41,5 @@ For Naïve Bayes, the following parameters can be used:
- measures (what measure to use for determining feature importance, see textstat_keyness for options) - measures (what measure to use for determining feature importance, see textstat_keyness for options)
} }
\examples{ \examples{
modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid) modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1)
} }
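Putting it together, a hedged end-to-end call under the new signature (country code, opt_measure label and grid values are illustrative; "Balanced Accuracy" is one of the per-class labels produced by caret's confusionMatrix):

library(tidyr) # crossing() builds the parameter grid
grid <- crossing(percentiles = c(0.90, 0.95), measures = c("chi2"))
modelizer(dfm, outer_k = 3, inner_k = 5, class_type = "junk",
          opt_measure = "Balanced Accuracy", country = "nl",
          grid = grid, seed = 42, model = "nb", cores = 1)
# Writes an .Rds file with the model and performance output to getwd()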

@ -0,0 +1,51 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/modelizer_old.R
\name{modelizer_old}
\alias{modelizer_old}
\title{Generate a classification model}
\usage{
modelizer_old(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
outer_k, inner_k, model, class_type, opt_measure, country, grid)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}
\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)}
\item{cores_inner}{Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)}
\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)}
\item{seed}{Integer to use as seed for random number generation, ensures replicability}
\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}
\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}
\item{opt_measure}{Label of measure in confusion matrix to use as performance indicator}
\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)}
\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
}
\value{
An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
}
\description{
Generate a nested cross validated classification model based on a dfm with class labels as docvars
Currently only supports Naïve Bayes using quanteda's textmodel_nb
Hyperparameter optimization is enabled through the grid parameter
A grid should be generated from vectors with the labels as described for each model, using the crossing() command
For Naïve Bayes, the following parameters can be used:
- percentiles (cutoff point for tf-idf feature selection)
- measures (what measure to use for determining feature importance, see textstat_keyness for options)
}
\examples{
modelizer_old(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
}

@ -19,7 +19,7 @@ out_parser(out, field, clean = F, cores = 1)
a parsed output data frame including the additional column 'merged', containing the merged text a parsed output data frame including the additional column 'merged', containing the merged text
} }
\description{ \description{
Parse raw text into a single field Parse raw text from the MaML database into a single field
} }
\examples{ \examples{
out_parser(out, field) out_parser(out, field)
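For example (the field value is illustrative; see the package source for the supported field names):

# Merge the raw text of each document into a single 'merged' column
parsed <- out_parser(out, field = "_source", clean = FALSE, cores = 1)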
