From 5de4e1488c8a1f73320051e1e4e011c560635f17 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 22 Jun 2020 15:07:46 +0200 Subject: [PATCH] estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R --- R/estimator.R | 44 +++++++++++++++++++++++++++++++------------- R/modelizer.R | 22 ++++++++++------------ R/preproc.R | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 72 insertions(+), 27 deletions(-) diff --git a/R/estimator.R b/R/estimator.R index 63ef19d..d5d8344 100644 --- a/R/estimator.R +++ b/R/estimator.R @@ -8,6 +8,7 @@ #' @param dfm DFM containing labeled documents #' @param class_type Name of column in docvars() containing the classes #' @param model Model to use (currently only nb) +#' @param we_vectors Matrix with word embedding vectors #' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values. #' @export #' @examples @@ -17,7 +18,7 @@ ################################################################################################# ### Classification function -estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model) { +estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) { # Get parameters for current iteration params <- grid[row,] @@ -41,26 +42,43 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod dfm_train <- dfm } - if (exists("final")) { - preproc_dfm <- preproc(dfm_train, NULL, params) - dfm_train <- preproc_dfm$dfm_train + dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) + if (params$tfidf) { + idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0) + dfm_train <- dfm_weight(dfm_train, weights = idf) + if (!is.null(dfm_test)) { + dfm_test <- dfm_weight(dfm_test, weights = idf) + } } else { - preproc_dfm <- preproc(dfm_train, dfm_test, params) - dfm_train <- preproc_dfm$dfm_train - dfm_test <- preproc_dfm$dfm_test + idf <- NULL + } + + if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) { + + # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select) + words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)), + feat_select, + dfm = dfm_train, + class_type = params$class_type, + percentile = params$feat_percentiles, + measure = params$feat_measures + ))) + dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F) + if (!is.null(dfm_test)) { + dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F) + } } - idf <- preproc_dfm$idf if (model == "nb") { text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial") } if (model == "svm") { - text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon) - } - if (model == 'nnet') { - idC <- class.ind(as.factor(docvars(dfm_train, class_type))) - text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2) + text_model <- svm(x=as.matrix(train_data), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon) } + # if (model == 'nnet') { + # idC <- class.ind(as.factor(docvars(dfm_train, class_type))) + # text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2) + # } ### Add more if statements for different models # If training on whole dataset, return final model, and idf values from dataset diff --git a/R/modelizer.R b/R/modelizer.R index f3c0824..dec7b85 100644 --- a/R/modelizer.R +++ b/R/modelizer.R @@ -16,8 +16,9 @@ #' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search) #' @param seed Integer to use as seed for random number generation, ensures replicability #' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb) +#' @param we_vectors Matrix with word embedding vectors #' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1 -#' @return An .Rds file in the current working directory (getwd()) with a list containing all relevant output +#' @return A list containing all relevant output #' @export #' @examples #' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) @@ -25,9 +26,9 @@ #################################### Function to generate classification models ################# ################################################################################################# -modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) { +modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, we_vectors, cores = 1) { ## Generate list containing outer folds row numbers, inner folds row numbers, and grid for model building - folds <- cv_generator(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed) + folds <- cv_generator(outer_k,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed) inner_grid <- folds$grid outer_folds <- folds$outer_folds inner_folds <- folds$inner_folds @@ -42,6 +43,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g inner_folds = inner_folds, dfm = dfm, class_type = class_type, + we_vectors = we_vectors, model = model) %>% future_lapply(.,metric_gen) %>% # Generate model performance metrics for each grid row bind_rows(.) @@ -66,12 +68,13 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g inner_folds = NULL, dfm = dfm, class_type = class_type, + we_vectors = we_vectors, model = model) %>% future_lapply(., metric_gen) %>% # Generate performance metrics for each row in outer_grid bind_rows(.) # Create (inner) folds for parameter optimization on the entire dataset - final_folds <- cv_generator(NULL,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed) + final_folds <- cv_generator(NULL,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed) final_grid <- final_folds$grid final_inner <- final_folds$inner_folds @@ -82,6 +85,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g inner_folds = final_inner, dfm = dfm, class_type = class_type, + we_vectors = we_vectors, model = model) %>% future_lapply(.,metric_gen) %>% # Generate performance metrics for each row in final_grid bind_rows(.) @@ -118,12 +122,6 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g country = country, class_type = class_type) - # Set name for output .Rds file - filename <- paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.Rds') - - # Save RDS file with output - saveRDS(output, file = filename) - - # Return ouput RDS filename - return(filename) + # Return ouput + return(output) } diff --git a/R/preproc.R b/R/preproc.R index c497bff..82c9588 100644 --- a/R/preproc.R +++ b/R/preproc.R @@ -5,6 +5,7 @@ #' @param dfm_train Training dfm #' @param dfm_test Testing dfm if applicable, otherwise NULL #' @param params Row from grid with parameter optimization +#' @param we_vectors Matrix with word embedding vectors #' @return List with dfm_train and dfm_test, processed according to parameters in params #' @export #' @examples @@ -12,7 +13,11 @@ ################################################################################################# #################################### Preprocess data ############################################ ################################################################################################# -preproc <- function(dfm_train, dfm_test = NULL, params) { + + +### CURRENTLY UNUSED!!!### + +preproc <- function(dfm_train, dfm_test = NULL, params, we_vectors) { # Remove non-existing features from training dfm dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) if (params$tfidf) { @@ -25,7 +30,7 @@ preproc <- function(dfm_train, dfm_test = NULL, params) { idf <- NULL } - if ("feat_percentiles" %in% colnames(params) && "feat_measures" %in% colnames(params)) { + if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) { # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select) words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)), @@ -38,5 +43,29 @@ preproc <- function(dfm_train, dfm_test = NULL, params) { dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F) } + if (!is.null(we_vectors)) { + shared_dict <- sort(intersect(dfm_train@Dimnames$features,we_vectors$V1)) + if (!is.null(dfm_test)) { + shared_dict <- sort(intersect(dfm_test@Dimnames$features,shared_dict)) + dfm_test <- dfm_keep(dfm_test, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>% + .[, sort(colnames(.))] + } + dfm_train <- dfm_keep(dfm_train, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>% + .[, sort(colnames(.))] + we_matrix <- filter(we_vectors, V1 %in% shared_dict) %>% + arrange(V1) %>% + as.data.table(.) %>% + .[,2:ncol(.), with = F] %>% + as.matrix(.) + + dfm_train_we_sum <- dfm_train %*% we_matrix + # dfm_train_we_mean <- dfm_train_we_sum / as.vector(rowSums(dfm_train)) + + if (!is.null(dfm_test)) { + dfm_test_we_sum <- dfm_test %*% we_matrix + # dfm_test_we_mean <- dfm_test_we_sum / as.vector(rowSums(dfm_test)) + } + return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf, dfm_train_we = dfm_train_we_sum, dfm_test_we = dfm_test_we_sum)) + } return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf)) }