estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R

master
Your Name 4 years ago
parent 77eb51a1bf
commit 5de4e1488c

@ -8,6 +8,7 @@
#' @param dfm DFM containing labeled documents #' @param dfm DFM containing labeled documents
#' @param class_type Name of column in docvars() containing the classes #' @param class_type Name of column in docvars() containing the classes
#' @param model Model to use (currently only nb) #' @param model Model to use (currently only nb)
#' @param we_vectors Matrix with word embedding vectors
#' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values. #' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.
#' @export #' @export
#' @examples #' @examples
@ -17,7 +18,7 @@
################################################################################################# #################################################################################################
### Classification function ### Classification function
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model) { estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) {
# Get parameters for current iteration # Get parameters for current iteration
params <- grid[row,] params <- grid[row,]
@ -41,26 +42,43 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
dfm_train <- dfm dfm_train <- dfm
} }
if (exists("final")) { dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
preproc_dfm <- preproc(dfm_train, NULL, params) if (params$tfidf) {
dfm_train <- preproc_dfm$dfm_train idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
if (!is.null(dfm_test)) {
dfm_test <- dfm_weight(dfm_test, weights = idf)
}
} else { } else {
preproc_dfm <- preproc(dfm_train, dfm_test, params) idf <- NULL
dfm_train <- preproc_dfm$dfm_train }
dfm_test <- preproc_dfm$dfm_test
if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
feat_select,
dfm = dfm_train,
class_type = params$class_type,
percentile = params$feat_percentiles,
measure = params$feat_measures
)))
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
if (!is.null(dfm_test)) {
dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F)
}
} }
idf <- preproc_dfm$idf
if (model == "nb") { if (model == "nb") {
text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial") text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
} }
if (model == "svm") { if (model == "svm") {
text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon) text_model <- svm(x=as.matrix(train_data), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
}
if (model == 'nnet') {
idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
} }
# if (model == 'nnet') {
# idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
# text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
# }
### Add more if statements for different models ### Add more if statements for different models
# If training on whole dataset, return final model, and idf values from dataset # If training on whole dataset, return final model, and idf values from dataset

@ -16,8 +16,9 @@
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search) #' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @param seed Integer to use as seed for random number generation, ensures replicability #' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb) #' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param we_vectors Matrix with word embedding vectors
#' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1 #' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1
#' @return An .Rds file in the current working directory (getwd()) with a list containing all relevant output #' @return A list containing all relevant output
#' @export #' @export
#' @examples #' @examples
#' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) #' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1)
@ -25,9 +26,9 @@
#################################### Function to generate classification models ################# #################################### Function to generate classification models #################
################################################################################################# #################################################################################################
modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) { modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, we_vectors, cores = 1) {
## Generate list containing outer folds row numbers, inner folds row numbers, and grid for model building ## Generate list containing outer folds row numbers, inner folds row numbers, and grid for model building
folds <- cv_generator(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed) folds <- cv_generator(outer_k,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed)
inner_grid <- folds$grid inner_grid <- folds$grid
outer_folds <- folds$outer_folds outer_folds <- folds$outer_folds
inner_folds <- folds$inner_folds inner_folds <- folds$inner_folds
@ -42,6 +43,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
inner_folds = inner_folds, inner_folds = inner_folds,
dfm = dfm, dfm = dfm,
class_type = class_type, class_type = class_type,
we_vectors = we_vectors,
model = model) %>% model = model) %>%
future_lapply(.,metric_gen) %>% # Generate model performance metrics for each grid row future_lapply(.,metric_gen) %>% # Generate model performance metrics for each grid row
bind_rows(.) bind_rows(.)
@ -66,12 +68,13 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
inner_folds = NULL, inner_folds = NULL,
dfm = dfm, dfm = dfm,
class_type = class_type, class_type = class_type,
we_vectors = we_vectors,
model = model) %>% model = model) %>%
future_lapply(., metric_gen) %>% # Generate performance metrics for each row in outer_grid future_lapply(., metric_gen) %>% # Generate performance metrics for each row in outer_grid
bind_rows(.) bind_rows(.)
# Create (inner) folds for parameter optimization on the entire dataset # Create (inner) folds for parameter optimization on the entire dataset
final_folds <- cv_generator(NULL,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed) final_folds <- cv_generator(NULL,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed)
final_grid <- final_folds$grid final_grid <- final_folds$grid
final_inner <- final_folds$inner_folds final_inner <- final_folds$inner_folds
@ -82,6 +85,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
inner_folds = final_inner, inner_folds = final_inner,
dfm = dfm, dfm = dfm,
class_type = class_type, class_type = class_type,
we_vectors = we_vectors,
model = model) %>% model = model) %>%
future_lapply(.,metric_gen) %>% # Generate performance metrics for each row in final_grid future_lapply(.,metric_gen) %>% # Generate performance metrics for each row in final_grid
bind_rows(.) bind_rows(.)
@ -118,12 +122,6 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
country = country, country = country,
class_type = class_type) class_type = class_type)
# Set name for output .Rds file # Return ouput
filename <- paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.Rds') return(output)
# Save RDS file with output
saveRDS(output, file = filename)
# Return ouput RDS filename
return(filename)
} }

@ -5,6 +5,7 @@
#' @param dfm_train Training dfm #' @param dfm_train Training dfm
#' @param dfm_test Testing dfm if applicable, otherwise NULL #' @param dfm_test Testing dfm if applicable, otherwise NULL
#' @param params Row from grid with parameter optimization #' @param params Row from grid with parameter optimization
#' @param we_vectors Matrix with word embedding vectors
#' @return List with dfm_train and dfm_test, processed according to parameters in params #' @return List with dfm_train and dfm_test, processed according to parameters in params
#' @export #' @export
#' @examples #' @examples
@ -12,7 +13,11 @@
################################################################################################# #################################################################################################
#################################### Preprocess data ############################################ #################################### Preprocess data ############################################
################################################################################################# #################################################################################################
preproc <- function(dfm_train, dfm_test = NULL, params) {
### CURRENTLY UNUSED!!!###
preproc <- function(dfm_train, dfm_test = NULL, params, we_vectors) {
# Remove non-existing features from training dfm # Remove non-existing features from training dfm
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
if (params$tfidf) { if (params$tfidf) {
@ -25,7 +30,7 @@ preproc <- function(dfm_train, dfm_test = NULL, params) {
idf <- NULL idf <- NULL
} }
if ("feat_percentiles" %in% colnames(params) && "feat_measures" %in% colnames(params)) { if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select) # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)), words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
@ -38,5 +43,29 @@ preproc <- function(dfm_train, dfm_test = NULL, params) {
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F) dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
} }
if (!is.null(we_vectors)) {
shared_dict <- sort(intersect(dfm_train@Dimnames$features,we_vectors$V1))
if (!is.null(dfm_test)) {
shared_dict <- sort(intersect(dfm_test@Dimnames$features,shared_dict))
dfm_test <- dfm_keep(dfm_test, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
.[, sort(colnames(.))]
}
dfm_train <- dfm_keep(dfm_train, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
.[, sort(colnames(.))]
we_matrix <- filter(we_vectors, V1 %in% shared_dict) %>%
arrange(V1) %>%
as.data.table(.) %>%
.[,2:ncol(.), with = F] %>%
as.matrix(.)
dfm_train_we_sum <- dfm_train %*% we_matrix
# dfm_train_we_mean <- dfm_train_we_sum / as.vector(rowSums(dfm_train))
if (!is.null(dfm_test)) {
dfm_test_we_sum <- dfm_test %*% we_matrix
# dfm_test_we_mean <- dfm_test_we_sum / as.vector(rowSums(dfm_test))
}
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf, dfm_train_we = dfm_train_we_sum, dfm_test_we = dfm_test_we_sum))
}
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf)) return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf))
} }

Loading…
Cancel
Save