estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R

master
Your Name 5 years ago
parent 77eb51a1bf
commit 5de4e1488c

@ -8,6 +8,7 @@
#' @param dfm DFM containing labeled documents
#' @param class_type Name of column in docvars() containing the classes
#' @param model Model to use (currently only nb)
#' @param we_vectors Matrix with word embedding vectors
#' @return Dependent on mode, if folds are included, returns true and predicted classes of test set, with parameters, model and model idf. When no folds, returns final model and idf values.
#' @export
#' @examples
@ -17,7 +18,7 @@
#################################################################################################
### Classification function
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model) {
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) {
# Get parameters for current iteration
params <- grid[row,]
@ -41,26 +42,43 @@ estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, mod
dfm_train <- dfm
}
if (exists("final")) {
preproc_dfm <- preproc(dfm_train, NULL, params)
dfm_train <- preproc_dfm$dfm_train
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
if (params$tfidf) {
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
if (!is.null(dfm_test)) {
dfm_test <- dfm_weight(dfm_test, weights = idf)
}
} else {
preproc_dfm <- preproc(dfm_train, dfm_test, params)
dfm_train <- preproc_dfm$dfm_train
dfm_test <- preproc_dfm$dfm_test
idf <- NULL
}
if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
feat_select,
dfm = dfm_train,
class_type = params$class_type,
percentile = params$feat_percentiles,
measure = params$feat_measures
)))
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
if (!is.null(dfm_test)) {
dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F)
}
}
idf <- preproc_dfm$idf
if (model == "nb") {
text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
}
if (model == "svm") {
text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
}
if (model == 'nnet') {
idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
text_model <- svm(x=as.matrix(train_data), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
}
# if (model == 'nnet') {
# idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
# text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
# }
### Add more if statements for different models
# If training on whole dataset, return final model, and idf values from dataset

@ -16,8 +16,9 @@
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param we_vectors Matrix with word embedding vectors
#' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1
#' @return An .Rds file in the current working directory (getwd()) with a list containing all relevant output
#' @return A list containing all relevant output
#' @export
#' @examples
#' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1)
@ -25,9 +26,9 @@
#################################### Function to generate classification models #################
#################################################################################################
modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1) {
modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, we_vectors, cores = 1) {
## Generate list containing outer folds row numbers, inner folds row numbers, and grid for model building
folds <- cv_generator(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)
folds <- cv_generator(outer_k,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed)
inner_grid <- folds$grid
outer_folds <- folds$outer_folds
inner_folds <- folds$inner_folds
@ -42,6 +43,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
inner_folds = inner_folds,
dfm = dfm,
class_type = class_type,
we_vectors = we_vectors,
model = model) %>%
future_lapply(.,metric_gen) %>% # Generate model performance metrics for each grid row
bind_rows(.)
@ -66,12 +68,13 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
inner_folds = NULL,
dfm = dfm,
class_type = class_type,
we_vectors = we_vectors,
model = model) %>%
future_lapply(., metric_gen) %>% # Generate performance metrics for each row in outer_grid
bind_rows(.)
# Create (inner) folds for parameter optimization on the entire dataset
final_folds <- cv_generator(NULL,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)
final_folds <- cv_generator(NULL,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed)
final_grid <- final_folds$grid
final_inner <- final_folds$inner_folds
@ -82,6 +85,7 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
inner_folds = final_inner,
dfm = dfm,
class_type = class_type,
we_vectors = we_vectors,
model = model) %>%
future_lapply(.,metric_gen) %>% # Generate performance metrics for each row in final_grid
bind_rows(.)
@ -118,12 +122,6 @@ modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, g
country = country,
class_type = class_type)
# Set name for output .Rds file
filename <- paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.Rds')
# Save RDS file with output
saveRDS(output, file = filename)
# Return ouput RDS filename
return(filename)
# Return ouput
return(output)
}

@ -5,6 +5,7 @@
#' @param dfm_train Training dfm
#' @param dfm_test Testing dfm if applicable, otherwise NULL
#' @param params Row from grid with parameter optimization
#' @param we_vectors Matrix with word embedding vectors
#' @return List with dfm_train and dfm_test, processed according to parameters in params
#' @export
#' @examples
@ -12,7 +13,11 @@
#################################################################################################
#################################### Preprocess data ############################################
#################################################################################################
preproc <- function(dfm_train, dfm_test = NULL, params) {
### CURRENTLY UNUSED!!!###
preproc <- function(dfm_train, dfm_test = NULL, params, we_vectors) {
# Remove non-existing features from training dfm
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
if (params$tfidf) {
@ -25,7 +30,7 @@ preproc <- function(dfm_train, dfm_test = NULL, params) {
idf <- NULL
}
if ("feat_percentiles" %in% colnames(params) && "feat_measures" %in% colnames(params)) {
if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
@ -38,5 +43,29 @@ preproc <- function(dfm_train, dfm_test = NULL, params) {
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
}
if (!is.null(we_vectors)) {
shared_dict <- sort(intersect(dfm_train@Dimnames$features,we_vectors$V1))
if (!is.null(dfm_test)) {
shared_dict <- sort(intersect(dfm_test@Dimnames$features,shared_dict))
dfm_test <- dfm_keep(dfm_test, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
.[, sort(colnames(.))]
}
dfm_train <- dfm_keep(dfm_train, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
.[, sort(colnames(.))]
we_matrix <- filter(we_vectors, V1 %in% shared_dict) %>%
arrange(V1) %>%
as.data.table(.) %>%
.[,2:ncol(.), with = F] %>%
as.matrix(.)
dfm_train_we_sum <- dfm_train %*% we_matrix
# dfm_train_we_mean <- dfm_train_we_sum / as.vector(rowSums(dfm_train))
if (!is.null(dfm_test)) {
dfm_test_we_sum <- dfm_test %*% we_matrix
# dfm_test_we_mean <- dfm_test_we_sum / as.vector(rowSums(dfm_test))
}
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf, dfm_train_we = dfm_train_we_sum, dfm_test_we = dfm_test_we_sum))
}
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf))
}

Loading…
Cancel
Save