diff --git a/R/cv_generator.R b/R/cv_generator.R index 2c8af81..8e818e2 100644 --- a/R/cv_generator.R +++ b/R/cv_generator.R @@ -4,8 +4,7 @@ #' #' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data #' @param inner_k Number of inner CV (parameter optimization) folds -#' @param dfm DFM containing the labeled documents -#' @param class_type Name of the column in docvars containing the classification +#' @param vec Vector containing the true values of the classification #' @param grid Parameter grid for optimization #' @param seed integer used as seed for random number generation #' @return A nested set of lists with row numbers @@ -15,36 +14,35 @@ ################################################################################################# #################################### Generate CV folds ########################################## ################################################################################################# -cv_generator <- function(outer_k, inner_k, dfm, class_type, grid, seed) { +cv_generator <- function(outer_k, inner_k, vec, grid, seed) { ### Generate inner folds for nested cv - inner_loop <- function(i, folds, dfm, inner_k, class_type, grid, seed) { + inner_loop <- function(i, folds, vec, inner_k, grid, seed) { # RNG needs to be set explicitly for each fold set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") - inner_folds <- createFolds(as.factor(docvars(dfm[-folds[[i]],], class_type)), k= inner_k) + inner_folds <- createFolds(as.factor(vec[-folds[[i]]]), k= inner_k) grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i]) return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i])) } ### Generate outer folds for nested cv - generate_folds <- function(outer_k, inner_k, dfm, class_type, grid, seed){ + generate_folds <- function(outer_k, inner_k, vec, grid, seed){ set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization - inner_folds <- createFolds(as.factor(docvars(dfm, class_type)), k= inner_k) + inner_folds <- createFolds(as.factor(vec), k= inner_k) grid <- crossing(grid, inner_fold = names(inner_folds)) return(list(grid = grid, inner_folds = inner_folds)) } else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k - folds <- createDataPartition(as.factor(docvars(dfm, class_type)), p=outer_k) + folds <- createDataPartition(as.factor(vec), p=outer_k) } else { # Do full nested CV - folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k) + folds <- createFolds(as.factor(vec), k= outer_k) } # Generate grid of hyperparameters for model optimization, and include inner folds row numbers grid_folds <- lapply(1:length(folds), inner_loop, folds = folds, - dfm = dfm, + vec = vec, inner_k = inner_k, - class_type = class_type, grid = grid, seed = seed) @@ -60,5 +58,5 @@ cv_generator <- function(outer_k, inner_k, dfm, class_type, grid, seed) { outer_folds = folds, inner_folds = inner_folds)) } - return(generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)) + return(generate_folds(outer_k,inner_k = inner_k, vec = vec, grid = grid, seed = seed)) } diff --git a/R/dupe_detect.R b/R/dupe_detect.R index 993e2d5..011fd4e 100644 --- a/R/dupe_detect.R +++ b/R/dupe_detect.R @@ -21,17 +21,8 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) { params <- grid[row,] print(paste0('Parsing ',params$doctypes,' on ',params$dates )) - query <- paste0('{"query": - {"bool": {"filter":[{"term":{"doctype": "',params$doctypes,'"}}, - {"range" : { - "publication_date" : { - "gte" : "',params$dates,'T00:00:00Z", - "lt" : "',params$dates+1,'T00:00:00Z" - } - }}] - - } } }') - out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost) + query <- paste0('doctype:',params$doctypes,' && publication_date:',params$dates,' && !computerCodes._delete:1') + out <- elasticizer(query_string(query, fields = c('country','text','title','subtitle','teaser','preteaser')), es_pwd = es_pwd, localhost= localhost) if (class(out$hits$hits) != 'list') { dfm <- dfm_gen(out, text = "full", words = words, clean = T) if (sum(dfm[1,]) > 0) {