dupe_detect: Changed query from json to query_string style, and added filter for already detected duplicates

cv_generator: Changed code to use a generic vector of true values to draw the conditional random sample, instead of dfm/docvars specifically
master
Your Name 5 years ago
parent e499d70671
commit 5bd36dcb44

@ -4,8 +4,7 @@
#' #'
#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data #' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data
#' @param inner_k Number of inner CV (parameter optimization) folds #' @param inner_k Number of inner CV (parameter optimization) folds
#' @param dfm DFM containing the labeled documents #' @param vec Vector containing the true values of the classification
#' @param class_type Name of the column in docvars containing the classification
#' @param grid Parameter grid for optimization #' @param grid Parameter grid for optimization
#' @param seed integer used as seed for random number generation #' @param seed integer used as seed for random number generation
#' @return A nested set of lists with row numbers #' @return A nested set of lists with row numbers
@ -15,36 +14,35 @@
################################################################################################# #################################################################################################
#################################### Generate CV folds ########################################## #################################### Generate CV folds ##########################################
################################################################################################# #################################################################################################
cv_generator <- function(outer_k, inner_k, dfm, class_type, grid, seed) { cv_generator <- function(outer_k, inner_k, vec, grid, seed) {
### Generate inner folds for nested cv ### Generate inner folds for nested cv
inner_loop <- function(i, folds, dfm, inner_k, class_type, grid, seed) { inner_loop <- function(i, folds, vec, inner_k, grid, seed) {
# RNG needs to be set explicitly for each fold # RNG needs to be set explicitly for each fold
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
inner_folds <- createFolds(as.factor(docvars(dfm[-folds[[i]],], class_type)), k= inner_k) inner_folds <- createFolds(as.factor(vec[-folds[[i]]]), k= inner_k)
grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i]) grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i])
return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i])) return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i]))
} }
### Generate outer folds for nested cv ### Generate outer folds for nested cv
generate_folds <- function(outer_k, inner_k, dfm, class_type, grid, seed){ generate_folds <- function(outer_k, inner_k, vec, grid, seed){
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion") set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization
inner_folds <- createFolds(as.factor(docvars(dfm, class_type)), k= inner_k) inner_folds <- createFolds(as.factor(vec), k= inner_k)
grid <- crossing(grid, inner_fold = names(inner_folds)) grid <- crossing(grid, inner_fold = names(inner_folds))
return(list(grid = grid, return(list(grid = grid,
inner_folds = inner_folds)) inner_folds = inner_folds))
} else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k } else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k
folds <- createDataPartition(as.factor(docvars(dfm, class_type)), p=outer_k) folds <- createDataPartition(as.factor(vec), p=outer_k)
} else { # Do full nested CV } else { # Do full nested CV
folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k) folds <- createFolds(as.factor(vec), k= outer_k)
} }
# Generate grid of hyperparameters for model optimization, and include inner folds row numbers # Generate grid of hyperparameters for model optimization, and include inner folds row numbers
grid_folds <- lapply(1:length(folds), grid_folds <- lapply(1:length(folds),
inner_loop, inner_loop,
folds = folds, folds = folds,
dfm = dfm, vec = vec,
inner_k = inner_k, inner_k = inner_k,
class_type = class_type,
grid = grid, grid = grid,
seed = seed) seed = seed)
@ -60,5 +58,5 @@ cv_generator <- function(outer_k, inner_k, dfm, class_type, grid, seed) {
outer_folds = folds, outer_folds = folds,
inner_folds = inner_folds)) inner_folds = inner_folds))
} }
return(generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type, grid = grid, seed = seed)) return(generate_folds(outer_k,inner_k = inner_k, vec = vec, grid = grid, seed = seed))
} }

@ -21,17 +21,8 @@
dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) { dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) {
params <- grid[row,] params <- grid[row,]
print(paste0('Parsing ',params$doctypes,' on ',params$dates )) print(paste0('Parsing ',params$doctypes,' on ',params$dates ))
query <- paste0('{"query": query <- paste0('doctype:',params$doctypes,' && publication_date:',params$dates,' && !computerCodes._delete:1')
{"bool": {"filter":[{"term":{"doctype": "',params$doctypes,'"}}, out <- elasticizer(query_string(query, fields = c('country','text','title','subtitle','teaser','preteaser')), es_pwd = es_pwd, localhost= localhost)
{"range" : {
"publication_date" : {
"gte" : "',params$dates,'T00:00:00Z",
"lt" : "',params$dates+1,'T00:00:00Z"
}
}}]
} } }')
out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost)
if (class(out$hits$hits) != 'list') { if (class(out$hits$hits) != 'list') {
dfm <- dfm_gen(out, text = "full", words = words, clean = T) dfm <- dfm_gen(out, text = "full", words = words, clean = T)
if (sum(dfm[1,]) > 0) { if (sum(dfm[1,]) > 0) {

Loading…
Cancel
Save