diff --git a/R/dfm_gen.R b/R/dfm_gen.R index 8f66735..a55cd27 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -31,18 +31,13 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = detectCo } if ('_source.codes.majorTopic' %in% colnames(out)) { out <- out %>% - mutate(codes = case_when( - .$`_source.codes.timeSpent` == -1 ~ NA_character_, - TRUE ~ .$`_source.codes.majorTopic` - ) - ) %>% mutate(junk = case_when( - .$codes == 2301 ~ 1, - .$codes == 3101 ~ 1, - .$codes == 34 ~ 1, - .$`_source.codes.timeSpent` == -1 ~ NA_real_, - TRUE ~ 0 - ) + .$codes == 2301 ~ 1, + .$codes == 3101 ~ 1, + .$codes == 34 ~ 1, + .$`_source.codes.timeSpent` == -1 ~ NA_real_, + TRUE ~ 0 + ) ) %>% mutate(aggregate = .$codes %>% str_pad(4, side="right", pad="a") %>% diff --git a/R/modelizer.R b/R/modelizer.R index a39d734..b6df864 100644 --- a/R/modelizer.R +++ b/R/modelizer.R @@ -256,8 +256,15 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se } } - ## Generate nested CV folds, based on number of inner and outer folds defined (see start of script) - folds <- generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type) + ### If outer_k is 1, do a holdout training run, with only cross-validation for parameter optimization, else, do nested CV + ### If holdout, training/test distribution is the same as for inner CV + if (outer_k == 1) { + outer_fold <- createDataPartition(as.factor(docvars(dfm, class_type)), p=1-(1/inner_k)*(inner_k-1)) + folds <- lapply(outer_fold,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type) + } else { + ## Generate nested CV folds, based on number of inner and outer folds defined (see start of script) + folds <- generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type) + } ## Get performance of each outer fold validation, and add row with mean scores (This is the final performance indicator) performance <- mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer) diff --git a/R/query_string.R b/R/query_string.R index b771798..962535f 100644 --- a/R/query_string.R +++ b/R/query_string.R @@ -12,7 +12,7 @@ #################################### Get data from ElasticSearch ################################ ################################################################################################# -query_string <- function(query, fields = F, random = F) { +query_string <- function(query, fields = F, random = F, default_operator = "AND") { if (typeof(fields) == 'logical') { fields <- '*' } @@ -28,7 +28,7 @@ query_string <- function(query, fields = F, random = F) { "query_string" : { "default_field" : "text", "query" : "',query,'", - "default_operator": "AND", + "default_operator": "',default_operator,'", "allow_leading_wildcard" : false } }] @@ -50,7 +50,7 @@ query_string <- function(query, fields = F, random = F) { "query_string" : { "default_field" : "text", "query" : "',query,'", - "default_operator": "AND", + "default_operator": "',default_operator,'", "allow_leading_wildcard" : false } }]