From 919e71ac684c7df7fc2677ed312a508b487e4dd8 Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Wed, 7 Nov 2018 15:10:10 +0100 Subject: [PATCH] Updated feature selection in modelizer function (see comment on lines 166/167) --- R/modelizer.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/modelizer.R b/R/modelizer.R index 10f072b..4366132 100644 --- a/R/modelizer.R +++ b/R/modelizer.R @@ -163,14 +163,16 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0) dfreq <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0, use.names=T) dfm_train <- custom_tfidf(dfm_train, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) - words <- unlist(mclapply(unique(docvars(dfm_train, class_type)), + # Added unique to filter out duplicate words; these arise when there are multiple categories and a word scores higher + # than the threshold on two or more of those categories + words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)), feat_select, dfm = dfm_train, class_type = class_type, percentile = params$percentiles, measure = params$measures, mc.cores = cores_feats - )) + ))) dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)