From adc4b3c639e80177cbaf76f899e9554f60c6df9e Mon Sep 17 00:00:00 2001
From: Erik de Vries <erik@devries.pm>
Date: Wed, 7 Nov 2018 15:10:10 +0100
Subject: [PATCH] Updated feature selection in modelizer function (see comment
 on lines 166/167)

---
 R/modelizer.R      | 6 ++++--
 man/dupe_detect.Rd | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/R/modelizer.R b/R/modelizer.R
index 10f072b..4366132 100644
--- a/R/modelizer.R
+++ b/R/modelizer.R
@@ -163,14 +163,16 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
     dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
     dfreq <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0, use.names=T)
     dfm_train <- custom_tfidf(dfm_train, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq)
-    words <- unlist(mclapply(unique(docvars(dfm_train, class_type)),
+    # Added unique() to filter out duplicate words; these occur when there are multiple categories and a word scores higher
+    # than the threshold on two or more of those categories.
+    words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
                              feat_select,
                              dfm = dfm_train,
                              class_type = class_type,
                              percentile = params$percentiles,
                              measure = params$measures,
                              mc.cores = cores_feats
-    ))
+    )))
     dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
 
 
diff --git a/man/dupe_detect.Rd b/man/dupe_detect.Rd
index 03710b5..1d2c0b6 100644
--- a/man/dupe_detect.Rd
+++ b/man/dupe_detect.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/duplicate_detection.R
+% Please edit documentation in R/dupe_detect.R
 \name{dupe_detect}
 \alias{dupe_detect}
 \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}