diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index 91e6e46..1eb9bd9 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -4,7 +4,7 @@
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
-#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!
+#' @param clean Boolean indicating whether the results should be cleaned by removing words that match a regular expression (see the code for the exact pattern). Lemmatized output is always cleaned!
 #' @return A Quanteda dfm
 #' @export
 #' @examples
diff --git a/R/dupe_detect.R b/R/dupe_detect.R
index 63e4e9e..f03a58f 100644
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@@ -33,15 +33,17 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
                   } } }')
   out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost)
   if (class(out$hits$hits) != 'list') {
-    dfm <- dfm_gen(out, text = "full", words = words)
+    dfm <- dfm_gen(out, text = "full", words = words, clean = T)
     if (sum(dfm[1,]) > 0) {
       simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
       diag(simil) <- NA
-      df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
-        rownames_to_column("rowid") %>%
+      duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)
+      duplicates <- cbind(duplicates, rowid= rownames(duplicates))
+      rownames(duplicates) <- seq(1:length(rownames(duplicates)))
+      df <- as.data.frame(duplicates, make.names = NA) %>%
         mutate(colid = colnames(simil)[col]) %>%
-        .[,c(1,4)] %>%
-        group_by(colid) %>% summarise(rowid=list(rowid))
+        .[,c(3,4)] %>%
+        group_by(rowid) %>% summarise(colid=list(colid))
       text <- capture.output(stream_out(df))
       # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
       simil[upper.tri(simil)] <- NA
diff --git a/R/out_parser.R b/R/out_parser.R
index 369e114..eee720b 100644
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -3,7 +3,7 @@
 #' Parse raw text into a single field
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
-#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*
+#' @param clean Boolean indicating whether the results should be cleaned by removing words that match a regular expression (see the code for the exact pattern)
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @examples
 #' out_parser(out,field)
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
index 1ef1ea3..4bed478 100644
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@@ -13,7 +13,7 @@ dfm_gen(out, words = "999", text = "lemmas", clean)
 
 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
 
-\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!}
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words that match a regular expression (see the code for the exact pattern). Lemmatized output is always cleaned!}
 }
 \value{
 A Quanteda dfm
diff --git a/man/out_parser.Rd b/man/out_parser.Rd
index a67a904..cb13609 100644
--- a/man/out_parser.Rd
+++ b/man/out_parser.Rd
@@ -11,7 +11,7 @@ out_parser(out, field, clean = F)
 
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
 
-\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*}
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words that match a regular expression (see the code for the exact pattern)}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text