diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index fee5173..bc603e7 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -16,13 +16,13 @@
 
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
 
-dfm_gen <- function(out,words = '999', text = "lemmas") {
+dfm_gen <- function(out, words = '999', text = "lemmas") {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
   if (text == "lemmas") {
-    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, mc.cores = detectCores()))
   }
   if (text == "full") {
     out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
@@ -35,6 +35,12 @@ dfm_gen <- function(out,words = '999', text = "lemmas") {
       str_replace_all("<.*?>", " ") %>%
       str_replace_all("\\s+"," ")
   }
+  if (words != "999") {
+    ### Former word-count regex (kept below for reference): it kept words up to the NEXT sentence boundary after the cutoff, whereas the code below cuts back to the LAST sentence boundary before it
+    # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
+    out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>%
+      str_extract('.*[.?!]'))
+  }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
       mutate(codes = as.numeric(case_when(
diff --git a/R/dupe_detect.R b/R/dupe_detect.R
index cf01a0f..4010ed8 100644
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@@ -6,16 +6,17 @@
 #' @param cutoff_lower Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive)
 #' @param cutoff_upper Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive)
 #' @param es_pwd Password for Elasticsearch read access
+#' @param words Document cutoff point, in number of words. Each document is truncated at the last sentence terminator (., ? or !) before the cutoff, so the result may be slightly shorter than `words` words
 #' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
 #' @export
 #' @examples
-#' dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd)
+#' dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, words)
 
 #################################################################################################
 #################################### Duplicate detector ################################
 #################################################################################################
 
-dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd) {
+dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words) {
   params <- grid[row,]
   print(paste0('Parsing ',params$doctypes,' on ',params$dates ))
   query <- paste0('{"query":
@@ -31,7 +32,7 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd) {
 
 
   out <- elasticizer(query, es_pwd = es_pwd)
-  dfm <- dfm_gen(out, text = "full")
+  dfm <- dfm_gen(out, text = "full", words = words)
   simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
   diag(simil) <- NA
   df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
diff --git a/R/merger.R b/R/merger.R
index 8d17318..975d32a 100644
--- a/R/merger.R
+++ b/R/merger.R
@@ -12,7 +12,7 @@
 #################################### Reconstructing documents from lemmas########################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, words = '999', out = out) {
+merger <- function(row, out = out) {
   df <- out[row,]
   # Mergin lemmas into single string
   lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
@@ -22,7 +22,5 @@ merger <- function(row, words = '999', out = out) {
     str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
     # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
     paste0(.,". ")
-  if (words != "999") {
-    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))}
   return(lemmas)
-}
\ No newline at end of file
+}