From e70b6ccf7a127e436385de102cf8665f2a7f8457 Mon Sep 17 00:00:00 2001
From: Erik de Vries <erik@devries.pm>
Date: Mon, 4 Feb 2019 14:16:04 +0100
Subject: [PATCH] actorizer: fixed sentence_count and out_parser calls
 out_parser: Added comment with old regex

---
 R/actorizer.R  | 4 ++--
 R/out_parser.R | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/actorizer.R b/R/actorizer.R
index 9ea4b13..5ed6042 100644
--- a/R/actorizer.R
+++ b/R/actorizer.R
@@ -26,7 +26,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
     ### Also needs fix for empty strings (non-NA)
     doc <- out[row,]
     ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`))
-    sentence_count <- length(unique(ud$sentence))
+    sentence_count <- length(unique(ud$sentence_id))
     ud <- ud %>%
       filter(grepl(paste0(identifier), sentence)) %>% # Only select sentences that contain the identifier
       filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives)
@@ -44,7 +44,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
 
     return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids)))))
   }
-  out <- out_parser(out, field = 'highlight', clean = F)
+  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
   ids <- fromJSON(ids)
   updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
   bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
diff --git a/R/out_parser.R b/R/out_parser.R
index a93e3ad..fbf8966 100644
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -65,6 +65,8 @@ out_parser <- function(out, field, clean = F) {
   # Regex removes all words consisting of or containing numbers, @#$%
   # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
   # Regex also used in merger function
+  ### Old regex, used for duplicate detection:
+  # \\S*?[0-9@#$%]+[^\\s!?.,;:]*
   out$merged <- out$merged %>%
     {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . } %>%
     str_replace_all("<.{0,20}?>", " ") %>%