From e70b6ccf7a127e436385de102cf8665f2a7f8457 Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Mon, 4 Feb 2019 14:16:04 +0100 Subject: [PATCH] actorizer: fixed sentence_count and out_parser calls out_parser: Added comment with old regex --- R/actorizer.R | 4 ++-- R/out_parser.R | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/actorizer.R b/R/actorizer.R index 9ea4b13..5ed6042 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -26,7 +26,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier ### Also needs fix for empty strings (non-NA) doc <- out[row,] ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) - sentence_count <- length(unique(ud$sentence)) + sentence_count <- length(unique(ud$sentence_id)) ud <- ud %>% filter(grepl(paste0(identifier), sentence)) %>% # Only select sentences that contain the identifier filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives) @@ -44,7 +44,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))))) } - out <- out_parser(out, field = 'highlight', clean = F) + out <- mamlr:::out_parser(out, field = 'highlight', clean = F) ids <- fromJSON(ids) updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores())) bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver) diff --git a/R/out_parser.R b/R/out_parser.R index a93e3ad..fbf8966 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -65,6 +65,8 @@ out_parser <- function(out, field, clean = F) { # Regex removes all words consisting of or containing numbers, @#$% # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above # Regex also used in merger function + ### Old regex, used for duplicate detection: + # \\S*?[0-9@#$%]+[^\\s!?.,;:]* out$merged <- out$merged %>% {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>% str_replace_all("<.{0,20}?>", " ") %>%