diff --git a/R/actorizer.R b/R/actorizer.R index 9ea4b13..5ed6042 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -26,7 +26,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier ### Also needs fix for empty strings (non-NA) doc <- out[row,] ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) - sentence_count <- length(unique(ud$sentence)) + sentence_count <- length(unique(ud$sentence_id)) ud <- ud %>% filter(grepl(paste0(identifier), sentence)) %>% # Only select sentences that contain the identifier filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives) @@ -44,7 +44,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))))) } - out <- out_parser(out, field = 'highlight', clean = F) + out <- mamlr:::out_parser(out, field = 'highlight', clean = F) ids <- fromJSON(ids) updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores())) bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver) diff --git a/R/out_parser.R b/R/out_parser.R index a93e3ad..fbf8966 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -65,6 +65,8 @@ out_parser <- function(out, field, clean = F) { # Regex removes all words consisting of or containing numbers, @#$% # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above # Regex also used in merger function + ### Old regex, used for duplicate detection: + # \\S*?[0-9@#$%]+[^\\s!?.,;:]* out$merged <- out$merged %>% {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>% str_replace_all("<.{0,20}?>", " ") %>%