actorizer: fixed sentence_count and out_parser calls

out_parser: Added comment with old regex
6 years ago · e70b6ccf7a
parent 9b0ac775af
commit e70b6ccf7a
2 changed files with 4 additions and 2 deletions
--- a/R/actorizer.R
+++ b/R/actorizer.R
@@ -26,7 +26,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
    ### Also needs fix for empty strings (non-NA)
    doc <- out[row,]
    ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`))
-    sentence_count <- length(unique(ud$sentence))
+    sentence_count <- length(unique(ud$sentence_id))
    ud <- ud %>%
      filter(grepl(paste0(identifier), sentence)) %>% # Only select sentences that contain the identifier
      filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives)
@@ -44,7 +44,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier

    return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids)))))
  }
-  out <- out_parser(out, field = 'highlight', clean = F)
+  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
  ids <- fromJSON(ids)
  updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
  bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -65,6 +65,8 @@ out_parser <- function(out, field, clean = F) {
  # Regex removes all words consisting of or containing numbers, @#$%
  # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
  # Regex also used in merger function
+  ### Old regex, used for duplicate detection:
+  # \\S*?[0-9@#$%]+[^\\s!?.,;:]*
  out$merged <- out$merged %>%
    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . } %>%
    str_replace_all("<.{0,20}?>", " ") %>%