actorizer: fixed sentence_count and out_parser calls

out_parser: Added comment with old regex
master
Erik de Vries 6 years ago
parent 9b0ac775af
commit e70b6ccf7a

@ -26,7 +26,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
### Also needs fix for empty strings (non-NA) ### Also needs fix for empty strings (non-NA)
doc <- out[row,] doc <- out[row,]
ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`))
sentence_count <- length(unique(ud$sentence)) sentence_count <- length(unique(ud$sentence_id))
ud <- ud %>% ud <- ud %>%
filter(grepl(paste0(identifier), sentence)) %>% # Only select sentences that contain the identifier filter(grepl(paste0(identifier), sentence)) %>% # Only select sentences that contain the identifier
filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives) filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives)
@ -44,7 +44,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))))) return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids)))))
} }
out <- out_parser(out, field = 'highlight', clean = F) out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
ids <- fromJSON(ids) ids <- fromJSON(ids)
updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores())) updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver) bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)

@ -65,6 +65,8 @@ out_parser <- function(out, field, clean = F) {
# Regex removes all words consisting of or containing numbers, @#$% # Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# Regex also used in merger function # Regex also used in merger function
### Old regex, used for duplicate detection:
# \\S*?[0-9@#$%]+[^\\s!?.,;:]*
out$merged <- out$merged %>% out$merged <- out$merged %>%
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>% {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
str_replace_all("<.{0,20}?>", " ") %>% str_replace_all("<.{0,20}?>", " ") %>%

Loading…
Cancel
Save