actorizer: use '|||' as the highlight indicator, and adapt the merging of the UDPipe (ud) output accordingly

master
Erik de Vries 6 years ago
parent 5665b6d622
commit eae1a22609

@ -30,14 +30,17 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
select(-one_of('exists')) %>% # Removing ud.exists variable select(-one_of('exists')) %>% # Removing ud.exists variable
unnest() unnest()
ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`))
ud[,'actor'] <- NA
### The exception below is only valid for the UK, where the original UDPipe output misses a dot at the end of the article, but the actor output does not markers <- which(str_detect(ud$lemma, coll("|||")))
### (UK output is older than actor output, should be updated) ud[markers+1,'actor'] <- T
ud <- ud[-markers,]
## The exception below only applies to the UK, where the original UDPipe output is missing the final dot at the end of the article, while the actor output includes it
## (the UK output is older than the actor output and should be updated)
if (length(ud_org$sentence_id) == length(ud$sentence_id)-1) { if (length(ud_org$sentence_id) == length(ud$sentence_id)-1) {
ud <- ud[-length(ud$sentence_id),] ud <- ud[-length(ud$sentence_id),]
} }
if (length(ud_org$sentence_id) == length(ud$sentence_id)) { if (length(ud_org$sentence_id) == length(ud$sentence_id)) {
ud <- bind_cols(ud_org, sentence = ud$sentence, token = ud$token, doc_id = ud$doc_id) ud <- bind_cols(ud_org, sentence = ud$sentence, token = ud$token, doc_id = ud$doc_id, actor = ud$actor)
} else { } else {
err = T err = T
print(paste0('ud_org and ud_actor not the same length for id ', doc$`_id`)) print(paste0('ud_org and ud_actor not the same length for id ', doc$`_id`))
@ -46,10 +49,9 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
} }
sentence_count <- length(unique(ud$sentence_id)) sentence_count <- length(unique(ud$sentence_id))
ud <- ud %>% ud <- ud %>%
filter(grepl(paste0(identifier), sentence)) %>% # Only select sentences that contain the identifier filter(T, actor) %>% # Only select tokens containing actor
filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives) filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives)
filter(!str_detect(sentence, prefix)) %>% # Filter out sentences with matching prefixes (false positives) filter(!str_detect(sentence, prefix)) %>% # Filter out sentences with matching prefixes (false positives)
filter(grepl(paste0(identifier,'.*'), token)) %>% # Only select tokens that start with the identifier
group_by(doc_id) %>% group_by(doc_id) %>%
summarise( summarise(
sentence_id = list(list(as.integer(sentence_id))), sentence_id = list(list(as.integer(sentence_id))),

Loading…
Cancel
Save