From 835d2332bc20983c53a1f2ef62747b85557581bf Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Tue, 5 Feb 2019 13:26:24 +0100 Subject: [PATCH] actorizer: now uses the original udpipe output for sentence and token ids. When the actorized and original udpipe output do not have the same number of rows, it prints an error and sets err to TRUE in actorDetails --- R/actorizer.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/R/actorizer.R b/R/actorizer.R index 5ed6042..dee0638 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -24,8 +24,18 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier postfix = '$^' } ### Also needs fix for empty strings (non-NA) + err <- F doc <- out[row,] + ud_org <- doc$`_source.ud`[[1]] %>% + select(-one_of('exists')) %>% # Removing ud.exists variable + unnest() ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) + if (length(ud_org$sentence_id) == length(ud$sentence_id)) { + ud <- bind_cols(ud_org, sentence = ud$sentence, token = ud$token, doc_id = ud$doc_id) + } else { + err = T + print(paste0('ud_org and ud_actor not the same length for id ', doc$`_id`)) + } sentence_count <- length(unique(ud$sentence_id)) ud <- ud %>% filter(grepl(paste0(identifier), sentence)) %>% # Only select sentences that contain the identifier @@ -42,7 +52,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier prominence <- occurences/sentence_count # Relative prominence of actor in article (number of occurences/total # sentences) rel_first <- 1-(ud$sentence_id[[1]][[1]][1]/sentence_count) # Relative position of first occurence at sentence level - return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))))) + return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))), err = err)) } out <- mamlr:::out_parser(out, field = 'highlight', clean = F) ids <- fromJSON(ids)