diff --git a/R/actorizer.R b/R/actorizer.R index b64a957..784323a 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -15,7 +15,20 @@ #' @examples #' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super) actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super, ver) { - sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier) { + exceptionizer <- function(id, ud, doc, markers, regex_identifier, prefix, postfix) { + min <- min(ud$start[ud$sentence_id == id]) + max <- max(ud$end[ud$sentence_id == id]) + split <- markers[markers %in% seq(min, max, 1)] + max <- max+(length(split)*nchar(identifier)) + sentence <- str_sub(doc$highlight, min, max) + if (!str_detect(sentence, paste0(regex_identifier,postfix)) && !str_detect(sentence, paste0(prefix,regex_identifier))) { + return(id) + } else { + return(NULL) + } + } + sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier, type) { + print(row) ### If no pre or postfixes, match *not nothing* i.e. anything if (is.na(prefix) || prefix == '') { prefix = '$^' @@ -26,48 +39,84 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier ### Also needs fix for empty strings (non-NA) err <- F doc <- out[row,] - ud_org <- doc$`_source.ud`[[1]] %>% + ud <- doc$`_source.ud`[[1]] %>% select(-one_of('exists')) %>% # Removing ud.exists variable - unnest() - ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) - ud[,'actor'] <- NA - markers <- which(str_detect(ud$lemma, coll("|||"))) - ud[markers+1,'actor'] <- T - ud <- ud[-markers,] - ## The exception below is only valid for the UK, where the original UDPipe output misses a dot at the end of the article, but the actor output does not - ## (UK output is older than actor output, should be updated) - if (length(ud_org$sentence_id) == length(ud$sentence_id)-1) { - ud <- ud[-length(ud$sentence_id),] + unnest() %>% + mutate(doc_id = doc$`_id`) + # ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) + markers <- doc$markers[[1]][,'start'] + if (length(setdiff(markers,ud$start)) > 0) { + err <- T + ud <- ud %>% + group_by(doc_id) %>% + summarise( + sentence_id = list(list(as.integer(0))), + sentence_start = list(list(0)), + sentence_end = list(list(0)) + ) + occurences <- 0 + prominence <- 0 + rel_first <- 0 + + return(data.frame(ud,actor_start = I(list(list(markers))), occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))), err = err)) } - if (length(ud_org$sentence_id) == length(ud$sentence_id)) { - ud <- bind_cols(ud_org, sentence = ud$sentence, token = ud$token, doc_id = ud$doc_id, actor = ud$actor) + ud$actor[ud$start %in% markers] <- T + sentence_count <- length(unique(ud$sentence_id)) + actor_sentences <- unique(na.omit(ud$sentence_id[ud$actor == T])) + if (type == "Party") { + sentence_ids <- lapply(actor_sentences, exceptionizer, ud = ud, doc = doc, markers = markers, regex_identifier = regex_identifier, prefix = prefix, postfix = postfix) } else { - err = T - print(paste0('ud_org and ud_actor not the same length for id ', doc$`_id`)) - print(length(ud_org$sentence_id)) - print(length(ud$sentence_id)) + sentence_ids <- actor_sentences } - sentence_count <- length(unique(ud$sentence_id)) + ud <- ud %>% - filter(T, actor) %>% # Only select tokens containing actor - filter(!str_detect(sentence, postfix)) %>% # Filter out sentences with matching postfixes (false positives) - filter(!str_detect(sentence, prefix)) %>% # Filter out sentences with matching prefixes (false positives) + filter(sentence_id %in% sentence_ids) %>% + group_by(sentence_id) %>% + summarise ( + sentence_start = as.integer(min(start)), + sentence_end = as.integer(max(end)), + doc_id = first(doc_id) + ) %>% group_by(doc_id) %>% summarise( sentence_id = list(list(as.integer(sentence_id))), - token_id = list(list(as.integer(token_id))), - text = list(list(unique(as.character(sentence)))) + sentence_start = list(list(sentence_start)), + sentence_end = list(list(sentence_end)) ) occurences <- length(unique(ud$sentence_id[[1]][[1]])) # Number of sentences in which actor occurs prominence <- occurences/sentence_count # Relative prominence of actor in article (number of occurences/total # sentences) rel_first <- 1-(ud$sentence_id[[1]][[1]][1]/sentence_count) # Relative position of first occurence at sentence level - return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))), err = err)) + return(data.frame(ud,actor_start = I(list(list(markers))), occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))), err = err)) + + + # ## The exception below is only valid for the UK, where the original UDPipe output misses a dot at the end of the article, but the actor output does not + # ## (UK output is older than actor output, should be updated) + # if (length(ud_org$sentence_id) == length(ud$sentence_id)-1) { + # ud <- ud[-length(ud$sentence_id),] + # } + # if (length(ud_org$sentence_id) == length(ud$sentence_id)) { + # ud <- bind_cols(ud_org, sentence = ud$sentence, token = ud$token, doc_id = ud$doc_id, actor = ud$actor) + # } else { + # err = T + # print(paste0('ud_org and ud_actor not the same length for id ', doc$`_id`)) + # print(length(ud_org$sentence_id)) + # print(length(ud$sentence_id)) + # } + } out <- mamlr:::out_parser(out, field = 'highlight', clean = F) + # out$highlight <- out$merged + # out <- mamlr:::out_parser(out, field = '_source', clean = F) + offsetter <- function(x) { + return(x-((row(x)-1)*nchar(identifier))) + } + regex_identifier <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", identifier) + out$markers <- lapply(str_locate_all(out$merged,coll(identifier)), offsetter) + ids <- fromJSON(ids) - updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores())) - bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver) - bulk <- c(bulk,apply(updates[c(1,8)], 1, bulk_writer, varname='actors', type = 'add', ver = ver)) + updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, type = type, mc.cores = detectCores())) + bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail2', type = 'add', ver = ver) + bulk <- c(bulk,apply(updates[c(1,8)], 1, bulk_writer, varname='actors2', type = 'add', ver = ver)) return(elastic_update(bulk, es_super = es_super, localhost = localhost)) } diff --git a/R/ud_update.R b/R/ud_update.R index 0d8e5c4..d5ba72c 100644 --- a/R/ud_update.R +++ b/R/ud_update.R @@ -20,13 +20,12 @@ # } ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) { - out <- out_parser(out, field = '_source', clean = F) + out <- mamlr:::out_parser(out, field = '_source', clean = F) par_proc <- function(row, out, udmodel) { doc <- out[row,] - ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>% + ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>% group_by(doc_id) %>% summarise( - paragraph_id = list(list(as.integer(paragraph_id))), sentence_id = list(list(as.integer(sentence_id))), token_id = list(list(as.integer(token_id))), lemma = list(list(as.character(lemma))), @@ -34,6 +33,8 @@ ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword feats = list(list(as.character(feats))), head_token_id = list(list(as.integer(head_token_id))), dep_rel = list(list(as.character(dep_rel))), + start = list(list(as.integer(start))), + end = list(list(as.integer(end))), exists = list(list(TRUE)) ) return(ud)