diff --git a/R/actorizer.R b/R/actorizer.R index 679df15..8da8569 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -7,68 +7,65 @@ #' @param prefix Regex containing prefixes that should be excluded from hits #' @param postfix Regex containing postfixes that should be excluded from hits #' @param identifier String used to mark highlights. Should be a lowercase string -#' @param udmodel The udpipe model used for parsing every hit #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2') #' @param es_super Password for write access to ElasticSearch #' @return As this is a nested function used within elasticizer, there is no return output #' @export #' @examples -#' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super) -actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super, ver) { +#' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super) +actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, es_super, ver) { + ### Function to filter out false positives using regex exceptionizer <- function(id, ud, doc, markers, regex_identifier, prefix, postfix) { - min <- min(ud$start[ud$sentence_id == id]) - max <- max(ud$end[ud$sentence_id == id]) - split <- markers[markers %in% seq(min, max, 1)] - max <- max+(length(split)*nchar(identifier)) - sentence <- str_sub(doc$highlight, min, max) + min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence + max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence + split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence + max <- max+(length(split)*nchar(identifier)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6) + sentence <- str_sub(doc$merged, min, max) # Extract sentence from text + # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing if (!str_detect(sentence, paste0(regex_identifier,postfix)) && !str_detect(sentence, paste0(prefix,regex_identifier))) { return(id) } else { return(NULL) } } + ranger <- function(x, ud) { + return(which((ud$start <= x) & (ud$end >= x))) + } sentencizer <- function(row, out, ids, prefix, postfix, identifier, type) { - print(row) - ### If no pre or postfixes, match *not nothing* i.e. anything - if (is.na(prefix) || prefix == '') { - prefix = '$^' - } - if (is.na(postfix) || postfix == '') { - postfix = '$^' - } - ### Also needs fix for empty strings (non-NA) - err <- F doc <- out[row,] + # Extracting ud output from document ud <- doc$`_source.ud`[[1]] %>% select(-one_of('exists')) %>% # Removing ud.exists variable unnest() %>% mutate(doc_id = doc$`_id`) - # ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) - markers <- doc$markers[[1]][,'start'] - if (length(setdiff(markers,ud$start)) > 0) { - err <- T - ud <- ud %>% - group_by(doc_id) %>% - summarise( - sentence_id = list(list(as.integer(0))), - sentence_start = list(list(0)), - sentence_end = list(list(0)) - ) - occurences <- 0 - prominence <- 0 - rel_first <- 0 + markers <- doc$markers[[1]][,'start'] # Extract list of markers + # Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word)) + rows <- unlist(mclapply(markers, ranger, ud = ud, mc.cores = detectCores())) - return(data.frame(ud,actor_start = I(list(list(markers))), occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))), err = err)) - } - ud$actor[ud$start %in% markers] <- T - sentence_count <- length(unique(ud$sentence_id)) - actor_sentences <- unique(na.omit(ud$sentence_id[ud$actor == T])) + # Setting up an actor variable + ud$actor <- F + ud$actor[rows] <- T + + sentence_count <- max(ud$sentence_id) # Number of sentences in article + actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor + actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor + actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor + + # Conducting regex filtering on matches only when actor type is Party if (type == "Party") { + ### If no pre or postfixes, match *not nothing* i.e. anything + if (is.na(prefix) || prefix == '') { + prefix = '$^' + } + if (is.na(postfix) || postfix == '') { + postfix = '$^' + } sentence_ids <- lapply(actor_sentences, exceptionizer, ud = ud, doc = doc, markers = markers, regex_identifier = regex_identifier, prefix = prefix, postfix = postfix) } else { sentence_ids <- actor_sentences } + # Generating nested sentence start and end positions for actor sentences ud <- ud %>% filter(sentence_id %in% sentence_ids) %>% group_by(sentence_id) %>% @@ -79,35 +76,24 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier ) %>% group_by(doc_id) %>% summarise( - sentence_id = list(list(as.integer(sentence_id))), - sentence_start = list(list(sentence_start)), - sentence_end = list(list(sentence_end)) + sentence_id = list(as.integer(sentence_id)), + sentence_start = list(sentence_start), + sentence_end = list(sentence_end) ) - occurences <- length(unique(ud$sentence_id[[1]][[1]])) # Number of sentences in which actor occurs - prominence <- occurences/sentence_count # Relative prominence of actor in article (number of occurences/total # sentences) - rel_first <- 1-(ud$sentence_id[[1]][[1]][1]/sentence_count) # Relative position of first occurence at sentence level - - return(data.frame(ud,actor_start = I(list(list(markers))), occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))), err = err)) - - - # ## The exception below is only valid for the UK, where the original UDPipe output misses a dot at the end of the article, but the actor output does not - # ## (UK output is older than actor output, should be updated) - # if (length(ud_org$sentence_id) == length(ud$sentence_id)-1) { - # ud <- ud[-length(ud$sentence_id),] - # } - # if (length(ud_org$sentence_id) == length(ud$sentence_id)) { - # ud <- bind_cols(ud_org, sentence = ud$sentence, token = ud$token, doc_id = ud$doc_id, actor = ud$actor) - # } else { - # err = T - # print(paste0('ud_org and ud_actor not the same length for id ', doc$`_id`)) - # print(length(ud_org$sentence_id)) - # print(length(ud$sentence_id)) - # } + return( + data.frame(ud, # Sentence id, start and end position for actor sentences + actor_start = I(list(actor_start)), # List of actor ud token start positions + actor_end = I(list(actor_end)), # List of actor ud token end positions + occ = length(unique(actor_sentences)), # Number of sentences in which actor occurs + prom = length(unique(actor_sentences))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences) + rel_first = 1-(min(actor_sentences)/sentence_count), # Relative position of first occurence at sentence level + first = min(actor_sentences), # First sentence in which actor is mentioned + ids = I(list(ids)) # List of actor ids + ) + ) } out <- mamlr:::out_parser(out, field = 'highlight', clean = F) - # out$highlight <- out$merged - # out <- mamlr:::out_parser(out, field = '_source', clean = F) offsetter <- function(x) { return(x-((row(x)-1)*nchar(identifier))) } @@ -116,7 +102,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier ids <- fromJSON(ids) updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, type = type, mc.cores = detectCores())) - bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail2', type = 'add', ver = ver) - bulk <- c(bulk,apply(updates[c(1,9)], 1, bulk_writer, varname='actors2', type = 'add', ver = ver)) + bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver) + bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver)) return(elastic_update(bulk, es_super = es_super, localhost = localhost)) } diff --git a/R/bulk_writer.R b/R/bulk_writer.R index 947538a..11f3a50 100644 --- a/R/bulk_writer.R +++ b/R/bulk_writer.R @@ -20,8 +20,8 @@ bulk_writer <- function(x, index = 'maml', varname, type, ver) { ### Create a json object if more than one variable besides _id, otherwise use value as-is if (length(x) > 2) { - json <- toJSON(bind_rows(x)[-1], collapse = T) - x} else { + json <- toJSON(list(x[-1]), collapse = T) + } else { names(x) <- NULL json <- toJSON(x[-1], collapse = T) } diff --git a/R/elastic_update.R b/R/elastic_update.R index fd455c9..96bddc4 100644 --- a/R/elastic_update.R +++ b/R/elastic_update.R @@ -30,8 +30,8 @@ elastic_update <- function(x, es_super = 'secret', localhost = T) { httr:::stop_for_status(res) appData <- httr:::content(res) if (appData$errors == T){ - print(appData) - stop("Aborting, errors found during updating") + print("Aborting, errors found during updating") + return(appData) } print("updated") return(1) diff --git a/R/ud_update.R b/R/ud_update.R index d5ba72c..d8632d1 100644 --- a/R/ud_update.R +++ b/R/ud_update.R @@ -26,16 +26,16 @@ ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>% group_by(doc_id) %>% summarise( - sentence_id = list(list(as.integer(sentence_id))), - token_id = list(list(as.integer(token_id))), - lemma = list(list(as.character(lemma))), - upos = list(list(as.character(upos))), - feats = list(list(as.character(feats))), - head_token_id = list(list(as.integer(head_token_id))), - dep_rel = list(list(as.character(dep_rel))), - start = list(list(as.integer(start))), - end = list(list(as.integer(end))), - exists = list(list(TRUE)) + sentence_id = list(as.integer(sentence_id)), + token_id = list(as.integer(token_id)), + lemma = list(as.character(lemma)), + upos = list(as.character(upos)), + feats = list(as.character(feats)), + head_token_id = list(as.integer(head_token_id)), + dep_rel = list(as.character(dep_rel)), + start = list(as.integer(start)), + end = list(as.integer(end)), + exists = list(TRUE) ) return(ud) }