mamlr/R/actorizer.R

#' Updater function for elasticizer: conduct actor searches
#'
#' For every document in `out`, locates the highlight markers (`identifier`)
#' in the merged text, maps them onto the stored UDPipe tokens and sentences,
#' optionally filters false positives with the prefix/postfix regexes, and
#' hands the resulting `actorsDetail` and `actors` bulk updates to
#' `elastic_update()`.
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ids JSON string containing the actor ids (it is parsed with `fromJSON`)
#' @param type Actor type. Regex filtering with `prefix`/`postfix` is only applied when this equals "Party"
#' @param prefix Regex containing prefixes that should be excluded from hits
#' @param postfix Regex containing postfixes that should be excluded from hits
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param es_super Password for write access to ElasticSearch
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @return The value returned by `elastic_update()` for the generated bulk updates, or `NULL` (invisibly) when no document yields an update
#' @export
#' @examples
#' actorizer(out, localhost = FALSE, ids, type, prefix, postfix, identifier, es_super, ver)
actorizer <- function(out, localhost = FALSE, ids, type, prefix, postfix, identifier, es_super, ver) {
  ### Function to filter out false positives using regex
  # Returns the sentence id when neither exclusion regex matches the sentence
  # text, and NULL otherwise. `identifier` is taken from the enclosing
  # actorizer() call.
  exceptionizer <- function(id, ud, doc, markers, regex_identifier, prefix, postfix) {
    marker_width <- nchar(identifier)
    sent_start <- min(ud$start[ud$sentence_id == id]) # Start position of sentence (marker-free text)
    sent_end <- max(ud$end[ud$sentence_id == id]) # End position of sentence (marker-free text)
    # `markers` hold positions in the marker-free text, while doc$merged still
    # contains the markers. Every marker preceding the sentence shifts the
    # whole window to the right; markers inside the sentence additionally
    # extend its end (e.g. two markers of three characters inside the sentence
    # move the end position by +6).
    n_before <- sum(markers < sent_start)
    n_inside <- sum(markers >= sent_start & markers <= sent_end)
    text_start <- sent_start + n_before * marker_width
    text_end <- sent_end + (n_before + n_inside) * marker_width
    sentence <- str_sub(doc$merged, text_start, text_end) # Extract sentence from marked text
    # If one of the regexes matches, the sentence is a false positive
    if (str_detect(sentence, paste0(regex_identifier, postfix)) ||
        str_detect(sentence, paste0(prefix, regex_identifier))) {
      return(NULL)
    }
    id
  }

  # Indices of the udpipe rows (tokens) whose character span contains position x
  ranger <- function(x, ud) {
    which((ud$start <= x) & (ud$end >= x))
  }

  # Builds the one-row update for the document in row `row` of `out`, or
  # returns NULL when no actor sentence remains for that document.
  sentencizer <- function(row, out, ids, prefix, postfix, identifier, type) {
    doc <- out[row, ]
    # Extracting ud output from document
    ud <- doc$`_source.ud`[[1]] %>%
      select(-one_of('exists')) %>% # Removing ud.exists variable
      unnest() %>%
      mutate(doc_id = doc$`_id`)
    markers <- doc$markers[[1]][, 'start'] # Extract list of markers
    # Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
    rows <- unlist(lapply(markers, ranger, ud = ud))

    # Setting up an actor variable
    ud$actor <- FALSE
    ud$actor[rows] <- TRUE

    sentence_count <- max(ud$sentence_id) # Number of sentences in article
    actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
    actor_start <- ud$start[ud$actor] # Udpipe token start positions for actor
    actor_end <- ud$end[ud$actor] # Udpipe token end positions for actor

    # Conducting regex filtering on matches only when actor type is Party
    if (type == "Party") {
      ### If no pre or postfixes are given, use '$^', which is intended as a
      ### pattern that never matches, so that no sentence is excluded
      if (is.na(prefix) || prefix == '') {
        prefix <- '$^'
      }
      if (is.na(postfix) || postfix == '') {
        postfix <- '$^'
      }
      # unlist() drops the NULLs returned for excluded sentences
      sentence_ids <- unlist(lapply(
        actor_sentences, exceptionizer,
        ud = ud, doc = doc, markers = markers,
        regex_identifier = regex_identifier, prefix = prefix, postfix = postfix
      ))
    } else {
      sentence_ids <- actor_sentences
    }

    # Nothing left to report for this document (no markers mapped to tokens,
    # or every sentence was excluded). Returning NULL lets bind_rows() skip it
    # instead of failing on a zero-row/one-row data.frame() mismatch.
    if (length(sentence_ids) == 0) {
      return(NULL)
    }

    # Generating nested sentence start and end positions for actor sentences
    sentence_info <- ud %>%
      filter(sentence_id %in% sentence_ids) %>%
      group_by(sentence_id) %>%
      summarise(
        sentence_start = as.integer(min(start)),
        sentence_end = as.integer(max(end)),
        doc_id = first(doc_id)
      ) %>%
      group_by(doc_id) %>%
      summarise(
        sentence_id = list(as.integer(sentence_id)),
        sentence_start = list(sentence_start),
        sentence_end = list(sentence_end)
      )

    # NOTE(review): the statistics below are based on all sentences with a
    # marker (actor_sentences), not only on those that survived the regex
    # filtering (sentence_ids) - confirm that this is intended.
    n_actor_sentences <- length(unique(actor_sentences))
    first_sentence <- min(actor_sentences)
    data.frame(sentence_info, # Sentence id, start and end position for actor sentences
               actor_start = I(list(actor_start)), # List of actor ud token start positions
               actor_end = I(list(actor_end)), # List of actor ud token end positions
               occ = n_actor_sentences, # Number of sentences in which actor occurs
               prom = n_actor_sentences / sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
               rel_first = 1 - (first_sentence / sentence_count), # Relative position of first occurence at sentence level
               first = first_sentence, # First sentence in which actor is mentioned
               ids = I(list(ids)) # List of actor ids
    )
  }

  out <- mamlr:::out_parser(out, field = 'highlight', clean = FALSE)

  # Converts marker positions in the marked text to positions in the
  # marker-free text: the k-th match is preceded by k-1 markers
  offsetter <- function(x) {
    x - ((row(x) - 1) * nchar(identifier))
  }
  # Escape regex metacharacters so the identifier is matched literally
  regex_identifier <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", identifier)
  out$markers <- lapply(str_locate_all(out$merged, coll(identifier)), offsetter)

  ids <- fromJSON(ids)
  # seq_len() keeps this safe for an empty result set (seq(1, 0, 1) errors)
  updates <- bind_rows(mclapply(
    seq_len(nrow(out)), sentencizer,
    out = out, ids = ids, postfix = postfix, prefix = prefix,
    identifier = identifier, type = type, mc.cores = detectCores()
  ))
  if (nrow(updates) == 0) {
    return(invisible(NULL))
  }
  bulk <- apply(updates, 1, bulk_writer, varname = 'actorsDetail', type = 'add', ver = ver)
  # Only the document id and the actor ids are needed for the 'actors' field
  bulk <- c(bulk, apply(updates[c('doc_id', 'ids')], 1, bulk_writer, varname = 'actors', type = 'add', ver = ver))
  return(elastic_update(bulk, es_super = es_super, localhost = localhost))
}
elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`#' Updater function for elasticizer: Conduct actor searches`
			`#'`
			`#' Updater function for elasticizer: Conduct actor searches`
			`#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)`
			`#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)`
			`#' @param ids List of actor ids`
			`#' @param prefix Regex containing prefixes that should be excluded from hits`
			`#' @param postfix Regex containing postfixes that should be excluded from hits`
			`#' @param identifier String used to mark highlights. Should be a lowercase string`
actorizer, ud_update: implemented 'ver' variable for keeping track of updates 6 years ago			`#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')`
elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`#' @param es_super Password for write access to ElasticSearch`
			`#' @return As this is a nested function used within elasticizer, there is no return output`
			`#' @export`
			`#' @examples`
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`#' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super)`
			`actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, es_super, ver) {`
			`### Function to filter out false positives using regex`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`exceptionizer <- function(id, ud, doc, markers, regex_identifier, prefix, postfix) {`
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence`
			`max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence`
			`split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence`
			`max <- max+(length(split)*nchar(identifier)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)`
			`sentence <- str_sub(doc$merged, min, max) # Extract sentence from text`
			`# Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`if (!str_detect(sentence, paste0(regex_identifier,postfix)) && !str_detect(sentence, paste0(prefix,regex_identifier))) {`
			`return(id)`
			`} else {`
			`return(NULL)`
			`}`
			`}`
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`ranger <- function(x, ud) {`
			`return(which((ud$start <= x) & (ud$end >= x)))`
			`}`
actorizer: fix to columns selected for actors variable, removed udmodel requirement 6 years ago			`sentencizer <- function(row, out, ids, prefix, postfix, identifier, type) {`
elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`doc <- out[row,]`
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`# Extracting ud output from document`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			ud <- doc$`_source.ud`[[1]] %>%
actorizer: now uses the original udpipe output for sentence and token ids. When the actorized and original udpipe output do not have the same number of rows, it prints an error and sets err to TRUE in actorDetails 6 years ago			`select(-one_of('exists')) %>% # Removing ud.exists variable`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`unnest() %>%`
			mutate(doc_id = doc$`_id`)
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`markers <- doc$markers[[1]][,'start'] # Extract list of markers`
			`# Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-\|\|\|EU is treated as a single word))`
actorizer: removed nested mclapply 6 years ago			`rows <- unlist(lapply(markers, ranger, ud = ud))`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`# Setting up an actor variable`
			`ud$actor <- F`
			`ud$actor[rows] <- T`

			`sentence_count <- max(ud$sentence_id) # Number of sentences in article`
			`actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor`
			`actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor`
			`actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor`

			`# Conducting regex filtering on matches only when actor type is Party`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`if (type == "Party") {`
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`### If no pre or postfixes, match not nothing i.e. anything`
			`if (is.na(prefix) \|\| prefix == '') {`
			`prefix = '$^'`
			`}`
			`if (is.na(postfix) \|\| postfix == '') {`
			`postfix = '$^'`
			`}`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`sentence_ids <- lapply(actor_sentences, exceptionizer, ud = ud, doc = doc, markers = markers, regex_identifier = regex_identifier, prefix = prefix, postfix = postfix)`
actorizer: now uses the original udpipe output for sentence and token ids. When the actorized and original udpipe output do not have the same number of rows, it prints an error and sets err to TRUE in actorDetails 6 years ago			`} else {`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`sentence_ids <- actor_sentences`
actorizer: now uses the original udpipe output for sentence and token ids. When the actorized and original udpipe output do not have the same number of rows, it prints an error and sets err to TRUE in actorDetails 6 years ago			`}`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`# Generating nested sentence start and end positions for actor sentences`
elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`ud <- ud %>%`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`filter(sentence_id %in% sentence_ids) %>%`
			`group_by(sentence_id) %>%`
			`summarise (`
			`sentence_start = as.integer(min(start)),`
			`sentence_end = as.integer(max(end)),`
			`doc_id = first(doc_id)`
			`) %>%`
elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`group_by(doc_id) %>%`
			`summarise(`
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`sentence_id = list(as.integer(sentence_id)),`
			`sentence_start = list(sentence_start),`
			`sentence_end = list(sentence_end)`
elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`)`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`return(`
			`data.frame(ud, # Sentence id, start and end position for actor sentences`
			`actor_start = I(list(actor_start)), # List of actor ud token start positions`
			`actor_end = I(list(actor_end)), # List of actor ud token end positions`
			`occ = length(unique(actor_sentences)), # Number of sentences in which actor occurs`
			`prom = length(unique(actor_sentences))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)`
			`rel_first = 1-(min(actor_sentences)/sentence_count), # Relative position of first occurence at sentence level`
			`first = min(actor_sentences), # First sentence in which actor is mentioned`
			`ids = I(list(ids)) # List of actor ids`
			`)`
			`)`
elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`}`
actorizer: fixed sentence_count and out_parser calls out_parser: Added comment with old regex 6 years ago			`out <- mamlr:::out_parser(out, field = 'highlight', clean = F)`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`offsetter <- function(x) {`
			`return(x-((row(x)-1)*nchar(identifier)))`
			`}`
			`regex_identifier <- gsub("([.\|()\\^{}+$*?]\|\\[\|\\])", "\\\\\\1", identifier)`
			`out$markers <- lapply(str_locate_all(out$merged,coll(identifier)), offsetter)`

elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`ids <- fromJSON(ids)`
actorizer: fix to columns selected for actors variable, removed udmodel requirement 6 years ago			`updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, type = type, mc.cores = detectCores()))`
actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists bulk_writer: changed handling of single-row dataframe parsing to JSON elastic_update: changed function to return instead of print appData on error ud_update: Changed nested lists to flat lists, and added start and end character positions 6 years ago			`bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)`
			`bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))`
elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates query_gen_actors: Added an additional generator for the "Institution" type (for EU support) actorizer: Created an updater function to search for actors and use UDPipe to parse the results 6 years ago			`return(elastic_update(bulk, es_super = es_super, localhost = localhost))`
			`}`