actorizer: totally revamped way of finding actors

elasticizer: updated dump handling to create a dump for every batch, instead of one big file at the end out_parser: streamlined code query_gen_actors: only include relevant fields ud_update: changed function parameters to work with elasticizer dump function
5 years ago · 77eb51a1bf
parent 0e593075ee
commit 77eb51a1bf
7 changed files with 144 additions and 145 deletions
--- a/R/actorizer.R
+++ b/R/actorizer.R
@ -9,58 +9,64 @@
 #' @param identifier String used to mark highlights. Should be a lowercase string
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
 #' @param es_super Password for write access to ElasticSearch
 #' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
 #' @return As this is a nested function used within elasticizer, there is no return output
 #' @export
 #' @examples
 #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
-actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) {
+actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
  plan(multiprocess, workers = cores)
  ### Function to filter out false positives using regex
  exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
    min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
    max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
    split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
    min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
    max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
    sentence <- paste0(' ',str_sub(doc$merged, min, max),' ') # Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "("))
    # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
    if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
      return(id)
    } else {
      return(NULL)
    }
  }
  ranger <- function(x, ud) {
    return(which((ud$start <= x) & (ud$end >= x)))
  }
  sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
    doc <- out[row,]
-    if (nchar(doc$merged) > 990000) {
+    if (sum(nchar(doc$merged) > 990000)) {
-      return(
+      stop("One or more documents in this batch exceed 990000 characters")
        data.frame(
          err = T,
          errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
        )
      )
    }
    # Extracting ud output from document
-    ud <- doc$`_source.ud`[[1]] %>%
+    ud <- doc %>%
-      select(-one_of('exists')) %>% # Removing ud.exists variable
+      select(`_id`,`_source.ud`, merged) %>%
-      unnest() %>%
+      unnest(cols = c("_source.ud")) %>%
-      mutate(doc_id = doc$`_id`)
+      select(`_id`,lemma,start,end, sentence_id,merged) %>%
-    markers <- doc$markers[[1]][,'start'] # Extract list of markers
+      unnest(cols = colnames(.))
    # Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
    rows <- unlist(lapply(markers, ranger, ud = ud))
    # Setting up an actor variable
    ud$actor <- F
    ud$actor[rows] <- T
-    sentence_count <- max(ud$sentence_id) # Number of sentences in article
+    sentences <- ud %>%
-    actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
+      group_by(`_id`, sentence_id) %>%
      summarise(
        sentence_start = min(start),
        sentence_end = max(end)
      ) %>%
      mutate(
        sentence_count = n()
      )
    hits <- left_join(ud, markers, by='_id') %>%
      mutate(
        actor = case_when(
          start <= marker_start  & end >= marker_start ~ T,
          T ~ F
        )
      ) %>%
      select(`_id`, sentence_id, start, end,actor,merged) %>%
      filter(actor) %>%
      group_by(`_id`,sentence_id) %>%
      summarise(
        actor = any(actor),
        actor_start = I(list(start)),
        actor_end = I(list(end)),
        n_markers = length(start),
        merged = first(merged)
      ) %>%
      left_join(.,sentences, by=c('_id','sentence_id')) %>%
      ungroup %>%
      arrange(`_id`,sentence_id) %>%
      group_by(`_id`) %>%
      mutate(n_markers = cumsum(n_markers)) %>%
      mutate(
        sentence_start_tags = sentence_start+((nchar(pre_tags)+nchar(post_tags))*(lag(n_markers, default = 0))),
        sentence_end_tags = sentence_end+((nchar(pre_tags)+nchar(post_tags))*(n_markers))
      ) %>%
      mutate(
        sentence = paste0(' ',str_sub(merged, sentence_start_tags, sentence_end_tags),' ')
      ) %>%
      select(-merged) %>%
      ungroup()
    # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
    if (!is.na(prefix) || !is.na(postfix)) {
      ### If no pre or postfixes, match *not nothing* i.e. anything
@ -70,68 +76,50 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
      if (is.na(postfix)) {
        postfix = '$^'
      }
-      sentence_ids <- unlist(lapply(actor_sentences,
+      hits <- hits %>%
-                             exceptionizer,
+      filter(
-                             ud = ud,
+        !str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) & !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))
                             doc = doc,
                             markers = markers,
                             pre_tags_regex = pre_tags_regex,
                             pre_tags = pre_tags,
                             post_tags_regex = post_tags_regex,
                             post_tags = post_tags,
                             prefix = prefix,
                             postfix = postfix))
    } else {
      sentence_ids <- actor_sentences
    }
    if (length(sentence_ids > 0)) {
      # Generating nested sentence start and end positions for actor sentences
      ud <- ud %>%
        filter(sentence_id %in% sentence_ids)
      actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
      actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
      ud <- ud %>%
        group_by(sentence_id) %>%
        summarise (
          sentence_start = as.integer(min(start)),
          sentence_end = as.integer(max(end)),
          doc_id = first(doc_id)
        ) %>%
        group_by(doc_id) %>%
        summarise(
          sentence_id = list(as.integer(sentence_id)),
          sentence_start = list(sentence_start),
          sentence_end = list(sentence_end)
        )
      return(
        data.frame(ud, # Sentence id, start and end position for actor sentences
                   actor_start = I(list(actor_start)), # List of actor ud token start positions
                   actor_end = I(list(actor_end)), # List of actor ud token end positions
                   occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
                   prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
                   rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level
                   first = min(sentence_ids), # First sentence in which actor is mentioned
                   ids = I(list(ids)) # List of actor ids
        )
      )
    } else {
      return(NULL)
    }
    hits <- hits %>%
      group_by(`_id`) %>%
      summarise(
        sentence_id = list(as.integer(sentence_id)),
        sentence_start = list(sentence_start),
        sentence_end = list(sentence_end),
        actor_start = I(list(unlist(actor_start))), # List of actor ud token start positions
        actor_end = I(list(unlist(actor_end))), # List of actor ud token end positions
        occ = length(unique(unlist(sentence_id))), # Number of sentences in which actor occurs
        first = min(unlist(sentence_id)), # First sentence in which actor is mentioned
        ids = I(list(ids)),
        sentence_count = first(sentence_count)# List of actor ids
      ) %>%
      mutate(
        prom = occ/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
        rel_first = 1-(first/sentence_count), # Relative position of first occurence at sentence level
      ) %>%
      select(`_id`:occ, prom,rel_first,first,ids)
    return(hits)
  }
-  out <- mamlr:::out_parser(out, field = 'highlight', clean = F, cores = cores)
+  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
  offsetter <- function(x, pre_tags, post_tags) {
-    return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
+    return(as.list(as.data.frame(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))))
  }
  prefix[prefix==''] <- NA
  postfix[postfix==''] <- NA
  pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
  post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
  out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)
  markers <- out %>%
    select(`_id`,markers) %>%
    unnest_wider(markers) %>%
    rename(marker_start = start, marker_end = end) %>%
    unnest(colnames(.))
  # ids <- fromJSON(ids)
-  updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer,
+  updates <- sentencizer(1:1024,
                                out = out,
                                ids = ids,
                                postfix = postfix,
@ -139,7 +127,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
                                pre_tags_regex = pre_tags_regex,
                                pre_tags = pre_tags,
                                post_tags_regex = post_tags_regex,
-                                post_tags = post_tags))
+                                post_tags = post_tags)
  if (nrow(updates) == 0) {
    print("Nothing to update for this batch")
    return(NULL)
--- a/R/elasticizer.R
+++ b/R/elasticizer.R
@ -108,6 +108,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
    hits <- length(json$hits$hits)
    batch <- 1
    print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
    if (dump) {
      saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
    }
    if (length(update) > 0){
      update(out, localhost = localhost, ...)
    }
@ -134,6 +137,8 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
        if (length(update) > 0){
          out <-  jsonlite:::flatten(json$hits$hits)
          update(out, localhost = localhost, ...)
        } else if (dump) {
          saveRDS(jsonlite:::flatten(json$hits$hits), file = paste0('batch_',batch*batch_size,'.Rds'))
        } else {
          out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
        }
@ -143,7 +148,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
      scroll_clear(conn = conn, x = json$`_scroll_id`)
      return("Done updating")
    } else if (dump) {
-      saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
+      return("Dumping complete")
    } else {
      scroll_clear(conn = conn, x = json$`_scroll_id`)
      return(out)
--- a/R/out_parser.R
+++ b/R/out_parser.R
@ -24,43 +24,50 @@ out_parser <- function(out, field, clean = F) {
  par_parser <- function(row, out, field, clean) {
    doc <- out[row,]
    if (field == 'highlight') {
      doc <- replace(doc, doc=="NULL", NA)
      ### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
      doc$highlight.title[is.na(doc$highlight.title)] <- doc$`_source.title`[is.na(doc$highlight.title)]
      doc$highlight.text[is.na(doc$highlight.text)] <- doc$`_source.text`[is.na(doc$highlight.text)]
      doc$highlight.teaser[is.na(doc$highlight.teaser)] <- doc$`_source.teaser`[is.na(doc$highlight.teaser)]
      doc$highlight.subtitle[is.na(doc$highlight.subtitle)] <- doc$`_source.subtitle`[is.na(doc$highlight.subtitle)]
      doc$highlight.preteaser[is.na(doc$highlight.preteaser)] <- doc$`_source.preteaser`[is.na(doc$highlight.preteaser)]
      doc <- doc %>%
-        mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
+        unnest(cols = starts_with("highlight")) %>%
-        mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
+      mutate(across(starts_with("highlight"), na_if, "NULL")) %>%
-        mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
+        mutate(highlight.title = coalesce(highlight.title, `_source.title`),
-        mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
+               highlight.subtitle = coalesce(highlight.subtitle, `_source.subtitle`),
-        mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
+               highlight.preteaser = coalesce(highlight.preteaser, `_source.preteaser`),
-      doc$merged <- str_c(doc$highlight.title,
+               highlight.teaser = coalesce(highlight.teaser, `_source.teaser`),
-                          doc$highlight.subtitle,
+               highlight.text = coalesce(highlight.text, `_source.text`)
-                          doc$highlight.preteaser,
+        ) %>%
-                          doc$highlight.teaser,
+        mutate(highlight.title = str_replace_na(highlight.title, replacement = ''),
-                          doc$highlight.text,
+               highlight.subtitle = str_replace_na(highlight.subtitle, replacement = ''),
-                          '',
+               highlight.preteaser = str_replace_na(highlight.preteaser, replacement = ''),
-                          sep = ". ")
+               highlight.teaser = str_replace_na(highlight.teaser, replacement = ''),
               highlight.text = str_replace_na(highlight.text, replacement = '')
        ) %>%
        mutate(
          merged = str_c(highlight.title,
                         highlight.subtitle,
                         highlight.preteaser,
                         highlight.teaser,
                         highlight.text,
                         '',
                         sep = ". ")
        )
    }
    if (field == '_source') {
      doc <- doc %>%
-        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
+        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = ''),
-        mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
+               `_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = ''),
-        mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
+               `_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = ''),
-        mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
+               `_source.teaser` = str_replace_na(`_source.teaser`, replacement = ''),
-        mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
+               `_source.text` = str_replace_na(`_source.text`, replacement = '')
-      doc$merged <- str_c(doc$`_source.title`,
+               ) %>%
-                          doc$`_source.subtitle`,
+      mutate(
-                          doc$`_source.preteaser`,
+        merged = str_c(`_source.title`,
-                          doc$`_source.teaser`,
+                       `_source.subtitle`,
-                          doc$`_source.text`,
+                       `_source.preteaser`,
-                          '',
+                       `_source.teaser`,
-                          sep = ". ")
+                       `_source.text`,
                       '',
                       sep = ". ")
      )
    }
    ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
--- a/R/query_gen_actors.R
+++ b/R/query_gen_actors.R
@ -15,7 +15,7 @@
 #################################################################################################
 query_gen_actors <- function(actor, country, pre_tags, post_tags) {
  generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
-    return(paste0('{"_source": ["ud"],
+    return(paste0('{"_source": ["ud","title","subtitle","preteaser","teaser","text"],
    "query":
                    {"bool": {
                      "filter":[
--- a/R/ud_update.R
+++ b/R/ud_update.R
@ -1,10 +1,10 @@
-#' Elasticizer update function: generate UDpipe output from base text
+#' Generate UDpipe output from base text
 #'
-#' Elasticizer update function: generate UDpipe output from base text
+#' Generate UDpipe output from base text
-#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
+#' @param file Filename of file to read in, also used for generating output file name
-#' @param udmodel UDpipe model to use
+#' @param wd Working directory where *file*s can be found
 #' @param ud_file Filename of udpipe model to use, should be in *wd*
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
 #' @param file Filename for output (ud_ is automatically prepended)
 #' @return A vector of 1's indicating the success of each update call
 #' @export
 #' @examples
@ -17,9 +17,11 @@
 #   }
 # }
-ud_update <- function(out, udmodel, ver, file) {
+ud_update <- function(file, wd, ud_file, ver) {
-  out <- mamlr:::out_parser(out, field = '_source', clean = F)
+  out <- readRDS(str_c(wd,'/',file)) %>%
-  ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
+    out_parser(., field = '_source', clean = F)
  ud_model <- udpipe_load_model(file = str_c(wd,'/',ud_file))
  ud <- as.data.frame(udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
    group_by(doc_id) %>%
    summarise(
      sentence_id = list(as.integer(sentence_id)),
@ -34,7 +36,7 @@ ud_update <- function(out, udmodel, ver, file) {
      exists = list(TRUE)
   )
  bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
-  saveRDS(bulk, file = paste0('ud_',file))
+  saveRDS(bulk, file = str_c(wd,'/ud_',file))
  # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
  return()
 }
--- a/man/actorizer.Rd
+++ b/man/actorizer.Rd
@ -13,8 +13,7 @@ actorizer(
  pre_tags,
  post_tags,
  es_super,
-  ver,
+  ver
  cores = 1
 )
 }
 \arguments{
@ -32,8 +31,6 @@ actorizer(
 \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
 \item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)}
 \item{identifier}{String used to mark highlights. Should be a lowercase string}
 }
 \value{
--- a/man/ud_update.Rd
+++ b/man/ud_update.Rd
@ -2,24 +2,24 @@
 % Please edit documentation in R/ud_update.R
 \name{ud_update}
 \alias{ud_update}
-\title{Elasticizer update function: generate UDpipe output from base text}
+\title{Generate UDpipe output from base text}
 \usage{
-ud_update(out, udmodel, ver)
+ud_update(file, wd, ud_file, ver)
 }
 \arguments{
-\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
+\item{file}{Filename of file to read in, also used for generating output file name}
-\item{udmodel}{UDpipe model to use}
+\item{wd}{Working directory where *file*s can be found}
-\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
+\item{ud_file}{Filename of udpipe model to use, should be in *wd*}
-\item{file}{Filename for output (ud_ is automatically prepended)}
+\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
 }
 \value{
 A vector of 1's indicating the success of each update call
 }
 \description{
-Elasticizer update function: generate UDpipe output from base text
+Generate UDpipe output from base text
 }
 \examples{
 ud_update(out, udmodel, ver, file)