From 77eb51a1bf793cf6791490beecedb2771f5eee26 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Fri, 19 Jun 2020 11:34:18 +0200
Subject: [PATCH] actorizer: totally revamped way of finding actors
 elasticizer: updated dump handling to create a dump for every batch, instead
 of one big file at the end out_parser: streamlined code query_gen_actors:
 only include relevant fields ud_update: changed function parameters to work
 with elasticizer dump function

---
 R/actorizer.R        | 172 ++++++++++++++++++++-----------------------
 R/elasticizer.R      |   7 +-
 R/out_parser.R       |  69 +++++++++--------
 R/query_gen_actors.R |   2 +-
 R/ud_update.R        |  20 ++---
 man/actorizer.Rd     |   5 +-
 man/ud_update.Rd     |  14 ++--
 7 files changed, 144 insertions(+), 145 deletions(-)

diff --git a/R/actorizer.R b/R/actorizer.R
index 89216b5..fdc90f2 100644
--- a/R/actorizer.R
+++ b/R/actorizer.R
@@ -9,58 +9,64 @@
 #' @param identifier String used to mark highlights. Should be a lowercase string
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
 #' @param es_super Password for write access to ElasticSearch
-#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
 #' @return As this is a nested function used within elasticizer, there is no return output
 #' @export
 #' @examples
 #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
-actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) {
-  plan(multiprocess, workers = cores)
-  ### Function to filter out false positives using regex
-  exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
-    min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
-    max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
-    split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
-    min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
-    max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
-    sentence <- paste0(' ',str_sub(doc$merged, min, max),' ') # Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "("))
-
-    # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
-    if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
-      return(id)
-    } else {
-      return(NULL)
-    }
-  }
-  ranger <- function(x, ud) {
-    return(which((ud$start <= x) & (ud$end >= x)))
-  }
+actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
   sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
     doc <- out[row,]
-    if (nchar(doc$merged) > 990000) {
-      return(
-        data.frame(
-          err = T,
-          errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
-        )
-      )
+    if (sum(nchar(doc$merged) > 990000)) {
+      stop("One or more documents in this batch exceed 990000 characters")
     }
     # Extracting ud output from document
-    ud <- doc$`_source.ud`[[1]] %>%
-      select(-one_of('exists')) %>% # Removing ud.exists variable
-      unnest() %>%
-      mutate(doc_id = doc$`_id`)
-    markers <- doc$markers[[1]][,'start'] # Extract list of markers
-    # Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
-    rows <- unlist(lapply(markers, ranger, ud = ud))
-
-    # Setting up an actor variable
-    ud$actor <- F
-    ud$actor[rows] <- T
+    ud <- doc %>%
+      select(`_id`,`_source.ud`, merged) %>%
+      unnest(cols = c("_source.ud")) %>%
+      select(`_id`,lemma,start,end, sentence_id,merged) %>%
+      unnest(cols = colnames(.))
 
-    sentence_count <- max(ud$sentence_id) # Number of sentences in article
-    actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
+    sentences <- ud %>%
+      group_by(`_id`, sentence_id) %>%
+      summarise(
+        sentence_start = min(start),
+        sentence_end = max(end)
+      ) %>%
+      mutate(
+        sentence_count = n()
+      )
 
+    hits <- left_join(ud, markers, by='_id') %>%
+      mutate(
+        actor = case_when(
+          start <= marker_start  & end >= marker_start ~ T,
+          T ~ F
+        )
+      ) %>%
+      select(`_id`, sentence_id, start, end,actor,merged) %>%
+      filter(actor) %>%
+      group_by(`_id`,sentence_id) %>%
+      summarise(
+        actor = any(actor),
+        actor_start = I(list(start)),
+        actor_end = I(list(end)),
+        n_markers = length(start),
+        merged = first(merged)
+      ) %>%
+      left_join(.,sentences, by=c('_id','sentence_id')) %>%
+      ungroup %>%
+      arrange(`_id`,sentence_id) %>%
+      group_by(`_id`) %>%
+      mutate(n_markers = cumsum(n_markers)) %>%
+      mutate(
+        sentence_start_tags = sentence_start+((nchar(pre_tags)+nchar(post_tags))*(lag(n_markers, default = 0))),
+        sentence_end_tags = sentence_end+((nchar(pre_tags)+nchar(post_tags))*(n_markers))
+      ) %>%
+      mutate(
+        sentence = paste0(' ',str_sub(merged, sentence_start_tags, sentence_end_tags),' ')
+      ) %>%
+      select(-merged) %>%
+      ungroup()
     # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
     if (!is.na(prefix) || !is.na(postfix)) {
       ### If no pre or postfixes, match *not nothing* i.e. anything
@@ -70,68 +76,50 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
       if (is.na(postfix)) {
         postfix = '$^'
       }
-      sentence_ids <- unlist(lapply(actor_sentences,
-                             exceptionizer,
-                             ud = ud,
-                             doc = doc,
-                             markers = markers,
-                             pre_tags_regex = pre_tags_regex,
-                             pre_tags = pre_tags,
-                             post_tags_regex = post_tags_regex,
-                             post_tags = post_tags,
-                             prefix = prefix,
-                             postfix = postfix))
-    } else {
-      sentence_ids <- actor_sentences
-    }
-    if (length(sentence_ids > 0)) {
-      # Generating nested sentence start and end positions for actor sentences
-      ud <- ud %>%
-        filter(sentence_id %in% sentence_ids)
-      actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
-      actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
-      ud <- ud %>%
-        group_by(sentence_id) %>%
-        summarise (
-          sentence_start = as.integer(min(start)),
-          sentence_end = as.integer(max(end)),
-          doc_id = first(doc_id)
-        ) %>%
-        group_by(doc_id) %>%
-        summarise(
-          sentence_id = list(as.integer(sentence_id)),
-          sentence_start = list(sentence_start),
-          sentence_end = list(sentence_end)
-        )
-
-      return(
-        data.frame(ud, # Sentence id, start and end position for actor sentences
-                   actor_start = I(list(actor_start)), # List of actor ud token start positions
-                   actor_end = I(list(actor_end)), # List of actor ud token end positions
-                   occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
-                   prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
-                   rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level
-                   first = min(sentence_ids), # First sentence in which actor is mentioned
-                   ids = I(list(ids)) # List of actor ids
-        )
+      hits <- hits %>%
+      filter(
+        !str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) & !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))
       )
-    } else {
-      return(NULL)
     }
 
+    hits <- hits %>%
+      group_by(`_id`) %>%
+      summarise(
+        sentence_id = list(as.integer(sentence_id)),
+        sentence_start = list(sentence_start),
+        sentence_end = list(sentence_end),
+        actor_start = I(list(unlist(actor_start))), # List of actor ud token start positions
+        actor_end = I(list(unlist(actor_end))), # List of actor ud token end positions
+        occ = length(unique(unlist(sentence_id))), # Number of sentences in which actor occurs
+        first = min(unlist(sentence_id)), # First sentence in which actor is mentioned
+        ids = I(list(ids)),
+        sentence_count = first(sentence_count)# List of actor ids
+      ) %>%
+      mutate(
+        prom = occ/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
+        rel_first = 1-(first/sentence_count), # Relative position of first occurence at sentence level
+      ) %>%
+      select(`_id`:occ, prom,rel_first,first,ids)
+    return(hits)
+
   }
-  out <- mamlr:::out_parser(out, field = 'highlight', clean = F, cores = cores)
+  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
   offsetter <- function(x, pre_tags, post_tags) {
-    return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
+    return(as.list(as.data.frame(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))))
   }
   prefix[prefix==''] <- NA
   postfix[postfix==''] <- NA
   pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
   post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
   out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)
+  markers <- out %>%
+    select(`_id`,markers) %>%
+    unnest_wider(markers) %>%
+    rename(marker_start = start, marker_end = end) %>%
+    unnest(colnames(.))
 
   # ids <- fromJSON(ids)
-  updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer,
+  updates <- sentencizer(1:1024,
                                 out = out,
                                 ids = ids,
                                 postfix = postfix,
@@ -139,7 +127,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
                                 pre_tags_regex = pre_tags_regex,
                                 pre_tags = pre_tags,
                                 post_tags_regex = post_tags_regex,
-                                post_tags = post_tags))
+                                post_tags = post_tags)
   if (nrow(updates) == 0) {
     print("Nothing to update for this batch")
     return(NULL)
diff --git a/R/elasticizer.R b/R/elasticizer.R
index 10f5e9f..0f63381 100644
--- a/R/elasticizer.R
+++ b/R/elasticizer.R
@@ -108,6 +108,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
     hits <- length(json$hits$hits)
     batch <- 1
     print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
+    if (dump) {
+      saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
+    }
     if (length(update) > 0){
       update(out, localhost = localhost, ...)
     }
@@ -134,6 +137,8 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
         if (length(update) > 0){
           out <-  jsonlite:::flatten(json$hits$hits)
           update(out, localhost = localhost, ...)
+        } else if (dump) {
+          saveRDS(jsonlite:::flatten(json$hits$hits), file = paste0('batch_',batch*batch_size,'.Rds'))
         } else {
           out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
         }
@@ -143,7 +148,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
       scroll_clear(conn = conn, x = json$`_scroll_id`)
       return("Done updating")
     } else if (dump) {
-      saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
+      return("Dumping complete")
     } else {
       scroll_clear(conn = conn, x = json$`_scroll_id`)
       return(out)
diff --git a/R/out_parser.R b/R/out_parser.R
index ebc6066..2683201 100644
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -24,43 +24,50 @@ out_parser <- function(out, field, clean = F) {
   par_parser <- function(row, out, field, clean) {
     doc <- out[row,]
     if (field == 'highlight') {
-      doc <- replace(doc, doc=="NULL", NA)
-      ### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
-      doc$highlight.title[is.na(doc$highlight.title)] <- doc$`_source.title`[is.na(doc$highlight.title)]
-      doc$highlight.text[is.na(doc$highlight.text)] <- doc$`_source.text`[is.na(doc$highlight.text)]
-      doc$highlight.teaser[is.na(doc$highlight.teaser)] <- doc$`_source.teaser`[is.na(doc$highlight.teaser)]
-      doc$highlight.subtitle[is.na(doc$highlight.subtitle)] <- doc$`_source.subtitle`[is.na(doc$highlight.subtitle)]
-      doc$highlight.preteaser[is.na(doc$highlight.preteaser)] <- doc$`_source.preteaser`[is.na(doc$highlight.preteaser)]
 
       doc <- doc %>%
-        mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
-        mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
-        mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
-        mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
-        mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
-      doc$merged <- str_c(doc$highlight.title,
-                          doc$highlight.subtitle,
-                          doc$highlight.preteaser,
-                          doc$highlight.teaser,
-                          doc$highlight.text,
-                          '',
-                          sep = ". ")
+        unnest(cols = starts_with("highlight")) %>%
+      mutate(across(starts_with("highlight"), na_if, "NULL")) %>%
+        mutate(highlight.title = coalesce(highlight.title, `_source.title`),
+               highlight.subtitle = coalesce(highlight.subtitle, `_source.subtitle`),
+               highlight.preteaser = coalesce(highlight.preteaser, `_source.preteaser`),
+               highlight.teaser = coalesce(highlight.teaser, `_source.teaser`),
+               highlight.text = coalesce(highlight.text, `_source.text`)
+        ) %>%
+        mutate(highlight.title = str_replace_na(highlight.title, replacement = ''),
+               highlight.subtitle = str_replace_na(highlight.subtitle, replacement = ''),
+               highlight.preteaser = str_replace_na(highlight.preteaser, replacement = ''),
+               highlight.teaser = str_replace_na(highlight.teaser, replacement = ''),
+               highlight.text = str_replace_na(highlight.text, replacement = '')
+        ) %>%
+        mutate(
+          merged = str_c(highlight.title,
+                         highlight.subtitle,
+                         highlight.preteaser,
+                         highlight.teaser,
+                         highlight.text,
+                         '',
+                         sep = ". ")
+        )
     }
 
     if (field == '_source') {
       doc <- doc %>%
-        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
-        mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
-        mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
-        mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
-        mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
-      doc$merged <- str_c(doc$`_source.title`,
-                          doc$`_source.subtitle`,
-                          doc$`_source.preteaser`,
-                          doc$`_source.teaser`,
-                          doc$`_source.text`,
-                          '',
-                          sep = ". ")
+        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = ''),
+               `_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = ''),
+               `_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = ''),
+               `_source.teaser` = str_replace_na(`_source.teaser`, replacement = ''),
+               `_source.text` = str_replace_na(`_source.text`, replacement = '')
+               ) %>%
+      mutate(
+        merged = str_c(`_source.title`,
+                       `_source.subtitle`,
+                       `_source.preteaser`,
+                       `_source.teaser`,
+                       `_source.text`,
+                       '',
+                       sep = ". ")
+      )
     }
 
     ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
diff --git a/R/query_gen_actors.R b/R/query_gen_actors.R
index ece6119..1626775 100644
--- a/R/query_gen_actors.R
+++ b/R/query_gen_actors.R
@@ -15,7 +15,7 @@
 #################################################################################################
 query_gen_actors <- function(actor, country, pre_tags, post_tags) {
   generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
-    return(paste0('{"_source": ["ud"],
+    return(paste0('{"_source": ["ud","title","subtitle","preteaser","teaser","text"],
     "query":
                     {"bool": {
                       "filter":[
diff --git a/R/ud_update.R b/R/ud_update.R
index cac40bd..88b7f32 100644
--- a/R/ud_update.R
+++ b/R/ud_update.R
@@ -1,10 +1,10 @@
-#' Elasticizer update function: generate UDpipe output from base text
+#' Generate UDpipe output from base text
 #'
-#' Elasticizer update function: generate UDpipe output from base text
-#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
-#' @param udmodel UDpipe model to use
+#' Generate UDpipe output from base text
+#' @param file Filename of file to read in, also used for generating output file name
+#' @param wd Working directory where *file*s can be found
+#' @param ud_file Filename of udpipe model to use, should be in *wd*
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
-#' @param file Filename for output (ud_ is automatically prepended)
 #' @return A vector of 1's indicating the success of each update call
 #' @export
 #' @examples
@@ -17,9 +17,11 @@
 #   }
 # }
 
-ud_update <- function(out, udmodel, ver, file) {
-  out <- mamlr:::out_parser(out, field = '_source', clean = F)
-  ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
+ud_update <- function(file, wd, ud_file, ver) {
+  out <- readRDS(str_c(wd,'/',file)) %>%
+    out_parser(., field = '_source', clean = F)
+  ud_model <- udpipe_load_model(file = str_c(wd,'/',ud_file))
+  ud <- as.data.frame(udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
     group_by(doc_id) %>%
     summarise(
       sentence_id = list(as.integer(sentence_id)),
@@ -34,7 +36,7 @@ ud_update <- function(out, udmodel, ver, file) {
       exists = list(TRUE)
    )
   bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
-  saveRDS(bulk, file = paste0('ud_',file))
+  saveRDS(bulk, file = str_c(wd,'/ud_',file))
   # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
   return()
 }
diff --git a/man/actorizer.Rd b/man/actorizer.Rd
index 2845f88..f578db5 100644
--- a/man/actorizer.Rd
+++ b/man/actorizer.Rd
@@ -13,8 +13,7 @@ actorizer(
   pre_tags,
   post_tags,
   es_super,
-  ver,
-  cores = 1
+  ver
 )
 }
 \arguments{
@@ -32,8 +31,6 @@ actorizer(
 
 \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
 
-\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)}
-
 \item{identifier}{String used to mark highlights. Should be a lowercase string}
 }
 \value{
diff --git a/man/ud_update.Rd b/man/ud_update.Rd
index 91bf9e4..6d0c475 100644
--- a/man/ud_update.Rd
+++ b/man/ud_update.Rd
@@ -2,24 +2,24 @@
 % Please edit documentation in R/ud_update.R
 \name{ud_update}
 \alias{ud_update}
-\title{Elasticizer update function: generate UDpipe output from base text}
+\title{Generate UDpipe output from base text}
 \usage{
-ud_update(out, udmodel, ver)
+ud_update(file, wd, ud_file, ver)
 }
 \arguments{
-\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
+\item{file}{Filename of file to read in, also used for generating output file name}
 
-\item{udmodel}{UDpipe model to use}
+\item{wd}{Working directory where *file*s can be found}
 
-\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
+\item{ud_file}{Filename of udpipe model to use, should be in *wd*}
 
-\item{file}{Filename for output (ud_ is automatically prepended)}
+\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
 }
 \value{
 A vector of 1's indicating the success of each update call
 }
 \description{
-Elasticizer update function: generate UDpipe output from base text
+Generate UDpipe output from base text
 }
 \examples{
 ud_update(out, udmodel, ver, file)