actorizer: major fix to ud parsing, changed regex to remove html tags to only include tags with a maximum of 20 characters in them

ud_update: major fix to ud parsing, changed regex to remove html tags to only include tags with a maximum of 20 characters in them elastic_update: set the minimum break between retries from 10 to 30 seconds elasticizer: implementation of retries for elasticizer function, 10 retries with a break of 30 seconds in between
7 years ago · 54dfb6a8ca
parent b042fdb1e3
commit 54dfb6a8ca
4 changed files with 40 additions and 11 deletions
--- a/R/actorizer.R
+++ b/R/actorizer.R
@ -22,7 +22,6 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
  }

  sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier) {
-
    ### If no pre or postfixes, match *not nothing* i.e. anything
    if (is.na(prefix) || prefix == '') {
      prefix = '$^'
@ -32,8 +31,6 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
    }
    ### Also needs fix for empty strings (non-NA)
    doc <- out[row,]
-    print(doc$merged)
-    print(row)
    ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) %>%
      filter(upos != "PUNCT") # Removing punctuation to get accurate word counts
    sentence_count <- length(unique(ud$sentence))
@ -71,12 +68,12 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
                      str_replace_na(unlist(out$highlight.teaser), replacement = " "),
                      str_replace_na(unlist(out$highlight.text), replacement = " "),
                      sep = " ") %>%
-    # Replacing html tags with whitespaces
-    str_replace_all("<.*?>", " ") %>%
+    # Remove html tags, and multiple consequent whitespaces
+    str_replace_all("<.{0,20}?>", " ") %>%
    str_replace_all("\\s+"," ")

  ids <- fromJSON(ids)
-  updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = 1))
+  updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
  bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add')
  bulk <- c(bulk,apply(updates[c(1,8)], 1, bulk_writer, varname='actors', type = 'add'))
  return(elastic_update(bulk, es_super = es_super, localhost = localhost))
--- a/R/elastic_update.R
+++ b/R/elastic_update.R
@ -25,7 +25,7 @@ elastic_update <- function(x, es_super = 'secret', localhost = T) {
               , encode = "raw"
               , add_headers("Content-Type" = "application/json")
               , times = 10
-               , pause_min = 10
+               , pause_min = 30
  )
  httr:::stop_for_status(res)
  appData <- httr:::content(res)
--- a/R/elasticizer.R
+++ b/R/elasticizer.R
@ -18,6 +18,8 @@
 #################################### Get data from ElasticSearch ################################
 #################################################################################################
 elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, update = NULL, localhost = F, ...){
+  retries <- 10 ### Number of retries on error
+  sleep <- 30 ### Number of seconds between retries
  httr::set_config(httr::config(http_version = 0))
  if (localhost == F) {
    connect(es_port = 443,
@ -39,10 +41,30 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
  }
  # Get all results - one approach is to use a while loop
  if (src == T) {
-    res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T)
+    res <- NULL
+    attempt <- 0
+    while( is.null(res) && attempt <= retries ) {
+      if (attempt > 0) {
+        Sys.sleep(sleep)
+      }
+      attempt <- attempt + 1
+      try(
+        res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T)
+      )
+    }
  }
  if (src == F) {
-    res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F)
+    res <- NULL
+    attempt <- 0
+    while( is.null(res) && attempt <= retries ) {
+      if (attempt > 0) {
+        Sys.sleep(sleep)
+      }
+      attempt <- attempt + 1
+      try(
+        res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F)
+      )
+    }
  }
  json <- fromJSON(res)
  if (json$hits$total == 0) {
@ -57,7 +79,17 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
      update(out, localhost = localhost, ...)
    }
    while(hits != 0){
-      res <- scroll(json$`_scroll_id`, time_scroll="20m", raw=T)
+      res <- NULL
+      attempt <- 0
+      while( is.null(res) && attempt <= retries ) {
+        if (attempt > 0) {
+          Sys.sleep(sleep)
+        }
+        attempt <- attempt + 1
+        try(
+          res <- scroll(json$`_scroll_id`, time_scroll="20m", raw=T)
+        )
+      }
      json <- fromJSON(res)
      hits <- length(json$hits$hits)
      if(hits > 0) {
--- a/R/ud_update.R
+++ b/R/ud_update.R
@ -18,7 +18,7 @@ ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword
                      str_replace_na(out$`_source.text`, replacement = " "),
                      sep = " ") %>%
    # Remove html tags, and multiple consequent whitespaces
-    str_replace_all("<.*?>", " ") %>%
+    str_replace_all("<.{0,20}?>", " ") %>%
    str_replace_all("\\s+"," ")
  par_proc <- function(row, out, udmodel) {
    doc <- out[row,]