diff --git a/R/actorizer.R b/R/actorizer.R index de98a2b..e53baa1 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -22,7 +22,6 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier } sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier) { - ### If no pre or postfixes, match *not nothing* i.e. anything if (is.na(prefix) || prefix == '') { prefix = '$^' @@ -32,8 +31,6 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier } ### Also needs fix for empty strings (non-NA) doc <- out[row,] - print(doc$merged) - print(row) ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) %>% filter(upos != "PUNCT") # Removing punctuation to get accurate word counts sentence_count <- length(unique(ud$sentence)) @@ -71,12 +68,12 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier str_replace_na(unlist(out$highlight.teaser), replacement = " "), str_replace_na(unlist(out$highlight.text), replacement = " "), sep = " ") %>% - # Replacing html tags with whitespaces - str_replace_all("<.*?>", " ") %>% + # Remove html tags, and multiple consequent whitespaces + str_replace_all("<.{0,20}?>", " ") %>% str_replace_all("\\s+"," ") ids <- fromJSON(ids) - updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = 1)) + updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores())) bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add') bulk <- c(bulk,apply(updates[c(1,8)], 1, bulk_writer, varname='actors', type = 'add')) return(elastic_update(bulk, es_super = es_super, localhost = localhost)) diff --git a/R/elastic_update.R b/R/elastic_update.R index 5d1a374..fd455c9 100644 --- a/R/elastic_update.R +++ b/R/elastic_update.R @@ -25,7 +25,7 @@ elastic_update <- function(x, es_super = 'secret', localhost = T) { , encode = "raw" , add_headers("Content-Type" = "application/json") , times = 10 - , pause_min = 10 + , pause_min = 30 ) httr:::stop_for_status(res) appData <- httr:::content(res) diff --git a/R/elasticizer.R b/R/elasticizer.R index 3633aba..86ce8c0 100644 --- a/R/elasticizer.R +++ b/R/elasticizer.R @@ -18,6 +18,8 @@ #################################### Get data from ElasticSearch ################################ ################################################################################################# elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, update = NULL, localhost = F, ...){ + retries <- 10 ### Number of retries on error + sleep <- 30 ### Number of seconds between retries httr::set_config(httr::config(http_version = 0)) if (localhost == F) { connect(es_port = 443, @@ -39,10 +41,30 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw } # Get all results - one approach is to use a while loop if (src == T) { - res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T) + res <- NULL + attempt <- 0 + while( is.null(res) && attempt <= retries ) { + if (attempt > 0) { + Sys.sleep(sleep) + } + attempt <- attempt + 1 + try( + res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T) + ) + } } if (src == F) { - res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F) + res <- NULL + attempt <- 0 + while( is.null(res) && attempt <= retries ) { + if (attempt > 0) { + Sys.sleep(sleep) + } + attempt <- attempt + 1 + try( + res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F) + ) + } } json <- fromJSON(res) if (json$hits$total == 0) { @@ -57,7 +79,17 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw update(out, localhost = localhost, ...) } while(hits != 0){ - res <- scroll(json$`_scroll_id`, time_scroll="20m", raw=T) + res <- NULL + attempt <- 0 + while( is.null(res) && attempt <= retries ) { + if (attempt > 0) { + Sys.sleep(sleep) + } + attempt <- attempt + 1 + try( + res <- scroll(json$`_scroll_id`, time_scroll="20m", raw=T) + ) + } json <- fromJSON(res) hits <- length(json$hits$hits) if(hits > 0) { diff --git a/R/ud_update.R b/R/ud_update.R index 97ed87b..64f1b57 100644 --- a/R/ud_update.R +++ b/R/ud_update.R @@ -18,7 +18,7 @@ ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword str_replace_na(out$`_source.text`, replacement = " "), sep = " ") %>% # Remove html tags, and multiple consequent whitespaces - str_replace_all("<.*?>", " ") %>% + str_replace_all("<.{0,20}?>", " ") %>% str_replace_all("\\s+"," ") par_proc <- function(row, out, udmodel) { doc <- out[row,]