From a2735241059122ba09855f6d53a5c550d68fb61d Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Tue, 23 Oct 2018 14:23:30 +0200 Subject: [PATCH] Added support for custom update function to elasticizer --- R/elastic_update.R | 2 +- R/elasticizer.R | 16 ++++++++++++---- man/elasticizer.Rd | 4 +++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/R/elastic_update.R b/R/elastic_update.R index 6e9621d..8bf7c37 100644 --- a/R/elastic_update.R +++ b/R/elastic_update.R @@ -25,4 +25,4 @@ elastic_update <- function(x, es_super = 'secret') { # content(res, "parsed", "application/json") # appData <- content(res) return(res) -} \ No newline at end of file +} diff --git a/R/elasticizer.R b/R/elasticizer.R index 1994db1..9bc14c3 100644 --- a/R/elasticizer.R +++ b/R/elasticizer.R @@ -4,6 +4,7 @@ #' @param query A JSON-formatted query in the Elasticsearch query DSL #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved #' @param index The name of the Elasticsearch index to search through +#' @param update When set, indicates an update function to use on each batch of 1000 articles #' @return A data frame containing all the search results #' @export @@ -12,7 +13,7 @@ ################################################################################################# #################################### Get data from ElasticSearch ################################ ################################################################################################# -elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")){ +elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL){ connect(es_port = 443, es_transport = 'https', es_host = 'linux01.uis.no', @@ -21,7 +22,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw es_pwd = es_pwd, errors = 'complete') # Get all results - one approach is to use a while loop - if (src == T) { + if (src == T || length(update) > 0 ) { res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=T) } if (src == F) { @@ -36,6 +37,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw hits <- 1 batch <- 1 print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.')) + if (length(update) > 0){ + update() + } while(hits != 0){ res <- scroll(json$`_scroll_id`, time_scroll="5m", raw=T) json <- fromJSON(res) @@ -43,9 +47,13 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw if(hits > 0) { batch <- batch+1 print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.')) - out <- bind_rows(out, jsonlite:::flatten(json$hits$hits)) + if (length(update) > 0){ + update() + } else { + out <- bind_rows(out, jsonlite:::flatten(json$hits$hits)) + } } } return(out) } -} \ No newline at end of file +} diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd index 357b95f..58ed520 100644 --- a/man/elasticizer.Rd +++ b/man/elasticizer.Rd @@ -5,7 +5,7 @@ \title{Generate a data frame out of unparsed Elasticsearch JSON} \usage{ elasticizer(query, src = T, index = "maml", - es_pwd = .rs.askForPassword("Elasticsearch READ")) + es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL) } \arguments{ \item{query}{A JSON-formatted query in the Elasticsearch query DSL} @@ -13,6 +13,8 @@ elasticizer(query, src = T, index = "maml", \item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved} \item{index}{The name of the Elasticsearch index to search through} + +\item{update}{When set, indicates an update function to use on each batch of 1000 articles} } \value{ A data frame containing all the search results