Added support for custom update function to elasticizer

master
Erik de Vries 6 years ago
parent 311838b34b
commit a273524105

@ -4,6 +4,7 @@
#' @param query A JSON-formatted query in the Elasticsearch query DSL #' @param query A JSON-formatted query in the Elasticsearch query DSL
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through #' @param index The name of the Elasticsearch index to search through
#' @param update Optional function, called with no arguments once for each batch of up to 1000 documents retrieved (default NULL: no update function is run)
#' @return A data frame containing all the search results #' @return A data frame containing all the search results
#' @export #' @export
@ -12,7 +13,7 @@
################################################################################################# #################################################################################################
#################################### Get data from ElasticSearch ################################ #################################### Get data from ElasticSearch ################################
################################################################################################# #################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")){ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL){
connect(es_port = 443, connect(es_port = 443,
es_transport = 'https', es_transport = 'https',
es_host = 'linux01.uis.no', es_host = 'linux01.uis.no',
@ -21,7 +22,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
es_pwd = es_pwd, es_pwd = es_pwd,
errors = 'complete') errors = 'complete')
# Get all results - one approach is to use a while loop # Get all results - one approach is to use a while loop
if (src == T) { if (src == T || length(update) > 0 ) {
res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=T) res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=T)
} }
if (src == F) { if (src == F) {
@ -36,6 +37,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
hits <- 1 hits <- 1
batch <- 1 batch <- 1
print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.')) print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
if (length(update) > 0){
update()
}
while(hits != 0){ while(hits != 0){
res <- scroll(json$`_scroll_id`, time_scroll="5m", raw=T) res <- scroll(json$`_scroll_id`, time_scroll="5m", raw=T)
json <- fromJSON(res) json <- fromJSON(res)
@ -43,9 +47,13 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
if(hits > 0) { if(hits > 0) {
batch <- batch+1 batch <- batch+1
print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.')) print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
if (length(update) > 0){
update()
} else {
out <- bind_rows(out, jsonlite:::flatten(json$hits$hits)) out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
} }
} }
}
return(out) return(out)
} }
} }

@ -5,7 +5,7 @@
\title{Generate a data frame out of unparsed Elasticsearch JSON} \title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{ \usage{
elasticizer(query, src = T, index = "maml", elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ")) es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL)
} }
\arguments{ \arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL} \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -13,6 +13,8 @@ elasticizer(query, src = T, index = "maml",
\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved} \item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}
\item{index}{The name of the Elasticsearch index to search through} \item{index}{The name of the Elasticsearch index to search through}
\item{update}{Optional function, called with no arguments once for each batch of up to 1000 documents retrieved (default NULL: no update function is run)}
} }
\value{ \value{
A data frame containing all the search results A data frame containing all the search results

Loading…
Cancel
Save