Added support for custom update function to elasticizer

master
Erik de Vries 6 years ago
parent 311838b34b
commit a273524105

@ -4,6 +4,7 @@
#' @param query A JSON-formatted query in the Elasticsearch query DSL
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through
#' @param update Optional function, called with no arguments once for each batch of 1000 documents. When set, batches fetched while scrolling are not appended to the returned data frame
#' @return A data frame containing all the search results
#' @export
@ -12,7 +13,7 @@
#################################################################################################
#################################### Get data from ElasticSearch ################################
#################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")){
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL){
connect(es_port = 443,
es_transport = 'https',
es_host = 'linux01.uis.no',
@ -21,7 +22,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
es_pwd = es_pwd,
errors = 'complete')
# Get all results - one approach is to use a while loop
if (src == T) {
if (src == T || length(update) > 0 ) {
res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=T)
}
if (src == F) {
@ -36,6 +37,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
hits <- 1
batch <- 1
print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
if (length(update) > 0){
update()
}
while(hits != 0){
res <- scroll(json$`_scroll_id`, time_scroll="5m", raw=T)
json <- fromJSON(res)
@ -43,7 +47,11 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
if(hits > 0) {
batch <- batch+1
print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
if (length(update) > 0){
update()
} else {
out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
}
}
}
return(out)

@ -5,7 +5,7 @@
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"))
es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL)
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -13,6 +13,8 @@ elasticizer(query, src = T, index = "maml",
\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}
\item{index}{The name of the Elasticsearch index to search through}
\item{update}{Optional function, called with no arguments once for each batch of 1000 documents. When set, batches fetched while scrolling are not appended to the returned data frame}
}
\value{
A data frame containing all the search results

Loading…
Cancel
Save