elasticizer: renamed size parameter to batch_size, created max_batch parameter to limit the number of results returned

query_string: renamed x parameter to query, added fields parameter to select what fields to return and random boolean parameter to define whether the returned results should be randomized
master
Erik de Vries 6 years ago
parent d0e9bf565b
commit 4f8b1f2024

@ -5,7 +5,8 @@
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through #' @param index The name of the Elasticsearch index to search through
#' @param es_pwd The password for Elasticsearch read access #' @param es_pwd The password for Elasticsearch read access
#' @param size Batch size #' @param batch_size Batch size
#' @param max_batch Maximum number of batches to retrieve
#' @param update When set, indicates an update function to use on each batch of 1000 articles #' @param update When set, indicates an update function to use on each batch of 1000 articles
#' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200) #' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ... Parameters passed on to the update function #' @param ... Parameters passed on to the update function
@ -17,7 +18,7 @@
################################################################################################# #################################################################################################
#################################### Get data from ElasticSearch ################################ #################################### Get data from ElasticSearch ################################
################################################################################################# #################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, update = NULL, localhost = F, ...){ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, update = NULL, localhost = F, ...){
retries <- 10 ### Number of retries on error retries <- 10 ### Number of retries on error
sleep <- 30 ### Number of seconds between retries sleep <- 30 ### Number of seconds between retries
httr::set_config(httr::config(http_version = 0)) httr::set_config(httr::config(http_version = 0))
@ -49,7 +50,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
} }
attempt <- attempt + 1 attempt <- attempt + 1
try( try(
res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T) res <- Search(index = index, time_scroll="20m",body = query, size = batch_size, raw=T)
) )
} }
} }
@ -62,7 +63,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
} }
attempt <- attempt + 1 attempt <- attempt + 1
try( try(
res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F) res <- Search(index = index, time_scroll="20m",body = query, size = batch_size, raw=T, source = F)
) )
} }
} }
@ -72,13 +73,13 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
} else { } else {
out <- jsonlite:::flatten(json$hits$hits) out <- jsonlite:::flatten(json$hits$hits)
total <- json$hits$total total <- json$hits$total
hits <- 1 hits <- length(json$hits$hits)
batch <- 1 batch <- 1
print(paste0('Processing documents ',batch*size-size,' through ',batch*size,' out of ',total,' documents.')) print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
if (length(update) > 0){ if (length(update) > 0){
update(out, localhost = localhost, ...) update(out, localhost = localhost, ...)
} }
while(hits != 0){ while(hits > 0 && batch < max_batch ){
res <- NULL res <- NULL
attempt <- 0 attempt <- 0
while( is.null(res) && attempt <= retries ) { while( is.null(res) && attempt <= retries ) {
@ -94,7 +95,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
hits <- length(json$hits$hits) hits <- length(json$hits$hits)
if(hits > 0) { if(hits > 0) {
batch <- batch+1 batch <- batch+1
print(paste0('Processing documents ',batch*size-size,' through ',batch*size,' out of ',total,' documents.')) print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
if (length(update) > 0){ if (length(update) > 0){
out <- jsonlite:::flatten(json$hits$hits) out <- jsonlite:::flatten(json$hits$hits)
update(out, localhost = localhost, ...) update(out, localhost = localhost, ...)

@ -1,26 +1,62 @@
#' Generate a query string query for ElasticSearch #' Generate a query string query for ElasticSearch
#' #'
#' Generate a query string query for ElasticSearch #' Generate a query string query for ElasticSearch
#' @param x Query string in ElasticSearch query string format #' @param query Query string in ElasticSearch query string format
#' @param fields List of field names to return, defaults to all
#' @param random Return randomized results. Boolean, defaults to FALSE
#' @return A formatted ElasticSearch query string query #' @return A formatted ElasticSearch query string query
#' @export #' @export
#' @examples #' @examples
#' query_string(x) #' query_string(query)
################################################################################################# #################################################################################################
#################################### Get data from ElasticSearch ################################ #################################### Get data from ElasticSearch ################################
################################################################################################# #################################################################################################
query_string <- function(query, fields = FALSE, random = FALSE) {
  # Default to all fields when the caller did not request a specific set.
  # isFALSE() is used instead of `fields == FALSE`: `fields` is normally a
  # character vector of field names, and a length > 1 comparison inside if()
  # is an error in modern R (and a silent footgun before that).
  if (isFALSE(fields)) {
    fields <- '*'
  }
  if (isTRUE(random)) {
    # Wrap the query_string filter in a function_score query with
    # random_score so Elasticsearch returns the hits in randomized order.
    return(paste0(
      '{
  "_source": ', toJSON(fields), ',
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "filter": [{
            "query_string": {
              "default_field": "text",
              "query": "', query, '",
              "default_operator": "AND",
              "allow_leading_wildcard": false
            }
          }]
        }
      },
      "random_score": {},
      "boost_mode": "sum"
    }
  }
}'
    ))
  } else {
    # Plain (non-randomized) query: a bool filter containing the
    # query_string clause, restricted to the requested _source fields.
    return(paste0(
      '{
  "_source": ', toJSON(fields), ',
  "query": {
    "bool": {
      "filter": [{
        "query_string": {
          "default_field": "text",
          "query": "', query, '",
          "default_operator": "AND",
          "allow_leading_wildcard": false
        }
      }]
    }
  }
}'
    ))
  }
}

@ -5,7 +5,7 @@
\title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]} \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
\usage{ \usage{
dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
words, localhost = T) words, localhost = T, ver)
} }
\arguments{ \arguments{
\item{row}{Row of grid to parse} \item{row}{Row of grid to parse}
@ -23,6 +23,8 @@ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
\item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])} \item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])}
\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)} \item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
} }
\value{ \value{
dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory

@ -5,8 +5,8 @@
\title{Generate a data frame out of unparsed Elasticsearch JSON} \title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{ \usage{
elasticizer(query, src = T, index = "maml", elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024,
update = NULL, localhost = F, ...) max_batch = Inf, update = NULL, localhost = F, ...)
} }
\arguments{ \arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL} \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -17,7 +17,9 @@ elasticizer(query, src = T, index = "maml",
\item{es_pwd}{The password for Elasticsearch read access} \item{es_pwd}{The password for Elasticsearch read access}
\item{size}{Batch size} \item{batch_size}{Batch size}
\item{max_batch}{Maximum number of batches to retrieve}
\item{update}{When set, indicates an update function to use on each batch of 1000 articles} \item{update}{When set, indicates an update function to use on each batch of 1000 articles}

@ -4,12 +4,12 @@
\alias{out_parser} \alias{out_parser}
\title{Parse raw text into a single field} \title{Parse raw text into a single field}
\usage{ \usage{
out_parser(out, type) out_parser(out, field)
} }
\arguments{ \arguments{
\item{out}{The original output data frame} \item{out}{The original output data frame}
\item{type}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text} \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
} }
\value{ \value{
a parsed output data frame including the additional column 'merged', containing the merged text a parsed output data frame including the additional column 'merged', containing the merged text
@ -18,5 +18,5 @@ a parsed output data frame including the additional column 'merged', containing
Parse raw text into a single field Parse raw text into a single field
} }
\examples{ \examples{
out_parser(out,type) out_parser(out,field)
} }

@ -4,10 +4,14 @@
\alias{query_string} \alias{query_string}
\title{Generate a query string query for ElasticSearch} \title{Generate a query string query for ElasticSearch}
\usage{ \usage{
query_string(x) query_string(query, fields = F, random = F)
} }
\arguments{ \arguments{
\item{x}{Query string in ElasticSearch query string format} \item{query}{Query string in ElasticSearch query string format}
\item{fields}{List of field names to return, defaults to all}
\item{random}{Return randomized results. Boolean, defaults to FALSE}
} }
\value{ \value{
A formatted ElasticSearch query string query A formatted ElasticSearch query string query
@ -16,5 +20,5 @@ A formatted ElasticSearch query string query
Generate a query string query for ElasticSearch Generate a query string query for ElasticSearch
} }
\examples{ \examples{
query_string(x) query_string(query)
} }

Loading…
Cancel
Save