diff --git a/R/elasticizer.R b/R/elasticizer.R index 86ce8c0..20a33e8 100644 --- a/R/elasticizer.R +++ b/R/elasticizer.R @@ -5,7 +5,8 @@ #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved #' @param index The name of the Elasticsearch index to search through #' @param es_pwd The password for Elasticsearch read access -#' @param size Batch size +#' @param batch_size Batch size +#' @param max_batch Maximum number batches to retrieve #' @param update When set, indicates an update function to use on each batch of 1000 articles #' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200) #' @param ... Parameters passed on to the update function @@ -17,7 +18,7 @@ ################################################################################################# #################################### Get data from ElasticSearch ################################ ################################################################################################# -elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, update = NULL, localhost = F, ...){ +elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, update = NULL, localhost = F, ...){ retries <- 10 ### Number of retries on error sleep <- 30 ### Number of seconds between retries httr::set_config(httr::config(http_version = 0)) @@ -49,7 +50,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw } attempt <- attempt + 1 try( - res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T) + res <- Search(index = index, time_scroll="20m",body = query, size = batch_size, raw=T) ) } } @@ -62,7 +63,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw } attempt <- attempt + 1 try( - res <- 
Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F) + res <- Search(index = index, time_scroll="20m",body = query, size = batch_size, raw=T, source = F) ) } } @@ -72,13 +73,13 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw } else { out <- jsonlite:::flatten(json$hits$hits) total <- json$hits$total - hits <- 1 + hits <- length(json$hits$hits) batch <- 1 - print(paste0('Processing documents ',batch*size-size,' through ',batch*size,' out of ',total,' documents.')) + print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.')) if (length(update) > 0){ update(out, localhost = localhost, ...) } - while(hits != 0){ + while(hits > 0 && batch < max_batch ){ res <- NULL attempt <- 0 while( is.null(res) && attempt <= retries ) { @@ -94,7 +95,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw hits <- length(json$hits$hits) if(hits > 0) { batch <- batch+1 - print(paste0('Processing documents ',batch*size-size,' through ',batch*size,' out of ',total,' documents.')) + print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.')) if (length(update) > 0){ out <- jsonlite:::flatten(json$hits$hits) update(out, localhost = localhost, ...) diff --git a/R/query_string.R b/R/query_string.R index d45e7e9..c0f332f 100644 --- a/R/query_string.R +++ b/R/query_string.R @@ -1,26 +1,62 @@ #' Generate a query string query for ElasticSearch #' #' Generate a query string query for ElasticSearch -#' @param x Query string in ElasticSearch query string format +#' @param query Query string in ElasticSearch query string format +#' @param fields List of field names to return, defaults to all +#' @param random Return randomized results. 
Boolean, defaults to FALSE #' @return A formatted ElasticSearch query string query #' @export #' @examples -#' query_string(x) +#' query_string(query) ################################################################################################# #################################### Get data from ElasticSearch ################################ ################################################################################################# -query_string <- function(x) { +query_string <- function(query, fields = F, random = F) { + if (fields == F) { + fields <- '*' + } + if (random == T) { + return(paste0( + '{ + "_source": ',toJSON(fields),', + "query": { + "function_score": { + "query": { + "bool":{ + "filter": [{ + "query_string" : { + "default_field" : "text", + "query" : "',query,'", + "default_operator": "AND", + "allow_leading_wildcard" : false + } + }] + } + }, + "random_score": {}, + "boost_mode": "sum" + } + } + }' + )) + } else { return(paste0( '{ - "query": { - "query_string" : { - "default_field" : "text", - "query" : "',x,'", - "default_operator": "AND", - "allow_leading_wildcard" : false + "_source": ',toJSON(fields),', + "query": { + "bool":{ + "filter": [{ + "query_string" : { + "default_field" : "text", + "query" : "',query,'", + "default_operator": "AND", + "allow_leading_wildcard" : false + } + }] } - } -}' + } + }' )) + } } diff --git a/man/dupe_detect.Rd b/man/dupe_detect.Rd index ee2b699..3fa6859 100644 --- a/man/dupe_detect.Rd +++ b/man/dupe_detect.Rd @@ -5,7 +5,7 @@ \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]} \usage{ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, - words, localhost = T) + words, localhost = T, ver) } \arguments{ \item{row}{Row of grid to parse} @@ -23,6 +23,8 @@ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, \item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] 
before the cutoff (so document will be a little shorter than [words])} \item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)} + +\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')} } \value{ dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd index bc5df2f..ada666c 100644 --- a/man/elasticizer.Rd +++ b/man/elasticizer.Rd @@ -5,8 +5,8 @@ \title{Generate a data frame out of unparsed Elasticsearch JSON} \usage{ elasticizer(query, src = T, index = "maml", - es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, - update = NULL, localhost = F, ...) + es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, + max_batch = Inf, update = NULL, localhost = F, ...) 
} \arguments{ \item{query}{A JSON-formatted query in the Elasticsearch query DSL} @@ -17,7 +17,9 @@ elasticizer(query, src = T, index = "maml", \item{es_pwd}{The password for Elasticsearch read access} -\item{size}{Batch size} +\item{batch_size}{Batch size} + +\item{max_batch}{Maximum number of batches to retrieve} \item{update}{When set, indicates an update function to use on each batch of 1000 articles} diff --git a/man/out_parser.Rd b/man/out_parser.Rd index 4e804ce..bc4e71d 100644 --- a/man/out_parser.Rd +++ b/man/out_parser.Rd @@ -4,12 +4,12 @@ \alias{out_parser} \title{Parse raw text into a single field} \usage{ -out_parser(out, type) +out_parser(out, field) } \arguments{ \item{out}{The original output data frame} -\item{type}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text} +\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text} } \value{ a parsed output data frame including the additional column 'merged', containing the merged text @@ -18,5 +18,5 @@ a parsed output data frame including the additional column 'merged', containing Parse raw text into a single field } \examples{ -out_parser(out,type) +out_parser(out,field) } diff --git a/man/query_string.Rd b/man/query_string.Rd index 666c9f4..0c03e2e 100644 --- a/man/query_string.Rd +++ b/man/query_string.Rd @@ -4,10 +4,14 @@ \alias{query_string} \title{Generate a query string query for ElasticSearch} \usage{ -query_string(x) +query_string(query, fields = F, random = F) } \arguments{ -\item{x}{Query string in ElasticSearch query string format} +\item{query}{Query string in ElasticSearch query string format} + +\item{fields}{List of field names to return, defaults to all} + +\item{random}{Return randomized results. 
Boolean, defaults to FALSE} } \value{ A formatted ElasticSearch query string query @@ -16,5 +20,5 @@ A formatted ElasticSearch query string query Generate a query string query for ElasticSearch } \examples{ -query_string(x) +query_string(query) }