elasticizer: renamed size parameter to batch_size, created max_batch parameter to limit the number of results returned

query_string: renamed x parameter to query, added fields parameter to select what fields to return and random boolean parameter to define whether the returned results should be randomized
master
Erik de Vries 6 years ago
parent d0e9bf565b
commit 4f8b1f2024

@ -5,7 +5,8 @@
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through #' @param index The name of the Elasticsearch index to search through
#' @param es_pwd The password for Elasticsearch read access #' @param es_pwd The password for Elasticsearch read access
#' @param size Batch size #' @param batch_size Batch size
#' @param max_batch Maximum number of batches to retrieve
#' @param update When set, indicates an update function to use on each batch of 1000 articles #' @param update When set, indicates an update function to use on each batch of 1000 articles
#' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200) #' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ... Parameters passed on to the update function #' @param ... Parameters passed on to the update function
@ -17,7 +18,7 @@
################################################################################################# #################################################################################################
#################################### Get data from ElasticSearch ################################ #################################### Get data from ElasticSearch ################################
################################################################################################# #################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, update = NULL, localhost = F, ...){ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, update = NULL, localhost = F, ...){
retries <- 10 ### Number of retries on error retries <- 10 ### Number of retries on error
sleep <- 30 ### Number of seconds between retries sleep <- 30 ### Number of seconds between retries
httr::set_config(httr::config(http_version = 0)) httr::set_config(httr::config(http_version = 0))
@ -49,7 +50,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
} }
attempt <- attempt + 1 attempt <- attempt + 1
try( try(
res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T) res <- Search(index = index, time_scroll="20m",body = query, size = batch_size, raw=T)
) )
} }
} }
@ -62,7 +63,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
} }
attempt <- attempt + 1 attempt <- attempt + 1
try( try(
res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F) res <- Search(index = index, time_scroll="20m",body = query, size = batch_size, raw=T, source = F)
) )
} }
} }
@ -72,13 +73,13 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
} else { } else {
out <- jsonlite:::flatten(json$hits$hits) out <- jsonlite:::flatten(json$hits$hits)
total <- json$hits$total total <- json$hits$total
hits <- 1 hits <- length(json$hits$hits)
batch <- 1 batch <- 1
print(paste0('Processing documents ',batch*size-size,' through ',batch*size,' out of ',total,' documents.')) print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
if (length(update) > 0){ if (length(update) > 0){
update(out, localhost = localhost, ...) update(out, localhost = localhost, ...)
} }
while(hits != 0){ while(hits > 0 && batch < max_batch ){
res <- NULL res <- NULL
attempt <- 0 attempt <- 0
while( is.null(res) && attempt <= retries ) { while( is.null(res) && attempt <= retries ) {
@ -94,7 +95,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
hits <- length(json$hits$hits) hits <- length(json$hits$hits)
if(hits > 0) { if(hits > 0) {
batch <- batch+1 batch <- batch+1
print(paste0('Processing documents ',batch*size-size,' through ',batch*size,' out of ',total,' documents.')) print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
if (length(update) > 0){ if (length(update) > 0){
out <- jsonlite:::flatten(json$hits$hits) out <- jsonlite:::flatten(json$hits$hits)
update(out, localhost = localhost, ...) update(out, localhost = localhost, ...)

@ -1,26 +1,62 @@
#' Generate a query string query for ElasticSearch #' Generate a query string query for ElasticSearch
#' #'
#' Generate a query string query for ElasticSearch #' Generate a query string query for ElasticSearch
#' @param x Query string in ElasticSearch query string format #' @param query Query string in ElasticSearch query string format
#' @param fields List of field names to return, defaults to all
#' @param random Return randomized results. Boolean, defaults to FALSE
#' @return A formatted ElasticSearch query string query #' @return A formatted ElasticSearch query string query
#' @export #' @export
#' @examples #' @examples
#' query_string(x) #' query_string(query)
################################################################################################# #################################################################################################
#################################### Get data from ElasticSearch ################################ #################################### Get data from ElasticSearch ################################
################################################################################################# #################################################################################################
query_string <- function(query, fields = FALSE, random = FALSE) {
  # Default to all fields when the caller did not request a specific set.
  # isFALSE() is used instead of `fields == FALSE`: `fields` is normally a
  # character vector of field names, and a length > 1 comparison inside if()
  # is an error in modern R (and a silent footgun before that).
  if (isFALSE(fields)) {
    fields <- '*'
  }
  if (isTRUE(random)) {
    # Wrap the query_string filter in a function_score query with
    # random_score so Elasticsearch returns the hits in randomized order.
    return(paste0(
      '{
  "_source": ', toJSON(fields), ',
  "query": {
    "function_score": {
      "query": {
        "bool": {
          "filter": [{
            "query_string": {
              "default_field": "text",
              "query": "', query, '",
              "default_operator": "AND",
              "allow_leading_wildcard": false
            }
          }]
        }
      },
      "random_score": {},
      "boost_mode": "sum"
    }
  }
}'
    ))
  } else {
    # Plain (non-randomized) query: a bool filter containing the
    # query_string clause, restricted to the requested _source fields.
    return(paste0(
      '{
  "_source": ', toJSON(fields), ',
  "query": {
    "bool": {
      "filter": [{
        "query_string": {
          "default_field": "text",
          "query": "', query, '",
          "default_operator": "AND",
          "allow_leading_wildcard": false
        }
      }]
    }
  }
}'
    ))
  }
}

@ -5,7 +5,7 @@
\title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]} \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
\usage{ \usage{
dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
words, localhost = T) words, localhost = T, ver)
} }
\arguments{ \arguments{
\item{row}{Row of grid to parse} \item{row}{Row of grid to parse}
@ -23,6 +23,8 @@ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
\item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])} \item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])}
\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)} \item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
} }
\value{ \value{
dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory

@ -5,8 +5,8 @@
\title{Generate a data frame out of unparsed Elasticsearch JSON} \title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{ \usage{
elasticizer(query, src = T, index = "maml", elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024,
update = NULL, localhost = F, ...) max_batch = Inf, update = NULL, localhost = F, ...)
} }
\arguments{ \arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL} \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -17,7 +17,9 @@ elasticizer(query, src = T, index = "maml",
\item{es_pwd}{The password for Elasticsearch read access} \item{es_pwd}{The password for Elasticsearch read access}
\item{size}{Batch size} \item{batch_size}{Batch size}
\item{max_batch}{Maximum number of batches to retrieve}
\item{update}{When set, indicates an update function to use on each batch of 1000 articles} \item{update}{When set, indicates an update function to use on each batch of 1000 articles}

@ -4,12 +4,12 @@
\alias{out_parser} \alias{out_parser}
\title{Parse raw text into a single field} \title{Parse raw text into a single field}
\usage{ \usage{
out_parser(out, type) out_parser(out, field)
} }
\arguments{ \arguments{
\item{out}{The original output data frame} \item{out}{The original output data frame}
\item{type}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text} \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
} }
\value{ \value{
a parsed output data frame including the additional column 'merged', containing the merged text a parsed output data frame including the additional column 'merged', containing the merged text
@ -18,5 +18,5 @@ a parsed output data frame including the additional column 'merged', containing
Parse raw text into a single field Parse raw text into a single field
} }
\examples{ \examples{
out_parser(out,type) out_parser(out,field)
} }

@ -4,10 +4,14 @@
\alias{query_string} \alias{query_string}
\title{Generate a query string query for ElasticSearch} \title{Generate a query string query for ElasticSearch}
\usage{ \usage{
query_string(x) query_string(query, fields = F, random = F)
} }
\arguments{ \arguments{
\item{x}{Query string in ElasticSearch query string format} \item{query}{Query string in ElasticSearch query string format}
\item{fields}{List of field names to return, defaults to all}
\item{random}{Return randomized results. Boolean, defaults to FALSE}
} }
\value{ \value{
A formatted ElasticSearch query string query A formatted ElasticSearch query string query
@ -16,5 +20,5 @@ A formatted ElasticSearch query string query
Generate a query string query for ElasticSearch Generate a query string query for ElasticSearch
} }
\examples{ \examples{
query_string(x) query_string(query)
} }

Loading…
Cancel
Save