Added a generic actor search query generator. Updated elasticizer and elastic_update so they can connect to either the remote server or a local ES instance

master
Erik de Vries 6 years ago
parent 3e66c7e1cd
commit 11d8b31c60

@ -11,7 +11,8 @@ Depends: R (>= 3.3.1),
quanteda, quanteda,
httr, httr,
caret, caret,
e1071 e1071,
udpipe
License: Copyright Erik de Vries License: Copyright Erik de Vries
Encoding: UTF-8 Encoding: UTF-8
LazyData: true LazyData: true

@ -8,4 +8,5 @@ export(elastic_update)
export(elasticizer) export(elasticizer)
export(merger) export(merger)
export(modelizer) export(modelizer)
export(query_gen_actors)
export(query_string) export(query_string)

@ -3,17 +3,23 @@
#' Push a line-delimited JSON string to Elasticsearch as bulk update #' Push a line-delimited JSON string to Elasticsearch as bulk update
#' @param x Line-delimited JSON suitable for use as Elasticsearch bulk update #' @param x Line-delimited JSON suitable for use as Elasticsearch bulk update
#' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database #' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database
#' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @return An html response object indicating the status of the update #' @return An html response object indicating the status of the update
#' @export #' @export
#' @examples #' @examples
#' elastic_update(x, es_super = 'secret') #' elastic_update(x, es_super = 'secret', local = F)
################################################################################################# #################################################################################################
#################################### Elasticsearch Updater ################################ #################################### Elasticsearch Updater ################################
################################################################################################# #################################################################################################
elastic_update <- function(x, es_super = 'secret') { elastic_update <- function(x, es_super = 'secret', local = F) {
bulk <- paste0(x,'\n') bulk <- paste0(x,'\n')
if (local == F) {
url <- paste0('https://super:',es_super,'@linux01.uis.no/es/_bulk?pretty&refresh=wait_for') url <- paste0('https://super:',es_super,'@linux01.uis.no/es/_bulk?pretty&refresh=wait_for')
}
if (local == T) {
url <- 'http://localhost:9200/_bulk?pretty'
}
res <- RETRY("POST", url = url res <- RETRY("POST", url = url
, body = bulk , body = bulk
, encode = "raw" , encode = "raw"

@ -5,16 +5,18 @@
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through #' @param index The name of the Elasticsearch index to search through
#' @param update When set, indicates an update function to use on each batch of 1000 articles #' @param update When set, indicates an update function to use on each batch of 1000 articles
#' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ... Parameters passed on to the update function #' @param ... Parameters passed on to the update function
#' @return A data frame containing all the search results #' @return A data frame containing all the search results
#' @export #' @export
#' @examples #' @examples
#' elasticizer(query, src = TRUE, index = "maml") #' elasticizer(query, src = TRUE, index = "maml", update = NULL, local = F)
################################################################################################# #################################################################################################
#################################### Get data from ElasticSearch ################################ #################################### Get data from ElasticSearch ################################
################################################################################################# #################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL, ...){ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL, local = F, ...){
if (local == F) {
connect(es_port = 443, connect(es_port = 443,
es_transport = 'https', es_transport = 'https',
es_host = 'linux01.uis.no', es_host = 'linux01.uis.no',
@ -22,6 +24,16 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
es_user = 'es', es_user = 'es',
es_pwd = es_pwd, es_pwd = es_pwd,
errors = 'complete') errors = 'complete')
}
if (local == T){
connect(es_port = 9200,
es_transport = 'http',
es_host = 'localhost',
es_path = '',
es_user = '',
es_pwd = '',
errors = 'complete')
}
# Get all results - one approach is to use a while loop # Get all results - one approach is to use a while loop
if (src == T) { if (src == T) {
res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=T) res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=T)

@ -0,0 +1,130 @@
#' Generate actor search queries based on data in actor db
#'
#' Builds one or more Elasticsearch query bodies (JSON strings) for a single
#' actor. Each query filters on country and publication date range, runs a
#' `query_string` search over the text, teaser, preteaser, title and subtitle
#' fields, and requests highlighting of those same fields.
#'
#' Supported values of the actor's `_source.function` are "Min", "PM",
#' "PartyLeader" and "Party". For "Party" actors the date range is fixed to
#' 2000-01-01 through 2099-01-01, one query is generated from the short party
#' names, and a second one from the full party names when these are present.
#'
#' @param actor A row from the output of elasticizer() when run on the 'actor' index
#' @param country 2-letter string indicating the country for which to generate the queries. It determines the suffixes used for genitive and definite forms of names and titles (currently only "no" adds suffixes).
#' @param identifier String used as the highlight `pre_tags` value. When NULL (the default), a variable named `identifier` is looked up in the global environment, which is what earlier versions of this function relied on implicitly.
#' @return A data frame containing the queries (`query`), related actor ids (`ids`) and actor function (`type`). For "Party" actors the columns `prefix` and `postfix` are added from the actor's `_source.searchAnd` and `_source.searchAndNot` fields.
#' @export
#' @examples
#' \dontrun{
#' query_gen_actors(actor, country, identifier = "|||")
#' }
#################################################################################################
#################################### Actor search query generator ###############################
#################################################################################################
query_gen_actors <- function(actor, country, identifier = NULL) {
  actor_fn <- actor$`_source.function`
  person_fns <- c("Min", "PM", "PartyLeader")

  # Fail early with a clear message instead of running into undefined
  # variables further down when the actor function is not handled here.
  if (length(actor_fn) != 1 || is.na(actor_fn) ||
      !(actor_fn %in% c(person_fns, "Party"))) {
    stop(
      "Unsupported actor function: '",
      paste(as.character(actor_fn), collapse = ", "),
      "'. Expected one of: Min, PM, PartyLeader, Party.",
      call. = FALSE
    )
  }

  # Backward compatibility: older callers defined `identifier` globally
  # instead of passing it in.
  if (is.null(identifier)) {
    identifier <- get0("identifier", envir = globalenv(), inherits = TRUE)
    if (is.null(identifier)) {
      stop(
        "No highlight identifier available: pass `identifier` or define a ",
        "global variable named 'identifier'.",
        call. = FALSE
      )
    }
  }

  highlight <- paste0(
    '"highlight" : {\n',
    '"fields" : {\n',
    '"text" : {},\n',
    '"teaser" : {},\n',
    '"preteaser" : {},\n',
    '"title" : {},\n',
    '"subtitle" : {}\n',
    '},\n',
    '"number_of_fragments": 0,\n',
    '"order": "none",\n',
    '"type":"unified",\n',
    '"fragment_size":0,\n',
    '"pre_tags":"', identifier, '",\n',
    '"post_tags": ""\n',
    '}'
  )

  # Suffixes appended to names and titles to also match inflected forms
  if (country == "no") {
    genitive <- 's'
    definite <- 'en'
    definite_genitive <- 'ens'
  } else {
    genitive <- ''
    definite <- ''
    definite_genitive <- ''
  }

  # Wrap a query_string expression in the full query body shared by all
  # actor types.
  build_query <- function(search_string, start_date, end_date) {
    paste0(
      '{"query":\n',
      '{"bool": {"filter":[{"term":{"country":"', country, '"}},\n',
      '{"range":{"publication_date":{"gte":"', start_date,
      '","lte":"', end_date, '"}}},\n',
      '{"query_string" : {\n',
      '"default_operator" : "OR",\n',
      '"allow_leading_wildcard" : "false",\n',
      '"fields": ["text","teaser","preteaser","title","subtitle"],\n',
      '"query" : "', search_string, '"\n',
      '}\n',
      '}\n',
      ']\n',
      '} },', highlight, ' }'
    )
  }

  # Append a suffix to every element of `words`
  with_suffix <- function(words, suffix) {
    unlist(lapply(words, str_c, suffix))
  }

  # JSON array of ids, each tagged with a suffix marking the actor type
  suffix_ids <- function(id_values, suffix) {
    toJSON(with_suffix(id_values, suffix))
  }

  if (actor_fn %in% person_fns) {
    first_name <- actor$`_source.firstName`
    last_name <- actor$`_source.lastName`
    lastname_clause <- paste0(
      '(', last_name, ' OR ', last_name, genitive, ')'
    )
    # The separate AND clause on the last name makes every occurrence of the
    # last name get highlighted, regardless of whether the hit came from a
    # minister title or from a full name proximity match.
    search_string <- paste0(
      '(((\\"', first_name, ' ', last_name,
      '\\"~5 OR \\"', first_name, ' ', last_name, genitive,
      '\\"~5) AND ', lastname_clause
    )

    if (actor_fn == "PartyLeader") {
      search_string <- paste0(search_string, '))')
      ids <- suffix_ids(
        c(actor$`_source.actorId`, actor$`_source.partyId`),
        "_pl"
      )
    } else {
      # Inflection modifiers are only applied to minister titles
      titles <- actor$`_source.ministerSearch`
      capital <- unlist(lapply(titles, str_to_title))
      title_variants <- c(
        capital,
        with_suffix(capital, genitive),
        with_suffix(titles, genitive),
        with_suffix(capital, definite),
        with_suffix(titles, definite),
        with_suffix(titles, definite_genitive),
        with_suffix(capital, definite_genitive)
      )
      names <- paste(title_variants, collapse = ' ')
      search_string <- paste0(
        search_string, ') OR (', lastname_clause, ' AND (', names, ')))'
      )
      ids <- suffix_ids(
        c(
          actor$`_source.actorId`,
          actor$`_source.ministryId`,
          actor$`_source.partyId`
        ),
        "_min"
      )
    }

    query <- build_query(
      search_string,
      actor$`_source.startDate`,
      actor$`_source.endDate`
    )
    return(data.frame(
      query = query, ids = I(ids), type = actor_fn,
      stringsAsFactors = FALSE
    ))
  }

  # Remaining case: actor_fn == "Party". Parties are searched over a fixed,
  # wide date range instead of the dates stored on the actor.
  start_date <- "2000-01-01"
  end_date <- "2099-01-01"
  party_ids <- c(suffix_ids(c(actor$`_source.partyId`), "_p"))

  short_names <- paste(
    c(unlist(actor$`_source.partyNameSearchShort`)),
    collapse = '\\" \\"'
  )
  query <- build_query(
    paste0('(\\"', short_names, '\\")'), start_date, end_date
  )
  ids <- party_ids
  fn <- c('PartyAbbreviation')

  # Only the first full name decides whether a second query is generated;
  # this keeps the condition scalar when several names are present.
  full_names <- unlist(actor$`_source.partyNameSearch`)
  if (length(full_names) > 0 && isTRUE(nchar(full_names[1]) > 0)) {
    long_names <- paste(c(full_names), collapse = '\\" \\"')
    query <- c(
      query,
      build_query(paste0('(\\"', long_names, '\\")'), start_date, end_date)
    )
    ids <- c(ids, party_ids)
    fn <- c('PartyAbbreviation', 'Party')
  }

  data.frame(
    query = query, ids = I(ids), type = fn,
    prefix = actor$`_source.searchAnd`,
    postfix = actor$`_source.searchAndNot`,
    stringsAsFactors = FALSE
  )
}

@ -4,12 +4,14 @@
\alias{elastic_update} \alias{elastic_update}
\title{Push a line-delimited JSON string to Elasticsearch as bulk update} \title{Push a line-delimited JSON string to Elasticsearch as bulk update}
\usage{ \usage{
elastic_update(x, es_super = "secret") elastic_update(x, es_super = "secret", local = F)
} }
\arguments{ \arguments{
\item{x}{Line-delimited JSON suitable for use as Elasticsearch bulk update} \item{x}{Line-delimited JSON suitable for use as Elasticsearch bulk update}
\item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database} \item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database}
\item{local}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)}
} }
\value{ \value{
An html response object indicating the status of the update An html response object indicating the status of the update
@ -18,5 +20,5 @@ An html response object indicating the status of the update
Push a line-delimited JSON string to Elasticsearch as bulk update Push a line-delimited JSON string to Elasticsearch as bulk update
} }
\examples{ \examples{
elastic_update(x, es_super = 'secret') elastic_update(x, es_super = 'secret', local = F)
} }

@ -6,7 +6,7 @@
\usage{ \usage{
elasticizer(query, src = T, index = "maml", elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL, es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL,
...) local = F, ...)
} }
\arguments{ \arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL} \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -17,6 +17,8 @@ elasticizer(query, src = T, index = "maml",
\item{update}{When set, indicates an update function to use on each batch of 1000 articles} \item{update}{When set, indicates an update function to use on each batch of 1000 articles}
\item{local}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)}
\item{...}{Parameters passed on to the update function} \item{...}{Parameters passed on to the update function}
} }
\value{ \value{
@ -26,5 +28,5 @@ A data frame containing all the search results
Generate a data frame out of unparsed Elasticsearch JSON Generate a data frame out of unparsed Elasticsearch JSON
} }
\examples{ \examples{
elasticizer(query, src = TRUE, index = "maml") elasticizer(query, src = TRUE, index = "maml", update = NULL, local = F)
} }

@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/query_gen_actors.R
\name{query_gen_actors}
\alias{query_gen_actors}
\title{Generate actor search queries based on data in actor db}
\usage{
query_gen_actors(actor, country)
}
\arguments{
\item{actor}{A row from the output of elasticizer() when run on the 'actor' index}
\item{country}{2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.}
}
\value{
A data frame containing the queries, related actor ids and actor function
}
\description{
Generate actor search queries based on data in actor db
}
\examples{
query_gen_actors(actor,country)
}
Loading…
Cancel
Save