diff --git a/DESCRIPTION b/DESCRIPTION index 3dfe698..c7871ab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,8 @@ Depends: R (>= 3.3.1), quanteda, httr, caret, - e1071 + e1071, + udpipe License: Copyright Erik de Vries Encoding: UTF-8 LazyData: true diff --git a/NAMESPACE b/NAMESPACE index a347566..4b1e098 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,4 +8,5 @@ export(elastic_update) export(elasticizer) export(merger) export(modelizer) +export(query_gen_actors) export(query_string) diff --git a/R/elastic_update.R b/R/elastic_update.R index e9d25eb..54e307c 100644 --- a/R/elastic_update.R +++ b/R/elastic_update.R @@ -3,17 +3,23 @@ #' Push a line-delimited JSON string to Elasticsearch as bulk update #' @param x Line-delimited JSON suitable for use as Elasticsearch bulk update #' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database +#' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200) #' @return An html response object indicating the status of the update #' @export #' @examples -#' elastic_update(x, es_super = 'secret') +#' elastic_update(x, es_super = 'secret', local = F) ################################################################################################# #################################### Elasticsearch Updater ################################ ################################################################################################# -elastic_update <- function(x, es_super = 'secret') { +elastic_update <- function(x, es_super = 'secret', local = F) { bulk <- paste0(x,'\n') - url <- paste0('https://super:',es_super,'@linux01.uis.no/es/_bulk?pretty&refresh=wait_for') + if (local == F) { + url <- paste0('https://super:',es_super,'@linux01.uis.no/es/_bulk?pretty&refresh=wait_for') + } + if (local == T) { + url <- 'http://localhost:9200/_bulk?pretty' + } res <- RETRY("POST", url = url , body = bulk , encode = 
"raw" diff --git a/R/elasticizer.R b/R/elasticizer.R index dd4e429..550acc4 100644 --- a/R/elasticizer.R +++ b/R/elasticizer.R @@ -5,23 +5,35 @@ #' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved #' @param index The name of the Elasticsearch index to search through #' @param update When set, indicates an update function to use on each batch of 1000 articles +#' @param local Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200) #' @param ... Parameters passed on to the update function #' @return A data frame containing all the search results #' @export #' @examples -#' elasticizer(query, src = TRUE, index = "maml") +#' elasticizer(query, src = TRUE, index = "maml", update = NULL, local = F) ################################################################################################# #################################### Get data from ElasticSearch ################################ ################################################################################################# -elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL, ...){ - connect(es_port = 443, - es_transport = 'https', - es_host = 'linux01.uis.no', - es_path = 'es', - es_user = 'es', - es_pwd = es_pwd, - errors = 'complete') +elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL, local = F, ...){ + if (local == F) { + connect(es_port = 443, + es_transport = 'https', + es_host = 'linux01.uis.no', + es_path = 'es', + es_user = 'es', + es_pwd = es_pwd, + errors = 'complete') + } + if (local == T){ + connect(es_port = 9200, + es_transport = 'http', + es_host = 'localhost', + es_path = '', + es_user = '', + es_pwd = '', + errors = 'complete') + } # Get all results - one approach is to use a while loop if (src == T) { res <- Search(index = index, time_scroll="5m",body 
= query, size = 1000, raw=T) diff --git a/R/query_gen_actors.R b/R/query_gen_actors.R new file mode 100644 index 0000000..fdcbc18 --- /dev/null +++ b/R/query_gen_actors.R @@ -0,0 +1,131 @@
+#' Generate actor search queries based on data in actor db
+#'
+#' Generate the Elasticsearch queries used to find and highlight mentions of a single actor.
+#' Actors with function "Min", "PM" or "PartyLeader" yield one query; actors with function "Party" yield
+#' one query for the short party names and, when a full party name is present, a second one for the full names.
+#' @param actor A row from the output of elasticizer() when run on the 'actor' index
+#' @param country 2-letter string indicating the country for which to generate the queries; it determines the inflections (genitive and definite forms) appended to names and minister titles, only "no" currently adds suffixes
+#' @param identifier String used as the highlighter "pre_tags" value. Defaults to NULL, in which case an object named identifier is looked up in the enclosing and global environments (the implicit behaviour of earlier versions)
+#' @return A data frame containing the queries, related actor ids and actor function (plus prefix and postfix columns for parties)
+#' @export
+#' @examples
+#' query_gen_actors(actor, country, identifier)
+
+#################################################################################################
+#################################### Actor search query generator ###############################
+#################################################################################################
+query_gen_actors <- function(actor, country, identifier = NULL) {
+  if (is.null(identifier)) {
+    ## Earlier versions read `identifier` as a free variable; keep that lookup
+    ## (enclosing environment first, then global) as a fallback for existing callers
+    identifier <- get0('identifier', envir = parent.env(environment()), inherits = TRUE)
+    if (is.null(identifier)) {
+      stop("No highlight identifier available: supply the 'identifier' argument", call. = FALSE)
+    }
+  }
+  fn <- actor$`_source.function`
+  if (length(fn) != 1 || !(fn %in% c('Min', 'PM', 'PartyLeader', 'Party'))) {
+    ## Without this check an unknown function fails further down with an obscure error,
+    ## because no query string or ids are ever built for it
+    stop("Unsupported actor function: ", toString(unlist(fn)), call. = FALSE)
+  }
+
+  highlight <- paste0('"highlight" : {
+    "fields" : {
+      "text" : {},
+      "teaser" : {},
+      "preteaser" : {},
+      "title" : {},
+      "subtitle" : {}
+    },
+    "number_of_fragments": 0,
+    "order": "none",
+    "type":"unified",
+    "fragment_size":0,
+    "pre_tags":"',identifier,'",
+    "post_tags": ""
+  }')
+
+  ## Assemble the complete search body for one query string and publication date range
+  build_query <- function(search_string, start_date, end_date) {
+    paste0('{"query":
+      {"bool": {"filter":[{"term":{"country":"',country,'"}},
+        {"range":{"publication_date":{"gte":"',start_date,'","lte":"',end_date,'"}}},
+        {"query_string" : {
+          "default_operator" : "OR",
+          "allow_leading_wildcard" : "false",
+          "fields": ["text","teaser","preteaser","title","subtitle"],
+          "query" : "', search_string,'"
+          }
+        }
+      ]
+    } },',highlight,' }')
+  }
+
+  ## Append a suffix to every element of x and flatten the result
+  add_suffix <- function(x, suffix) {
+    unlist(lapply(x, str_c, suffix))
+  }
+
+  if (country == "no") {
+    genitive <- 's'
+    definitive <- 'en'
+    definitive_genitive <- 'ens'
+  } else {
+    genitive <- ''
+    definitive <- ''
+    definitive_genitive <- ''
+  }
+
+  if (fn == 'Party') {
+    ## Party queries always use this fixed publication date range
+    start_date <- '2000-01-01'
+    end_date <- '2099-01-01'
+    quote_names <- function(party_names) {
+      paste0('(\\"', paste(c(unlist(party_names)), collapse = '\\" \\"'), '\\")')
+    }
+    party_ids <- c(toJSON(add_suffix(c(actor$`_source.partyId`), '_p')))
+    query <- build_query(quote_names(actor$`_source.partyNameSearchShort`), start_date, end_date)
+    ids <- party_ids
+    fn <- 'PartyAbbreviation'
+
+    ## Only the first full name is inspected; a missing, NA or empty first name
+    ## means no separate full-name query is generated
+    full_names <- unlist(actor$`_source.partyNameSearch`)
+    if (length(full_names) > 0 && isTRUE(nchar(full_names[[1]]) > 0)) {
+      query <- c(query, build_query(quote_names(full_names), start_date, end_date))
+      ids <- c(ids, party_ids)
+      fn <- c(fn, 'Party')
+    }
+    return(data.frame(query = query, ids = I(ids), type = fn, prefix = actor$`_source.searchAnd`, postfix = actor$`_source.searchAndNot`, stringsAsFactors = FALSE))
+  }
+
+  first_name <- actor$`_source.firstName`
+  last_name <- actor$`_source.lastName`
+  lastname <- paste0('(', last_name, ' OR ', last_name, genitive, ')')
+  ## Adding a separate AND clause for inclusion of only last name to highlight all occurrences of last name
+  ## Regardless of whether the last name hit is because of a minister name or a full name proximity hit
+  person_query <- paste0('(((\\"', first_name, ' ', last_name, '\\"~5 OR \\"',
+                         first_name, ' ', last_name, genitive, '\\"~5) AND ', lastname)
+
+  if (fn == 'PartyLeader') {
+    person_query <- paste0(person_query, '))')
+    ids <- toJSON(add_suffix(c(actor$`_source.actorId`, actor$`_source.partyId`), '_pl'))
+  } else {
+    ## Modifiers are only applied to minister titles
+    titles <- actor$`_source.ministerSearch`
+    capital <- unlist(lapply(titles, str_to_title))
+    title_forms <- paste(c(capital,
+                           add_suffix(capital, genitive),
+                           add_suffix(titles, genitive),
+                           add_suffix(capital, definitive),
+                           add_suffix(titles, definitive),
+                           add_suffix(titles, definitive_genitive),
+                           add_suffix(capital, definitive_genitive)), collapse = ' ')
+    person_query <- paste0(person_query, ') OR (', lastname, ' AND (', title_forms, ')))')
+    ids <- toJSON(add_suffix(c(actor$`_source.actorId`, actor$`_source.ministryId`, actor$`_source.partyId`), '_min'))
+  }
+
+  query <- build_query(person_query, actor$`_source.startDate`, actor$`_source.endDate`)
+  data.frame(query = query, ids = I(ids), type = fn, stringsAsFactors = FALSE)
+}
diff --git a/man/elastic_update.Rd b/man/elastic_update.Rd index 1b5a43f..f7d6c40 100644 --- a/man/elastic_update.Rd +++ b/man/elastic_update.Rd @@ -4,12 +4,14 @@ \alias{elastic_update} \title{Push a line-delimited JSON string to Elasticsearch as bulk update} \usage{ -elastic_update(x, es_super = "secret") +elastic_update(x, es_super = "secret", local = F) } \arguments{ \item{x}{Line-delimited JSON suitable for use as
Elasticsearch bulk update} \item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database} + +\item{local}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)} } \value{ An html response object indicating the status of the update @@ -18,5 +20,5 @@ An html response object indicating the status of the update Push a line-delimited JSON string to Elasticsearch as bulk update } \examples{ -elastic_update(x, es_super = 'secret') +elastic_update(x, es_super = 'secret', local = F) } diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd index 5ac31ad..ae7129c 100644 --- a/man/elasticizer.Rd +++ b/man/elasticizer.Rd @@ -6,7 +6,7 @@ \usage{ elasticizer(query, src = T, index = "maml", es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL, - ...) + local = F, ...) } \arguments{ \item{query}{A JSON-formatted query in the Elasticsearch query DSL} @@ -17,6 +17,8 @@ elasticizer(query, src = T, index = "maml", \item{update}{When set, indicates an update function to use on each batch of 1000 articles} +\item{local}{Defaults to false. 
When true, connect to a local Elasticsearch instance on the default port (9200)} + \item{...}{Parameters passed on to the update function} } \value{ @@ -26,5 +28,5 @@ A data frame containing all the search results Generate a data frame out of unparsed Elasticsearch JSON } \examples{ -elasticizer(query, src = TRUE, index = "maml") +elasticizer(query, src = TRUE, index = "maml", update = NULL, local = F) } diff --git a/man/query_gen_actors.Rd b/man/query_gen_actors.Rd new file mode 100644 index 0000000..66d11ee --- /dev/null +++ b/man/query_gen_actors.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/query_gen_actors.R +\name{query_gen_actors} +\alias{query_gen_actors} +\title{Generate actor search queries based on data in actor db} +\usage{ +query_gen_actors(actor, country) +} +\arguments{ +\item{actor}{A row from the output of elasticizer() when run on the 'actor' index} + +\item{country}{2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.} +} +\value{ +A data frame containing the queries, related actor ids and actor function +} +\description{ +Generate actor search queries based on data in actor db +} +\examples{ +query_gen_actors(actor,country) +}