actorizer, query_gen_actors: revamped actor searches entirely

elasticizer: updated script for use with ES 7.x
master
Erik de Vries 6 years ago
parent 88fc4ec53c
commit a1b6c6a7cb

@ -7,6 +7,7 @@ export(dfm_gen)
export(dupe_detect)
export(elastic_update)
export(elasticizer)
export(lemma_writer)
export(merger)
export(modelizer)
export(query_gen_actors)

@ -12,17 +12,19 @@
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
#' @examples
#' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super)
actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, es_super, ver) {
#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer <- function(out, localhost = F, ids, prefix, postfix, identifier, es_super, ver) {
### Function to filter out false positives using regex
exceptionizer <- function(id, ud, doc, markers, regex_identifier, prefix, postfix) {
exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
max <- max+(length(split)*nchar(identifier)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
sentence <- str_sub(doc$merged, min, max) # Extract sentence from text
# Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
if (!str_detect(sentence, paste0(regex_identifier,postfix)) && !str_detect(sentence, paste0(prefix,regex_identifier))) {
if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
return(id)
} else {
return(NULL)
@ -31,8 +33,16 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
ranger <- function(x, ud) {
return(which((ud$start <= x) & (ud$end >= x)))
}
sentencizer <- function(row, out, ids, prefix, postfix, identifier, type) {
sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
doc <- out[row,]
if (nchar(doc$merged) > 990000) {
return(
data.frame(
err = T,
errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
)
)
}
# Extracting ud output from document
ud <- doc$`_source.ud`[[1]] %>%
select(-one_of('exists')) %>% # Removing ud.exists variable
@ -48,11 +58,9 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
sentence_count <- max(ud$sentence_id) # Number of sentences in article
actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
# Conducting regex filtering on matches only when actor type is Party
if (type == "Party") {
# Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
if (!is.na(prefix) || prefix != '' || !is.na(postfix) || postfix != '') {
### If no pre or postfixes, match *not nothing* i.e. anything
if (is.na(prefix) || prefix == '') {
prefix = '$^'
@ -60,14 +68,27 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
if (is.na(postfix) || postfix == '') {
postfix = '$^'
}
sentence_ids <- lapply(actor_sentences, exceptionizer, ud = ud, doc = doc, markers = markers, regex_identifier = regex_identifier, prefix = prefix, postfix = postfix)
sentence_ids <- unlist(lapply(actor_sentences,
exceptionizer,
ud = ud,
doc = doc,
markers = markers,
pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags,
post_tags_regex = post_tags_regex,
post_tags = post_tags,
prefix = prefix,
postfix = postfix))
} else {
sentence_ids <- actor_sentences
}
# Generating nested sentence start and end positions for actor sentences
ud <- ud %>%
filter(sentence_id %in% sentence_ids) %>%
filter(sentence_id %in% sentence_ids)
actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
ud <- ud %>%
group_by(sentence_id) %>%
summarise (
sentence_start = as.integer(min(start)),
@ -80,28 +101,42 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
sentence_start = list(sentence_start),
sentence_end = list(sentence_end)
)
return(
data.frame(ud, # Sentence id, start and end position for actor sentences
actor_start = I(list(actor_start)), # List of actor ud token start positions
actor_end = I(list(actor_end)), # List of actor ud token end positions
occ = length(unique(actor_sentences)), # Number of sentences in which actor occurs
prom = length(unique(actor_sentences))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
rel_first = 1-(min(actor_sentences)/sentence_count), # Relative position of first occurence at sentence level
first = min(actor_sentences), # First sentence in which actor is mentioned
ids = I(list(ids)) # List of actor ids
if (length(ud$doc_id > 0)) {
return(
data.frame(ud, # Sentence id, start and end position for actor sentences
actor_start = I(list(actor_start)), # List of actor ud token start positions
actor_end = I(list(actor_end)), # List of actor ud token end positions
occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level
first = min(sentence_ids), # First sentence in which actor is mentioned
ids = I(list(ids)) # List of actor ids
)
)
)
} else {
return(NULL)
}
}
out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
offsetter <- function(x, identifier) {
return(x-((row(x)-1)*nchar(identifier)))
offsetter <- function(x, pre_tags, post_tags) {
return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
}
regex_identifier <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", identifier)
out$markers <- mclapply(str_locate_all(out$merged,coll(identifier)), offsetter, identifier = identifier, mc.cores = detectCores())
pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = detectCores())
ids <- fromJSON(ids)
updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, type = type, mc.cores = detectCores()))
# ids <- fromJSON(ids)
updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer,
out = out,
ids = ids,
postfix = postfix,
prefix=prefix,
pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags,
post_tags_regex = post_tags_regex,
post_tags = post_tags,
mc.cores = detectCores()))
bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
return(elastic_update(bulk, es_super = es_super, localhost = localhost))

@ -68,11 +68,11 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
}
}
json <- fromJSON(res)
if (json$hits$total == 0) {
if (json$hits$total$value == 0) {
return(json)
} else {
out <- jsonlite:::flatten(json$hits$hits)
total <- json$hits$total
total <- json$hits$total$value
hits <- length(json$hits$hits)
batch <- 1
print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))

@ -3,7 +3,8 @@
#' Generate actor search queries based on data in actor db
#' @param actor A row from the output of elasticizer() when run on the 'actor' index
#' @param country 2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.
#' @param identifier Identifier used to mark hits in the text, identifiers are prepended before the actual hit
#' @param pre_tags Highlighter pre-tag
#' @param post_tags Highlighter post-tag
#' @return A data frame containing the queries, related actor ids and actor function
#' @export
#' @examples
@ -12,134 +13,170 @@
#################################################################################################
#################################### Actor search query generator ###############################
#################################################################################################
query_gen_actors <- function(actor, country, identifier) {
highlight <- paste0('"highlight" : {
"fields" : {
"text" : {},
"teaser" : {},
"preteaser" : {},
"title" : {},
"subtitle" : {}
},
"number_of_fragments": 0,
"order": "none",
"type":"unified",
"fragment_size":0,
"pre_tags":"',identifier,'",
"post_tags": ""
}')
if (country == "no") {
genitive <- 's'
definitive <- 'en'
definitive_genitive <- 'ens'
} else if (country == 'uk') {
genitive <- '\'s'
definitive <- 's'
definitive_genitive <- ''
} else {
genitive <- ''
definitive <- ''
definitive_genitive <- ''
}
query_gen_actors <- function(actor, country, pre_tags, post_tags) {
generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
return(paste0('{"query":
{"bool": {
"filter":[
{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',startdate,'","lte":"',enddate,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', querystring,'"
}
}
],
"must_not":[
{"term":{"computerCodes.actors":"',actorid,'"}}
]
}
},
"highlight" : {
"fields" : {
"text" : {},
"teaser" : {},
"preteaser" : {},
"title" : {},
"subtitle" : {}
},
"number_of_fragments": 0,
"order": "none",
"type":"unified",
"fragment_size":0,
"pre_tags":"', pre_tags,'",
"post_tags": "',post_tags,'"
}
}'))
}
### Setting linguistic forms for each country ###
if (country == "no" | country == "dk") {
genitive <- 's'
definitive <- 'en'
definitive_genitive <- 'ens'
} else if (country == 'uk') {
genitive <- '\'s'
} else if (country == 'nl' | country == 'be') {
genitive <- 's'
}
if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader") {
### Generating queries for individuals (ministers, PM, Party leaders and MPs)
if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader" | actor$`_source.function` == "MP") {
lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,')')
## Adding a separate AND clause for inclusion of only last name to highlight all occurences of last name
## Regardless of whether the last name hit is because of a minister name or a full name proximity hit
query_string <- paste0('(((\\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,genitive,
'\\"~5) AND ',lastname)
}
if (actor$`_source.function` == "PartyLeader") {
query_string <- paste0(query_string,'))')
ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.partyId`),str_c, "_pl")))
}
if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM") {
## Modifiers are only applied to minister titles
capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title))
capital_gen <- unlist(lapply(capital, str_c, genitive))
capital_def <- unlist(lapply(capital, str_c, definitive))
capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive))
gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive))
def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive))
defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive))
names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ')
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))')
ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.ministryId`,actor$`_source.partyId`),str_c, "_min")))
}
if (actor$`_source.function` == "Institution") {
#uppercasing
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
### If country is belgium, check if there is an apostrophe in middlenames, if so, search for last name both with capitalized and lowercased last name
if (country == 'be') {
if (T %in% str_detect(actor$`_source.middleNames`,"'")) {
query_string <- paste0('(((\\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
tolower(actor$`_source.lastName`),
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
tolower(actor$`_source.lastName`),genitive,
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,genitive,
'\\"~5) AND ',lastname)
}
} else {
query_string <- paste0('(((\\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,genitive,
'\\"~5) AND ',lastname)
}
### If actor is a minister, generate minister search
if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM") {
capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title))
capital_gen <- unlist(lapply(capital, str_c, genitive))
gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive))
# If country is no or dk, search for definitive minister forms as well (including genitive forms)
if (country == "no" || country == "dk") {
capital_def <- unlist(lapply(capital, str_c, definitive))
capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive))
def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive))
defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive))
names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ')
} else {
names <- paste(c(capital,capital_gen,gen,actor$`_source.ministerSearch`), collapse = ' ')
}
# If country is uk, search for last name, minister names and minister titles (i.e. Johnson AND state AND secretary)
if (country == "uk") {
minister <- paste(c(str_to_title(actor$`_source.ministerName`),
actor$`_source.ministerName`,
str_c(str_to_title(actor$`_source.ministerName`),genitive),
str_c(actor$`_source.ministerName`,genitive)), collapse = ' ')
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND (',unlist(minister),')))')
}
# If country is nl or be, add a requirement for Minister to the query
else if (country == "nl" | country == "be") {
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND ("Minister" OR "minister")))')
} else {
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))')
}
} else {
### Else, generate search for first/last name only (MPs and Party leaders, currently)
query_string <- paste0(query_string,'))')
}
ids <- list(c(actor$`_source.actorId`,str_c(actor$`_source.partyId`,'_a')))
actorid <- actor$`_source.actorId`
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
return(data.frame(query = query, ids = I(ids), stringsAsFactors = F))
}
### Query generation for party searches
if (actor$`_source.function` == "Party") {
actor$`_source.startDate` <- "2000-01-01"
actor$`_source.endDate` <- "2099-01-01"
if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
unlist(lapply(upper, str_c, definitive)),
unlist(lapply(upper, str_c, definitive_genitive)))
capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
unlist(lapply(capital, str_c, definitive)),
unlist(lapply(capital, str_c, definitive_genitive)))
base <- actor$`_source.institutionNameSearch`
base <- c(base, unlist(lapply(base, str_c, genitive)),
unlist(lapply(base, str_c, definitive)),
unlist(lapply(base, str_c, definitive_genitive)))
names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
query_string <- paste0('(\\"',names,'\\")')
ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
query <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
}
}
]
} },',highlight,' }')
df1 <- data.frame(query = query, ids = I(ids), type = actor$`_source.function`, stringsAsFactors = F)
if (nchar(actor$`_source.partyNameSearchShort`[[1]]) > 0) {
# If uk, no or dk, search for both regular abbreviations, and genitive forms
if (country == "uk" | country == "no" | country == "dk") {
gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive))
names <- paste(c(gen,actor$`_source.partyNameSearchShort`), collapse = '\\\" \\\"')
} else {
names <- paste(actor$`_source.partyNameSearchShort`, collapse = '\\\" \\\"')
}
# If no or dk, only keep genitive forms if the party abbreviation is longer than 1 character (2 including the genitive s itself)
if (country == "dk" | country == "no") {
gen <- gen[which(nchar(gen) > 2)]
}
query_string <- paste0('(\\\"',names,'\\\")')
ids <- str_c(actor$`_source.partyId`,'_s')
actorid <- str_c(actor$`_source.partyId`,'_s')
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
df1 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
}
if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
unlist(lapply(upper, str_c, definitive)),
unlist(lapply(upper, str_c, definitive_genitive)))
capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
unlist(lapply(capital, str_c, definitive)),
unlist(lapply(capital, str_c, definitive_genitive)))
base <- actor$`_source.institutionNameSearchShort`
base <- c(base, unlist(lapply(base, str_c, genitive)),
unlist(lapply(base, str_c, definitive)),
unlist(lapply(base, str_c, definitive_genitive)))
names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
query_string <- paste0('(\\"',names,'\\")')
ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
query <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
}
}
]
} },',highlight,' }')
df2 <- data.frame(query = query, ids = I(ids), type = actor$`_source.function`, stringsAsFactors = F)
if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
if (country == "uk" | country == "no" | country == "dk") {
gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive))
names <- paste(c(gen,actor$`_source.partyNameSearch`), collapse = '\\\" \\\"')
} else {
names <- paste(actor$`_source.partyNameSearch`, collapse = '\\\" \\\"')
}
query_string <- paste0('(\\\"',names,'\\\")')
ids <- str_c(actor$`_source.partyId`,'_f')
actorid <- str_c(actor$`_source.partyId`,'_f')
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
if (country == 'uk') {
df2 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
} else {
df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
}
}
if (exists('df1') == T & exists('df2') == T) {
return(bind_rows(df1,df2))
@ -149,63 +186,61 @@ if (country == "no") {
return(df2)
}
}
if (actor$`_source.function` == "Party") {
actor$`_source.startDate` <- "2000-01-01"
actor$`_source.endDate` <- "2099-01-01"
names <- paste(c(unlist(actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"')
query_string <- paste0('(\\"',names,'\\")')
query <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
}
}
]
} },',highlight,' }')
ids <- c(toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p"))))
if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
names <- paste(c(unlist(actor$`_source.partyNameSearch`)), collapse = '\\" \\"')
query_string <- paste0('(\\"',names,'\\")')
query2 <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
}
}
]
} },',highlight,' }')
ids <- c(ids, toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p"))))
query <- c(query, query2)
fn <- c('PartyAbbreviation','Party')
} else {
fn <- c('PartyAbbreviation')
}
return(data.frame(query = query, ids = I(ids), type = fn, prefix = actor$`_source.searchAnd`, postfix = actor$`_source.searchAndNot`, stringsAsFactors = F))
}
query <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
}
}
]
} },',highlight,' }')
fn <- actor$`_source.function`
return(data.frame(query = query, ids = I(ids), type = fn, stringsAsFactors = F))
### Institution function currently not used
# if (actor$`_source.function` == "Institution") {
# #uppercasing
# firstup <- function(x) {
# substr(x, 1, 1) <- toupper(substr(x, 1, 1))
# x
# }
# actor$`_source.startDate` <- "2000-01-01"
# actor$`_source.endDate` <- "2099-01-01"
# if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
# upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
# upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
# unlist(lapply(upper, str_c, definitive)),
# unlist(lapply(upper, str_c, definitive_genitive)))
# capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
# capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
# unlist(lapply(capital, str_c, definitive)),
# unlist(lapply(capital, str_c, definitive_genitive)))
# base <- actor$`_source.institutionNameSearch`
# base <- c(base, unlist(lapply(base, str_c, genitive)),
# unlist(lapply(base, str_c, definitive)),
# unlist(lapply(base, str_c, definitive_genitive)))
# names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
# query_string <- paste0('(\\"',names,'\\")')
# ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
# actorid <- str_c(actor$`_source.institutionId`,'_f')
# query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
# df1 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
# }
# if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
# upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
# upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
# unlist(lapply(upper, str_c, definitive)),
# unlist(lapply(upper, str_c, definitive_genitive)))
# capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
# capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
# unlist(lapply(capital, str_c, definitive)),
# unlist(lapply(capital, str_c, definitive_genitive)))
# base <- actor$`_source.institutionNameSearchShort`
# base <- c(base, unlist(lapply(base, str_c, genitive)),
# unlist(lapply(base, str_c, definitive)),
# unlist(lapply(base, str_c, definitive_genitive)))
# names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
# query_string <- paste0('(\\"',names,'\\")')
# ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
# actorid <- str_c(actor$`_source.institutionId`,'_s')
# query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
# df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
# }
# if (exists('df1') == T & exists('df2') == T) {
# return(bind_rows(df1,df2))
# } else if (exists('df1') == T) {
# return(df1)
# } else if (exists('df2') == T) {
# return(df2)
# }
# }
}

@ -4,8 +4,8 @@
\alias{actorizer}
\title{Updater function for elasticizer: Conduct actor searches}
\usage{
actorizer(out, localhost = F, ids, type, prefix, postfix, identifier,
es_super, ver)
actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super,
ver)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
@ -31,5 +31,5 @@ As this is a nested function used within elasticizer, there is no return output
Updater function for elasticizer: Conduct actor searches
}
\examples{
actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super)
actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
}

@ -4,14 +4,16 @@
\alias{query_gen_actors}
\title{Generate actor search queries based on data in actor db}
\usage{
query_gen_actors(actor, country, identifier)
query_gen_actors(actor, country, pre_tags, post_tags)
}
\arguments{
\item{actor}{A row from the output of elasticizer() when run on the 'actor' index}
\item{country}{2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.}
\item{identifier}{Identifier used to mark hits in the text, identifiers are prepended before the actual hit}
\item{pre_tags}{Highlighter pre-tag}
\item{post_tags}{Highlighter post-tag}
}
\value{
A data frame containing the queries, related actor ids and actor function

Loading…
Cancel
Save