actorizer, query_gen_actors: revamped actor searches entirely

elasticizer: updated script for use with ES 7.x
master
Erik de Vries 5 years ago
parent 88fc4ec53c
commit a1b6c6a7cb

@ -7,6 +7,7 @@ export(dfm_gen)
export(dupe_detect) export(dupe_detect)
export(elastic_update) export(elastic_update)
export(elasticizer) export(elasticizer)
export(lemma_writer)
export(merger) export(merger)
export(modelizer) export(modelizer)
export(query_gen_actors) export(query_gen_actors)

@ -12,17 +12,19 @@
#' @return As this is a nested function used within elasticizer, there is no return output #' @return As this is a nested function used within elasticizer, there is no return output
#' @export #' @export
#' @examples #' @examples
#' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super) #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, es_super, ver) { actorizer <- function(out, localhost = F, ids, prefix, postfix, identifier, es_super, ver) {
### Function to filter out false positives using regex ### Function to filter out false positives using regex
exceptionizer <- function(id, ud, doc, markers, regex_identifier, prefix, postfix) { exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
max <- max+(length(split)*nchar(identifier)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6) min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
sentence <- str_sub(doc$merged, min, max) # Extract sentence from text sentence <- str_sub(doc$merged, min, max) # Extract sentence from text
# Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
if (!str_detect(sentence, paste0(regex_identifier,postfix)) && !str_detect(sentence, paste0(prefix,regex_identifier))) { if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
return(id) return(id)
} else { } else {
return(NULL) return(NULL)
@ -31,8 +33,16 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
ranger <- function(x, ud) { ranger <- function(x, ud) {
return(which((ud$start <= x) & (ud$end >= x))) return(which((ud$start <= x) & (ud$end >= x)))
} }
sentencizer <- function(row, out, ids, prefix, postfix, identifier, type) { sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
doc <- out[row,] doc <- out[row,]
if (nchar(doc$merged) > 990000) {
return(
data.frame(
err = T,
errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
)
)
}
# Extracting ud output from document # Extracting ud output from document
ud <- doc$`_source.ud`[[1]] %>% ud <- doc$`_source.ud`[[1]] %>%
select(-one_of('exists')) %>% # Removing ud.exists variable select(-one_of('exists')) %>% # Removing ud.exists variable
@ -48,11 +58,9 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
sentence_count <- max(ud$sentence_id) # Number of sentences in article sentence_count <- max(ud$sentence_id) # Number of sentences in article
actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
# Conducting regex filtering on matches only when actor type is Party # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
if (type == "Party") { if (!is.na(prefix) || prefix != '' || !is.na(postfix) || postfix != '') {
### If no pre or postfixes, match *not nothing* i.e. anything ### If no pre or postfixes, match *not nothing* i.e. anything
if (is.na(prefix) || prefix == '') { if (is.na(prefix) || prefix == '') {
prefix = '$^' prefix = '$^'
@ -60,14 +68,27 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
if (is.na(postfix) || postfix == '') { if (is.na(postfix) || postfix == '') {
postfix = '$^' postfix = '$^'
} }
sentence_ids <- lapply(actor_sentences, exceptionizer, ud = ud, doc = doc, markers = markers, regex_identifier = regex_identifier, prefix = prefix, postfix = postfix) sentence_ids <- unlist(lapply(actor_sentences,
exceptionizer,
ud = ud,
doc = doc,
markers = markers,
pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags,
post_tags_regex = post_tags_regex,
post_tags = post_tags,
prefix = prefix,
postfix = postfix))
} else { } else {
sentence_ids <- actor_sentences sentence_ids <- actor_sentences
} }
# Generating nested sentence start and end positions for actor sentences # Generating nested sentence start and end positions for actor sentences
ud <- ud %>% ud <- ud %>%
filter(sentence_id %in% sentence_ids) %>% filter(sentence_id %in% sentence_ids)
actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
ud <- ud %>%
group_by(sentence_id) %>% group_by(sentence_id) %>%
summarise ( summarise (
sentence_start = as.integer(min(start)), sentence_start = as.integer(min(start)),
@ -80,28 +101,42 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
sentence_start = list(sentence_start), sentence_start = list(sentence_start),
sentence_end = list(sentence_end) sentence_end = list(sentence_end)
) )
if (length(ud$doc_id > 0)) {
return( return(
data.frame(ud, # Sentence id, start and end position for actor sentences data.frame(ud, # Sentence id, start and end position for actor sentences
actor_start = I(list(actor_start)), # List of actor ud token start positions actor_start = I(list(actor_start)), # List of actor ud token start positions
actor_end = I(list(actor_end)), # List of actor ud token end positions actor_end = I(list(actor_end)), # List of actor ud token end positions
occ = length(unique(actor_sentences)), # Number of sentences in which actor occurs occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
prom = length(unique(actor_sentences))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences) prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
rel_first = 1-(min(actor_sentences)/sentence_count), # Relative position of first occurence at sentence level rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level
first = min(actor_sentences), # First sentence in which actor is mentioned first = min(sentence_ids), # First sentence in which actor is mentioned
ids = I(list(ids)) # List of actor ids ids = I(list(ids)) # List of actor ids
)
) )
) } else {
return(NULL)
}
} }
out <- mamlr:::out_parser(out, field = 'highlight', clean = F) out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
offsetter <- function(x, identifier) { offsetter <- function(x, pre_tags, post_tags) {
return(x-((row(x)-1)*nchar(identifier))) return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
} }
regex_identifier <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", identifier) pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
out$markers <- mclapply(str_locate_all(out$merged,coll(identifier)), offsetter, identifier = identifier, mc.cores = detectCores()) post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = detectCores())
ids <- fromJSON(ids) # ids <- fromJSON(ids)
updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, type = type, mc.cores = detectCores())) updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer,
out = out,
ids = ids,
postfix = postfix,
prefix=prefix,
pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags,
post_tags_regex = post_tags_regex,
post_tags = post_tags,
mc.cores = detectCores()))
bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver) bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver)) bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
return(elastic_update(bulk, es_super = es_super, localhost = localhost)) return(elastic_update(bulk, es_super = es_super, localhost = localhost))

@ -68,11 +68,11 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
} }
} }
json <- fromJSON(res) json <- fromJSON(res)
if (json$hits$total == 0) { if (json$hits$total$value == 0) {
return(json) return(json)
} else { } else {
out <- jsonlite:::flatten(json$hits$hits) out <- jsonlite:::flatten(json$hits$hits)
total <- json$hits$total total <- json$hits$total$value
hits <- length(json$hits$hits) hits <- length(json$hits$hits)
batch <- 1 batch <- 1
print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.')) print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))

@ -3,7 +3,8 @@
#' Generate actor search queries based on data in actor db #' Generate actor search queries based on data in actor db
#' @param actor A row from the output of elasticizer() when run on the 'actor' index #' @param actor A row from the output of elasticizer() when run on the 'actor' index
#' @param country 2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc. #' @param country 2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.
#' @param identifier Identifier used to mark hits in the text, identifiers are prepended before the actual hit #' @param pre_tags Highlighter pre-tag
#' @param post_tags Highlighter post-tag
#' @return A data frame containing the queries, related actor ids and actor function #' @return A data frame containing the queries, related actor ids and actor function
#' @export #' @export
#' @examples #' @examples
@ -12,134 +13,170 @@
################################################################################################# #################################################################################################
#################################### Actor search query generator ############################### #################################### Actor search query generator ###############################
################################################################################################# #################################################################################################
query_gen_actors <- function(actor, country, identifier) { query_gen_actors <- function(actor, country, pre_tags, post_tags) {
highlight <- paste0('"highlight" : { generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
"fields" : { return(paste0('{"query":
"text" : {}, {"bool": {
"teaser" : {}, "filter":[
"preteaser" : {}, {"term":{"country":"',country,'"}},
"title" : {}, {"range":{"publication_date":{"gte":"',startdate,'","lte":"',enddate,'"}}},
"subtitle" : {} {"query_string" : {
}, "default_operator" : "OR",
"number_of_fragments": 0, "allow_leading_wildcard" : "false",
"order": "none", "fields": ["text","teaser","preteaser","title","subtitle"],
"type":"unified", "query" : "', querystring,'"
"fragment_size":0, }
"pre_tags":"',identifier,'", }
"post_tags": "" ],
}') "must_not":[
if (country == "no") { {"term":{"computerCodes.actors":"',actorid,'"}}
genitive <- 's' ]
definitive <- 'en' }
definitive_genitive <- 'ens' },
} else if (country == 'uk') { "highlight" : {
genitive <- '\'s' "fields" : {
definitive <- 's' "text" : {},
definitive_genitive <- '' "teaser" : {},
} else { "preteaser" : {},
genitive <- '' "title" : {},
definitive <- '' "subtitle" : {}
definitive_genitive <- '' },
} "number_of_fragments": 0,
"order": "none",
"type":"unified",
"fragment_size":0,
"pre_tags":"', pre_tags,'",
"post_tags": "',post_tags,'"
}
}'))
}
### Setting linguistic forms for each country ###
if (country == "no" | country == "dk") {
genitive <- 's'
definitive <- 'en'
definitive_genitive <- 'ens'
} else if (country == 'uk') {
genitive <- '\'s'
} else if (country == 'nl' | country == 'be') {
genitive <- 's'
}
if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader") { ### Generating queries for individuals (ministers, PM, Party leaders and MPs)
if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader" | actor$`_source.function` == "MP") {
lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,')') lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,')')
## Adding a separate AND clause for inclusion of only last name to highlight all occurences of last name ## Adding a separate AND clause for inclusion of only last name to highlight all occurences of last name
## Regardless of whether the last name hit is because of a minister name or a full name proximity hit ## Regardless of whether the last name hit is because of a minister name or a full name proximity hit
query_string <- paste0('(((\\"',
actor$`_source.firstName`, ### If country is belgium, check if there is an apostrophe in middlenames, if so, search for last name both with capitalized and lowercased last name
' ', if (country == 'be') {
actor$`_source.lastName`, if (T %in% str_detect(actor$`_source.middleNames`,"'")) {
'\\"~5 OR \\"', query_string <- paste0('(((\\"',
actor$`_source.firstName`, actor$`_source.firstName`,
' ', ' ',
actor$`_source.lastName`,genitive, actor$`_source.lastName`,
'\\"~5) AND ',lastname) '\\"~5 OR \\"',
} actor$`_source.firstName`,
if (actor$`_source.function` == "PartyLeader") { ' ',
query_string <- paste0(query_string,'))') tolower(actor$`_source.lastName`),
ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.partyId`),str_c, "_pl"))) '\\"~5 OR \\"',
} actor$`_source.firstName`,
if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM") { ' ',
## Modifiers are only applied to minister titles tolower(actor$`_source.lastName`),genitive,
capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title)) '\\"~5 OR \\"',
capital_gen <- unlist(lapply(capital, str_c, genitive)) actor$`_source.firstName`,
capital_def <- unlist(lapply(capital, str_c, definitive)) ' ',
capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive)) actor$`_source.lastName`,genitive,
gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive)) '\\"~5) AND ',lastname)
def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive)) }
defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive)) } else {
names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ') query_string <- paste0('(((\\"',
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))') actor$`_source.firstName`,
ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.ministryId`,actor$`_source.partyId`),str_c, "_min"))) ' ',
} actor$`_source.lastName`,
if (actor$`_source.function` == "Institution") { '\\"~5 OR \\"',
#uppercasing actor$`_source.firstName`,
firstup <- function(x) { ' ',
substr(x, 1, 1) <- toupper(substr(x, 1, 1)) actor$`_source.lastName`,genitive,
x '\\"~5) AND ',lastname)
} }
### If actor is a minister, generate minister search
if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM") {
capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title))
capital_gen <- unlist(lapply(capital, str_c, genitive))
gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive))
# If country is no or dk, search for definitive minister forms as well (including genitive forms)
if (country == "no" || country == "dk") {
capital_def <- unlist(lapply(capital, str_c, definitive))
capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive))
def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive))
defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive))
names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ')
} else {
names <- paste(c(capital,capital_gen,gen,actor$`_source.ministerSearch`), collapse = ' ')
}
# If country is uk, search for last name, minister names and minister titles (i.e. Johnson AND state AND secretary)
if (country == "uk") {
minister <- paste(c(str_to_title(actor$`_source.ministerName`),
actor$`_source.ministerName`,
str_c(str_to_title(actor$`_source.ministerName`),genitive),
str_c(actor$`_source.ministerName`,genitive)), collapse = ' ')
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND (',unlist(minister),')))')
}
# If country is nl or be, add a requirement for Minister to the query
else if (country == "nl" | country == "be") {
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND ("Minister" OR "minister")))')
} else {
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))')
}
} else {
### Else, generate search for first/last name only (MPs and Party leaders, currently)
query_string <- paste0(query_string,'))')
}
ids <- list(c(actor$`_source.actorId`,str_c(actor$`_source.partyId`,'_a')))
actorid <- actor$`_source.actorId`
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
return(data.frame(query = query, ids = I(ids), stringsAsFactors = F))
}
### Query generation for party searches
if (actor$`_source.function` == "Party") {
actor$`_source.startDate` <- "2000-01-01" actor$`_source.startDate` <- "2000-01-01"
actor$`_source.endDate` <- "2099-01-01" actor$`_source.endDate` <- "2099-01-01"
if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) { if (nchar(actor$`_source.partyNameSearchShort`[[1]]) > 0) {
upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup)) # If uk, no or dk, search for both regular abbreviations, and genitive forms
upper <- c(upper, unlist(lapply(upper, str_c, genitive)), if (country == "uk" | country == "no" | country == "dk") {
unlist(lapply(upper, str_c, definitive)), gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive))
unlist(lapply(upper, str_c, definitive_genitive))) names <- paste(c(gen,actor$`_source.partyNameSearchShort`), collapse = '\\\" \\\"')
capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title)) } else {
capital <- c(capital, unlist(lapply(capital, str_c, genitive)), names <- paste(actor$`_source.partyNameSearchShort`, collapse = '\\\" \\\"')
unlist(lapply(capital, str_c, definitive)), }
unlist(lapply(capital, str_c, definitive_genitive))) # If no or dk, only keep genitive forms if the party abbreviation is longer than 1 character (2 including the genitive s itself)
base <- actor$`_source.institutionNameSearch` if (country == "dk" | country == "no") {
base <- c(base, unlist(lapply(base, str_c, genitive)), gen <- gen[which(nchar(gen) > 2)]
unlist(lapply(base, str_c, definitive)), }
unlist(lapply(base, str_c, definitive_genitive))) query_string <- paste0('(\\\"',names,'\\\")')
names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"') ids <- str_c(actor$`_source.partyId`,'_s')
query_string <- paste0('(\\"',names,'\\")') actorid <- str_c(actor$`_source.partyId`,'_s')
ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f"))) query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
query <- paste0('{"query": df1 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
}
}
]
} },',highlight,' }')
df1 <- data.frame(query = query, ids = I(ids), type = actor$`_source.function`, stringsAsFactors = F)
} }
if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) { if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup)) if (country == "uk" | country == "no" | country == "dk") {
upper <- c(upper, unlist(lapply(upper, str_c, genitive)), gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive))
unlist(lapply(upper, str_c, definitive)), names <- paste(c(gen,actor$`_source.partyNameSearch`), collapse = '\\\" \\\"')
unlist(lapply(upper, str_c, definitive_genitive))) } else {
capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title)) names <- paste(actor$`_source.partyNameSearch`, collapse = '\\\" \\\"')
capital <- c(capital, unlist(lapply(capital, str_c, genitive)), }
unlist(lapply(capital, str_c, definitive)), query_string <- paste0('(\\\"',names,'\\\")')
unlist(lapply(capital, str_c, definitive_genitive))) ids <- str_c(actor$`_source.partyId`,'_f')
base <- actor$`_source.institutionNameSearchShort` actorid <- str_c(actor$`_source.partyId`,'_f')
base <- c(base, unlist(lapply(base, str_c, genitive)), query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
unlist(lapply(base, str_c, definitive)), if (country == 'uk') {
unlist(lapply(base, str_c, definitive_genitive))) df2 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"') } else {
query_string <- paste0('(\\"',names,'\\")') df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s"))) }
query <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
}
}
]
} },',highlight,' }')
df2 <- data.frame(query = query, ids = I(ids), type = actor$`_source.function`, stringsAsFactors = F)
} }
if (exists('df1') == T & exists('df2') == T) { if (exists('df1') == T & exists('df2') == T) {
return(bind_rows(df1,df2)) return(bind_rows(df1,df2))
@ -149,63 +186,61 @@ if (country == "no") {
return(df2) return(df2)
} }
} }
if (actor$`_source.function` == "Party") { ### Institution function currently not used
actor$`_source.startDate` <- "2000-01-01" # if (actor$`_source.function` == "Institution") {
actor$`_source.endDate` <- "2099-01-01" # #uppercasing
names <- paste(c(unlist(actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"') # firstup <- function(x) {
query_string <- paste0('(\\"',names,'\\")') # substr(x, 1, 1) <- toupper(substr(x, 1, 1))
query <- paste0('{"query": # x
{"bool": {"filter":[{"term":{"country":"',country,'"}}, # }
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}}, # actor$`_source.startDate` <- "2000-01-01"
{"query_string" : { # actor$`_source.endDate` <- "2099-01-01"
"default_operator" : "OR", # if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
"allow_leading_wildcard" : "false", # upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
"fields": ["text","teaser","preteaser","title","subtitle"], # upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
"query" : "', query_string,'" # unlist(lapply(upper, str_c, definitive)),
} # unlist(lapply(upper, str_c, definitive_genitive)))
} # capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
] # capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
} },',highlight,' }') # unlist(lapply(capital, str_c, definitive)),
ids <- c(toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p")))) # unlist(lapply(capital, str_c, definitive_genitive)))
# base <- actor$`_source.institutionNameSearch`
if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) { # base <- c(base, unlist(lapply(base, str_c, genitive)),
names <- paste(c(unlist(actor$`_source.partyNameSearch`)), collapse = '\\" \\"') # unlist(lapply(base, str_c, definitive)),
query_string <- paste0('(\\"',names,'\\")') # unlist(lapply(base, str_c, definitive_genitive)))
query2 <- paste0('{"query": # names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
{"bool": {"filter":[{"term":{"country":"',country,'"}}, # query_string <- paste0('(\\"',names,'\\")')
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}}, # ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
{"query_string" : { # actorid <- str_c(actor$`_source.institutionId`,'_f')
"default_operator" : "OR", # query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
"allow_leading_wildcard" : "false", # df1 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
"fields": ["text","teaser","preteaser","title","subtitle"], # }
"query" : "', query_string,'" # if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
} # upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
} # upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
] # unlist(lapply(upper, str_c, definitive)),
} },',highlight,' }') # unlist(lapply(upper, str_c, definitive_genitive)))
# capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
# capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
ids <- c(ids, toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p")))) # unlist(lapply(capital, str_c, definitive)),
query <- c(query, query2) # unlist(lapply(capital, str_c, definitive_genitive)))
fn <- c('PartyAbbreviation','Party') # base <- actor$`_source.institutionNameSearchShort`
} else { # base <- c(base, unlist(lapply(base, str_c, genitive)),
fn <- c('PartyAbbreviation') # unlist(lapply(base, str_c, definitive)),
} # unlist(lapply(base, str_c, definitive_genitive)))
return(data.frame(query = query, ids = I(ids), type = fn, prefix = actor$`_source.searchAnd`, postfix = actor$`_source.searchAndNot`, stringsAsFactors = F)) # names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
} # query_string <- paste0('(\\"',names,'\\")')
# ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
query <- paste0('{"query": # actorid <- str_c(actor$`_source.institutionId`,'_s')
{"bool": {"filter":[{"term":{"country":"',country,'"}}, # query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}}, # df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
{"query_string" : { # }
"default_operator" : "OR", # if (exists('df1') == T & exists('df2') == T) {
"allow_leading_wildcard" : "false", # return(bind_rows(df1,df2))
"fields": ["text","teaser","preteaser","title","subtitle"], # } else if (exists('df1') == T) {
"query" : "', query_string,'" # return(df1)
} # } else if (exists('df2') == T) {
} # return(df2)
] # }
} },',highlight,' }') # }
fn <- actor$`_source.function`
return(data.frame(query = query, ids = I(ids), type = fn, stringsAsFactors = F))
} }

@ -4,8 +4,8 @@
\alias{actorizer} \alias{actorizer}
\title{Updater function for elasticizer: Conduct actor searches} \title{Updater function for elasticizer: Conduct actor searches}
\usage{ \usage{
actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super,
es_super, ver) ver)
} }
\arguments{ \arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
@ -31,5 +31,5 @@ As this is a nested function used within elasticizer, there is no return output
Updater function for elasticizer: Conduct actor searches Updater function for elasticizer: Conduct actor searches
} }
\examples{ \examples{
actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super) actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
} }

@ -4,14 +4,16 @@
\alias{query_gen_actors} \alias{query_gen_actors}
\title{Generate actor search queries based on data in actor db} \title{Generate actor search queries based on data in actor db}
\usage{ \usage{
query_gen_actors(actor, country, identifier) query_gen_actors(actor, country, pre_tags, post_tags)
} }
\arguments{ \arguments{
\item{actor}{A row from the output of elasticizer() when run on the 'actor' index} \item{actor}{A row from the output of elasticizer() when run on the 'actor' index}
\item{country}{2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.} \item{country}{2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.}
\item{identifier}{Identifier used to mark hits in the text, identifiers are prepended before the actual hit} \item{pre_tags}{Highlighter pre-tag}
\item{post_tags}{Highlighter post-tag}
} }
\value{ \value{
A data frame containing the queries, related actor ids and actor function A data frame containing the queries, related actor ids and actor function

Loading…
Cancel
Save