From a1b6c6a7cb55403c10323982b215b41085ea1e9e Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Tue, 23 Apr 2019 16:43:11 +0200 Subject: [PATCH] actorizer, query_gen_actors: revamped actor searches entirely elasticizer: updated script for use with ES 7.x --- NAMESPACE | 1 + R/actorizer.R | 93 +++++++--- R/elasticizer.R | 4 +- R/query_gen_actors.R | 395 ++++++++++++++++++++++------------------ man/actorizer.Rd | 6 +- man/query_gen_actors.Rd | 6 +- 6 files changed, 289 insertions(+), 216 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c8fe410..bf6270c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(dfm_gen) export(dupe_detect) export(elastic_update) export(elasticizer) +export(lemma_writer) export(merger) export(modelizer) export(query_gen_actors) diff --git a/R/actorizer.R b/R/actorizer.R index c9cfb93..6cbd1de 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -12,17 +12,19 @@ #' @return As this is a nested function used within elasticizer, there is no return output #' @export #' @examples -#' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super) -actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, es_super, ver) { +#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super) +actorizer <- function(out, localhost = F, ids, prefix, postfix, identifier, es_super, ver) { ### Function to filter out false positives using regex - exceptionizer <- function(id, ud, doc, markers, regex_identifier, prefix, postfix) { + exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) { min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence - max <- max+(length(split)*nchar(identifier)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6) + min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1)) + max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6) sentence <- str_sub(doc$merged, min, max) # Extract sentence from text + # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing - if (!str_detect(sentence, paste0(regex_identifier,postfix)) && !str_detect(sentence, paste0(prefix,regex_identifier))) { + if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) { return(id) } else { return(NULL) @@ -31,8 +33,16 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier ranger <- function(x, ud) { return(which((ud$start <= x) & (ud$end >= x))) } - sentencizer <- function(row, out, ids, prefix, postfix, identifier, type) { + sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) { doc <- out[row,] + if (nchar(doc$merged) > 990000) { + return( + data.frame( + err = T, + errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect" + ) + ) + } # Extracting ud output from document ud <- doc$`_source.ud`[[1]] %>% select(-one_of('exists')) %>% # Removing ud.exists variable @@ -48,11 +58,9 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier sentence_count <- max(ud$sentence_id) # Number of sentences in article actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor - actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor - actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor - # Conducting regex filtering on matches only when actor type is Party - if (type == "Party") { + # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply + if (!is.na(prefix) || prefix != '' || !is.na(postfix) || postfix != '') { ### If no pre or postfixes, match *not nothing* i.e. anything if (is.na(prefix) || prefix == '') { prefix = '$^' @@ -60,14 +68,27 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier if (is.na(postfix) || postfix == '') { postfix = '$^' } - sentence_ids <- lapply(actor_sentences, exceptionizer, ud = ud, doc = doc, markers = markers, regex_identifier = regex_identifier, prefix = prefix, postfix = postfix) + sentence_ids <- unlist(lapply(actor_sentences, + exceptionizer, + ud = ud, + doc = doc, + markers = markers, + pre_tags_regex = pre_tags_regex, + pre_tags = pre_tags, + post_tags_regex = post_tags_regex, + post_tags = post_tags, + prefix = prefix, + postfix = postfix)) } else { sentence_ids <- actor_sentences } # Generating nested sentence start and end positions for actor sentences ud <- ud %>% - filter(sentence_id %in% sentence_ids) %>% + filter(sentence_id %in% sentence_ids) + actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor + actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor + ud <- ud %>% group_by(sentence_id) %>% summarise ( sentence_start = as.integer(min(start)), @@ -80,28 +101,42 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier sentence_start = list(sentence_start), sentence_end = list(sentence_end) ) - - return( - data.frame(ud, # Sentence id, start and end position for actor sentences - actor_start = I(list(actor_start)), # List of actor ud token start positions - actor_end = I(list(actor_end)), # List of actor ud token end positions - occ = length(unique(actor_sentences)), # Number of sentences in which actor occurs - prom = length(unique(actor_sentences))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences) - rel_first = 1-(min(actor_sentences)/sentence_count), # Relative position of first occurence at sentence level - first = min(actor_sentences), # First sentence in which actor is mentioned - ids = I(list(ids)) # List of actor ids + if (length(ud$doc_id > 0)) { + return( + data.frame(ud, # Sentence id, start and end position for actor sentences + actor_start = I(list(actor_start)), # List of actor ud token start positions + actor_end = I(list(actor_end)), # List of actor ud token end positions + occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs + prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences) + rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level + first = min(sentence_ids), # First sentence in which actor is mentioned + ids = I(list(ids)) # List of actor ids + ) ) - ) + } else { + return(NULL) + } + } out <- mamlr:::out_parser(out, field = 'highlight', clean = F) - offsetter <- function(x, identifier) { - return(x-((row(x)-1)*nchar(identifier))) + offsetter <- function(x, pre_tags, post_tags) { + return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags)))) } - regex_identifier <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", identifier) - out$markers <- mclapply(str_locate_all(out$merged,coll(identifier)), offsetter, identifier = identifier, mc.cores = detectCores()) + pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags) + post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags) + out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = detectCores()) - ids <- fromJSON(ids) - updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, type = type, mc.cores = detectCores())) + # ids <- fromJSON(ids) + updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, + out = out, + ids = ids, + postfix = postfix, + prefix=prefix, + pre_tags_regex = pre_tags_regex, + pre_tags = pre_tags, + post_tags_regex = post_tags_regex, + post_tags = post_tags, + mc.cores = detectCores())) bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver) bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver)) return(elastic_update(bulk, es_super = es_super, localhost = localhost)) diff --git a/R/elasticizer.R b/R/elasticizer.R index 20a33e8..3bb3e52 100644 --- a/R/elasticizer.R +++ b/R/elasticizer.R @@ -68,11 +68,11 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw } } json <- fromJSON(res) - if (json$hits$total == 0) { + if (json$hits$total$value == 0) { return(json) } else { out <- jsonlite:::flatten(json$hits$hits) - total <- json$hits$total + total <- json$hits$total$value hits <- length(json$hits$hits) batch <- 1 print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.')) diff --git a/R/query_gen_actors.R b/R/query_gen_actors.R index f91e135..31712ee 100644 --- a/R/query_gen_actors.R +++ b/R/query_gen_actors.R @@ -3,7 +3,8 @@ #' Generate actor search queries based on data in actor db #' @param actor A row from the output of elasticizer() when run on the 'actor' index #' @param country 2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc. -#' @param identifier Identifier used to mark hits in the text, identifiers are prepended before the actual hit +#' @param pre_tags Highlighter pre-tag +#' @param post_tags Highlighter post-tag #' @return A data frame containing the queries, related actor ids and actor function #' @export #' @examples @@ -12,134 +13,170 @@ ################################################################################################# #################################### Actor search query generator ############################### ################################################################################################# -query_gen_actors <- function(actor, country, identifier) { - highlight <- paste0('"highlight" : { - "fields" : { - "text" : {}, - "teaser" : {}, - "preteaser" : {}, - "title" : {}, - "subtitle" : {} - }, - "number_of_fragments": 0, - "order": "none", - "type":"unified", - "fragment_size":0, - "pre_tags":"',identifier,'", - "post_tags": "" -}') -if (country == "no") { - genitive <- 's' - definitive <- 'en' - definitive_genitive <- 'ens' -} else if (country == 'uk') { - genitive <- '\'s' - definitive <- 's' - definitive_genitive <- '' -} else { - genitive <- '' - definitive <- '' - definitive_genitive <- '' -} +query_gen_actors <- function(actor, country, pre_tags, post_tags) { + generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) { + return(paste0('{"query": + {"bool": { + "filter":[ + {"term":{"country":"',country,'"}}, + {"range":{"publication_date":{"gte":"',startdate,'","lte":"',enddate,'"}}}, + {"query_string" : { + "default_operator" : "OR", + "allow_leading_wildcard" : "false", + "fields": ["text","teaser","preteaser","title","subtitle"], + "query" : "', querystring,'" + } + } + ], + "must_not":[ + {"term":{"computerCodes.actors":"',actorid,'"}} + ] + } + }, + "highlight" : { + "fields" : { + "text" : {}, + "teaser" : {}, + "preteaser" : {}, + "title" : {}, + "subtitle" : {} + }, + "number_of_fragments": 0, + "order": "none", + "type":"unified", + "fragment_size":0, + "pre_tags":"', pre_tags,'", + "post_tags": "',post_tags,'" + } + }')) + } + ### Setting linguistic forms for each country ### + if (country == "no" | country == "dk") { + genitive <- 's' + definitive <- 'en' + definitive_genitive <- 'ens' + } else if (country == 'uk') { + genitive <- '\'s' + } else if (country == 'nl' | country == 'be') { + genitive <- 's' + } - if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader") { + ### Generating queries for individuals (ministers, PM, Party leaders and MPs) + if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader" | actor$`_source.function` == "MP") { lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,')') ## Adding a separate AND clause for inclusion of only last name to highlight all occurences of last name ## Regardless of whether the last name hit is because of a minister name or a full name proximity hit - query_string <- paste0('(((\\"', - actor$`_source.firstName`, - ' ', - actor$`_source.lastName`, - '\\"~5 OR \\"', - actor$`_source.firstName`, - ' ', - actor$`_source.lastName`,genitive, - '\\"~5) AND ',lastname) - } - if (actor$`_source.function` == "PartyLeader") { - query_string <- paste0(query_string,'))') - ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.partyId`),str_c, "_pl"))) - } - if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM") { - ## Modifiers are only applied to minister titles - capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title)) - capital_gen <- unlist(lapply(capital, str_c, genitive)) - capital_def <- unlist(lapply(capital, str_c, definitive)) - capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive)) - gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive)) - def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive)) - defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive)) - names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ') - query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))') - ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.ministryId`,actor$`_source.partyId`),str_c, "_min"))) - } - if (actor$`_source.function` == "Institution") { - #uppercasing - firstup <- function(x) { - substr(x, 1, 1) <- toupper(substr(x, 1, 1)) - x + + ### If country is belgium, check if there is an apostrophe in middlenames, if so, search for last name both with capitalized and lowercased last name + if (country == 'be') { + if (T %in% str_detect(actor$`_source.middleNames`,"'")) { + query_string <- paste0('(((\\"', + actor$`_source.firstName`, + ' ', + actor$`_source.lastName`, + '\\"~5 OR \\"', + actor$`_source.firstName`, + ' ', + tolower(actor$`_source.lastName`), + '\\"~5 OR \\"', + actor$`_source.firstName`, + ' ', + tolower(actor$`_source.lastName`),genitive, + '\\"~5 OR \\"', + actor$`_source.firstName`, + ' ', + actor$`_source.lastName`,genitive, + '\\"~5) AND ',lastname) + } + } else { + query_string <- paste0('(((\\"', + actor$`_source.firstName`, + ' ', + actor$`_source.lastName`, + '\\"~5 OR \\"', + actor$`_source.firstName`, + ' ', + actor$`_source.lastName`,genitive, + '\\"~5) AND ',lastname) } + + + ### If actor is a minister, generate minister search + if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM") { + capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title)) + capital_gen <- unlist(lapply(capital, str_c, genitive)) + gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive)) + # If country is no or dk, search for definitive minister forms as well (including genitive forms) + if (country == "no" || country == "dk") { + capital_def <- unlist(lapply(capital, str_c, definitive)) + capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive)) + def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive)) + defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive)) + names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ') + } else { + names <- paste(c(capital,capital_gen,gen,actor$`_source.ministerSearch`), collapse = ' ') + } + # If country is uk, search for last name, minister names and minister titles (i.e. Johnson AND state AND secretary) + if (country == "uk") { + minister <- paste(c(str_to_title(actor$`_source.ministerName`), + actor$`_source.ministerName`, + str_c(str_to_title(actor$`_source.ministerName`),genitive), + str_c(actor$`_source.ministerName`,genitive)), collapse = ' ') + query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND (',unlist(minister),')))') + } + # If country is nl or be, add a requirement for Minister to the query + else if (country == "nl" | country == "be") { + query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND ("Minister" OR "minister")))') + } else { + query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))') + } + } else { + ### Else, generate search for first/last name only (MPs and Party leaders, currently) + query_string <- paste0(query_string,'))') + } + ids <- list(c(actor$`_source.actorId`,str_c(actor$`_source.partyId`,'_a'))) + actorid <- actor$`_source.actorId` + query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid) + return(data.frame(query = query, ids = I(ids), stringsAsFactors = F)) + } + ### Query generation for party searches + if (actor$`_source.function` == "Party") { actor$`_source.startDate` <- "2000-01-01" actor$`_source.endDate` <- "2099-01-01" - if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) { - upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup)) - upper <- c(upper, unlist(lapply(upper, str_c, genitive)), - unlist(lapply(upper, str_c, definitive)), - unlist(lapply(upper, str_c, definitive_genitive))) - capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title)) - capital <- c(capital, unlist(lapply(capital, str_c, genitive)), - unlist(lapply(capital, str_c, definitive)), - unlist(lapply(capital, str_c, definitive_genitive))) - base <- actor$`_source.institutionNameSearch` - base <- c(base, unlist(lapply(base, str_c, genitive)), - unlist(lapply(base, str_c, definitive)), - unlist(lapply(base, str_c, definitive_genitive))) - names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"') - query_string <- paste0('(\\"',names,'\\")') - ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f"))) - query <- paste0('{"query": - {"bool": {"filter":[{"term":{"country":"',country,'"}}, - {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}}, - {"query_string" : { - "default_operator" : "OR", - "allow_leading_wildcard" : "false", - "fields": ["text","teaser","preteaser","title","subtitle"], - "query" : "', query_string,'" - } - } - ] - } },',highlight,' }') - df1 <- data.frame(query = query, ids = I(ids), type = actor$`_source.function`, stringsAsFactors = F) + if (nchar(actor$`_source.partyNameSearchShort`[[1]]) > 0) { + # If uk, no or dk, search for both regular abbreviations, and genitive forms + if (country == "uk" | country == "no" | country == "dk") { + gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive)) + names <- paste(c(gen,actor$`_source.partyNameSearchShort`), collapse = '\\\" \\\"') + } else { + names <- paste(actor$`_source.partyNameSearchShort`, collapse = '\\\" \\\"') + } + # If no or dk, only keep genitive forms if the party abbreviation is longer than 1 character (2 including the genitive s itself) + if (country == "dk" | country == "no") { + gen <- gen[which(nchar(gen) > 2)] + } + query_string <- paste0('(\\\"',names,'\\\")') + ids <- str_c(actor$`_source.partyId`,'_s') + actorid <- str_c(actor$`_source.partyId`,'_s') + query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid) + df1 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F) } - if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) { - upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup)) - upper <- c(upper, unlist(lapply(upper, str_c, genitive)), - unlist(lapply(upper, str_c, definitive)), - unlist(lapply(upper, str_c, definitive_genitive))) - capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title)) - capital <- c(capital, unlist(lapply(capital, str_c, genitive)), - unlist(lapply(capital, str_c, definitive)), - unlist(lapply(capital, str_c, definitive_genitive))) - base <- actor$`_source.institutionNameSearchShort` - base <- c(base, unlist(lapply(base, str_c, genitive)), - unlist(lapply(base, str_c, definitive)), - unlist(lapply(base, str_c, definitive_genitive))) - names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"') - query_string <- paste0('(\\"',names,'\\")') - ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s"))) - query <- paste0('{"query": - {"bool": {"filter":[{"term":{"country":"',country,'"}}, - {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}}, - {"query_string" : { - "default_operator" : "OR", - "allow_leading_wildcard" : "false", - "fields": ["text","teaser","preteaser","title","subtitle"], - "query" : "', query_string,'" - } - } - ] - } },',highlight,' }') - df2 <- data.frame(query = query, ids = I(ids), type = actor$`_source.function`, stringsAsFactors = F) + if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) { + if (country == "uk" | country == "no" | country == "dk") { + gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive)) + names <- paste(c(gen,actor$`_source.partyNameSearch`), collapse = '\\\" \\\"') + } else { + names <- paste(actor$`_source.partyNameSearch`, collapse = '\\\" \\\"') + } + query_string <- paste0('(\\\"',names,'\\\")') + ids <- str_c(actor$`_source.partyId`,'_f') + actorid <- str_c(actor$`_source.partyId`,'_f') + query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid) + if (country == 'uk') { + df2 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F) + } else { + df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F) + } } if (exists('df1') == T & exists('df2') == T) { return(bind_rows(df1,df2)) @@ -149,63 +186,61 @@ if (country == "no") { return(df2) } } - if (actor$`_source.function` == "Party") { - actor$`_source.startDate` <- "2000-01-01" - actor$`_source.endDate` <- "2099-01-01" - names <- paste(c(unlist(actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"') - query_string <- paste0('(\\"',names,'\\")') - query <- paste0('{"query": - {"bool": {"filter":[{"term":{"country":"',country,'"}}, - {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}}, - {"query_string" : { - "default_operator" : "OR", - "allow_leading_wildcard" : "false", - "fields": ["text","teaser","preteaser","title","subtitle"], - "query" : "', query_string,'" - } - } - ] - } },',highlight,' }') - ids <- c(toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p")))) - - if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) { - names <- paste(c(unlist(actor$`_source.partyNameSearch`)), collapse = '\\" \\"') - query_string <- paste0('(\\"',names,'\\")') - query2 <- paste0('{"query": - {"bool": {"filter":[{"term":{"country":"',country,'"}}, - {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}}, - {"query_string" : { - "default_operator" : "OR", - "allow_leading_wildcard" : "false", - "fields": ["text","teaser","preteaser","title","subtitle"], - "query" : "', query_string,'" - } - } - ] - } },',highlight,' }') - - - ids <- c(ids, toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p")))) - query <- c(query, query2) - fn <- c('PartyAbbreviation','Party') - } else { - fn <- c('PartyAbbreviation') - } - return(data.frame(query = query, ids = I(ids), type = fn, prefix = actor$`_source.searchAnd`, postfix = actor$`_source.searchAndNot`, stringsAsFactors = F)) - } - - query <- paste0('{"query": - {"bool": {"filter":[{"term":{"country":"',country,'"}}, - {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}}, - {"query_string" : { - "default_operator" : "OR", - "allow_leading_wildcard" : "false", - "fields": ["text","teaser","preteaser","title","subtitle"], - "query" : "', query_string,'" - } - } - ] - } },',highlight,' }') - fn <- actor$`_source.function` - return(data.frame(query = query, ids = I(ids), type = fn, stringsAsFactors = F)) + ### Institution function currently not used + # if (actor$`_source.function` == "Institution") { + # #uppercasing + # firstup <- function(x) { + # substr(x, 1, 1) <- toupper(substr(x, 1, 1)) + # x + # } + # actor$`_source.startDate` <- "2000-01-01" + # actor$`_source.endDate` <- "2099-01-01" + # if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) { + # upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup)) + # upper <- c(upper, unlist(lapply(upper, str_c, genitive)), + # unlist(lapply(upper, str_c, definitive)), + # unlist(lapply(upper, str_c, definitive_genitive))) + # capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title)) + # capital <- c(capital, unlist(lapply(capital, str_c, genitive)), + # unlist(lapply(capital, str_c, definitive)), + # unlist(lapply(capital, str_c, definitive_genitive))) + # base <- actor$`_source.institutionNameSearch` + # base <- c(base, unlist(lapply(base, str_c, genitive)), + # unlist(lapply(base, str_c, definitive)), + # unlist(lapply(base, str_c, definitive_genitive))) + # names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"') + # query_string <- paste0('(\\"',names,'\\")') + # ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f"))) + # actorid <- str_c(actor$`_source.institutionId`,'_f') + # query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid) + # df1 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F) + # } + # if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) { + # upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup)) + # upper <- c(upper, unlist(lapply(upper, str_c, genitive)), + # unlist(lapply(upper, str_c, definitive)), + # unlist(lapply(upper, str_c, definitive_genitive))) + # capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title)) + # capital <- c(capital, unlist(lapply(capital, str_c, genitive)), + # unlist(lapply(capital, str_c, definitive)), + # unlist(lapply(capital, str_c, definitive_genitive))) + # base <- actor$`_source.institutionNameSearchShort` + # base <- c(base, unlist(lapply(base, str_c, genitive)), + # unlist(lapply(base, str_c, definitive)), + # unlist(lapply(base, str_c, definitive_genitive))) + # names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"') + # query_string <- paste0('(\\"',names,'\\")') + # ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s"))) + # actorid <- str_c(actor$`_source.institutionId`,'_s') + # query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid) + # df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F) + # } + # if (exists('df1') == T & exists('df2') == T) { + # return(bind_rows(df1,df2)) + # } else if (exists('df1') == T) { + # return(df1) + # } else if (exists('df2') == T) { + # return(df2) + # } + # } } diff --git a/man/actorizer.Rd b/man/actorizer.Rd index 06c60fb..c2cc681 100644 --- a/man/actorizer.Rd +++ b/man/actorizer.Rd @@ -4,8 +4,8 @@ \alias{actorizer} \title{Updater function for elasticizer: Conduct actor searches} \usage{ -actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, - es_super, ver) +actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super, + ver) } \arguments{ \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} @@ -31,5 +31,5 @@ As this is a nested function used within elasticizer, there is no return output Updater function for elasticizer: Conduct actor searches } \examples{ -actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super) +actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super) } diff --git a/man/query_gen_actors.Rd b/man/query_gen_actors.Rd index f780a94..5e2de37 100644 --- a/man/query_gen_actors.Rd +++ b/man/query_gen_actors.Rd @@ -4,14 +4,16 @@ \alias{query_gen_actors} \title{Generate actor search queries based on data in actor db} \usage{ -query_gen_actors(actor, country, identifier) +query_gen_actors(actor, country, pre_tags, post_tags) } \arguments{ \item{actor}{A row from the output of elasticizer() when run on the 'actor' index} \item{country}{2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.} -\item{identifier}{Identifier used to mark hits in the text, identifiers are prepended before the actual hit} +\item{pre_tags}{Highlighter pre-tag} + +\item{post_tags}{Highlighter post-tag} } \value{ A data frame containing the queries, related actor ids and actor function