@ -3,7 +3,8 @@
#' Generate actor search queries based on data in actor db
#' @param actor A row from the output of elasticizer() when run on the 'actor' index
#' @param country 2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.
#' @param identifier Identifier used to mark hits in the text, identifiers are prepended before the actual hit
#' @param pre_tags Highlighter pre-tag
#' @param post_tags Highlighter post-tag
#' @return A data frame containing the queries, related actor ids and actor function
#' @export
#' @examples
@ -12,8 +13,27 @@
#################################################################################################
#################################### Actor search query generator ###############################
#################################################################################################
query_gen_actors <- function ( actor , country , identifier ) {
highlight <- paste0 ( ' " h i g h l i g h t " : {
query_gen_actors <- function ( actor , country , pre_tags , post_tags ) {
generator <- function ( country , startdate , enddate , querystring , pre_tags , post_tags , actorid ) {
return ( paste0 ( ' { " q u e r y " :
{ " bool" : {
" filter" : [
{ " term" : { " country" : " ',country,'" } } ,
{ " range" : { " publication_date" : { " gte" : " ',startdate,'" , " lte" : " ',enddate,'" } } } ,
{ " query_string" : {
" default_operator" : " OR" ,
" allow_leading_wildcard" : " false" ,
" fields" : [ " text" , " teaser" , " preteaser" , " title" , " subtitle" ] ,
" query" : " ', querystring,'"
}
}
] ,
" must_not" : [
{ " term" : { " computerCodes.actors" : " ',actorid,'" } }
]
}
} ,
" highlight" : {
" fields" : {
" text" : { } ,
" teaser" : { } ,
@ -25,27 +45,31 @@ query_gen_actors <- function(actor, country, identifier) {
" order" : " none" ,
" type" : " unified" ,
" fragment_size" : 0 ,
" pre_tags" : " ',identifier,'" ,
" post_tags" : " "
} ' )
if ( country == " no" ) {
" pre_tags" : " ', pre_tags,'" ,
" post_tags" : " ',post_tags,'"
}
} ' ) )
}
### Setting linguistic forms for each country ###
if ( country == " no" | country == " dk" ) {
genitive <- ' s'
definitive <- ' en'
definitive_genitive <- ' ens'
} else if ( country == ' uk' ) {
} else if ( country == ' uk' ) {
genitive <- ' \'s'
definitive <- ' s'
definitive_genitive <- ' '
} else {
genitive <- ' '
definitive <- ' '
definitive_genitive <- ' '
}
} else if ( country == ' nl' | country == ' be' ) {
genitive <- ' s'
}
if ( actor $ `_source.function` == " Min" | actor $ `_source.function` == " PM" | actor $ `_source.function` == " PartyLeader" ) {
### Generating queries for individuals (ministers, PM, Party leaders and MPs)
if ( actor $ `_source.function` == " Minister" | actor $ `_source.function` == " PM" | actor $ `_source.function` == " PartyLeader" | actor $ `_source.function` == " MP" ) {
lastname <- paste0 ( ' (' , actor $ `_source.lastName` , ' OR ' , actor $ `_source.lastName` , genitive , ' )' )
## Adding a separate AND clause for inclusion of only last name to highlight all occurences of last name
## Regardless of whether the last name hit is because of a minister name or a full name proximity hit
### If country is belgium, check if there is an apostrophe in middlenames, if so, search for last name both with capitalized and lowercased last name
if ( country == ' be' ) {
if ( T %in% str_detect ( actor $ `_source.middleNames` , " '" ) ) {
query_string <- paste0 ( ' (((\\"' ,
actor $ `_source.firstName` ,
' ' ,
@ -53,93 +77,106 @@ if (country == "no") {
' \\"~5 OR \\"' ,
actor $ `_source.firstName` ,
' ' ,
tolower ( actor $ `_source.lastName` ) ,
' \\"~5 OR \\"' ,
actor $ `_source.firstName` ,
' ' ,
tolower ( actor $ `_source.lastName` ) , genitive ,
' \\"~5 OR \\"' ,
actor $ `_source.firstName` ,
' ' ,
actor $ `_source.lastName` , genitive ,
' \\"~5) AND ' , lastname )
}
if ( actor $ `_source.function` == " PartyLeader" ) {
query_string <- paste0 ( query_string , ' ))' )
ids <- toJSON ( unlist ( lapply ( c ( actor $ `_source.actorId` , actor $ `_source.partyId` ) , str_c , " _pl" ) ) )
} else {
query_string <- paste0 ( ' (((\\"' ,
actor $ `_source.firstName` ,
' ' ,
actor $ `_source.lastName` ,
' \\"~5 OR \\"' ,
actor $ `_source.firstName` ,
' ' ,
actor $ `_source.lastName` , genitive ,
' \\"~5) AND ' , lastname )
}
if ( actor $ `_source.function` == " Min" | actor $ `_source.function` == " PM" ) {
## Modifiers are only applied to minister titles
### If actor is a minister, generate minister search
if ( actor $ `_source.function` == " Minister" | actor $ `_source.function` == " PM" ) {
capital <- unlist ( lapply ( actor $ `_source.ministerSearch` , str_to_title ) )
capital_gen <- unlist ( lapply ( capital , str_c , genitive ) )
gen <- unlist ( lapply ( actor $ `_source.ministerSearch` , str_c , genitive ) )
# If country is no or dk, search for definitive minister forms as well (including genitive forms)
if ( country == " no" || country == " dk" ) {
capital_def <- unlist ( lapply ( capital , str_c , definitive ) )
capital_defgen <- unlist ( lapply ( capital , str_c , definitive_genitive ) )
gen <- unlist ( lapply ( actor $ `_source.ministerSearch` , str_c , genitive ) )
def <- unlist ( lapply ( actor $ `_source.ministerSearch` , str_c , definitive ) )
defgen <- unlist ( lapply ( actor $ `_source.ministerSearch` , str_c , definitive_genitive ) )
names <- paste ( c ( capital , capital_gen , gen , capital_def , def , defgen , capital_defgen ) , collapse = ' ' )
} else {
names <- paste ( c ( capital , capital_gen , gen , actor $ `_source.ministerSearch` ) , collapse = ' ' )
}
# If country is uk, search for last name, minister names and minister titles (i.e. Johnson AND state AND secretary)
if ( country == " uk" ) {
minister <- paste ( c ( str_to_title ( actor $ `_source.ministerName` ) ,
actor $ `_source.ministerName` ,
str_c ( str_to_title ( actor $ `_source.ministerName` ) , genitive ) ,
str_c ( actor $ `_source.ministerName` , genitive ) ) , collapse = ' ' )
query_string <- paste0 ( query_string , ' ) OR (' , lastname , ' AND (' , names , ' ) AND (' , unlist ( minister ) , ' )))' )
}
# If country is nl or be, add a requirement for Minister to the query
else if ( country == " nl" | country == " be" ) {
query_string <- paste0 ( query_string , ' ) OR (' , lastname , ' AND (' , names , ' ) AND ("Minister" OR "minister")))' )
} else {
query_string <- paste0 ( query_string , ' ) OR (' , lastname , ' AND (' , names , ' )))' )
ids <- toJSON ( unlist ( lapply ( c ( actor $ `_source.actorId` , actor $ `_source.ministryId` , actor $ `_source.partyId` ) , str_c , " _min" ) ) )
}
if ( actor $ `_source.function` == " Institution" ) {
#uppercasing
firstup <- function ( x ) {
substr ( x , 1 , 1 ) <- toupper ( substr ( x , 1 , 1 ) )
x
} else {
### Else, generate search for first/last name only (MPs and Party leaders, currently)
query_string <- paste0 ( query_string , ' ))' )
}
ids <- list ( c ( actor $ `_source.actorId` , str_c ( actor $ `_source.partyId` , ' _a' ) ) )
actorid <- actor $ `_source.actorId`
query <- generator ( country , actor $ `_source.startDate` , actor $ `_source.endDate` , query_string , pre_tags , post_tags , actorid )
return ( data.frame ( query = query , ids = I ( ids ) , stringsAsFactors = F ) )
}
### Query generation for party searches
if ( actor $ `_source.function` == " Party" ) {
actor $ `_source.startDate` <- " 2000-01-01"
actor $ `_source.endDate` <- " 2099-01-01"
if ( nchar ( actor $ `_source.institutionNameSearch` [ [1 ] ] ) > 0 ) {
upper <- unlist ( lapply ( actor $ `_source.institutionNameSearch` , firstup ) )
upper <- c ( upper , unlist ( lapply ( upper , str_c , genitive ) ) ,
unlist ( lapply ( upper , str_c , definitive ) ) ,
unlist ( lapply ( upper , str_c , definitive_genitive ) ) )
capital <- unlist ( lapply ( actor $ `_source.institutionNameSearch` , str_to_title ) )
capital <- c ( capital , unlist ( lapply ( capital , str_c , genitive ) ) ,
unlist ( lapply ( capital , str_c , definitive ) ) ,
unlist ( lapply ( capital , str_c , definitive_genitive ) ) )
base <- actor $ `_source.institutionNameSearch`
base <- c ( base , unlist ( lapply ( base , str_c , genitive ) ) ,
unlist ( lapply ( base , str_c , definitive ) ) ,
unlist ( lapply ( base , str_c , definitive_genitive ) ) )
names <- paste ( unique ( c ( upper , capital , base ) ) , collapse = ' \\" \\"' )
query_string <- paste0 ( ' (\\"' , names , ' \\")' )
ids <- toJSON ( unlist ( lapply ( c ( actor $ `_source.institutionId` ) , str_c , " _f" ) ) )
query <- paste0 ( ' { " q u e r y " :
{ " bool" : { " filter" : [ { " term" : { " country" : " ',country,'" } } ,
{ " range" : { " publication_date" : { " gte" : " ',actor$`_source.startDate`,'" , " lte" : " ',actor$`_source.endDate`,'" } } } ,
{ " query_string" : {
" default_operator" : " OR" ,
" allow_leading_wildcard" : " false" ,
" fields" : [ " text" , " teaser" , " preteaser" , " title" , " subtitle" ] ,
" query" : " ', query_string,'"
if ( nchar ( actor $ `_source.partyNameSearchShort` [ [1 ] ] ) > 0 ) {
# If uk, no or dk, search for both regular abbreviations, and genitive forms
if ( country == " uk" | country == " no" | country == " dk" ) {
gen <- unlist ( lapply ( actor $ `_source.partyNameSearchShort` , str_c , genitive ) )
names <- paste ( c ( gen , actor $ `_source.partyNameSearchShort` ) , collapse = ' \\\" \\\"' )
} else {
names <- paste ( actor $ `_source.partyNameSearchShort` , collapse = ' \\\" \\\"' )
}
# If no or dk, only keep genitive forms if the party abbreviation is longer than 1 character (2 including the genitive s itself)
if ( country == " dk" | country == " no" ) {
gen <- gen [which ( nchar ( gen ) > 2 ) ]
}
]
} } , ' ,highlight,' } ' )
df1 <- data.frame ( query = query , ids = I ( ids ) , type = actor $ `_source.function` , stringsAsFactors = F )
}
if ( nchar ( actor $ `_source.institutionNameSearchShort` [ [1 ] ] ) > 0 ) {
upper <- unlist ( lapply ( actor $ `_source.institutionNameSearchShort` , firstup ) )
upper <- c ( upper , unlist ( lapply ( upper , str_c , genitive ) ) ,
unlist ( lapply ( upper , str_c , definitive ) ) ,
unlist ( lapply ( upper , str_c , definitive_genitive ) ) )
capital <- unlist ( lapply ( actor $ `_source.institutionNameSearchShort` , str_to_title ) )
capital <- c ( capital , unlist ( lapply ( capital , str_c , genitive ) ) ,
unlist ( lapply ( capital , str_c , definitive ) ) ,
unlist ( lapply ( capital , str_c , definitive_genitive ) ) )
base <- actor $ `_source.institutionNameSearchShort`
base <- c ( base , unlist ( lapply ( base , str_c , genitive ) ) ,
unlist ( lapply ( base , str_c , definitive ) ) ,
unlist ( lapply ( base , str_c , definitive_genitive ) ) )
names <- paste ( unique ( c ( upper , capital , base ) ) , collapse = ' \\" \\"' )
query_string <- paste0 ( ' (\\"' , names , ' \\")' )
ids <- toJSON ( unlist ( lapply ( c ( actor $ `_source.institutionId` ) , str_c , " _s" ) ) )
query <- paste0 ( ' { " q u e r y " :
{ " bool" : { " filter" : [ { " term" : { " country" : " ',country,'" } } ,
{ " range" : { " publication_date" : { " gte" : " ',actor$`_source.startDate`,'" , " lte" : " ',actor$`_source.endDate`,'" } } } ,
{ " query_string" : {
" default_operator" : " OR" ,
" allow_leading_wildcard" : " false" ,
" fields" : [ " text" , " teaser" , " preteaser" , " title" , " subtitle" ] ,
" query" : " ', query_string,'"
query_string <- paste0 ( ' (\\\"' , names , ' \\\")' )
ids <- str_c ( actor $ `_source.partyId` , ' _s' )
actorid <- str_c ( actor $ `_source.partyId` , ' _s' )
query <- generator ( country , actor $ `_source.startDate` , actor $ `_source.endDate` , query_string , pre_tags , post_tags , actorid )
df1 <- data.frame ( query = query , ids = I ( ids ) , prefix = actor $ `_source.notPrecededBy` , postfix = actor $ `_source.notFollowedBy` , stringsAsFactors = F )
}
if ( nchar ( actor $ `_source.partyNameSearch` [ [1 ] ] ) > 0 ) {
if ( country == " uk" | country == " no" | country == " dk" ) {
gen <- unlist ( lapply ( actor $ `_source.partyNameSearch` , str_c , genitive ) )
names <- paste ( c ( gen , actor $ `_source.partyNameSearch` ) , collapse = ' \\\" \\\"' )
} else {
names <- paste ( actor $ `_source.partyNameSearch` , collapse = ' \\\" \\\"' )
}
query_string <- paste0 ( ' (\\\"' , names , ' \\\")' )
ids <- str_c ( actor $ `_source.partyId` , ' _f' )
actorid <- str_c ( actor $ `_source.partyId` , ' _f' )
query <- generator ( country , actor $ `_source.startDate` , actor $ `_source.endDate` , query_string , pre_tags , post_tags , actorid )
if ( country == ' uk' ) {
df2 <- data.frame ( query = query , ids = I ( ids ) , prefix = actor $ `_source.notPrecededBy` , postfix = actor $ `_source.notFollowedBy` , stringsAsFactors = F )
} else {
df2 <- data.frame ( query = query , ids = I ( ids ) , stringsAsFactors = F )
}
]
} } , ' ,highlight,' } ' )
df2 <- data.frame ( query = query , ids = I ( ids ) , type = actor $ `_source.function` , stringsAsFactors = F )
}
if ( exists ( ' df1' ) == T & exists ( ' df2' ) == T ) {
return ( bind_rows ( df1 , df2 ) )
@ -149,63 +186,61 @@ if (country == "no") {
return ( df2 )
}
}
if ( actor $ `_source.function` == " Party" ) {
actor $ `_source.startDate` <- " 2000-01-01"
actor $ `_source.endDate` <- " 2099-01-01"
names <- paste ( c ( unlist ( actor $ `_source.partyNameSearchShort` ) ) , collapse = ' \\" \\"' )
query_string <- paste0 ( ' (\\"' , names , ' \\")' )
query <- paste0 ( ' { " q u e r y " :
{ " bool" : { " filter" : [ { " term" : { " country" : " ',country,'" } } ,
{ " range" : { " publication_date" : { " gte" : " ',actor$`_source.startDate`,'" , " lte" : " ',actor$`_source.endDate`,'" } } } ,
{ " query_string" : {
" default_operator" : " OR" ,
" allow_leading_wildcard" : " false" ,
" fields" : [ " text" , " teaser" , " preteaser" , " title" , " subtitle" ] ,
" query" : " ', query_string,'"
}
}
]
} } , ' ,highlight,' } ' )
ids <- c ( toJSON ( unlist ( lapply ( c ( actor $ `_source.partyId` ) , str_c , " _p" ) ) ) )
if ( nchar ( actor $ `_source.partyNameSearch` [ [1 ] ] ) > 0 ) {
names <- paste ( c ( unlist ( actor $ `_source.partyNameSearch` ) ) , collapse = ' \\" \\"' )
query_string <- paste0 ( ' (\\"' , names , ' \\")' )
query2 <- paste0 ( ' { " q u e r y " :
{ " bool" : { " filter" : [ { " term" : { " country" : " ',country,'" } } ,
{ " range" : { " publication_date" : { " gte" : " ',actor$`_source.startDate`,'" , " lte" : " ',actor$`_source.endDate`,'" } } } ,
{ " query_string" : {
" default_operator" : " OR" ,
" allow_leading_wildcard" : " false" ,
" fields" : [ " text" , " teaser" , " preteaser" , " title" , " subtitle" ] ,
" query" : " ', query_string,'"
}
}
]
} } , ' ,highlight,' } ' )
ids <- c ( ids , toJSON ( unlist ( lapply ( c ( actor $ `_source.partyId` ) , str_c , " _p" ) ) ) )
query <- c ( query , query2 )
fn <- c ( ' PartyAbbreviation' , ' Party' )
} else {
fn <- c ( ' PartyAbbreviation' )
}
return ( data.frame ( query = query , ids = I ( ids ) , type = fn , prefix = actor $ `_source.searchAnd` , postfix = actor $ `_source.searchAndNot` , stringsAsFactors = F ) )
}
query <- paste0 ( ' { " q u e r y " :
{ " bool" : { " filter" : [ { " term" : { " country" : " ',country,'" } } ,
{ " range" : { " publication_date" : { " gte" : " ',actor$`_source.startDate`,'" , " lte" : " ',actor$`_source.endDate`,'" } } } ,
{ " query_string" : {
" default_operator" : " OR" ,
" allow_leading_wildcard" : " false" ,
" fields" : [ " text" , " teaser" , " preteaser" , " title" , " subtitle" ] ,
" query" : " ', query_string,'"
}
}
]
} } , ' ,highlight,' } ' )
fn <- actor $ `_source.function`
return ( data.frame ( query = query , ids = I ( ids ) , type = fn , stringsAsFactors = F ) )
### Institution function currently not used
# if (actor$`_source.function` == "Institution") {
# #uppercasing
# firstup <- function(x) {
# substr(x, 1, 1) <- toupper(substr(x, 1, 1))
# x
# }
# actor$`_source.startDate` <- "2000-01-01"
# actor$`_source.endDate` <- "2099-01-01"
# if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
# upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
# upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
# unlist(lapply(upper, str_c, definitive)),
# unlist(lapply(upper, str_c, definitive_genitive)))
# capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
# capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
# unlist(lapply(capital, str_c, definitive)),
# unlist(lapply(capital, str_c, definitive_genitive)))
# base <- actor$`_source.institutionNameSearch`
# base <- c(base, unlist(lapply(base, str_c, genitive)),
# unlist(lapply(base, str_c, definitive)),
# unlist(lapply(base, str_c, definitive_genitive)))
# names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
# query_string <- paste0('(\\"',names,'\\")')
# ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
# actorid <- str_c(actor$`_source.institutionId`,'_f')
# query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
# df1 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
# }
# if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
# upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
# upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
# unlist(lapply(upper, str_c, definitive)),
# unlist(lapply(upper, str_c, definitive_genitive)))
# capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
# capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
# unlist(lapply(capital, str_c, definitive)),
# unlist(lapply(capital, str_c, definitive_genitive)))
# base <- actor$`_source.institutionNameSearchShort`
# base <- c(base, unlist(lapply(base, str_c, genitive)),
# unlist(lapply(base, str_c, definitive)),
# unlist(lapply(base, str_c, definitive_genitive)))
# names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
# query_string <- paste0('(\\"',names,'\\")')
# ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
# actorid <- str_c(actor$`_source.institutionId`,'_s')
# query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
# df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
# }
# if (exists('df1') == T & exists('df2') == T) {
# return(bind_rows(df1,df2))
# } else if (exists('df1') == T) {
# return(df1)
# } else if (exists('df2') == T) {
# return(df2)
# }
# }
}