actorizer, out_parser: switched from mclapply to future_lapply and removed windows-specific code from out_parser

query_gen_actors: rewritten minister queries to only use proximity queries
master
Erik de Vries 5 years ago
parent d0601d2aa7
commit 060751237b

@ -14,7 +14,8 @@
#' @export #' @export
#' @examples #' @examples
#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super) #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = detectCores()) { actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) {
plan(multiprocess, workers = cores)
### Function to filter out false positives using regex ### Function to filter out false positives using regex
exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) { exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
@ -127,10 +128,10 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
postfix[postfix==''] <- NA postfix[postfix==''] <- NA
pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags) pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags) post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = cores) out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)
# ids <- fromJSON(ids) # ids <- fromJSON(ids)
updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer,
out = out, out = out,
ids = ids, ids = ids,
postfix = postfix, postfix = postfix,
@ -138,8 +139,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
pre_tags_regex = pre_tags_regex, pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags, pre_tags = pre_tags,
post_tags_regex = post_tags_regex, post_tags_regex = post_tags_regex,
post_tags = post_tags, post_tags = post_tags))
mc.cores = cores))
if (nrow(updates) == 0) { if (nrow(updates) == 0) {
print("Nothing to update for this batch") print("Nothing to update for this batch")
return(NULL) return(NULL)

@ -12,7 +12,8 @@
################################################################################################# #################################################################################################
#################################### Parser function for output fields ########################## #################################### Parser function for output fields ##########################
################################################################################################# #################################################################################################
out_parser <- function(out, field, clean = F, cores = detectCores()) { out_parser <- function(out, field, clean = F, cores = 1) {
plan(multiprocess, workers = cores)
fncols <- function(data, cname) { fncols <- function(data, cname) {
add <-cname[!cname%in%names(data)] add <-cname[!cname%in%names(data)]
@ -78,10 +79,5 @@ out_parser <- function(out, field, clean = F, cores = detectCores()) {
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . }
return(doc) return(doc)
} }
if (Sys.info()[['sysname']] == "Windows") { out <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), par_parser, out = out, clean = clean, field = field))
cores <- 1
} else {
cores <- cores
}
out <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_parser, out = out, clean = clean, field = field, mc.cores = cores))
} }

@ -50,6 +50,11 @@ query_gen_actors <- function(actor, country, pre_tags, post_tags) {
} }
}')) }'))
} }
prox_gen <- function(row, grid) {
return(
paste0('\\"',grid[row,]$first,' ',grid[row,]$last,'\\"~',grid[row,]$prox)
)
}
### Setting linguistic forms for each country ### ### Setting linguistic forms for each country ###
if (country == "no" | country == "dk") { if (country == "no" | country == "dk") {
genitive <- 's' genitive <- 's'
@ -67,78 +72,56 @@ query_gen_actors <- function(actor, country, pre_tags, post_tags) {
## Regardless of whether the last name hit is because of a minister name or a full name proximity hit ## Regardless of whether the last name hit is because of a minister name or a full name proximity hit
### If country is belgium, check if there is an apostrophe in middlenames, if so, search for last name both with capitalized and lowercased last name ### If country is belgium, check if there is an apostrophe in middlenames, if so, search for last name both with capitalized and lowercased last name
if (country == 'be') { if (country == 'be' && T %in% str_detect(actor$`_source.middleNames`,"'")) {
if (T %in% str_detect(actor$`_source.middleNames`,"'")) { last_list <- c(actor$`_source.lastName`, str_c(actor$`_source.lastName`,genitive), tolower(actor$`_source.lastName`), str_c(tolower(actor$`_source.lastName`),genitive))
lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,' OR ',tolower(actor$`_source.lastName`),' OR ',tolower(actor$`_source.lastName`),genitive,')') grid <- crossing(first = actor$`_source.firstName`, last = last_list, prox = 5)
query_string <- paste0('(((\\"', fullname <- lapply(1:nrow(grid), prox_gen, grid = grid)
actor$`_source.firstName`, query_string <- paste0('((',
' ', paste0(unlist(fullname), collapse = ' OR '),') AND ',
actor$`_source.lastName`, paste0('(',paste0(unlist(last_list), collapse = ' OR '),')'))
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
tolower(actor$`_source.lastName`),
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
tolower(actor$`_source.lastName`),genitive,
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,genitive,
'\\"~5) AND ',lastname)
} else {
lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,')')
query_string <- paste0('(((\\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,genitive,
'\\"~5) AND ',lastname)
}
} else { } else {
lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,')') last_list <- c(actor$`_source.lastName`, str_c(actor$`_source.lastName`,genitive))
query_string <- paste0('(((\\"', grid <- crossing(first = actor$`_source.firstName`, last = last_list, prox = 5)
actor$`_source.firstName`, fullname <- lapply(1:nrow(grid), prox_gen, grid = grid)
' ', query_string <- paste0('((',
actor$`_source.lastName`, paste0(unlist(fullname), collapse = ' OR '),') AND ',
'\\"~5 OR \\"', paste0('(',paste0(unlist(last_list), collapse = ' OR '),')'))
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,genitive,
'\\"~5) AND ',lastname)
} }
### If actor is a minister, generate minister search ### If actor is a minister, generate minister search
if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM") { if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM") {
# If country is no or dk, search for minister policy names (eg likestillingsminister) in both undefined, defined and genitive forms
if (country == "no" || country == "dk") {
capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title)) capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title))
capital_gen <- unlist(lapply(capital, str_c, genitive)) capital_gen <- unlist(lapply(capital, str_c, genitive))
gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive)) gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive))
# If country is no or dk, search for definitive minister forms as well (including genitive forms)
if (country == "no" || country == "dk") {
capital_def <- unlist(lapply(capital, str_c, definitive)) capital_def <- unlist(lapply(capital, str_c, definitive))
capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive)) capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive))
def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive)) def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive))
defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive)) defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive))
names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ') names <- paste(unlist(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen)), collapse = '\\" \\"')
} else { query_string <- paste0(query_string,') OR (',lastname,' AND (\\"',unlist(names),'\\")))')
names <- paste(c(capital,capital_gen,gen,actor$`_source.ministerSearch`), collapse = ' ')
} }
# If country is uk, search for last name, minister names and minister titles (i.e. Johnson AND state AND secretary) # If country is uk, search for last name and minister name in proximity (e.g "secretary Johnson"~5)
if (country == "uk") { if (country == "uk") {
minister <- paste(c(str_to_title(actor$`_source.ministerName`), minister <- c(str_to_title(actor$`_source.ministerName`),
actor$`_source.ministerName`, actor$`_source.ministerName`,
str_c(str_to_title(actor$`_source.ministerName`),genitive), str_c(str_to_title(actor$`_source.ministerName`),genitive),
str_c(actor$`_source.ministerName`,genitive)), collapse = ' ') str_c(actor$`_source.ministerName`,genitive))
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND (',unlist(minister),')))') grid <- crossing(first = actor$`_source.lastName`, last = minister, prox = 5)
} else if (country == "nl" | country == "be") { # If country is nl or be, add a requirement for Minister to the query ministername <- lapply(1:nrow(grid), prox_gen, grid = grid)
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND (Minister OR minister)))') query_string <- paste0(query_string,') OR (',
} else { paste0(unlist(ministername), collapse= ' OR '),')')
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))') }
if (country == "nl" | country == "be") { # If country is nl or be, add a requirement for Minister to the query
minister <- c('Minister',
'minister',
str_c('Minister',genitive),
str_c('minister',genitive))
grid <- crossing(first = actor$`_source.lastName`, last = minister, prox = 5)
ministername <- lapply(1:nrow(grid), prox_gen, grid = grid)
query_string <- paste0(query_string,') OR (',
paste0(unlist(ministername), collapse= ' OR '),')')
} }
} else { ### Else, generate search for first/last name only (MPs and Party leaders, currently) } else { ### Else, generate search for first/last name only (MPs and Party leaders, currently)
query_string <- paste0(query_string,'))') query_string <- paste0(query_string,'))')
@ -157,15 +140,15 @@ query_gen_actors <- function(actor, country, pre_tags, post_tags) {
# If uk, no or dk, search for both regular abbreviations, and genitive forms # If uk, no or dk, search for both regular abbreviations, and genitive forms
if (country == "uk" | country == "no" | country == "dk") { if (country == "uk" | country == "no" | country == "dk") {
gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive)) gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive))
names <- paste(c(gen,actor$`_source.partyNameSearchShort`), collapse = '\\\" \\\"') names <- paste(unlist(c(gen,actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"')
} else { } else {
names <- paste(actor$`_source.partyNameSearchShort`, collapse = '\\\" \\\"') names <- paste(unlist(actor$`_source.partyNameSearchShort`), collapse = '\\" \\"')
} }
# If no or dk, only keep genitive forms if the party abbreviation is longer than 1 character (2 including the genitive s itself) # If no or dk, only keep genitive forms if the party abbreviation is longer than 1 character (2 including the genitive s itself)
if (country == "dk" | country == "no") { if (country == "dk" | country == "no") {
gen <- gen[which(nchar(gen) > 2)] gen <- gen[which(nchar(gen) > 2)]
} }
query_string <- paste0('(\\\"',names,'\\\")') query_string <- paste0('(\\"',unlist(names),'\\")')
ids <- str_c(actor$`_source.partyId`,'_s') ids <- str_c(actor$`_source.partyId`,'_s')
actorid <- str_c(actor$`_source.partyId`,'_s') actorid <- str_c(actor$`_source.partyId`,'_s')
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid) query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
@ -174,11 +157,11 @@ query_gen_actors <- function(actor, country, pre_tags, post_tags) {
if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) { if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
if (country == "uk" | country == "no" | country == "dk") { if (country == "uk" | country == "no" | country == "dk") {
gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive)) gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive))
names <- paste(c(gen,actor$`_source.partyNameSearch`), collapse = '\\\" \\\"') names <- paste(unlist(c(gen,actor$`_source.partyNameSearch`)), collapse = '\\" \\"')
} else { } else {
names <- paste(actor$`_source.partyNameSearch`, collapse = '\\\" \\\"') names <- paste(unlist(actor$`_source.partyNameSearch`), collapse = '\\" \\"')
} }
query_string <- paste0('(\\\"',names,'\\\")') query_string <- paste0('(\\"',unlist(names),'\\")')
ids <- str_c(actor$`_source.partyId`,'_f') ids <- str_c(actor$`_source.partyId`,'_f')
actorid <- str_c(actor$`_source.partyId`,'_f') actorid <- str_c(actor$`_source.partyId`,'_f')
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid) query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)

Loading…
Cancel
Save