actorizer: completely revamped the way actors are found

elasticizer: updated dump handling to create a dump for every batch, instead of one big file at the end
out_parser: streamlined code
query_gen_actors: only include relevant fields
ud_update: changed function parameters to work with files produced by the elasticizer dump function
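
Taken together, the flow changes from "update documents while scrolling" to "dump batches to disk, then process each dump file". A minimal usage sketch of the intended new workflow (all argument values below, including the udpipe model filename, are hypothetical; only the function signatures come from this commit):

  elasticizer(query, index = 'maml', es_pwd = es_pwd, dump = T) # writes one batch_<batch*batch_size>.Rds per scroll batch
  for (f in list.files('.', pattern = '^batch_.*\\.Rds$')) {
    ud_update(file = f, wd = '.', ud_file = 'nl-alpino-ud.udpipe', ver = 'udV2') # writes ud_<file> next to each dump
  }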
master
Your Name 4 years ago
parent 0e593075ee
commit 77eb51a1bf

R/actorizer.R
@@ -9,58 +9,64 @@
 #' @param identifier String used to mark highlights. Should be a lowercase string
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
 #' @param es_super Password for write access to ElasticSearch
-#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
 #' @return As this is a nested function used within elasticizer, there is no return output
 #' @export
 #' @examples
 #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
-actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) {
-  plan(multiprocess, workers = cores)
-  ### Function to filter out false positives using regex
-  exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
-    min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
-    max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
-    split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
-    min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
-    max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
-    sentence <- paste0(' ',str_sub(doc$merged, min, max),' ') # Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "("))
-    # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
-    if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
-      return(id)
-    } else {
-      return(NULL)
-    }
-  }
-  ranger <- function(x, ud) {
-    return(which((ud$start <= x) & (ud$end >= x)))
-  }
+actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
   sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
     doc <- out[row,]
-    if (nchar(doc$merged) > 990000) {
-      return(
-        data.frame(
-          err = T,
-          errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
-        )
-      )
+    if (sum(nchar(doc$merged) > 990000)) {
+      stop("One or more documents in this batch exceed 990000 characters")
     }
     # Extracting ud output from document
-    ud <- doc$`_source.ud`[[1]] %>%
-      select(-one_of('exists')) %>% # Removing ud.exists variable
-      unnest() %>%
-      mutate(doc_id = doc$`_id`)
-    markers <- doc$markers[[1]][,'start'] # Extract list of markers
-    # Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
-    rows <- unlist(lapply(markers, ranger, ud = ud))
-    # Setting up an actor variable
-    ud$actor <- F
-    ud$actor[rows] <- T
-    sentence_count <- max(ud$sentence_id) # Number of sentences in article
-    actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
+    ud <- doc %>%
+      select(`_id`,`_source.ud`, merged) %>%
+      unnest(cols = c("_source.ud")) %>%
+      select(`_id`,lemma,start,end, sentence_id,merged) %>%
+      unnest(cols = colnames(.))
+    sentences <- ud %>%
+      group_by(`_id`, sentence_id) %>%
+      summarise(
+        sentence_start = min(start),
+        sentence_end = max(end)
+      ) %>%
+      mutate(
+        sentence_count = n()
+      )
+    hits <- left_join(ud, markers, by='_id') %>%
+      mutate(
+        actor = case_when(
+          start <= marker_start & end >= marker_start ~ T,
+          T ~ F
+        )
+      ) %>%
+      select(`_id`, sentence_id, start, end, actor, merged) %>%
+      filter(actor) %>%
+      group_by(`_id`, sentence_id) %>%
+      summarise(
+        actor = any(actor),
+        actor_start = I(list(start)),
+        actor_end = I(list(end)),
+        n_markers = length(start),
+        merged = first(merged)
+      ) %>%
+      left_join(., sentences, by=c('_id','sentence_id')) %>%
+      ungroup %>%
+      arrange(`_id`, sentence_id) %>%
+      group_by(`_id`) %>%
+      mutate(n_markers = cumsum(n_markers)) %>%
+      mutate(
+        sentence_start_tags = sentence_start+((nchar(pre_tags)+nchar(post_tags))*(lag(n_markers, default = 0))),
+        sentence_end_tags = sentence_end+((nchar(pre_tags)+nchar(post_tags))*(n_markers))
+      ) %>%
+      mutate(
+        sentence = paste0(' ',str_sub(merged, sentence_start_tags, sentence_end_tags),' ')
+      ) %>%
+      select(-merged) %>%
+      ungroup()
     # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
     if (!is.na(prefix) || !is.na(postfix)) {
       ### If no pre or postfixes, match *not nothing* i.e. anything
@@ -70,68 +76,50 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
       if (is.na(postfix)) {
        postfix = '$^'
       }
-      sentence_ids <- unlist(lapply(actor_sentences,
-                                    exceptionizer,
-                                    ud = ud,
-                                    doc = doc,
-                                    markers = markers,
-                                    pre_tags_regex = pre_tags_regex,
-                                    pre_tags = pre_tags,
-                                    post_tags_regex = post_tags_regex,
-                                    post_tags = post_tags,
-                                    prefix = prefix,
-                                    postfix = postfix))
-    } else {
-      sentence_ids <- actor_sentences
+      hits <- hits %>%
+        filter(
+          !str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) & !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))
+        )
     }
-    if (length(sentence_ids > 0)) {
-      # Generating nested sentence start and end positions for actor sentences
-      ud <- ud %>%
-        filter(sentence_id %in% sentence_ids)
-      actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
-      actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
-      ud <- ud %>%
-        group_by(sentence_id) %>%
-        summarise (
-          sentence_start = as.integer(min(start)),
-          sentence_end = as.integer(max(end)),
-          doc_id = first(doc_id)
-        ) %>%
-        group_by(doc_id) %>%
+    hits <- hits %>%
+      group_by(`_id`) %>%
       summarise(
        sentence_id = list(as.integer(sentence_id)),
        sentence_start = list(sentence_start),
-       sentence_end = list(sentence_end)
-      )
-      return(
-        data.frame(ud, # Sentence id, start and end position for actor sentences
-                   actor_start = I(list(actor_start)), # List of actor ud token start positions
-                   actor_end = I(list(actor_end)), # List of actor ud token end positions
-                   occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
-                   prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
-                   rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level
-                   first = min(sentence_ids), # First sentence in which actor is mentioned
-                   ids = I(list(ids)) # List of actor ids
-        )
-      )
-    } else {
-      return(NULL)
-    }
+       sentence_end = list(sentence_end),
+       actor_start = I(list(unlist(actor_start))), # List of actor ud token start positions
+       actor_end = I(list(unlist(actor_end))), # List of actor ud token end positions
+       occ = length(unique(unlist(sentence_id))), # Number of sentences in which actor occurs
+       first = min(unlist(sentence_id)), # First sentence in which actor is mentioned
+       ids = I(list(ids)),
+       sentence_count = first(sentence_count) # List of actor ids
+      ) %>%
+      mutate(
+        prom = occ/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
+        rel_first = 1-(first/sentence_count), # Relative position of first occurence at sentence level
+      ) %>%
+      select(`_id`:occ, prom, rel_first, first, ids)
+    return(hits)
   }
-  out <- mamlr:::out_parser(out, field = 'highlight', clean = F, cores = cores)
+  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
  offsetter <- function(x, pre_tags, post_tags) {
-    return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
+    return(as.list(as.data.frame(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))))
  }
  prefix[prefix==''] <- NA
  postfix[postfix==''] <- NA
  pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
  post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
  out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)
+  markers <- out %>%
+    select(`_id`, markers) %>%
+    unnest_wider(markers) %>%
+    rename(marker_start = start, marker_end = end) %>%
+    unnest(colnames(.))
  # ids <- fromJSON(ids)
-  updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer,
-                                     out = out,
-                                     ids = ids,
-                                     postfix = postfix,
+  updates <- sentencizer(1:1024,
+                         out = out,
+                         ids = ids,
+                         postfix = postfix,
@@ -139,7 +127,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
                          pre_tags_regex = pre_tags_regex,
                          pre_tags = pre_tags,
                          post_tags_regex = post_tags_regex,
-                         post_tags = post_tags))
+                         post_tags = post_tags)
  if (nrow(updates) == 0) {
    print("Nothing to update for this batch")
    return(NULL)
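
Note on the tag-offset arithmetic in the new sentencizer above: sentence start/end positions come from the udpipe tokens, which were computed on untagged text, while str_sub() slices the tagged (highlighted) text, so every position must be shifted right by the length of all marker tags that precede it. A worked sketch with assumed tag strings (the actual pre_tags/post_tags are supplied by the caller):

  pre_tags <- '<em>' # assumed tags: 4 + 5 = 9 extra characters per marker
  post_tags <- '</em>'
  tag_len <- nchar(pre_tags) + nchar(post_tags)
  # n_markers holds the cumulative marker count per document, so a sentence
  # preceded by lag(n_markers) markers starts in the tagged text at:
  #   sentence_start + tag_len * lag(n_markers, default = 0)
  # and, because it also spans its own markers, ends at:
  #   sentence_end + tag_len * n_markers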

R/elasticizer.R
@@ -108,6 +108,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
  hits <- length(json$hits$hits)
  batch <- 1
  print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
+  if (dump) {
+    saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
+  }
  if (length(update) > 0){
    update(out, localhost = localhost, ...)
  }
@@ -134,6 +137,8 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
    if (length(update) > 0){
      out <- jsonlite:::flatten(json$hits$hits)
      update(out, localhost = localhost, ...)
+    } else if (dump) {
+      saveRDS(jsonlite:::flatten(json$hits$hits), file = paste0('batch_',batch*batch_size,'.Rds'))
    } else {
      out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
    }
@@ -143,7 +148,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
    scroll_clear(conn = conn, x = json$`_scroll_id`)
    return("Done updating")
  } else if (dump) {
-    saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
+    return("Dumping complete")
  } else {
    scroll_clear(conn = conn, x = json$`_scroll_id`)
    return(out)

R/out_parser.R
@@ -24,43 +24,50 @@ out_parser <- function(out, field, clean = F) {
  par_parser <- function(row, out, field, clean) {
    doc <- out[row,]
    if (field == 'highlight') {
-      doc <- replace(doc, doc=="NULL", NA)
-      ### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
-      doc$highlight.title[is.na(doc$highlight.title)] <- doc$`_source.title`[is.na(doc$highlight.title)]
-      doc$highlight.text[is.na(doc$highlight.text)] <- doc$`_source.text`[is.na(doc$highlight.text)]
-      doc$highlight.teaser[is.na(doc$highlight.teaser)] <- doc$`_source.teaser`[is.na(doc$highlight.teaser)]
-      doc$highlight.subtitle[is.na(doc$highlight.subtitle)] <- doc$`_source.subtitle`[is.na(doc$highlight.subtitle)]
-      doc$highlight.preteaser[is.na(doc$highlight.preteaser)] <- doc$`_source.preteaser`[is.na(doc$highlight.preteaser)]
      doc <- doc %>%
-        mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
-        mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
-        mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
-        mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
-        mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
-      doc$merged <- str_c(doc$highlight.title,
-                          doc$highlight.subtitle,
-                          doc$highlight.preteaser,
-                          doc$highlight.teaser,
-                          doc$highlight.text,
+        unnest(cols = starts_with("highlight")) %>%
+        mutate(across(starts_with("highlight"), na_if, "NULL")) %>%
+        mutate(highlight.title = coalesce(highlight.title, `_source.title`),
+               highlight.subtitle = coalesce(highlight.subtitle, `_source.subtitle`),
+               highlight.preteaser = coalesce(highlight.preteaser, `_source.preteaser`),
+               highlight.teaser = coalesce(highlight.teaser, `_source.teaser`),
+               highlight.text = coalesce(highlight.text, `_source.text`)
+        ) %>%
+        mutate(highlight.title = str_replace_na(highlight.title, replacement = ''),
+               highlight.subtitle = str_replace_na(highlight.subtitle, replacement = ''),
+               highlight.preteaser = str_replace_na(highlight.preteaser, replacement = ''),
+               highlight.teaser = str_replace_na(highlight.teaser, replacement = ''),
+               highlight.text = str_replace_na(highlight.text, replacement = '')
+        ) %>%
+        mutate(
+          merged = str_c(highlight.title,
+                         highlight.subtitle,
+                         highlight.preteaser,
+                         highlight.teaser,
+                         highlight.text,
                         '',
                         sep = ". ")
+        )
    }
    if (field == '_source') {
      doc <- doc %>%
-        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
-        mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
-        mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
-        mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
-        mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
-      doc$merged <- str_c(doc$`_source.title`,
-                          doc$`_source.subtitle`,
-                          doc$`_source.preteaser`,
-                          doc$`_source.teaser`,
-                          doc$`_source.text`,
+        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = ''),
+               `_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = ''),
+               `_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = ''),
+               `_source.teaser` = str_replace_na(`_source.teaser`, replacement = ''),
+               `_source.text` = str_replace_na(`_source.text`, replacement = '')
+        ) %>%
+        mutate(
+          merged = str_c(`_source.title`,
+                         `_source.subtitle`,
+                         `_source.preteaser`,
+                         `_source.teaser`,
+                         `_source.text`,
                         '',
                         sep = ". ")
+        )
    }
    ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
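
The na_if()/coalesce() chain above reproduces the old replace-then-assign logic: a highlight field returned as the literal string "NULL" becomes NA and is then filled from the matching _source field, so udpipe sees identical text whether or not a highlight was returned. A toy illustration (data values are made up; column names follow the parsed ES output):

  library(dplyr)
  toy <- tibble(highlight.title = c('Some <em>hit</em>', 'NULL'),
                `_source.title` = c('Some hit', 'No actor mentioned'))
  toy %>%
    mutate(highlight.title = na_if(highlight.title, 'NULL')) %>%
    mutate(highlight.title = coalesce(highlight.title, `_source.title`))
  # row 1 keeps its highlight tags; row 2 falls back on the _source text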

R/query_gen_actors.R
@@ -15,7 +15,7 @@
 #################################################################################################
 query_gen_actors <- function(actor, country, pre_tags, post_tags) {
   generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
-    return(paste0('{"_source": ["ud"],
+    return(paste0('{"_source": ["ud","title","subtitle","preteaser","teaser","text"],
       "query":
         {"bool": {
           "filter":[

R/ud_update.R
@@ -1,10 +1,10 @@
-#' Elasticizer update function: generate UDpipe output from base text
+#' Generate UDpipe output from base text
 #'
-#' Elasticizer update function: generate UDpipe output from base text
-#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
-#' @param udmodel UDpipe model to use
+#' Generate UDpipe output from base text
+#' @param file Filename of file to read in, also used for generating output file name
+#' @param wd Working directory where *file*s can be found
+#' @param ud_file Filename of udpipe model to use, should be in *wd*
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
-#' @param file Filename for output (ud_ is automatically prepended)
 #' @return A vector of 1's indicating the success of each update call
 #' @export
 #' @examples
@@ -17,9 +17,11 @@
 # }
 # }
-ud_update <- function(out, udmodel, ver, file) {
-  out <- mamlr:::out_parser(out, field = '_source', clean = F)
-  ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
+ud_update <- function(file, wd, ud_file, ver) {
+  out <- readRDS(str_c(wd,'/',file)) %>%
+    out_parser(., field = '_source', clean = F)
+  ud_model <- udpipe_load_model(file = str_c(wd,'/',ud_file))
+  ud <- as.data.frame(udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
     group_by(doc_id) %>%
     summarise(
       sentence_id = list(as.integer(sentence_id)),
@@ -34,7 +36,7 @@ ud_update <- function(out, udmodel, ver, file) {
       exists = list(TRUE)
     )
   bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
-  saveRDS(bulk, file = paste0('ud_',file))
+  saveRDS(bulk, file = str_c(wd,'/ud_',file))
   # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
   return()
 }

man/actorizer.Rd
@@ -13,8 +13,7 @@ actorizer(
   pre_tags,
   post_tags,
   es_super,
-  ver,
-  cores = 1
+  ver
 )
 }
 \arguments{
@@ -32,8 +31,6 @@ actorizer(
 \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
 
-\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)}
-
 \item{identifier}{String used to mark highlights. Should be a lowercase string}
 }
 \value{

man/ud_update.Rd
@@ -2,24 +2,24 @@
 % Please edit documentation in R/ud_update.R
 \name{ud_update}
 \alias{ud_update}
-\title{Elasticizer update function: generate UDpipe output from base text}
+\title{Generate UDpipe output from base text}
 \usage{
-ud_update(out, udmodel, ver)
+ud_update(file, wd, ud_file, ver)
 }
 \arguments{
-\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
+\item{file}{Filename of file to read in, also used for generating output file name}
 
-\item{udmodel}{UDpipe model to use}
+\item{wd}{Working directory where *file*s can be found}
 
-\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
+\item{ud_file}{Filename of udpipe model to use, should be in *wd*}
 
-\item{file}{Filename for output (ud_ is automatically prepended)}
+\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
 }
 \value{
 A vector of 1's indicating the success of each update call
 }
 \description{
-Elasticizer update function: generate UDpipe output from base text
+Generate UDpipe output from base text
 }
 \examples{
 ud_update(out, udmodel, ver, file)
