actorizer: totally revamped way of finding actors

elasticizer: updated dump handling to create a dump for every batch, instead of one big file at the end
out_parser: streamlined code
query_gen_actors: only include relevant fields
ud_update: changed function parameters to work with elasticizer dump function
master
Your Name 5 years ago
parent 0e593075ee
commit 77eb51a1bf

@ -9,58 +9,64 @@
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
#' @examples
#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) {
plan(multiprocess, workers = cores)
### Function to filter out false positives using regex
exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
sentence <- paste0(' ',str_sub(doc$merged, min, max),' ') # Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "("))
# Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
return(id)
} else {
return(NULL)
}
}
ranger <- function(x, ud) {
return(which((ud$start <= x) & (ud$end >= x)))
}
actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
doc <- out[row,]
if (nchar(doc$merged) > 990000) {
return(
data.frame(
err = T,
errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
)
)
if (sum(nchar(doc$merged) > 990000)) {
stop("One or more documents in this batch exceed 990000 characters")
}
# Extracting ud output from document
ud <- doc$`_source.ud`[[1]] %>%
select(-one_of('exists')) %>% # Removing ud.exists variable
unnest() %>%
mutate(doc_id = doc$`_id`)
markers <- doc$markers[[1]][,'start'] # Extract list of markers
# Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
rows <- unlist(lapply(markers, ranger, ud = ud))
# Setting up an actor variable
ud$actor <- F
ud$actor[rows] <- T
ud <- doc %>%
select(`_id`,`_source.ud`, merged) %>%
unnest(cols = c("_source.ud")) %>%
select(`_id`,lemma,start,end, sentence_id,merged) %>%
unnest(cols = colnames(.))
sentence_count <- max(ud$sentence_id) # Number of sentences in article
actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
sentences <- ud %>%
group_by(`_id`, sentence_id) %>%
summarise(
sentence_start = min(start),
sentence_end = max(end)
) %>%
mutate(
sentence_count = n()
)
hits <- left_join(ud, markers, by='_id') %>%
mutate(
actor = case_when(
start <= marker_start & end >= marker_start ~ T,
T ~ F
)
) %>%
select(`_id`, sentence_id, start, end,actor,merged) %>%
filter(actor) %>%
group_by(`_id`,sentence_id) %>%
summarise(
actor = any(actor),
actor_start = I(list(start)),
actor_end = I(list(end)),
n_markers = length(start),
merged = first(merged)
) %>%
left_join(.,sentences, by=c('_id','sentence_id')) %>%
ungroup %>%
arrange(`_id`,sentence_id) %>%
group_by(`_id`) %>%
mutate(n_markers = cumsum(n_markers)) %>%
mutate(
sentence_start_tags = sentence_start+((nchar(pre_tags)+nchar(post_tags))*(lag(n_markers, default = 0))),
sentence_end_tags = sentence_end+((nchar(pre_tags)+nchar(post_tags))*(n_markers))
) %>%
mutate(
sentence = paste0(' ',str_sub(merged, sentence_start_tags, sentence_end_tags),' ')
) %>%
select(-merged) %>%
ungroup()
# Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
if (!is.na(prefix) || !is.na(postfix)) {
### If no pre or postfixes, match *not nothing* i.e. anything
@ -70,68 +76,50 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
if (is.na(postfix)) {
postfix = '$^'
}
sentence_ids <- unlist(lapply(actor_sentences,
exceptionizer,
ud = ud,
doc = doc,
markers = markers,
pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags,
post_tags_regex = post_tags_regex,
post_tags = post_tags,
prefix = prefix,
postfix = postfix))
} else {
sentence_ids <- actor_sentences
}
if (length(sentence_ids > 0)) {
# Generating nested sentence start and end positions for actor sentences
ud <- ud %>%
filter(sentence_id %in% sentence_ids)
actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
ud <- ud %>%
group_by(sentence_id) %>%
summarise (
sentence_start = as.integer(min(start)),
sentence_end = as.integer(max(end)),
doc_id = first(doc_id)
) %>%
group_by(doc_id) %>%
summarise(
sentence_id = list(as.integer(sentence_id)),
sentence_start = list(sentence_start),
sentence_end = list(sentence_end)
)
return(
data.frame(ud, # Sentence id, start and end position for actor sentences
actor_start = I(list(actor_start)), # List of actor ud token start positions
actor_end = I(list(actor_end)), # List of actor ud token end positions
occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level
first = min(sentence_ids), # First sentence in which actor is mentioned
ids = I(list(ids)) # List of actor ids
)
hits <- hits %>%
filter(
!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) & !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))
)
} else {
return(NULL)
}
hits <- hits %>%
group_by(`_id`) %>%
summarise(
sentence_id = list(as.integer(sentence_id)),
sentence_start = list(sentence_start),
sentence_end = list(sentence_end),
actor_start = I(list(unlist(actor_start))), # List of actor ud token start positions
actor_end = I(list(unlist(actor_end))), # List of actor ud token end positions
occ = length(unique(unlist(sentence_id))), # Number of sentences in which actor occurs
first = min(unlist(sentence_id)), # First sentence in which actor is mentioned
ids = I(list(ids)),
sentence_count = first(sentence_count)# List of actor ids
) %>%
mutate(
prom = occ/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
rel_first = 1-(first/sentence_count), # Relative position of first occurence at sentence level
) %>%
select(`_id`:occ, prom,rel_first,first,ids)
return(hits)
}
out <- mamlr:::out_parser(out, field = 'highlight', clean = F, cores = cores)
out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
offsetter <- function(x, pre_tags, post_tags) {
return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
return(as.list(as.data.frame(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))))
}
prefix[prefix==''] <- NA
postfix[postfix==''] <- NA
pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)
markers <- out %>%
select(`_id`,markers) %>%
unnest_wider(markers) %>%
rename(marker_start = start, marker_end = end) %>%
unnest(colnames(.))
# ids <- fromJSON(ids)
updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer,
updates <- sentencizer(1:1024,
out = out,
ids = ids,
postfix = postfix,
@ -139,7 +127,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags,
post_tags_regex = post_tags_regex,
post_tags = post_tags))
post_tags = post_tags)
if (nrow(updates) == 0) {
print("Nothing to update for this batch")
return(NULL)

@ -108,6 +108,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
hits <- length(json$hits$hits)
batch <- 1
print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
if (dump) {
saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
}
if (length(update) > 0){
update(out, localhost = localhost, ...)
}
@ -134,6 +137,8 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
if (length(update) > 0){
out <- jsonlite:::flatten(json$hits$hits)
update(out, localhost = localhost, ...)
} else if (dump) {
saveRDS(jsonlite:::flatten(json$hits$hits), file = paste0('batch_',batch*batch_size,'.Rds'))
} else {
out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
}
@ -143,7 +148,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
scroll_clear(conn = conn, x = json$`_scroll_id`)
return("Done updating")
} else if (dump) {
saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
return("Dumping complete")
} else {
scroll_clear(conn = conn, x = json$`_scroll_id`)
return(out)

@ -24,43 +24,50 @@ out_parser <- function(out, field, clean = F) {
par_parser <- function(row, out, field, clean) {
doc <- out[row,]
if (field == 'highlight') {
doc <- replace(doc, doc=="NULL", NA)
### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
doc$highlight.title[is.na(doc$highlight.title)] <- doc$`_source.title`[is.na(doc$highlight.title)]
doc$highlight.text[is.na(doc$highlight.text)] <- doc$`_source.text`[is.na(doc$highlight.text)]
doc$highlight.teaser[is.na(doc$highlight.teaser)] <- doc$`_source.teaser`[is.na(doc$highlight.teaser)]
doc$highlight.subtitle[is.na(doc$highlight.subtitle)] <- doc$`_source.subtitle`[is.na(doc$highlight.subtitle)]
doc$highlight.preteaser[is.na(doc$highlight.preteaser)] <- doc$`_source.preteaser`[is.na(doc$highlight.preteaser)]
doc <- doc %>%
mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
doc$merged <- str_c(doc$highlight.title,
doc$highlight.subtitle,
doc$highlight.preteaser,
doc$highlight.teaser,
doc$highlight.text,
'',
sep = ". ")
unnest(cols = starts_with("highlight")) %>%
mutate(across(starts_with("highlight"), na_if, "NULL")) %>%
mutate(highlight.title = coalesce(highlight.title, `_source.title`),
highlight.subtitle = coalesce(highlight.subtitle, `_source.subtitle`),
highlight.preteaser = coalesce(highlight.preteaser, `_source.preteaser`),
highlight.teaser = coalesce(highlight.teaser, `_source.teaser`),
highlight.text = coalesce(highlight.text, `_source.text`)
) %>%
mutate(highlight.title = str_replace_na(highlight.title, replacement = ''),
highlight.subtitle = str_replace_na(highlight.subtitle, replacement = ''),
highlight.preteaser = str_replace_na(highlight.preteaser, replacement = ''),
highlight.teaser = str_replace_na(highlight.teaser, replacement = ''),
highlight.text = str_replace_na(highlight.text, replacement = '')
) %>%
mutate(
merged = str_c(highlight.title,
highlight.subtitle,
highlight.preteaser,
highlight.teaser,
highlight.text,
'',
sep = ". ")
)
}
if (field == '_source') {
doc <- doc %>%
mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
doc$merged <- str_c(doc$`_source.title`,
doc$`_source.subtitle`,
doc$`_source.preteaser`,
doc$`_source.teaser`,
doc$`_source.text`,
'',
sep = ". ")
mutate(`_source.title` = str_replace_na(`_source.title`, replacement = ''),
`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = ''),
`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = ''),
`_source.teaser` = str_replace_na(`_source.teaser`, replacement = ''),
`_source.text` = str_replace_na(`_source.text`, replacement = '')
) %>%
mutate(
merged = str_c(`_source.title`,
`_source.subtitle`,
`_source.preteaser`,
`_source.teaser`,
`_source.text`,
'',
sep = ". ")
)
}
### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences

@ -15,7 +15,7 @@
#################################################################################################
query_gen_actors <- function(actor, country, pre_tags, post_tags) {
generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
return(paste0('{"_source": ["ud"],
return(paste0('{"_source": ["ud","title","subtitle","preteaser","teaser","text"],
"query":
{"bool": {
"filter":[

@ -1,10 +1,10 @@
#' Elasticizer update function: generate UDpipe output from base text
#' Generate UDpipe output from base text
#'
#' Elasticizer update function: generate UDpipe output from base text
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param udmodel UDpipe model to use
#' Generate UDpipe output from base text
#' @param file Filename of file to read in, also used for generating output file name
#' @param wd Working directory where *file*s can be found
#' @param ud_file Filename of udpipe model to use, should be in *wd*
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param file Filename for output (ud_ is automatically prepended)
#' @return A vector of 1's indicating the success of each update call
#' @export
#' @examples
@ -17,9 +17,11 @@
# }
# }
ud_update <- function(out, udmodel, ver, file) {
out <- mamlr:::out_parser(out, field = '_source', clean = F)
ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
ud_update <- function(file, wd, ud_file, ver) {
out <- readRDS(str_c(wd,'/',file)) %>%
out_parser(., field = '_source', clean = F)
ud_model <- udpipe_load_model(file = str_c(wd,'/',ud_file))
ud <- as.data.frame(udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
group_by(doc_id) %>%
summarise(
sentence_id = list(as.integer(sentence_id)),
@ -34,7 +36,7 @@ ud_update <- function(out, udmodel, ver, file) {
exists = list(TRUE)
)
bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
saveRDS(bulk, file = paste0('ud_',file))
saveRDS(bulk, file = str_c(wd,'/ud_',file))
# res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
return()
}

@ -13,8 +13,7 @@ actorizer(
pre_tags,
post_tags,
es_super,
ver,
cores = 1
ver
)
}
\arguments{
@ -32,8 +31,6 @@ actorizer(
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)}
\item{identifier}{String used to mark highlights. Should be a lowercase string}
}
\value{

@ -2,24 +2,24 @@
% Please edit documentation in R/ud_update.R
\name{ud_update}
\alias{ud_update}
\title{Elasticizer update function: generate UDpipe output from base text}
\title{Generate UDpipe output from base text}
\usage{
ud_update(out, udmodel, ver)
ud_update(file, wd, ud_file, ver)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
\item{file}{Filename of file to read in, also used for generating output file name}
\item{udmodel}{UDpipe model to use}
\item{wd}{Working directory where *file*s can be found}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
\item{ud_file}{Filename of udpipe model to use, should be in *wd*}
\item{file}{Filename for output (ud_ is automatically prepended)}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
A vector of 1's indicating the success of each update call
}
\description{
Elasticizer update function: generate UDpipe output from base text
Generate UDpipe output from base text
}
\examples{
ud_update(out, udmodel, ver, file)

Loading…
Cancel
Save