From 77eb51a1bf793cf6791490beecedb2771f5eee26 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 19 Jun 2020 11:34:18 +0200 Subject: [PATCH] actorizer: totally revamped way of finding actors elasticizer: updated dump handling to create a dump for every batch, instead of one big file at the end out_parser: streamlined code query_gen_actors: only include relevant fields ud_update: changed function parameters to work with elasticizer dump function --- R/actorizer.R | 172 ++++++++++++++++++++----------------------- R/elasticizer.R | 7 +- R/out_parser.R | 69 +++++++++-------- R/query_gen_actors.R | 2 +- R/ud_update.R | 20 ++--- man/actorizer.Rd | 5 +- man/ud_update.Rd | 14 ++-- 7 files changed, 144 insertions(+), 145 deletions(-) diff --git a/R/actorizer.R b/R/actorizer.R index 89216b5..fdc90f2 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -9,58 +9,64 @@ #' @param identifier String used to mark highlights. Should be a lowercase string #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2') #' @param es_super Password for write access to ElasticSearch -#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available) #' @return As this is a nested function used within elasticizer, there is no return output #' @export #' @examples #' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super) -actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) { - plan(multiprocess, workers = cores) - ### Function to filter out false positives using regex - exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) { - min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence - max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence - split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence - min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1)) - max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6) - sentence <- paste0(' ',str_sub(doc$merged, min, max),' ') # Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "(")) - - # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing - if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) { - return(id) - } else { - return(NULL) - } - } - ranger <- function(x, ud) { - return(which((ud$start <= x) & (ud$end >= x))) - } +actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) { sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) { doc <- out[row,] - if (nchar(doc$merged) > 990000) { - return( - data.frame( - err = T, - errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect" - ) - ) + if (sum(nchar(doc$merged) > 990000)) { + stop("One or more documents in this batch exceed 990000 characters") } # Extracting ud output from document - ud <- doc$`_source.ud`[[1]] %>% - select(-one_of('exists')) %>% # Removing ud.exists variable - unnest() %>% - mutate(doc_id = doc$`_id`) - markers <- doc$markers[[1]][,'start'] # Extract list of markers - # Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word)) - rows <- unlist(lapply(markers, ranger, ud = ud)) - - # Setting up an actor variable - ud$actor <- F - ud$actor[rows] <- T + ud <- doc %>% + select(`_id`,`_source.ud`, merged) %>% + unnest(cols = c("_source.ud")) %>% + select(`_id`,lemma,start,end, sentence_id,merged) %>% + unnest(cols = colnames(.)) - sentence_count <- max(ud$sentence_id) # Number of sentences in article - actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor + sentences <- ud %>% + group_by(`_id`, sentence_id) %>% + summarise( + sentence_start = min(start), + sentence_end = max(end) + ) %>% + mutate( + sentence_count = n() + ) + hits <- left_join(ud, markers, by='_id') %>% + mutate( + actor = case_when( + start <= marker_start & end >= marker_start ~ T, + T ~ F + ) + ) %>% + select(`_id`, sentence_id, start, end,actor,merged) %>% + filter(actor) %>% + group_by(`_id`,sentence_id) %>% + summarise( + actor = any(actor), + actor_start = I(list(start)), + actor_end = I(list(end)), + n_markers = length(start), + merged = first(merged) + ) %>% + left_join(.,sentences, by=c('_id','sentence_id')) %>% + ungroup %>% + arrange(`_id`,sentence_id) %>% + group_by(`_id`) %>% + mutate(n_markers = cumsum(n_markers)) %>% + mutate( + sentence_start_tags = sentence_start+((nchar(pre_tags)+nchar(post_tags))*(lag(n_markers, default = 0))), + sentence_end_tags = sentence_end+((nchar(pre_tags)+nchar(post_tags))*(n_markers)) + ) %>% + mutate( + sentence = paste0(' ',str_sub(merged, sentence_start_tags, sentence_end_tags),' ') + ) %>% + select(-merged) %>% + ungroup() # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply if (!is.na(prefix) || !is.na(postfix)) { ### If no pre or postfixes, match *not nothing* i.e. anything @@ -70,68 +76,50 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t if (is.na(postfix)) { postfix = '$^' } - sentence_ids <- unlist(lapply(actor_sentences, - exceptionizer, - ud = ud, - doc = doc, - markers = markers, - pre_tags_regex = pre_tags_regex, - pre_tags = pre_tags, - post_tags_regex = post_tags_regex, - post_tags = post_tags, - prefix = prefix, - postfix = postfix)) - } else { - sentence_ids <- actor_sentences - } - if (length(sentence_ids > 0)) { - # Generating nested sentence start and end positions for actor sentences - ud <- ud %>% - filter(sentence_id %in% sentence_ids) - actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor - actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor - ud <- ud %>% - group_by(sentence_id) %>% - summarise ( - sentence_start = as.integer(min(start)), - sentence_end = as.integer(max(end)), - doc_id = first(doc_id) - ) %>% - group_by(doc_id) %>% - summarise( - sentence_id = list(as.integer(sentence_id)), - sentence_start = list(sentence_start), - sentence_end = list(sentence_end) - ) - - return( - data.frame(ud, # Sentence id, start and end position for actor sentences - actor_start = I(list(actor_start)), # List of actor ud token start positions - actor_end = I(list(actor_end)), # List of actor ud token end positions - occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs - prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences) - rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level - first = min(sentence_ids), # First sentence in which actor is mentioned - ids = I(list(ids)) # List of actor ids - ) + hits <- hits %>% + filter( + !str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) & !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex)) ) - } else { - return(NULL) } + hits <- hits %>% + group_by(`_id`) %>% + summarise( + sentence_id = list(as.integer(sentence_id)), + sentence_start = list(sentence_start), + sentence_end = list(sentence_end), + actor_start = I(list(unlist(actor_start))), # List of actor ud token start positions + actor_end = I(list(unlist(actor_end))), # List of actor ud token end positions + occ = length(unique(unlist(sentence_id))), # Number of sentences in which actor occurs + first = min(unlist(sentence_id)), # First sentence in which actor is mentioned + ids = I(list(ids)), + sentence_count = first(sentence_count)# List of actor ids + ) %>% + mutate( + prom = occ/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences) + rel_first = 1-(first/sentence_count), # Relative position of first occurence at sentence level + ) %>% + select(`_id`:occ, prom,rel_first,first,ids) + return(hits) + } - out <- mamlr:::out_parser(out, field = 'highlight', clean = F, cores = cores) + out <- mamlr:::out_parser(out, field = 'highlight', clean = F) offsetter <- function(x, pre_tags, post_tags) { - return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags)))) + return(as.list(as.data.frame(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags)))))) } prefix[prefix==''] <- NA postfix[postfix==''] <- NA pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags) post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags) out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags) + markers <- out %>% + select(`_id`,markers) %>% + unnest_wider(markers) %>% + rename(marker_start = start, marker_end = end) %>% + unnest(colnames(.)) # ids <- fromJSON(ids) - updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer, + updates <- sentencizer(1:1024, out = out, ids = ids, postfix = postfix, @@ -139,7 +127,7 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t pre_tags_regex = pre_tags_regex, pre_tags = pre_tags, post_tags_regex = post_tags_regex, - post_tags = post_tags)) + post_tags = post_tags) if (nrow(updates) == 0) { print("Nothing to update for this batch") return(NULL) diff --git a/R/elasticizer.R b/R/elasticizer.R index 10f5e9f..0f63381 100644 --- a/R/elasticizer.R +++ b/R/elasticizer.R @@ -108,6 +108,9 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw hits <- length(json$hits$hits) batch <- 1 print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.')) + if (dump) { + saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds')) + } if (length(update) > 0){ update(out, localhost = localhost, ...) } @@ -134,6 +137,8 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw if (length(update) > 0){ out <- jsonlite:::flatten(json$hits$hits) update(out, localhost = localhost, ...) + } else if (dump) { + saveRDS(jsonlite:::flatten(json$hits$hits), file = paste0('batch_',batch*batch_size,'.Rds')) } else { out <- bind_rows(out, jsonlite:::flatten(json$hits$hits)) } @@ -143,7 +148,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw scroll_clear(conn = conn, x = json$`_scroll_id`) return("Done updating") } else if (dump) { - saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds')) + return("Dumping complete") } else { scroll_clear(conn = conn, x = json$`_scroll_id`) return(out) diff --git a/R/out_parser.R b/R/out_parser.R index ebc6066..2683201 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -24,43 +24,50 @@ out_parser <- function(out, field, clean = F) { par_parser <- function(row, out, field, clean) { doc <- out[row,] if (field == 'highlight') { - doc <- replace(doc, doc=="NULL", NA) - ### Replacing empty highlights with source text (to have the exact same text for udpipe to process) - doc$highlight.title[is.na(doc$highlight.title)] <- doc$`_source.title`[is.na(doc$highlight.title)] - doc$highlight.text[is.na(doc$highlight.text)] <- doc$`_source.text`[is.na(doc$highlight.text)] - doc$highlight.teaser[is.na(doc$highlight.teaser)] <- doc$`_source.teaser`[is.na(doc$highlight.teaser)] - doc$highlight.subtitle[is.na(doc$highlight.subtitle)] <- doc$`_source.subtitle`[is.na(doc$highlight.subtitle)] - doc$highlight.preteaser[is.na(doc$highlight.preteaser)] <- doc$`_source.preteaser`[is.na(doc$highlight.preteaser)] doc <- doc %>% - mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>% - mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>% - mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>% - mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>% - mutate(highlight.text = str_replace_na(highlight.text, replacement = '')) - doc$merged <- str_c(doc$highlight.title, - doc$highlight.subtitle, - doc$highlight.preteaser, - doc$highlight.teaser, - doc$highlight.text, - '', - sep = ". ") + unnest(cols = starts_with("highlight")) %>% + mutate(across(starts_with("highlight"), na_if, "NULL")) %>% + mutate(highlight.title = coalesce(highlight.title, `_source.title`), + highlight.subtitle = coalesce(highlight.subtitle, `_source.subtitle`), + highlight.preteaser = coalesce(highlight.preteaser, `_source.preteaser`), + highlight.teaser = coalesce(highlight.teaser, `_source.teaser`), + highlight.text = coalesce(highlight.text, `_source.text`) + ) %>% + mutate(highlight.title = str_replace_na(highlight.title, replacement = ''), + highlight.subtitle = str_replace_na(highlight.subtitle, replacement = ''), + highlight.preteaser = str_replace_na(highlight.preteaser, replacement = ''), + highlight.teaser = str_replace_na(highlight.teaser, replacement = ''), + highlight.text = str_replace_na(highlight.text, replacement = '') + ) %>% + mutate( + merged = str_c(highlight.title, + highlight.subtitle, + highlight.preteaser, + highlight.teaser, + highlight.text, + '', + sep = ". ") + ) } if (field == '_source') { doc <- doc %>% - mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>% - mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>% - mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>% - mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>% - mutate(`_source.text` = str_replace_na(`_source.text`, replacement = '')) - doc$merged <- str_c(doc$`_source.title`, - doc$`_source.subtitle`, - doc$`_source.preteaser`, - doc$`_source.teaser`, - doc$`_source.text`, - '', - sep = ". ") + mutate(`_source.title` = str_replace_na(`_source.title`, replacement = ''), + `_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = ''), + `_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = ''), + `_source.teaser` = str_replace_na(`_source.teaser`, replacement = ''), + `_source.text` = str_replace_na(`_source.text`, replacement = '') + ) %>% + mutate( + merged = str_c(`_source.title`, + `_source.subtitle`, + `_source.preteaser`, + `_source.teaser`, + `_source.text`, + '', + sep = ". ") + ) } ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences diff --git a/R/query_gen_actors.R b/R/query_gen_actors.R index ece6119..1626775 100644 --- a/R/query_gen_actors.R +++ b/R/query_gen_actors.R @@ -15,7 +15,7 @@ ################################################################################################# query_gen_actors <- function(actor, country, pre_tags, post_tags) { generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) { - return(paste0('{"_source": ["ud"], + return(paste0('{"_source": ["ud","title","subtitle","preteaser","teaser","text"], "query": {"bool": { "filter":[ diff --git a/R/ud_update.R b/R/ud_update.R index cac40bd..88b7f32 100644 --- a/R/ud_update.R +++ b/R/ud_update.R @@ -1,10 +1,10 @@ -#' Elasticizer update function: generate UDpipe output from base text +#' Generate UDpipe output from base text #' -#' Elasticizer update function: generate UDpipe output from base text -#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function) -#' @param udmodel UDpipe model to use +#' Generate UDpipe output from base text +#' @param file Filename of file to read in, also used for generating output file name +#' @param wd Working directory where *file*s can be found +#' @param ud_file Filename of udpipe model to use, should be in *wd* #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2') -#' @param file Filename for output (ud_ is automatically prepended) #' @return A vector of 1's indicating the success of each update call #' @export #' @examples @@ -17,9 +17,11 @@ # } # } -ud_update <- function(out, udmodel, ver, file) { - out <- mamlr:::out_parser(out, field = '_source', clean = F) - ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>% +ud_update <- function(file, wd, ud_file, ver) { + out <- readRDS(str_c(wd,'/',file)) %>% + out_parser(., field = '_source', clean = F) + ud_model <- udpipe_load_model(file = str_c(wd,'/',ud_file)) + ud <- as.data.frame(udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)) %>% group_by(doc_id) %>% summarise( sentence_id = list(as.integer(sentence_id)), @@ -34,7 +36,7 @@ ud_update <- function(out, udmodel, ver, file) { exists = list(TRUE) ) bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver) - saveRDS(bulk, file = paste0('ud_',file)) + saveRDS(bulk, file = str_c(wd,'/ud_',file)) # res <- elastic_update(bulk, es_super = es_super, localhost = localhost) return() } diff --git a/man/actorizer.Rd b/man/actorizer.Rd index 2845f88..f578db5 100644 --- a/man/actorizer.Rd +++ b/man/actorizer.Rd @@ -13,8 +13,7 @@ actorizer( pre_tags, post_tags, es_super, - ver, - cores = 1 + ver ) } \arguments{ @@ -32,8 +31,6 @@ actorizer( \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')} -\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)} - \item{identifier}{String used to mark highlights. Should be a lowercase string} } \value{ diff --git a/man/ud_update.Rd b/man/ud_update.Rd index 91bf9e4..6d0c475 100644 --- a/man/ud_update.Rd +++ b/man/ud_update.Rd @@ -2,24 +2,24 @@ % Please edit documentation in R/ud_update.R \name{ud_update} \alias{ud_update} -\title{Elasticizer update function: generate UDpipe output from base text} +\title{Generate UDpipe output from base text} \usage{ -ud_update(out, udmodel, ver) +ud_update(file, wd, ud_file, ver) } \arguments{ -\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} +\item{file}{Filename of file to read in, also used for generating output file name} -\item{udmodel}{UDpipe model to use} +\item{wd}{Working directory where *file*s can be found} -\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')} +\item{ud_file}{Filename of udpipe model to use, should be in *wd*} -\item{file}{Filename for output (ud_ is automatically prepended)} +\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')} } \value{ A vector of 1's indicating the success of each update call } \description{ -Elasticizer update function: generate UDpipe output from base text +Generate UDpipe output from base text } \examples{ ud_update(out, udmodel, ver, file)