From 0a3bdb630bb69c84e31ce299a6c6856335efcc7c Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Wed, 2 Jan 2019 18:11:34 +0100 Subject: [PATCH] actorizer, dfm_gen, ud_update: unified output parsing from _source and highlight fields into a single function (out_parser) out_parser: function to parse raw text output into a single field, either from _source or highlight fields dupe_detect: updated function to use 'ver' parameter for versioning --- R/actorizer.R | 37 +------------------------- R/dfm_gen.R | 10 +------ R/dupe_detect.R | 7 ++--- R/out_parser.R | 68 +++++++++++++++++++++++++++++++++++++++++++++++ R/ud_update.R | 29 +------------------- man/out_parser.Rd | 22 +++++++++++++++ 6 files changed, 97 insertions(+), 76 deletions(-) create mode 100644 R/out_parser.R create mode 100644 man/out_parser.Rd diff --git a/R/actorizer.R b/R/actorizer.R index 8c4bfe8..d6af3c5 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -15,13 +15,6 @@ #' @examples #' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super) actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super, ver) { - fncols <- function(data, cname) { - add <-cname[!cname%in%names(data)] - - if(length(add)!=0) data[add] <- NA - data - } - sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier) { ### If no pre or postfixes, match *not nothing* i.e. anything if (is.na(prefix) || prefix == '') { @@ -52,35 +45,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))))) } - - out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser')) - out <- replace(out, out=="NULL", NA) - - ### Replacing empty highlights with source text (to have the exact same text for udpipe to process) - out$highlight.title[is.na(out$highlight.title)] <- out$`_source.title`[is.na(out$highlight.title)] - out$highlight.text[is.na(out$highlight.text)] <- out$`_source.text`[is.na(out$highlight.text)] - out$highlight.teaser[is.na(out$highlight.teaser)] <- out$`_source.teaser`[is.na(out$highlight.teaser)] - out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)] - out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)] - - out <- out %>% - mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>% - mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>% - mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>% - mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>% - mutate(highlight.text = str_replace_na(highlight.text, replacement = '')) - out$merged <- str_c(out$highlight.title, - out$highlight.subtitle, - out$highlight.preteaser, - out$highlight.teaser, - out$highlight.text, - sep = ". ") %>% - # Remove html tags, and multiple consequent whitespaces - str_replace_all("<.{0,20}?>", " ") %>% - str_replace_all('(\\. ){2,}', '. 
') %>%
-    str_replace_all('([!?.])\\.','\\1') %>%
-    str_replace_all("\\s+"," ")
-
+  out <- out_parser(out, field = 'highlight')
   ids <- fromJSON(ids)
   updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
   bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index 7eb1141..14f7efe 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -25,15 +25,7 @@ dfm_gen <- function(out, words = '999', text = "lemmas") {
     out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
   }
   if (text == "full") {
-    out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
-                        str_replace_na(out$`_source.subtitle`, replacement = " "),
-                        str_replace_na(out$`_source.preteaser`, replacement = " "),
-                        str_replace_na(out$`_source.teaser`, replacement = " "),
-                        str_replace_na(out$`_source.text`, replacement = " "),
-                        sep = " ") %>%
-      # Remove html tags
-      str_replace_all("<.*?>", " ") %>%
-      str_replace_all("\\s+"," ")
+    out <- out_parser(out, field = '_source')
   }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
diff --git a/R/dupe_detect.R b/R/dupe_detect.R
index 9232045..63e4e9e 100644
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@@ -9,6 +9,7 @@
 #' @param es_super Password for write access to ElasticSearch
 #' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])
 #' @param localhost Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)
+#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
 #' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. 
Files are in current working directory #' @export #' @examples @@ -17,7 +18,7 @@ ################################################################################################# #################################### Duplicate detector ################################ ################################################################################################# -dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T) { +dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) { params <- grid[row,] print(paste0('Parsing ',params$doctypes,' on ',params$dates )) query <- paste0('{"query": @@ -49,8 +50,8 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su # append=T) dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))), dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))))) - bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'), - apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set')) + bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver), + apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver)) if (length(bulk) > 0) { res <- elastic_update(bulk, es_super = es_super, localhost = localhost) } diff --git a/R/out_parser.R b/R/out_parser.R new file mode 100644 index 0000000..f4b93e8 --- /dev/null +++ b/R/out_parser.R @@ -0,0 +1,68 @@ +#' Parse raw text into a single field +#' +#' Parse raw text into a single field +#' @param out The original output data frame +#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text +#' @return a parsed output data frame including the additional column 'merged', containing the merged text +#' @examples +#' out_parser(out,field) + +################################################################################################# +#################################### Parser function for output fields ########################## +################################################################################################# +out_parser <- function(out, field) { + fncols <- function(data, cname) { + add <-cname[!cname%in%names(data)] + + if(length(add)!=0) data[add] <- NA + data + } + + out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser')) + if (field == 'highlight') { + out <- replace(out, out=="NULL", NA) + ### Replacing empty highlights with source text (to have the exact same text for udpipe to process) + out$highlight.title[is.na(out$highlight.title)] <- out$`_source.title`[is.na(out$highlight.title)] + out$highlight.text[is.na(out$highlight.text)] <- out$`_source.text`[is.na(out$highlight.text)] + out$highlight.teaser[is.na(out$highlight.teaser)] <- out$`_source.teaser`[is.na(out$highlight.teaser)] + out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)] + out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)] + + out <- out %>% + mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>% + mutate(highlight.subtitle = str_replace_na(highlight.subtitle, 
replacement = '')) %>%
+      mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
+      mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
+      mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
+    out$merged <- str_c(out$highlight.title,
+                        out$highlight.subtitle,
+                        out$highlight.preteaser,
+                        out$highlight.teaser,
+                        out$highlight.text,
+                        sep = ". ")
+  }
+
+  if (field == '_source') {
+    out <- out %>%
+      mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
+      mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
+      mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
+      mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
+      mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
+    out$merged <- str_c(out$`_source.title`,
+                        out$`_source.subtitle`,
+                        out$`_source.preteaser`,
+                        out$`_source.teaser`,
+                        out$`_source.text`,
+                        sep = ". ")
+  }
+
+  ### Use correct punctuation by inserting a '. ' at the end of every text field, then removing any duplicate occurrences
+  # Remove html tags and multiple consecutive whitespaces
+  out$merged <- out$merged %>%
+    str_replace_all("<.{0,20}?>", " ") %>%
+    str_replace_all('(\\. ){2,}', '. ') %>%
+    str_replace_all('([!?.])\\.','\\1') %>%
+    str_replace_all("\\s+"," ")
+  return(out)
+}
diff --git a/R/ud_update.R b/R/ud_update.R
index 602101b..047308a 100644
--- a/R/ud_update.R
+++ b/R/ud_update.R
@@ -20,34 +20,7 @@
 # }
 ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
-  fncols <- function(data, cname) {
-    add <-cname[!cname%in%names(data)]
-
-    if(length(add)!=0) data[add] <- NA
-    data
-  }
-
-  out <- fncols(out, c('_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
-  out <- replace(out, out=="NULL", NA)
-
-  ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
-  out <- out %>%
-    mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
-    mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
-    mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
-    mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
-    mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
-  out$merged <- str_c(out$`_source.title`,
-                      out$`_source.subtitle`,
-                      out$`_source.preteaser`,
-                      out$`_source.teaser`,
-                      out$`_source.text`,
-                      sep = ". ") %>%
-    # Remove html tags, and multiple consequent whitespaces
-    str_replace_all("<.{0,20}?>", " ") %>%
-    str_replace_all('(\\. ){2,}', '. ') %>%
-    str_replace_all('([!?.])\\.','\\1') %>%
-    str_replace_all("\\s+"," ")
+  out <- out_parser(out, field = '_source')
   par_proc <- function(row, out, udmodel) {
     doc <- out[row,]
     ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
diff --git a/man/out_parser.Rd b/man/out_parser.Rd
new file mode 100644
index 0000000..4e804ce
--- /dev/null
+++ b/man/out_parser.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/out_parser.R
+\name{out_parser}
+\alias{out_parser}
+\title{Parse raw text into a single field}
+\usage{
+out_parser(out, field)
+}
+\arguments{
+\item{out}{The original output data frame}
+
+\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
+}
+\value{
+a parsed output data frame including the additional column 'merged', containing the merged text
+}
+\description{
+Parse raw text into a single field
+}
+\examples{
+out_parser(out,field)
+}
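
Usage note (not part of the patch): a minimal sketch of how the new out_parser() is meant to be called once this commit is applied. The input data frame below is hypothetical; in practice `out` would be the flattened Elasticsearch result set that actorizer(), dfm_gen() and ud_update() already pass around, but any data frame with (some of) the `_source.*` or `highlight.*` text columns works, since out_parser() first adds any missing columns via fncols().

library(dplyr)
library(stringr)
# plus the package providing out_parser() once this patch is installed

# Hypothetical two-document result set; subtitle/preteaser/teaser are deliberately absent
out <- data.frame(
  `_source.title` = c("Title one", "Title two"),
  `_source.text`  = c("<p>Body of document one.</p>", "Body of document two."),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# field = '_source' merges the original source fields; field = 'highlight' prefers the
# highlighted fields and falls back to _source where no highlight is present
out <- out_parser(out, field = '_source')

out$merged
# roughly: "Title one. Body of document one." and "Title two. Body of document two."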