actorizer, dfm_gen, ud_update: unified output parsing from _source and highlight fields into a single function (out_parser)

out_parser: function to parse raw text output into a single field, either from _source or highlight fields.
dupe_detect: updated function to use 'ver' parameter for versioning.
7 years ago · 0a3bdb630b
parent 9e5a1e3354
commit 0a3bdb630b
6 changed files with 97 additions and 76 deletions
--- a/R/actorizer.R
+++ b/R/actorizer.R
@ -15,13 +15,6 @@
 #' @examples
 #' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super)
 actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super, ver) {
  fncols <- function(data, cname) {
    add <-cname[!cname%in%names(data)]
    if(length(add)!=0) data[add] <- NA
    data
  }
  sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier) {
    ### If no pre or postfixes, match *not nothing* i.e. anything
    if (is.na(prefix) || prefix == '') {
@ -52,35 +45,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
    return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids)))))
  }
-
+  out <- out_parser(out, field = 'highlight')
  out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
  out <- replace(out, out=="NULL", NA)
  ### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
  out$highlight.title[is.na(out$highlight.title)] <- out$`_source.title`[is.na(out$highlight.title)]
  out$highlight.text[is.na(out$highlight.text)] <- out$`_source.text`[is.na(out$highlight.text)]
  out$highlight.teaser[is.na(out$highlight.teaser)] <- out$`_source.teaser`[is.na(out$highlight.teaser)]
  out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)]
  out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)]
  out <- out %>%
    mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
    mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
    mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
    mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
    mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
  out$merged <- str_c(out$highlight.title,
                      out$highlight.subtitle,
                      out$highlight.preteaser,
                      out$highlight.teaser,
                      out$highlight.text,
                      sep = ". ") %>%
    # Remove html tags, and multiple consequent whitespaces
    str_replace_all("<.{0,20}?>", " ") %>%
    str_replace_all('(\\. ){2,}', '. ') %>%
    str_replace_all('([!?.])\\.','\\1') %>%
    str_replace_all("\\s+"," ")
  ids <- fromJSON(ids)
  updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
  bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@ -25,15 +25,7 @@ dfm_gen <- function(out, words = '999', text = "lemmas") {
    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
  }
  if (text == "full") {
-    out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
+    out <- out_parser(out, field = '_source')
                        str_replace_na(out$`_source.subtitle`, replacement = " "),
                        str_replace_na(out$`_source.preteaser`, replacement = " "),
                        str_replace_na(out$`_source.teaser`, replacement = " "),
                        str_replace_na(out$`_source.text`, replacement = " "),
                        sep = " ") %>%
      # Remove html tags
      str_replace_all("<.*?>", " ") %>%
      str_replace_all("\\s+"," ")
  }
  if ('_source.codes.majorTopic' %in% colnames(out)) {
    out <- out %>%
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@ -9,6 +9,7 @@
 #' @param es_super Password for write access to ElasticSearch
 #' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])
 #' @param localhost Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
 #' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
 #' @export
 #' @examples
@ -17,7 +18,7 @@
 #################################################################################################
 #################################### Duplicate detector ################################
 #################################################################################################
-dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T) {
+dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) {
  params <- grid[row,]
  print(paste0('Parsing ',params$doctypes,' on ',params$dates ))
  query <- paste0('{"query":
@ -49,8 +50,8 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
      #       append=T)
      dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
                                dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
-      bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'),
+      bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver),
-                apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set'))
+                apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver))
      if (length(bulk) > 0) {
        res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
      }
--- a/R/out_parser.R
+++ b/R/out_parser.R
@ -0,0 +1,68 @@
 #' Parse raw text into a single field
 #'
 #' Merges the title, subtitle, preteaser, teaser and text columns of an output
 #' data frame into one cleaned-up character column called 'merged'.
 #' Missing text columns are created (filled with NA) so the merge never fails
 #' on absent fields, and literal "NULL" values are treated as missing.
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text. Any other value raises an error.
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @examples
 #' out_parser(out, field = '_source')
 #################################################################################################
 #################################### Parser function for output fields ##########################
 #################################################################################################
 out_parser <- function(out, field) {
  ### Fail early with a clear message instead of an obscure assignment error further down
  if (!is.character(field) || length(field) != 1 || !field %in% c('highlight', '_source')) {
    stop("field must be either 'highlight' or '_source'", call. = FALSE)
  }
  ### Add any missing text columns (as NA), keeping the original column order
  fncols <- function(data, cname) {
    add <- cname[!cname %in% names(data)]
    if (length(add) != 0) data[add] <- NA
    data
  }
  out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
  ### Treat literal "NULL" values as missing for both field types, so they never end up in the merged text
  out <- replace(out, out == "NULL", NA)
  ### Text parts in the order in which they are merged
  parts <- c('title', 'subtitle', 'preteaser', 'teaser', 'text')
  highlight_cols <- paste0('highlight.', parts)
  source_cols <- paste0('_source.', parts)
  if (field == 'highlight') {
    ### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
    for (i in seq_along(parts)) {
      empty <- is.na(out[[highlight_cols[i]]])
      out[[highlight_cols[i]]][empty] <- out[[source_cols[i]]][empty]
    }
    merge_cols <- highlight_cols
  } else {
    merge_cols <- source_cols
  }
  ### Replace remaining NA's by empty strings, so they do not propagate into the merged text
  for (col in merge_cols) {
    out[[col]] <- str_replace_na(out[[col]], replacement = '')
  }
  ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurrences
  merged <- do.call(str_c, c(unname(as.list(out[merge_cols])), list(sep = ". ")))
  # Remove html tags (at most 20 characters between the angle brackets)
  merged <- str_replace_all(merged, "<.{0,20}?>", " ")
  # Collapse repeated '. ' sequences caused by empty fields
  merged <- str_replace_all(merged, '(\\. ){2,}', '. ')
  # Drop a period that directly follows other sentence-ending punctuation
  merged <- str_replace_all(merged, '([!?.])\\.', '\\1')
  # Collapse multiple consecutive whitespace characters
  out$merged <- str_replace_all(merged, "\\s+", " ")
  out
 }
--- a/R/ud_update.R
+++ b/R/ud_update.R
@ -20,34 +20,7 @@
 # }
 ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
-  fncols <- function(data, cname) {
+  out <- out_parser(out, field = '_source')
    add <-cname[!cname%in%names(data)]
    if(length(add)!=0) data[add] <- NA
    data
  }
  out <- fncols(out, c('_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
  out <- replace(out, out=="NULL", NA)
  ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
  out <- out %>%
    mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
    mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
    mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
    mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
    mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
  out$merged <- str_c(out$`_source.title`,
                      out$`_source.subtitle`,
                      out$`_source.preteaser`,
                      out$`_source.teaser`,
                      out$`_source.text`,
                      sep = ". ") %>%
    # Remove html tags, and multiple consequent whitespaces
    str_replace_all("<.{0,20}?>", " ") %>%
    str_replace_all('(\\. ){2,}', '. ') %>%
    str_replace_all('([!?.])\\.','\\1') %>%
    str_replace_all("\\s+"," ")
  par_proc <- function(row, out, udmodel) {
    doc <- out[row,]
    ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
--- a/man/out_parser.Rd
+++ b/man/out_parser.Rd
@ -0,0 +1,22 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/out_parser.R
 \name{out_parser}
 \alias{out_parser}
 \title{Parse raw text into a single field}
 \usage{
 out_parser(out, field)
 }
 \arguments{
 \item{out}{The original output data frame}
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text
 }
 \description{
 Parse raw text into a single field
 }
 \examples{
 out_parser(out,field)
 }