From 0a3bdb630bb69c84e31ce299a6c6856335efcc7c Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Wed, 2 Jan 2019 18:11:34 +0100 Subject: [PATCH] actorizer, dfm_gen, ud_update: unified output parsing from _source and highlight fields into a single function (out_parser) out_parser: function to parse raw text output into a single field, either from _source or highlight fields dupe_detect: updated function to use 'ver' parameter for versioning --- R/actorizer.R | 37 +------------------------- R/dfm_gen.R | 10 +------ R/dupe_detect.R | 7 ++--- R/out_parser.R | 68 +++++++++++++++++++++++++++++++++++++++++++++++ R/ud_update.R | 29 +------------------- man/out_parser.Rd | 22 +++++++++++++++ 6 files changed, 97 insertions(+), 76 deletions(-) create mode 100644 R/out_parser.R create mode 100644 man/out_parser.Rd diff --git a/R/actorizer.R b/R/actorizer.R index 8c4bfe8..d6af3c5 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -15,13 +15,6 @@ #' @examples #' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super) actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super, ver) { - fncols <- function(data, cname) { - add <-cname[!cname%in%names(data)] - - if(length(add)!=0) data[add] <- NA - data - } - sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier) { ### If no pre or postfixes, match *not nothing* i.e. anything if (is.na(prefix) || prefix == '') { @@ -52,35 +45,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids))))) } - - out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser')) - out <- replace(out, out=="NULL", NA) - - ### Replacing empty highlights with source text (to have the exact same text for udpipe to process) - out$highlight.title[is.na(out$highlight.title)] <- out$`_source.title`[is.na(out$highlight.title)] - out$highlight.text[is.na(out$highlight.text)] <- out$`_source.text`[is.na(out$highlight.text)] - out$highlight.teaser[is.na(out$highlight.teaser)] <- out$`_source.teaser`[is.na(out$highlight.teaser)] - out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)] - out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)] - - out <- out %>% - mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>% - mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>% - mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>% - mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>% - mutate(highlight.text = str_replace_na(highlight.text, replacement = '')) - out$merged <- str_c(out$highlight.title, - out$highlight.subtitle, - out$highlight.preteaser, - out$highlight.teaser, - out$highlight.text, - sep = ". ") %>% - # Remove html tags, and multiple consequent whitespaces - str_replace_all("<.{0,20}?>", " ") %>% - str_replace_all('(\\. ){2,}', '. 
') %>%
-    str_replace_all('([!?.])\\.','\\1') %>%
-    str_replace_all("\\s+"," ")
-
+  out <- out_parser(out, field = 'highlight')
   ids <- fromJSON(ids)
   updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
   bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index 7eb1141..14f7efe 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -25,15 +25,7 @@ dfm_gen <- function(out, words = '999', text = "lemmas") {
     out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
   }
   if (text == "full") {
-    out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
-                        str_replace_na(out$`_source.subtitle`, replacement = " "),
-                        str_replace_na(out$`_source.preteaser`, replacement = " "),
-                        str_replace_na(out$`_source.teaser`, replacement = " "),
-                        str_replace_na(out$`_source.text`, replacement = " "),
-                        sep = " ") %>%
-      # Remove html tags
-      str_replace_all("<.*?>", " ") %>%
-      str_replace_all("\\s+"," ")
+    out <- out_parser(out, field = '_source')
   }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
diff --git a/R/dupe_detect.R b/R/dupe_detect.R
index 9232045..63e4e9e 100644
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@@ -9,6 +9,7 @@
 #' @param es_super Password for write access to ElasticSearch
 #' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])
 #' @param localhost Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)
+#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
 #' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. 
Files are in current working directory #' @export #' @examples @@ -17,7 +18,7 @@ ################################################################################################# #################################### Duplicate detector ################################ ################################################################################################# -dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T) { +dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) { params <- grid[row,] print(paste0('Parsing ',params$doctypes,' on ',params$dates )) query <- paste0('{"query": @@ -49,8 +50,8 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su # append=T) dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))), dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))))) - bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'), - apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set')) + bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver), + apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver)) if (length(bulk) > 0) { res <- elastic_update(bulk, es_super = es_super, localhost = localhost) } diff --git a/R/out_parser.R b/R/out_parser.R new file mode 100644 index 0000000..f4b93e8 --- /dev/null +++ b/R/out_parser.R @@ -0,0 +1,68 @@ +#' Parse raw text into a single field +#' +#' Parse raw text into a single field +#' @param out The original output data frame +#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text +#' @return a parsed output data frame including the additional column 'merged', containing the merged text +#' @examples +#' out_parser(out,field) + +################################################################################################# +#################################### Parser function for output fields ########################## +################################################################################################# +out_parser <- function(out, field) { + fncols <- function(data, cname) { + add <-cname[!cname%in%names(data)] + + if(length(add)!=0) data[add] <- NA + data + } + + out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser')) + if (field == 'highlight') { + out <- replace(out, out=="NULL", NA) + ### Replacing empty highlights with source text (to have the exact same text for udpipe to process) + out$highlight.title[is.na(out$highlight.title)] <- out$`_source.title`[is.na(out$highlight.title)] + out$highlight.text[is.na(out$highlight.text)] <- out$`_source.text`[is.na(out$highlight.text)] + out$highlight.teaser[is.na(out$highlight.teaser)] <- out$`_source.teaser`[is.na(out$highlight.teaser)] + out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)] + out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)] + + out <- out %>% + mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>% + mutate(highlight.subtitle = str_replace_na(highlight.subtitle, 
replacement = '')) %>%
+      mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
+      mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
+      mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
+    out$merged <- str_c(out$highlight.title,
+                        out$highlight.subtitle,
+                        out$highlight.preteaser,
+                        out$highlight.teaser,
+                        out$highlight.text,
+                        sep = ". ")
+  }
+
+  if (field == '_source') {
+    out <- out %>%
+      mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
+      mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
+      mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
+      mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
+      mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
+    out$merged <- str_c(out$`_source.title`,
+                        out$`_source.subtitle`,
+                        out$`_source.preteaser`,
+                        out$`_source.teaser`,
+                        out$`_source.text`,
+                        sep = ". ")
+  }
+
+  ### Use correct punctuation by inserting a '. ' at the end of every text field, then removing any duplicate occurrences
+  # Remove html tags and multiple consecutive whitespaces
+  out$merged <- out$merged %>%
+    str_replace_all("<.{0,20}?>", " ") %>%
+    str_replace_all('(\\. ){2,}', '. ') %>%
+    str_replace_all('([!?.])\\.','\\1') %>%
+    str_replace_all("\\s+"," ")
+  return(out)
+}
diff --git a/R/ud_update.R b/R/ud_update.R
index 602101b..047308a 100644
--- a/R/ud_update.R
+++ b/R/ud_update.R
@@ -20,34 +20,7 @@
 # }
 ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
-  fncols <- function(data, cname) {
-    add <-cname[!cname%in%names(data)]
-
-    if(length(add)!=0) data[add] <- NA
-    data
-  }
-
-  out <- fncols(out, c('_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
-  out <- replace(out, out=="NULL", NA)
-
-  ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
-  out <- out %>%
-    mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
-    mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
-    mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
-    mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
-    mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
-  out$merged <- str_c(out$`_source.title`,
-                      out$`_source.subtitle`,
-                      out$`_source.preteaser`,
-                      out$`_source.teaser`,
-                      out$`_source.text`,
-                      sep = ". ") %>%
-    # Remove html tags, and multiple consequent whitespaces
-    str_replace_all("<.{0,20}?>", " ") %>%
-    str_replace_all('(\\. ){2,}', '. ') %>%
-    str_replace_all('([!?.])\\.','\\1') %>%
-    str_replace_all("\\s+"," ")
+  out <- out_parser(out, field = '_source')
   par_proc <- function(row, out, udmodel) {
     doc <- out[row,]
     ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
diff --git a/man/out_parser.Rd b/man/out_parser.Rd
new file mode 100644
index 0000000..4e804ce
--- /dev/null
+++ b/man/out_parser.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/out_parser.R
+\name{out_parser}
+\alias{out_parser}
+\title{Parse raw text into a single field}
+\usage{
+out_parser(out, field)
+}
+\arguments{
+\item{out}{The original output data frame}
+
+\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
+}
+\value{
+a parsed output data frame including the additional column 'merged', containing the merged text
+}
+\description{
+Parse raw text into a single field
+}
+\examples{
+out_parser(out,field)
+}
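
Usage note (not part of the patch): a minimal sketch of how the new out_parser() is meant to be called once this commit is applied. The input data frame below is hypothetical; in practice `out` would be the flattened Elasticsearch result set that actorizer(), dfm_gen() and ud_update() already pass around, but any data frame with (some of) the `_source.*` or `highlight.*` text columns works, since out_parser() first adds any missing columns via fncols().

library(dplyr)
library(stringr)
# plus the package providing out_parser() once this patch is installed

# Hypothetical two-document result set; subtitle/preteaser/teaser are deliberately absent
out <- data.frame(
  `_source.title` = c("Title one", "Title two"),
  `_source.text`  = c("<p>Body of document one.</p>", "Body of document two."),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# field = '_source' merges the original source fields; field = 'highlight' prefers the
# highlighted fields and falls back to _source where no highlight is present
out <- out_parser(out, field = '_source')

out$merged
# roughly: "Title one. Body of document one." and "Title two. Body of document two."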