actorizer, dfm_gen, ud_update: unified output parsing from _source and highlight fields into a single function (out_parser)

out_parser: function to parse raw text output into a single field, either from _source or highlight fields
dupe_detect: updated function to use 'ver' parameter for versioning
Branch: master
Author: Erik de Vries, 6 years ago
Parent: 9e5a1e3354
Commit: 0a3bdb630b
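In short: the inline merge-and-clean blocks in actorizer, dfm_gen and ud_update are replaced by a single call to the new out_parser helper. A minimal sketch of the new call pattern (illustrative only; the variable names on the left are placeholders, not part of the commit):

# actorizer parses the highlighted hits; dfm_gen (text == "full") and ud_update parse the raw source fields
highlighted <- out_parser(out, field = 'highlight')
full_text <- out_parser(out, field = '_source')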

@@ -15,13 +15,6 @@
#' @examples
#' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super)
actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, udmodel, es_super, ver) {
- fncols <- function(data, cname) {
- add <-cname[!cname%in%names(data)]
- if(length(add)!=0) data[add] <- NA
- data
- }
sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier) {
### If no pre or postfixes, match *not nothing* i.e. anything
if (is.na(prefix) || prefix == '') {
@@ -52,35 +45,7 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
return(data.frame(ud,occ = occurences,prom = prominence,rel_first = rel_first, ids = I(list(list(ids)))))
}
- out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
- out <- replace(out, out=="NULL", NA)
- ### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
- out$highlight.title[is.na(out$highlight.title)] <- out$`_source.title`[is.na(out$highlight.title)]
- out$highlight.text[is.na(out$highlight.text)] <- out$`_source.text`[is.na(out$highlight.text)]
- out$highlight.teaser[is.na(out$highlight.teaser)] <- out$`_source.teaser`[is.na(out$highlight.teaser)]
- out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)]
- out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)]
- out <- out %>%
- mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
- mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
- mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
- mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
- mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
- out$merged <- str_c(out$highlight.title,
- out$highlight.subtitle,
- out$highlight.preteaser,
- out$highlight.teaser,
- out$highlight.text,
- sep = ". ") %>%
- # Remove html tags, and multiple consequent whitespaces
- str_replace_all("<.{0,20}?>", " ") %>%
- str_replace_all('(\\. ){2,}', '. ') %>%
- str_replace_all('([!?.])\\.','\\1') %>%
- str_replace_all("\\s+"," ")
+ out <- out_parser(out, field = 'highlight')
ids <- fromJSON(ids)
updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
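A hedged example of the updated actorizer call; every value, including the 'actorsV1' version string, is a placeholder rather than something taken from this commit:

# hypothetical invocation with the new 'ver' argument
actorizer(out, localhost = F, ids = ids, type = type, prefix = prefix, postfix = postfix,
          identifier = identifier, udmodel = udmodel, es_super = es_super, ver = 'actorsV1')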

@@ -25,15 +25,7 @@ dfm_gen <- function(out, words = '999', text = "lemmas") {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
}
if (text == "full") {
- out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
- str_replace_na(out$`_source.subtitle`, replacement = " "),
- str_replace_na(out$`_source.preteaser`, replacement = " "),
- str_replace_na(out$`_source.teaser`, replacement = " "),
- str_replace_na(out$`_source.text`, replacement = " "),
- sep = " ") %>%
- # Remove html tags
- str_replace_all("<.*?>", " ") %>%
- str_replace_all("\\s+"," ")
+ out <- out_parser(out, field = '_source')
}
if ('_source.codes.majorTopic' %in% colnames(out)) {
out <- out %>%

@@ -9,6 +9,7 @@
#' @param es_super Password for write access to ElasticSearch
#' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])
#' @param localhost Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)
+ #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
#' @export
#' @examples
@@ -17,7 +18,7 @@
#################################################################################################
#################################### Duplicate detector ################################
#################################################################################################
- dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T) {
+ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) {
params <- grid[row,]
print(paste0('Parsing ',params$doctypes,' on ',params$dates ))
query <- paste0('{"query":
@@ -49,8 +50,8 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
# append=T)
dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
- bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'),
- apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set'))
+ bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver),
+ apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver))
if (length(bulk) > 0) {
res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
}
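A hedged sketch of driving the updated dupe_detect over a parameter grid; the grid contents, the 0.85 cutoff and the 'dupesV1' string are illustrative assumptions:

# hypothetical driver: one dupe_detect run per doctype/date combination in 'grid'
lapply(seq(1, nrow(grid)), dupe_detect, grid = grid, cutoff_lower = 0.85, cutoff_upper = 1,
       es_pwd = es_pwd, es_super = es_super, words = '999', localhost = T, ver = 'dupesV1')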

@@ -0,0 +1,68 @@
#' Parse raw text into a single field
#'
#' Parse raw text into a single field
#' @param out The original output data frame
#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
#' @return a parsed output data frame including the additional column 'merged', containing the merged text
#' @examples
#' out_parser(out,field)
#################################################################################################
#################################### Parser function for output fields ##########################
#################################################################################################
out_parser <- function(out, field) {
fncols <- function(data, cname) {
  add <- cname[!cname %in% names(data)]
  if (length(add) != 0) data[add] <- NA
  data
}
out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
if (field == 'highlight') {
out <- replace(out, out=="NULL", NA)
### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
out$highlight.title[is.na(out$highlight.title)] <- out$`_source.title`[is.na(out$highlight.title)]
out$highlight.text[is.na(out$highlight.text)] <- out$`_source.text`[is.na(out$highlight.text)]
out$highlight.teaser[is.na(out$highlight.teaser)] <- out$`_source.teaser`[is.na(out$highlight.teaser)]
out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)]
out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)]
out <- out %>%
mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
out$merged <- str_c(out$highlight.title,
out$highlight.subtitle,
out$highlight.preteaser,
out$highlight.teaser,
out$highlight.text,
sep = ". ")
}
if (field == '_source') {
out <- out %>%
mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
out$merged <- str_c(out$`_source.title`,
out$`_source.subtitle`,
out$`_source.preteaser`,
out$`_source.teaser`,
out$`_source.text`,
sep = ". ")
}
### Use correct punctuation by inserting a '. ' at the end of every text field, then removing any duplicate occurrences
# Remove html tags, and multiple consecutive whitespaces
out$merged <- out$merged %>%
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>%
str_replace_all("\\s+"," ")
return(out)
}
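A hedged usage sketch for the new helper; 'out' is assumed to be a query result data frame carrying '_source.*' (and, when parsing highlights, 'highlight.*') columns:

# illustrative only: merge the source fields of each document into one text column
parsed <- out_parser(out, field = '_source')
# 'parsed' keeps the original columns and gains 'merged': title, subtitle, preteaser,
# teaser and text joined with '. ', with html tags and repeated whitespace stripped
head(parsed$merged)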

@@ -20,34 +20,7 @@
# }
ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
- fncols <- function(data, cname) {
- add <-cname[!cname%in%names(data)]
- if(length(add)!=0) data[add] <- NA
- data
- }
- out <- fncols(out, c('_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
- out <- replace(out, out=="NULL", NA)
- ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
- out <- out %>%
- mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
- mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
- mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
- mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
- mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
- out$merged <- str_c(out$`_source.title`,
- out$`_source.subtitle`,
- out$`_source.preteaser`,
- out$`_source.teaser`,
- out$`_source.text`,
- sep = ". ") %>%
- # Remove html tags, and multiple consequent whitespaces
- str_replace_all("<.{0,20}?>", " ") %>%
- str_replace_all('(\\. ){2,}', '. ') %>%
- str_replace_all('([!?.])\\.','\\1') %>%
- str_replace_all("\\s+"," ")
+ out <- out_parser(out, field = '_source')
par_proc <- function(row, out, udmodel) {
doc <- out[row,]
ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
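A hedged example of the ud_update entry point with the new version tag; 'udV2' follows the wording of the @param ver documentation above, while the remaining argument values are placeholders:

# hypothetical call; 'udmodel' is assumed to be a udpipe model loaded elsewhere
ud_update(out, localhost = T, udmodel = udmodel,
          es_super = .rs.askForPassword("ElasticSearch WRITE"), ver = 'udV2')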

@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/out_parser.R
\name{out_parser}
\alias{out_parser}
\title{Parse raw text into a single field}
\usage{
out_parser(out, field)
}
\arguments{
\item{out}{The original output data frame}
\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
}
\value{
a parsed output data frame including the additional column 'merged', containing the merged text
}
\description{
Parse raw text into a single field
}
\examples{
out_parser(out,field)
}