actorizer, ud_update: Updated merging of document fields to properly deal with missing punctuation at the end of fields (e.g. a title without punctuation at the end of the string)

modelizer: Minor update to feature keyness, using absolute values now to determine the most informative features for a class (so features that are either strongly positively or negatively related to the class)

bulk_writer: Added the 'ver' parameter to include a short version string with each update. Mostly to deal with updates that do not complete successfully on all data
master
Erik de Vries 6 years ago
parent 9f3418ef37
commit ae23456736

@ -62,14 +62,22 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)]
out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)]
out$merged <- str_c(str_replace_na(unlist(out$highlight.title), replacement = " "),
str_replace_na(unlist(out$highlight.subtitle), replacement = " "),
str_replace_na(unlist(out$highlight.preteaser), replacement = " "),
str_replace_na(unlist(out$highlight.teaser), replacement = " "),
str_replace_na(unlist(out$highlight.text), replacement = " "),
sep = " ") %>%
out <- out %>%
mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
out$merged <- str_c(out$highlight.title,
out$highlight.subtitle,
out$highlight.preteaser,
out$highlight.teaser,
out$highlight.text,
sep = ". ") %>%
# Remove html tags, and multiple consecutive whitespaces
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>%
str_replace_all("\\s+"," ")
ids <- fromJSON(ids)

@ -9,14 +9,15 @@
#' @param index The name of the Elasticsearch index to update
#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)
#' @param type Type of updating to be done, can be either 'set', 'add', or 'addnested'
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON
#' @export
#' @examples
#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
#' bulk_writer(x, index = 'maml', varname = 'updated_variable', type = 'set', ver = 'v1')
#################################################################################################
#################################### Bulk update writer ################################
#################################################################################################
bulk_writer <- function(x, index = 'maml', varname = 'updated_variable', type) {
bulk_writer <- function(x, index = 'maml', varname, type, ver) {
### Create a json object if more than one variable besides _id, otherwise use value as-is
if (length(x) > 2) {
json <- toJSON(bind_rows(x)[-1], collapse = T)
@ -27,19 +28,19 @@ bulk_writer <- function(x, index = 'maml', varname = 'updated_variable', type) {
if (varname == "ud") {
return(
paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
{ "script" : { "source": "ctx._source.ud = params.code; ctx._source.remove(\\"tokens\\")", "lang" : "painless", "params": { "code": ',json,'}}}')
{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; ctx._source.ud = params.code; ctx._source.remove(\\"tokens\\")", "lang" : "painless", "params": { "code": ',json,'}}}')
)
}
if (type == 'set') {
return(
paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
{ "script" : { "source": "if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,', "object": {"',varname,'": ',json,'} }}}')
{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,', "object": {"',varname,'": ',json,'} }}}')
)
}
if (type == "add") {
return(
paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
{"script": {"source": "if (ctx._source.computerCodes != null && ctx._source.computerCodes.containsKey(\\"',varname,'\\")) {ctx._source.computerCodes.',varname,'.addAll(params.code)} else if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,' , "object": {"',varname,'": ',json,'}}}}'
{"script": {"source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null && ctx._source.computerCodes.containsKey(\\"',varname,'\\")) {ctx._source.computerCodes.',varname,'.addAll(params.code)} else if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,' , "object": {"',varname,'": ',json,'}}}}'
)
)
}

@ -33,6 +33,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
feat_select <- function (topic, dfm, class_type, percentile,measure) {
  # Select the most informative features for one class.
  #
  # topic:      class value to test against (coerced with as.numeric)
  # dfm:        document-feature matrix passed to textstat_keyness
  # class_type: name of the docvar holding the class labels
  # percentile: quantile (0-1) that a feature's absolute score must exceed
  # measure:    keyness measure forwarded to textstat_keyness
  #
  # Returns the `feature` column of the rows that pass the cutoff.

  # Logical target vector: documents whose class_type docvar equals the topic
  is_target <- docvars(dfm, class_type) == as.numeric(topic)
  scores <- na.omit(textstat_keyness(dfm, measure = measure, is_target))

  # Use absolute values of the second column, so strongly negative
  # associations count as informative too
  scores[,2] <- abs(scores[,2])

  # Keep only features scoring strictly above the requested quantile
  cutoff <- quantile(as.matrix(scores[,2]), percentile)
  selected <- filter(scores, scores[,2] > cutoff)
  return(selected$feature)
}

@ -10,15 +10,32 @@
#' @export
#' @examples
#' ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
#'
# punct_check <- function(str) {
# if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) {
# return(str_c(str, '.'))
# }
# }
ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores()) {
out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
str_replace_na(out$`_source.subtitle`, replacement = " "),
str_replace_na(out$`_source.preteaser`, replacement = " "),
str_replace_na(out$`_source.teaser`, replacement = " "),
str_replace_na(out$`_source.text`, replacement = " "),
sep = " ") %>%
### Use correct punctuation, by inserting a '. ' at the end of every text field, then removing any duplicate occurrences
out <- out %>%
mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
out$merged <- str_c(out$`_source.title`,
out$`_source.subtitle`,
out$`_source.preteaser`,
out$`_source.teaser`,
out$`_source.text`,
sep = ". ") %>%
# Remove html tags, and multiple consecutive whitespaces
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>%
str_replace_all("\\s+"," ")
par_proc <- function(row, out, udmodel) {
doc <- out[row,]

@ -4,7 +4,7 @@
\alias{bulk_writer}
\title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates}
\usage{
bulk_writer(x, index = "maml", varname = "updated_variable", type)
bulk_writer(x, index = "maml", varname, type, ver)
}
\arguments{
\item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)}
@ -14,6 +14,8 @@ bulk_writer(x, index = "maml", varname = "updated_variable", type)
\item{varname}{String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)}
\item{type}{Type of updating to be done, can be either 'set', 'add', or 'addnested'}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')}
}
\value{
A string usable as Elasticsearch bulk update command, in line-delimited JSON
@ -26,5 +28,5 @@ add: add x to the values of [varname]
varname: When using ud, the ud field will be updated instead of a computerCodes field
}
\examples{
bulk_writer(x, index = 'maml', varname = 'updated_variable')
bulk_writer(x, index = 'maml', varname = 'updated_variable', type = 'set', ver = 'v1')
}

@ -27,4 +27,5 @@ Elasticizer update function: generate UDpipe output from base text
}
\examples{
ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
}

Loading…
Cancel
Save