diff --git a/R/actorizer.R b/R/actorizer.R index e53baa1..f73145b 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -62,14 +62,22 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier out$highlight.subtitle[is.na(out$highlight.subtitle)] <- out$`_source.subtitle`[is.na(out$highlight.subtitle)] out$highlight.preteaser[is.na(out$highlight.preteaser)] <- out$`_source.preteaser`[is.na(out$highlight.preteaser)] - out$merged <- str_c(str_replace_na(unlist(out$highlight.title), replacement = " "), - str_replace_na(unlist(out$highlight.subtitle), replacement = " "), - str_replace_na(unlist(out$highlight.preteaser), replacement = " "), - str_replace_na(unlist(out$highlight.teaser), replacement = " "), - str_replace_na(unlist(out$highlight.text), replacement = " "), - sep = " ") %>% + out <- out %>% + mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>% + mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>% + mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>% + mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>% + mutate(highlight.text = str_replace_na(highlight.text, replacement = '')) + out$merged <- str_c(out$highlight.title, + out$highlight.subtitle, + out$highlight.preteaser, + out$highlight.teaser, + out$highlight.text, + sep = ". ") %>% # Remove html tags, and multiple consequent whitespaces str_replace_all("<.{0,20}?>", " ") %>% + str_replace_all('(\\. ){2,}', '. ') %>% + str_replace_all('([!?.])\\.','\\1') %>% str_replace_all("\\s+"," ") ids <- fromJSON(ids) diff --git a/R/bulk_writer.R b/R/bulk_writer.R index 63205f3..947538a 100644 --- a/R/bulk_writer.R +++ b/R/bulk_writer.R @@ -9,14 +9,15 @@ #' @param index The name of the Elasticsearch index to update #' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes) #' @param type Type of updating to be done, can be either 'set', 'add', or 'addnested' +#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2') #' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON #' @export #' @examples -#' bulk_writer(x, index = 'maml', varname = 'updated_variable') +#' bulk_writer(x, index = 'maml') ################################################################################################# #################################### Bulk update writer ################################ ################################################################################################# -bulk_writer <- function(x, index = 'maml', varname = 'updated_variable', type) { +bulk_writer <- function(x, index = 'maml', varname, type, ver) { ### Create a json object if more than one variable besides _id, otherwise use value as-is if (length(x) > 2) { json <- toJSON(bind_rows(x)[-1], collapse = T) @@ -27,19 +28,19 @@ bulk_writer <- function(x, index = 'maml', varname = 'updated_variable', type) { if (varname == "ud") { return( paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}} -{ "script" : { "source": "ctx._source.ud = params.code; ctx._source.remove(\\"tokens\\")", "lang" : "painless", "params": { "code": ',json,'}}}') +{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; ctx._source.ud = params.code; ctx._source.remove(\\"tokens\\")", "lang" : "painless", "params": { "code": ',json,'}}}') ) } if (type == 'set') { return( paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}} -{ "script" : { "source": "if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,', "object": {"',varname,'": ',json,'} }}}') +{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,', "object": {"',varname,'": ',json,'} }}}') ) } if (type == "add") { return( paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}} - {"script": {"source": "if (ctx._source.computerCodes != null && ctx._source.computerCodes.containsKey(\\"',varname,'\\")) {ctx._source.computerCodes.',varname,'.addAll(params.code)} else if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,' , "object": {"',varname,'": ',json,'}}}}' + {"script": {"source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null && ctx._source.computerCodes.containsKey(\\"',varname,'\\")) {ctx._source.computerCodes.',varname,'.addAll(params.code)} else if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,' , "object": {"',varname,'": ',json,'}}}}' ) ) } diff --git a/R/modelizer.R b/R/modelizer.R index d58fe4f..71a9a07 100644 --- a/R/modelizer.R +++ b/R/modelizer.R @@ -33,6 +33,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se feat_select <- function (topic, dfm, class_type, percentile,measure) { keyness <- textstat_keyness(dfm, measure = measure, docvars(dfm, class_type) == as.numeric(topic)) %>% na.omit() + keyness[,2] <- abs(keyness[,2]) keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature return(keyness) } diff --git a/R/ud_update.R b/R/ud_update.R index 64f1b57..f940f36 100644 --- a/R/ud_update.R +++ b/R/ud_update.R @@ -10,15 +10,32 @@ #' @export #' @examples #' ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores()) +#' + +# punct_check <- function(str) { +# if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) { +# return(str_c(str, '.')) +# } +# } + ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores()) { - out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "), - str_replace_na(out$`_source.subtitle`, replacement = " "), - str_replace_na(out$`_source.preteaser`, replacement = " "), - str_replace_na(out$`_source.teaser`, replacement = " "), - str_replace_na(out$`_source.text`, replacement = " "), - sep = " ") %>% + ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences + out <- out %>% + mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>% + mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>% + mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>% + mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>% + mutate(`_source.text` = str_replace_na(`_source.text`, replacement = '')) + out$merged <- str_c(out$`_source.title`, + out$`_source.subtitle`, + out$`_source.preteaser`, + out$`_source.teaser`, + out$`_source.text`, + sep = ". ") %>% # Remove html tags, and multiple consequent whitespaces str_replace_all("<.{0,20}?>", " ") %>% + str_replace_all('(\\. ){2,}', '. ') %>% + str_replace_all('([!?.])\\.','\\1') %>% str_replace_all("\\s+"," ") par_proc <- function(row, out, udmodel) { doc <- out[row,] diff --git a/man/bulk_writer.Rd b/man/bulk_writer.Rd index c763424..14ffcc6 100644 --- a/man/bulk_writer.Rd +++ b/man/bulk_writer.Rd @@ -4,7 +4,7 @@ \alias{bulk_writer} \title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates} \usage{ -bulk_writer(x, index = "maml", varname = "updated_variable", type) +bulk_writer(x, index = "maml", varname, type, ver) } \arguments{ \item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)} @@ -14,6 +14,8 @@ bulk_writer(x, index = "maml", varname = "updated_variable", type) \item{varname}{String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)} \item{type}{Type of updating to be done, can be either 'set', 'add', or 'addnested'} + +\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')} } \value{ A string usable as Elasticsearch bulk update command, in line-delimited JSON @@ -26,5 +28,5 @@ add: add x to the values of [varname] varname: When using ud, the ud field will be updated instead of a computerCodes field } \examples{ -bulk_writer(x, index = 'maml', varname = 'updated_variable') +bulk_writer(x, index = 'maml') } diff --git a/man/ud_update.Rd b/man/ud_update.Rd index bff6c83..6cfc8fb 100644 --- a/man/ud_update.Rd +++ b/man/ud_update.Rd @@ -27,4 +27,5 @@ Elasticizer update function: generate UDpipe output from base text } \examples{ ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores()) + }