# mamlr/R/ud_update.R

#' Elasticizer update function: generate UDpipe output from base text
#'
#' Elasticizer update function: generate UDpipe output from base text
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param udmodel UDpipe model to use
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @param file Filename for output (ud_ is automatically prepended)
#' @return NULL. The output of bulk_writer for each document is not returned but saved with saveRDS() to the file 'ud_' followed by \code{file}
#' @export
#' @examples
#' ud_update(out, udmodel, ver, file)
#'

# punct_check <- function(str) {
#   if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) {
#     return(str_c(str, '.'))
#   }
# }

ud_update <- function(out, udmodel, ver, file) {
  # Purpose: parse `out`, annotate the merged text with UDpipe, collapse the
  # token-level annotation to one row per document and save the resulting
  # bulk entries to an .Rds file.
  #
  # Arguments:
  #   out     - passed to mamlr:::out_parser(); the parsed result must provide
  #             the columns `merged` (text to annotate) and `_id` (document id)
  #   udmodel - UDpipe model, handed to udpipe() as its first argument
  #   ver     - version string forwarded to bulk_writer()
  #   file    - output filename; 'ud_' is prepended before saving
  #
  # Returns NULL (invisibly); the result is written to disk with saveRDS().

  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned elsewhere.
  out <- mamlr:::out_parser(out, field = '_source', clean = FALSE)

  # Token-level annotation; doc_id carries the `_id` of each document.
  annotated <- as.data.frame(
    udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)
  )

  # One row per document: every annotation field becomes a list column that
  # holds the values of all tokens of that document.
  ud <- annotated %>%
    group_by(doc_id) %>%
    summarise(
      sentence_id = list(as.integer(sentence_id)),
      token_id = list(as.integer(token_id)),
      lemma = list(as.character(lemma)),
      upos = list(as.character(upos)),
      feats = list(as.character(feats)),
      head_token_id = list(as.integer(head_token_id)),
      dep_rel = list(as.character(dep_rel)),
      start = list(as.integer(start)),
      end = list(as.integer(end)),
      exists = list(TRUE)
    )

  # NOTE(review): apply() coerces the data frame to a matrix before iterating
  # over rows; kept as-is because bulk_writer presumably relies on the row
  # shape this produces - confirm before replacing it.
  bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)

  # The entries are stored on disk instead of being sent straight away.
  saveRDS(bulk, file = paste0('ud_', file))
  # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
  invisible(NULL)
}

#### Old code ####
# Use | as separator (this is not done anymore, as all data is stored as actual lists, instead of strings. Code kept for future reference)
# str_replace_all("\\|", "") %>%
# Remove VERY annoying single backslashes and replace them by whitespaces
# str_replace_all("\\\\", " ") %>%
# Replace any occurrence of (double) whitespace characters by a single regular whitespace
# t_id <- paste(ud[,5], collapse = '|')
# lemmatized <- paste(ud[,7], collapse = '|') %>%
#   # Replacing double quotes with single quotes in text
#   str_replace_all("\"","\'")
# upos_tags <- paste(ud[,8], collapse = '|')
# head_t_id <- paste(ud[,11], collapse = '|')
# dep_rel <- paste(ud[,12], collapse = '|')
ud_update: Added function to lemmatize documents 6 years ago			`#' Elasticizer update function: generate UDpipe output from base text`
			`#'`
			`#' Elasticizer update function: generate UDpipe output from base text`
			`#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)`
			`#' @param udmodel UDpipe model to use`
actorizer, ud_update: implemented 'ver' variable for keeping track of updates 6 years ago			`#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')`
class_update: remove dfm_gen multicore option dfm_gen: remove multicore, update merger() code elasticizer: changed filenaming scheme for dump option merger: Fixed bug where an NA lemma would cause the entire document to become NA. Now the NA lemmas are filtered out before merging ud_update: removed parallel processing, changed script to save bulk updates in .Rds files instead of sending them straight away 4 years ago			`#' @param file Filename for output (ud_ is automatically prepended)`
ud_update: Added function to lemmatize documents 6 years ago			`#' @return A vector of 1's indicating the success of each update call`
			`#' @export`
			`#' @examples`
class_update: remove dfm_gen multicore option dfm_gen: remove multicore, update merger() code elasticizer: changed filenaming scheme for dump option merger: Fixed bug where an NA lemma would cause the entire document to become NA. Now the NA lemmas are filtered out before merging ud_update: removed parallel processing, changed script to save bulk updates in .Rds files instead of sending them straight away 4 years ago			`#' ud_update(out, udmodel, ver, file)`
actorizer, ud_update: Updated merging of document fields to properly deal with missing punctuation at the end of fields (e.g. a title without punctuation at the end of the string) modelizer: Minor update to feature keyness, using absolute values now to determine the most informative features for a class (so features that are either strongly postively or negatively related to the class) bulk_writer: Added the 'ver' parameter to include a short version string with each update. Mostly to deal with updates that do not complete successfully on all data 6 years ago			`#'`

			`# punct_check <- function(str) {`
			`# if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) {`
			`# return(str_c(str, '.'))`
			`# }`
			`# }`

ud_update: small fix to file naming 4 years ago			`ud_update <- function(out, udmodel, ver, file) {`
actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing 6 years ago			`out <- mamlr:::out_parser(out, field = '_source', clean = F)`
class_update: remove dfm_gen multicore option dfm_gen: remove multicore, update merger() code elasticizer: changed filenaming scheme for dump option merger: Fixed bug where an NA lemma would cause the entire document to become NA. Now the NA lemmas are filtered out before merging ud_update: removed parallel processing, changed script to save bulk updates in .Rds files instead of sending them straight away 4 years ago			ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
			`group_by(doc_id) %>%`
			`summarise(`
			`sentence_id = list(as.integer(sentence_id)),`
			`token_id = list(as.integer(token_id)),`
			`lemma = list(as.character(lemma)),`
			`upos = list(as.character(upos)),`
			`feats = list(as.character(feats)),`
			`head_token_id = list(as.integer(head_token_id)),`
			`dep_rel = list(as.character(dep_rel)),`
			`start = list(as.integer(start)),`
			`end = list(as.integer(end)),`
			`exists = list(TRUE)`
			`)`
actorizer, ud_update: implemented 'ver' variable for keeping track of updates 6 years ago			`bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)`
class_update: remove dfm_gen multicore option dfm_gen: remove multicore, update merger() code elasticizer: changed filenaming scheme for dump option merger: Fixed bug where an NA lemma would cause the entire document to become NA. Now the NA lemmas are filtered out before merging ud_update: removed parallel processing, changed script to save bulk updates in .Rds files instead of sending them straight away 4 years ago			`saveRDS(bulk, file = paste0('ud_',file))`
			`# res <- elastic_update(bulk, es_super = es_super, localhost = localhost)`
			`return()`
ud_update: Added function to lemmatize documents 6 years ago			`}`

			`#### Old code ####`
			`# Use \| as separator (this is not done anymore, as all data is stored as actual lists, instead of strings. Code kept for future reference)`
			`# str_replace_all("\\\|", "") %>%`
			`# Remove VERY annoying single backslashes and replace them by whitespaces`
			`# str_replace_all("\\\\", " ") %>%`
			`# Replace any occurence of (double) whitespace characters by a single regular whitespace`
			`# t_id <- paste(ud[,5], collapse = '\|')`
			`# lemmatized <- paste(ud[,7], collapse = '\|') %>%`
			`# # Replacing double quotes with single quotes in text`
			`# str_replace_all("\"","\'")`
			`# upos_tags <- paste(ud[,8], collapse = '\|')`
			`# head_t_id <- paste(ud[,11], collapse = '\|')`
			`# dep_rel <- paste(ud[,12], collapse = '\|')`