You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mamlr/R/ud_update.R

89 lines
4.2 KiB

#' Elasticizer update function: generate UDpipe output from base text
#'
#' Elasticizer update function: generate UDpipe output from base text
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param udmodel UDpipe model to use
#' @param es_super Password for write access to ElasticSearch
#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @return A vector of 1's indicating the success of each update call
#' @export
#' @examples
#' ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
#'
# punct_check <- function(str) {
# if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) {
# return(str_c(str, '.'))
# }
# }
ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
fncols <- function(data, cname) {
add <-cname[!cname%in%names(data)]
if(length(add)!=0) data[add] <- NA
data
}
out <- fncols(out, c('_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
out <- replace(out, out=="NULL", NA)
### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
out <- out %>%
mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
out$merged <- str_c(out$`_source.title`,
out$`_source.subtitle`,
out$`_source.preteaser`,
out$`_source.teaser`,
out$`_source.text`,
'.',
sep = ". ") %>%
# Remove html tags, and multiple consequent whitespaces
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>%
str_replace_all("\\s+"," ")
# out <- filter(out, nchar(merged) > 1)
par_proc <- function(row, out, udmodel) {
doc <- out[row,]
ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
group_by(doc_id) %>%
summarise(
paragraph_id = list(list(as.integer(paragraph_id))),
sentence_id = list(list(as.integer(sentence_id))),
token_id = list(list(as.integer(token_id))),
lemma = list(list(as.character(lemma))),
upos = list(list(as.character(upos))),
feats = list(list(as.character(feats))),
head_token_id = list(list(as.integer(head_token_id))),
dep_rel = list(list(as.character(dep_rel))),
exists = list(list(TRUE))
)
return(ud)
}
ud <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_proc, out = out, udmodel=udmodel, mc.cores = cores))
bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
return(res)
}
#### Old code ####
# Use | as separator (this is not done anymore, as all data is stored as actual lists, instead of strings. Code kept for future reference)
# str_replace_all("\\|", "") %>%
# Remove VERY annoying single backslashes and replace them by whitespaces
# str_replace_all("\\\\", " ") %>%
# Replace any occurence of (double) whitespace characters by a single regular whitespace
# t_id <- paste(ud[,5], collapse = '|')
# lemmatized <- paste(ud[,7], collapse = '|') %>%
# # Replacing double quotes with single quotes in text
# str_replace_all("\"","\'")
# upos_tags <- paste(ud[,8], collapse = '|')
# head_t_id <- paste(ud[,11], collapse = '|')
# dep_rel <- paste(ud[,12], collapse = '|')