You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
57 lines
2.4 KiB
57 lines
2.4 KiB
#' Generate UDpipe output from base text
|
|
#'
|
|
#' Generate UDpipe output from base text
|
|
#' @param file Filename of file to read in, also used for generating output file name
|
|
#' @param wd Working directory where *file*s can be found
|
|
#' @param ud_file Filename of udpipe model to use, should be in *wd*
|
|
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
|
|
#' @return `NULL`. The generated bulk update data is saved as `ud_<file>` in *wd*; the call that would send it to Elasticsearch is currently commented out.
|
|
#' @export
|
|
#' @examples
|
|
#' ud_update(file, wd, ud_file, ver)
|
|
#'
|
|
|
|
# punct_check <- function(str) {
|
|
# if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) {
|
|
# return(str_c(str, '.'))
|
|
# }
|
|
# }
|
|
|
|
ud_update <- function(file, wd, ud_file, ver) {
  # Reads a saved result set, annotates its text with a UDPipe model and saves
  # the per-document annotations as bulk update data next to the input file.
  #
  # file:    name of the file (read with readRDS()) inside `wd`; the output is
  #          written to the same directory as `ud_<file>`.
  # wd:      directory that holds both `file` and `ud_file`.
  # ud_file: file name of the UDPipe model inside `wd`.
  # ver:     version label, forwarded unchanged to bulk_writer().
  #
  # Returns NULL invisibly; the useful result is the file written to disk.

  # out_parser() is defined elsewhere. The code below relies on its result
  # exposing a `merged` column (the text) and an `_id` column (the document id).
  out <- readRDS(file.path(wd, file)) %>%
    out_parser(field = "_source", clean = FALSE)

  ud_model <- udpipe_load_model(file = file.path(wd, ud_file))

  # Annotate all documents, then collapse the token-level rows into one row per
  # document, storing every annotation field as a list column.
  ud <- as.data.frame(
    udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)
  ) %>%
    group_by(doc_id) %>%
    summarise(
      sentence_id = list(as.integer(sentence_id)),
      token_id = list(as.integer(token_id)),
      lemma = list(as.character(lemma)),
      upos = list(as.character(upos)),
      feats = list(as.character(feats)),
      head_token_id = list(as.integer(head_token_id)),
      dep_rel = list(as.character(dep_rel)),
      start = list(as.integer(start)),
      end = list(as.integer(end)),
      exists = list(TRUE)
    )

  # One bulk entry per document row. The row-wise apply() call is kept exactly
  # as it was because bulk_writer() (defined elsewhere) receives each row in
  # the form apply() produces -- NOTE(review): confirm before changing this.
  bulk <- apply(ud, 1, bulk_writer, varname = "ud", type = "set", ver = ver)
  saveRDS(bulk, file = file.path(wd, paste0("ud_", file)))

  # Sending the bulk data to Elasticsearch is currently disabled:
  # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)

  invisible(NULL)
}
|
|
|
|
#### Old code ####
|
|
# Use | as separator (this is not done anymore, as all data is stored as actual lists, instead of strings. Code kept for future reference)
|
|
# str_replace_all("\\|", "") %>%
|
|
# Remove VERY annoying single backslashes and replace them by whitespaces
|
|
# str_replace_all("\\\\", " ") %>%
|
|
# Replace any occurrence of (double) whitespace characters by a single regular whitespace
|
|
# t_id <- paste(ud[,5], collapse = '|')
|
|
# lemmatized <- paste(ud[,7], collapse = '|') %>%
|
|
# # Replacing double quotes with single quotes in text
|
|
# str_replace_all("\"","\'")
|
|
# upos_tags <- paste(ud[,8], collapse = '|')
|
|
# head_t_id <- paste(ud[,11], collapse = '|')
|
|
# dep_rel <- paste(ud[,12], collapse = '|')
|