Changed the udpipe output variable from `tokens` to `ud`

master
Erik de Vries 6 years ago
parent 061da17c2a
commit 34a6adf64e

@ -4,7 +4,7 @@
#' Type can be one of three values: #' Type can be one of three values:
#' set: set the value of [varname] to x #' set: set the value of [varname] to x
#' add: add x to the values of [varname] #' add: add x to the values of [varname]
#' varname: When using tokens, the token field will be updated instead of a computerCodes field #' varname: When using ud, the ud field will be updated instead of a computerCodes field
#' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is) #' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
#' @param index The name of the Elasticsearch index to update #' @param index The name of the Elasticsearch index to update
#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes) #' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)
@ -24,10 +24,10 @@ bulk_writer <- function(x, index = 'maml', varname = 'updated_variable', type) {
names(x) <- NULL names(x) <- NULL
json <- toJSON(x[-1], collapse = T) json <- toJSON(x[-1], collapse = T)
} }
if (varname == "tokens") { if (varname == "ud") {
return( return(
paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}} paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
{ "script" : { "source": "ctx._source.tokens = params.code", "lang" : "painless", "params": { "code": ',json,'}}}') { "script" : { "source": "ctx._source.ud = params.code", "lang" : "painless", "params": { "code": ',json,'}}}')
) )
} }
if (type == 'set') { if (type == 'set') {

@ -25,20 +25,20 @@ ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword
ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>% ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
group_by(doc_id) %>% group_by(doc_id) %>%
summarise( summarise(
paragraph_id = list(list(paragraph_id)), paragraph_id = list(list(as.integer(paragraph_id))),
sentence_id = list(list(sentence_id)), sentence_id = list(list(as.integer(sentence_id))),
token_id = list(list(as.numeric(token_id))), token_id = list(list(as.integer(token_id))),
lemma = list(list(lemma)), lemma = list(list(as.character(lemma))),
upos = list(list(upos)), upos = list(list(as.character(upos))),
feats = list(list(feats)), feats = list(list(as.character(feats))),
head_token_id = list(list(as.numeric(head_token_id))), head_token_id = list(list(as.integer(head_token_id))),
dep_rel = list(list(dep_rel)), dep_rel = list(list(as.character(dep_rel))),
exists = list(list(TRUE)) exists = list(list(TRUE))
) )
return(ud) return(ud)
} }
ud <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_proc, out = out, udmodel=udmodel, mc.cores = cores)) ud <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_proc, out = out, udmodel=udmodel, mc.cores = cores))
bulk <- apply(ud, 1, bulk_writer, varname = 'tokens', type = 'set') bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set')
res <- elastic_update(bulk, es_super = es_super, localhost = localhost) res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
return(res) return(res)
} }

Loading…
Cancel
Save