changed udpipe output variable from tokens to ud

7 years ago · 34a6adf64e
parent 061da17c2a
commit 34a6adf64e
2 changed files with 12 additions and 12 deletions
--- a/R/bulk_writer.R
+++ b/R/bulk_writer.R
@@ -4,7 +4,7 @@
 #' Type can be either one of three values:
 #' set: set the value of [varname] to x
 #' add: add x to the values of [varname]
-#' varname: When using tokens, the token field will be updated instead of a computerCodes field
+#' varname: When using ud, the ud field will be updated instead of a computerCodes field
 #' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
 #' @param index The name of the Elasticsearch index to update
 #' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)
@@ -24,10 +24,10 @@ bulk_writer <- function(x, index = 'maml', varname = 'updated_variable', type) {
    names(x) <- NULL
    json <- toJSON(x[-1], collapse = T)
  }
-  if (varname == "tokens") {
+  if (varname == "ud") {
    return(
      paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
-{ "script" : { "source": "ctx._source.tokens = params.code", "lang" : "painless", "params": { "code": ',json,'}}}')
+{ "script" : { "source": "ctx._source.ud = params.code", "lang" : "painless", "params": { "code": ',json,'}}}')
    )
  }
  if (type == 'set') {
--- a/R/ud_update.R
+++ b/R/ud_update.R
@@ -25,20 +25,20 @@ ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword
    ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
      group_by(doc_id) %>%
      summarise(
-        paragraph_id = list(list(paragraph_id)),
+        paragraph_id = list(list(as.integer(paragraph_id))),
-        sentence_id = list(list(sentence_id)),
+        sentence_id = list(list(as.integer(sentence_id))),
-        token_id = list(list(as.numeric(token_id))),
+        token_id = list(list(as.integer(token_id))),
-        lemma = list(list(lemma)),
+        lemma = list(list(as.character(lemma))),
-        upos = list(list(upos)),
+        upos = list(list(as.character(upos))),
-        feats = list(list(feats)),
+        feats = list(list(as.character(feats))),
-        head_token_id = list(list(as.numeric(head_token_id))),
+        head_token_id = list(list(as.integer(head_token_id))),
-        dep_rel = list(list(dep_rel)),
+        dep_rel = list(list(as.character(dep_rel))),
        exists = list(list(TRUE))
     )
    return(ud)
  }
  ud <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_proc, out = out, udmodel=udmodel, mc.cores = cores))
-  bulk <- apply(ud, 1, bulk_writer, varname = 'tokens', type = 'set')
+  bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set')
  res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
  return(res)
 }