ud_update: Added function to lemmatize documents

7 years ago · 061da17c2a
parent ef51ce60a9
commit 061da17c2a
4 changed files with 91 additions and 1 deletions
--- a/3
+++ b/3
@ -12,7 +12,8 @@ Depends: R (>= 3.3.1),
    httr,
    caret,
    e1071,
-    udpipe
+    udpipe,
+    parallel
 License: Copyright Erik de Vries
 Encoding: UTF-8
 LazyData: true
--- a/1
+++ b/1
@ -10,3 +10,4 @@ export(merger)
 export(modelizer)
 export(query_gen_actors)
 export(query_string)
+export(ud_update)
--- a/R/ud_update.R
+++ b/R/ud_update.R
@ -0,0 +1,58 @@
+#' Elasticizer update function: generate UDpipe output from base text
+#'
+#' Merges the title, subtitle, preteaser, teaser and text fields of every document into a single string,
+#' strips HTML tags, annotates each document with UDpipe in parallel, and hands the per-document
+#' annotations to bulk_writer (varname 'tokens', type 'set') and elastic_update.
+#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
+#' @param localhost Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)
+#' @param udmodel UDpipe model to use
+#' @param es_super Password for write access to ElasticSearch
+#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
+#' @return A vector of 1's indicating the success of each update call (an empty numeric vector when out has no rows)
+#' @export
+#' @examples
+#' ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
+ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores()) {
+  # Nothing to annotate for an empty batch; return early rather than running the pipeline on zero documents
+  if (nrow(out) == 0) {
+    return(numeric(0))
+  }
+  # Merge all text fields into one string per document, substituting a single space for missing fields
+  out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
+                      str_replace_na(out$`_source.subtitle`, replacement = " "),
+                      str_replace_na(out$`_source.preteaser`, replacement = " "),
+                      str_replace_na(out$`_source.teaser`, replacement = " "),
+                      str_replace_na(out$`_source.text`, replacement = " "),
+                      sep = " ") %>%
+    # Remove html tags, and multiple consecutive whitespaces
+    str_replace_all("<.*?>", " ") %>%
+    str_replace_all("\\s+"," ")
+  # Annotate a single document (one row of out) and collapse its token table into one row,
+  # with every annotation column wrapped in a nested list
+  par_proc <- function(row, out, udmodel) {
+    doc <- out[row,]
+    as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
+      group_by(doc_id) %>%
+      summarise(
+        paragraph_id = list(list(paragraph_id)),
+        sentence_id = list(list(sentence_id)),
+        token_id = list(list(as.numeric(token_id))),
+        lemma = list(list(lemma)),
+        upos = list(list(upos)),
+        feats = list(list(feats)),
+        head_token_id = list(list(as.numeric(head_token_id))),
+        dep_rel = list(list(dep_rel)),
+        exists = list(list(TRUE))
+      )
+  }
+  # seq_len() is safe for any row count, unlike seq(1, n, 1)
+  annotated <- mclapply(seq_len(nrow(out)), par_proc, out = out, udmodel = udmodel, mc.cores = cores)
+  # mclapply stores failed jobs as try-error objects; stop with a clear message instead of failing inside bind_rows
+  failed <- vapply(annotated, inherits, logical(1), what = "try-error")
+  if (any(failed)) {
+    stop("UDpipe annotation failed for ", sum(failed), " of ", length(failed), " documents", call. = FALSE)
+  }
+  ud <- bind_rows(annotated)
+  # One bulk entry per annotated document, setting the 'tokens' variable
+  bulk <- apply(ud, 1, bulk_writer, varname = 'tokens', type = 'set')
+  res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
+  return(res)
+}
+
+#### Old code ####
+# Use | as separator (this is not done anymore, as all data is stored as actual lists, instead of strings. Code kept for future reference)
+# str_replace_all("\\|", "") %>%
+# Remove VERY annoying single backslashes and replace them by whitespaces
+# str_replace_all("\\\\", " ") %>%
+# Replace any occurrence of (double) whitespace characters by a single regular whitespace
+# t_id <- paste(ud[,5], collapse = '|')
+# lemmatized <- paste(ud[,7], collapse = '|') %>%
+#   # Replacing double quotes with single quotes in text
+#   str_replace_all("\"","\'")
+# upos_tags <- paste(ud[,8], collapse = '|')
+# head_t_id <- paste(ud[,11], collapse = '|')
+# dep_rel <- paste(ud[,12], collapse = '|')
--- a/man/ud_update.Rd
+++ b/man/ud_update.Rd
@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ud_update.R
+\name{ud_update}
+\alias{ud_update}
+\title{Elasticizer update function: generate UDpipe output from base text}
+\usage{
+ud_update(out, localhost = T, udmodel,
+  es_super = .rs.askForPassword("ElasticSearch WRITE"),
+  cores = detectCores())
+}
+\arguments{
+\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
+
+\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}
+
+\item{udmodel}{UDpipe model to use}
+
+\item{es_super}{Password for write access to ElasticSearch}
+
+\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
+}
+\value{
+A vector of 1's indicating the success of each update call
+}
+\description{
+Elasticizer update function: generate UDpipe output from base text
+}
+\examples{
+ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
+}