From 4bbe84ab83b1baad92797d8814b4eb5e8c7de14a Mon Sep 17 00:00:00 2001
From: Erik de Vries <erik@devries.pm>
Date: Mon, 22 Oct 2018 12:07:53 +0200
Subject: [PATCH] First release of mamlr package

---
 .Rbuildignore         |  2 ++
 .gitignore            |  3 +++
 DESCRIPTION           | 15 +++++++++++++
 MaML.Rproj            | 21 ++++++++++++++++++
 NAMESPACE             |  7 ++++++
 R/bulk_writer.R       | 19 ++++++++++++++++
 R/dfm_gen.R           | 48 ++++++++++++++++++++++++++++++++++++++++
 R/elastic_update.R    | 28 ++++++++++++++++++++++++
 R/elasticizer.R       | 51 +++++++++++++++++++++++++++++++++++++++++++
 R/merger.R            | 28 ++++++++++++++++++++++++
 man/bulk_writer.Rd    | 24 ++++++++++++++++++++
 man/dfm_gen.Rd        | 22 +++++++++++++++++++
 man/elastic_update.Rd | 22 +++++++++++++++++++
 man/elasticizer.Rd    | 26 ++++++++++++++++++++++
 man/merger.Rd         | 24 ++++++++++++++++++++
 15 files changed, 340 insertions(+)
 create mode 100644 .Rbuildignore
 create mode 100644 .gitignore
 create mode 100644 DESCRIPTION
 create mode 100644 MaML.Rproj
 create mode 100644 NAMESPACE
 create mode 100644 R/bulk_writer.R
 create mode 100644 R/dfm_gen.R
 create mode 100644 R/elastic_update.R
 create mode 100644 R/elasticizer.R
 create mode 100644 R/merger.R
 create mode 100644 man/bulk_writer.Rd
 create mode 100644 man/dfm_gen.Rd
 create mode 100644 man/elastic_update.Rd
 create mode 100644 man/elasticizer.Rd
 create mode 100644 man/merger.Rd

diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000..91114bf
--- /dev/null
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..807ea25
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..91999da
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,15 @@
+Package: maml
+Title: General functions for the MaML project
+Version: 0.1
+Authors@R: person("Erik", "de Vries", email = "erik@devries.pm", role = c("aut", "cre"))
+Description: Provide general functions in support of the MaML project, like data retrieval and parsing.
+License: Copyright Erik de Vries
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.1.0
+Depends: R (>= 3.4.4),
+    elastic, httr,
+    jsonlite,
+    parallel,
+    tidyverse,
+    quanteda
diff --git a/MaML.Rproj b/MaML.Rproj
new file mode 100644
index 0000000..cba1b6b
--- /dev/null
+++ b/MaML.Rproj
@@ -0,0 +1,21 @@
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..b7c4b7a
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,7 @@
+# Generated by roxygen2: do not edit by hand
+
+export(bulk_writer)
+export(dfm_gen)
+export(elastic_update)
+export(elasticizer)
+export(merger)
diff --git a/R/bulk_writer.R b/R/bulk_writer.R
new file mode 100644
index 0000000..91e90d9
--- /dev/null
+++ b/R/bulk_writer.R
@@ -0,0 +1,19 @@
+#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
+#'
+#' Builds the two-line entry (action line plus painless update script) of a single Elasticsearch bulk update command.
+#' @param x A single-row data frame or a vector: the first element is used as the document _id, all remaining elements are serialised with jsonlite::toJSON() and stored under varname
+#' @param index The name of the Elasticsearch index to update
+#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created)
+#' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON
+#' @export
+#' @examples
+#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
+#################################################################################################
+#################################### Bulk update writer ################################
+#################################################################################################
+bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
+  # `[[` plus as.character() keeps a factor id as its label; paste0() on the one-column frame `x[1]` would emit the integer level code
+  doc_id <- as.character(x[[1]])
+  paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',doc_id,'"}}
+  { "script" : { "source": "ctx._source.',varname,' = params.code", "lang" : "painless","params" : {"code":',jsonlite::toJSON(x[-1], collapse = FALSE),'}}}')
+}
\ No newline at end of file
diff --git a/R/dfm_gen.R b/R/dfm_gen.R
new file mode 100644
index 0000000..65b024c
--- /dev/null
+++ b/R/dfm_gen.R
@@ -0,0 +1,48 @@
+#' Generates dfm from ElasticSearch output
+#'
+#' Rebuilds one pseudo-document per row with merger(), derives the codes, junk and aggregate document variables, and turns the result into a quanteda dfm.
+#' @param out The elasticizer-generated data frame
+#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
+#' @return A Quanteda dfm
+#' @export
+#' @examples
+#' dfm_gen(out, words = '999')
+
+
+#################################################################################################
+#################################### DFM generator #############################
+#################################################################################################
+
+# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
+
+dfm_gen <- function(out,words = '999') {
+  # Keep only the id and anything belonging to the source field
+  out <- out %>%
+    select(`_id`, matches("_source.*"))
+  # mclapply() cannot use more than one core on Windows, and detectCores() may return NA
+  cores <- if (.Platform$OS.type == "windows") 1L else max(1L, detectCores(), na.rm = TRUE)
+  # seq_len() yields an empty index for a zero-row frame, where seq(1, 0, 1) stops with a "wrong sign" error
+  out$merged <- as.character(unlist(mclapply(seq_len(nrow(out)), merger, words = words, out = out, mc.cores = cores)))
+  out <- out %>%
+    mutate(codes = case_when(
+      .$`_source.codes.timeSpent` == -1 ~ NA_character_,
+      TRUE ~ .$`_source.codes.majorTopic`
+    )
+    ) %>%
+    mutate(junk = case_when(
+      .$codes == 2301 ~ 1,
+      .$codes == 3101 ~ 1,
+      .$codes == 34 ~ 1,
+      .$`_source.codes.timeSpent` == -1 ~ NA_real_,
+      TRUE ~ 0
+    )
+    ) %>%
+    mutate(aggregate = .$codes %>%
+             str_pad(4, side="right", pad="a") %>%
+             str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
+             .[,2] %>%
+             as.numeric()
+    )
+  corpus(out$merged, docnames = out$`_id`, docvars = out[, c("codes", "junk", "aggregate")]) %>%
+    dfm(tolower = TRUE, stem = FALSE, remove_punct = TRUE, valuetype = "regex", ngrams = 1)
+}
\ No newline at end of file
diff --git a/R/elastic_update.R b/R/elastic_update.R
new file mode 100644
index 0000000..6e9621d
--- /dev/null
+++ b/R/elastic_update.R
@@ -0,0 +1,28 @@
+#' Push a line-delimited JSON string to Elasticsearch as bulk update
+#'
+#' Posts the bulk body to the _bulk endpoint as user "super", making at most 10 attempts with a minimum pause of 10 seconds between them.
+#' @param x Line-delimited JSON suitable for use as Elasticsearch bulk update
+#' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database
+#' @return An httr response object indicating the status of the update
+#' @export
+#' @examples
+#' elastic_update(x, es_super = 'secret')
+
+#################################################################################################
+#################################### Elasticsearch Updater ################################
+#################################################################################################
+elastic_update <- function(x, es_super = 'secret') {
+  # Terminate every element with a newline and collapse, so a multi-element x becomes one well-formed bulk body
+  bulk <- paste0(x, '\n', collapse = '')
+  # Credentials are sent via authenticate() instead of being pasted into the URL, where ':', '@' or '/' in the password would corrupt it
+  res <- httr::RETRY("POST", url = 'https://linux01.uis.no/es/_bulk?pretty'
+                     , body = bulk
+                     , encode = "raw"
+                     , httr::add_headers("Content-Type" = "application/json")
+                     , httr::authenticate("super", es_super)
+                     , times = 10
+                     , pause_min = 10
+  )
+  # The response is returned unchecked: callers have to inspect its status themselves (e.g. with httr::stop_for_status)
+  return(res)
+}
\ No newline at end of file
diff --git a/R/elasticizer.R b/R/elasticizer.R
new file mode 100644
index 0000000..1994db1
--- /dev/null
+++ b/R/elasticizer.R
@@ -0,0 +1,51 @@
+#' Generate a data frame out of unparsed Elasticsearch JSON
+#'
+#' Runs the query as a scrolling search (1000 hits per page) and binds all flattened result pages into one data frame.
+#' @param query A JSON-formatted query in the Elasticsearch query DSL
+#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
+#' @param index The name of the Elasticsearch index to search through
+#' @param es_pwd The (very secret, and thus not stored in any scripts!) password to use for read access to the database; the default prompts through .rs.askForPassword(), so pass it explicitly where that function is unavailable
+#' @return A data frame containing all the search results, or the string "No results found" when the query matches nothing
+#' @export
+#' @examples
+#' elasticizer(query, src = TRUE, index = "maml")
+#################################################################################################
+#################################### Get data from ElasticSearch ################################
+#################################################################################################
+elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")){
+  connect(es_port = 443,
+          es_transport = 'https',
+          es_host = 'linux01.uis.no',
+          es_path = 'es',
+          es_user = 'es',
+          es_pwd = es_pwd,
+          errors = 'complete')
+  # One if/else instead of two independent `==` tests, so `res` is always assigned
+  if (src) {
+    res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=TRUE)
+  } else {
+    res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=TRUE, source = FALSE)
+  }
+  json <- fromJSON(res)
+  total <- json$hits$total
+  if (total == 0) {
+    return("No results found")
+  }
+  # Collect the pages in a list and bind them once, rather than re-copying the growing frame on every page.
+  # jsonlite::flatten is exported, so `::` suffices (the prefix still avoids purrr::flatten from the tidyverse).
+  pages <- list(jsonlite::flatten(json$hits$hits))
+  batch <- 1
+  print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
+  repeat {
+    # Every response carries the scroll id that the next page is requested with
+    res <- scroll(json$`_scroll_id`, time_scroll="5m", raw=TRUE)
+    json <- fromJSON(res)
+    if (length(json$hits$hits) == 0) break
+    batch <- batch+1
+    print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
+    pages[[batch]] <- jsonlite::flatten(json$hits$hits)
+  }
+  # A single page is returned untouched, exactly as it was before pages were collected in a list
+  if (length(pages) == 1) return(pages[[1]])
+  bind_rows(pages)
+}
\ No newline at end of file
diff --git a/R/merger.R b/R/merger.R
new file mode 100644
index 0000000..8d17318
--- /dev/null
+++ b/R/merger.R
@@ -0,0 +1,28 @@
+#' Merges list of lemmas back into a pseudo-document
+#'
+#' Joins the |-separated lemmas of one document with spaces, re-attaches $-marked punctuation, strips tokens containing numbers and optionally truncates the result.
+#' @param row A row number from the Elasticizer-generated data frame
+#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
+#' @param out The elasticizer-generated data frame; it must always be supplied, because the default refers to itself and cannot be evaluated
+#' @return A documentified string of lemmas, one document at a time
+#' @export
+#' @examples
+#' merger(1, words = '999', out = out)
+#################################################################################################
+#################################### Reconstructing documents from lemmas########################
+#################################################################################################
+## Only merging lemmas for now, feature selection has no impact on junk classification
+merger <- function(row, words = '999', out = out) {
+  if (missing(out)) stop("`out` must be supplied: its default refers to itself", call. = FALSE)
+  # Merging lemmas into single string (only the cell of the requested row is read, the row itself is not copied)
+  lemmas <- paste(str_split(out[["_source.tokens.lemmas"]][row], "\\|")[[1]],collapse = ' ')
+  # Replacing $-marked punctuation with their regular forms
+  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
+    ### Removing numbers and non-words containing numbers
+    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
+    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
+    paste0(.,". ")
+  if (words != "999") {
+    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))}
+  return(lemmas)
+}
\ No newline at end of file
diff --git a/man/bulk_writer.Rd b/man/bulk_writer.Rd
new file mode 100644
index 0000000..0b837c7
--- /dev/null
+++ b/man/bulk_writer.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/bulk_writer.R
+\name{bulk_writer}
+\alias{bulk_writer}
+\title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates}
+\usage{
+bulk_writer(x, index = "maml", varname = "updated_variable")
+}
+\arguments{
+\item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)}
+
+\item{index}{The name of the Elasticsearch index to update}
+
+\item{varname}{String indicating the parent variable that should be updated (when it does not exist, it will be created)}
+}
+\value{
+A string usable as Elasticsearch bulk update command, in line-delimited JSON
+}
+\description{
+Generate a line-delimited JSON string for use in Elasticsearch bulk updates
+}
+\examples{
+bulk_writer(x, index = 'maml', varname = 'updated_variable')
+}
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
new file mode 100644
index 0000000..6f94afb
--- /dev/null
+++ b/man/dfm_gen.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dfm_gen.R
+\name{dfm_gen}
+\alias{dfm_gen}
+\title{Generates dfm from ElasticSearch output}
+\usage{
+dfm_gen(out, words = "999")
+}
+\arguments{
+\item{out}{The elasticizer-generated data frame}
+
+\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
+}
+\value{
+A Quanteda dfm
+}
+\description{
+Generates dfm from ElasticSearch output
+}
+\examples{
+dfm_gen(out, words = '999')
+}
diff --git a/man/elastic_update.Rd b/man/elastic_update.Rd
new file mode 100644
index 0000000..1b5a43f
--- /dev/null
+++ b/man/elastic_update.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/elastic_update.R
+\name{elastic_update}
+\alias{elastic_update}
+\title{Push a line-delimited JSON string to Elasticsearch as bulk update}
+\usage{
+elastic_update(x, es_super = "secret")
+}
+\arguments{
+\item{x}{Line-delimited JSON suitable for use as Elasticsearch bulk update}
+
+\item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database}
+}
+\value{
+An httr response object indicating the status of the update
+}
+\description{
+Push a line-delimited JSON string to Elasticsearch as bulk update
+}
+\examples{
+elastic_update(x, es_super = 'secret')
+}
diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd
new file mode 100644
index 0000000..b88f751
--- /dev/null
+++ b/man/elasticizer.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/elasticizer.R
+\name{elasticizer}
+\alias{elasticizer}
+\title{Generate a data frame out of unparsed Elasticsearch JSON}
+\usage{
+elasticizer(query, src = T, index = "maml", es_pwd = .rs.askForPassword("Elasticsearch READ"))
+}
+\arguments{
+\item{query}{A JSON-formatted query in the Elasticsearch query DSL}
+
+\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}
+
+\item{index}{The name of the Elasticsearch index to search through}
+
+\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
+}
+\value{
+A data frame containing all the search results
+}
+\description{
+Generate a data frame out of unparsed Elasticsearch JSON
+}
+\examples{
+elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
+}
diff --git a/man/merger.Rd b/man/merger.Rd
new file mode 100644
index 0000000..0b1739b
--- /dev/null
+++ b/man/merger.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/merger.R
+\name{merger}
+\alias{merger}
+\title{Merges list of lemmas back into a pseudo-document}
+\usage{
+merger(row, words = "999", out = out)
+}
+\arguments{
+\item{row}{A row number from the Elasticizer-generated data frame}
+
+\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
+
+\item{out}{The elasticizer-generated data frame}
+}
+\value{
+A documentified string of lemmas, one document at a time
+}
+\description{
+Merges list of lemmas back into a pseudo-document
+}
+\examples{
+merger(1, words = '999', out = out)
+}