From 4bbe84ab83b1baad92797d8814b4eb5e8c7de14a Mon Sep 17 00:00:00 2001
From: Erik de Vries
Date: Mon, 22 Oct 2018 12:07:53 +0200
Subject: [PATCH] First release of mamlr package

---
Review note: some embedded files were edited after this patch was generated.
Hunk lengths and the diffstat below were updated to match, but the blob ids on
the index lines of the edited files were not regenerated; they should only
matter for a 3-way fallback, not for a plain `git apply` / `git am`.

 .Rbuildignore         |  2 ++
 .gitignore            |  3 +++
 DESCRIPTION           | 16 ++++++++++++++++
 MaML.Rproj            | 21 +++++++++++++++++++++
 NAMESPACE             |  7 +++++++
 R/bulk_writer.R       | 21 +++++++++++++++++++++
 R/dfm_gen.R           | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 R/elastic_update.R    | 32 ++++++++++++++++++++++++++++++++
 R/elasticizer.R       | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 R/merger.R            | 30 ++++++++++++++++++++++++++++++
 man/bulk_writer.Rd    | 24 ++++++++++++++++++++++++
 man/dfm_gen.Rd        | 22 ++++++++++++++++++++++
 man/elastic_update.Rd | 22 ++++++++++++++++++++++
 man/elasticizer.Rd    | 27 +++++++++++++++++++++++++++
 man/merger.Rd         | 24 ++++++++++++++++++++++++
 15 files changed, 351 insertions(+)
 create mode 100644 .Rbuildignore
 create mode 100644 .gitignore
 create mode 100644 DESCRIPTION
 create mode 100644 MaML.Rproj
 create mode 100644 NAMESPACE
 create mode 100644 R/bulk_writer.R
 create mode 100644 R/dfm_gen.R
 create mode 100644 R/elastic_update.R
 create mode 100644 R/elasticizer.R
 create mode 100644 R/merger.R
 create mode 100644 man/bulk_writer.Rd
 create mode 100644 man/dfm_gen.Rd
 create mode 100644 man/elastic_update.Rd
 create mode 100644 man/elasticizer.Rd
 create mode 100644 man/merger.Rd

diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000..91114bf
--- /dev/null
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..807ea25
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..91999da
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,16 @@
+Package: maml
+Title: General functions for the MaML project
+Version: 0.1
+Authors@R: Erik de Vries
+Description: Provide general functions in support of the MaML project, like data retrieval and parsing
+License: Copyright Erik de Vries
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.1.0
+Depends: R (>= 3.4.4),
+    elastic,
+    httr,
+    jsonlite,
+    parallel,
+    tidyverse,
+    quanteda
diff --git a/MaML.Rproj b/MaML.Rproj
new file mode 100644
index 0000000..cba1b6b
--- /dev/null
+++ b/MaML.Rproj
@@ -0,0 +1,21 @@
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..b7c4b7a
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,7 @@
+# Generated by roxygen2: do not edit by hand
+
+export(bulk_writer)
+export(dfm_gen)
+export(elastic_update)
+export(elasticizer)
+export(merger)
diff --git a/R/bulk_writer.R b/R/bulk_writer.R
new file mode 100644
index 0000000..91e90d9
--- /dev/null
+++ b/R/bulk_writer.R
@@ -0,0 +1,21 @@
+#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
+#'
+#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
+#' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
+#' @param index The name of the Elasticsearch index to update
+#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created)
+#' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON
+#' @export
+#' @examples
+#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
+#################################################################################################
+#################################### Bulk update writer ################################
+#################################################################################################
+bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
+  # The action line and the script line are joined by an explicit "\n", so the
+  # payload does not depend on how this source file happens to be indented
+  paste0(
+    '{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}\n',
+    '{ "script" : { "source": "ctx._source.',varname,' = params.code", "lang" : "painless","params" : {"code":',toJSON(x[-1], collapse = FALSE),'}}}'
+  )
+}
diff --git a/R/dfm_gen.R b/R/dfm_gen.R
new file mode 100644
index 0000000..65b024c
--- /dev/null
+++ b/R/dfm_gen.R
@@ -0,0 +1,49 @@
+#' Generates dfm from ElasticSearch output
+#'
+#' Generates dfm from ElasticSearch output
+#' @param out The elasticizer-generated data frame
+#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
+#' @return A Quanteda dfm
+#' @export
+#' @examples
+#' dfm_gen(out, words = '999')
+
+
+#################################################################################################
+#################################### DFM generator #############################
+#################################################################################################
+
+# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
+
+dfm_gen <- function(out,words = '999') {
+  # Create subset with just ids, codes and text
+  out <- out %>%
+    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
+  # One pseudo-document per row; seq_len() avoids the seq(1, 0, 1) error on a zero-row data frame
+  out$merged <- unlist(mclapply(seq_len(nrow(out)), merger, words = words, out = out, mc.cores = detectCores()))
+  # out$codes <- out$`_source.codes.majorTopic` %>%
+  out <- out %>%
+    mutate(codes = case_when(
+      .$`_source.codes.timeSpent` == -1 ~ NA_character_,
+      TRUE ~ .$`_source.codes.majorTopic`
+      )
+    ) %>%
+    mutate(junk = case_when(
+      .$codes == 2301 ~ 1,
+      .$codes == 3101 ~ 1,
+      .$codes == 34 ~ 1,
+      .$`_source.codes.timeSpent` == -1 ~ NA_real_,
+      TRUE ~ 0
+      )
+    ) %>%
+    mutate(aggregate = .$codes %>%
+      str_pad(4, side="right", pad="a") %>%
+      str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
+      .[,2] %>%
+      as.numeric()
+    )
+  # codes, junk and aggregate were added last, so dropping all but the final three columns keeps exactly those as docvars
+  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[,-seq(1,(length(names(out))-3),1)]) %>%
+    dfm(tolower = TRUE, stem = FALSE, remove_punct = TRUE, valuetype = "regex", ngrams = 1)
+  return(dfm)
+}
diff --git a/R/elastic_update.R b/R/elastic_update.R
new file mode 100644
index 0000000..6e9621d
--- /dev/null
+++ b/R/elastic_update.R
@@ -0,0 +1,32 @@
+#' Push a line-delimited JSON string to Elasticsearch as bulk update
+#'
+#' Push a line-delimited JSON string to Elasticsearch as bulk update
+#' @param x Line-delimited JSON suitable for use as Elasticsearch bulk update
+#' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database
+#' @return An httr response object indicating the status of the update
+#' @export
+#' @examples
+#' elastic_update(x, es_super = 'secret')
+
+#################################################################################################
+#################################### Elasticsearch Updater ################################
+#################################################################################################
+elastic_update <- function(x, es_super = 'secret') {
+  # A trailing newline is appended to the payload before it is posted
+  bulk <- paste0(x,'\n')
+  url <- 'https://linux01.uis.no/es/_bulk?pretty'
+  res <- RETRY("POST", url = url
+    , body = bulk
+    , encode = "raw"
+    , add_headers("Content-Type" = "application/json")
+    # Credentials are sent as basic auth instead of being pasted into the URL,
+    # so passwords containing characters such as '@', ':' or '/' keep working
+    , authenticate("super", es_super)
+    , times = 10
+    , pause_min = 10
+  )
+  # stop_for_status(res)
+  # content(res, "parsed", "application/json")
+  # appData <- content(res)
+  return(res)
+}
diff --git a/R/elasticizer.R b/R/elasticizer.R
new file mode 100644
index 0000000..1994db1
--- /dev/null
+++ b/R/elasticizer.R
@@ -0,0 +1,51 @@
+#' Generate a data frame out of unparsed Elasticsearch JSON
+#'
+#' Generate a data frame out of unparsed Elasticsearch JSON
+#' @param query A JSON-formatted query in the Elasticsearch query DSL
+#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
+#' @param index The name of the Elasticsearch index to search through
+#' @param es_pwd The (very secret, and thus not stored in any scripts!) password to use for read access to the database; the default prompts for it through RStudio
+#' @return A data frame containing all the search results
+#' @export
+#' @examples
+#' elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
+#################################################################################################
+#################################### Get data from ElasticSearch ################################
+#################################################################################################
+elasticizer <- function(query, src = TRUE, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")){
+  connect(es_port = 443,
+    es_transport = 'https',
+    es_host = 'linux01.uis.no',
+    es_path = 'es',
+    es_user = 'es',
+    es_pwd = es_pwd,
+    errors = 'complete')
+  # Get all results - one approach is to use a while loop
+  if (src) {
+    res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=TRUE)
+  } else {
+    res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=TRUE, source = FALSE)
+  }
+  json <- fromJSON(res)
+  if (json$hits$total == 0) {
+    return("No results found")
+  } else {
+    # Namespace-qualified because purrr (attached through tidyverse) also exports flatten()
+    out <- jsonlite::flatten(json$hits$hits)
+    total <- json$hits$total
+    hits <- 1
+    batch <- 1
+    print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
+    while(hits != 0){
+      res <- scroll(json$`_scroll_id`, time_scroll="5m", raw=TRUE)
+      json <- fromJSON(res)
+      hits <- length(json$hits$hits)
+      if(hits > 0) {
+        batch <- batch+1
+        print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
+        out <- bind_rows(out, jsonlite::flatten(json$hits$hits))
+      }
+    }
+    return(out)
+  }
+}
diff --git a/R/merger.R b/R/merger.R
new file mode 100644
index 0000000..8d17318
--- /dev/null
+++ b/R/merger.R
@@ -0,0 +1,30 @@
+#' Merges list of lemmas back into a pseudo-document
+#'
+#' Merges list of lemmas back into a pseudo-document
+#' @param row A row number from the Elasticizer-generated data frame
+#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
+#' @param out The elasticizer-generated data frame
+#' @return A documentified string of lemmas, one document at a time
+#' @export
+#' @examples
+#' merger(1, words = '999', out = out)
+#################################################################################################
+#################################### Reconstructing documents from lemmas########################
+#################################################################################################
+## Only merging lemmas for now, feature selection has no impact on junk classification
+## `out` has no default: the former `out = out` referred to itself and failed with a recursive default argument error when used
+merger <- function(row, words = '999', out) {
+  df <- out[row,]
+  # Merging lemmas into single string
+  lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
+  # Replacing $-marked punctuation with their regular forms
+  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
+    ### Removing numbers and non-words containing numbers
+    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
+    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
+    paste0(.,". ")
+  if (words != "999") {
+    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
+  }
+  return(lemmas)
+}
diff --git a/man/bulk_writer.Rd b/man/bulk_writer.Rd
new file mode 100644
index 0000000..0b837c7
--- /dev/null
+++ b/man/bulk_writer.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/bulk_writer.R
+\name{bulk_writer}
+\alias{bulk_writer}
+\title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates}
+\usage{
+bulk_writer(x, index = "maml", varname = "updated_variable")
+}
+\arguments{
+\item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)}
+
+\item{index}{The name of the Elasticsearch index to update}
+
+\item{varname}{String indicating the parent variable that should be updated (when it does not exist, it will be created)}
+}
+\value{
+A string usable as Elasticsearch bulk update command, in line-delimited JSON
+}
+\description{
+Generate a line-delimited JSON string for use in Elasticsearch bulk updates
+}
+\examples{
+bulk_writer(x, index = 'maml', varname = 'updated_variable')
+}
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
new file mode 100644
index 0000000..6f94afb
--- /dev/null
+++ b/man/dfm_gen.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dfm_gen.R
+\name{dfm_gen}
+\alias{dfm_gen}
+\title{Generates dfm from ElasticSearch output}
+\usage{
+dfm_gen(out, words = "999")
+}
+\arguments{
+\item{out}{The elasticizer-generated data frame}
+
+\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
+}
+\value{
+A Quanteda dfm
+}
+\description{
+Generates dfm from ElasticSearch output
+}
+\examples{
+dfm_gen(out, words = '999')
+}
diff --git a/man/elastic_update.Rd b/man/elastic_update.Rd
new file mode 100644
index 0000000..1b5a43f
--- /dev/null
+++ b/man/elastic_update.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/elastic_update.R
+\name{elastic_update}
+\alias{elastic_update}
+\title{Push a line-delimited JSON string to Elasticsearch as bulk update}
+\usage{
+elastic_update(x, es_super = "secret")
+}
+\arguments{
+\item{x}{Line-delimited JSON suitable for use as Elasticsearch bulk update}
+
+\item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database}
+}
+\value{
+An httr response object indicating the status of the update
+}
+\description{
+Push a line-delimited JSON string to Elasticsearch as bulk update
+}
+\examples{
+elastic_update(x, es_super = 'secret')
+}
diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd
new file mode 100644
index 0000000..b88f751
--- /dev/null
+++ b/man/elasticizer.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/elasticizer.R
+\name{elasticizer}
+\alias{elasticizer}
+\title{Generate a data frame out of unparsed Elasticsearch JSON}
+\usage{
+elasticizer(query, src = TRUE, index = "maml",
+  es_pwd = .rs.askForPassword("Elasticsearch READ"))
+}
+\arguments{
+\item{query}{A JSON-formatted query in the Elasticsearch query DSL}
+
+\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}
+
+\item{index}{The name of the Elasticsearch index to search through}
+
+\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database; the default prompts for it through RStudio}
+}
+\value{
+A data frame containing all the search results
+}
+\description{
+Generate a data frame out of unparsed Elasticsearch JSON
+}
+\examples{
+elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
+}
diff --git a/man/merger.Rd b/man/merger.Rd
new file mode 100644
index 0000000..0b1739b
--- /dev/null
+++ b/man/merger.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/merger.R
+\name{merger}
+\alias{merger}
+\title{Merges list of lemmas back into a pseudo-document}
+\usage{
+merger(row, words = "999", out)
+}
+\arguments{
+\item{row}{A row number from the Elasticizer-generated data frame}
+
+\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
+
+\item{out}{The elasticizer-generated data frame}
+}
+\value{
+A documentified string of lemmas, one document at a time
+}
+\description{
+Merges list of lemmas back into a pseudo-document
+}
+\examples{
+merger(1, words = '999', out = out)
+}