commit 4bbe84ab83
.Rbuildignore
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
.gitignore
@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData
DESCRIPTION
@@ -0,0 +1,15 @@
Package: maml
Title: General functions for the MaML project
Version: 0.1
Authors@R: person("Erik", "de Vries")
Description: Provides general functions in support of the MaML project, such as data retrieval and parsing
Depends: R (>= 3.4.4)
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
Imports: elastic,
    jsonlite,
    parallel,
    tidyverse,
    quanteda
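With this DESCRIPTION in place, the usual development loop is driven by devtools, as the .Rproj settings below also assume. A minimal sketch (assumes devtools is installed and the working directory is the package root):

    # Regenerate NAMESPACE and man/*.Rd from the roxygen2 comments, then install
    devtools::document()
    devtools::install()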
maml.Rproj
@@ -0,0 +1,21 @@
Version: 1.0

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
NAMESPACE
@@ -0,0 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(bulk_writer)
export(dfm_gen)
export(elastic_update)
export(elasticizer)
export(merger)
R/bulk_writer.R
@@ -0,0 +1,19 @@
#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#'
#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
#' @param index The name of the Elasticsearch index to update
#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created)
#' @return A string usable as an Elasticsearch bulk update command, in line-delimited JSON
#' @export
#' @examples
#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
#################################################################################################
#################################### Bulk update writer ########################################
#################################################################################################
bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
  return(
    paste0('{"update": {"_index": "', index, '", "_type": "doc", "_id": "', x[1], '"}}
{ "script" : { "source": "ctx._source.', varname, ' = params.code", "lang" : "painless", "params" : {"code":', toJSON(x[-1], collapse = F), '}}}')
  )
}
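As an input illustration, a hypothetical one-row data frame whose first column holds the document _id could be turned into a bulk command as follows (a sketch; df_row and the junk variable are made-up names, and jsonlite must be loaded for toJSON):

    library(jsonlite)
    # Hypothetical single-row input: first column is the document id,
    # remaining columns hold the values stored under `varname`
    df_row <- data.frame(`_id` = "doc-123", junk = 1, check.names = FALSE)
    # Prints two NDJSON lines: the update action and the painless script body
    cat(bulk_writer(df_row, index = 'maml', varname = 'junk'))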
R/dfm_gen.R
@@ -0,0 +1,48 @@
#' Generates a dfm from Elasticsearch output
#'
#' Generates a dfm from Elasticsearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @return A Quanteda dfm
#' @export
#' @examples
#' dfm_gen(out, words = '999')


#################################################################################################
#################################### DFM generator #############################################
#################################################################################################

# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

dfm_gen <- function(out, words = '999') {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  fields <- length(names(out))
  out$merged <- unlist(mclapply(seq(1, length(out[[1]]), 1), merger, words = words, out = out, mc.cores = detectCores()))
  # out$codes <- out$`_source.codes.majorTopic` %>%
  out <- out %>%
    mutate(codes = case_when(
      .$`_source.codes.timeSpent` == -1 ~ NA_character_,
      TRUE ~ .$`_source.codes.majorTopic`
      )
    ) %>%
    mutate(junk = case_when(
      .$codes == 2301 ~ 1,
      .$codes == 3101 ~ 1,
      .$codes == 34 ~ 1,
      .$`_source.codes.timeSpent` == -1 ~ NA_real_,
      TRUE ~ 0
      )
    ) %>%
    mutate(aggregate = .$codes %>%
      str_pad(4, side = "right", pad = "a") %>%
      str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
      .[,2] %>%
      as.numeric()
    )
  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[, -seq(1, (length(names(out)) - 3), 1)]) %>%
    dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
  return(dfm)
}
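A hedged usage sketch of the intended call chain (assumes `out` was produced by elasticizer() and contains the `_source.codes.*` and `_source.tokens.lemmas` fields the function expects):

    # Cap each document at its first 150 words, then inspect the top features
    dfm_150 <- dfm_gen(out, words = '150')
    quanteda::topfeatures(dfm_150, n = 10)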
R/elastic_update.R
@@ -0,0 +1,28 @@
#' Push a line-delimited JSON string to Elasticsearch as a bulk update
#'
#' Push a line-delimited JSON string to Elasticsearch as a bulk update
#' @param x Line-delimited JSON suitable for use as an Elasticsearch bulk update
#' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database
#' @return An httr response object indicating the status of the update
#' @export
#' @examples
#' elastic_update(x, es_super = 'secret')

#################################################################################################
#################################### Elasticsearch Updater #####################################
#################################################################################################
elastic_update <- function(x, es_super = 'secret') {
  bulk <- paste0(x, '\n')
  url <- paste0('https://super:', es_super, '@linux01.uis.no/es/_bulk?pretty')
  res <- RETRY("POST", url = url
               , body = bulk
               , encode = "raw"
               , add_headers("Content-Type" = "application/json")
               , times = 10
               , pause_min = 10
  )
  # stop_for_status(res)
  # content(res, "parsed", "application/json")
  # appData <- content(res)
  return(res)
}
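The function is meant to consume bulk_writer() output; a sketch of the round trip (annotations is a hypothetical data frame with the document _id in its first column):

    # Build one NDJSON command pair per row and join them with newlines,
    # then push everything to the cluster in a single bulk request
    bulk <- paste0(apply(annotations, 1, bulk_writer, index = 'maml', varname = 'junk'), collapse = '\n')
    res <- elastic_update(bulk, es_super = .rs.askForPassword("Elasticsearch WRITE"))
    httr::status_code(res)  # 200 signals the request itself succeeded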
R/elasticizer.R
@@ -0,0 +1,51 @@
#' Generate a data frame out of unparsed Elasticsearch JSON
#'
#' Generate a data frame out of unparsed Elasticsearch JSON
#' @param query A JSON-formatted query in the Elasticsearch query DSL
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through
#' @param es_pwd The (very secret, and thus not stored in any scripts!) password to use for read access to the database
#' @return A data frame containing all the search results
#' @export
#' @examples
#' elasticizer(query, src = TRUE, index = "maml")
#################################################################################################
#################################### Get data from Elasticsearch ###############################
#################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")) {
  connect(es_port = 443,
          es_transport = 'https',
          es_host = 'linux01.uis.no',
          es_path = 'es',
          es_user = 'es',
          es_pwd = es_pwd,
          errors = 'complete')
  # Get all results - one approach is to use a while loop
  if (src == T) {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = T)
  }
  if (src == F) {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = T, source = F)
  }
  json <- fromJSON(res)
  if (json$hits$total == 0) {
    return("No results found")
  } else {
    out <- jsonlite:::flatten(json$hits$hits)
    total <- json$hits$total
    hits <- 1
    batch <- 1
    print(paste0('Processing documents ', batch*1000-1000, ' through ', batch*1000, ' out of ', total, ' documents.'))
    while (hits != 0) {
      res <- scroll(json$`_scroll_id`, time_scroll = "5m", raw = T)
      json <- fromJSON(res)
      hits <- length(json$hits$hits)
      if (hits > 0) {
        batch <- batch + 1
        print(paste0('Processing documents ', batch*1000-1000, ' through ', batch*1000, ' out of ', total, ' documents.'))
        out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
      }
    }
    return(out)
  }
}
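A hedged usage sketch with a minimal match_all query in the Elasticsearch query DSL (the query string is illustrative; any valid DSL body should work):

    # Fetch every document in the index, including the _source fields,
    # scrolling through the results in batches of 1000
    query <- '{"query": {"match_all": {}}}'
    out <- elasticizer(query, src = TRUE, index = 'maml')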
R/merger.R
@@ -0,0 +1,28 @@
#' Merges a list of lemmas back into a pseudo-document
#'
#' Merges a list of lemmas back into a pseudo-document
#' @param row A row number from the elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @return A string of lemmas merged back into a single pseudo-document (one document per call)
#' @export
#' @examples
#' merger(1, words = '999', out = out)
#################################################################################################
#################################### Reconstructing documents from lemmas ######################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(row, words = '999', out = out) {
  df <- out[row, ]
  # Merging lemmas into a single string
  lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]], collapse = ' ')
  # Replacing $-marked punctuation with their regular forms
  lemmas <- str_replace_all(lemmas, " \\$(.+?)", "\\1") %>%
    ### Removing numbers and non-words containing numbers
    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
    # Adding an extra . at the end of the string to allow for strings that contain fewer words than requested and do not end on ". "
    paste0(., ". ")
  if (words != "999") {
    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,", words, "}[\\s\\S]*?[.!?])\\s+?"))
  }
  return(lemmas)
}
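For intuition, a self-contained sketch with a mock one-row data frame; the pipe-delimited string mimics the `_source.tokens.lemmas` field, and tidyverse must be loaded for the pipe and stringr calls:

    library(tidyverse)
    # Mock elasticizer output: lemmas are stored pipe-delimited per document,
    # with punctuation marked by a $ prefix
    out <- data.frame(`_source.tokens.lemmas` = "this|be|a|short|test|$.",
                      check.names = FALSE, stringsAsFactors = FALSE)
    merger(1, words = '999', out = out)  # returns the reconstructed pseudo-document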
man/bulk_writer.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/bulk_writer.R
\name{bulk_writer}
\alias{bulk_writer}
\title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates}
\usage{
bulk_writer(x, index = "maml", varname = "updated_variable")
}
\arguments{
\item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)}

\item{index}{The name of the Elasticsearch index to update}

\item{varname}{String indicating the parent variable that should be updated (when it does not exist, it will be created)}
}
\value{
A string usable as an Elasticsearch bulk update command, in line-delimited JSON
}
\description{
Generate a line-delimited JSON string for use in Elasticsearch bulk updates
}
\examples{
bulk_writer(x, index = 'maml', varname = 'updated_variable')
}
man/dfm_gen.Rd
@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_gen.R
\name{dfm_gen}
\alias{dfm_gen}
\title{Generates a dfm from Elasticsearch output}
\usage{
dfm_gen(out, words = "999")
}
\arguments{
\item{out}{The elasticizer-generated data frame}

\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
}
\value{
A Quanteda dfm
}
\description{
Generates a dfm from Elasticsearch output
}
\examples{
dfm_gen(out, words = '999')
}
man/elastic_update.Rd
@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/elastic_update.R
\name{elastic_update}
\alias{elastic_update}
\title{Push a line-delimited JSON string to Elasticsearch as a bulk update}
\usage{
elastic_update(x, es_super = "secret")
}
\arguments{
\item{x}{Line-delimited JSON suitable for use as an Elasticsearch bulk update}

\item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database}
}
\value{
An httr response object indicating the status of the update
}
\description{
Push a line-delimited JSON string to Elasticsearch as a bulk update
}
\examples{
elastic_update(x, es_super = 'secret')
}
man/elasticizer.Rd
@@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/elasticizer.R
\name{elasticizer}
\alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
elasticizer(query, src = T, index = "maml", es_pwd = "unknown")
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}

\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}

\item{index}{The name of the Elasticsearch index to search through}

\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
}
\value{
A data frame containing all the search results
}
\description{
Generate a data frame out of unparsed Elasticsearch JSON
}
\examples{
elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
}
man/merger.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/merger.R
\name{merger}
\alias{merger}
\title{Merges a list of lemmas back into a pseudo-document}
\usage{
merger(row, words = "999", out = out)
}
\arguments{
\item{row}{A row number from the elasticizer-generated data frame}

\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}

\item{out}{The elasticizer-generated data frame}
}
\value{
A string of lemmas merged back into a single pseudo-document (one document per call)
}
\description{
Merges a list of lemmas back into a pseudo-document
}
\examples{
merger(1, words = '999', out = out)
}