First release of mamlr package

master
Erik de Vries 6 years ago
commit 4bbe84ab83

@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$

3
.gitignore vendored

@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData

@ -0,0 +1,15 @@
Package: maml
Title: General functions for the MaML project
Version: 0.1
Authors@R: person("Erik", "de Vries", role = c("aut", "cre"))
Description: Provide general functions in support of the MaML project, like data retrieval and parsing
Depends: R (>= 3.4.4)
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
Imports: elastic,
    jsonlite,
    parallel,
    tidyverse,
    quanteda

@ -0,0 +1,21 @@
Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace

@ -0,0 +1,7 @@
# Generated by roxygen2: do not edit by hand
export(bulk_writer)
export(dfm_gen)
export(elastic_update)
export(elasticizer)
export(merger)

@ -0,0 +1,19 @@
#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#'
#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
#' @param index The name of the Elasticsearch index to update
#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created)
#' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON
#' @export
#' @examples
#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
#################################################################################################
#################################### Bulk update writer ################################
#################################################################################################
bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
  # Bulk action line: addresses the target document by its id (first element of x)
  action <- paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}')
  # Painless script line: writes the JSON-converted remainder of x into varname
  script <- paste0('{ "script" : { "source": "ctx._source.',varname,' = params.code", "lang" : "painless","params" : {"code":',toJSON(x[-1], collapse = FALSE),'}}}')
  # Bulk API requires the action and the request body on separate lines
  paste0(action, '\n', script)
}

@ -0,0 +1,48 @@
#' Generates dfm from ElasticSearch output
#'
#' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @return A Quanteda dfm
#' @export
#' @examples
#' dfm_gen(out, words = '999')
#################################################################################################
#################################### DFM generator #############################
#################################################################################################
# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out, words = '999') {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  # Reassemble one pseudo-document per row from its lemmas, in parallel
  out$merged <- unlist(mclapply(seq_len(nrow(out)), merger, words = words, out = out, mc.cores = detectCores()))
  out <- out %>%
    mutate(codes = case_when(
      # timeSpent == -1 marks uncodable documents: no topic code assigned
      .$`_source.codes.timeSpent` == -1 ~ NA_character_,
      TRUE ~ .$`_source.codes.majorTopic`
      )
    ) %>%
    mutate(junk = case_when(
      # Codes 2301, 3101 and 34 are treated as junk categories
      .$codes == 2301 ~ 1,
      .$codes == 3101 ~ 1,
      .$codes == 34 ~ 1,
      .$`_source.codes.timeSpent` == -1 ~ NA_real_,
      TRUE ~ 0
      )
    ) %>%
    # Aggregate topic: leading one-or-two-digit major code, extracted after
    # right-padding with 'a' so short codes still match the pattern
    mutate(aggregate = .$codes %>%
             str_pad(4, side = "right", pad = "a") %>%
             str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
             .[, 2] %>%
             as.numeric()
    )
  # Use merged text as the corpus; the last three columns added above
  # (codes, junk, aggregate) become the document variables
  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[, -seq_len(length(names(out)) - 3)]) %>%
    dfm(tolower = TRUE, stem = FALSE, remove_punct = TRUE, valuetype = "regex", ngrams = 1)
  return(dfm)
}

@ -0,0 +1,28 @@
#' Push a line-delimited JSON string to Elasticsearch as bulk update
#'
#' Push a line-delimited JSON string to Elasticsearch as bulk update
#' @param x Line-delimited JSON suitable for use as Elasticsearch bulk update
#' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database
#' @return An html response object indicating the status of the update
#' @export
#' @examples
#' elastic_update(x, es_super = 'secret')
#################################################################################################
#################################### Elasticsearch Updater ################################
#################################################################################################
elastic_update <- function(x, es_super = 'secret') {
  # The bulk API requires the payload to end with a newline
  payload <- paste0(x,'\n')
  endpoint <- paste0('https://super:',es_super,'@linux01.uis.no/es/_bulk?pretty')
  # Retry up to 10 times, pausing at least 10 seconds between attempts
  response <- RETRY("POST",
                    url = endpoint,
                    body = payload,
                    encode = "raw",
                    add_headers("Content-Type" = "application/json"),
                    times = 10,
                    pause_min = 10)
  return(response)
}

@ -0,0 +1,51 @@
#' Generate a data frame out of unparsed Elasticsearch JSON
#'
#' Generate a data frame out of unparsed Elasticsearch JSON
#' @param query A JSON-formatted query in the Elasticsearch query DSL
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through
#' @param es_pwd The (very secret, and thus not stored in any scripts!) password to use for read access to the database
#' @return A data frame containing all the search results
#' @export
#' @examples
#' elasticizer(query, src = TRUE, index = "maml")
#################################################################################################
#################################### Get data from ElasticSearch ################################
#################################################################################################
elasticizer <- function(query, src = TRUE, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")){
  connect(es_port = 443,
          es_transport = 'https',
          es_host = 'linux01.uis.no',
          es_path = 'es',
          es_user = 'es',
          es_pwd = es_pwd,
          errors = 'complete')
  # First batch: open a 5-minute scroll context, fetching 1000 hits at a time
  if (src) {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = TRUE)
  } else {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = TRUE, source = FALSE)
  }
  json <- fromJSON(res)
  if (json$hits$total == 0) {
    return("No results found")
  }
  # flatten() is exported by jsonlite; no need to reach into internals with :::
  out <- jsonlite::flatten(json$hits$hits)
  total <- json$hits$total
  hits <- 1
  batch <- 1
  print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
  # Keep scrolling until a batch comes back empty
  while (hits != 0) {
    res <- scroll(json$`_scroll_id`, time_scroll = "5m", raw = TRUE)
    json <- fromJSON(res)
    hits <- length(json$hits$hits)
    if (hits > 0) {
      batch <- batch + 1
      print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
      out <- bind_rows(out, jsonlite::flatten(json$hits$hits))
    }
  }
  return(out)
}

@ -0,0 +1,28 @@
#' Merges list of lemmas back into a pseudo-document
#'
#' Merges list of lemmas back into a pseudo-document
#' @param row A row number from the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger(1, words = '999', out = out)
#################################################################################################
#################################### Reconstructing documents from lemmas########################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(row, words = '999', out = out) {
  doc <- out[row,]
  # Rebuild one space-separated string from the pipe-delimited lemma field
  lemmas <- paste(str_split(doc$`_source.tokens.lemmas`, "\\|")[[1]], collapse = ' ')
  # Strip the $ marker tagging punctuation, restoring its regular form
  lemmas <- str_replace_all(lemmas, " \\$(.+?)", "\\1")
  # Remove numbers and non-words containing digits/@/#, keeping the separator
  lemmas <- str_replace_all(lemmas, "\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2")
  # Append ". " so strings shorter than the word limit still end on ". "
  lemmas <- paste0(lemmas, ". ")
  if (words != "999") {
    # Truncate to at most `words` words, cutting at the following sentence end
    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,", words, "}[\\s\\S]*?[.!?])\\s+?"))
  }
  return(lemmas)
}

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/bulk_writer.R
\name{bulk_writer}
\alias{bulk_writer}
\title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates}
\usage{
bulk_writer(x, index = "maml", varname = "updated_variable")
}
\arguments{
\item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)}
\item{index}{The name of the Elasticsearch index to update}
\item{varname}{String indicating the parent variable that should be updated (when it does not exist, it will be created)}
}
\value{
A string usable as Elasticsearch bulk update command, in line-delimited JSON
}
\description{
Generate a line-delimited JSON string for use in Elasticsearch bulk updates
}
\examples{
bulk_writer(x, index = 'maml', varname = 'updated_variable')
}

@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_gen.R
\name{dfm_gen}
\alias{dfm_gen}
\title{Generates dfm from ElasticSearch output}
\usage{
dfm_gen(out, words = "999")
}
\arguments{
\item{out}{The elasticizer-generated data frame}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
}
\value{
A Quanteda dfm
}
\description{
Generates dfm from ElasticSearch output
}
\examples{
dfm_gen(out, words = '999')
}

@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/elastic_update.R
\name{elastic_update}
\alias{elastic_update}
\title{Push a line-delimited JSON string to Elasticsearch as bulk update}
\usage{
elastic_update(x, es_super = "secret")
}
\arguments{
\item{x}{Line-delimited JSON suitable for use as Elasticsearch bulk update}
\item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database}
}
\value{
An html response object indicating the status of the update
}
\description{
Push a line-delimited JSON string to Elasticsearch as bulk update
}
\examples{
elastic_update(x, es_super = 'secret')
}

@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/elasticizer.R
\name{elasticizer}
\alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
elasticizer(query, src = T, index = "maml", es_pwd = "unknown")
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}
\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}
\item{index}{The name of the Elasticsearch index to search through}
\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
}
\value{
A data frame containing all the search results
}
\description{
Generate a data frame out of unparsed Elasticsearch JSON
}
\examples{
elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
}

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/merger.R
\name{merger}
\alias{merger}
\title{Merges list of lemmas back into a pseudo-document}
\usage{
merger(row, words = "999", out = out)
}
\arguments{
\item{row}{A row number from the Elasticizer-generated data frame}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{out}{The elasticizer-generated data frame}
}
\value{
A documentified string of lemmas, one document at a time
}
\description{
Merges list of lemmas back into a pseudo-document
}
\examples{
merger(1, words = '999', out = out)
}
Loading…
Cancel
Save