First release of mamlr package

Erik de Vries 6 years ago
commit 4bbe84ab83

.gitignore vendored

Package: maml
Title: General functions for the MaML project
Version: 0.1
Authors@R: Erik de Vries
Description: Provides general functions in support of the MaML project, such as data retrieval and parsing.
Depends: R (>= 3.4.4)
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
Depends: elastic,

Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace

#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#'
#' Builds a two-line bulk command: an `update` action line addressing the
#' document, followed by a script line that assigns the JSON-encoded payload
#' to `ctx._source.<varname>`.
#'
#' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is). The first element of `x` is used as the document `_id`; everything else (`x[-1]`) is passed to `toJSON()`.
#' @param index The name of the Elasticsearch index to update
#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created)
#' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON
#' @export
#' @examples
#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
#################################### Bulk update writer ################################
bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
  # Everything except the id becomes the script parameter.
  # NOTE(review): collapse = FALSE is forwarded as-is to toJSON(); presumably it
  # keeps a single row from being wrapped in a JSON array -- confirm against the
  # JSON package in use.
  payload <- toJSON(x[-1], collapse = FALSE)
  # The action line and the script line are separated by exactly one newline,
  # as required by the line-delimited bulk format.
  paste0(
    '{"update": {"_index": "', index, '", "_type": "doc", "_id": "', x[1], '"}}\n',
    '{ "script" : { "source": "ctx._source.', varname,
    ' = params.code", "lang" : "painless","params" : {"code":', payload, '}}}'
  )
}

#' Generates dfm from ElasticSearch output
#' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @return A Quanteda dfm
#' @export
#' @examples
#' dfm_gen(out, words = '999')
#################################### DFM generator #############################
# NOTE(review): several column references below are empty backtick pairs, which
# R rejects as zero-length names; the intended column names cannot be recovered
# from this file -- TODO restore them before use.
# NOTE(review): the closing parentheses of the mutate()/case_when() calls, the
# last step of the `aggregate` pipeline and the function's closing brace are not
# present in this listing -- TODO restore.
# filter(`` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out,words = '999') {
# Create subset with just ids, codes and text
out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
# NOTE(review): `fields` is assigned here but not used in the visible body
fields <- length(names(out))
# One merged pseudo-document per row: merger() is applied to every row index via
# mclapply(), with mc.cores set to detectCores()
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
# out$codes <- out$`` %>%
# codes: NA when the (unnamed) first column equals -1, otherwise the value of
# the (unnamed) second column
out <- out %>%
mutate(codes = case_when(
.$`` == -1 ~ NA_character_,
TRUE ~ .$``
) %>%
# junk: 1 for codes 2301, 3101 and 34; NA when the (unnamed) column equals -1;
# 0 in every other case (case_when uses the first matching condition)
mutate(junk = case_when(
.$codes == 2301 ~ 1,
.$codes == 3101 ~ 1,
.$codes == 34 ~ 1,
.$`` == -1 ~ NA_real_,
TRUE ~ 0
) %>%
# aggregate: codes are right-padded with "a" to width 4, matched against the
# pattern below, and the first capture group (column 2 of the str_match()
# result) is kept.
# NOTE(review): the pipeline ends in a dangling %>%; the final step is missing.
mutate(aggregate = .$codes %>%
str_pad(4, side="right", pad="a") %>%
str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
.[,2] %>%
# Corpus from the merged texts, with `_id` as document names; docvars are the
# last three columns of `out` (all columns before them are dropped by the
# negative index) -- presumably codes, junk and aggregate; verify.
# The dfm is lowercased, unstemmed, punctuation-free and built from unigrams.
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[,-seq(1,(length(names(out))-3),1)]) %>%
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)

#' Push a line-delimited JSON string to Elasticsearch as bulk update
#'
#' Sends `x` as the raw body of a POST request, retrying up to 10 times with a
#' minimum pause of 10 seconds between attempts.
#'
#' @param x Line-delimited JSON suitable for use as Elasticsearch bulk update
#' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database
#' @return An html response object indicating the status of the update
#' @export
#' @examples
#' elastic_update(x, es_super = 'secret')
#################################### Elasticsearch Updater ################################
elastic_update <- function(x, es_super = 'secret') {
  # A trailing newline is appended to the payload before it is sent
  bulk <- paste0(x, '\n')
  # NOTE(review): the host/path part of the URL is an empty string in this
  # file, so the request has no valid target as written -- TODO restore the
  # endpoint. The password is embedded in the URL; avoid logging `url`.
  url <- paste0('https://super:', es_super, '')
  res <- RETRY(
    "POST",
    url = url,
    body = bulk,
    encode = "raw",
    add_headers("Content-Type" = "application/json"),
    times = 10,
    pause_min = 10
  )
  # stop_for_status(res)
  # content(res, "parsed", "application/json")
  # appData <- content(res)
  res
}

#' Generate a data frame out of unparsed Elasticsearch JSON
#'
#' Runs the query with a scroll window of 5 minutes and a page size of 1000,
#' then keeps scrolling until a page without hits is returned, binding every
#' flattened page onto the result.
#'
#' @param query A JSON-formatted query in the Elasticsearch query DSL
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through
#' @param es_pwd Password used for the Elasticsearch connection; by default it is requested through `.rs.askForPassword()`
#' @return A data frame containing all the search results, or the string "No results found" when the query matches nothing
#' @export
#' @examples
#' elasticizer(query, src = TRUE, index = "maml")
#################################### Get data from ElasticSearch ################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")){
  # NOTE(review): es_host is an empty string in this file -- TODO restore the
  # host name before use.
  connect(es_port = 443,
          es_transport = 'https',
          es_host = '',
          es_path = 'es',
          es_user = 'es',
          es_pwd = es_pwd,
          errors = 'complete')

  # Progress message for the page that is currently being processed
  report_batch <- function(batch, total) {
    print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
  }

  # First page; `source = FALSE` is only passed when the source is not wanted
  if (src) {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = TRUE)
  } else {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = TRUE, source = FALSE)
  }
  json <- fromJSON(res)

  if (json$hits$total == 0) {
    return("No results found")
  }

  out <- jsonlite::flatten(json$hits$hits)
  total <- json$hits$total
  hits <- 1
  batch <- 1
  report_batch(batch, total)

  # Keep scrolling until a page comes back without hits
  while (hits != 0) {
    res <- scroll(json$`_scroll_id`, time_scroll = "5m", raw = TRUE)
    json <- fromJSON(res)
    hits <- length(json$hits$hits)
    if (hits > 0) {
      batch <- batch + 1
      report_batch(batch, total)
      out <- bind_rows(out, jsonlite::flatten(json$hits$hits))
    }
  }
  out
}

#' Merges list of lemmas back into a pseudo-document
#'
#' Takes one row of the data frame, splits its `_source.tokens.lemmas` field on
#' "|", joins the pieces with spaces, cleans the result and optionally
#' truncates it to a maximum number of words.
#'
#' @param row A row number from the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger(1, words = '999', out = out)
#################################### Reconstructing documents from lemmas########################
## Only merging lemmas for now, feature selection has no impact on junk classification
# NOTE(review): the default `out = out` refers to itself, so calling merger()
# without `out` fails with a recursive default argument error; always pass
# `out` explicitly. The default is kept so the signature stays unchanged.
merger <- function(row, words = '999', out = out) {
  df <- out[row, ]
  # Merging lemmas into single string
  lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]], collapse = ' ')
  # Replacing $-marked punctuation with their regular forms
  lemmas <- str_replace_all(lemmas, " \\$(.+?)", "\\1")
  # Removing numbers and non-words containing numbers
  lemmas <- str_replace_all(lemmas, "\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2")
  # Adding extra . at end of string to allow for strings that contain fewer
  # words than the requested maximum and do not end on ". "
  lemmas <- paste0(lemmas, ". ")
  # "999" means: keep the whole document; otherwise keep at most `words` words
  # and extend the cut to the next sentence-ending punctuation mark
  if (words != "999") {
    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,", words, "}[\\s\\S]*?[.!?])\\s+?"))
  }
  lemmas
}

