commit 4bbe84ab83
.Rbuildignore
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
.gitignore
@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData
DESCRIPTION
@@ -0,0 +1,16 @@
Package: maml
Title: General Functions for the MaML Project
Version: 0.1
Authors@R: person("Erik", "de Vries", role = c("aut", "cre"))
Description: Provides general functions in support of the MaML project, such as data retrieval and parsing.
Depends: R (>= 3.4.4)
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
Imports: elastic,
    httr,
    jsonlite,
    parallel,
    tidyverse,
    quanteda
maml.Rproj
@@ -0,0 +1,21 @@
Version: 1.0

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
NAMESPACE
@@ -0,0 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(bulk_writer)
export(dfm_gen)
export(elastic_update)
export(elasticizer)
export(merger)
R/bulk_writer.R
@@ -0,0 +1,19 @@
#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#'
#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
#' @param index The name of the Elasticsearch index to update
#' @param varname String indicating the parent variable that should be updated (if it does not exist, it will be created)
#' @return A string usable as an Elasticsearch bulk update command, in line-delimited JSON
#' @export
#' @examples
#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
#################################################################################################
#################################### Bulk update writer #########################################
#################################################################################################
bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
  return(
    paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
{ "script" : { "source": "ctx._source.',varname,' = params.code", "lang" : "painless","params" : {"code":',toJSON(x[-1], collapse = F),'}}}')
  )
}
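A minimal usage sketch for bulk_writer(); the input frame and its values below are hypothetical, and jsonlite supplies the toJSON() that the function calls internally.

library(maml)
library(jsonlite)

# Hypothetical single-row input: the first element is the document _id,
# the remaining columns hold the values to be written under 'varname'
doc <- data.frame(`_id` = "abc123", junk = 1, check.names = FALSE)
bulk_writer(doc, index = "maml", varname = "codes")
# Returns two NDJSON lines: an update action for _id "abc123" and a painless
# script that sets ctx._source.codes from params.code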
R/dfm_gen.R
@@ -0,0 +1,48 @@
#' Generates a dfm from Elasticsearch output
#'
#' Generates a dfm from Elasticsearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length); 999 indicates the whole document
#' @return A quanteda dfm
#' @export
#' @examples
#' dfm_gen(out, words = '999')


#################################################################################################
#################################### DFM generator ##############################################
#################################################################################################

# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

dfm_gen <- function(out, words = '999') {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  fields <- length(names(out))
  out$merged <- unlist(mclapply(seq(1, length(out[[1]]), 1), merger, words = words, out = out, mc.cores = detectCores()))
  # out$codes <- out$`_source.codes.majorTopic` %>%
  out <- out %>%
    mutate(codes = case_when(
      .$`_source.codes.timeSpent` == -1 ~ NA_character_,
      TRUE ~ .$`_source.codes.majorTopic`
      )
    ) %>%
    mutate(junk = case_when(
      .$codes == 2301 ~ 1,
      .$codes == 3101 ~ 1,
      .$codes == 34 ~ 1,
      .$`_source.codes.timeSpent` == -1 ~ NA_real_,
      TRUE ~ 0
      )
    ) %>%
    mutate(aggregate = .$codes %>%
      str_pad(4, side = "right", pad = "a") %>%  # pad codes to four characters with 'a'
      str_match("([0-9]{1,2})?[0|a][1-9|a]") %>% # extract the leading major-topic digits
      .[,2] %>%
      as.numeric()
    )
  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[, -seq(1, (length(names(out)) - 3), 1)]) %>%
    dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
  return(dfm)
}
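A usage sketch, assuming 'out' was produced by elasticizer() and carries the _source.tokens.lemmas and _source.codes.* fields that dfm_gen() expects; the query string is only a placeholder.

library(maml)
library(tidyverse)
library(quanteda)
library(parallel)

out <- elasticizer('{"query": {"match_all": {}}}')  # placeholder query
dfm <- dfm_gen(out, words = '150')  # cap each document at roughly 150 words
topfeatures(dfm, 10)                # quick sanity check on the vocabulary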
R/elastic_update.R
@@ -0,0 +1,28 @@
#' Push a line-delimited JSON string to Elasticsearch as a bulk update
#'
#' Push a line-delimited JSON string to Elasticsearch as a bulk update
#' @param x Line-delimited JSON suitable for use as an Elasticsearch bulk update
#' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database
#' @return An httr response object indicating the status of the update
#' @export
#' @examples
#' elastic_update(x, es_super = 'secret')

#################################################################################################
#################################### Elasticsearch updater ######################################
#################################################################################################
elastic_update <- function(x, es_super = 'secret') {
  bulk <- paste0(x, '\n')
  url <- paste0('https://super:', es_super, '@linux01.uis.no/es/_bulk?pretty')
  res <- RETRY("POST", url = url
               , body = bulk
               , encode = "raw"
               , add_headers("Content-Type" = "application/json")
               , times = 10
               , pause_min = 10
  )
  # stop_for_status(res)
  # content(res, "parsed", "application/json")
  # appData <- content(res)
  return(res)
}
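A sketch of the intended round trip, pairing bulk_writer() with elastic_update(); 'out' and the 'junk' variable are hypothetical stand-ins for real elasticizer() output and a real field name.

library(maml)
library(httr)  # elastic_update() relies on httr::RETRY()

# One NDJSON bulk command per row, joined into a single request body
bulk <- paste0(apply(out, 1, bulk_writer, index = 'maml', varname = 'junk'), collapse = '\n')
res <- elastic_update(bulk, es_super = .rs.askForPassword("Elasticsearch WRITE"))
status_code(res)  # 200 means the bulk request was accepted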
R/elasticizer.R
@@ -0,0 +1,51 @@
#' Generate a data frame out of unparsed Elasticsearch JSON
#'
#' Generate a data frame out of unparsed Elasticsearch JSON
#' @param query A JSON-formatted query in the Elasticsearch query DSL
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through
#' @param es_pwd The (very secret, and thus not stored in any scripts!) password to use for read access to the database
#' @return A data frame containing all the search results
#' @export
#' @examples
#' elasticizer(query, src = TRUE, index = "maml")
#################################################################################################
#################################### Get data from Elasticsearch ################################
#################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")) {
  connect(es_port = 443,
          es_transport = 'https',
          es_host = 'linux01.uis.no',
          es_path = 'es',
          es_user = 'es',
          es_pwd = es_pwd,
          errors = 'complete')
  # Get all results - one approach is to use a while loop
  if (src == T) {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = T)
  }
  if (src == F) {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = T, source = F)
  }
  json <- fromJSON(res)
  if (json$hits$total == 0) {
    return("No results found")
  } else {
    out <- jsonlite::flatten(json$hits$hits)
    total <- json$hits$total
    hits <- 1
    batch <- 1
    print(paste0('Processing documents ', batch*1000-1000, ' through ', batch*1000, ' out of ', total, ' documents.'))
    while (hits != 0) {
      res <- scroll(json$`_scroll_id`, time_scroll = "5m", raw = T)
      json <- fromJSON(res)
      hits <- length(json$hits$hits)
      if (hits > 0) {
        batch <- batch + 1
        print(paste0('Processing documents ', batch*1000-1000, ' through ', batch*1000, ' out of ', total, ' documents.'))
        out <- bind_rows(out, jsonlite::flatten(json$hits$hits))
      }
    }
    return(out)
  }
}
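A usage sketch; the match_all query is only a placeholder for a real query in the Elasticsearch query DSL.

library(maml)
library(elastic)
library(jsonlite)
library(tidyverse)

query <- '{"query": {"match_all": {}}}'  # placeholder: retrieves every document
out <- elasticizer(query, src = TRUE, index = 'maml')
nrow(out)  # total number of documents gathered across all scroll batches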
R/merger.R
@@ -0,0 +1,29 @@
#' Merges a list of lemmas back into a pseudo-document
#'
#' Merges a list of lemmas back into a pseudo-document
#' @param row A row number from the elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length); 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger(1, words = '999', out = out)
#################################################################################################
#################################### Reconstructing documents from lemmas #######################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(row, words = '999', out = out) {
  df <- out[row,]
  # Merging lemmas into a single string
  lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]], collapse = ' ')
  # Replacing $-marked punctuation with its regular form
  lemmas <- str_replace_all(lemmas, " \\$(.+?)", "\\1") %>%
    ### Removing numbers and non-words containing numbers
    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
    # Adding an extra . at the end of the string to allow for strings that contain fewer than 150 words and do not end on ". "
    paste0(., ". ")
  if (words != "999") {
    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,", words, "}[\\s\\S]*?[.!?])\\s+?"))
  }
  return(lemmas)
}
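A sketch of merger() on a hypothetical one-row frame; the pipe-separated string mimics how lemmas are stored in _source.tokens.lemmas.

library(maml)
library(tidyverse)

out <- tibble(`_source.tokens.lemmas` = "this|be|a|hypothetical|document|$.")
merger(1, words = '999', out = out)
# Joins the lemmas into one string, restores the "$."-marked punctuation,
# and appends a closing ". "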
man/bulk_writer.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/bulk_writer.R
\name{bulk_writer}
\alias{bulk_writer}
\title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates}
\usage{
bulk_writer(x, index = "maml", varname = "updated_variable")
}
\arguments{
\item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)}

\item{index}{The name of the Elasticsearch index to update}

\item{varname}{String indicating the parent variable that should be updated (if it does not exist, it will be created)}
}
\value{
A string usable as an Elasticsearch bulk update command, in line-delimited JSON
}
\description{
Generate a line-delimited JSON string for use in Elasticsearch bulk updates
}
\examples{
bulk_writer(x, index = 'maml', varname = 'updated_variable')
}
man/dfm_gen.Rd
@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_gen.R
\name{dfm_gen}
\alias{dfm_gen}
\title{Generates a dfm from Elasticsearch output}
\usage{
dfm_gen(out, words = "999")
}
\arguments{
\item{out}{The elasticizer-generated data frame}

\item{words}{String indicating the number of words to keep from each document (maximum document length); 999 indicates the whole document}
}
\value{
A quanteda dfm
}
\description{
Generates a dfm from Elasticsearch output
}
\examples{
dfm_gen(out, words = '999')
}
man/elastic_update.Rd
@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/elastic_update.R
\name{elastic_update}
\alias{elastic_update}
\title{Push a line-delimited JSON string to Elasticsearch as a bulk update}
\usage{
elastic_update(x, es_super = "secret")
}
\arguments{
\item{x}{Line-delimited JSON suitable for use as an Elasticsearch bulk update}

\item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database}
}
\value{
An httr response object indicating the status of the update
}
\description{
Push a line-delimited JSON string to Elasticsearch as a bulk update
}
\examples{
elastic_update(x, es_super = 'secret')
}
man/elasticizer.Rd
@@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/elasticizer.R
\name{elasticizer}
\alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
elasticizer(query, src = T, index = "maml", es_pwd = .rs.askForPassword("Elasticsearch READ"))
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}

\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}

\item{index}{The name of the Elasticsearch index to search through}

\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
}
\value{
A data frame containing all the search results
}
\description{
Generate a data frame out of unparsed Elasticsearch JSON
}
\examples{
elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
}
man/merger.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/merger.R
\name{merger}
\alias{merger}
\title{Merges a list of lemmas back into a pseudo-document}
\usage{
merger(row, words = "999", out = out)
}
\arguments{
\item{row}{A row number from the elasticizer-generated data frame}

\item{words}{String indicating the number of words to keep from each document (maximum document length); 999 indicates the whole document}

\item{out}{The elasticizer-generated data frame}
}
\value{
A documentified string of lemmas, one document at a time
}
\description{
Merges a list of lemmas back into a pseudo-document
}
\examples{
merger(1, words = '999', out = out)
}