commit 4bbe84ab83
.Rbuildignore
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
.gitignore
@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData
DESCRIPTION
@@ -0,0 +1,15 @@
Package: maml
Title: General functions for the MaML project
Version: 0.1
Authors@R: person("Erik", "de Vries")
Description: Provides general functions in support of the MaML project, such as data retrieval and parsing
Depends: R (>= 3.4.4)
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
Imports: elastic,
    jsonlite,
    parallel,
    tidyverse,
    quanteda
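With this DESCRIPTION in place, the usual development loop is driven by devtools, as the .Rproj settings below also assume. A minimal sketch (assumes devtools is installed and the working directory is the package root):

    # Regenerate NAMESPACE and man/*.Rd from the roxygen2 comments, then install
    devtools::document()
    devtools::install()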
maml.Rproj
@@ -0,0 +1,21 @@
Version: 1.0

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
NAMESPACE
@@ -0,0 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(bulk_writer)
export(dfm_gen)
export(elastic_update)
export(elasticizer)
export(merger)
R/bulk_writer.R
@@ -0,0 +1,19 @@
#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#'
#' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
#' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
#' @param index The name of the Elasticsearch index to update
#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created)
#' @return A string usable as an Elasticsearch bulk update command, in line-delimited JSON
#' @export
#' @examples
#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
#################################################################################################
#################################### Bulk update writer ########################################
#################################################################################################
bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
  return(
    paste0('{"update": {"_index": "', index, '", "_type": "doc", "_id": "', x[1], '"}}
{ "script" : { "source": "ctx._source.', varname, ' = params.code", "lang" : "painless", "params" : {"code":', toJSON(x[-1], collapse = F), '}}}')
  )
}
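As an input illustration, a hypothetical one-row data frame whose first column holds the document _id could be turned into a bulk command as follows (a sketch; df_row and the junk variable are made-up names, and jsonlite must be loaded for toJSON):

    library(jsonlite)
    # Hypothetical single-row input: first column is the document id,
    # remaining columns hold the values stored under `varname`
    df_row <- data.frame(`_id` = "doc-123", junk = 1, check.names = FALSE)
    # Prints two NDJSON lines: the update action and the painless script body
    cat(bulk_writer(df_row, index = 'maml', varname = 'junk'))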
R/dfm_gen.R
@@ -0,0 +1,48 @@
#' Generates a dfm from Elasticsearch output
#'
#' Generates a dfm from Elasticsearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @return A Quanteda dfm
#' @export
#' @examples
#' dfm_gen(out, words = '999')


#################################################################################################
#################################### DFM generator #############################################
#################################################################################################

# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

dfm_gen <- function(out, words = '999') {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  fields <- length(names(out))
  out$merged <- unlist(mclapply(seq(1, length(out[[1]]), 1), merger, words = words, out = out, mc.cores = detectCores()))
  # out$codes <- out$`_source.codes.majorTopic` %>%
  out <- out %>%
    mutate(codes = case_when(
      .$`_source.codes.timeSpent` == -1 ~ NA_character_,
      TRUE ~ .$`_source.codes.majorTopic`
      )
    ) %>%
    mutate(junk = case_when(
      .$codes == 2301 ~ 1,
      .$codes == 3101 ~ 1,
      .$codes == 34 ~ 1,
      .$`_source.codes.timeSpent` == -1 ~ NA_real_,
      TRUE ~ 0
      )
    ) %>%
    mutate(aggregate = .$codes %>%
      str_pad(4, side = "right", pad = "a") %>%
      str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
      .[,2] %>%
      as.numeric()
    )
  dfm <- corpus(out$merged, docnames = out$`_id`, docvars = out[, -seq(1, (length(names(out)) - 3), 1)]) %>%
    dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
  return(dfm)
}
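A hedged usage sketch of the intended call chain (assumes `out` was produced by elasticizer() and contains the `_source.codes.*` and `_source.tokens.lemmas` fields the function expects):

    # Cap each document at its first 150 words, then inspect the top features
    dfm_150 <- dfm_gen(out, words = '150')
    quanteda::topfeatures(dfm_150, n = 10)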
R/elastic_update.R
@@ -0,0 +1,28 @@
#' Push a line-delimited JSON string to Elasticsearch as a bulk update
#'
#' Push a line-delimited JSON string to Elasticsearch as a bulk update
#' @param x Line-delimited JSON suitable for use as an Elasticsearch bulk update
#' @param es_super The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database
#' @return An httr response object indicating the status of the update
#' @export
#' @examples
#' elastic_update(x, es_super = 'secret')

#################################################################################################
#################################### Elasticsearch Updater #####################################
#################################################################################################
elastic_update <- function(x, es_super = 'secret') {
  bulk <- paste0(x, '\n')
  url <- paste0('https://super:', es_super, '@linux01.uis.no/es/_bulk?pretty')
  res <- RETRY("POST", url = url
               , body = bulk
               , encode = "raw"
               , add_headers("Content-Type" = "application/json")
               , times = 10
               , pause_min = 10
  )
  # stop_for_status(res)
  # content(res, "parsed", "application/json")
  # appData <- content(res)
  return(res)
}
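The function is meant to consume bulk_writer() output; a sketch of the round trip (annotations is a hypothetical data frame with the document _id in its first column):

    # Build one NDJSON command pair per row and join them with newlines,
    # then push everything to the cluster in a single bulk request
    bulk <- paste0(apply(annotations, 1, bulk_writer, index = 'maml', varname = 'junk'), collapse = '\n')
    res <- elastic_update(bulk, es_super = .rs.askForPassword("Elasticsearch WRITE"))
    httr::status_code(res)  # 200 signals the request itself succeeded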
R/elasticizer.R
@@ -0,0 +1,51 @@
#' Generate a data frame out of unparsed Elasticsearch JSON
#'
#' Generate a data frame out of unparsed Elasticsearch JSON
#' @param query A JSON-formatted query in the Elasticsearch query DSL
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through
#' @param es_pwd The (very secret, and thus not stored in any scripts!) password to use for read access to the database
#' @return A data frame containing all the search results
#' @export
#' @examples
#' elasticizer(query, src = TRUE, index = "maml")
#################################################################################################
#################################### Get data from Elasticsearch ###############################
#################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ")) {
  connect(es_port = 443,
          es_transport = 'https',
          es_host = 'linux01.uis.no',
          es_path = 'es',
          es_user = 'es',
          es_pwd = es_pwd,
          errors = 'complete')
  # Get all results - one approach is to use a while loop
  if (src == T) {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = T)
  }
  if (src == F) {
    res <- Search(index = index, time_scroll = "5m", body = query, size = 1000, raw = T, source = F)
  }
  json <- fromJSON(res)
  if (json$hits$total == 0) {
    return("No results found")
  } else {
    out <- jsonlite:::flatten(json$hits$hits)
    total <- json$hits$total
    hits <- 1
    batch <- 1
    print(paste0('Processing documents ', batch*1000-1000, ' through ', batch*1000, ' out of ', total, ' documents.'))
    while (hits != 0) {
      res <- scroll(json$`_scroll_id`, time_scroll = "5m", raw = T)
      json <- fromJSON(res)
      hits <- length(json$hits$hits)
      if (hits > 0) {
        batch <- batch + 1
        print(paste0('Processing documents ', batch*1000-1000, ' through ', batch*1000, ' out of ', total, ' documents.'))
        out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
      }
    }
    return(out)
  }
}
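A hedged usage sketch with a minimal match_all query in the Elasticsearch query DSL (the query string is illustrative; any valid DSL body should work):

    # Fetch every document in the index, including the _source fields,
    # scrolling through the results in batches of 1000
    query <- '{"query": {"match_all": {}}}'
    out <- elasticizer(query, src = TRUE, index = 'maml')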
R/merger.R
@@ -0,0 +1,28 @@
#' Merges a list of lemmas back into a pseudo-document
#'
#' Merges a list of lemmas back into a pseudo-document
#' @param row A row number from the elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @return A string of lemmas merged back into a single pseudo-document (one document per call)
#' @export
#' @examples
#' merger(1, words = '999', out = out)
#################################################################################################
#################################### Reconstructing documents from lemmas ######################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(row, words = '999', out = out) {
  df <- out[row, ]
  # Merging lemmas into a single string
  lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]], collapse = ' ')
  # Replacing $-marked punctuation with their regular forms
  lemmas <- str_replace_all(lemmas, " \\$(.+?)", "\\1") %>%
    ### Removing numbers and non-words containing numbers
    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
    # Adding an extra . at the end of the string to allow for strings that contain fewer words than requested and do not end on ". "
    paste0(., ". ")
  if (words != "999") {
    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,", words, "}[\\s\\S]*?[.!?])\\s+?"))
  }
  return(lemmas)
}
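For intuition, a self-contained sketch with a mock one-row data frame; the pipe-delimited string mimics the `_source.tokens.lemmas` field, and tidyverse must be loaded for the pipe and stringr calls:

    library(tidyverse)
    # Mock elasticizer output: lemmas are stored pipe-delimited per document,
    # with punctuation marked by a $ prefix
    out <- data.frame(`_source.tokens.lemmas` = "this|be|a|short|test|$.",
                      check.names = FALSE, stringsAsFactors = FALSE)
    merger(1, words = '999', out = out)  # returns the reconstructed pseudo-document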
man/bulk_writer.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/bulk_writer.R
\name{bulk_writer}
\alias{bulk_writer}
\title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates}
\usage{
bulk_writer(x, index = "maml", varname = "updated_variable")
}
\arguments{
\item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)}

\item{index}{The name of the Elasticsearch index to update}

\item{varname}{String indicating the parent variable that should be updated (when it does not exist, it will be created)}
}
\value{
A string usable as an Elasticsearch bulk update command, in line-delimited JSON
}
\description{
Generate a line-delimited JSON string for use in Elasticsearch bulk updates
}
\examples{
bulk_writer(x, index = 'maml', varname = 'updated_variable')
}
man/dfm_gen.Rd
@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_gen.R
\name{dfm_gen}
\alias{dfm_gen}
\title{Generates a dfm from Elasticsearch output}
\usage{
dfm_gen(out, words = "999")
}
\arguments{
\item{out}{The elasticizer-generated data frame}

\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
}
\value{
A Quanteda dfm
}
\description{
Generates a dfm from Elasticsearch output
}
\examples{
dfm_gen(out, words = '999')
}
man/elastic_update.Rd
@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/elastic_update.R
\name{elastic_update}
\alias{elastic_update}
\title{Push a line-delimited JSON string to Elasticsearch as a bulk update}
\usage{
elastic_update(x, es_super = "secret")
}
\arguments{
\item{x}{Line-delimited JSON suitable for use as an Elasticsearch bulk update}

\item{es_super}{The even-more-secret (do not store this anywhere!!!) password for updating (or messing up!) the entire database}
}
\value{
An httr response object indicating the status of the update
}
\description{
Push a line-delimited JSON string to Elasticsearch as a bulk update
}
\examples{
elastic_update(x, es_super = 'secret')
}
man/elasticizer.Rd
@@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/elasticizer.R
\name{elasticizer}
\alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
elasticizer(query, src = T, index = "maml", es_pwd = "unknown")
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}

\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}

\item{index}{The name of the Elasticsearch index to search through}

\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
}
\value{
A data frame containing all the search results
}
\description{
Generate a data frame out of unparsed Elasticsearch JSON
}
\examples{
elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
}
man/merger.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/merger.R
\name{merger}
\alias{merger}
\title{Merges a list of lemmas back into a pseudo-document}
\usage{
merger(row, words = "999", out = out)
}
\arguments{
\item{row}{A row number from the elasticizer-generated data frame}

\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}

\item{out}{The elasticizer-generated data frame}
}
\value{
A string of lemmas merged back into a single pseudo-document (one document per call)
}
\description{
Merges a list of lemmas back into a pseudo-document
}
\examples{
merger(1, words = '999', out = out)
}