diff --git a/DESCRIPTION b/DESCRIPTION
index cc01d42..b7ea14f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,13 +3,13 @@ Title: General functions for the MaML project
 Version: 0.1
 Authors: Erik de Vries
 Description: Provide general functions in support of the MaML project, like data retrieval and parsing
-Depends: R (>= 3.4.4)
-License: Copyright Erik de Vries
-Encoding: UTF-8
-LazyData: true
-RoxygenNote: 6.1.0
-Depends: elastic,
+Depends: R (>= 3.4.4),
+    elastic,
     jsonlite,
     parallel,
     tidyverse,
     quanteda
+License: Copyright Erik de Vries
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.1.0
diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index 65b024c..2cc0e75 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -3,6 +3,7 @@
 #' Generates dfm from ElasticSearch output
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
+#' @param text String indicating whether the "merged" field will contain the "full" text, or "lemmas"
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@@ -15,12 +16,26 @@
 
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
 
-dfm_gen <- function(out,words = '999') {
+dfm_gen <- function(out,words = '999', text = c("lemmas","full")) {
+  # Resolve `text` to exactly one of the allowed choices ("lemmas" when omitted), so the scalar `if` tests below never see the length-2 default
+  text <- match.arg(text)
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
-  out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
+  if (text == "lemmas") {
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
+  }
+  if (text == "full") {
+    out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
+                        str_replace_na(out$`_source.subtitle`, replacement = " "),
+                        str_replace_na(out$`_source.preteaser`, replacement = " "),
+                        str_replace_na(out$`_source.teaser`, replacement = " "),
+                        str_replace_na(out$`_source.text`, replacement = " "),
+                        sep = " ") %>%
+      # Remove html tags
+      str_replace_all("<.*?>", " ")
+  }
   # out$codes <- out$`_source.codes.majorTopic` %>%
   out <- out %>%
     mutate(codes = case_when(
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
index 6f94afb..866c4b1 100644
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@@ -4,12 +4,14 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999")
+dfm_gen(out, words = "999", text = c("lemmas", "full"))
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
 
 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
+
+\item{text}{String indicating whether the "merged" field will contain the "full" text, or "lemmas"}
 }
 \value{
 A Quanteda dfm
diff --git a/man/elasticizer.Rd b/man/elasticizer.Rd
index b88f751..357b95f 100644
--- a/man/elasticizer.Rd
+++ b/man/elasticizer.Rd
@@ -4,7 +4,8 @@
 \alias{elasticizer}
 \title{Generate a data frame out of unparsed Elasticsearch JSON}
 \usage{
-elasticizer(query, src = T, index = "maml", es_pwd = "unkown")
+elasticizer(query, src = T, index = "maml",
+  es_pwd = .rs.askForPassword("Elasticsearch READ"))
 }
 \arguments{
 \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@@ -12,8 +13,6 @@ elasticizer(query, src = T, index = "maml", es_pwd = "unkown")
 \item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}
 
 \item{index}{The name of the Elasticsearch index to search through}
-
-\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
 }
 \value{
 A data frame containing all the search results
@@ -22,5 +21,5 @@ A data frame containing all the search results
 Generate a data frame out of unparsed Elasticsearch JSON
 }
 \examples{
-elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
+elasticizer(query, src = TRUE, index = "maml")
 }