Added option for fulltext vs lemmas merged field

7 years ago · 0e45c0f2d1
parent 4cfb508a50
commit 0e45c0f2d1
4 changed files with 27 additions and 13 deletions
--- a/12
+++ b/12
@ -3,13 +3,13 @@ Title: General functions for the MaML project
 Version: 0.1
 Authors: Erik de Vries
 Description: Provide general functions in support of the MaML project, like data retrieval and parsing
-Depends: R (>= 3.4.4)
-License: Copyright Erik de Vries
-Encoding: UTF-8
-LazyData: true
-RoxygenNote: 6.1.0
-Depends: elastic,
+Depends: R (>= 3.4.4),
+    elastic,
    jsonlite,
    parallel,
    tidyverse,
    quanteda
+License: Copyright Erik de Vries
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.1.0
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@ -3,6 +3,7 @@
 #' Generates dfm from ElasticSearch output
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
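+#' @param text Single string, either "lemmas" or "full": "lemmas" fills the "merged" field with the lemmatized text, "full" fills it with the full text (title, subtitle, preteaser, teaser and text joined by spaces, with HTML tags removed)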
 #' @return A Quanteda dfm 
 #' @export
 #' @examples
@ -15,12 +16,24 @@

 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

-dfm_gen <- function(out,words = '999') {
+dfm_gen <- function(out,words = '999', text = c("lemmas","full")) {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  fields <- length(names(out))
-  out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
+  if (text == "lemmas") {
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
+  }
+  if (text == "full") {
+    out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
+                        str_replace_na(out$`_source.subtitle`, replacement = " "),
+                        str_replace_na(out$`_source.preteaser`, replacement = " "),
+                        str_replace_na(out$`_source.teaser`, replacement = " "),
+                        str_replace_na(out$`_source.text`, replacement = " "),
+                        sep = " ") %>%
+      # Remove html tags
+      str_replace_all("<.*?>", " ")
+  }
  # out$codes <- out$`_source.codes.majorTopic` %>%
  out <- out %>%
    mutate(codes = case_when(
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@ -4,12 +4,14 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999")
+dfm_gen(out, words = "999", text = c("lemmas", "full"))
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}

 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
+
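+\item{text}{Single string, either "lemmas" or "full": "lemmas" fills the "merged" field with the lemmatized text, "full" fills it with the full text (title, subtitle, preteaser, teaser and text joined by spaces, with HTML tags removed)}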
 }
 \value{
 A Quanteda dfm
--- a/man/elasticizer.Rd
+++ b/man/elasticizer.Rd
@ -4,7 +4,8 @@
 \alias{elasticizer}
 \title{Generate a data frame out of unparsed Elasticsearch JSON}
 \usage{
-elasticizer(query, src = T, index = "maml", es_pwd = "unkown")
+elasticizer(query, src = T, index = "maml",
+  es_pwd = .rs.askForPassword("Elasticsearch READ"))
 }
 \arguments{
 \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -12,8 +13,6 @@ elasticizer(query, src = T, index = "maml", es_pwd = "unkown")
 \item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}

 \item{index}{The name of the Elasticsearch index to search through}
-
-\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
 }
 \value{
 A data frame containing all the search results
@ -22,5 +21,5 @@ A data frame containing all the search results
 Generate a data frame out of unparsed Elasticsearch JSON
 }
 \examples{
-elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
+elasticizer(query, src = TRUE, index = "maml")
 }