Added option for fulltext vs lemmas merged field

master
Erik de Vries 6 years ago
parent 4cfb508a50
commit 0e45c0f2d1

@ -3,13 +3,13 @@ Title: General functions for the MaML project
Version: 0.1
Authors: Erik de Vries
Description: Provide general functions in support of the MaML project, like data retrieval and parsing
Depends: R (>= 3.4.4)
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
Depends: elastic,
Depends: R (>= 3.4.4),
elastic,
jsonlite,
parallel,
tidyverse,
quanteda
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0

@ -3,6 +3,7 @@
#' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length); 999 indicates the whole document
#' @param text String selecting the content of the "merged" field: either "lemmas" or "full" (the full text)
#' @return A Quanteda dfm
#' @export
#' @examples
@ -15,12 +16,24 @@
# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out,words = '999') {
dfm_gen <- function(out,words = '999', text = c("lemmas","full")) {
# Create subset with just ids, codes and text
out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
fields <- length(names(out))
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
if (text == "lemmas") {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
}
if (text == "full") {
out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
str_replace_na(out$`_source.subtitle`, replacement = " "),
str_replace_na(out$`_source.preteaser`, replacement = " "),
str_replace_na(out$`_source.teaser`, replacement = " "),
str_replace_na(out$`_source.text`, replacement = " "),
sep = " ") %>%
# Remove html tags
str_replace_all("<.*?>", " ")
}
# out$codes <- out$`_source.codes.majorTopic` %>%
out <- out %>%
mutate(codes = case_when(

@ -4,12 +4,14 @@
\alias{dfm_gen}
\title{Generates dfm from ElasticSearch output}
\usage{
dfm_gen(out, words = "999")
dfm_gen(out, words = "999", text = c("lemmas", "full"))
}
\arguments{
\item{out}{The elasticizer-generated data frame}
\item{words}{String indicating the number of words to keep from each document (maximum document length); 999 indicates the whole document}
\item{text}{String selecting the content of the "merged" field: either "lemmas" or "full" (the full text)}
}
\value{
A Quanteda dfm

@ -4,7 +4,8 @@
\alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
elasticizer(query, src = T, index = "maml", es_pwd = "unkown")
elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"))
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -12,8 +13,6 @@ elasticizer(query, src = T, index = "maml", es_pwd = "unkown")
\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}
\item{index}{The name of the Elasticsearch index to search through}
\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
}
\value{
A data frame containing all the search results
@ -22,5 +21,5 @@ A data frame containing all the search results
Generate a data frame out of unparsed Elasticsearch JSON
}
\examples{
elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret")
elasticizer(query, src = TRUE, index = "maml")
}

Loading…
Cancel
Save