Added option for fulltext vs lemmas merged field

master
Erik de Vries 6 years ago
parent 4cfb508a50
commit 0e45c0f2d1

@ -3,13 +3,13 @@ Title: General functions for the MaML project
Version: 0.1 Version: 0.1
Authors: Erik de Vries Authors: Erik de Vries
Description: Provide general functions in support of the MaML project, like data retrieval and parsing Description: Provide general functions in support of the MaML project, like data retrieval and parsing
Depends: R (>= 3.4.4) Depends: R (>= 3.4.4),
License: Copyright Erik de Vries elastic,
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
Depends: elastic,
jsonlite, jsonlite,
parallel, parallel,
tidyverse, tidyverse,
quanteda quanteda
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0

@ -3,6 +3,7 @@
#' Generates dfm from ElasticSearch output #' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame #' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text or the "lemmas"; accepted values are "lemmas" and "full"
#' @return A Quanteda dfm #' @return A Quanteda dfm
#' @export #' @export
#' @examples #' @examples
@ -15,12 +16,24 @@
# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out,words = '999') { dfm_gen <- function(out,words = '999', text = c("lemmas","full")) {
# Create subset with just ids, codes and text # Create subset with just ids, codes and text
out <- out %>% out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
fields <- length(names(out)) fields <- length(names(out))
if (text == "lemmas") {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores())) out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
}
if (text == "full") {
out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
str_replace_na(out$`_source.subtitle`, replacement = " "),
str_replace_na(out$`_source.preteaser`, replacement = " "),
str_replace_na(out$`_source.teaser`, replacement = " "),
str_replace_na(out$`_source.text`, replacement = " "),
sep = " ") %>%
# Remove html tags
str_replace_all("<.*?>", " ")
}
# out$codes <- out$`_source.codes.majorTopic` %>% # out$codes <- out$`_source.codes.majorTopic` %>%
out <- out %>% out <- out %>%
mutate(codes = case_when( mutate(codes = case_when(

@ -4,12 +4,14 @@
\alias{dfm_gen} \alias{dfm_gen}
\title{Generates dfm from ElasticSearch output} \title{Generates dfm from ElasticSearch output}
\usage{ \usage{
dfm_gen(out, words = "999") dfm_gen(out, words = "999", text = c("lemmas", "full"))
} }
\arguments{ \arguments{
\item{out}{The elasticizer-generated data frame} \item{out}{The elasticizer-generated data frame}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{text}{String indicating whether the "merged" field will contain the "full" text or the "lemmas"; accepted values are "lemmas" and "full"}
} }
\value{ \value{
A Quanteda dfm A Quanteda dfm

@ -4,7 +4,8 @@
\alias{elasticizer} \alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON} \title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{ \usage{
elasticizer(query, src = T, index = "maml", es_pwd = "unkown") elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"))
} }
\arguments{ \arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL} \item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -12,8 +13,6 @@ elasticizer(query, src = T, index = "maml", es_pwd = "unkown")
\item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved} \item{src}{Logical (true/false) indicating whether or not the source of each document should be retrieved}
\item{index}{The name of the Elasticsearch index to search through} \item{index}{The name of the Elasticsearch index to search through}
\item{es_pwd}{The (very secret, and thus not stored in any scripts!) password to use for read access to the database}
} }
\value{ \value{
A data frame containing all the search results A data frame containing all the search results
@ -22,5 +21,5 @@ A data frame containing all the search results
Generate a data frame out of unparsed Elasticsearch JSON Generate a data frame out of unparsed Elasticsearch JSON
} }
\examples{ \examples{
elasticizer(query, src = TRUE, index = "maml", es_pwd = "secret") elasticizer(query, src = TRUE, index = "maml")
} }

Loading…
Cancel
Save