class_update; dfm_gen; merger: updated functions to accept text parameter for both old style 'lemmas' and new style 'ud'

master
Erik de Vries 6 years ago
parent 85aab558e0
commit 9f3418ef37

@ -6,6 +6,7 @@
#' @param model_final The classification model (e.g. output from textstat_nb(), svm() or others) #' @param model_final The classification model (e.g. output from textstat_nb(), svm() or others)
#' @param dfm_words A dfm containing all the words and only the words used to generate the model (is used for subsetting) #' @param dfm_words A dfm containing all the words and only the words used to generate the model (is used for subsetting)
#' @param varname String containing the variable name to use for the classification result, usually has the format computerCodes.varname #' @param varname String containing the variable name to use for the classification result, usually has the format computerCodes.varname
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"
#' @param es_super Password for write access to ElasticSearch #' @param es_super Password for write access to ElasticSearch
#' @return As this is a nested function used within elasticizer, there is no return output #' @return As this is a nested function used within elasticizer, there is no return output
#' @export #' @export
@ -14,9 +15,9 @@
################################################################################################# #################################################################################################
#################################### Update any kind of classification ########################## #################################### Update any kind of classification ##########################
################################################################################################# #################################################################################################
class_update <- function(out, localhost = T, model_final, dfm_words, varname, es_super = .rs.askForPassword('ElasticSearch WRITE')) { class_update <- function(out, localhost = T, model_final, dfm_words, varname, text, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
print('updating') print('updating')
dfm <- dfm_gen(out, text = 'lemmas') %>% dfm <- dfm_gen(out, text = text) %>%
dfm_keep(dfm_words, valuetype="fixed", verbose=T) dfm_keep(dfm_words, valuetype="fixed", verbose=T)
pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm)) pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm))
bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set') bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set')

@ -3,7 +3,7 @@
#' Generates dfm from ElasticSearch output #' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame #' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, or "lemmas" #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"
#' @return A Quanteda dfm #' @return A Quanteda dfm
#' @export #' @export
#' @examples #' @examples
@ -21,8 +21,8 @@ dfm_gen <- function(out, words = '999', text = "lemmas") {
out <- out %>% out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
fields <- length(names(out)) fields <- length(names(out))
if (text == "lemmas") { if (text == "lemmas" || text == 'ud') {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, mc.cores = detectCores())) out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
} }
if (text == "full") { if (text == "full") {
out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "), out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),

@ -4,18 +4,24 @@
#' @param row A row number from the Elasticizer-generated data frame #' @param row A row number from the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame #' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"
#' @return A documentified string of lemmas, one document at a time #' @return A documentified string of lemmas, one document at a time
#' @export #' @export
#' @examples #' @examples
#' merger(1, words = '999', out = out) #' merger(1, words = '999', out, text)
################################################################################################# #################################################################################################
#################################### Reconstructing documents from lemmas######################## #################################### Reconstructing documents from lemmas########################
################################################################################################# #################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification ## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(row, out = out) { merger <- function(row, out, text) {
df <- out[row,] df <- out[row,]
# Merging lemmas into single string # Merging lemmas into single string
lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ') if (text == 'lemmas') {
lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
}
if (text == 'ud') {
lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
}
# Replacing $-marked punctuation with their regular forms # Replacing $-marked punctuation with their regular forms
lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>% lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
### Removing numbers and non-words containing numbers ### Removing numbers and non-words containing numbers

@ -4,7 +4,7 @@
\alias{class_update} \alias{class_update}
\title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information} \title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information}
\usage{ \usage{
class_update(out, localhost = T, model_final, dfm_words, varname, class_update(out, localhost = T, model_final, dfm_words, varname, text,
es_super = .rs.askForPassword("ElasticSearch WRITE")) es_super = .rs.askForPassword("ElasticSearch WRITE"))
} }
\arguments{ \arguments{
@ -18,6 +18,8 @@ class_update(out, localhost = T, model_final, dfm_words, varname,
\item{varname}{String containing the variable name to use for the classification result, usually has the format computerCodes.varname} \item{varname}{String containing the variable name to use for the classification result, usually has the format computerCodes.varname}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"}
\item{es_super}{Password for write access to ElasticSearch} \item{es_super}{Password for write access to ElasticSearch}
} }
\value{ \value{

@ -11,7 +11,7 @@ dfm_gen(out, words = "999", text = "lemmas")
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{text}{String indicating whether the "merged" field will contain the "full" text, or "lemmas"} \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"}
} }
\value{ \value{
A Quanteda dfm A Quanteda dfm

@ -4,13 +4,15 @@
\alias{merger} \alias{merger}
\title{Merges list of lemmas back into a pseudo-document} \title{Merges list of lemmas back into a pseudo-document}
\usage{ \usage{
merger(row, out = out) merger(row, out, text)
} }
\arguments{ \arguments{
\item{row}{A row number from the Elasticizer-generated data frame} \item{row}{A row number from the Elasticizer-generated data frame}
\item{out}{The elasticizer-generated data frame} \item{out}{The elasticizer-generated data frame}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
} }
\value{ \value{
@ -20,5 +22,5 @@ A documentified string of lemmas, one document at a time
Merges list of lemmas back into a pseudo-document Merges list of lemmas back into a pseudo-document
} }
\examples{ \examples{
merger(1, words = '999', out = out) merger(1, words = '999', out, text)
} }

Loading…
Cancel
Save