From 85306007f4f97296f5d48c3edaadb163109c48e1 Mon Sep 17 00:00:00 2001 From: Erik de Vries Date: Wed, 16 Jan 2019 19:34:37 +0100 Subject: [PATCH] class_update: added words and clean parameters, in addition to text parameter, to be able to set data preprocessing exactly the same as in the trained model --- R/class_update.R | 8 +++++--- man/class_update.Rd | 8 ++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/R/class_update.R b/R/class_update.R index 26c5039..d02204f 100644 --- a/R/class_update.R +++ b/R/class_update.R @@ -6,7 +6,9 @@ #' @param model_final The classification model (e.g. output from textstat_nb(), svm() or others) #' @param dfm_words A dfm containing all the words and only the words used to generate the model (is used for subsetting) #' @param varname String containing the variable name to use for the classification result, usually has the format computerCodes.varname -#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" +#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document +#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags +#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). #' @param es_super Password for write access to ElasticSearch #' @return As this is a nested function used within elasticizer, there is no return output #' @export @@ -15,9 +17,9 @@ ################################################################################################# #################################### Update any kind of classification ########################## ################################################################################################# -class_update <- function(out, localhost = T, model_final, dfm_words, varname, text, es_super = .rs.askForPassword('ElasticSearch WRITE')) { +class_update <- function(out, localhost = T, model_final, dfm_words, varname, text, words, clean, es_super = .rs.askForPassword('ElasticSearch WRITE')) { print('updating') - dfm <- dfm_gen(out, text = text) %>% + dfm <- dfm_gen(out, text = text, words = words, clean = clean) %>% dfm_keep(dfm_words, valuetype="fixed", verbose=T) pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm)) bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set') diff --git a/man/class_update.Rd b/man/class_update.Rd index ba6224d..d91a80e 100644 --- a/man/class_update.Rd +++ b/man/class_update.Rd @@ -5,7 +5,7 @@ \title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information} \usage{ class_update(out, localhost = T, model_final, dfm_words, varname, text, - es_super = .rs.askForPassword("ElasticSearch WRITE")) + words, clean, es_super = .rs.askForPassword("ElasticSearch WRITE")) } \arguments{ \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} @@ -18,7 +18,11 @@ class_update(out, localhost = T, model_final, dfm_words, varname, text, \item{varname}{String containing the variable name to use for the classification result, usually has the format computerCodes.varname} -\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} +\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags} + +\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} + +\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).} \item{es_super}{Password for write access to ElasticSearch} }