diff --git a/R/class_update.R b/R/class_update.R index 47843b1..26c5039 100644 --- a/R/class_update.R +++ b/R/class_update.R @@ -6,6 +6,7 @@ #' @param model_final The classification model (e.g. output from textstat_nb(), svm() or others) #' @param dfm_words A dfm containing all the words and only the words used to generate the model (is used for subsetting) #' @param varname String containing the variable name to use for the classification result, usually has the format computerCodes.varname +#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" #' @param es_super Password for write access to ElasticSearch #' @return As this is a nested function used within elasticizer, there is no return output #' @export @@ -14,9 +15,9 @@ ################################################################################################# #################################### Update any kind of classification ########################## ################################################################################################# -class_update <- function(out, localhost = T, model_final, dfm_words, varname, es_super = .rs.askForPassword('ElasticSearch WRITE')) { +class_update <- function(out, localhost = T, model_final, dfm_words, varname, text, es_super = .rs.askForPassword('ElasticSearch WRITE')) { print('updating') - dfm <- dfm_gen(out, text = 'lemmas') %>% + dfm <- dfm_gen(out, text = text) %>% dfm_keep(dfm_words, valuetype="fixed", verbose=T) pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm)) bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set') diff --git a/R/dfm_gen.R b/R/dfm_gen.R index 53b689e..7eb1141 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -3,7 +3,7 @@ #' Generates dfm from ElasticSearch output #' @param out The elasticizer-generated data frame #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document -#' @param text String indicating whether the "merged" field will contain the "full" text, or "lemmas" +#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" #' @return A Quanteda dfm #' @export #' @examples @@ -21,8 +21,8 @@ dfm_gen <- function(out, words = '999', text = "lemmas") { out <- out %>% select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field fields <- length(names(out)) - if (text == "lemmas") { - out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, mc.cores = detectCores())) + if (text == "lemmas" || text == 'ud') { + out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores())) } if (text == "full") { out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "), diff --git a/R/merger.R b/R/merger.R index 975d32a..50b0f83 100644 --- a/R/merger.R +++ b/R/merger.R @@ -4,18 +4,24 @@ #' @param row A row number form the Elasticizer-generated data frame #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param out The elasticizer-generated data frame +#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" #' @return A documentified string of lemmas, one document at a time #' @export #' @examples -#' merger(1, words = '999', out = out) +#' merger(1, words = '999', out, text) ################################################################################################# #################################### Reconstructing documents from lemmas######################## ################################################################################################# ## Only merging lemmas for now, feature selection has no impact on junk classification -merger <- function(row, out = out) { +merger <- function(row, out, text) { df <- out[row,] # Mergin lemmas into single string - lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ') + if (text == 'lemmas') { + lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ') + } + if (text == 'ud') { + lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ') + } # Replacing $-marked punctuation with their regular forms lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>% ### Removing numbers and non-words containing numbers diff --git a/man/class_update.Rd b/man/class_update.Rd index 25aaeae..ba6224d 100644 --- a/man/class_update.Rd +++ b/man/class_update.Rd @@ -4,7 +4,7 @@ \alias{class_update} \title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information} \usage{ -class_update(out, localhost = T, model_final, dfm_words, varname, +class_update(out, localhost = T, model_final, dfm_words, varname, text, es_super = .rs.askForPassword("ElasticSearch WRITE")) } \arguments{ @@ -18,6 +18,8 @@ class_update(out, localhost = T, model_final, dfm_words, varname, \item{varname}{String containing the variable name to use for the classification result, usually has the format computerCodes.varname} +\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} + \item{es_super}{Password for write access to ElasticSearch} } \value{ diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd index 54d86c7..1e3b66b 100644 --- a/man/dfm_gen.Rd +++ b/man/dfm_gen.Rd @@ -11,7 +11,7 @@ dfm_gen(out, words = "999", text = "lemmas") \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} -\item{text}{String indicating whether the "merged" field will contain the "full" text, or "lemmas"} +\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} } \value{ A Quanteda dfm diff --git a/man/merger.Rd b/man/merger.Rd index dea2dbd..4efe147 100644 --- a/man/merger.Rd +++ b/man/merger.Rd @@ -4,13 +4,15 @@ \alias{merger} \title{Merges list of lemmas back into a pseudo-document} \usage{ -merger(row, out = out) +merger(row, out, text) } \arguments{ \item{row}{A row number form the Elasticizer-generated data frame} \item{out}{The elasticizer-generated data frame} +\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} + \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} } \value{ @@ -20,5 +22,5 @@ A documentified string of lemmas, one document at a time Merges list of lemmas back into a pseudo-document } \examples{ -merger(1, words = '999', out = out) +merger(1, words = '999', out, text) }