class_update: remove dfm_gen multicore option

dfm_gen: remove multicore, update merger() code elasticizer: changed filenaming scheme for dump option merger: Fixed bug where an NA lemma would cause the entire document to become NA. Now the NA lemmas are filtered out before merging ud_update: removed parallel processing, changed script to save bulk updates in .Rds files instead of sending them straight away
4 years ago · 4b4d860235
parent 5d99ec9509
commit 4b4d860235
9 changed files with 120 additions and 85 deletions
--- a/Old/merger_old.R
+++ b/Old/merger_old.R
@ -0,0 +1,47 @@
+#' Merges list of lemmas back into a pseudo-document
+#'
+#' Merges list of lemmas back into a pseudo-document
+#' @param row A row number form the Elasticizer-generated data frame
+#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
+#' @param out The elasticizer-generated data frame
+#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
+#' @return A documentified string of lemmas, one document at a time
+#' @export
+#' @examples
+#' merger(1, words = '999', out, text)
+#################################################################################################
+#################################### Reconstructing documents from lemmas########################
+#################################################################################################
+## Only merging lemmas for now, feature selection has no impact on junk classification
+merger_old <- function(row, out, text, clean) {
+  df <- out[row,]
+  # Mergin lemmas into single string
+  if (text == 'lemmas') {
+    lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
+  }
+  if (text == 'ud') {
+    lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
+  }
+  if (text == 'ud_upos') {
+    df <- unnest(df,`_source.ud`)
+    lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is not taken into account, as it is already filtered out, see above
+      {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "")  else . }
+    # In the very rare but obviously occuring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
+    if (length(lemmas) == 0 ){
+      lemmas <- ''
+    }
+    return(lemmas)
+  }
+  # Replacing $-marked punctuation with their regular forms
+  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
+    # Regex removes all words consisting of or containing numbers, @#$%
+    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
+    # Regex also used in out_parser
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . } %>%
+    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
+    paste0(.,". ")
+  return(lemmas)
+}
--- a/R/class_update.R
+++ b/R/class_update.R
@ -20,7 +20,7 @@
 #################################################################################################
 class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) {
  print('updating')
-  dfm <- dfm_gen(out, text = text, words = words, clean = clean, cores = cores)
+  dfm <- dfm_gen(out, text = text, words = words, clean = clean)
  if (!is.null(model_final$idf)) {
    dfm <- dfm_weight(dfm, weights = model_final$idf)
  }
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@ -5,7 +5,6 @@
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
-#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
 #' @param tolower Boolean indicating whether dfm features should be lowercased
 #' @return A Quanteda dfm
 #' @export
@ -19,16 +18,16 @@

 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack

-dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1, tolower = T) {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
  # Create subset with just ids, codes and text
  out <- out %>%
    select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
  fields <- length(names(out))
  if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
-    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = cores))
+    out <- left_join(out, merger(out, text=text, clean=clean), by = "_id")
  }
  if (text == "full") {
-    out <- mamlr:::out_parser(out, field = '_source' , clean = clean, cores = cores)
+    out <- mamlr:::out_parser(out, field = '_source' , clean = clean)
  }
  if ('_source.codes.majorTopic' %in% colnames(out)) {
    out <- out %>%
--- a/R/elasticizer.R
+++ b/R/elasticizer.R
@ -143,7 +143,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
      scroll_clear(conn = conn, x = json$`_scroll_id`)
      return("Done updating")
    } else if (dump) {
-      saveRDS(out, file = paste0('df_raw',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
+      saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
    } else {
      scroll_clear(conn = conn, x = json$`_scroll_id`)
      return(out)
--- a/R/merger.R
+++ b/R/merger.R
@ -1,47 +1,59 @@
 #' Merges list of lemmas back into a pseudo-document
 #'
 #' Merges list of lemmas back into a pseudo-document
-#' @param row A row number form the Elasticizer-generated data frame
-#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param out The elasticizer-generated data frame
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
 #' @return A documentified string of lemmas, one document at a time
 #' @export
 #' @examples
-#' merger(1, words = '999', out, text)
+#' merger(out, text, clean)
 #################################################################################################
 #################################### Reconstructing documents from lemmas########################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, out, text, clean) {
-  df <- out[row,]
-  # Mergin lemmas into single string
-  if (text == 'lemmas') {
-    lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
-  }
-  if (text == 'ud') {
-    lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
-  }
+merger <- function(out, text, clean) {
+  df <- unnest(out, cols = '_source.ud') %>%
+    unnest(cols = c('lemma','upos')) %>%
+    # This line is added in the new merger function, in the old merger function this would result in the following:
+    # 1: when using ud, it would result in the string "NA" being present in place of the faulty lemma
+    # 2: when using ud_upos, it would result in the entire article becoming NA, because of str_c() returning NA when any value is NA
+    filter(!is.na(lemma)) %>%
+    group_by(`_id`)
  if (text == 'ud_upos') {
-    df <- unnest(df,`_source.ud`)
-    lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
+    df <- df %>%
+      filter(upos != 'PUNCT') %>%
+      mutate(
+        lem_u = str_c(lemma,upos,sep="_")
+      ) %>%
+      summarise(
+                merged = str_c(c(lem_u), collapse= ' ')
+      ) %>%
      # Regex removes all words consisting of or containing numbers, @#$%
      # Punctuation is not taken into account, as it is already filtered out, see above
-    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "")  else . }
-    # In the very rare but obviously occuring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
-    if (length(lemmas) == 0 ){
-      lemmas <- ''
-    }
-    return(lemmas)
+      {if(clean == T) mutate(.,
+                             merged = str_replace_all(merged,"\\S*?[0-9@#$%]+[^\\s]*", "")
+      )
+        else . }
+  }
+  if (text == 'ud') {
+    df <- df %>%
+      summarise(
+                merged = str_c(c(lemma), collapse= ' ')
+      ) %>%
+      mutate(
+        merged = str_replace_all(merged," \\$(.+?)", "\\1")
+      ) %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
+      # Regex also used in out_parser
+      # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
+      {if(clean == T) mutate(.,
+                             merged = str_replace_all(merged,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")
+      )
+        else . } %>%
+      mutate(.,
+             merged = paste0(merged,'. '))
  }
-  # Replacing $-marked punctuation with their regular forms
-  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
-    # Regex removes all words consisting of or containing numbers, @#$%
-    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
-    # Regex also used in out_parser
-    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . } %>%
-    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
-    paste0(.,". ")
-  return(lemmas)
+  return(df)
 }
--- a/R/ud_update.R
+++ b/R/ud_update.R
@ -2,15 +2,13 @@
 #'
 #' Elasticizer update function: generate UDpipe output from base text
 #' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
-#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
 #' @param udmodel UDpipe model to use
-#' @param es_super Password for write access to ElasticSearch
-#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
+#' @param file Filename for output (ud_ is automatically prepended)
 #' @return A vector of 1's indicating the success of each update call
 #' @export
 #' @examples
-#' ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
+#' ud_update(out, udmodel, ver, file)
 #'

 # punct_check <- function(str) {
@ -19,30 +17,26 @@
 #   }
 # }

-ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
+ud_update <- function(out, udmodel, ver) {
  out <- mamlr:::out_parser(out, field = '_source', clean = F)
-  par_proc <- function(row, out, udmodel) {
-    doc <- out[row,]
-    ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
-      group_by(doc_id) %>%
-      summarise(
-        sentence_id = list(as.integer(sentence_id)),
-        token_id = list(as.integer(token_id)),
-        lemma = list(as.character(lemma)),
-        upos = list(as.character(upos)),
-        feats = list(as.character(feats)),
-        head_token_id = list(as.integer(head_token_id)),
-        dep_rel = list(as.character(dep_rel)),
-        start = list(as.integer(start)),
-        end = list(as.integer(end)),
-        exists = list(TRUE)
-     )
-    return(ud)
-  }
-  ud <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_proc, out = out, udmodel=udmodel, mc.cores = cores))
+  ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
+    group_by(doc_id) %>%
+    summarise(
+      sentence_id = list(as.integer(sentence_id)),
+      token_id = list(as.integer(token_id)),
+      lemma = list(as.character(lemma)),
+      upos = list(as.character(upos)),
+      feats = list(as.character(feats)),
+      head_token_id = list(as.integer(head_token_id)),
+      dep_rel = list(as.character(dep_rel)),
+      start = list(as.integer(start)),
+      end = list(as.integer(end)),
+      exists = list(TRUE)
+   )
  bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
-  res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
-  return(res)
+  saveRDS(bulk, file = paste0('ud_',file))
+  # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
+  return()
 }

 #### Old code ####
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@ -4,7 +4,7 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)
+dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@ -15,8 +15,6 @@ dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)

 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}

-\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)}
-
 \item{tolower}{Boolean indicating whether dfm features should be lowercased}
 }
 \value{
--- a/man/merger.Rd
+++ b/man/merger.Rd
@ -4,18 +4,14 @@
 \alias{merger}
 \title{Merges list of lemmas back into a pseudo-document}
 \usage{
-merger(row, out, text, clean)
+merger(out, text, clean)
 }
 \arguments{
-\item{row}{A row number form the Elasticizer-generated data frame}
-
 \item{out}{The elasticizer-generated data frame}

 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}

 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
-
-\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
 }
 \value{
 A documentified string of lemmas, one document at a time
@ -24,5 +20,5 @@ A documentified string of lemmas, one document at a time
 Merges list of lemmas back into a pseudo-document
 }
 \examples{
-merger(1, words = '999', out, text)
+merger(out, text, clean)
 }
--- a/man/ud_update.Rd
+++ b/man/ud_update.Rd
@ -4,27 +4,16 @@
 \alias{ud_update}
 \title{Elasticizer update function: generate UDpipe output from base text}
 \usage{
-ud_update(
-  out,
-  localhost = T,
-  udmodel,
-  es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = detectCores(),
-  ver
-)
+ud_update(out, udmodel, ver)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

-\item{localhost}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)}
-
 \item{udmodel}{UDpipe model to use}

-\item{es_super}{Password for write access to ElasticSearch}
-
-\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
-
 \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
+
+\item{file}{Filename for output (ud_ is automatically prepended)}
 }
 \value{
 A vector of 1's indicating the success of each update call
@ -33,6 +22,6 @@ A vector of 1's indicating the success of each update call
 Elasticizer update function: generate UDpipe output from base text
 }
 \examples{
-ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
+ud_update(out, udmodel, ver, file)

 }