diff --git a/Old/merger_old.R b/Old/merger_old.R new file mode 100644 index 0000000..80a66e1 --- /dev/null +++ b/Old/merger_old.R @@ -0,0 +1,47 @@ +#' Merges list of lemmas back into a pseudo-document +#' +#' Merges list of lemmas back into a pseudo-document +#' @param row A row number form the Elasticizer-generated data frame +#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document +#' @param out The elasticizer-generated data frame +#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" +#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). +#' @return A documentified string of lemmas, one document at a time +#' @export +#' @examples +#' merger(1, words = '999', out, text) +################################################################################################# +#################################### Reconstructing documents from lemmas######################## +################################################################################################# +## Only merging lemmas for now, feature selection has no impact on junk classification +merger_old <- function(row, out, text, clean) { + df <- out[row,] + # Mergin lemmas into single string + if (text == 'lemmas') { + lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ') + } + if (text == 'ud') { + lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ') + } + if (text == 'ud_upos') { + df <- unnest(df,`_source.ud`) + lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>% + # Regex removes all words consisting of or containing numbers, @#$% + # Punctuation is not taken into account, as it is already filtered out, see above + {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . } + # In the very rare but obviously occuring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string + if (length(lemmas) == 0 ){ + lemmas <- '' + } + return(lemmas) + } + # Replacing $-marked punctuation with their regular forms + lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>% + # Regex removes all words consisting of or containing numbers, @#$% + # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above + # Regex also used in out_parser + {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>% + # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". " + paste0(.,". ") + return(lemmas) +} diff --git a/R/class_update.R b/R/class_update.R index 596c1bb..68d1294 100644 --- a/R/class_update.R +++ b/R/class_update.R @@ -20,7 +20,7 @@ ################################################################################################# class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) { print('updating') - dfm <- dfm_gen(out, text = text, words = words, clean = clean, cores = cores) + dfm <- dfm_gen(out, text = text, words = words, clean = clean) if (!is.null(model_final$idf)) { dfm <- dfm_weight(dfm, weights = model_final$idf) } diff --git a/R/dfm_gen.R b/R/dfm_gen.R index 187cbfa..d086b0c 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -5,7 +5,6 @@ #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). -#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available) #' @param tolower Boolean indicating whether dfm features should be lowercased #' @return A Quanteda dfm #' @export @@ -19,16 +18,16 @@ # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack -dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1, tolower = T) { +dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) { # Create subset with just ids, codes and text out <- out %>% select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field fields <- length(names(out)) if (text == "lemmas" || text == 'ud' || text == 'ud_upos') { - out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = cores)) + out <- left_join(out, merger(out, text=text, clean=clean), by = "_id") } if (text == "full") { - out <- mamlr:::out_parser(out, field = '_source' , clean = clean, cores = cores) + out <- mamlr:::out_parser(out, field = '_source' , clean = clean) } if ('_source.codes.majorTopic' %in% colnames(out)) { out <- out %>% diff --git a/R/elasticizer.R b/R/elasticizer.R index 69a8116..10f5e9f 100644 --- a/R/elasticizer.R +++ b/R/elasticizer.R @@ -143,7 +143,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw scroll_clear(conn = conn, x = json$`_scroll_id`) return("Done updating") } else if (dump) { - saveRDS(out, file = paste0('df_raw',as.numeric(as.POSIXct(Sys.time())),'.Rds')) + saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds')) } else { scroll_clear(conn = conn, x = json$`_scroll_id`) return(out) diff --git a/R/merger.R b/R/merger.R index 5c0c0c0..ed16153 100644 --- a/R/merger.R +++ b/R/merger.R @@ -1,47 +1,59 @@ #' Merges list of lemmas back into a pseudo-document #' #' Merges list of lemmas back into a pseudo-document -#' @param row A row number form the Elasticizer-generated data frame -#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param out The elasticizer-generated data frame #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). #' @return A documentified string of lemmas, one document at a time #' @export #' @examples -#' merger(1, words = '999', out, text) +#' merger(out, text, clean) ################################################################################################# #################################### Reconstructing documents from lemmas######################## ################################################################################################# ## Only merging lemmas for now, feature selection has no impact on junk classification -merger <- function(row, out, text, clean) { - df <- out[row,] - # Mergin lemmas into single string - if (text == 'lemmas') { - lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ') - } - if (text == 'ud') { - lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ') - } +merger <- function(out, text, clean) { + df <- unnest(out, cols = '_source.ud') %>% + unnest(cols = c('lemma','upos')) %>% + # This line is added in the new merger function, in the old merger function this would result in the following: + # 1: when using ud, it would result in the string "NA" being present in place of the faulty lemma + # 2: when using ud_upos, it would result in the entire article becoming NA, because of str_c() returning NA when any value is NA + filter(!is.na(lemma)) %>% + group_by(`_id`) if (text == 'ud_upos') { - df <- unnest(df,`_source.ud`) - lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>% + df <- df %>% + filter(upos != 'PUNCT') %>% + mutate( + lem_u = str_c(lemma,upos,sep="_") + ) %>% + summarise( + merged = str_c(c(lem_u), collapse= ' ') + ) %>% # Regex removes all words consisting of or containing numbers, @#$% # Punctuation is not taken into account, as it is already filtered out, see above - {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . } - # In the very rare but obviously occuring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string - if (length(lemmas) == 0 ){ - lemmas <- '' - } - return(lemmas) + {if(clean == T) mutate(., + merged = str_replace_all(merged,"\\S*?[0-9@#$%]+[^\\s]*", "") + ) + else . } + } + if (text == 'ud') { + df <- df %>% + summarise( + merged = str_c(c(lemma), collapse= ' ') + ) %>% + mutate( + merged = str_replace_all(merged," \\$(.+?)", "\\1") + ) %>% + # Regex removes all words consisting of or containing numbers, @#$% + # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above + # Regex also used in out_parser + # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". " + {if(clean == T) mutate(., + merged = str_replace_all(merged,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") + ) + else . } %>% + mutate(., + merged = paste0(merged,'. ')) } - # Replacing $-marked punctuation with their regular forms - lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>% - # Regex removes all words consisting of or containing numbers, @#$% - # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above - # Regex also used in out_parser - {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>% - # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". " - paste0(.,". ") - return(lemmas) + return(df) } diff --git a/R/ud_update.R b/R/ud_update.R index d8632d1..9853004 100644 --- a/R/ud_update.R +++ b/R/ud_update.R @@ -2,15 +2,13 @@ #' #' Elasticizer update function: generate UDpipe output from base text #' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function) -#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200) #' @param udmodel UDpipe model to use -#' @param es_super Password for write access to ElasticSearch -#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available) #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2') +#' @param file Filename for output (ud_ is automatically prepended) #' @return A vector of 1's indicating the success of each update call #' @export #' @examples -#' ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores()) +#' ud_update(out, udmodel, ver, file) #' # punct_check <- function(str) { @@ -19,30 +17,26 @@ # } # } -ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) { +ud_update <- function(out, udmodel, ver) { out <- mamlr:::out_parser(out, field = '_source', clean = F) - par_proc <- function(row, out, udmodel) { - doc <- out[row,] - ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>% - group_by(doc_id) %>% - summarise( - sentence_id = list(as.integer(sentence_id)), - token_id = list(as.integer(token_id)), - lemma = list(as.character(lemma)), - upos = list(as.character(upos)), - feats = list(as.character(feats)), - head_token_id = list(as.integer(head_token_id)), - dep_rel = list(as.character(dep_rel)), - start = list(as.integer(start)), - end = list(as.integer(end)), - exists = list(TRUE) - ) - return(ud) - } - ud <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_proc, out = out, udmodel=udmodel, mc.cores = cores)) + ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>% + group_by(doc_id) %>% + summarise( + sentence_id = list(as.integer(sentence_id)), + token_id = list(as.integer(token_id)), + lemma = list(as.character(lemma)), + upos = list(as.character(upos)), + feats = list(as.character(feats)), + head_token_id = list(as.integer(head_token_id)), + dep_rel = list(as.character(dep_rel)), + start = list(as.integer(start)), + end = list(as.integer(end)), + exists = list(TRUE) + ) bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver) - res <- elastic_update(bulk, es_super = es_super, localhost = localhost) - return(res) + saveRDS(bulk, file = paste0('ud_',file)) + # res <- elastic_update(bulk, es_super = es_super, localhost = localhost) + return() } #### Old code #### diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd index 93ecaf9..595d08b 100644 --- a/man/dfm_gen.Rd +++ b/man/dfm_gen.Rd @@ -4,7 +4,7 @@ \alias{dfm_gen} \title{Generates dfm from ElasticSearch output} \usage{ -dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T) +dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T) } \arguments{ \item{out}{The elasticizer-generated data frame} @@ -15,8 +15,6 @@ dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T) \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).} -\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)} - \item{tolower}{Boolean indicating whether dfm features should be lowercased} } \value{ diff --git a/man/merger.Rd b/man/merger.Rd index 85f280a..69ec7b7 100644 --- a/man/merger.Rd +++ b/man/merger.Rd @@ -4,18 +4,14 @@ \alias{merger} \title{Merges list of lemmas back into a pseudo-document} \usage{ -merger(row, out, text, clean) +merger(out, text, clean) } \arguments{ -\item{row}{A row number form the Elasticizer-generated data frame} - \item{out}{The elasticizer-generated data frame} \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).} - -\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} } \value{ A documentified string of lemmas, one document at a time @@ -24,5 +20,5 @@ A documentified string of lemmas, one document at a time Merges list of lemmas back into a pseudo-document } \examples{ -merger(1, words = '999', out, text) +merger(out, text, clean) } diff --git a/man/ud_update.Rd b/man/ud_update.Rd index f636857..91bf9e4 100644 --- a/man/ud_update.Rd +++ b/man/ud_update.Rd @@ -4,27 +4,16 @@ \alias{ud_update} \title{Elasticizer update function: generate UDpipe output from base text} \usage{ -ud_update( - out, - localhost = T, - udmodel, - es_super = .rs.askForPassword("ElasticSearch WRITE"), - cores = detectCores(), - ver -) +ud_update(out, udmodel, ver) } \arguments{ \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)} -\item{localhost}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)} - \item{udmodel}{UDpipe model to use} -\item{es_super}{Password for write access to ElasticSearch} - -\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)} - \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')} + +\item{file}{Filename for output (ud_ is automatically prepended)} } \value{ A vector of 1's indicating the success of each update call @@ -33,6 +22,6 @@ A vector of 1's indicating the success of each update call Elasticizer update function: generate UDpipe output from base text } \examples{ -ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores()) +ud_update(out, udmodel, ver, file) }