class_update: remove dfm_gen multicore option

dfm_gen: remove multicore option, update merger() call
elasticizer: change file-naming scheme for the dump option
merger: fix bug where an NA lemma would cause the entire document to become NA; NA lemmas are now filtered out before merging
ud_update: remove parallel processing; save bulk updates to .Rds files instead of sending them straight away
master
Your Name 4 years ago
parent 5d99ec9509
commit 4b4d860235
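
A minimal R sketch (editorial note, not part of the commit) of the NA-lemma bug the merger fix addresses: stringr's str_c() returns NA for the whole collapse when any element is NA, while base paste() inserts the literal string "NA".

library(stringr)

lemmas <- c("economy", NA, "vote")
paste(lemmas, collapse = " ")                  # "economy NA vote": the old 'ud' behaviour
str_c(lemmas, collapse = " ")                  # NA: one missing lemma blanks the whole 'ud_upos' document
str_c(lemmas[!is.na(lemmas)], collapse = " ")  # "economy vote": the behaviour after filtering, as in the fix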

@@ -0,0 +1,46 @@
#' Merges list of lemmas back into a pseudo-document
#'
#' Merges list of lemmas back into a pseudo-document
#' @param row A row number from the Elasticizer-generated data frame
#' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger_old(1, out, text, clean)
#################################################################################################
#################################### Reconstructing documents from lemmas #######################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger_old <- function(row, out, text, clean) {
  df <- out[row, ]
  # Merging lemmas into a single string
  if (text == 'lemmas') {
    lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]], collapse = ' ')
  }
  if (text == 'ud') {
    lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
  }
  if (text == 'ud_upos') {
    df <- unnest(df, `_source.ud`)
    lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
      # Regex removes all words consisting of or containing numbers, @#$%
      # Punctuation is not taken into account, as it is already filtered out, see above
      {if (clean == T) str_replace_all(., "\\S*?[0-9@#$%]+[^\\s]*", "") else . }
    # In the very rare but obviously occurring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
    if (length(lemmas) == 0) {
      lemmas <- ''
    }
    return(lemmas)
  }
  # Replacing $-marked punctuation with their regular forms
  lemmas <- str_replace_all(lemmas, " \\$(.+?)", "\\1") %>%
    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
    # Regex also used in out_parser
    {if (clean == T) str_replace_all(., "\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
    paste0(., ". ")
  return(lemmas)
}
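
A quick illustration of the clean regex above (made-up tokens, not part of the commit): it drops any whitespace-delimited token that contains digits or @#$%.

library(stringr)

str_replace_all("win_VERB 3_NUM vote_NOUN", "\\S*?[0-9@#$%]+[^\\s]*", "")
# [1] "win_VERB  vote_NOUN"   (the token containing a digit is removed)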

@@ -20,7 +20,7 @@
 #################################################################################################
 class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) {
   print('updating')
-  dfm <- dfm_gen(out, text = text, words = words, clean = clean, cores = cores)
+  dfm <- dfm_gen(out, text = text, words = words, clean = clean)
   if (!is.null(model_final$idf)) {
     dfm <- dfm_weight(dfm, weights = model_final$idf)
   }

@@ -5,7 +5,6 @@
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
-#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
 #' @param tolower Boolean indicating whether dfm features should be lowercased
 #' @return A Quanteda dfm
 #' @export
@@ -19,16 +18,16 @@
   # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
-dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1, tolower = T) {
+dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
   if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
-    out$merged <- unlist(mclapply(seq(1, length(out[[1]]), 1), merger, out = out, text = text, clean = clean, mc.cores = cores))
+    out <- left_join(out, merger(out, text = text, clean = clean), by = "_id")
   }
   if (text == "full") {
-    out <- mamlr:::out_parser(out, field = '_source', clean = clean, cores = cores)
+    out <- mamlr:::out_parser(out, field = '_source', clean = clean)
   }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
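
The practical effect of the change above, sketched with the same objects (a summary, not additional commit code): merger() is now called once on the whole data frame and returns one row per document, which is joined back onto the Elasticizer output instead of being built row-by-row with mclapply().

merged <- merger(out, text = text, clean = clean)  # tibble with `_id` and 'merged'
out <- left_join(out, merged, by = "_id")          # one vectorised join replaces the per-row loop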

@@ -143,7 +143,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
       scroll_clear(conn = conn, x = json$`_scroll_id`)
       return("Done updating")
     } else if (dump) {
-      saveRDS(out, file = paste0('df_raw', as.numeric(as.POSIXct(Sys.time())), '.Rds'))
+      saveRDS(out, file = paste0('batch_', batch*batch_size, '.Rds'))
     } else {
       scroll_clear(conn = conn, x = json$`_scroll_id`)
       return(out)
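
With the new scheme each dump is numbered by the count of documents scrolled so far, rather than timestamped. A hypothetical downstream step (assumed, not part of the commit) could recombine the batches like this:

library(dplyr)

files <- list.files(pattern = '^batch_[0-9]+\\.Rds$')
out <- bind_rows(lapply(files, readRDS))  # reassemble the full Elasticizer output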

@@ -1,47 +1,59 @@
 #' Merges list of lemmas back into a pseudo-document
 #'
 #' Merges list of lemmas back into a pseudo-document
-#' @param row A row number from the Elasticizer-generated data frame
-#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param out The elasticizer-generated data frame
-#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
+#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
 #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
-#' @return A documentified string of lemmas, one document at a time
+#' @return A data frame with one row per document id and a "merged" column containing the reconstructed pseudo-documents
 #' @export
 #' @examples
-#' merger(1, words = '999', out, text)
+#' merger(out, text, clean)
 #################################################################################################
 #################################### Reconstructing documents from lemmas #######################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, out, text, clean) {
-  df <- out[row, ]
-  # Merging lemmas into a single string
-  if (text == 'lemmas') {
-    lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]], collapse = ' ')
-  }
-  if (text == 'ud') {
-    lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
-  }
-  if (text == 'ud_upos') {
-    df <- unnest(df, `_source.ud`)
-    lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
-      # Regex removes all words consisting of or containing numbers, @#$%
-      # Punctuation is not taken into account, as it is already filtered out, see above
-      {if (clean == T) str_replace_all(., "\\S*?[0-9@#$%]+[^\\s]*", "") else . }
-    # In the very rare but obviously occurring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
-    if (length(lemmas) == 0) {
-      lemmas <- ''
-    }
-    return(lemmas)
-  }
-  # Replacing $-marked punctuation with their regular forms
-  lemmas <- str_replace_all(lemmas, " \\$(.+?)", "\\1") %>%
-    # Regex removes all words consisting of or containing numbers, @#$%
-    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
-    # Regex also used in out_parser
-    {if (clean == T) str_replace_all(., "\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
-    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
-    paste0(., ". ")
-  return(lemmas)
+merger <- function(out, text, clean) {
+  df <- unnest(out, cols = '_source.ud') %>%
+    unnest(cols = c('lemma', 'upos')) %>%
+    # The NA filter below is new; the old merger would instead:
+    # 1: when using ud, produce the literal string "NA" in place of the faulty lemma
+    # 2: when using ud_upos, turn the entire article into NA, because str_c() returns NA when any value is NA
+    filter(!is.na(lemma)) %>%
+    group_by(`_id`)
+  if (text == 'ud_upos') {
+    df <- df %>%
+      filter(upos != 'PUNCT') %>%
+      mutate(
+        lem_u = str_c(lemma, upos, sep = "_")
+      ) %>%
+      summarise(
+        merged = str_c(lem_u, collapse = ' ')
+      ) %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is not taken into account, as it is already filtered out, see above
+      {if (clean == T) mutate(.,
+        merged = str_replace_all(merged, "\\S*?[0-9@#$%]+[^\\s]*", "")
+      ) else . }
+  }
+  if (text == 'ud') {
+    df <- df %>%
+      summarise(
+        merged = str_c(lemma, collapse = ' ')
+      ) %>%
+      # Replacing $-marked punctuation with their regular forms
+      mutate(
+        merged = str_replace_all(merged, " \\$(.+?)", "\\1")
+      ) %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
+      # Regex also used in out_parser
+      {if (clean == T) mutate(.,
+        merged = str_replace_all(merged, "\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")
+      ) else . } %>%
+      # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
+      mutate(
+        merged = paste0(merged, '. ')
+      )
+  }
+  return(df)
 }
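
A toy round-trip (hypothetical data, editorial sketch) showing the new return shape: one row per `_id` with a 'merged' column, NA lemmas dropped before collapsing.

library(dplyr); library(tidyr); library(stringr)

toy <- tibble(
  `_id` = c("doc1", "doc2"),
  `_source.ud` = list(
    tibble(lemma = list(c("economy", "grow")), upos = list(c("NOUN", "VERB"))),
    tibble(lemma = list(c("vote", NA)),        upos = list(c("NOUN", "PUNCT")))
  )
)
merger(toy, text = 'ud_upos', clean = F)
# -> doc1 "economy_NOUN grow_VERB", doc2 "vote_NOUN"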

@@ -2,15 +2,13 @@
 #'
 #' Elasticizer update function: generate UDpipe output from base text
 #' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
-#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
 #' @param udmodel UDpipe model to use
-#' @param es_super Password for write access to ElasticSearch
-#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
 #' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
+#' @param file Filename for output (ud_ is automatically prepended)
-#' @return A vector of 1's indicating the success of each update call
+#' @return Nothing; the bulk update actions are saved to an .Rds file instead of being sent to Elasticsearch directly
 #' @export
 #' @examples
-#' ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
+#' ud_update(out, udmodel, ver, file)
 #'

 # punct_check <- function(str) {
@@ -19,30 +17,26 @@
 #   }
 # }
-ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
+ud_update <- function(out, udmodel, ver, file) {
   out <- mamlr:::out_parser(out, field = '_source', clean = F)
-  par_proc <- function(row, out, udmodel) {
-    doc <- out[row, ]
-    ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
-      group_by(doc_id) %>%
-      summarise(
-        sentence_id = list(as.integer(sentence_id)),
-        token_id = list(as.integer(token_id)),
-        lemma = list(as.character(lemma)),
-        upos = list(as.character(upos)),
-        feats = list(as.character(feats)),
-        head_token_id = list(as.integer(head_token_id)),
-        dep_rel = list(as.character(dep_rel)),
-        start = list(as.integer(start)),
-        end = list(as.integer(end)),
-        exists = list(TRUE)
-      )
-    return(ud)
-  }
-  ud <- bind_rows(mclapply(seq(1, length(out[[1]]), 1), par_proc, out = out, udmodel = udmodel, mc.cores = cores))
+  ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
+    group_by(doc_id) %>%
+    summarise(
+      sentence_id = list(as.integer(sentence_id)),
+      token_id = list(as.integer(token_id)),
+      lemma = list(as.character(lemma)),
+      upos = list(as.character(upos)),
+      feats = list(as.character(feats)),
+      head_token_id = list(as.integer(head_token_id)),
+      dep_rel = list(as.character(dep_rel)),
+      start = list(as.integer(start)),
+      end = list(as.integer(end)),
+      exists = list(TRUE)
+    )
   bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
-  res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
-  return(res)
+  saveRDS(bulk, file = paste0('ud_', file))
+  # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
+  return()
 }
 #### Old code ####
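
The commented-out elastic_update() call suggests the intended workflow: write the bulk actions to disk now, push them later. A hypothetical follow-up session (assumed filename and credentials, not part of the commit) might look like:

bulk <- readRDS('ud_myfile.Rds')  # 'ud_myfile.Rds' is an example filename
res <- elastic_update(bulk, es_super = .rs.askForPassword('ElasticSearch WRITE'), localhost = T)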

@@ -4,7 +4,7 @@
 \alias{dfm_gen}
 \title{Generates dfm from ElasticSearch output}
 \usage{
-dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)
+dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
 }
 \arguments{
 \item{out}{The elasticizer-generated data frame}
@@ -15,8 +15,6 @@
 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
 
-\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)}
-
 \item{tolower}{Boolean indicating whether dfm features should be lowercased}
 }
 \value{

@@ -4,18 +4,14 @@
 \alias{merger}
 \title{Merges list of lemmas back into a pseudo-document}
 \usage{
-merger(row, out, text, clean)
+merger(out, text, clean)
 }
 \arguments{
-\item{row}{A row number from the Elasticizer-generated data frame}
-
 \item{out}{The elasticizer-generated data frame}
 
 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
 
 \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
-
-\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
 }
 \value{
-A documentified string of lemmas, one document at a time
+A data frame with one row per document id and a "merged" column containing the reconstructed pseudo-documents
@@ -24,5 +20,5 @@
 Merges list of lemmas back into a pseudo-document
 }
 \examples{
-merger(1, words = '999', out, text)
+merger(out, text, clean)
 }

@@ -4,27 +4,16 @@
 \alias{ud_update}
 \title{Elasticizer update function: generate UDpipe output from base text}
 \usage{
-ud_update(
-  out,
-  localhost = T,
-  udmodel,
-  es_super = .rs.askForPassword("ElasticSearch WRITE"),
-  cores = detectCores(),
-  ver
-)
+ud_update(out, udmodel, ver, file)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
 
-\item{localhost}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)}
-
 \item{udmodel}{UDpipe model to use}
 
-\item{es_super}{Password for write access to ElasticSearch}
-
-\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
-
 \item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
+
+\item{file}{Filename for output (ud_ is automatically prepended)}
 }
 \value{
-A vector of 1's indicating the success of each update call
+Nothing; the bulk update actions are saved to an .Rds file instead of being sent to Elasticsearch directly
@@ -33,6 +22,6 @@
 Elasticizer update function: generate UDpipe output from base text
 }
 \examples{
-ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
+ud_update(out, udmodel, ver, file)
 }
