class_update: remove dfm_gen multicore option

dfm_gen: remove multicore, update merger() code
elasticizer: changed filenaming scheme for dump option
merger: Fixed bug where an NA lemma would cause the entire document to become NA. Now the NA lemmas are filtered out before merging
ud_update: removed parallel processing, changed script to save bulk updates in .Rds files instead of sending them straight away
master
Your Name 5 years ago
parent 5d99ec9509
commit 4b4d860235

@ -0,0 +1,47 @@
#' Merges list of lemmas back into a pseudo-document
#'
#' Merges list of lemmas back into a pseudo-document
#' @param row A row number form the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger(1, words = '999', out, text)
#################################################################################################
#################################### Reconstructing documents from lemmas########################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger_old <- function(row, out, text, clean) {
df <- out[row,]
# Mergin lemmas into single string
if (text == 'lemmas') {
lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
}
if (text == 'ud') {
lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
}
if (text == 'ud_upos') {
df <- unnest(df,`_source.ud`)
lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is not taken into account, as it is already filtered out, see above
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . }
# In the very rare but obviously occuring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
if (length(lemmas) == 0 ){
lemmas <- ''
}
return(lemmas)
}
# Replacing $-marked punctuation with their regular forms
lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# Regex also used in out_parser
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
# Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
paste0(.,". ")
return(lemmas)
}

@ -20,7 +20,7 @@
#################################################################################################
class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE'), cores = 1) {
print('updating')
dfm <- dfm_gen(out, text = text, words = words, clean = clean, cores = cores)
dfm <- dfm_gen(out, text = text, words = words, clean = clean)
if (!is.null(model_final$idf)) {
dfm <- dfm_weight(dfm, weights = model_final$idf)
}

@ -5,7 +5,6 @@
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
#' @param tolower Boolean indicating whether dfm features should be lowercased
#' @return A Quanteda dfm
#' @export
@ -19,16 +18,16 @@
# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1, tolower = T) {
dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T) {
# Create subset with just ids, codes and text
out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
fields <- length(names(out))
if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = cores))
out <- left_join(out, merger(out, text=text, clean=clean), by = "_id")
}
if (text == "full") {
out <- mamlr:::out_parser(out, field = '_source' , clean = clean, cores = cores)
out <- mamlr:::out_parser(out, field = '_source' , clean = clean)
}
if ('_source.codes.majorTopic' %in% colnames(out)) {
out <- out %>%

@ -143,7 +143,7 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
scroll_clear(conn = conn, x = json$`_scroll_id`)
return("Done updating")
} else if (dump) {
saveRDS(out, file = paste0('df_raw',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
} else {
scroll_clear(conn = conn, x = json$`_scroll_id`)
return(out)

@ -1,47 +1,59 @@
#' Merges list of lemmas back into a pseudo-document
#'
#' Merges list of lemmas back into a pseudo-document
#' @param row A row number form the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger(1, words = '999', out, text)
#' merger(out, text, clean)
#################################################################################################
#################################### Reconstructing documents from lemmas########################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(row, out, text, clean) {
df <- out[row,]
# Mergin lemmas into single string
if (text == 'lemmas') {
lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
}
if (text == 'ud') {
lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
}
merger <- function(out, text, clean) {
df <- unnest(out, cols = '_source.ud') %>%
unnest(cols = c('lemma','upos')) %>%
# This line is added in the new merger function, in the old merger function this would result in the following:
# 1: when using ud, it would result in the string "NA" being present in place of the faulty lemma
# 2: when using ud_upos, it would result in the entire article becoming NA, because of str_c() returning NA when any value is NA
filter(!is.na(lemma)) %>%
group_by(`_id`)
if (text == 'ud_upos') {
df <- unnest(df,`_source.ud`)
lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
df <- df %>%
filter(upos != 'PUNCT') %>%
mutate(
lem_u = str_c(lemma,upos,sep="_")
) %>%
summarise(
merged = str_c(c(lem_u), collapse= ' ')
) %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is not taken into account, as it is already filtered out, see above
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . }
# In the very rare but obviously occuring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
if (length(lemmas) == 0 ){
lemmas <- ''
}
return(lemmas)
{if(clean == T) mutate(.,
merged = str_replace_all(merged,"\\S*?[0-9@#$%]+[^\\s]*", "")
)
else . }
}
if (text == 'ud') {
df <- df %>%
summarise(
merged = str_c(c(lemma), collapse= ' ')
) %>%
mutate(
merged = str_replace_all(merged," \\$(.+?)", "\\1")
) %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# Regex also used in out_parser
# Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
{if(clean == T) mutate(.,
merged = str_replace_all(merged,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")
)
else . } %>%
mutate(.,
merged = paste0(merged,'. '))
}
# Replacing $-marked punctuation with their regular forms
lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# Regex also used in out_parser
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
# Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
paste0(.,". ")
return(lemmas)
return(df)
}

@ -2,15 +2,13 @@
#'
#' Elasticizer update function: generate UDpipe output from base text
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param udmodel UDpipe model to use
#' @param es_super Password for write access to ElasticSearch
#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param file Filename for output (ud_ is automatically prepended)
#' @return A vector of 1's indicating the success of each update call
#' @export
#' @examples
#' ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
#' ud_update(out, udmodel, ver, file)
#'
# punct_check <- function(str) {
@ -19,30 +17,26 @@
# }
# }
ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
ud_update <- function(out, udmodel, ver) {
out <- mamlr:::out_parser(out, field = '_source', clean = F)
par_proc <- function(row, out, udmodel) {
doc <- out[row,]
ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%
group_by(doc_id) %>%
summarise(
sentence_id = list(as.integer(sentence_id)),
token_id = list(as.integer(token_id)),
lemma = list(as.character(lemma)),
upos = list(as.character(upos)),
feats = list(as.character(feats)),
head_token_id = list(as.integer(head_token_id)),
dep_rel = list(as.character(dep_rel)),
start = list(as.integer(start)),
end = list(as.integer(end)),
exists = list(TRUE)
)
return(ud)
}
ud <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_proc, out = out, udmodel=udmodel, mc.cores = cores))
ud <- as.data.frame(udpipe(udmodel, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
group_by(doc_id) %>%
summarise(
sentence_id = list(as.integer(sentence_id)),
token_id = list(as.integer(token_id)),
lemma = list(as.character(lemma)),
upos = list(as.character(upos)),
feats = list(as.character(feats)),
head_token_id = list(as.integer(head_token_id)),
dep_rel = list(as.character(dep_rel)),
start = list(as.integer(start)),
end = list(as.integer(end)),
exists = list(TRUE)
)
bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
return(res)
saveRDS(bulk, file = paste0('ud_',file))
# res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
return()
}
#### Old code ####

@ -4,7 +4,7 @@
\alias{dfm_gen}
\title{Generates dfm from ElasticSearch output}
\usage{
dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)
dfm_gen(out, words = "999", text = "lemmas", clean, tolower = T)
}
\arguments{
\item{out}{The elasticizer-generated data frame}
@ -15,8 +15,6 @@ dfm_gen(out, words = "999", text = "lemmas", clean, cores = 1, tolower = T)
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
\item{cores}{Number of cores to use for parallel processing, defaults to cores (all cores available)}
\item{tolower}{Boolean indicating whether dfm features should be lowercased}
}
\value{

@ -4,18 +4,14 @@
\alias{merger}
\title{Merges list of lemmas back into a pseudo-document}
\usage{
merger(row, out, text, clean)
merger(out, text, clean)
}
\arguments{
\item{row}{A row number form the Elasticizer-generated data frame}
\item{out}{The elasticizer-generated data frame}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
}
\value{
A documentified string of lemmas, one document at a time
@ -24,5 +20,5 @@ A documentified string of lemmas, one document at a time
Merges list of lemmas back into a pseudo-document
}
\examples{
merger(1, words = '999', out, text)
merger(out, text, clean)
}

@ -4,27 +4,16 @@
\alias{ud_update}
\title{Elasticizer update function: generate UDpipe output from base text}
\usage{
ud_update(
out,
localhost = T,
udmodel,
es_super = .rs.askForPassword("ElasticSearch WRITE"),
cores = detectCores(),
ver
)
ud_update(out, udmodel, ver)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
\item{localhost}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)}
\item{udmodel}{UDpipe model to use}
\item{es_super}{Password for write access to ElasticSearch}
\item{cores}{Number of cores to use for parallel processing, defaults to detectCores() (all cores available)}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
\item{file}{Filename for output (ud_ is automatically prepended)}
}
\value{
A vector of 1's indicating the success of each update call
@ -33,6 +22,6 @@ A vector of 1's indicating the success of each update call
Elasticizer update function: generate UDpipe output from base text
}
\examples{
ud_update(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores())
ud_update(out, udmodel, ver, file)
}

Loading…
Cancel
Save