actorizer: major fix to ud parsing, changed regex to remove html tags to only include tags with a maximum of 20 characters in them

ud_update: major fix to ud parsing, changed regex to remove html tags to only include tags with a maximum of 20 characters in them
elastic_update: set the minimum break between retries from 10 to 30 seconds
elasticizer: implementation of retries for elasticizer function, 10 retries with a break of 30 seconds in between
master
Erik de Vries 6 years ago
parent b042fdb1e3
commit 54dfb6a8ca

@ -22,7 +22,6 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
}
sentencizer <- function(row, out, udmodel, ids, prefix, postfix, identifier) {
### If no pre or postfixes, match *not nothing* i.e. anything
if (is.na(prefix) || prefix == '') {
prefix = '$^'
@ -32,8 +31,6 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
}
### Also needs fix for empty strings (non-NA)
doc <- out[row,]
print(doc$merged)
print(row)
ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`)) %>%
filter(upos != "PUNCT") # Removing punctuation to get accurate word counts
sentence_count <- length(unique(ud$sentence))
@ -71,12 +68,12 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
str_replace_na(unlist(out$highlight.teaser), replacement = " "),
str_replace_na(unlist(out$highlight.text), replacement = " "),
sep = " ") %>%
# Replacing html tags with whitespaces
str_replace_all("<.*?>", " ") %>%
# Remove html tags, and multiple consequent whitespaces
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all("\\s+"," ")
ids <- fromJSON(ids)
updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = 1))
updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, udmodel = udmodel, mc.cores = detectCores()))
bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add')
bulk <- c(bulk,apply(updates[c(1,8)], 1, bulk_writer, varname='actors', type = 'add'))
return(elastic_update(bulk, es_super = es_super, localhost = localhost))

@ -25,7 +25,7 @@ elastic_update <- function(x, es_super = 'secret', localhost = T) {
, encode = "raw"
, add_headers("Content-Type" = "application/json")
, times = 10
, pause_min = 10
, pause_min = 30
)
httr:::stop_for_status(res)
appData <- httr:::content(res)

@ -18,6 +18,8 @@
#################################### Get data from ElasticSearch ################################
#################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), size = 1024, update = NULL, localhost = F, ...){
retries <- 10 ### Number of retries on error
sleep <- 30 ### Number of seconds between retries
httr::set_config(httr::config(http_version = 0))
if (localhost == F) {
connect(es_port = 443,
@ -39,10 +41,30 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
}
# Get all results - one approach is to use a while loop
if (src == T) {
res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T)
res <- NULL
attempt <- 0
while( is.null(res) && attempt <= retries ) {
if (attempt > 0) {
Sys.sleep(sleep)
}
attempt <- attempt + 1
try(
res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T)
)
}
}
if (src == F) {
res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F)
res <- NULL
attempt <- 0
while( is.null(res) && attempt <= retries ) {
if (attempt > 0) {
Sys.sleep(sleep)
}
attempt <- attempt + 1
try(
res <- Search(index = index, time_scroll="20m",body = query, size = size, raw=T, source = F)
)
}
}
json <- fromJSON(res)
if (json$hits$total == 0) {
@ -57,7 +79,17 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
update(out, localhost = localhost, ...)
}
while(hits != 0){
res <- scroll(json$`_scroll_id`, time_scroll="20m", raw=T)
res <- NULL
attempt <- 0
while( is.null(res) && attempt <= retries ) {
if (attempt > 0) {
Sys.sleep(sleep)
}
attempt <- attempt + 1
try(
res <- scroll(json$`_scroll_id`, time_scroll="20m", raw=T)
)
}
json <- fromJSON(res)
hits <- length(json$hits$hits)
if(hits > 0) {

@ -18,7 +18,7 @@ ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword
str_replace_na(out$`_source.text`, replacement = " "),
sep = " ") %>%
# Remove html tags, and multiple consequent whitespaces
str_replace_all("<.*?>", " ") %>%
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all("\\s+"," ")
par_proc <- function(row, out, udmodel) {
doc <- out[row,]

Loading…
Cancel
Save