bulk_writer: changed handling of single-row dataframe parsing to JSON
elastic_update: changed function to return appData on error instead of printing it
ud_update: changed nested lists to flat lists, and added start and end character positions
#' @param prefix Regex containing prefixes that should be excluded from hits
#' @param postfix Regex containing postfixes that should be excluded from hits
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param udmodel The udpipe model used for parsing every hit
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @return As this is a nested function used within elasticizer, there is no return output
# NOTE(review): this fragment looks like a diff/merge artifact — several statements
# appear twice in a row (once plain, once with a trailing comment), and the two
# copies of the str_sub() call disagree on the source column (doc$highlight vs
# doc$merged). Confirm against version control which copy of each pair is the
# intended one before running this code; duplicates are flagged inline below.
min<-min(ud$start[ud$sentence_id==id])# Get start position of sentence (first ud token with this sentence_id)
max<-max(ud$end[ud$sentence_id==id])# NOTE(review): duplicate of the next line — diff artifact?
max<-max(ud$end[ud$sentence_id==id])# Get end position of sentence
split<-markers[markers%in%seq(min,max,1)]# NOTE(review): duplicate of the next line — diff artifact?
split<-markers[markers%in%seq(min,max,1)]# Get markers in sentence
max<-max+(length(split)*nchar(identifier))# NOTE(review): duplicate of the next line — diff artifact?
max<-max+(length(split)*nchar(identifier))# Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
sentence<-str_sub(doc$highlight,min,max)# NOTE(review): conflicts with the next line (doc$highlight vs doc$merged) — presumably the doc$merged version is current; verify
sentence<-str_sub(doc$merged,min,max)# Extract sentence from text
# Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
if (!str_detect(sentence,paste0(regex_identifier,postfix))&&!str_detect(sentence,paste0(prefix,regex_identifier))){
if (!str_detect(sentence,paste0(regex_identifier,postfix))&&!str_detect(sentence,paste0(prefix,regex_identifier))){# NOTE(review): duplicate of the previous line — as written this opens a second, nested if with a second unmatched brace; likely a diff artifact
# ud <- as.data.frame(udpipe(udmodel, x = doc$merged, parser = "none", doc_id = doc$`_id`))
markers<-doc$markers[[1]][,'start']# Extract list of markers
markers<-doc$markers[[1]][,'start']# NOTE(review): duplicate of the previous line — diff artifact?
# Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
# ## The exception below is only valid for the UK, where the original UDPipe output misses a dot at the end of the article, but the actor output does not
# ## (UK output is older than actor output, should be updated)
# if (length(ud_org$sentence_id) == length(ud$sentence_id)-1) {
# ud <- ud[-length(ud$sentence_id),]
# }
# if (length(ud_org$sentence_id) == length(ud$sentence_id)) {
# ud <- bind_cols(ud_org, sentence = ud$sentence, token = ud$token, doc_id = ud$doc_id, actor = ud$actor)
# } else {
# err = T
# print(paste0('ud_org and ud_actor not the same length for id ', doc$`_id`))
# print(length(ud_org$sentence_id))
# print(length(ud$sentence_id))
# }
# NOTE(review): the data.frame() call below is not closed within this fragment —
# it continues past the visible range.
return(
data.frame(ud,# Sentence id, start and end position for actor sentences
actor_start=I(list(actor_start)),# List of actor ud token start positions (I() keeps the list as a single list-column cell)
actor_end=I(list(actor_end)),# List of actor ud token end positions
occ=length(unique(actor_sentences)),# Number of sentences in which actor occurs
prom=length(unique(actor_sentences))/sentence_count,# Relative prominence of actor in article (number of occurences/total # sentences)
rel_first=1-(min(actor_sentences)/sentence_count),# Relative position of first occurence at sentence level
first=min(actor_sentences),# First sentence in which actor is mentioned