elasticizer: updated dump handling to create a dump for every batch, instead of one big file at the end
out_parser: streamlined code
query_gen_actors: only include relevant fields
ud_update: changed function parameters to work with elasticizer dump function
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @param es_super Password for write access to ElasticSearch
#' @param cores Number of cores to use for parallel processing; defaults to all available cores
#' @return As this is a nested function used within elasticizer, there is no return output
#' @return As this is a nested function used within elasticizer, there is no return output
max<-max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers))# Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
sentence<-paste0(' ',str_sub(doc$merged,min,max),' ')# Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "("))
# Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
if (!str_detect(sentence,paste0(post_tags_regex,'(',postfix,')'))&&!str_detect(sentence,paste0('(',prefix,')',pre_tags_regex))){
markers<-doc$markers[[1]][,'start']# Extract list of markers
unnest(cols=colnames(.))
# Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
rows<-unlist(lapply(markers,ranger,ud=ud))
# Setting up an actor variable
ud$actor<-F
ud$actor[rows]<-T
sentence_count<-max(ud$sentence_id)# Number of sentences in article
sentences<-ud%>%
actor_sentences<-unique(ud$sentence_id[ud$actor])# Sentence ids of sentences mentioning actor
#' Elasticizer update function: generate UDpipe output from base text
#' Generate UDpipe output from base text
#'
#'
#' Elasticizer update function: generate UDpipe output from base text
#' Generate UDpipe output from base text
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param file Filename of file to read in, also used for generating output file name
#' @param udmodel UDpipe model to use
#' @param wd Working directory where *file*s can be found
#' @param ud_file Filename of udpipe model to use, should be in *wd*
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @param file Filename for output (ud_ is automatically prepended)
#' @return A vector of 1's indicating the success of each update call
#' @return A vector of 1's indicating the success of each update call
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')}
\item{cores}{Number of cores to use for parallel processing; defaults to all available cores}
\item{identifier}{String used to mark highlights. Should be a lowercase string}
\item{identifier}{String used to mark highlights. Should be a lowercase string}
\title{Elasticizer update function: generate UDpipe output from base text}
\title{Generate UDpipe output from base text}
\usage{
\usage{
ud_update(out, udmodel, ver)
ud_update(file, wd, ud_file, ver)
}
}
\arguments{
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
\item{file}{Filename of file to read in, also used for generating output file name}
\item{udmodel}{UDpipe model to use}
\item{wd}{Working directory where *file*s can be found}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')}
\item{ud_file}{Filename of udpipe model to use, should be in *wd*}
\item{file}{Filename for output (ud_ is automatically prepended)}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')}
}
}
\value{
\value{
A vector of 1's indicating the success of each update call
A vector of 1's indicating the success of each update call
}
}
\description{
\description{
Elasticizer update function: generate UDpipe output from base text