Compare commits

199 Commits

Author SHA1 Message Date
Erik de Vries bbec8f5547 fix in package version check
1 week ago
Erik de Vries e3c8d04984 update
1 year ago
Erik de Vries 0f7b1ee537 Add single_party param
2 years ago
Erik de Vries 5c80d82828 reintroduced certificate checks, linux01 certs work again
2 years ago
Erik de Vries fcdffb6f58 removed default_field, so that all text fields are queried by default (this also includes any coder comments!)
2 years ago
Erik de Vries 9ae2866c41 remove default user
2 years ago
Erik de Vries b130f9c313 added es_user parameter
2 years ago
Erik de Vries 3f268bbf06 Temporarily disable SSL verification
2 years ago
Erik de Vries 2944039f73 test
3 years ago
Erik de Vries 0b17555d99 sent_merger: Correctly add party metadata for _mfsa aggregations
3 years ago
Erik de Vries 108372452c sent_merger: Correctly add party metadata for _mfsa aggregations
3 years ago
Erik de Vries 16d02a055d sent_merger: Updated sentiment aggregation procedure. Now a dedicated actors_final.csv file is used as source of partyIds for individual actors, instead of the (deprecated) [partyId]_a ids that were previously provided as a result of the actor searches, or the (also deprecated) actor metadata provided in the ES actors database.
3 years ago
Erik de Vries 8875630235 fixed actor metadata generation as well, because the same actorId might occur multiple times in a sentence, if that actor has multiple functions during the same period.
3 years ago
Erik de Vries 9419d6dc08 Fixed incorrect mfs and mfsa aggregations. Previously multiple party/actor mentions in the same sentence (e.g. both a *_f and *_s mention) would all be taken into account separately, while the sentence should only be considered once
3 years ago
Erik de Vries 7703a8cd5b query_gen_actors: removed country argument, now reading country directly from actor data
4 years ago
Erik de Vries 64a48e5977 sent_merger: fixed bug with publication_date and grouper()
4 years ago
Erik de Vries f6dfc6711b minor fix
4 years ago
Erik de Vries 09fd8d0cb2 removed some unused aggregations
4 years ago
Erik de Vries 17d49f07c0 updated namespace and docs
4 years ago
Erik de Vries 8ff4097304 renamed actor_merger to sent_merger and implemented fixes to work with sentiment data frames without actor ids
4 years ago
Erik de Vries a37fc0410d removed sent_sum_pos/neg
4 years ago
Erik de Vries 153c54b376 reintroduced arousal, but should be warned that arousal performance is not directly evaluated
4 years ago
Erik de Vries cdc78039ed removing text-level output from sentencizer, and optimizing storage by using factors
4 years ago
Erik de Vries 523d86799c removed arousal measures
4 years ago
Erik de Vries 4a0f2206fd removed multicore support, added parameters for dfm_gen
4 years ago
Your Name 274c9179cb remove meta_file argument
4 years ago
Your Name 6e0e693d4e lemma_writer: removed meta csv code
4 years ago
Your Name 4fd9222a2d lemma_writer: updated to write metadata csv when dumping documents in ud format
4 years ago
Your Name 955f034e6a actor_merger: changed computation of arousal, and removed uninformative variables
4 years ago
Your Name 3cdb68b196 out_parser: updated fncols function
4 years ago
Your Name dc40fbbb19 elasticizer: update rbindlist implementation
4 years ago
Your Name 18d47762d2 actor_merger: overhaul to include cutoffs at sentence level as intended, also included options to generate sentiment for text only (don't provide actors_meta or actor_groups)
4 years ago
Your Name 74909ca3a0 sentencizer: removed text sentiment computation from script, because of incorrect implementation
4 years ago
Your Name c99ac23bb5 actor_merger: fixed absence of publication_date in some cases
4 years ago
Your Name cc7fa5bffa actor_merger: added aggregations of all individual actors and all party mentions in an article
4 years ago
Your Name d9d578c06a actor_merger: mult fix
4 years ago
Your Name 771145faf7 actor_merger: added mult='first' to metadata join for parties_actors to deal with duplicate partyIds (see 50Plus, Conservatives and Labour)
4 years ago
Your Name 1c14646e8f actor_merger: dont deselect sent_words and sent_sum columns
4 years ago
Your Name 9bd382f955 actor_merger: fix to generate bogus sentiment columns
4 years ago
Your Name b7f1afddd1 actor_merger: total rewrite based on data.table for performance reasons. Added some exceptions due to non-existing partyIds that some individual actors have in the actor database
4 years ago
Your Name 2c8a88f9a0 elasticizer: switched from bind_rows to rbindlist for composing result
4 years ago
Your Name 559199bb97 sentencizer: totally removed sent_lemmas field
4 years ago
Your Name 36f2b341a8 sentencizer: removed derived output from function
4 years ago
Your Name 80ec0be1f8 actorizer: updated to account for token start offset in udpipe output. Sometimes, the first token in an article doesn't start at character position 1 (or 2 if the article starts with a whitespace), but at position 16 and possibly other positions.
4 years ago
Your Name 336567732c elastic_update: added more debug output
4 years ago
Your Name df7631b9f1 sentencizer: Changed output, removed lemma list and added separate positive and negative sentiment sums
4 years ago
Your Name ecdb5be3b4 actorizer: moved some code
4 years ago
Your Name 50f33e78d7 DESCRIPTION: updated
4 years ago
Your Name 69d4b6f5b0 actorizer: updated to data.table for conditional joins
4 years ago
Your Name 085855908c query_gen_actors: switched from Minister to Min
4 years ago
Your Name b406304c80 actorizer: Removed nested parallelization function
4 years ago
Your Name 5de4e1488c estimator, modelizer, preproc: Removed experimental we-vector support, and disabled inefficiently implemented preproc.R
4 years ago
Your Name 77eb51a1bf actorizer: totally revamped way of finding actors
4 years ago
Your Name 0e593075ee query_gen_actors: only retrieve ud field from source
4 years ago
Your Name 6eb405f8bd merger: selecting only relevant columns
4 years ago
Your Name 38ff4dcbf0 ud_update: small fix to file naming
4 years ago
Your Name 4b4d860235 class_update: remove dfm_gen multicore option
4 years ago
Your Name 5d99ec9509 elasticizer: added option to dump data frames to rds files
4 years ago
Your Name aa6587b204 dupe_detect: fix for quotation marks
4 years ago
Your Name 2a220ded5d dupe_detect: fix to query string for multi-word doctype names
4 years ago
Your Name 5bd36dcb44 dupe_detect: Changed query from json to query_string style, and added filter for already detected duplicates
4 years ago
Your Name e499d70671 actor_merger: added ungroup() calls at the start and end of function, to speed up processing
4 years ago
Your Name 8634d549a3 sentencizer: updates to collect sentence word counts and number of sentences also when no sent_dict is provided
4 years ago
Your Name 61e0581595 actor_merger: removed debug line
4 years ago
Your Name 11bf71c7dd fixes for removal of actor_fetcher function
4 years ago
Your Name f022312485 actor_merger: added function for generating actor-document data frames
4 years ago
Your Name 4e867214dd sentencizer: commented code
4 years ago
Your Name ec8afc4990 sentencizer: fixed actorsDetail coding error
4 years ago
Your Name 9ccfd2952e sentencizer: minor updates
4 years ago
Your Name 98325bde8f sentencizer: added new function for sentiment coding and actor collection
4 years ago
Your Name 7f958bbc11 actor_fetcher: small fixes
4 years ago
Your Name 8eedec8bb5 actor_fetcher: added option for using dictionaries with just lemmas, besides the option of using lemma_upos dictionaries
4 years ago
Your Name 057d225a7a actor_fetcher: Allow generation of actor df containing only specified actor ids and aggregations
4 years ago
Your Name 9eae486a80 separated data preprocessing routines
5 years ago
Your Name a3b6e19646 revised modeling pipeline:
5 years ago
Your Name e76a914dd2 actor_fetcher: Updated to tidyr 1.0.0, no longer using preserve, slightly different approach to keeping ids_list, and not removing actorsDetail anymore because it does not exist
5 years ago
Your Name a01a53f105 class_update: added cores parameter for multicore processing of sources when using lemmas
5 years ago
Your Name d9f936c566 modelizer: tf-idf application updated, final model now also includes idf values from training set, explicitly setting positive category in binary classification for confusion matrices, minor code fixes
5 years ago
Erik de Vries 06bfec71bc lemma_writer: unlist lemmas before writing
5 years ago
Erik de Vries a83ee5dfd0 lemma_writer: update to write lemma instead of full document text
5 years ago
Erik de Vries e594185719 dfm_gen: set default cores to 1
5 years ago
Erik de Vries 889e7e92af lemma_writer: updated to provide support for writing raw documents to individual files using utf-8 encoding
5 years ago
Erik de Vries 115297f597 actor_aggregation,aggregator,aggregator_elastic: moved out of package directory to Old
5 years ago
Erik de Vries 3fcbbd1f1f actor_fetch: fixed error where source.ud would not exist
5 years ago
Erik de Vries 674ef09e10 query_gen_actors: added junior minister check to if statement
5 years ago
Erik de Vries 853c117daf actor_fetcher: change in code to keep original actorid lists in output
5 years ago
Erik de Vries bf3d11ffe0 query_gen_actors: various bugfixes and changes
5 years ago
Erik de Vries 99af1427f0 query_gen_actors: fixed scandinavian query generation
5 years ago
Erik de Vries e49a4ae93e query_gen_actors: fixed problem with too many brackets in query
5 years ago
Erik de Vries 060751237b actorizer, out_parser: switched from mclapply to future_lapply and removed windows-specific code from out_parser
5 years ago
Erik de Vries d0601d2aa7 actor_fetcher: added minimum verbosity to identify cases in which an actor is present without a party mention
5 years ago
Erik de Vries 82ef165c5f actor_fetcher: quick fix
5 years ago
Erik de Vries 9e433ecf9e actor_fetcher: added handling of exception where all actorsids related to a party are individual actors
5 years ago
Erik de Vries 526270900c actor_fetcher: integrated party merging into actor_fetcher in what hopefully is the most efficient way
5 years ago
Erik de Vries 84df9658ff actor_fetcher: added lemma output when validating, to detect most problematic lemmas
5 years ago
Erik de Vries 499ee74f0d actor_fetcher: fixed code error
5 years ago
Erik de Vries a3e8dcf96e actor_fetcher: switched from binary word sentiment scores to proximity scores (cosine similarity)
5 years ago
Erik de Vries 6f5ace8c52 actor_fetcher: elasticizer batch function to fetch actorsDetail fields from all relevant documents
5 years ago
Erik de Vries edd4b785a5 actor_aggregation: updated to use future package for parallel processing as beta test for switching all parallel processing to future. Also disabled some of the aggregator output to save computation time
5 years ago
Erik de Vries f8bc53006d actor_aggregation: added sentiment analysis support for generating aggregations
5 years ago
Erik de Vries d3d4045f1c actor_aggregation: added sentence count to output, and changed occurences to count instead of mean, changed prom and rel_first to prom_art and rel_first_art, changed output filename to include function
5 years ago
Erik de Vries 176a8f6de4 elasticizer: added additional verbosity on errors
5 years ago
Erik de Vries d420b02c20 elasticizer: Added more verbosity to investigate error handling
5 years ago
Erik de Vries 48b589dda0 query_gen_actors: reset to original state
5 years ago
Erik de Vries 7a01a7f18d query_gen_actors: temporary update for fixing broken shit
5 years ago
Erik de Vries 45da9dd929 aggregator_elastic: revert to single-core lapply, due to sendMaster errors
5 years ago
Erik de Vries f8e4111e70 aggregator_elastic: correct partyid implementation
5 years ago
Erik de Vries c047a4a1db aggregator_elastic: explicit reference to aggregator function
5 years ago
Erik de Vries 0d81d6fc7a added aggregator and aggregator_elastic functions for aggregating and storing article level actor aggregations
5 years ago
Erik de Vries 2281d11a68 actor_aggregation: fixed filenaming of .Rds files
5 years ago
Erik de Vries d9f28a46d8 actor_aggregation: small fixes to code
5 years ago
Erik de Vries a29d04dacd actorizer: fixed handling of empty results due to regex filtering
5 years ago
Erik de Vries 8e920f5f37 elasticizer: removed idiotic 15min sleep time after 500 batches
5 years ago
Erik de Vries a11d7728ea actor_aggregation: only aggregate scores on non-junk articles
5 years ago
Erik de Vries 54a70c47a0 actor_aggregation: removed timeout for parallel processing, requires fix in elasticizer (cannot recycle the same connection)
5 years ago
Erik de Vries 58fce4d560 actor_aggregation: added randomized short sleep, to allow for parallel execution
5 years ago
Erik de Vries e3b26c0be3 actor_aggregation: Added function to generate aggregate actor measures at daily, weekly, monthly and yearly level
5 years ago
Erik de Vries 28989f2bc4 dfm_gen: yet another fix for codes
5 years ago
Erik de Vries 0757b6bf8b dfm_gen: re-added codes variable
5 years ago
Erik de Vries 2fc48cc2f7 dfm_gen: fixed absence of out$codes field
5 years ago
Erik de Vries b249ff22de dfm_gen.R: fixed junk mutation
5 years ago
Erik de Vries 0d05765ca7 dfm_gen: removed last remains of summer sample exceptions
5 years ago
Erik de Vries e199b23227 dfm_gen: removed exceptions for NO summer codes
5 years ago
Erik de Vries fbd525dc2e modelizer: updated outer cross validation procedure to output raw prediction and true values, instead of processed and aggregated confusion matrix results
5 years ago
Erik de Vries 6a94bc3ed8 query_gen_actors: removed quotation marks from Minister search part
5 years ago
Erik de Vries 8d19333e59 query_gen_actors: changed script order for belgium exceptions
5 years ago
Erik de Vries 3bfe61e425 query_gen_actors: fixed implementation of Belgian exceptions
5 years ago
Erik de Vries 81697345cb modelizer: removed breaking code
5 years ago
Erik de Vries 9ca952ca89 elastic_update: removed wait_for from url
5 years ago
Erik de Vries 8051a81b66 actorizer, dfm_gen, modelizer, out_parser: replaced all instances of detectCores by cores parameter (which defaults to detectCores)
5 years ago
Erik de Vries ac37d836f5 elasticizer: added scroll_clear to null hits as well
5 years ago
Erik de Vries 75623856f7 elasticizer: updated scroll_clear to use conn object
5 years ago
Erik de Vries c2d666c81d bogus commit
5 years ago
Erik de Vries e34460bf0f elasticizer: clear scroll context when finishing query
5 years ago
Erik de Vries 9bd526fee0 elasticizer: fixed compatibility issues with elastic v1.0.0
5 years ago
Erik de Vries f2312f65d5 elasticizer: update to account for syntax change in newer package versions
5 years ago
Erik de Vries f6006eb9ba actorizer: simplified pre/postfix check, only for NA, replace empty strings by NA beforehand
5 years ago
Erik de Vries 298099a4e6 actorizer: fix to deal with empty updates (ie dont do an update)
5 years ago
Erik de Vries 6961c0b866 query_gen_actors: updated actorid filter to use the keyword subfield
5 years ago
Erik de Vries 703b5e59a4 actorizer: fixed exceptionizer by adding whitespace before and after sentence, which is necessary because of negative regex (match anything before or after the highlight string that is NOT x actually requires something to be in front or after)
5 years ago
Erik de Vries 593d2de6e2 actorizer: add pre_tags and post_tags to argument list
5 years ago
Erik de Vries a1b6c6a7cb actorizer, query_gen_actors: revamped actor searches entirely
5 years ago
Erik de Vries 88fc4ec53c dfm_gen: changed out_parser call to mamlr:::out_parser
6 years ago
Erik de Vries 90fdbcc982 out_parser: parallelized when not in windoze
6 years ago
Erik de Vries 6414f759bd actorizer: parallelized calculation of marker positions
6 years ago
Erik de Vries 522c872dba out_parser: moved cleaning regex to end of pipeline, to prevent collissions with other (mandatory) regex cleaning
6 years ago
Erik de Vries 5b9793cd8c actorizer: removed nested mclapply
6 years ago
Erik de Vries 1a4ba19546 actorizer: Removed udmodel dependencies, commented code, changed nested lists to flat lists
6 years ago
Erik de Vries 3abc3056e0 actorizer: fix to columns selected for actors variable, removed udmodel requirement
6 years ago
Erik de Vries 41c86ea116 actorizer, ud_update: Updated ud parsing and actorizer to work based on character positions. This code is used for local testing
6 years ago
Erik de Vries eae1a22609 actorizer: update to use '|||' as highlight indicator, and set up ud output merging accordingly
6 years ago
Erik de Vries 5665b6d622 actorizer: more fixes to punctuation
6 years ago
Erik de Vries cd05733648 actorizer: Additional fix for missing punctuation (see previous commit)
6 years ago
Erik de Vries 09732a1b5a actorizer: quick fix for problem where original UK UD output does not have a dot at the end of the document, but the actor output does (old vs new parsing)
6 years ago
Erik de Vries 835d2332bc actorizer: now uses the original udpipe output for sentence and token ids. When the actorized and original udpipe output do not have the same number of rows, it prints an error and sets err to TRUE in actorDetails
6 years ago
Erik de Vries e70b6ccf7a actorizer: fixed sentence_count and out_parser calls
6 years ago
Erik de Vries 9b0ac775af class_update: add ver variable to set version for class updated articles
6 years ago
Erik de Vries 85306007f4 class_update: added words and clean parameters, in addition to text parameter, to be able to set data preprocessing exactly the same as in the trained model
6 years ago
Erik de Vries e110780ad5 merger: idiotic fix for a non-problem, see comment on line 32
6 years ago
Erik de Vries ce5f812252 dfm_gen, merger: Added option for generating lemma_upos hybrids for merged field
6 years ago
Erik de Vries 386ac42aee lemma_writer: new function to write raw lemma's (without interpunction) to text file. Is structured as elasticizer update function (despite not updating anything on the server)
6 years ago
Erik de Vries 4407a99774 actorizer: fix to get actual number of sentence occurences of actor
6 years ago
Erik de Vries 96e869fa6b actorizer: previous commit was wrong, only add is an option, removed type variable
6 years ago
Erik de Vries 98219c807c actorizer: Added type option, to choose between setting or adding to the actor variables, defaults to add (should normally not be changed)
6 years ago
Erik de Vries e3b57ed9e3 actorizer: added clean = F to have the exact same behavior in ud_update and actorizer
6 years ago
Erik de Vries 7218f6b8d0 dupe_detect: fixed error on no duplicates
6 years ago
Erik de Vries b9be372543 dupe_detect: fix to get correct colnames from simil (disable stringsAsFactors and convert col values to numeric)
6 years ago
Erik de Vries 1955692346 dfm_gen, out_parser: updated documentation
6 years ago
Erik de Vries 34531b0da8 out_parser: added option to clean output using regex to remove numbers and non-words
6 years ago
Erik de Vries 5851c56369 query_string: updated check for fields value
6 years ago
Erik de Vries 4f8b1f2024 elasticizer: renamed size parameter to batch_size, created max_batch parameter to limit the number of results returned
6 years ago
Erik de Vries d0e9bf565b dupe_detect: Reset the _delete value to 1
6 years ago
Erik de Vries ea8cfb071f dupe_detect: updated _delete var to be 2 when delete is true
6 years ago
Erik de Vries 0a3bdb630b actorizer, dfm_gen, ud_update: unified output parsing from _source and highlight fields into a single function (out_parser)
6 years ago
Erik de Vries 9e5a1e3354 ud_update: removed mc.preschedule = F
6 years ago
Erik de Vries c7560d7e32 ud_update: Removed . at end of text, and added mc.preschedule = F for testing
6 years ago
Erik de Vries 37df81b8ff ud_update: fixed merged output field to always contain an (extra) dot (period) at the end of the document
6 years ago
Erik de Vries c32c9e5ad3 ud_update: fix to deal with non-existing column names
6 years ago
Erik de Vries 8ffbddc073 actorizer, ud_update: implemented 'ver' variable for keeping track of updates
6 years ago
Erik de Vries ae23456736 actorizer, ud_update: Updated merging of document fields to properly deal with missing punctuation at the end of fields (e.g. a title without punctuation at the end of the string)
6 years ago
Erik de Vries 9f3418ef37 class_update; dfm_gen; merger: updated functions to accept text parameter for both old style 'lemmas' and new style 'ud'
6 years ago
Erik de Vries 85aab558e0 bulk_writer: added clause to varname==ud update to also remove the tokens variable from source
6 years ago
Erik de Vries 581e7b2929 DESCRIPTION: added SparseM as required package
6 years ago
Erik de Vries 54dfb6a8ca actorizer: major fix to ud parsing, changed regex to remove html tags to only include tags with a maximum of 20 characters in them
6 years ago
Erik de Vries b042fdb1e3 Merge branch 'master' of https://git.thijsdevries.net/edevries/mamlr
6 years ago
Erik de Vries 8caf53b90a actorizer: switched to single core processing for debugging
6 years ago
Erik de Vries e5c87cf69d actorizer: more debug prints
6 years ago
Erik de Vries c63409238b actorizer: print row numbers for debugging
6 years ago
Erik de Vries 39005c7518 elasticizer: Updated bulk size to 1024 (a power of 2) and set a timeout of 900s every 500000 updates
6 years ago
Erik de Vries a3c3651c79 elasticizer: updated scroll time to be longer than the timeouts every 200000 articles (so 20m scroll time, 900s (15m) timeout)
6 years ago
Erik de Vries 4ad5357e15 elasticizer: Added 900s timeout after every batch of 200000 articles when updating, to allow ES to do some segment merges (and clean up disk space)
6 years ago
Erik de Vries a5ba00146f modelizer: fixed error when only one class is predicted for junk classification (borderline case)
6 years ago
Erik de Vries a13d86b92d modelizer: added some more debug output
6 years ago
Erik de Vries 23658ce51a test
6 years ago
Erik de Vries 17cf6d04e9 modelizer: debug update
6 years ago
Erik de Vries 7544e5323f modelizer: update to allow tf both as count (for naive bayes), and as proportion (for other machine learning algorithms)
6 years ago
Erik de Vries 5f5e4a03c8 modelizer: Changed tf-idf weighting from absolute tf count to proportional (normalized) tf! Also added initial support for neural networks
6 years ago
Erik de Vries 34a6adf64e changed udpipe output variable from tokens to ud
6 years ago
Erik de Vries 061da17c2a ud_update: Added function to lemmatize documents
6 years ago

.gitignore

@@ -1,3 +1,5 @@
.Rproj.user
.Rhistory
.RData
*.RData
*.Rds

@@ -9,11 +9,16 @@ Depends: R (>= 3.3.1),
parallel,
tidyverse,
quanteda,
quanteda.textmodels,
httr,
caret,
e1071,
udpipe
udpipe,
SparseM,
future,
future.apply,
data.table
License: Copyright Erik de Vries
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
RoxygenNote: 7.3.1

@@ -1,12 +1,24 @@
# Generated by roxygen2: do not edit by hand
export(actorizer)
export(bulk_writer)
export(class_update)
export(cv_generator)
export(dfm_gen)
export(dupe_detect)
export(elastic_update)
export(elasticizer)
export(estimator)
export(feat_select)
export(lemma_writer)
export(merger)
export(metric_gen)
export(modelizer)
export(modelizer_old)
export(out_parser)
export(preproc)
export(query_gen_actors)
export(query_string)
export(sent_merger)
export(sentencizer)
export(ud_update)

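The DESCRIPTION and NAMESPACE changes above add new hard dependencies (quanteda.textmodels, SparseM, future, future.apply, data.table) and new exports such as sent_merger and sentencizer. A minimal installation sketch, assuming the remotes package and the repository URL taken from the merge commit in the log above:

install.packages("remotes")
remotes::install_git("https://git.thijsdevries.net/edevries/mamlr")
library(mamlr)
ls("package:mamlr")  # should list the exports declared in the NAMESPACE above
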
@@ -0,0 +1,278 @@
### Notes:
# Do you want to search for either one actorid OR the other, or both occurring in the same document?
# Do you want to keep only the occurrences of the actorids you are searching for, or all actor occurrences in the hits?
# Search by actorId, then aggregate by month
# When actorId starts with P_, define what hits you want to get (short, full, actor); if more than one, aggregate properly
# Develop query generator for specific actors (i.e. combine actorId with start and end dates)
#' Generate aggregated actor measures from raw data
#'
#' Generate aggregated actor measures from raw data
#' @param row The row of the actors data frame used for aggregation
#' @param actors The data frame containing actor data
#' @param es_pwd The password for read access to ES
#' @param localhost Boolean indicating if the script is running locally or not
#' @param default_operator String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR
#' @return No return value, data per actor is saved in an RDS file
#' @export
#' @examples
#' actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR', sent_dict = NULL, cores = detectCores()) {
### Functions
aggregator <- function (id, duplicates) {
df <- duplicates %>%
filter(`_id` == id) %>%
group_by(`_id`) %>%
summarise(
`_source.doctype` = first(`_source.doctype`),
`_source.publication_date` = first(`_source.publication_date`),
# actor_end = list(sort(unique(unlist(actor_end)))),
prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
sentence_id = list(sort(unique(unlist(sentence_id)))),
rel_first = list(max(unlist(rel_first))),
# sentence_end = list(sort(unique(unlist(sentence_end)))),
# actor_start = list(sort(unique(unlist(actor_start)))),
ids = list(unique(unlist(ids))),
# sentence_start = list(sort(unique(unlist(sentence_start)))),
occ = list(length(unique(unlist(sentence_id)))),
first = list(min(unlist(sentence_id)))
)
return(df)
}
### Calculate sentiment scores for each actor-document
par_sent <- function(row, out, sent_dict) {
out_row <- out[row,]
### Contains sentiment per sentence for whole article
sentiment_ud <- out_row$`_source.ud`[[1]] %>%
select(-one_of('exists')) %>%
unnest() %>%
filter(upos != 'PUNCT') %>% # For getting proper word counts
mutate(V1 = str_c(lemma,'_',upos)) %>%
left_join(sent_dict, by = 'V1') %>%
### Setting binary sentiment as unit of analysis
mutate(V2 = V3) %>%
group_by(sentence_id) %>%
mutate(
V2 = case_when(
is.na(V2) == T ~ 0,
TRUE ~ V2
)
) %>%
summarise(sent_sum = sum(V2),
words = length(lemma),
sent_words = length(na.omit(V3))) %>%
mutate(
sent = sent_sum/words,
arousal = sent_words/words
)
out_row <- select(out_row, -`_source.ud`)
### Contains sentiment per sentence for actor
actor_tone <- filter(sentiment_ud, sentence_id %in% unlist(out_row$sentence_id))
### Aggregated sentiment per actor (over all sentences containing actor)
actor <- summarise(actor_tone,
sent = sum(sent_sum)/sum(words),
sent_sum = sum(sent_sum),
sent_words = sum(sent_words),
words = sum(words),
arousal = sum(sent_words)/sum(words)
)
### Aggregated sentiment per article (over all sentences in article)
text <- summarise(sentiment_ud,
sent = sum(sent_sum)/sum(words),
sent_sum = sum(sent_sum),
sent_words = sum(sent_words),
words = sum(words),
arousal = sum(sent_words)/sum(words)
)
return(cbind(out_row,data.frame(actor = actor, text = text)))
}
### Creating aggregate measures at daily, weekly, monthly and yearly level
grouper <- function(level, out, actorids, sent = F) {
by_newspaper <- out %>%
mutate(
sentence_count = round(unlist(occ)/unlist(prom))
) %>%
group_by_at(vars(level, `_source.doctype`)) %>%
summarise(
occ = sum(unlist(occ)),
prom_art = mean(unlist(prom)),
rel_first_art = mean(unlist(rel_first)),
first = mean(unlist(first)),
sentence_count = sum(sentence_count),
articles = length(`_id`),
level = level
) %>%
ungroup()
aggregate <- out %>%
mutate(
sentence_count = round(unlist(occ)/unlist(prom))
) %>%
group_by_at(vars(level)) %>%
summarise(
occ = sum(unlist(occ)),
prom_art = mean(unlist(prom)),
rel_first_art = mean(unlist(rel_first)),
first = mean(unlist(first)),
sentence_count = sum(sentence_count),
articles = length(`_id`),
`_source.doctype` = 'agg',
level = level
) %>%
ungroup()
if(sent == T) {
by_newspaper_sent <- out %>%
group_by_at(vars(level, `_source.doctype`)) %>%
summarise(
actor.sent = mean(actor.sent),
actor.sent_sum = sum(actor.sent_sum),
actor.sent_words = sum(actor.sent_words),
actor.words = sum(actor.words),
actor.arousal = mean(actor.arousal),
text.sent = mean(text.sent),
text.sent_sum = sum(text.sent_sum),
text.sent_words = sum(text.sent_words),
text.words = sum(text.words),
text.arousal = mean(text.arousal)
) %>%
ungroup() %>%
select(-level,-`_source.doctype`)
aggregate_sent <- out %>%
group_by_at(vars(level)) %>%
summarise(
actor.sent = mean(actor.sent),
actor.sent_sum = sum(actor.sent_sum),
actor.sent_words = sum(actor.sent_words),
actor.words = sum(actor.words),
actor.arousal = mean(actor.arousal),
text.sent = mean(text.sent),
text.sent_sum = sum(text.sent_sum),
text.sent_words = sum(text.sent_words),
text.words = sum(text.words),
text.arousal = mean(text.arousal)
) %>%
ungroup() %>%
select(-level)
aggregate <- bind_cols(aggregate,aggregate_sent)
by_newspaper <- bind_cols(by_newspaper, by_newspaper_sent)
}
output <- bind_rows(by_newspaper, aggregate) %>%
bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),]) %>%
select(
-`_index`,
-`_type`,
-`_score`,
-`_id`,
-contains('Search'),
-contains('not')
)
colnames(output) <- gsub("_source.",'', colnames(output))
return(output)
}
###########################################################################################
plan(multiprocess, workers = cores)
if (is.null(sent_dict) == F) {
fields <- c('ud','computerCodes.actorsDetail', 'doctype', 'publication_date')
} else {
fields <- c('computerCodes.actorsDetail', 'doctype', 'publication_date')
}
actor <- actors[row,]
if (actor$`_source.function` == "Party"){
years = seq(2000,2019,1)
startDate <- '2000'
endDate <- '2019'
} else {
years = c(0)
startDate <- actor$`_source.startDate`
endDate <- actor$`_source.endDate`
}
if (actor$`_source.function` == 'Party' && actor$party_only == T) {
actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'))
} else if (actor$`_source.function` == 'Party') {
actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'), paste0(actor$`_source.partyId`,'_a'))
actor$party_only <- F
} else {
actorids <- actor$`_source.actorId`
actor$party_only <- NULL
}
actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {
if (year > 0) {
query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31]')
} else {
query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,']')
}
### Temporary exception for UK missing junk coding
if (actor$`_source.country` != 'uk') {
query <- paste0(query,' && computerCodes.junk:0')
}
out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query),
fields = fields, default_operator = default_operator),
localhost = localhost,
es_pwd = es_pwd)
if (length(out$`_id`) > 0 ) {
if (is.null(sent_dict) == F) {
out_ud <- out %>% select(`_id`,`_source.ud`)
out <- out %>% select(-`_source.ud`)
}
### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
out <- out %>%
unnest(`_source.computerCodes.actorsDetail`, .preserve = colnames(.)) %>%
unnest(ids, .preserve = colnames(.)) %>%
filter(ids1 %in% actorids) # %>%
# select(-ends_with('start')) %>%
# select(-ends_with('end')) %>%
# select(-starts_with('ids'))
### Only if there are more rows than articles, recalculate
if (length(unique(out$`_id`)) != length(out$`_id`)) {
duplicates <- out[(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),]
actor_single <- out[!(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),]
art_id <- unique(duplicates$`_id`)
dupe_merged <- bind_rows(future_lapply(art_id, aggregator, duplicates = duplicates))
out <- bind_rows(dupe_merged, actor_single)
}
if (is.null(sent_dict) == F) {
out <- left_join(out, out_ud, by = '_id')
out <- bind_rows(future_lapply(seq(1,nrow(out),1),par_sent, out = out, sent_dict = sent_dict))
}
### Creating date grouping variables
out <- out %>%
mutate(
year = strftime(`_source.publication_date`, format = '%Y'),
yearmonth = strftime(out$`_source.publication_date`, format = '%Y%m'),
yearmonthday = strftime(out$`_source.publication_date`, format = '%Y%m%d'),
yearweek = strftime(out$`_source.publication_date`, format = "%Y%V")
) %>%
select(
-`_score`,
-`_index`,
-`_type`,
-`_score`,
-`_source.computerCodes.actorsDetail`,
-ids1
)
levels <- c('year','yearmonth','yearmonthday','yearweek')
aggregate_data <- bind_rows(lapply(levels, grouper, out = out, actorids = actorids, sent = !is.null(sent_dict)))
return(aggregate_data)
} else {
return()
}
}
saveRDS(bind_rows(lapply(years, actor_aggregator, query, actor, actorids, default_operator, localhost, es_pwd)), file = paste0(actor$`_source.country`,'_',paste0(actorids,collapse = ''),actor$`_source.function`,startDate,endDate,'.Rds'))
print(paste0('Done with ',row,'/',nrow(actors),' actors'))
return()
}

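actor_aggregation() works one actor (row) at a time and writes one .Rds file per actor. Per the commit log the file was later moved out of the package directory to Old, so the sketch below assumes the file has been sourced and that an `actors` data frame and a read password `es_pwd` exist (all assumptions):

library(mamlr)                      # loads the package dependencies listed in DESCRIPTION
source('Old/actor_aggregation.R')   # hypothetical path; the commit log says the file was moved to Old
# One call per actor row; each call saves a <country>_<actorids><function><startDate><endDate>.Rds file
lapply(seq_len(nrow(actors)), actor_aggregation,
       actors = actors, es_pwd = es_pwd, localhost = FALSE,
       default_operator = 'OR', sent_dict = NULL, cores = 4)
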
@@ -0,0 +1,188 @@
#' Generate actor data frames (with sentiment) from database
#'
#' Generate actor data frames (with sentiment) from database
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.
#' @param actor_ids Optional vector containing the actor ids to be collected
#' @param cores Number of threads to use for parallel processing
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' actor_fetcher(out, sent_dict = NULL, cores = 1)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, localhost = NULL, validation = F) {
plan(multiprocess, workers = cores)
### Functions
### Calculate sentiment scores for each actor-document
sent_scorer <- function(row, out_row, ud_sent) {
### Contains sentiment per sentence for actor
actor_tone <- filter(ud_sent, sentence_id %in% unlist(out_row[row,]$sentence_id))
### Aggregated sentiment per actor (over all sentences containing actor)
actor <- summarise(actor_tone,
sent = sum(sent_sum)/sum(words),
sent_sum = sum(sent_sum),
sent_words = sum(sent_words),
words = sum(words),
arousal = sum(sent_words)/sum(words)
)
return(cbind(out_row[row,],data.frame(actor = actor)))
}
aggregator <- function (pid, dupe_df) {
### Party ids excluding actors
p_ids <- c(str_c(pid,'_f'),str_c(pid,'_s'))
### Party ids including actors
p_ids_a <- c(p_ids,str_c(pid,'_a'))
summarizer <- function (p_ids, dupe_df, merged_id) {
id <- dupe_df$`_id`[[1]]
dupe_df <- dupe_df %>%
filter(ids %in% p_ids)
if (nrow(dupe_df) > 0) {
return(
dupe_df %>% summarise(
`_id` = first(`_id`),
`_source.doctype` = first(`_source.doctype`),
`_source.publication_date` = first(`_source.publication_date`),
prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
sentence_id = list(sort(unique(unlist(sentence_id)))),
rel_first = list(max(unlist(rel_first))),
ids = merged_id,
occ = list(length(unique(unlist(sentence_id)))),
first = list(min(unlist(sentence_id))),
actor_start = list(sort(unique(unlist(actor_start)))),
actor_end = list(sort(unique(unlist(actor_end)))),
sentence_start = list(sort(unique(unlist(sentence_start)))),
sentence_end = list(sort(unique(unlist(sentence_end))))
)
)
} else {
print(paste0('id:',id))
return(NULL)
}
}
party <- summarizer(p_ids, dupe_df, str_c(pid,'_mfs'))
party_actor <- summarizer(p_ids_a, dupe_df, str_c(pid,'_mfsa'))
return(bind_rows(party,party_actor))
}
par_sent <- function(row, out, sent_dict = NULL) {
out_row <- out[row,]
### Generating sentence-level sentiment scores from ud
if (is.null(sent_dict) == F) {
ud_sent <- out_row$`_source.ud`[[1]] %>%
select(-one_of('exists')) %>%
unnest() %>%
filter(upos != 'PUNCT') # For getting proper word counts
if ("lem_u" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>%
mutate(lem_u = str_c(lemma,'_',upos)) %>%
left_join(sent_dict, by = 'lem_u')
} else if ("lemma" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>%
left_join(sent_dict, by = 'lemma') %>%
mutate(lem_u = lemma)
}
ud_sent <- ud_sent %>%
group_by(sentence_id) %>%
mutate(
prox = case_when(
is.na(prox) == T ~ 0,
TRUE ~ prox
)
) %>%
summarise(sent_sum = sum(prox),
words = length(lemma),
sent_words = sum(prox != 0),
sent_lemmas = list(lem_u[prox != 0])) %>%
mutate(
sent = sent_sum/words,
arousal = sent_words/words
)
out_row <- select(out_row, -`_source.ud`)
}
if (validation == T) {
codes_sent <- filter(ud_sent, sentence_id == out_row$`_source.codes.sentence.id`[1])
return(cbind(out_row, codes = codes_sent))
}
### Unnest out_row to individual actor ids
out_row <- out_row %>%
unnest(`_source.computerCodes.actorsDetail`) %>%
mutate(ids_list = ids) %>%
unnest(ids) %>%
mutate(
pids = str_sub(ids, start = 1, end = -3)
)
if (!is.null(actor_ids)) {
out_row <- filter(out_row, ids %in% actor_ids )
}
### Get list of party ids occurring more than once in the document
pids_table <- table(out_row$pids)
dupe_pids <- names(pids_table[pids_table > 1])%>%
str_subset(pattern = fixed('P_'))
single_pids <- names(pids_table[pids_table <= 1]) %>%
str_subset(pattern = fixed('P_'))
### Data frame containing only duplicate party ids
dupe_df <- out_row %>%
filter(pids %in% dupe_pids)
### Data frame containing only single party ids
single_df <- out_row %>%
filter(pids %in% single_pids)
### Data frame for single occurrence mfsa
single_party_actor <- single_df %>%
mutate(
ids = str_c(pids,'_mfsa')
)
### Data frame for single occurrence mfs
single_party <- single_df %>%
filter(!endsWith(ids, '_a')) %>%
mutate(
ids = str_c(pids,'_mfs')
)
out_row <- out_row %>%
filter(startsWith(ids,'A_')) %>%
bind_rows(., single_party, single_party_actor)
### For each of the party ids in the list above, aggregate to _mfs and _mfsa
if (length(dupe_pids) > 0) {
aggregate <- bind_rows(lapply(dupe_pids, aggregator, dupe_df = dupe_df))
out_row <- bind_rows(out_row, aggregate)
}
### Generating sentiment scores for article and actors
if (is.null(sent_dict) == F) {
### Aggregated sentiment per article (over all sentences in article)
text_sent <- summarise(ud_sent,
sent = sum(sent_sum)/sum(words),
sent_sum = sum(sent_sum),
sent_words = sum(sent_words),
words = sum(words),
arousal = sum(sent_words)/sum(words)
)
out_row <- bind_rows(lapply(seq(1,nrow(out_row),1),sent_scorer, out_row = out_row, ud_sent = ud_sent)) %>%
cbind(., text = text_sent)
}
out_row <- out_row %>%
mutate(
year = strftime(`_source.publication_date`, format = '%Y'),
yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
yearweek = strftime(`_source.publication_date`, format = "%Y%V")
) %>%
select(#-`_source.computerCodes.actorsDetail`,
-`_score`,
-`_index`,
-`_type`,
-pids)
return(out_row)
}
saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
return()
}

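actor_fetcher() was later removed from the package (see the 'fixes for removal of actor_fetcher function' commit), but while present it processed one elasticizer() batch at a time. A hedged sketch of such a call, assuming `out` is an elasticizer() batch with _source.ud and actorsDetail fields and `sent_dict` a dictionary with a "lem_u" or "lemma" column plus a "prox" column, as documented above:

# per-batch call; writes a df_out<timestamp>.Rds file to the working directory
actor_fetcher(out, sent_dict = sent_dict, actor_ids = NULL, cores = 2)
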
@@ -0,0 +1,152 @@
#' Updater function for elasticizer: Conduct actor searches
#'
#' Updater function for elasticizer: Conduct actor searches
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ids List of actor ids
#' @param prefix Regex containing prefixes that should be excluded from hits
#' @param postfix Regex containing postfixes that should be excluded from hits
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @param cores Number of cores to use for parallel processing, defaults to 1
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
#' @examples
#' actorizer_old(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver)
actorizer_old <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) {
plan(multiprocess, workers = cores)
### Function to filter out false positives using regex
exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
sentence <- paste0(' ',str_sub(doc$merged, min, max),' ') # Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "("))
# Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
return(id)
} else {
return(NULL)
}
}
ranger <- function(x, ud) {
return(which((ud$start <= x) & (ud$end >= x)))
}
sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
doc <- out[row,]
if (nchar(doc$merged) > 990000) {
return(
data.frame(
err = T,
errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
)
)
}
# Extracting ud output from document
ud <- doc$`_source.ud`[[1]] %>%
select(-one_of('exists')) %>% # Removing ud.exists variable
unnest() %>%
mutate(doc_id = doc$`_id`)
markers <- doc$markers[[1]][,'start'] # Extract list of markers
# Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
rows <- unlist(lapply(markers, ranger, ud = ud))
# Setting up an actor variable
ud$actor <- F
ud$actor[rows] <- T
sentence_count <- max(ud$sentence_id) # Number of sentences in article
actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
# Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
if (!is.na(prefix) || !is.na(postfix)) {
### If no pre or postfixes, match *not nothing* i.e. anything
if (is.na(prefix)) {
prefix = '$^'
}
if (is.na(postfix)) {
postfix = '$^'
}
sentence_ids <- unlist(lapply(actor_sentences,
exceptionizer,
ud = ud,
doc = doc,
markers = markers,
pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags,
post_tags_regex = post_tags_regex,
post_tags = post_tags,
prefix = prefix,
postfix = postfix))
} else {
sentence_ids <- actor_sentences
}
if (length(sentence_ids) > 0) {
# Generating nested sentence start and end positions for actor sentences
ud <- ud %>%
filter(sentence_id %in% sentence_ids)
actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
ud <- ud %>%
group_by(sentence_id) %>%
summarise (
sentence_start = as.integer(min(start)),
sentence_end = as.integer(max(end)),
doc_id = first(doc_id)
) %>%
group_by(doc_id) %>%
summarise(
sentence_id = list(as.integer(sentence_id)),
sentence_start = list(sentence_start),
sentence_end = list(sentence_end)
)
return(
data.frame(ud, # Sentence id, start and end position for actor sentences
actor_start = I(list(actor_start)), # List of actor ud token start positions
actor_end = I(list(actor_end)), # List of actor ud token end positions
occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurrences/total # sentences)
rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurrence at sentence level
first = min(sentence_ids), # First sentence in which actor is mentioned
ids = I(list(ids)) # List of actor ids
)
)
} else {
return(NULL)
}
}
out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
offsetter <- function(x, pre_tags, post_tags) {
return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
}
prefix[prefix==''] <- NA
postfix[postfix==''] <- NA
pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)
# ids <- fromJSON(ids)
updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer,
out = out,
ids = ids,
postfix = postfix,
prefix=prefix,
pre_tags_regex = pre_tags_regex,
pre_tags = pre_tags,
post_tags_regex = post_tags_regex,
post_tags = post_tags))
if (nrow(updates) == 0) {
print("Nothing to update for this batch")
return(NULL)
} else {
bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
return(elastic_update(bulk, es_super = es_super, localhost = localhost))
}
}

@@ -0,0 +1,39 @@
#' Aggregator function, to aggregate actor results
#'
#' Aggregator function, to aggregate actor results
#' @param id Article id of the article for which actor aggregation should be done
#' @param actor_df The dataframe containing the actor data
#' @param merge_id The actorid that should be assigned to the merged result
#' @return A dataframe with the merged results
#' @export
#' @examples
#' aggregator(id, actor_df, merge_id)
aggregator <- function (id, actor_df, merge_id) {
article <- filter(actor_df, `_id` == id) %>%
unnest(sentence_id, .preserve = colnames(.))
occ <- length(unlist(unique(article$sentence_id1)))
sentence_count <- round(article$occ[[1]]/article$prom[[1]])
prom <- occ/sentence_count
rel_first <- 1-(min(article$sentence_id1)/sentence_count)
actor_start <- sort(unique(unlist(article$actor_start)))
actor_end <- sort(unique(unlist(article$actor_end)))
sentence_start <- sort(unique(unlist(article$sentence_start)))
sentence_end <- sort(unique(unlist(article$sentence_end)))
sentence_id <- sort(unique(unlist(article$sentence_id)))
return(data.frame(doc_id = first(article$`_id`),
sentence_id = I(list(as.integer(sentence_id))),
sentence_start = I(list(sentence_start)),
sentence_end = I(list(sentence_end)),
actor_start = I(list(actor_start)), # List of actor ud token start positions
actor_end = I(list(actor_end)), # List of actor ud token end positions
occ = occ, # Number of sentences in which actor occurs
prom = prom, # Relative prominence of actor in article (number of occurrences/total # sentences)
rel_first = rel_first, # Relative position of first occurrence at sentence level
first = min(article$sentence_id1), # First sentence in which actor is mentioned
ids = merge_id, # List of actor ids
stringsAsFactors = F
)
)
}

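aggregator() collapses all rows of a single article into one merged actor entry; aggregator_elastic() below calls it as mamlr:::aggregator. A hedged direct-call sketch (the `actor_df` data frame and the merge id 'P_212_mfsa' are illustrative assumptions):

library(dplyr)
merged <- bind_rows(lapply(unique(actor_df$`_id`), mamlr:::aggregator,
                           actor_df = actor_df, merge_id = 'P_212_mfsa'))
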
@@ -0,0 +1,61 @@
### Notes:
# Do you want to search for either one actorid OR the other, or both occurring in the same document?
# Do you want to keep only the occurrences of the actorids you are searching for, or all actor occurrences in the hits?
# Search by actorId, then aggregate by month
# When actorId starts with P_, define what hits you want to get (short, full, actor); if more than one, aggregate properly
# Develop query generator for specific actors (i.e. combine actorId with start and end dates)
#' Generate and store aggregate actor measures to elasticsearch
#'
#' Generate and store aggregate actor measures to elasticsearch
#' @param out The output provided by elasticizer()
#' @param localhost Boolean indicating if the script should run locally, or remote
#' @param es_super Write password for ES
#' @param actorids List of actorids used in the search, should be the same as the actorids used for elasticizer()
#' @param ver String indicating the version of the update
#' @return Return value is based on output of elastic_update()
#' @export
#' @examples
#' aggregator_elastic(out, localhost = F, actorids, ver, es_super)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
aggregator_elastic <- function(out, localhost = F, actorids, ver, es_super) {
### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
partyid <- str_sub(actorids[1], end=-3)
actor_df <- out %>%
unnest() %>%
unnest(ids, .preserve = colnames(.)) %>%
filter(ids1 %in% actorids)
agg_party_actors <- bind_rows(lapply(unique(actor_df$`_id`),
mamlr:::aggregator,
actor_df = actor_df,
merge_id = paste0(partyid,'_mfsa')))
party <- actor_df %>%
filter(!endsWith(ids1, '_a'))
agg_party <- bind_rows(lapply(unique(party$`_id`),
mamlr:::aggregator,
actor_df = party,
merge_id = paste0(partyid,'_mfs')))
actors_only <- actor_df %>%
filter(endsWith(ids1, '_a'))
agg_actors <- bind_rows(lapply(unique(actors_only$`_id`),
mamlr:::aggregator,
actor_df = actors_only,
merge_id = paste0(partyid,'_ma')))
df_out <- bind_rows(agg_party_actors, agg_party, agg_actors)
doc_ids <- df_out$doc_id
df_out <- df_out %>%
select(-1) %>%
split(as.factor(doc_ids))
df_out <- data.frame(doc_id = names(df_out), list = I(df_out))
bulk <- apply(df_out, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
return(elastic_update(bulk, es_super = es_super, localhost = localhost))
}

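aggregator_elastic() takes the result of an elasticizer() actor search and writes the merged _mfsa/_mfs/_ma records back to Elasticsearch via bulk_writer() and elastic_update(). A minimal sketch, assuming `out` from such a search, the same `actorids` used in that search, and a write password `es_super`; the party ids and version tag are illustrative:

aggregator_elastic(out, localhost = FALSE,
                   actorids = c('P_212_s', 'P_212_f', 'P_212_a'),  # illustrative party ids
                   ver = 'aggV1',                                  # hypothetical version tag
                   es_super = es_super)
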
@@ -0,0 +1,47 @@
#' Merges list of lemmas back into a pseudo-document
#'
#' Merges list of lemmas back into a pseudo-document
#' @param row A row number from the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or "ud_upos" lemma_upos hybrids
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger_old(1, out, text = 'ud', clean = F)
#################################################################################################
#################################### Reconstructing documents from lemmas########################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger_old <- function(row, out, text, clean) {
df <- out[row,]
# Merging lemmas into a single string
if (text == 'lemmas') {
lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
}
if (text == 'ud') {
lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
}
if (text == 'ud_upos') {
df <- unnest(df,`_source.ud`)
lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is not taken into account, as it is already filtered out, see above
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . }
# In the very rare but obviously occurring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
if (length(lemmas) == 0 ){
lemmas <- ''
}
return(lemmas)
}
# Replacing $-marked punctuation with their regular forms
lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# Regex also used in out_parser
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
# Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
paste0(.,". ")
return(lemmas)
}

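merger_old() rebuilds one pseudo-document per row. A hedged sketch that collects merged strings for a whole elasticizer() batch `out` (assumed to contain the _source.ud field); since the function carries the _old suffix it is presumably not exported, so it is assumed to be available in the session:

docs <- sapply(seq_len(nrow(out)), merger_old,
               out = out, text = 'ud_upos', clean = TRUE)
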
@@ -0,0 +1,143 @@
#' Updater function for elasticizer: Conduct actor searches
#'
#' Updater function for elasticizer: Conduct actor searches
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ids List of actor ids
#' @param prefix Regex containing prefixes that should be excluded from hits
#' @param postfix Regex containing postfixes that should be excluded from hits
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
#' @examples
#' actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver)
actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
offsetter <- function(x, pre_tags, post_tags) {
return(as.list(as.data.frame(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))))
}
out <- mamlr:::out_parser(out, field = 'highlight', clean = F) %>%
## Computing offset for first token position (some articles have a minimum token start position of 16, instead of 1 or 2)
mutate( # Checking if the merged field starts with a whitespace character
space = case_when(
str_starts(merged, '\\s') ~ 1,
T ~ 0)
) %>%
unnest(cols = '_source.ud') %>%
rowwise() %>%
mutate(ud_min = min(unlist(start))-1-space) ## Create offset variable, subtract 1 for default token start position of 1, and subtract 1 if merged field starts with a whitespace
print(str_c('Number of articles with minimum token start position higher than 2: ',sum(out$ud_min > 2)))
print('Unique ud_min offset values in batch: ')
print(unique(out$ud_min))
prefix[prefix==''] <- NA
postfix[postfix==''] <- NA
pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
if (sum(nchar(out$merged) > 990000) > 0) {
stop("One or more documents in this batch exceed 990000 characters")
}
# Extracting ud output from document
ud <- out %>%
select(`_id`,lemma,start,end, sentence_id,merged) %>%
unnest(cols = colnames(.))
sentences <- ud %>%
group_by(`_id`, sentence_id) %>%
summarise(
sentence_start = min(start),
sentence_end = max(end)
) %>%
mutate(
sentence_count = n()
)
out$markers <- lapply(str_locate_all(out$merged,coll(pre_tags)),offsetter, pre_tags = pre_tags, post_tags = post_tags)
markers <- out %>%
select(`_id`,markers, ud_min) %>%
unnest_wider(markers) %>%
rename(marker_start = start, marker_end = end) %>%
unnest(colnames(.)) %>%
## Modifying marker start and end positions using the ud_min column (see above)
mutate(marker_start = marker_start +(ud_min),
marker_end = marker_end + (ud_min))
hits <- as.data.table(ud)[as.data.table(markers), .(`_id`, lemma,x.start, start, end, x.end, sentence_id, merged), on =.(`_id` = `_id`, start <= marker_start, end >= marker_start)] %>%
mutate(end = x.end,
start = x.start) %>%
select(`_id`, sentence_id, start, end,merged) %>%
group_by(`_id`,sentence_id) %>%
summarise(
actor_start = I(list(start)),
actor_end = I(list(end)),
n_markers = length(start),
merged = first(merged)
) %>%
left_join(.,sentences, by=c('_id','sentence_id')) %>%
ungroup %>%
arrange(`_id`,sentence_id) %>%
group_by(`_id`) %>%
mutate(n_markers = cumsum(n_markers)) %>%
mutate(
sentence_start_tags = sentence_start+((nchar(pre_tags)+nchar(post_tags))*(lag(n_markers, default = 0))),
sentence_end_tags = sentence_end+((nchar(pre_tags)+nchar(post_tags))*(n_markers))
) %>%
mutate(
sentence = paste0(' ',str_sub(merged, sentence_start_tags, sentence_end_tags),' ')
) %>%
select(-merged) %>%
ungroup()
# Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
if (!is.na(prefix) || !is.na(postfix)) {
### If no pre or postfixes, match *not nothing* i.e. anything
if (is.na(prefix)) {
prefix = '$^'
}
if (is.na(postfix)) {
postfix = '$^'
}
hits <- hits %>%
filter(
!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) & !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))
)
}
### Checking and removing any na rows, and reporting them in the console
nas <- hits %>% filter(is.na(sentence_id))
hits <- hits %>% filter(!is.na(sentence_id))
if (nrow(nas) > 0) {
print(str_c('The following articles have not been searched correctly for actorId ',ids))
print(str_c('id_na: ',nas$`_id`, collapse = '\n '))
}
if (nrow(hits) == 0) {
print("Nothing to update for this batch")
return(NULL)
} else {
hits <- hits %>%
group_by(`_id`) %>%
summarise(
sentence_id = list(as.integer(sentence_id)),
sentence_start = list(sentence_start),
sentence_end = list(sentence_end),
actor_start = I(list(unlist(actor_start))), # List of actor ud token start positions
actor_end = I(list(unlist(actor_end))), # List of actor ud token end positions
occ = length(unique(unlist(sentence_id))), # Number of sentences in which actor occurs
first = min(unlist(sentence_id)), # First sentence in which actor is mentioned
      ids = I(list(ids)), # List of actor ids
      sentence_count = first(sentence_count)
) %>%
mutate(
prom = occ/sentence_count, # Relative prominence of actor in article (number of occurrences/total # sentences)
rel_first = 1-(first/sentence_count), # Relative position of first occurrence at sentence level
) %>%
select(`_id`:occ, prom,rel_first,first,ids)
bulk <- apply(hits, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
bulk <- c(bulk,apply(hits[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
return(elastic_update(bulk, es_super = es_super, localhost = localhost))
}
}
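A minimal usage sketch (not part of the package source): actorizer() is meant to be supplied as the update argument of elasticizer(), which calls it once per retrieved batch. The query string, actor id, tag strings and credentials below are illustrative assumptions only.

actor_query <- '{"query": {"match_all": {}}}'        # hypothetical actor query
elasticizer(actor_query, es_user = 'es', es_pwd = es_pwd, localhost = F,
            update = actorizer,
            ids = 'A_1234',                           # hypothetical actor id
            prefix = '', postfix = '',
            pre_tags = '<em>', post_tags = '</em>',
            ver = 'actorsV1', es_super = es_super)    # es_pwd/es_super assumed to be set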

@ -4,42 +4,43 @@
#' Type can be one of three values:
#' set: set the value of [varname] to x
#' add: add x to the values of [varname]
#' varname: When using tokens, the token field will be updated instead of a computerCodes field
#' varname: When using ud, the ud field will be updated instead of a computerCodes field
#' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
#' @param index The name of the Elasticsearch index to update
#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)
#' @param type Type of updating to be done, can be either 'set', 'add', or 'addnested'
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON
#' @export
#' @examples
#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
#' bulk_writer(x, index = 'maml', varname = 'updated_variable', type = 'set', ver = 'udV2')
#################################################################################################
#################################### Bulk update writer ################################
#################################################################################################
bulk_writer <- function(x, index = 'maml', varname = 'updated_variable', type) {
bulk_writer <- function(x, index = 'maml', varname, type, ver) {
### Create a json object if more than one variable besides _id, otherwise use value as-is
if (length(x) > 2) {
json <- toJSON(bind_rows(x)[-1], collapse = T)
  } else {
json <- toJSON(list(x[-1]), collapse = T)
} else {
names(x) <- NULL
json <- toJSON(x[-1], collapse = T)
}
if (varname == "tokens") {
if (varname == "ud") {
return(
paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
{ "script" : { "source": "ctx._source.tokens = params.code", "lang" : "painless", "params": { "code": ',json,'}}}')
paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; ctx._source.ud = params.code; ctx._source.remove(\\"tokens\\")", "lang" : "painless", "params": { "code": ',json,'}}}')
)
}
if (type == 'set') {
return(
paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
{ "script" : { "source": "if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,', "object": {"',varname,'": ',json,'} }}}')
paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,', "object": {"',varname,'": ',json,'} }}}')
)
}
if (type == "add") {
return(
paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
{"script": {"source": "if (ctx._source.computerCodes != null && ctx._source.computerCodes.containsKey(\\"',varname,'\\")) {ctx._source.computerCodes.',varname,'.addAll(params.code)} else if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,' , "object": {"',varname,'": ',json,'}}}}'
paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
{"script": {"source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null && ctx._source.computerCodes.containsKey(\\"',varname,'\\")) {ctx._source.computerCodes.',varname,'.addAll(params.code)} else if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,' , "object": {"',varname,'": ',json,'}}}}'
)
)
}

@ -6,6 +6,10 @@
#' @param model_final The classification model (e.g. output from textstat_nb(), svm() or others)
#' @param dfm_words A dfm containing all the words and only the words used to generate the model (is used for subsetting)
#' @param varname String containing the variable name to use for the classification result, usually has the format computerCodes.varname
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or "ud_upos" combining lemmas with upos tags
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
@ -14,11 +18,13 @@
#################################################################################################
#################################### Update any kind of classification ##########################
#################################################################################################
class_update <- function(out, localhost = T, model_final, dfm_words, varname, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
print('updating')
dfm <- dfm_gen(out, text = 'lemmas') %>%
dfm_keep(dfm_words, valuetype="fixed", verbose=T)
pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm))
bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set')
dfm <- dfm_gen(out, text = text, words = words, clean = clean)
if (!is.null(model_final$idf)) {
dfm <- dfm_weight(dfm, weights = model_final$idf)
}
pred <- data.frame(id = out$`_id`, pred = predict(model_final$text_model, newdata = dfm, type = "class", force = T))
bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set', ver = ver)
res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
}
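Like actorizer(), class_update() is designed to run as an elasticizer() update function. The sketch below is illustrative only; `model` is assumed to be the output list of modelizer(), and the query and credentials are placeholders.

elasticizer(query, es_user = 'es', es_pwd = es_pwd, localhost = T,
            update = class_update,
            model_final = model$model_final,   # assumed: list with $text_model and $idf
            varname = 'junk', text = 'ud', words = '999', clean = F,
            ver = 'nbV1', es_super = es_super)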

@ -0,0 +1,62 @@
#' Generate CV folds for nested cross-validation
#'
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
#'
#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1, holdout sampling is used, with outer_k being the proportion of data used as the test set
#' @param inner_k Number of inner CV (parameter optimization) folds
#' @param vec Vector containing the true values of the classification
#' @param grid Parameter grid for optimization
#' @param seed integer used as seed for random number generation
#' @return A nested set of lists with row numbers
#' @export
#' @examples
#' cv_generator(outer_k, inner_k, vec, grid, seed)
#################################################################################################
#################################### Generate CV folds ##########################################
#################################################################################################
cv_generator <- function(outer_k, inner_k, vec, grid, seed) {
### Generate inner folds for nested cv
inner_loop <- function(i, folds, vec, inner_k, grid, seed) {
# RNG needs to be set explicitly for each fold
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
inner_folds <- createFolds(as.factor(vec[-folds[[i]]]), k= inner_k)
grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i])
return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i]))
}
### Generate outer folds for nested cv
generate_folds <- function(outer_k, inner_k, vec, grid, seed){
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization
inner_folds <- createFolds(as.factor(vec), k= inner_k)
grid <- crossing(grid, inner_fold = names(inner_folds))
return(list(grid = grid,
inner_folds = inner_folds))
} else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k
folds <- createDataPartition(as.factor(vec), p=outer_k)
} else { # Do full nested CV
folds <- createFolds(as.factor(vec), k= outer_k)
}
# Generate grid of hyperparameters for model optimization, and include inner folds row numbers
grid_folds <- lapply(1:length(folds),
inner_loop,
folds = folds,
vec = vec,
inner_k = inner_k,
grid = grid,
seed = seed)
# Extract grid dataframe from results
grid <- grid_folds %>% purrr::map(1) %>% dplyr::bind_rows()
# Extract row numbers for inner folds from results
inner_folds <- grid_folds %>% purrr::map(2)
# Extract the names of the inner folds from results
names(inner_folds) <- grid_folds %>% purrr::map(3) %>% unlist(.)
return(list(grid = grid,
outer_folds = folds,
inner_folds = inner_folds))
}
return(generate_folds(outer_k,inner_k = inner_k, vec = vec, grid = grid, seed = seed))
}
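A small usage sketch (assumptions only): the class vector and parameter grid below are made up, and the grid columns mirror the ones consumed by estimator() (tfidf, feat_percentiles, feat_measures).

library(tidyr)  # crossing()
labels <- sample(c(0, 1), 500, replace = TRUE)                  # hypothetical class labels
grid <- crossing(tfidf = c(TRUE, FALSE),
                 feat_percentiles = c(0.90, 0.95),
                 feat_measures = 'chi2')
folds <- cv_generator(outer_k = 3, inner_k = 5, vec = labels, grid = grid, seed = 42)
nrow(folds$grid)  # one row per outer fold x inner fold x parameter combination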

@ -3,7 +3,11 @@
#' Generates dfm from ElasticSearch output
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, or "lemmas"
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or "ud_upos" combining lemmas with upos tags
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @param tolower Boolean indicating whether dfm features should be lowercased
#' @param binary Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE
#' @param ngrams Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1
#' @return A Quanteda dfm
#' @export
#' @examples
@ -16,57 +20,57 @@
# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out, words = '999', text = "lemmas") {
dfm_gen <- function(out, words = '999', text = "lemmas", clean, tolower = T, binary=F, ngrams=1) {
# Create subset with just ids, codes and text
out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
fields <- length(names(out))
if (text == "lemmas") {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, mc.cores = detectCores()))
if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
out <- left_join(out, merger(out, text=text, clean=clean), by = "_id")
}
if (text == "full") {
out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
str_replace_na(out$`_source.subtitle`, replacement = " "),
str_replace_na(out$`_source.preteaser`, replacement = " "),
str_replace_na(out$`_source.teaser`, replacement = " "),
str_replace_na(out$`_source.text`, replacement = " "),
sep = " ") %>%
# Remove html tags
str_replace_all("<.*?>", " ") %>%
str_replace_all("\\s+"," ")
out <- mamlr:::out_parser(out, field = '_source' , clean = clean)
}
if ('_source.codes.majorTopic' %in% colnames(out)) {
out <- out %>%
mutate(codes = case_when(
.$`_source.codes.timeSpent` == -1 ~ NA_character_,
TRUE ~ .$`_source.codes.majorTopic`
)
) %>%
mutate(codes = `_source.codes.majorTopic`) %>%
mutate(junk = case_when(
.$codes == 92 ~ 1,
.$codes == 91 ~ 1,
.$codes == 93 ~ 1,
.$codes == 2301 ~ 1,
.$codes == 3101 ~ 1,
.$codes == 34 ~ 1,
.$`_source.codes.timeSpent` == -1 ~ NA_real_,
TRUE ~ 0
)
) %>%
mutate(aggregate = .$codes %>%
),
aggregate = .$codes %>%
str_pad(4, side="right", pad="a") %>%
str_match("([0-9]{1,2})?[0|a][1-9|a]") %>%
.[,2] %>%
as.numeric()
as.numeric(),
nondomestic = as.numeric(`_source.codes.nonDomestic`)
) %>%
mutate(
)
vardoc <- out[,-seq(1,(length(names(out))-3),1)]
vardoc <- select(out, codes, junk, aggregate, nondomestic)
} else {
vardoc <- NULL
}
if (words != "999") {
### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary
# out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>%
str_extract('.*[.?!]'))
out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' '))
if(text != 'ud_upos') {
out$merged <- str_extract(out$merged,'.*[.?!]')
}
}
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
tokens(remove_punct = T) %>%
tokens_ngrams(n = ngrams, skip = 0, concatenator = '_') %>%
dfm(tolower = tolower, stem = F, valuetype = "regex")
if (binary) {
dfm <- dfm_weight(dfm, scheme = 'boolean')
}
return(dfm)
}
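A brief usage sketch, assuming `out` is a data frame returned by elasticizer() that includes the _source.ud field:

dfm <- dfm_gen(out, words = '999', text = 'ud_upos', clean = TRUE,
               tolower = TRUE, binary = FALSE, ngrams = 1)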

@ -9,6 +9,7 @@
#' @param es_super Password for write access to ElasticSearch
#' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])
#' @param localhost Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
#' @export
#' @examples
@ -17,29 +18,24 @@
#################################################################################################
#################################### Duplicate detector ################################
#################################################################################################
dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T) {
dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) {
params <- grid[row,]
print(paste0('Parsing ',params$doctypes,' on ',params$dates ))
query <- paste0('{"query":
{"bool": {"filter":[{"term":{"doctype": "',params$doctypes,'"}},
{"range" : {
"publication_date" : {
"gte" : "',params$dates,'T00:00:00Z",
"lt" : "',params$dates+1,'T00:00:00Z"
}
}}]
} } }')
out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost)
query <- paste0('doctype:\\"',params$doctypes,'\\" && publication_date:',params$dates,' && !computerCodes._delete:1')
out <- elasticizer(query_string(query, fields = c('country','text','title','subtitle','teaser','preteaser')), es_pwd = es_pwd, localhost= localhost)
if (class(out$hits$hits) != 'list') {
dfm <- dfm_gen(out, text = "full", words = words)
dfm <- dfm_gen(out, text = "full", words = words, clean = T)
if (sum(dfm[1,]) > 0) {
simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
diag(simil) <- NA
df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
rownames_to_column("rowid") %>%
mutate(colid = colnames(simil)[col]) %>%
.[,c(1,4)] %>%
duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)
duplicates <- cbind(duplicates, rowid= rownames(duplicates))
if (length(duplicates) > 0) {
rownames(duplicates) <- seq(1:length(rownames(duplicates)))
df <- as.data.frame(duplicates, make.names = NA, stringsAsFactors = F) %>%
# bind_cols(colid = colnames(simil)[.['col']]) %>%
mutate(colid = colnames(simil)[as.numeric(col)]) %>%
.[,c(3,4)] %>%
group_by(colid) %>% summarise(rowid=list(rowid))
text <- capture.output(stream_out(df))
# write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
@ -49,12 +45,13 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
# append=T)
dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'),
apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set'))
if (length(bulk) > 0) {
bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver),
apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver))
res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
}
return(paste0('Checked ',params$doctypes,' on ',params$dates ))
} else {
return(paste0('No duplicates for ',params$doctypes,' on ',params$dates ))
}
} else {
return(paste0('No results for ',params$doctypes,' on ',params$dates ))
}
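dupe_detect() is applied row-wise over a grid of doctype/date combinations. In the sketch below the doctype, date range and credentials are placeholders.

library(tidyr)  # crossing()
grid <- crossing(doctypes = 'some_doctype',                       # hypothetical doctype
                 dates = seq(as.Date('2019-01-01'), as.Date('2019-01-07'), by = 'day'))
lapply(seq_len(nrow(grid)), dupe_detect,
       grid = grid, cutoff_lower = 0.9, cutoff_upper = 1,
       es_pwd = es_pwd, es_super = es_super,                      # passwords assumed to be set
       words = '999', localhost = T, ver = 'dupesV1')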

@ -15,7 +15,7 @@
elastic_update <- function(x, es_super = 'secret', localhost = T) {
bulk <- paste0(paste0(x, collapse = '\n'),'\n')
if (localhost == F) {
url <- paste0('https://super:',es_super,'@linux01.uis.no/es/_bulk?pretty&refresh=wait_for')
url <- paste0('https://super:',es_super,'@linux01.uis.no/es/_bulk?pretty')
}
if (localhost == T) {
url <- 'http://localhost:9200/_bulk?pretty'
@ -25,13 +25,14 @@ elastic_update <- function(x, es_super = 'secret', localhost = T) {
, encode = "raw"
, add_headers("Content-Type" = "application/json")
, times = 10
, pause_min = 10
, pause_min = 30
)
httr:::stop_for_status(res)
appData <- httr:::content(res)
if (appData$errors == T){
print("Aborting, errors found during updating")
print(appData)
stop("Aborting, errors found during updating")
return(appData)
}
print("updated")
return(1)

@ -4,6 +4,12 @@
#' @param query A JSON-formatted query in the Elasticsearch query DSL
#' @param src Logical (true/false) indicating whether or not the source of each document should be retrieved
#' @param index The name of the Elasticsearch index to search through
#' @param es_user Username used to connect, defaults to 'es'
#' @param es_pwd The password for Elasticsearch read access
#' @param batch_size Batch size
#' @param max_batch Maximum number of batches to retrieve
#' @param time_scroll Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)
#' @param dump Boolean indicating whether the data frames should be returned, or dumped as .Rds files
#' @param update When set, indicates an update function to use on each retrieved batch of articles
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ... Parameters passed on to the update function
@ -15,63 +21,145 @@
#################################################################################################
#################################### Get data from ElasticSearch ################################
#################################################################################################
elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL, localhost = F, ...){
elasticizer <- function(query, src = T, index = 'maml', es_user, es_pwd = .rs.askForPassword("Elasticsearch READ"), batch_size = 1024, max_batch = Inf, time_scroll = "5m", dump = F, update = NULL, localhost = F, ...){
retries <- 10 ### Number of retries on error
sleep <- 30 ### Number of seconds between retries
httr::set_config(httr::config(http_version = 0))
## Transitional code for syntax change in elastic package
if (packageVersion("elastic") < '1') {
if (localhost == F) {
connect(es_port = 443,
es_transport = 'https',
es_host = 'linux01.uis.no',
es_path = 'es',
es_user = 'es',
es_user = es_user,
es_pwd = es_pwd,
errors = 'complete')
errors = 'complete'
# ssl_verifypeer = FALSE,
# ssl_verifyhost=FALSE
)
}
if (localhost == T){
connect(es_port = 9200,
es_transport = 'http',
es_host = 'localhost',
es_path = '',
es_user = '',
es_pwd = '',
es_user = es_user,
es_pwd = es_pwd,
errors = 'complete')
}
conn <- NULL
} else {
if (localhost == F) {
conn <- connect(port = 443,
transport = 'https',
host = 'linux01.uis.no',
path = 'es',
user = es_user,
pwd = es_pwd,
errors = 'complete'
# ssl_verifypeer = FALSE,
# ssl_verifyhost=FALSE
)
}
if (localhost == T){
conn <- connect(port = 9200,
transport = 'http',
host = 'localhost',
path = '',
user = es_user,
pwd = es_pwd,
errors = 'complete')
}
}
# Get all results - one approach is to use a while loop
if (src == T) {
res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=T)
res <- NULL
attempt <- 0
while( is.null(res) && attempt <= retries ) {
if (attempt > 0) {
Sys.sleep(sleep)
}
attempt <- attempt + 1
try(
res <- Search(conn = conn, index = index, time_scroll=time_scroll,body = query, size = batch_size, raw=T)
)
if (attempt > 1) {
print(paste0('Successful after ',attempt,' attempts'))
}
}
}
if (src == F) {
res <- Search(index = index, time_scroll="5m",body = query, size = 1000, raw=T, source = F)
res <- NULL
attempt <- 0
while( is.null(res) && attempt <= retries ) {
if (attempt > 0) {
Sys.sleep(sleep)
}
attempt <- attempt + 1
try(
res <- Search(conn = conn, index = index, time_scroll=time_scroll,body = query, size = batch_size, raw=T, source = F)
)
if (attempt > 1) {
print(paste0('Successful after ',attempt,' attempts'))
}
}
}
json <- fromJSON(res)
if (json$hits$total == 0) {
if (json$hits$total$value == 0) {
scroll_clear(conn = conn, x = json$`_scroll_id`)
return(json)
} else {
out <- jsonlite:::flatten(json$hits$hits)
total <- json$hits$total
hits <- 1
total <- json$hits$total$value
hits <- length(json$hits$hits)
batch <- 1
print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
if (dump) {
saveRDS(out, file = paste0('batch_',batch*batch_size,'.Rds'))
}
if (length(update) > 0){
update(out, localhost = localhost, ...)
}
while(hits != 0){
res <- scroll(json$`_scroll_id`, time_scroll="5m", raw=T)
while(hits > 0 && batch < max_batch ){
res <- NULL
attempt <- 0
while( is.null(res) && attempt <= retries ) {
if (attempt > 0) {
Sys.sleep(sleep)
}
attempt <- attempt + 1
try(
res <- scroll(conn = conn, json$`_scroll_id`, time_scroll=time_scroll, raw=T)
)
if (attempt > 1) {
print(paste0('Successful after ',attempt,' attempts'))
}
}
json <- fromJSON(res)
hits <- length(json$hits$hits)
if(hits > 0) {
batch <- batch+1
print(paste0('Processing documents ',batch*1000-1000,' through ',batch*1000,' out of ',total,' documents.'))
print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
if (length(update) > 0){
out <- jsonlite:::flatten(json$hits$hits)
update(out, localhost = localhost, ...)
} else if (dump) {
saveRDS(jsonlite:::flatten(json$hits$hits), file = paste0('batch_',batch*batch_size,'.Rds'))
} else {
out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
# Old merging code
# out <- bind_rows(out, jsonlite:::flatten(json$hits$hits))
out <- rbindlist(list(out, jsonlite:::flatten(json$hits$hits)),use.names = T, fill = T)
}
}
}
if (length(update) > 0) {
scroll_clear(conn = conn, x = json$`_scroll_id`)
return("Done updating")
} else if (dump) {
return("Dumping complete")
} else {
scroll_clear(conn = conn, x = json$`_scroll_id`)
return(out)
}
}
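A plain retrieval sketch (no update function); the query body and credentials are placeholders:

q <- '{"query": {"match": {"doctype": "some_doctype"}}}'   # hypothetical query
out <- elasticizer(q, src = T, index = 'maml',
                   es_user = 'es', es_pwd = es_pwd,        # es_pwd assumed to be set
                   batch_size = 1024, max_batch = Inf, localhost = F)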

@ -0,0 +1,100 @@
#' Generate models and get classifications on test sets
#'
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
#'
#' @param row Row number of current item in grid
#' @param grid Grid with model parameters and CV folds
#' @param outer_folds List with row numbers for outer folds
#' @param inner_folds List with row numbers for inner folds
#' @param dfm DFM containing labeled documents
#' @param class_type Name of column in docvars() containing the classes
#' @param model Model to use (currently only nb)
#' @param we_vectors Matrix with word embedding vectors
#' @return Depends on the folds supplied: if folds are included, returns the true and predicted classes of the test set, together with the parameters, the model and the model idf. When no folds are supplied, returns the final model and its idf values.
#' @export
#' @examples
#' estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors)
#################################################################################################
#################################### Generate models ############################################
#################################################################################################
### Classification function
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) {
# Get parameters for current iteration
params <- grid[row,]
# If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
if ("inner_fold" %in% colnames(params) && "outer_fold" %in% colnames(params)) {
dfm_train <- dfm[-outer_folds[[params$outer_fold]],] %>%
.[-inner_folds[[params$outer_fold]][[params$inner_fold]],]
dfm_test <- dfm[-outer_folds[[params$outer_fold]],] %>%
.[inner_folds[[params$outer_fold]][[params$inner_fold]],]
# If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
} else if ("outer_fold" %in% colnames(params)) {
dfm_train <- dfm[-outer_folds[[params$outer_fold]],]
dfm_test <- dfm[outer_folds[[params$outer_fold]],]
# If only inner folds, validate performance directly on inner folds
} else if ("inner_fold" %in% colnames(params)) {
dfm_train <- dfm[-inner_folds[[params$inner_fold]],]
dfm_test <- dfm[inner_folds[[params$inner_fold]],]
# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
} else {
dfm_test <- NULL
dfm_train <- dfm
}
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
if (params$tfidf) {
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
if (!is.null(dfm_test)) {
dfm_test <- dfm_weight(dfm_test, weights = idf)
}
} else {
idf <- NULL
}
if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
feat_select,
dfm = dfm_train,
class_type = params$class_type,
percentile = params$feat_percentiles,
measure = params$feat_measures
)))
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
if (!is.null(dfm_test)) {
dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F)
}
}
if (model == "nb") {
text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
}
if (model == "svm") {
    text_model <- svm(x=as.matrix(dfm_train), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
}
# if (model == 'nnet') {
# idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
# text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
# }
### Add more if statements for different models
# If training on whole dataset, return final model, and idf values from dataset
if (is.null(dfm_test)) {
return(list(text_model=text_model, idf=idf))
} else { # Create a test set, and classify test items
# Use force=T to keep only features present in both training and test set
pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T)
return(data.frame(
tv = I(list(docvars(dfm_test, class_type))), # True values from test set
pred = I(list(pred)), # Predictions of test set
params, # Parameters used to generate classification model
text_model = I(list(text_model)), # The classification model
idf = I(list(idf)), # IDF of the training dataset used for model creation
stringsAsFactors = F
))
}
}
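A sketch of a single direct call, assuming `folds` was produced by cv_generator() and `dfm` carries the class labels in docvars(dfm, 'junk'); all names below are assumptions.

res <- estimator(row = 1,
                 grid = folds$grid,
                 outer_folds = folds$outer_folds,
                 inner_folds = folds$inner_folds,
                 dfm = dfm,
                 class_type = 'junk',
                 model = 'nb',
                 we_vectors = NULL)   # assumed: no word embedding vectors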

@ -0,0 +1,28 @@
#' Select features using quanteda textstat_keyness
#'
#' Select features based on the textstat_keyness function and a percentile cutoff
#' Percentiles are based on absolute values i.e. both on words that are key and *not* key to the topic
#'
#' @param topic The topic to determine keywords for
#' @param dfm The input dfm
#' @param class_type Name of the column in docvars containing the classification
#' @param percentile Cutoff for the list of words that should be returned
#' @param measure Measure to use in determining keyness, default = chi2; see textstat_keyness for other options
#' @return A vector of words that are key to the topic
#' @export
#' @examples
#' feat_select(topic, dfm, class_type, percentile, measure="chi2")
#################################################################################################
#################################### Feature selection ##########################################
#################################################################################################
feat_select <- function (topic, dfm, class_type, percentile, measure="chi2") {
# Use quanteda textstat_keyness to determine feature importance
keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
na.omit()
# Convert keyness values to absolute values, to take into account both positive and negative extremes
keyness[,2] <- abs(keyness[,2])
# Keep only the words with an absolute keyness value falling in the top [percentile] percentile
keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
return(keyness)
}
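A usage sketch (values assumed): keep only the features whose absolute chi2 keyness for class 1 falls in the top 10 percent, then subset the training dfm accordingly.

library(quanteda)
key_feats <- feat_select(topic = 1, dfm = dfm_train, class_type = 'junk',
                         percentile = 0.9, measure = 'chi2')
dfm_train <- dfm_keep(dfm_train, key_feats, valuetype = 'fixed', verbose = F)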

@ -0,0 +1,42 @@
#' Generates text output files (without punctuation) for external applications, such as GloVe embeddings
#'
#' Generates text output files (without punctuation) for external applications, such as GloVe embeddings
#' @param out The elasticizer-generated data frame
#' @param file The file to write the output to (including path, when required). When documents = T, provide path including trailing /
#' @param documents Indicate whether the writer should output to a single file, or individual documents
#' @param lemma Indicate whether document output should be lemmas or original document
#' @param cores Indicate the number of cores to use for parallel processing
#' @param localhost Unused, but defaults to FALSE
#' @return No return value; writes plain-text output to [file] (or to one file per document when documents = T)
#' @export
#' @examples
#' lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
#################################################################################################
#################################### Lemma text file generator #############################
#################################################################################################
lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1) {
plan(multiprocess, workers = cores)
par_writer <- function(row, out, lemma) {
if (lemma == T) {
cat(iconv(unlist(unnest(out[row,],`_source.ud`)$lemma), to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
} else {
cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
}
}
if (documents == F) {
out <- unnest(out,`_source.ud`)
lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_')
cat(lemma, file = file, append = T)
}
if (documents == T) {
if (lemma == F) {
out <- out_parser(out, field = '_source', clean = F)
}
future_lapply(1:nrow(out), par_writer, out = out, lemma = lemma)
}
}
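lemma_writer() is normally passed as the update argument of elasticizer(). The sketch below dumps one plain-text file per document into a hypothetical directory; query, path and credentials are assumptions.

elasticizer(query, es_user = 'es', es_pwd = es_pwd, localhost = F,
            update = lemma_writer,
            file = '/tmp/corpus/',   # hypothetical output directory (note the trailing /)
            documents = T, lemma = F, cores = 4)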

@ -1,26 +1,60 @@
#' Merges list of lemmas back into a pseudo-document
#'
#' Merges list of lemmas back into a pseudo-document
#' @param row A row number from the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A data frame containing the merged (pseudo-)documents, one row per document id
#' @export
#' @examples
#' merger(1, words = '999', out = out)
#' merger(out, text, clean)
#################################################################################################
#################################### Reconstructing documents from lemmas########################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(row, out = out) {
df <- out[row,]
  # Merging lemmas into single string
lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
# Replacing $-marked punctuation with their regular forms
lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
### Removing numbers and non-words containing numbers
str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
merger <- function(out, text, clean) {
df <- unnest(out, cols = '_source.ud') %>%
select(`_id`,lemma,upos) %>%
unnest(cols = c('lemma','upos')) %>%
    # This filter is new to this merger function; in the old merger function missing lemmas would result in the following:
    # 1: when using ud, the string "NA" would be present in place of the faulty lemma
    # 2: when using ud_upos, the entire article would become NA, because str_c() returns NA when any value is NA
filter(!is.na(lemma)) %>%
group_by(`_id`)
if (text == 'ud_upos') {
df <- df %>%
filter(upos != 'PUNCT') %>%
mutate(
lem_u = str_c(lemma,upos,sep="_")
) %>%
summarise(
merged = str_c(c(lem_u), collapse= ' ')
) %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is not taken into account, as it is already filtered out, see above
{if(clean == T) mutate(.,
merged = str_replace_all(merged,"\\S*?[0-9@#$%]+[^\\s]*", "")
)
else . }
}
if (text == 'ud') {
df <- df %>%
summarise(
merged = str_c(c(lemma), collapse= ' ')
) %>%
mutate(
merged = str_replace_all(merged," \\$(.+?)", "\\1")
) %>%
# Regex removes all words consisting of or containing numbers, @#$%
# Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# Regex also used in out_parser
# Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
paste0(.,". ")
return(lemmas)
{if(clean == T) mutate(.,
merged = str_replace_all(merged,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")
)
else . } %>%
mutate(.,
merged = paste0(merged,'. '))
}
return(df)
}
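merger() is called from dfm_gen(), but can also be used directly on an elasticizer() batch (`out` is assumed here) to obtain one merged pseudo-document per document id:

merged <- merger(out, text = 'ud_upos', clean = TRUE)
head(merged$merged, 1)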

@ -0,0 +1,47 @@
#' Generate performance statistics for models
#'
#' Generate performance statistics for models, based on their predictions and the true values
#'
#' @param x A data frame containing at least the columns "pred" and "tv"
#' @return x, with additional columns for performance metrics
#' @export
#' @examples
#' metric_gen(x)
#################################################################################################
############################# Performance metric generation #####################################
#################################################################################################
metric_gen <- function(x) {
### Fix for missing classes in multiclass classification
### Sorting u for easier interpretation of confusion matrix
u <- as.character(sort(as.numeric(union(unlist(x$pred), unlist(x$tv)))))
# Create a crosstable with predictions and true values
class_table <- table(prediction = factor(unlist(x$pred), u), trueValues = factor(unlist(x$tv), u))
# When only two classes, set positive class explicitly as the class with the highest value
if (length(unique(u)) == 2) {
conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
weighted_measures <- as.data.frame(conf_mat$byClass)
macro_measures <- as.data.frame(conf_mat$byClass)
} else {
# Create a confusion matrix
conf_mat <- confusionMatrix(class_table, mode = "everything")
# Set "positive" value to NA, because not applicable
conf_mat$positive <- NA
# Compute weighted performance measures
weighted_measures <- colSums(conf_mat$byClass * colSums(conf_mat$table))/sum(colSums(conf_mat$table))
# Compute unweighted performance measures (divide by number of classes, each class equally important)
macro_measures <- colSums(conf_mat$byClass)/nrow(conf_mat$byClass)
# Replace NaN's by 0 when occurring
weighted_measures[is.nan(weighted_measures)] <- 0
macro_measures[is.nan(macro_measures)] <- 0
}
return(cbind(x,
as.data.frame(t(conf_mat$overall)),
'weighted' = t(as.data.frame(weighted_measures)),
'macro' = t(as.data.frame(macro_measures)),
pos_cat = conf_mat$positive,
conf_mat = I(list(conf_mat))
)
)
}
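A short sketch: `res` is assumed to be the data frame returned by estimator() for a grid row with test folds.

res_metrics <- metric_gen(res)
res_metrics$Accuracy   # overall accuracy taken from the caret confusion matrix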

@ -8,219 +8,124 @@
#' - percentiles (cutoff point for tf-idf feature selection)
#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds)
#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)
#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)
#' @param cores_feats Number of cores to use for feature selection (multiplies with cores_outer, cores_grid and cores_inner)
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param outer_k Number of outer cross-validation folds (for performance estimation)
#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param we_vectors Matrix with word embedding vectors
#' @param cores Number of threads used for parallel processing using future_lapply, defaults to 1
#' @return A list containing the final model, the outer CV performance estimates, the optimal hyperparameters and the settings used for grid search and cross-validation
#' @export
#' @examples
#' modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
#' modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, cores = 1)
#################################################################################################
#################################### Function to generate classification models #################
#################################################################################################
modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
### Functions ###
feat_select <- function (topic, dfm, class_type, percentile,measure) {
keyness <- textstat_keyness(dfm, measure = measure, docvars(dfm, class_type) == as.numeric(topic)) %>%
na.omit()
keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
return(keyness)
}
### Generate inner folds for nested cv
inner_loop <- function(fold, dfm, inner_k, class_type) {
# RNG needs to be set explicitly for each fold
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
## Either createDataPartition for simple holdout parameter optimization
## Or createFolds for proper inner CV for nested CV
# if (inner_k <= 1) {
# inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
# } else {
inner_folds <- createFolds(as.factor(docvars(dfm[-fold], class_type)), k= inner_k)
# }
return(c(outer_fold = list(fold),inner_folds))
}
modelizer <- function(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, we_vectors, cores = 1) {
## Generate list containing outer folds row numbers, inner folds row numbers, and grid for model building
folds <- cv_generator(outer_k,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed)
inner_grid <- folds$grid
outer_folds <- folds$outer_folds
inner_folds <- folds$inner_folds
### Generate outer folds for nested cv
generate_folds <- function(outer_k, inner_k, dfm, class_type){
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k)
return(lapply(folds,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type))
}
## Create multithread work pool for future_lapply
plan(strategy = multiprocess, workers = cores)
### Gets called for every parameter combination, and calls classifier for every inner cv fold
inner_cv <- function(row,grid,outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
params <- grid[row,]
# For each inner fold, cross validate the specified parameters
res <-
bind_rows(mclapply(inner_folds,
classifier,
outer_fold = outer_fold,
params = params,
dfm = dfm,
class_type = class_type,
model = model,
cores_feats = cores_feats,
mc.cores = cores_inner
)
)
return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))),params))
}
### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
# If fold contains both inner folds and outer fold
if (length(fold) == inner_k + 1) {
inner_folds <- fold[-1]
outer_fold <- fold$outer_fold
# For each row in grid, cross-validate results
res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
inner_cv,
cores_feats= cores_feats,
grid = grid,
dfm = dfm,
class_type = class_type,
model = model,
outer_fold = outer_fold,
## Use estimator function to build models for every parameter combination in grid
inner_cv_output <- future_lapply(1:nrow(inner_grid), estimator,
grid = inner_grid,
outer_folds = outer_folds,
inner_folds = inner_folds,
cores_inner = cores_inner,
mc.cores = cores_grid)
)
# Determine optimum hyperparameters within outer fold training set
optimum <- res[which.max(res[,opt_measure]),] %>%
select(percentiles: ncol(.))
# Validate performance of optimum hyperparameters on outer fold test set
return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats))
} else {
# If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters
inner_folds <- fold
outer_fold <- NULL
res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
inner_cv,
cores_feats= cores_feats,
grid = grid,
dfm = dfm,
class_type = class_type,
model = model,
outer_fold = outer_fold,
inner_folds = inner_folds,
cores_inner = cores_inner,
mc.cores = cores_grid)
)
return(res)
}
}
we_vectors = we_vectors,
model = model) %>%
future_lapply(.,metric_gen) %>% # Generate model performance metrics for each grid row
bind_rows(.)
### Custom tfidf function to allow same idf for different dfm's
custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
if (!nfeat(x) || !ndoc(x)) return(x)
tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
if (nfeat(x) != length(dfreq))
stop("missing some values in idf calculation")
# get the document indexes
j <- as(tfreq, "dgTMatrix")@j + 1
# replace just the non-zero values by product with idf
x@x <- tfreq@x * dfreq[j]
# record attributes
x@weightTf <- tfreq@weightTf
x@weightDf <- c(list(scheme = scheme_df, base = base), args)
return(x)
}
### Classification function
classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
# If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
if (length(inner_fold) > 0 && length(outer_fold) > 0) {
dfm_train <- dfm[-outer_fold] %>%
.[-inner_fold]
dfm_test <- dfm[-outer_fold] %>%
.[inner_fold]
# If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
} else if (length(outer_fold) > 0 ) {
dfm_train <- dfm[-outer_fold]
dfm_test <- dfm[outer_fold]
# If only inner folds, validate performance directly on inner folds (is the same as above?)
} else if (length(inner_fold) > 0 ) {
dfm_train <- dfm[-inner_fold]
dfm_test <- dfm[inner_fold]
# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
} else {
final <- T ### Indicate final modeling run on whole dataset
dfm_train <- dfm
}
### Getting features from training dataset
# Getting idf from training data, and using it to normalize both training and testing feature occurence
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
dfreq <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0, use.names=T)
dfm_train <- custom_tfidf(dfm_train, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq)
    # Added unique to filter out duplicate words; these are caused when there are multiple categories and a word scores higher
# than the threshold on two or more of those categories
words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
feat_select,
dfm = dfm_train,
class_type = class_type,
percentile = params$percentiles,
measure = params$measures,
mc.cores = cores_feats
)))
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
outer_grid <- inner_cv_output %>%
# Group by outer folds, and by parameters used for model tuning
group_by_at(c("outer_fold", colnames(grid))) %>%
# Get mean values for all numeric (performance indicator) variables
summarise_if(is.numeric, mean, na.rm = F) %>%
# Group the mean performance indicators by outer_fold
group_by(outer_fold) %>%
# Get for each outer_fold the row with the highest value of opt_measure
slice(which.max((!!as.name(opt_measure)))) %>%
# Select only the columns outer_fold, and the columns that are in the original parameter grid
select(outer_fold, colnames(grid))
## Create multithread work pool for future_lapply
plan(strategy = multiprocess, workers = cores)
# Use the estimator function to build optimum models for each outer_fold
outer_cv_output <- future_lapply(1:nrow(outer_grid), estimator,
grid = outer_grid,
outer_folds = outer_folds,
inner_folds = NULL,
dfm = dfm,
class_type = class_type,
we_vectors = we_vectors,
model = model) %>%
future_lapply(., metric_gen) %>% # Generate performance metrics for each row in outer_grid
bind_rows(.)
if (model == "nb") {
text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
}
if (model == "svm") {
text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = "linear", gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
}
### Add more if statements for different models
if (exists("final") == T) {
return(text_model)
} else {
### Removing all features not in training set from test set and weighting the remaining features according to training idf
dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
dfm_test <- custom_tfidf(dfm_test, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq[words])
pred <- predict(text_model, newdata = dfm_test)
class_table <- table(prediction = pred, trueValues = docvars(dfm_test, class_type))
conf_mat <- confusionMatrix(class_table, mode = "everything")
if (is.matrix(conf_mat$byClass) == T) {
return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params))
} else {
return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(conf_mat$byClass)),params, pos_cat = conf_mat$positive))
}
}
}
## Generate nested CV folds, based on number of inner and outer folds defined (see start of script)
folds <- generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type)
# Create (inner) folds for parameter optimization on the entire dataset
final_folds <- cv_generator(NULL,inner_k = inner_k, vec = docvars(dfm, class_type), grid = grid, seed = seed)
final_grid <- final_folds$grid
final_inner <- final_folds$inner_folds
## Get performance of each outer fold validation, and add row with mean scores (This is the final performance indicator)
performance <- bind_rows(mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer)) %>%
bind_rows(., colMeans(select(., 1:`Balanced Accuracy`)))
## Create multithread work pool for future_lapply
plan(strategy = multiprocess, workers = cores)
## Set seed and generate folds for final hyperparameter optimization search (using CV)
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k= inner_k))
# Use the estimator function to estimate the performance of each row in final_grid
final_cv_output <- future_lapply(1:nrow(final_grid), estimator,
grid = final_grid,
outer_folds = NULL,
inner_folds = final_inner,
dfm = dfm,
class_type = class_type,
we_vectors = we_vectors,
model = model) %>%
future_lapply(.,metric_gen) %>% # Generate performance metrics for each row in final_grid
bind_rows(.)
## Get the final hyperparameter performance for all value combinations in grid
params_final <- bind_rows(mclapply(folds_final, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer))
## Select optimum final hyperparameters
optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>%
select(percentiles: ncol(.))
final_params <- final_cv_output %>%
# Group final parameter optimization cv results by parameters used for optimization
group_by_at(colnames(grid)) %>%
# Get mean performance metrics for each fold
summarise_if(is.numeric, mean, na.rm = F) %>%
# Ungroup to allow for slicing
ungroup() %>%
# Select row with highest value of opt_measure
slice(which.max((!!as.name(opt_measure)))) %>%
# Keep only the columns that are present in the original parameter grid
select(colnames(grid))
# Use the estimator function to estimate the final model, using the optimum parameters provided in final_params
model_final <- estimator(1,
grid = final_params,
outer_folds = NULL,
inner_folds = NULL,
dfm = dfm,
class_type = class_type,
model = model)
# Create list with output variables
output <- list(final_params = final_params,
outer_cv_output = outer_cv_output,
model_final = model_final,
grid = grid,
seed = seed,
opt_measure = opt_measure,
model = model,
country = country,
class_type = class_type)
## Estimate final model on whole dataset, using optimum final hyperparameters determined above
model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = detectCores())
rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment())
save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment())
return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'))
  # Return output
return(output)
}
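A sketch of a full nested-CV run; the grid, class labels and settings below are illustrative assumptions (opt_measure should name a numeric column of the metric_gen() output, such as 'Accuracy').

library(tidyr)  # crossing()
grid <- crossing(tfidf = c(TRUE, FALSE),
                 feat_percentiles = c(0.90, 0.95),
                 feat_measures = 'chi2')
results <- modelizer(dfm, outer_k = 3, inner_k = 5,
                     class_type = 'junk', opt_measure = 'Accuracy',
                     country = 'no', grid = grid, seed = 42,
                     model = 'nb', we_vectors = NULL, cores = 4)
results$final_params   # hyperparameters selected on the full data set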

@ -0,0 +1,301 @@
#' Generate a classification model
#'
#' Generate a nested cross validated classification model based on a dfm with class labels as docvars
#' Currently only supports Naïve Bayes using quanteda's textmodel_nb
#' Hyperparameter optimization is enabled through the grid parameter
#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command
#' For Naïve Bayes, the following parameters can be used:
#' - percentiles (cutoff point for tf-idf feature selection)
#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds)
#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)
#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)
#' @param cores_feats Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param outer_k Number of outer cross-validation folds (for performance estimation)
#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
#' @export
#' @examples
#' modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
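#' # Illustrative only: one way such a grid could be built with tidyr's crossing();
#' # the percentile cutoffs and keyness measures below are assumed example values,
#' # not defaults of this package
#' grid <- tidyr::crossing(percentiles = c(0.90, 0.95, 0.99), measures = c("chi2", "lr"))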
#################################################################################################
#################################### Function to generate classification models #################
#################################################################################################
modelizer_old <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
### Functions ###
feat_select <- function (topic, dfm, class_type, percentile,measure) {
keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
na.omit()
keyness[,2] <- abs(keyness[,2])
keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
return(keyness)
}
### Generate inner folds for nested cv
inner_loop <- function(fold, dfm, inner_k, class_type) {
# RNG needs to be set explicitly for each fold
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
## Either createDataPartition for simple holdout parameter optimization
## Or createFolds for proper inner CV for nested CV
# if (inner_k <= 1) {
# inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
# } else {
inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k= inner_k)
# }
return(c(outer_fold = list(fold),inner_folds))
}
### Generate outer folds for nested cv
generate_folds <- function(outer_k, inner_k, dfm, class_type){
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds <- createFolds(as.factor(docvars(dfm, class_type)), k= outer_k)
return(lapply(folds,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type))
}
### Gets called for every parameter combination, and calls classifier for every inner cv fold
inner_cv <- function(row,grid,outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
print(str_c('params ',row))
params <- grid[row,]
# For each inner fold, cross validate the specified parameters
res <-
bind_rows(mclapply(inner_folds,
classifier,
outer_fold = outer_fold,
params = params,
dfm = dfm,
class_type = class_type,
model = model,
cores_feats = cores_feats,
mc.cores = cores_inner
)
)
# print(res)
# print(res[1,1])
# print('inner_cv')
return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))),params))
}
### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
print('outer cv')
# If fold contains both inner folds and outer fold
if (length(fold) == inner_k + 1) {
inner_folds <- fold[-1]
outer_fold <- fold$outer_fold
# For each row in grid, cross-validate results
res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
inner_cv,
cores_feats= cores_feats,
grid = grid,
dfm = dfm,
class_type = class_type,
model = model,
outer_fold = outer_fold,
inner_folds = inner_folds,
cores_inner = cores_inner,
mc.cores = cores_grid)
)
# print(res)
# print(res[1,1])
# print('outer_cv')
# Determine optimum hyperparameters within outer fold training set
optimum <- res[which.max(res[,opt_measure]),] %>%
select(percentiles: ncol(.))
# Validate performance of optimum hyperparameters on outer fold test set
return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats))
} else {
# If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters
inner_folds <- fold
outer_fold <- NULL
res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
inner_cv,
cores_feats= cores_feats,
grid = grid,
dfm = dfm,
class_type = class_type,
model = model,
outer_fold = outer_fold,
inner_folds = inner_folds,
cores_inner = cores_inner,
mc.cores = cores_grid)
)
# print(res)
# print(res[1,1])
# print('line 126, final model parameter optimization')
return(res)
}
}
# ### Custom tfidf function to allow same idf for different dfm's
# custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
# if (!nfeat(x) || !ndoc(x)) return(x)
# tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
# if (nfeat(x) != length(dfreq))
# stop("missing some values in idf calculation")
# # get the document indexes
# j <- as(tfreq, "dgTMatrix")@j + 1
# # replace just the non-zero values by product with idf
# x@x <- tfreq@x * dfreq[j]
# # record attributes
#
# ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2
#
# # x@weightTf <- tfreq@weightTf
# # x@weightDf <- c(list(scheme = scheme_df, base = base), args)
# return(x)
# }
### Classification function
classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
# If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
if (length(inner_fold) > 0 && length(outer_fold) > 0) {
dfm_train <- dfm[-outer_fold,] %>%
.[-inner_fold,]
dfm_test <- dfm[-outer_fold,] %>%
.[inner_fold,]
# If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
} else if (length(outer_fold) > 0 ) {
dfm_train <- dfm[-outer_fold,]
dfm_test <- dfm[outer_fold,]
validation_cv <- T
# If only inner folds, validate performance directly on inner folds (is the same as above?)
} else if (length(inner_fold) > 0 ) {
dfm_train <- dfm[-inner_fold,]
dfm_test <- dfm[inner_fold,]
# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
} else {
final <- T ### Indicate final modeling run on whole dataset
dfm_train <- dfm
}
if (model == 'nb') {
scheme_tf <- 'count' # Raw term counts (used for Naïve Bayes)
} else {
scheme_tf <- 'prop' # Term proportions (used for all other models)
}
### Getting features from training dataset
# Getting idf from training data, and using it to normalize both training and testing feature occurrence
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
# Added unique to filter out duplicate words; these occur when there are multiple categories and a word scores higher
# than the threshold on two or more of those categories
words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
feat_select,
dfm = dfm_train,
class_type = class_type,
percentile = params$percentiles,
measure = params$measures,
mc.cores = cores_feats
)))
# dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
if (model == "nb") {
text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
}
if (model == "svm") {
text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
}
if (model == 'nnet') {
idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
}
# if (model == 'neuralnet') {
# dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
# dfm_test <- custom_tfidf(dfm_test, scheme_tf = "prop", scheme_df = "inverse", base = 10, dfreq = dfreq[words])
#
# idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
# colnames(idC) <- NULL
# nn_train <- cbind(idC, dfm_train)
# n <- colnames(nn_train)[3:length(colnames(nn_train))]
# f <- as.formula(paste0("feat1 + feat2 ~ `", paste0(n, collapse = "` + `"),"`"))
# idC_out <- class.ind(as.factor(docvars(dfm_test, class_type)))
# colnames(idC_out) <- NULL
# nn_test <- cbind(idC_out, dfm_test)
# nn <- neuralnet(f,data=nn_train,hidden=3,linear.output=F,act.fct = 'logistic',lifesign = 'full', threshold = .005)
# pr.nn <- compute(nn,nn_test[,3:length(colnames(nn_test))])
# class_table <- table(prediction = as.matrix(round(pr.nn$net.result[,2])), trueValues = as.matrix(idC_out[,2]))
# nn_conf_mat <- confusionMatrix(class_table, mode = "everything")
# }
### Add more if statements for different models
if (exists("final")) {
return(list(text_model=text_model, idf=idf))
} else {
### Removing all features not in training set from test set and weighting the remaining features according to training idf
dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
dfm_test <- dfm_weight(dfm_test, weights = idf)
pred <- predict(text_model, newdata = dfm_test, type = 'class')
### Fix for single-class 'predictions' in borderline situations
# if (length(unique(pred)) == 1 & class_type == 'junk') {
# if (unique(pred) == '0') {
# pred[1] <- '1'
# } else {
# pred[1] <- '0'
# }
# }
### Fix for missing classes in multiclass classification
u <- union(pred, docvars(dfm_test, class_type))
class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u))
if (length(unique(u)) == 2) {
conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
} else {
conf_mat <- confusionMatrix(class_table, mode = "everything")
conf_mat$positive <- NA
}
if (exists("validation_cv")) {
return(data.frame(
tv = docvars(dfm_test, class_type),
pred = pred,
params = params,
pos_cat = conf_mat$positive,
stringsAsFactors = F
))
}
if (is.matrix(conf_mat$byClass) == T) {
return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params))
} else {
return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(conf_mat$byClass)),params, pos_cat = conf_mat$positive))
}
}
}
### If outer_k is smaller than 1, do a holdout training run, with cross-validation only for parameter optimization; else, do nested CV
### If holdout, training/test distribution is the same as for inner CV
if (outer_k < 1) {
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
outer_fold <- createDataPartition(as.factor(docvars(dfm, class_type)), p=outer_k)
folds <- lapply(outer_fold,inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type)
} else {
## Generate nested CV folds, based on number of inner and outer folds defined (see start of script)
folds <- generate_folds(outer_k,inner_k = inner_k, dfm = dfm, class_type = class_type)
}
## Get performance of each outer fold validation, and add row with mean scores (This is the final performance indicator)
performance <- mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer)
## Set seed and generate folds for final hyperparameter optimization search (using CV)
set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k= inner_k))
## Get the final hyperparameter performance for all value combinations in grid
params_final <- bind_rows(mclapply(folds_final, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer))
## Select optimum final hyperparameters
optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>%
select(percentiles: ncol(.))
## Estimate final model on whole dataset, using optimum final hyperparameters determined above
model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer)))
rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment())
save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment())
return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'))
}

@ -0,0 +1,89 @@
#' Parse raw text into a single field
#'
#' Parse raw text from the MaML database into a single field
#' @param out The original output data frame
#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
#' @return a parsed output data frame including the additional column 'merged', containing the merged text
#' @export
#' @examples
#' out_parser(out,field)
#################################################################################################
#################################### Parser function for output fields ##########################
#################################################################################################
out_parser <- function(out, field, clean = F) {
fncols <- function(data, cname) {
add <-cname[!cname%in%names(data)]
if(length(add)!=0) data[, (add) := (NA)]
data
}
out <- fncols(data.table(out), c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
par_parser <- function(row, out, field, clean) {
doc <- out[row,]
if (field == 'highlight') {
doc <- doc %>%
unnest(cols = starts_with("highlight")) %>%
mutate(across(starts_with("highlight"), na_if, "NULL")) %>%
mutate(highlight.title = coalesce(highlight.title, `_source.title`),
highlight.subtitle = coalesce(highlight.subtitle, `_source.subtitle`),
highlight.preteaser = coalesce(highlight.preteaser, `_source.preteaser`),
highlight.teaser = coalesce(highlight.teaser, `_source.teaser`),
highlight.text = coalesce(highlight.text, `_source.text`)
) %>%
mutate(highlight.title = str_replace_na(highlight.title, replacement = ''),
highlight.subtitle = str_replace_na(highlight.subtitle, replacement = ''),
highlight.preteaser = str_replace_na(highlight.preteaser, replacement = ''),
highlight.teaser = str_replace_na(highlight.teaser, replacement = ''),
highlight.text = str_replace_na(highlight.text, replacement = '')
) %>%
mutate(
merged = str_c(highlight.title,
highlight.subtitle,
highlight.preteaser,
highlight.teaser,
highlight.text,
'',
sep = ". ")
)
}
if (field == '_source') {
doc <- doc %>%
mutate(`_source.title` = str_replace_na(`_source.title`, replacement = ''),
`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = ''),
`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = ''),
`_source.teaser` = str_replace_na(`_source.teaser`, replacement = ''),
`_source.text` = str_replace_na(`_source.text`, replacement = '')
) %>%
mutate(
merged = str_c(`_source.title`,
`_source.subtitle`,
`_source.preteaser`,
`_source.teaser`,
`_source.text`,
'',
sep = ". ")
)
}
### Use correct punctuation by inserting '. ' at the end of every text field, then removing any duplicate occurrences
# Remove HTML tags and multiple consecutive whitespace characters
# The regex removes all words consisting of or containing numbers, @#$%
# Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
# The same regex is also used in the merger function
### Old regex, used for duplicate detection:
# \\S*?[0-9@#$%]+[^\\s!?.,;:]*
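# Illustrative (hypothetical) effect of the pipeline below when clean = T:
# "<b>Oil</b> price rose 5% in 2020" becomes roughly "Oil price rose in"
# (HTML tags and number-containing tokens dropped, whitespace collapsed)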
doc$merged <- doc$merged %>%
str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>%
str_replace_all("\\s+"," ") %>%
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . }
return(doc)
}
return(par_parser(1:nrow(out), out=out, clean=clean, field=field))
}

@ -0,0 +1,71 @@
#' Preprocess dfm data for use in modeling procedure
#'
#' Process dfm according to parameters provided in params
#'
#' @param dfm_train Training dfm
#' @param dfm_test Testing dfm if applicable, otherwise NULL
#' @param params Row from grid with parameter optimization
#' @param we_vectors Matrix with word embedding vectors
#' @return List with dfm_train and dfm_test, processed according to parameters in params
#' @export
#' @examples
#' preproc(dfm_train, dfm_test = NULL, params, we_vectors)
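#' # Illustrative only: a single parameter row as consumed by this function;
#' # the values are assumptions, not package defaults
#' params <- data.frame(tfidf = TRUE, feat_percentiles = 0.95, feat_measures = "chi2", class_type = "junk", stringsAsFactors = FALSE)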
#################################################################################################
#################################### Preprocess data ############################################
#################################################################################################
### CURRENTLY UNUSED!!!###
preproc <- function(dfm_train, dfm_test = NULL, params, we_vectors) {
# Remove non-existing features from training dfm
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
if (params$tfidf) {
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
if (!is.null(dfm_test)) {
dfm_test <- dfm_weight(dfm_test, weights = idf)
}
} else {
idf <- NULL
}
if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
# Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
feat_select,
dfm = dfm_train,
class_type = params$class_type,
percentile = params$feat_percentiles,
measure = params$feat_measures
)))
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
}
if (!is.null(we_vectors)) {
shared_dict <- sort(intersect(dfm_train@Dimnames$features,we_vectors$V1))
if (!is.null(dfm_test)) {
shared_dict <- sort(intersect(dfm_test@Dimnames$features,shared_dict))
dfm_test <- dfm_keep(dfm_test, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
.[, sort(colnames(.))]
}
dfm_train <- dfm_keep(dfm_train, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
.[, sort(colnames(.))]
we_matrix <- filter(we_vectors, V1 %in% shared_dict) %>%
arrange(V1) %>%
as.data.table(.) %>%
.[,2:ncol(.), with = F] %>%
as.matrix(.)
dfm_train_we_sum <- dfm_train %*% we_matrix
# dfm_train_we_mean <- dfm_train_we_sum / as.vector(rowSums(dfm_train))
if (!is.null(dfm_test)) {
dfm_test_we_sum <- dfm_test %*% we_matrix
# dfm_test_we_mean <- dfm_test_we_sum / as.vector(rowSums(dfm_test))
}
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf, dfm_train_we = dfm_train_we_sum, dfm_test_we = dfm_test_we_sum))
}
return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf))
}

@ -2,7 +2,8 @@
#'
#' Generate actor search queries based on data in actor db
#' @param actor A row from the output of elasticizer() when run on the 'actor' index
#' @param country 2-letter string indicating the country for which to generate the queries; used to determine inflected, definite and genitive forms of names etc.
#' @param pre_tags Highlighter pre-tag
#' @param post_tags Highlighter post-tag
#' @return A data frame containing the queries, related actor ids and actor function
#' @export
#' @examples
@ -11,8 +12,28 @@
#################################################################################################
#################################### Actor search query generator ###############################
#################################################################################################
query_gen_actors <- function(actor, country) {
highlight <- paste0('"highlight" : {
query_gen_actors <- function(actor, pre_tags, post_tags) {
generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
return(paste0('{"_source": ["ud","title","subtitle","preteaser","teaser","text"],
"query":
{"bool": {
"filter":[
{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',startdate,'","lte":"',enddate,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', querystring,'"
}
}
],
"must_not":[
{"term":{"computerCodes.actors.keyword":"',actorid,'"}}
]
}
},
"highlight" : {
"fields" : {
"text" : {},
"teaser" : {},
@ -24,107 +45,197 @@ query_gen_actors <- function(actor, country) {
"order": "none",
"type":"unified",
"fragment_size":0,
"pre_tags":"',identifier,'",
"post_tags": ""
}')
if (country == "no") {
"pre_tags":"', pre_tags,'",
"post_tags": "',post_tags,'"
}
}'))
}
prox_gen <- function(row, grid) {
return(
paste0('\\"',grid[row,]$first,' ',grid[row,]$last,'\\"~',grid[row,]$prox)
)
}
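# Illustrative (hypothetical) output of prox_gen() for a single grid row:
# first = "Angela", last = "Merkel", prox = 5 yields the string \"Angela Merkel\"~5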
country <- actor$`_source.country`
### Setting linguistic forms for each country ###
if (country == "no" | country == "dk") {
genitive <- 's'
definitive <- 'en'
definitive_genitive <- 'ens'
} else {
genitive <- ''
definitive <- ''
definitive_genitive <- ''
}
} else if (country == 'uk') {
genitive <- '\'s'
} else if (country == 'nl' | country == 'be') {
genitive <- 's'
}
if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader") {
lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,')')
### Generating queries for individuals (ministers, PM, Party leaders and MPs)
if (actor$`_source.function` == "JunMin" | actor$`_source.function` == "Min" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader" | actor$`_source.function` == "MP") {
## Adding a separate AND clause for the last name only, so that all occurrences of the last name are highlighted
## regardless of whether the last-name hit stems from a minister name or a full-name proximity hit
query_string <- paste0('(((\\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,
'\\"~5 OR \\"',
actor$`_source.firstName`,
' ',
actor$`_source.lastName`,genitive,
'\\"~5) AND ',lastname)
}
if (actor$`_source.function` == "PartyLeader") {
query_string <- paste0(query_string,'))')
ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.partyId`),str_c, "_pl")))
### If the country is Belgium, check whether any middle name contains an apostrophe; if so, search for both the capitalized and the lowercased last name
if (country == 'be' && T %in% str_detect(actor$`_source.middleNames`,"'")) {
last_list <- c(actor$`_source.lastName`, str_c(actor$`_source.lastName`,genitive), tolower(actor$`_source.lastName`), str_c(tolower(actor$`_source.lastName`),genitive))
grid <- crossing(first = actor$`_source.firstName`, last = last_list, prox = 5)
fullname <- lapply(1:nrow(grid), prox_gen, grid = grid)
query_string <- paste0('((',
paste0(unlist(fullname), collapse = ' OR '),') AND ',
paste0('(',paste0(unlist(last_list), collapse = ' OR '),')'))
} else {
last_list <- c(actor$`_source.lastName`, str_c(actor$`_source.lastName`,genitive))
grid <- crossing(first = actor$`_source.firstName`, last = last_list, prox = 5)
fullname <- lapply(1:nrow(grid), prox_gen, grid = grid)
query_string <- paste0('((',
paste0(unlist(fullname), collapse = ' OR '),') AND ',
paste0('(',paste0(unlist(last_list), collapse = ' OR '),')'))
}
### If actor is a minister, generate minister search
if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM") {
## Modifiers are only applied to minister titles
capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title))
capital_gen <- unlist(lapply(capital, str_c, genitive))
capital_def <- unlist(lapply(capital, str_c, definitive))
capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive))
gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive))
def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive))
defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive))
names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ')
query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))')
ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.ministryId`,actor$`_source.partyId`),str_c, "_min")))
if (country == "no" || country == "dk") {
minister <- str_split(actor$`_source.ministerSearch`, pattern = '-| ') %>%
map(1)
capital <- unlist(str_to_title(minister))
capital_def <- unlist(str_c(capital, definitive))
def <- unlist(str_c(minister,definitive))
minister <- unlist(c(minister,capital,capital_def,def))
}
if (actor$`_source.function` == "Party") {
actor$`_source.startDate` <- "2000-01-01"
actor$`_source.endDate` <- "2099-01-01"
names <- paste(c(unlist(actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"')
query_string <- paste0('(\\"',names,'\\")')
query <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
if (country == "uk") {
minister <- c(str_to_title(actor$`_source.ministerName`),
actor$`_source.ministerName`)
if(actor$`_source.function` == "PM") {
minister <- c(minister,
"PM")
}
}
]
} },',highlight,' }')
ids <- c(toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p"))))
if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
names <- paste(c(unlist(actor$`_source.partyNameSearch`)), collapse = '\\" \\"')
query_string <- paste0('(\\"',names,'\\")')
query2 <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
if (country == "nl" | country == "be") { # If country is nl or be, add a requirement for Minister to the query
minister <- c("Minister",
"minister")
if(actor$`_source.function` == "PM") {
minister <- c(minister,
"Premier",
"premier")
}
if(actor$`_source.function` == "JunMin") {
minister <- c("Staatssecretaris",
"staatssecretaris")
}
}
grid <- crossing(first = last_list, last = minister, prox = 5)
ministername <- lapply(1:nrow(grid), prox_gen, grid = grid)
query_string <- paste0(query_string,') OR ((',
paste0(unlist(ministername), collapse= ' OR '),') AND ',
paste0('(',paste0(unlist(last_list), collapse = ' OR '),'))'))
} else { ### Else, generate search for first/last name only (MPs and Party leaders, currently)
query_string <- paste0(query_string,')')
}
ids <- list(c(actor$`_source.actorId`,str_c(actor$`_source.partyId`,'_a')))
actorid <- actor$`_source.actorId`
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
return(data.frame(query = query, ids = I(ids), prefix = NA, postfix = NA, stringsAsFactors = F))
}
]
} },',highlight,' }')
ids <- c(ids, toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p"))))
query <- c(query, query2)
fn <- c('PartyAbbreviation','Party')
### Query generation for party searches
if (actor$`_source.function` == "Party") {
# actor$`_source.startDate` <- "2000-01-01"
# actor$`_source.endDate` <- "2099-01-01"
if (nchar(actor$`_source.partyNameSearchShort`[[1]]) > 0) {
# If uk, no or dk, search for both regular abbreviations and genitive forms
if (country == "uk" | country == "no" | country == "dk") {
gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive))
names <- paste(unlist(c(gen,actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"')
} else {
fn <- c('PartyAbbreviation')
names <- paste(unlist(actor$`_source.partyNameSearchShort`), collapse = '\\" \\"')
}
return(data.frame(query = query, ids = I(ids), type = fn, prefix = actor$`_source.searchAnd`, postfix = actor$`_source.searchAndNot`, stringsAsFactors = F))
# If no or dk, only keep genitive forms if the party abbreviation is longer than 1 character (2 including the genitive s itself)
if (country == "dk" | country == "no") {
gen <- gen[which(nchar(gen) > 2)]
}
query <- paste0('{"query":
{"bool": {"filter":[{"term":{"country":"',country,'"}},
{"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
{"query_string" : {
"default_operator" : "OR",
"allow_leading_wildcard" : "false",
"fields": ["text","teaser","preteaser","title","subtitle"],
"query" : "', query_string,'"
query_string <- paste0('(\\"',unlist(names),'\\")')
ids <- str_c(actor$`_source.partyId`,'_s')
actorid <- str_c(actor$`_source.partyId`,'_s')
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
df1 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
}
if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
if (country == "uk" | country == "no" | country == "dk") {
gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive))
names <- paste(unlist(c(gen,actor$`_source.partyNameSearch`)), collapse = '\\" \\"')
} else {
names <- paste(unlist(actor$`_source.partyNameSearch`), collapse = '\\" \\"')
}
query_string <- paste0('(\\"',unlist(names),'\\")')
ids <- str_c(actor$`_source.partyId`,'_f')
actorid <- str_c(actor$`_source.partyId`,'_f')
query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
if (country == 'uk' | country == 'nl' | country == 'be') {
df2 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
} else {
df2 <- data.frame(query = query, ids = I(ids), prefix = NA, postfix = NA, stringsAsFactors = F)
}
]
} },',highlight,' }')
fn <- actor$`_source.function`
return(data.frame(query = query, ids = I(ids), type = fn, stringsAsFactors = F))
}
if (exists('df1') == T & exists('df2') == T) {
return(bind_rows(df1,df2))
} else if (exists('df1') == T) {
return(df1)
} else if (exists('df2') == T) {
return(df2)
}
}
### Institution function currently not used
# if (actor$`_source.function` == "Institution") {
# #uppercasing
# firstup <- function(x) {
# substr(x, 1, 1) <- toupper(substr(x, 1, 1))
# x
# }
# actor$`_source.startDate` <- "2000-01-01"
# actor$`_source.endDate` <- "2099-01-01"
# if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
# upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
# upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
# unlist(lapply(upper, str_c, definitive)),
# unlist(lapply(upper, str_c, definitive_genitive)))
# capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
# capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
# unlist(lapply(capital, str_c, definitive)),
# unlist(lapply(capital, str_c, definitive_genitive)))
# base <- actor$`_source.institutionNameSearch`
# base <- c(base, unlist(lapply(base, str_c, genitive)),
# unlist(lapply(base, str_c, definitive)),
# unlist(lapply(base, str_c, definitive_genitive)))
# names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
# query_string <- paste0('(\\"',names,'\\")')
# ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
# actorid <- str_c(actor$`_source.institutionId`,'_f')
# query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
# df1 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
# }
# if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
# upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
# upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
# unlist(lapply(upper, str_c, definitive)),
# unlist(lapply(upper, str_c, definitive_genitive)))
# capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
# capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
# unlist(lapply(capital, str_c, definitive)),
# unlist(lapply(capital, str_c, definitive_genitive)))
# base <- actor$`_source.institutionNameSearchShort`
# base <- c(base, unlist(lapply(base, str_c, genitive)),
# unlist(lapply(base, str_c, definitive)),
# unlist(lapply(base, str_c, definitive_genitive)))
# names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
# query_string <- paste0('(\\"',names,'\\")')
# ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
# actorid <- str_c(actor$`_source.institutionId`,'_s')
# query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
# df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
# }
# if (exists('df1') == T & exists('df2') == T) {
# return(bind_rows(df1,df2))
# } else if (exists('df1') == T) {
# return(df1)
# } else if (exists('df2') == T) {
# return(df2)
# }
# }
}

@ -1,26 +1,60 @@
#' Generate a query string query for ElasticSearch
#'
#' Generate a query string query for ElasticSearch
#' @param x Query string in ElasticSearch query string format
#' @param query Query string in ElasticSearch query string format
#' @param fields List of field names to return, defaults to all
#' @param random Return randomized results. Boolean, defaults to FALSE
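#' @param default_operator String indicating whether the default operator between query terms should be "AND" or "OR", defaults to "AND"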
#' @return A formatted ElasticSearch query string query
#' @export
#' @examples
#' query_string(x)
#' query_string(query)
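#' # Illustrative only: restricting the returned fields and randomizing result order;
#' # the field names are assumptions, not required by this function
#' query_string('title:"example"', fields = c("title","text"), random = TRUE)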
#################################################################################################
#################################### Get data from ElasticSearch ################################
#################################################################################################
query_string <- function(x) {
query_string <- function(query, fields = F, random = F, default_operator = "AND") {
if (typeof(fields) == 'logical') {
fields <- '*'
}
if (random == T) {
return(paste0(
'{
"_source": ',toJSON(fields),',
"query": {
"function_score": {
"query": {
"bool":{
"filter": [{
"query_string" : {
"query" : "',query,'",
"default_operator": "',default_operator,'",
"allow_leading_wildcard" : false
}
}]
}
},
"random_score": {},
"boost_mode": "sum"
}
}
}'
))
} else {
return(paste0(
'{
"_source": ',toJSON(fields),',
"query": {
"bool":{
"filter": [{
"query_string" : {
"default_field" : "text",
"query" : "',x,'",
"default_operator": "AND",
"query" : "',query,'",
"default_operator": "',default_operator,'",
"allow_leading_wildcard" : false
}
}]
}
}
}'
}'
))
}
}

@ -0,0 +1,260 @@
#' Aggregate sentence-level dataset containing sentiment (from sentencizer())
#'
#' Aggregate sentence-level dataset containing sentiment (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Optional data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param actor_groups Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @param pos_cutoff Optional value above which sentence-level sentiment scores should be considered "positive"
#' @param neg_cutoff Optional value below which sentence-level sentiment scores should be considered "negative"
#' @param single_party Boolean to generate data only from sentences in which a single party is mentioned, defaults to FALSE
#' @return When actor_groups is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When actor_groups is supplied, returns aggregations for each vector in the list
#' @export
#' @examples
#' sent_merger(df, actors_meta = NULL, actor_groups = NULL)
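#' # Illustrative only: actor_groups as a list of id vectors to be merged into
#' # single aggregates (the ids below are hypothetical)
#' actor_groups <- list(c("A_100001", "A_100002"), c("P_21_f", "P_21_s"))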
#################################################################################################
#################################### Generate actor-article dataset #############################
#################################################################################################
### NOTE: The exceptions for various partyId_a ids have been implemented because of an error with
### some individual actors, where the partyId of an individual actor doesn't match an actual
### partyId in the actor dataset
sent_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff = NULL, neg_cutoff = NULL, single_party = F) {
grouper <- function(id2, df) {
# Prevent usage of deprecated partyId_a ids, which are incorrect and should no longer be used
if (any(str_ends(id2, '_a'))) {
return("You're seemingly using a deprecated [partyId]_a id in your aggregations")
}
return(df[ids %in% id2,] %>%
.[!duplicated(.,by = c('id','sentence_id')),.(
actor.sent = sum(sent_binary_weighted)/sum(words),
actor.sent_words = sum(sent_words),
actor.words = sum(words),
# actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
actor.first = min(sentence_id),
actor.occ = .N,
publication_date = first(publication_date),
ids = str_c(id2, collapse = '-')
), by = c('id')]
)
}
## Remove some of the metadata from the source df
df <- data.table(df)[,.(
(.SD),
doctype = as.factor(`_source.doctype`),
publication_date = as.Date(`_source.publication_date`),
id = as.factor(`_id`)
), .SDcols = !c('_source.doctype','_source.publication_date','_id')]
## Create bogus variables if sentiment is not scored
if(!"sent_sum" %in% colnames(df)) {
df <- df[,.(
(.SD),
sent_words = 0,
sent_sum = 0
)]
}
## Unnest to sentence level
## Check if raw sentiment data contains actor ids
if ('ids' %in% colnames(df)) {
df <- df[,lapply(.SD, unlist, recursive=F),
.SDcols = c('sentence_id', 'sent_sum', 'words', 'sent_words','ids'),
by = list(id,publication_date,doctype)]
} else {
df <- df[,lapply(.SD, unlist, recursive=F),
.SDcols = c('sentence_id', 'sent_sum', 'words', 'sent_words'),
by = list(id,publication_date,doctype)]
}
df <- df[,.(
(.SD),
sent = sent_sum/words
)][,.(
(.SD),
sent_binary = case_when(
sent > pos_cutoff ~ 1,
sent == 0 ~ 0,
sent >= neg_cutoff & sent <= pos_cutoff ~ 0,
TRUE ~ -1
)
)][,.(
(.SD),
sent_binary_weighted = sent_binary*words
)]
text_sent <- df[,
.(text.sent = sum(sent_binary_weighted)/sum(words),
text.sent_words = sum(sent_words),
text.words = sum(words),
text.arousal = sum(sent_words)/sum(words),
text.sentences = .N,
doctype = first(doctype),
publication_date = first(publication_date)
), by = list(id)]
## Create aggregations according to list of actorId vectors in ids
if(!is.null(actor_groups)) {
output <- lapply(actor_groups,grouper, df = df) %>%
rbindlist(.) %>%
left_join(text_sent, by=c("id","publication_date")) %>%
mutate(
actor.prom = actor.occ/text.sentences,
actor.rel_first = 1-(actor.first/text.sentences),
year = strftime(publication_date, format = '%Y'),
yearmonth = strftime(publication_date, format = '%Y%m'),
yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V")
) %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(output)
} else if(!is.null(actors_meta)) {
text_noactors <- df[lengths(ids) == 0L,
.(noactor.sent = sum(sent_binary_weighted)/sum(words),
noactor.sent_words = sum(sent_words),
noactor.words = sum(words),
noactor.arousal = sum(sent_words)/sum(words),
noactor.first = min(sentence_id),
noactor.occ = .N
), by = list(id)]
all <- df[lengths(ids) > 0L,
.(actor.sent = sum(sent_binary_weighted)/sum(words),
actor.sent_words = sum(sent_words),
actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words),
actor.first = min(sentence_id),
actor.occ = .N,
publication_date = first(publication_date),
ids = 'all'), by = list(id)]
## Unnest to actor level
df <- df[,.(ids = as.character(unlist(ids))),
by = list(id,publication_date,sentence_id, sent_sum, words, sent_words,sent_binary_weighted)
][ # Remove deprecated actor_partyids from ES database
!str_ends(ids, '_a')]
## Prepare actor metadata
colnames(actors_meta) <- str_replace(colnames(actors_meta),'_source.','')
actors_meta <- data.table(actors_meta)[,
.((.SD),
startDate = as.Date(startDate),
endDate = as.Date(endDate),
ids = ifelse(!is.na(actorId), actorId, partyId)
), .SDcols = -c('_id','startDate','endDate','_index','_type','_score')
]
## Create table with partyIds by date and actorId to join by
actors_party <- actors_meta %>%
group_by(ids,partyId,startDate,endDate) %>%
summarise() %>%
na.omit() %>%
ungroup() %>%
data.table(.)
## Add partyId to each actorId without filtering parties out
df <- df %>%
# Fill partyId column for actor mentions
actors_party[., c(colnames(.),'partyId'), # Join by actorId, within active period (start/endDate)
on = .(ids == ids, startDate <= publication_date, endDate >= publication_date),
with = F] %>%
# Fill partyId column for party mentions
.[is.na(partyId), partyId:=str_sub(ids, start = 1, end = -3)] %>%
# Some actors seemingly belong to different parties on the same day, hence basing unique rows on both (actor)ids and partyId
.[!duplicated(.,by = c('id','ids','sentence_id','partyId')),] # Keep all unique rows
## Removing sentences containing more than one party
if(single_party) {
# Create variable indicating number of unique party ids per sentence, and keep only sentences where unique parties == 1
df <- df %>%
.[, upid := length(unique(partyId)), by = c('id','sentence_id')] %>%
.[upid == 1,]
}
## Create aggregate measures for individual actors
actors_merged <- df[str_starts(ids, 'A_')] %>% .[!duplicated(.,by = c('id','ids','sentence_id')),] %>% # Removing duplicate rows when actor is counted multiple times in the same sentence, because of multiple functions or parties.
.[,
.(actor.sent = sum(sent_binary_weighted)/sum(words),
actor.sent_words = sum(sent_words),
actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words),
actor.first = min(sentence_id),
actor.occ = .N,
publication_date = first(publication_date)), by = list(id, ids)]
## Create actor metadata dataframe per active date (one row per day per actor)
actors_merged <- actors_meta[actors_merged,
c('x.startDate','x.endDate',colnames(actors_merged), 'lastName','firstName','function.','gender','yearOfBirth','parlPeriod','partyId','ministerName','ministryId','actorId','startDate','endDate'),
on =.(ids = ids, startDate <= publication_date, endDate >= publication_date),
mult = 'all',
with = F][,.(
startDate = x.startDate,
endDate = x.endDate,
(.SD)
), .SDcols = -c('x.startDate', 'x.endDate','startDate','endDate')]
## Generate party-actor aggregations (mfsa)
# Create party data table
parties_actors <- df %>%
.[!duplicated(.,by = c('id','partyId','sentence_id')),.( # Remove rows (sentences) where a party is counted multiple times
actor.sent = sum(sent_binary_weighted)/sum(words),
actor.sent_words = sum(sent_words),
actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words),
actor.first = min(sentence_id),
actor.occ = .N,
publication_date = first(publication_date)
), by = c('id','partyId')] # Summarize by article and partyId
# Add party metadata
parties_actors <- actors_meta[str_starts(ids, 'P_')][parties_actors, on = c('partyId'), mult = 'first'][!is.na(id),.(ids = str_c(partyId,"_mfsa"), (.SD)), .SDcols = -c('ids')]
## Generate party aggregations (mfs)
parties <- df[str_ends(ids,'_f') | str_ends(ids,'_s'),.(
ids = str_sub(ids, start = 1, end = -3),
(.SD)
),.SDcols = -c('ids')] %>% .[!duplicated(.,by = c('id','ids','sentence_id')),.(
actor.sent = sum(sent_binary_weighted)/sum(words),
actor.sent_words = sum(sent_words),
actor.words = sum(words),
actor.arousal = sum(sent_words)/sum(words),
actor.first = min(sentence_id),
actor.occ = .N,
publication_date = first(publication_date)
), by = c('id','ids')]
parties <- actors_meta[parties, on = c('ids'), mult = 'first'][!is.na(id),.(ids = str_c(ids,"_mfs"), (.SD)), .SDcols = -c('ids')]
## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
df <- bind_rows(actors_merged, parties, parties_actors, all) %>%
left_join(.,text_sent, by=c("id","publication_date")) %>%
left_join(.,text_noactors, by="id") %>%
mutate(
actor.prom = actor.occ/text.sentences,
actor.rel_first = 1-(actor.first/text.sentences),
year = strftime(publication_date, format = '%Y'),
yearmonth = strftime(publication_date, format = '%Y%m'),
yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V")
) %>%
ungroup() %>%
select(-contains('Search'),-starts_with('not')) %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(df)
} else {
df <- text_sent %>%
mutate(
year = strftime(publication_date, format = '%Y'),
yearmonth = strftime(publication_date, format = '%Y%m'),
yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V")
) %>%
ungroup() %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(df)
}
}

@ -0,0 +1,116 @@
#' Generate sentence-level dataset with sentiment and actor presence
#'
#' Generate sentence-level dataset with sentiment and actor presence
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' sentencizer(out, sent_dict = NULL, validation = F)
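#' # Illustrative only: a minimal sent_dict in lemma_upos ("lem_u") format;
#' # the words and values are made up, not part of any shipped dictionary
#' sent_dict <- data.frame(lem_u = c("goed_ADJ", "slecht_ADJ"), prox = c(1, -1), stringsAsFactors = FALSE)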
#################################################################################################
#################################### Generate sentence-level dataset#############################
#################################################################################################
sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) {
## Despite the function name, parallel processing is not used, because it is slower
par_sent <- function(row, out, sent_dict = NULL) {
out <- out[row,]
## Create df with article metadata (fields that are included in the elasticizer function)
metadata <- out %>%
select(`_id`,`_source.publication_date`,`_source.doctype`) %>%
mutate(`_source.publication_date` = as.factor(`_source.publication_date`),
`_source.doctype` = as.factor(`_source.doctype`))
## Unnest documents into individual words
ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
unnest(cols = colnames(.)) %>%
select(-one_of('exists')) %>%
unnest(cols = colnames(.)) %>%
filter(upos != 'PUNCT')
## If there is a dictionary, apply it
if (!is.null(sent_dict)) {
## If the dictionary contains the column lem_u, assume lemma_upos format
if ("lem_u" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>%
mutate(lem_u = str_c(lemma,'_',upos)) %>%
left_join(sent_dict, by = 'lem_u')
## If the dictionary contains the column lemma, assume simple lemma format
} else if ("lemma" %in% colnames(sent_dict)) {
ud_sent <- ud_sent %>%
left_join(sent_dict, by = 'lemma') %>%
mutate(lem_u = lemma)
}
## Group by sentences, and generate dictionary scores per sentence
ud_sent <- ud_sent %>%
group_by(`_id`,sentence_id) %>%
mutate(
prox = case_when(
is.na(prox) == T ~ 0,
TRUE ~ prox
)
) %>%
summarise(sent_sum = sum(prox),
words = length(lemma),
sent_words = sum(prox != 0),
# sent_lemmas = list(lem_u[prox != 0])
)
## If there is no dictionary, create a ud_sent, with just sentence ids and word counts per sentence
} else {
ud_sent <- ud_sent %>%
group_by(`_id`,sentence_id) %>%
summarise(words = length(lemma))
}
## Remove ud output from source before further processing
out <- select(out, -`_source.ud`)
## If dictionary validation, return just the sentences that have been hand-coded
if (validation == T) {
codes_sent <- ud_sent %>%
left_join(.,out, by='_id') %>%
rowwise() %>%
filter(sentence_id == `_source.codes.sentence.id`)
return(codes_sent)
}
if("_source.computerCodes.actorsDetail" %in% colnames(out)) {
## If actor details in source, create vector of actor ids for each sentence
out <- out %>%
unnest(`_source.computerCodes.actorsDetail`) %>%
# mutate(ids_list = ids) %>%
unnest(ids) %>%
unnest(sentence_id) %>%
group_by(`_id`,sentence_id) %>%
summarise(
ids = list(ids)
)
} else {
## If no actor details, keep one row per article and add a bogus sentence_id
out <- out %>%
group_by(`_id`) %>%
summarise() %>%
mutate(sentence_id = 1)
}
## Combine ud_sent with the source dataset
out <- out %>%
left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
group_by(`_id`)
out <- out %>%
summarise_all(list) %>%
left_join(.,metadata,by='_id') %>%
ungroup()
return(out)
}
saveRDS(par_sent(1:nrow(out),out = out, sent_dict=sent_dict), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
return()
### Keeping the option for parallel computation
# microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out),out = out, sent_dict=sent_dict), times = 1)
# plan(multiprocess, workers = cores)
# chunks <- split(1:nrow(out), sort(1:nrow(out)%%cores))
# microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks,par_sent, out=out, sent_dict=sent_dict)), times = 1)
}

@ -0,0 +1,56 @@
#' Generate UDpipe output from base text
#'
#' Generate UDpipe output from base text
#' @param file Filename of file to read in, also used for generating output file name
#' @param wd Working directory where *file*s can be found
#' @param ud_file Filename of udpipe model to use, should be in *wd*
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @return No return value, the generated bulk update actions are saved as an RDS file (prefixed with 'ud_') in *wd*
#' @export
#' @examples
#' ud_update(file, wd, ud_file, ver)
#'
# punct_check <- function(str) {
# if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) {
# return(str_c(str, '.'))
# }
# }
ud_update <- function(file, wd, ud_file, ver) {
out <- readRDS(str_c(wd,'/',file)) %>%
out_parser(., field = '_source', clean = F)
ud_model <- udpipe_load_model(file = str_c(wd,'/',ud_file))
ud <- as.data.frame(udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
group_by(doc_id) %>%
summarise(
sentence_id = list(as.integer(sentence_id)),
token_id = list(as.integer(token_id)),
lemma = list(as.character(lemma)),
upos = list(as.character(upos)),
feats = list(as.character(feats)),
head_token_id = list(as.integer(head_token_id)),
dep_rel = list(as.character(dep_rel)),
start = list(as.integer(start)),
end = list(as.integer(end)),
exists = list(TRUE)
)
bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
saveRDS(bulk, file = str_c(wd,'/ud_',file))
# res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
return()
}
#### Old code ####
# Use | as separator (this is not done anymore, as all data is stored as actual lists, instead of strings. Code kept for future reference)
# str_replace_all("\\|", "") %>%
# Remove VERY annoying single backslashes and replace them by whitespaces
# str_replace_all("\\\\", " ") %>%
# Replace any occurence of (double) whitespace characters by a single regular whitespace
# t_id <- paste(ud[,5], collapse = '|')
# lemmatized <- paste(ud[,7], collapse = '|') %>%
# # Replacing double quotes with single quotes in text
# str_replace_all("\"","\'")
# upos_tags <- paste(ud[,8], collapse = '|')
# head_t_id <- paste(ud[,11], collapse = '|')
# dep_rel <- paste(ud[,12], collapse = '|')

@ -0,0 +1,44 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actorizer.R
\name{actorizer}
\alias{actorizer}
\title{Updater function for elasticizer: Conduct actor searches}
\usage{
actorizer(
out,
localhost = F,
ids,
prefix,
postfix,
pre_tags,
post_tags,
es_super,
ver
)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
\item{localhost}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)}
\item{ids}{List of actor ids}
\item{prefix}{Regex containing prefixes that should be excluded from hits}
\item{postfix}{Regex containing postfixes that should be excluded from hits}
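\item{pre_tags}{Highlighter pre-tag}
\item{post_tags}{Highlighter post-tag}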
\item{es_super}{Password for write access to ElasticSearch}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
\item{identifier}{String used to mark highlights. Should be a lowercase string}
}
\value{
As this is a nested function used within elasticizer, there is no return output
}
\description{
Updater function for elasticizer: Conduct actor searches
}
\examples{
actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver)
}

@ -4,7 +4,7 @@
\alias{bulk_writer}
\title{Generate a line-delimited JSON string for use in Elasticsearch bulk updates}
\usage{
bulk_writer(x, index = "maml", varname = "updated_variable", type)
bulk_writer(x, index = "maml", varname, type, ver)
}
\arguments{
\item{x}{A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)}
@ -14,6 +14,8 @@ bulk_writer(x, index = "maml", varname = "updated_variable", type)
\item{varname}{String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)}
\item{type}{Type of updating to be done, can be either 'set', 'add', or 'addnested'}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
A string usable as Elasticsearch bulk update command, in line-delimited JSON
@ -23,8 +25,8 @@ Generate a line-delimited JSON string for use in Elasticsearch bulk updates
Type can be either one of three values:
set: set the value of [varname] to x
add: add x to the values of [varname]
varname: When using tokens, the token field will be updated instead of a computerCodes field
varname: When using ud, the ud field will be updated instead of a computerCodes field
}
\examples{
bulk_writer(x, index = 'maml', varname = 'updated_variable')
bulk_writer(x, index = 'maml')
}

@ -4,8 +4,17 @@
\alias{class_update}
\title{Classifier function for use in combination with the elasticizer function as 'update' parameter (without brackets), see elasticizer documentation for more information}
\usage{
class_update(out, localhost = T, model_final, dfm_words, varname,
es_super = .rs.askForPassword("ElasticSearch WRITE"))
class_update(
out,
localhost = T,
model_final,
varname,
text,
words,
clean,
ver,
es_super = .rs.askForPassword("ElasticSearch WRITE")
)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
@ -14,11 +23,19 @@ class_update(out, localhost = T, model_final, dfm_words, varname,
\item{model_final}{The classification model (e.g. output from textstat_nb(), svm() or others)}
\item{dfm_words}{A dfm containing all the words and only the words used to generate the model (is used for subsetting)}
\item{varname}{String containing the variable name to use for the classification result, usually has the format computerCodes.varname}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
\item{es_super}{Password for write access to ElasticSearch}
\item{dfm_words}{A dfm containing all the words and only the words used to generate the model (is used for subsetting)}
}
\value{
As this is a nested function used within elasticizer, there is no return output
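As with the other updaters, class_update is passed to elasticizer() via the update parameter. A minimal sketch follows, assuming a previously trained model object and a hypothetical selection query; all concrete values are placeholders.

# Minimal sketch (hypothetical values): classify every matched document and
# write the result back through the bulk updater. model_final is assumed to be
# an existing trained model (e.g. textmodel_nb() output).
elasticizer(
  query = query_string('country:nl'),          # hypothetical selection query
  index = "maml",
  localhost = TRUE,
  update = class_update,
  model_final = model_final,                   # trained classifier (assumed to exist)
  varname = "computerCodes.junk",              # hypothetical target field
  text = "ud_upos",                            # lemma + upos text representation
  words = "999",                               # keep whole documents
  clean = FALSE,
  ver = "nbV1",                                # hypothetical version string
  es_super = .rs.askForPassword("ElasticSearch WRITE")
)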

@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cv_generator.R
\name{cv_generator}
\alias{cv_generator}
\title{Generate CV folds for nested cross-validation}
\usage{
cv_generator(outer_k, inner_k, vec, grid, seed)
}
\arguments{
\item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1, holdout sampling is used, with outer_k giving the proportion of the data used as test data}
\item{inner_k}{Number of inner CV (parameter optimization) folds}
\item{vec}{Vector containing the true values of the classification}
\item{grid}{Parameter grid for optimization}
\item{seed}{Integer used as seed for random number generation}
}
\value{
A nested set of lists with row numbers
}
\description{
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
}
\examples{
cv_generator(outer_k, inner_k, vec, grid, seed)
}
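A minimal sketch of fold generation, assuming an existing labeled dfm; the class-label column, cutoffs and fold counts are hypothetical, and the grid is built with tidyr::crossing() as suggested in the modelizer documentation below.

library(tidyr)     # for crossing()
library(quanteda)  # for docvars()

labels <- docvars(dfm, "junk")                   # assumed class-label column in an existing dfm
grid   <- crossing(percentiles = c(0.90, 0.95),  # hypothetical feature-selection cutoffs
                   measures    = "chi2")         # keyness measure
folds  <- cv_generator(outer_k = 3,              # outer folds for performance estimation
                       inner_k = 5,              # inner folds for parameter optimization
                       vec     = labels,
                       grid    = grid,
                       seed    = 42)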

@ -4,14 +4,30 @@
\alias{dfm_gen}
\title{Generates dfm from ElasticSearch output}
\usage{
dfm_gen(out, words = "999", text = "lemmas")
dfm_gen(
out,
words = "999",
text = "lemmas",
clean,
tolower = T,
binary = F,
ngrams = 1
)
}
\arguments{
\item{out}{The elasticizer-generated data frame}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{text}{String indicating whether the "merged" field will contain the "full" text, or "lemmas"}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or ud_upos combining lemmas with upos tags}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
\item{tolower}{Boolean indicating whether dfm features should be lowercased}
\item{binary}{Boolean indicating whether or not to generate a binary dfm (only indicating term presence, not count). Defaults to FALSE}
\item{ngrams}{Numeric, if higher than 1, generates ngrams of the given size. Defaults to 1}
}
\value{
A Quanteda dfm

@ -4,8 +4,17 @@
\alias{dupe_detect}
\title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
\usage{
dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
words, localhost = T)
dupe_detect(
row,
grid,
cutoff_lower,
cutoff_upper = 1,
es_pwd,
es_super,
words,
localhost = T,
ver
)
}
\arguments{
\item{row}{Row of grid to parse}
@ -23,6 +32,8 @@ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
\item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])}
\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
dupe_objects.json and a data frame containing each id and all of its duplicates; remove_ids.txt and a character vector listing the ids to be removed. Files are written to the current working directory

@ -4,9 +4,20 @@
\alias{elasticizer}
\title{Generate a data frame out of unparsed Elasticsearch JSON}
\usage{
elasticizer(query, src = T, index = "maml",
es_pwd = .rs.askForPassword("Elasticsearch READ"), update = NULL,
localhost = F, ...)
elasticizer(
query,
src = T,
index = "maml",
es_user,
es_pwd = .rs.askForPassword("Elasticsearch READ"),
batch_size = 1024,
max_batch = Inf,
time_scroll = "5m",
dump = F,
update = NULL,
localhost = F,
...
)
}
\arguments{
\item{query}{A JSON-formatted query in the Elasticsearch query DSL}
@ -15,6 +26,18 @@ elasticizer(query, src = T, index = "maml",
\item{index}{The name of the Elasticsearch index to search through}
\item{es_user}{Username used to connect, defaults to 'es'}
\item{es_pwd}{The password for Elasticsearch read access}
\item{batch_size}{Number of documents to retrieve per batch}
\item{max_batch}{Maximum number of batches to retrieve}
\item{time_scroll}{Time to keep the scroll instance open (defaults to 5m, with a maximum of 500 allowed instances, so a maximum of 100 per minute)}
\item{dump}{Boolean indicating whether the data frames should be returned, or dumped as .Rds files}
\item{update}{When set, indicates an update function to use on each batch of retrieved articles}
\item{...}{Parameters passed on to the update function}
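A minimal retrieval sketch, with a hypothetical query and the default credentials prompt; it assumes a reachable Elasticsearch instance.

out <- elasticizer(
  query       = query_string('country:nl AND junk:0'),  # hypothetical query
  src         = TRUE,                                    # return the _source fields
  index       = "maml",
  es_user     = "es",
  es_pwd      = .rs.askForPassword("Elasticsearch READ"),
  batch_size  = 1024,                                    # documents per scroll batch
  max_batch   = Inf,                                     # retrieve everything that matches
  time_scroll = "5m",
  localhost   = FALSE
)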

@ -0,0 +1,41 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimator.R
\name{estimator}
\alias{estimator}
\title{Generate models and get classifications on test sets}
\usage{
estimator(
row,
grid,
outer_folds,
inner_folds,
dfm,
class_type,
model,
we_vectors
)
}
\arguments{
\item{row}{Row number of current item in grid}
\item{grid}{Grid with model parameters and CV folds}
\item{outer_folds}{List with row numbers for outer folds}
\item{inner_folds}{List with row numbers for inner folds}
\item{dfm}{DFM containing labeled documents}
\item{class_type}{Name of column in docvars() containing the classes}
\item{model}{Model to use (currently only nb)}
\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
Depending on the mode: if folds are included, returns the true and predicted classes of the test set, together with the parameters, the model and the model idf; when no folds are provided, returns the final model and idf values.
}
\description{
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
}
\examples{
estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors)
}
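A minimal sketch iterating estimator over grid rows; it assumes a grid and fold object from cv_generator(), and the element names outer_folds/inner_folds are assumptions, as is passing NULL to skip word embeddings.

results <- lapply(seq_len(nrow(grid)), function(i) {
  estimator(row         = i,
            grid        = grid,                # grid with parameters and CV folds
            outer_folds = folds$outer_folds,   # assumed element name
            inner_folds = folds$inner_folds,   # assumed element name
            dfm         = dfm,
            class_type  = "junk",              # hypothetical class column
            model       = "nb",
            we_vectors  = NULL)                # assuming NULL skips word embeddings
})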

@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/feat_select.R
\name{feat_select}
\alias{feat_select}
\title{Select features using quanteda textstat_keyness}
\usage{
feat_select(topic, dfm, class_type, percentile, measure = "chi2")
}
\arguments{
\item{topic}{The topic to determine keywords for}
\item{dfm}{The input dfm}
\item{class_type}{Name of the column in docvars containing the classification}
\item{percentile}{Cutoff for the list of words that should be returned}
\item{measure}{Measure to use in determining keyness, default = chi2; see textstat_keyness for other options}
}
\value{
A vector of words that are key to the topic
}
\description{
Select features based on the textstat_keyness function and a percentile cutoff
Percentiles are based on absolute values, i.e. on both words that are key and words that are *not* key to the topic
}
\examples{
feat_select(topic, dfm, class_type, percentile, measure="chi2")
}
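A minimal sketch with hypothetical topic and column names, assuming an existing labeled dfm.

keywords <- feat_select(topic      = "economy",  # hypothetical topic label
                        dfm        = dfm,        # labeled dfm (assumed to exist)
                        class_type = "codes",    # docvars column with the classification
                        percentile = 0.95,       # keep the top 5 percent by absolute keyness
                        measure    = "chi2")
head(keywords)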

@ -0,0 +1,30 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lemma_writer.R
\name{lemma_writer}
\alias{lemma_writer}
\title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
\usage{
lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
}
\arguments{
\item{out}{The elasticizer-generated data frame}
\item{file}{The file to write the output to (including path, when required). When documents = T, provide path including trailing /}
\item{localhost}{Unused, but defaults to FALSE}
\item{documents}{Indicate whether the writer should output to a single file, or individual documents}
\item{lemma}{Indicate whether document output should be lemmas or original document}
\item{cores}{Indicate the number of cores to use for parallel processing}
}
\value{
Text output file(s) written to the location specified by *file*; no data is returned
}
\description{
Generates text output files (without punctuation) for external applications, such as GloVe embeddings
}
\examples{
lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
}

@ -4,14 +4,14 @@
\alias{merger}
\title{Merges list of lemmas back into a pseudo-document}
\usage{
merger(row, out = out)
merger(out, text, clean)
}
\arguments{
\item{row}{A row number form the Elasticizer-generated data frame}
\item{out}{The elasticizer-generated data frame}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
}
\value{
A documentified string of lemmas, one document at a time
@ -20,5 +20,5 @@ A documentified string of lemmas, one document at a time
Merges list of lemmas back into a pseudo-document
}
\examples{
merger(1, words = '999', out = out)
merger(out, text, clean)
}

@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/metric_gen.R
\name{metric_gen}
\alias{metric_gen}
\title{Generate performance statistics for models}
\usage{
metric_gen(x)
}
\arguments{
\item{x}{A data frame containing at least the columns "pred" and "tv"}
}
\value{
x, with additional columns for performance metrics
}
\description{
Generate performance statistics for models, based on their predictions and the true values
}
\examples{
metric_gen(x)
}
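A minimal sketch with toy data, showing the expected "pred" and "tv" columns.

x <- data.frame(
  pred = c(1, 0, 1, 1, 0),   # hypothetical predicted classes
  tv   = c(1, 0, 0, 1, 0)    # hypothetical true values
)
metric_gen(x)                # returns x with added performance metric columns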

@ -4,28 +4,27 @@
\alias{modelizer}
\title{Generate a classification model}
\usage{
modelizer(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
outer_k, inner_k, model, class_type, opt_measure, country, grid)
modelizer(
dfm,
outer_k,
inner_k,
class_type,
opt_measure,
country,
grid,
seed,
model,
we_vectors,
cores = 1
)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}
\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)}
\item{cores_inner}{Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)}
\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)}
\item{seed}{Integer to use as seed for random number generation, ensures replicability}
\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}
\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}
\item{opt_measure}{Label of measure in confusion matrix to use as performance indicator}
@ -33,9 +32,17 @@ modelizer(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed,
\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)}
\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
\item{seed}{Integer to use as seed for random number generation, ensures replicability}
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
\item{we_vectors}{Matrix with word embedding vectors}
\item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1}
}
\value{
An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
A list containing all relevant output
}
\description{
Generate a nested cross validated classification model based on a dfm with class labels as docvars
@ -47,5 +54,5 @@ For Naïve Bayes, the following parameters can be used:
- measures (what measure to use for determining feature importance, see textstat_keyness for options)
}
\examples{
modelizer(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, we_vectors, cores = 1)
}
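An end-to-end sketch under stated assumptions: `out` is elasticizer() output whose class labels end up as docvars after dfm_gen(), and the grid, country code and optimization measure are hypothetical placeholders.

library(tidyr)  # for crossing()

dfm  <- dfm_gen(out, words = "999", text = "ud_upos", clean = TRUE)
grid <- crossing(percentiles = c(0.90, 0.95),   # tf-idf feature-selection cutoffs
                 measures    = "chi2")          # keyness measure

res <- modelizer(dfm,
                 outer_k     = 3,
                 inner_k     = 5,
                 class_type  = "junk",
                 opt_measure = "F1",            # hypothetical confusion-matrix label
                 country     = "nl",            # used for the output filename
                 grid        = grid,
                 seed        = 42,
                 model       = "nb",
                 we_vectors  = NULL,            # no word embeddings in this sketch
                 cores       = 1)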

@ -0,0 +1,64 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/modelizer_old.R
\name{modelizer_old}
\alias{modelizer_old}
\title{Generate a classification model}
\usage{
modelizer_old(
dfm,
cores_outer,
cores_grid,
cores_inner,
cores_feats,
seed,
outer_k,
inner_k,
model,
class_type,
opt_measure,
country,
grid
)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}
\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}
\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)}
\item{cores_inner}{Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)}
\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores outer, cores_grid and cores_inner)}
\item{seed}{Integer to use as seed for random number generation, ensures replicability}
\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}
\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}
\item{opt_measure}{Label of measure in confusion matrix to use as performance indicator}
\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)}
\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
}
\value{
An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
}
\description{
Generate a nested cross validated classification model based on a dfm with class labels as docvars
Currently only supports Naïve Bayes using quanteda's textmodel_nb
Hyperparameter optimization is enabled through the grid parameter
A grid should be generated from vectors with the labels as described for each model, using the crossing() command
For Naïve Bayes, the following parameters can be used:
- percentiles (cutoff point for tf-idf feature selection)
- measures (what measure to use for determining feature importance, see textstat_keyness for options)
}
\examples{
modelizer_old(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
}

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/out_parser.R
\name{out_parser}
\alias{out_parser}
\title{Parse raw text into a single field}
\usage{
out_parser(out, field, clean = F)
}
\arguments{
\item{out}{The original output data frame}
\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)}
}
\value{
A parsed output data frame including the additional column 'merged', containing the merged text
}
\description{
Parse raw text from the MaML database into a single field
}
\examples{
out_parser(out, field)
}

@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preproc.R
\name{preproc}
\alias{preproc}
\title{Preprocess dfm data for use in modeling procedure}
\usage{
preproc(dfm_train, dfm_test = NULL, params, we_vectors)
}
\arguments{
\item{dfm_train}{Training dfm}
\item{dfm_test}{Testing dfm if applicable, otherwise NULL}
\item{params}{Row from grid with parameter optimization}
\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
List with dfm_train and dfm_test, processed according to parameters in params
}
\description{
Process dfm according to parameters provided in params
}
\examples{
preproc(dfm_train, dfm_test = NULL, params, we_vectors)
}

@ -4,12 +4,14 @@
\alias{query_gen_actors}
\title{Generate actor search queries based on data in actor db}
\usage{
query_gen_actors(actor, country)
query_gen_actors(actor, pre_tags, post_tags)
}
\arguments{
\item{actor}{A row from the output of elasticizer() when run on the 'actor' index}
\item{country}{2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.}
\item{pre_tags}{Highlighter pre-tag}
\item{post_tags}{Highlighter post-tag}
}
\value{
A data frame containing the queries, related actor ids and actor function

@ -4,10 +4,14 @@
\alias{query_string}
\title{Generate a query string query for ElasticSearch}
\usage{
query_string(x)
query_string(query, fields = F, random = F, default_operator = "AND")
}
\arguments{
\item{x}{Query string in ElasticSearch query string format}
\item{query}{Query string in ElasticSearch query string format}
\item{fields}{List of field names to return, defaults to all}
\item{random}{Return randomized results. Boolean, defaults to FALSE}
\item{default_operator}{Default Boolean operator between terms in the query string, defaults to "AND"}
}
\value{
A formatted ElasticSearch query string query
@ -16,5 +20,5 @@ A formatted ElasticSearch query string query
Generate a query string query for ElasticSearch
}
\examples{
query_string(x)
query_string(query)
}
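A minimal sketch with hypothetical query text, feeding the generated query to elasticizer().

q <- query_string('country:nl AND junk:0',   # hypothetical ES query string syntax
                  fields = FALSE,            # return all fields
                  random = FALSE,
                  default_operator = "AND")
out <- elasticizer(q, index = "maml", localhost = FALSE)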

@ -0,0 +1,37 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sent_merger.R
\name{sent_merger}
\alias{sent_merger}
\title{Aggregate sentence-level dataset containing sentiment (from sentencizer())}
\usage{
sent_merger(
df,
actors_meta = NULL,
actor_groups = NULL,
pos_cutoff = NULL,
neg_cutoff = NULL,
single_party = F
)
}
\arguments{
\item{df}{Data frame with actor ids, produced by sentencizer}
\item{actors_meta}{Optional data frame containing actor metadata obtained using elasticizer(index="actors")}
\item{actor_groups}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}
\item{pos_cutoff}{Optional value above which sentence-level sentiment scores should be considered "positive"}
\item{neg_cutoff}{Optional value below which sentence-level sentiment scores should be considered "negative"}
\item{single_party}{Boolean to generate data only from sentences in which a single party is mentioned, defaults to FALSE}
}
\value{
When actor_groups is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When actor_groups is provided, returns aggregations for each vector in the list
}
\description{
Aggregate sentence-level dataset containing sentiment (from sentencizer())
}
\examples{
sent_merger(df, actors_meta = NULL, actor_groups = NULL)
}
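A minimal sketch under stated assumptions: `df` is sentencizer() output, `actors_meta` comes from elasticizer(index = "actors"), and the party ids and cutoff values are hypothetical.

agg <- sent_merger(df,
                   actors_meta  = actors_meta,
                   actor_groups = list(left_bloc = c("p_001", "p_002")),  # hypothetical party ids
                   pos_cutoff   = 0.1,       # hypothetical positivity threshold
                   neg_cutoff   = -0.1,      # hypothetical negativity threshold
                   single_party = FALSE)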

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sentencizer.R
\name{sentencizer}
\alias{sentencizer}
\title{Generate sentence-level dataset with sentiment and actor presence}
\usage{
sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
}
\arguments{
\item{out}{Data frame produced by elasticizer}
\item{sent_dict}{Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.}
\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
}
\value{
No return value, data per batch is saved in an RDS file
}
\description{
Generate sentence-level dataset with sentiment and actor presence
}
\examples{
sentencizer(out, sent_dict = NULL, validation = F)
}
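A minimal sketch of the sent_dict format described above, with hypothetical dictionary entries, and a hedged example of running sentencizer as an elasticizer() update function (the wiring is an assumption based on the out argument and the per-batch RDS output).

sent_dict <- data.frame(
  lem_u = c("goed_ADJ", "slecht_ADJ", "crisis_NOUN"),  # hypothetical lemma_upos pairs
  prox  = c(1, -1, -0.5)                               # hypothetical word values
)
elasticizer(query_string('country:nl'),   # hypothetical query
            index = "maml",
            update = sentencizer,
            sent_dict = sent_dict,
            validation = FALSE)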

@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ud_update.R
\name{ud_update}
\alias{ud_update}
\title{Generate UDpipe output from base text}
\usage{
ud_update(file, wd, ud_file, ver)
}
\arguments{
\item{file}{Filename of file to read in, also used for generating output file name}
\item{wd}{Working directory where *file*s can be found}
\item{ud_file}{Filename of udpipe model to use, should be in *wd*}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
A vector of 1's indicating the success of each update call
}
\description{
Generate UDpipe output from base text
}
\examples{
ud_update(file, wd, ud_file, ver)
}
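A minimal sketch with hypothetical file names and paths; the input file is assumed to be a dumped batch located in *wd*, alongside the udpipe model file.

ud_update(file    = "batch_001.Rds",              # hypothetical dumped batch file in wd
          wd      = "~/maml_dump/",               # directory containing the file
          ud_file = "dutch-alpino-ud-2.4.udpipe", # hypothetical udpipe model in wd
          ver     = "udV2")                       # version string written to documents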