Compare commits
218 Commits
@@ -1,3 +1,5 @@
 .Rproj.user
 .Rhistory
 .RData
+*.RData
+*.Rds
@@ -1,10 +1,24 @@
 # Generated by roxygen2: do not edit by hand
 
+export(actorizer)
 export(bulk_writer)
 export(class_update)
+export(cv_generator)
 export(dfm_gen)
 export(dupe_detect)
 export(elastic_update)
 export(elasticizer)
+export(estimator)
+export(feat_select)
+export(lemma_writer)
 export(merger)
+export(metric_gen)
 export(modelizer)
+export(modelizer_old)
+export(out_parser)
+export(preproc)
+export(query_gen_actors)
+export(query_string)
+export(sent_merger)
+export(sentencizer)
+export(ud_update)
@@ -0,0 +1,278 @@
### Notes:
# Do you want to search for either one OR other actorid, or both occurring in the same document?
# Do you want to keep only the occurrences of the actorids you are searching for, or all actor occurrences in the hits?
# Search by actorId, then aggregate by month
# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly
# Develop query generator for specific actors (i.e. combine actorId with start and end dates)


#' Generate aggregated actor measures from raw data
#'
#' Generate aggregated actor measures from raw data
#' @param row The row of the actors data frame used for aggregation
#' @param actors The data frame containing actor data
#' @param es_pwd The password for read access to ES
#' @param localhost Boolean indicating if the script is running locally or not
#' @param default_operator String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR
#' @return No return value, data per actor is saved in an RDS file
#' @export
#' @examples
#' actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR', sent_dict = NULL, cores = detectCores()) {
  ### Functions
  aggregator <- function(id, duplicates) {
    df <- duplicates %>%
      filter(`_id` == id) %>%
      group_by(`_id`) %>%
      summarise(
        `_source.doctype` = first(`_source.doctype`),
        `_source.publication_date` = first(`_source.publication_date`),
        # actor_end = list(sort(unique(unlist(actor_end)))),
        prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
        sentence_id = list(sort(unique(unlist(sentence_id)))),
        rel_first = list(max(unlist(rel_first))),
        # sentence_end = list(sort(unique(unlist(sentence_end)))),
        # actor_start = list(sort(unique(unlist(actor_start)))),
        ids = list(unique(unlist(ids))),
        # sentence_start = list(sort(unique(unlist(sentence_start)))),
        occ = list(length(unique(unlist(sentence_id)))),
        first = list(min(unlist(sentence_id)))
      )
    return(df)
  }
  ### Calculate sentiment scores for each actor-document
  par_sent <- function(row, out, sent_dict) {
    out_row <- out[row,]
    ### Contains sentiment per sentence for whole article
    sentiment_ud <- out_row$`_source.ud`[[1]] %>%
      select(-one_of('exists')) %>%
      unnest() %>%
      filter(upos != 'PUNCT') %>% # For getting proper word counts
      mutate(V1 = str_c(lemma,'_',upos)) %>%
      left_join(sent_dict, by = 'V1') %>%
      ### Setting binary sentiment as unit of analysis
      mutate(V2 = V3) %>%
      group_by(sentence_id) %>%
      mutate(
        V2 = case_when(
          is.na(V2) == T ~ 0,
          TRUE ~ V2
        )
      ) %>%
      summarise(sent_sum = sum(V2),
                words = length(lemma),
                sent_words = length(na.omit(V3))) %>%
      mutate(
        sent = sent_sum/words,
        arousal = sent_words/words
      )
    out_row <- select(out_row, -`_source.ud`)
    ### Contains sentiment per sentence for actor
    actor_tone <- filter(sentiment_ud, sentence_id %in% unlist(out_row$sentence_id))

    ### Aggregated sentiment per actor (over all sentences containing actor)
    actor <- summarise(actor_tone,
                       sent = sum(sent_sum)/sum(words),
                       sent_sum = sum(sent_sum),
                       sent_words = sum(sent_words),
                       words = sum(words),
                       arousal = sum(sent_words)/sum(words)
    )

    ### Aggregated sentiment per article (over all sentences in article)
    text <- summarise(sentiment_ud,
                      sent = sum(sent_sum)/sum(words),
                      sent_sum = sum(sent_sum),
                      sent_words = sum(sent_words),
                      words = sum(words),
                      arousal = sum(sent_words)/sum(words)
    )
    return(cbind(out_row,data.frame(actor = actor, text = text)))
  }

  ### Creating aggregate measures at daily, weekly, monthly and yearly level
  grouper <- function(level, out, actorids, sent = F) {
    by_newspaper <- out %>%
      mutate(
        sentence_count = round(unlist(occ)/unlist(prom))
      ) %>%
      group_by_at(vars(level, `_source.doctype`)) %>%
      summarise(
        occ = sum(unlist(occ)),
        prom_art = mean(unlist(prom)),
        rel_first_art = mean(unlist(rel_first)),
        first = mean(unlist(first)),
        sentence_count = sum(sentence_count),
        articles = length(`_id`),
        level = level
      ) %>%
      ungroup()
    aggregate <- out %>%
      mutate(
        sentence_count = round(unlist(occ)/unlist(prom))
      ) %>%
      group_by_at(vars(level)) %>%
      summarise(
        occ = sum(unlist(occ)),
        prom_art = mean(unlist(prom)),
        rel_first_art = mean(unlist(rel_first)),
        first = mean(unlist(first)),
        sentence_count = sum(sentence_count),
        articles = length(`_id`),
        `_source.doctype` = 'agg',
        level = level
      ) %>%
      ungroup()

    if(sent == T) {
      by_newspaper_sent <- out %>%
        group_by_at(vars(level, `_source.doctype`)) %>%
        summarise(
          actor.sent = mean(actor.sent),
          actor.sent_sum = sum(actor.sent_sum),
          actor.sent_words = sum(actor.sent_words),
          actor.words = sum(actor.words),
          actor.arousal = mean(actor.arousal),
          text.sent = mean(text.sent),
          text.sent_sum = sum(text.sent_sum),
          text.sent_words = sum(text.sent_words),
          text.words = sum(text.words),
          text.arousal = mean(text.arousal)
        ) %>%
        ungroup() %>%
        select(-level,-`_source.doctype`)
      aggregate_sent <- out %>%
        group_by_at(vars(level)) %>%
        summarise(
          actor.sent = mean(actor.sent),
          actor.sent_sum = sum(actor.sent_sum),
          actor.sent_words = sum(actor.sent_words),
          actor.words = sum(actor.words),
          actor.arousal = mean(actor.arousal),
          text.sent = mean(text.sent),
          text.sent_sum = sum(text.sent_sum),
          text.sent_words = sum(text.sent_words),
          text.words = sum(text.words),
          text.arousal = mean(text.arousal)
        ) %>%
        ungroup() %>%
        select(-level)
      aggregate <- bind_cols(aggregate,aggregate_sent)
      by_newspaper <- bind_cols(by_newspaper, by_newspaper_sent)
    }
    output <- bind_rows(by_newspaper, aggregate) %>%
      bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),]) %>%
      select(
        -`_index`,
        -`_type`,
        -`_score`,
        -`_id`,
        -contains('Search'),
        -contains('not')
      )
    colnames(output) <- gsub("_source.",'', colnames(output))
    return(output)
  }
  ###########################################################################################
  plan(multiprocess, workers = cores)
  if (is.null(sent_dict) == F) {
    fields <- c('ud','computerCodes.actorsDetail', 'doctype', 'publication_date')
  } else {
    fields <- c('computerCodes.actorsDetail', 'doctype', 'publication_date')
  }
  actor <- actors[row,]
  if (actor$`_source.function` == "Party"){
    years = seq(2000,2019,1)
    startDate <- '2000'
    endDate <- '2019'
  } else {
    years = c(0)
    startDate <- actor$`_source.startDate`
    endDate <- actor$`_source.endDate`
  }
  if (actor$`_source.function` == 'Party' && actor$party_only == T) {
    actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'))
  } else if (actor$`_source.function` == 'Party') {
    actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'), paste0(actor$`_source.partyId`,'_a'))
    actor$party_only <- F
  } else {
    actorids <- actor$`_source.actorId`
    actor$party_only <- NULL
  }

  actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {
    if (year > 0) {
      query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31]')
    } else {
      query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,']')
    }

    ### Temporary exception for UK missing junk coding
    if (actor$`_source.country` != 'uk') {
      query <- paste0(query,' && computerCodes.junk:0')
    }

    out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query),
                                    fields = fields, default_operator = default_operator),
                       localhost = localhost,
                       es_pwd = es_pwd)
    if (length(out$`_id`) > 0 ) {
      if (is.null(sent_dict) == F) {
        out_ud <- out %>% select(`_id`,`_source.ud`)
        out <- out %>% select(-`_source.ud`)
      }
      ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
      out <- out %>%
        unnest(`_source.computerCodes.actorsDetail`, .preserve = colnames(.)) %>%
        unnest(ids, .preserve = colnames(.)) %>%
        filter(ids1 %in% actorids) # %>%
        # select(-ends_with('start')) %>%
        # select(-ends_with('end')) %>%
        # select(-starts_with('ids'))

      ### Only if there are more rows than articles, recalculate
      if (length(unique(out$`_id`)) != length(out$`_id`)) {
        duplicates <- out[(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),]
        actor_single <- out[!(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),]
        art_id <- unique(duplicates$`_id`)
        dupe_merged <- bind_rows(future_lapply(art_id, aggregator, duplicates = duplicates))
        out <- bind_rows(dupe_merged, actor_single)
      }

      if (is.null(sent_dict) == F) {
        out <- left_join(out, out_ud, by = '_id')
        out <- bind_rows(future_lapply(seq(1,nrow(out),1),par_sent, out = out, sent_dict = sent_dict))
      }

      ### Creating date grouping variables
      out <- out %>%
        mutate(
          year = strftime(`_source.publication_date`, format = '%Y'),
          yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
          yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
          yearweek = strftime(`_source.publication_date`, format = "%Y%V")
        ) %>%
        select(
          -`_score`,
          -`_index`,
          -`_type`,
          -`_source.computerCodes.actorsDetail`,
          -ids1
        )
      levels <- c('year','yearmonth','yearmonthday','yearweek')
      aggregate_data <- bind_rows(lapply(levels, grouper, out = out, actorids = actorids, sent = !is.null(sent_dict)))
      return(aggregate_data)
    } else {
      return()
    }
  }
  saveRDS(bind_rows(lapply(years, actor_aggregator, query, actor, actorids, default_operator, localhost, es_pwd)), file = paste0(actor$`_source.country`,'_',paste0(actorids,collapse = ''),actor$`_source.function`,startDate,endDate,'.Rds'))
  print(paste0('Done with ',row,'/',nrow(actors),' actors'))
  return()
}
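
Usage sketch (illustration only; the `actors` data frame and `es_pwd` value are assumed to already exist in the session). Note that no `query` object needs to be defined before the lapply() call at the bottom of the function: R evaluates arguments lazily, and actor_aggregator() assigns `query` before ever reading it.

# Hypothetical driver loop: one call per row of the actors data frame; each
# call saves a <country>_<actorids><function><startDate><endDate>.Rds file in
# the working directory and prints its progress.
lapply(seq_len(nrow(actors)), actor_aggregation,
       actors = actors,
       es_pwd = es_pwd,
       localhost = FALSE,
       default_operator = 'OR')
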
@@ -0,0 +1,188 @@
#' Generate actor data frames (with sentiment) from database
#'
#' Generate actor data frames (with sentiment) from database
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.
#' @param actor_ids Optional vector containing the actor ids to be collected
#' @param cores Number of threads to use for parallel processing
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' actor_fetcher(out, sent_dict = NULL, cores = 1)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, localhost = NULL, validation = F) {
  plan(multiprocess, workers = cores)
  ### Functions
  ### Calculate sentiment scores for each actor-document
  sent_scorer <- function(row, out_row, ud_sent) {
    ### Contains sentiment per sentence for actor
    actor_tone <- filter(ud_sent, sentence_id %in% unlist(out_row[row,]$sentence_id))

    ### Aggregated sentiment per actor (over all sentences containing actor)
    actor <- summarise(actor_tone,
                       sent = sum(sent_sum)/sum(words),
                       sent_sum = sum(sent_sum),
                       sent_words = sum(sent_words),
                       words = sum(words),
                       arousal = sum(sent_words)/sum(words)
    )
    return(cbind(out_row[row,],data.frame(actor = actor)))
  }

  aggregator <- function(pid, dupe_df) {
    ### Party ids excluding actors
    p_ids <- c(str_c(pid,'_f'),str_c(pid,'_s'))
    ### Party ids including actors
    p_ids_a <- c(p_ids,str_c(pid,'_a'))
    summarizer <- function(p_ids, dupe_df, merged_id) {
      id <- dupe_df$`_id`[[1]]
      dupe_df <- dupe_df %>%
        filter(ids %in% p_ids)
      if (nrow(dupe_df) > 0) {
        return(
          dupe_df %>% summarise(
            `_id` = first(`_id`),
            `_source.doctype` = first(`_source.doctype`),
            `_source.publication_date` = first(`_source.publication_date`),
            prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
            sentence_id = list(sort(unique(unlist(sentence_id)))),
            rel_first = list(max(unlist(rel_first))),
            ids = merged_id,
            occ = list(length(unique(unlist(sentence_id)))),
            first = list(min(unlist(sentence_id))),
            actor_start = list(sort(unique(unlist(actor_start)))),
            actor_end = list(sort(unique(unlist(actor_end)))),
            sentence_start = list(sort(unique(unlist(sentence_start)))),
            sentence_end = list(sort(unique(unlist(sentence_end))))
          )
        )
      } else {
        print(paste0('id:',id))
        return(NULL)
      }
    }
    party <- summarizer(p_ids, dupe_df, str_c(pid,'_mfs'))
    party_actor <- summarizer(p_ids_a, dupe_df, str_c(pid,'_mfsa'))
    return(bind_rows(party,party_actor))
  }

  par_sent <- function(row, out, sent_dict = NULL) {
    out_row <- out[row,]
    ### Generating sentence-level sentiment scores from ud
    if (is.null(sent_dict) == F) {
      ud_sent <- out_row$`_source.ud`[[1]] %>%
        select(-one_of('exists')) %>%
        unnest() %>%
        filter(upos != 'PUNCT') # For getting proper word counts
      if ("lem_u" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          mutate(lem_u = str_c(lemma,'_',upos)) %>%
          left_join(sent_dict, by = 'lem_u')
      } else if ("lemma" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          left_join(sent_dict, by = 'lemma') %>%
          mutate(lem_u = lemma)
      }
      ud_sent <- ud_sent %>%
        group_by(sentence_id) %>%
        mutate(
          prox = case_when(
            is.na(prox) == T ~ 0,
            TRUE ~ prox
          )
        ) %>%
        summarise(sent_sum = sum(prox),
                  words = length(lemma),
                  sent_words = sum(prox != 0),
                  sent_lemmas = list(lem_u[prox != 0])) %>%
        mutate(
          sent = sent_sum/words,
          arousal = sent_words/words
        )
      out_row <- select(out_row, -`_source.ud`)
    }

    if (validation == T) {
      codes_sent <- filter(ud_sent, sentence_id == out_row$`_source.codes.sentence.id`[1])
      return(cbind(out_row, codes = codes_sent))
    }

    ### Unnest out_row to individual actor ids
    out_row <- out_row %>%
      unnest(`_source.computerCodes.actorsDetail`) %>%
      mutate(ids_list = ids) %>%
      unnest(ids) %>%
      mutate(
        pids = str_sub(ids, start = 1, end = -3)
      )

    if (!is.null(actor_ids)) {
      out_row <- filter(out_row, ids %in% actor_ids)
    }

    ### Get list of party ids occurring more than once in the document
    pids_table <- table(out_row$pids)
    dupe_pids <- names(pids_table[pids_table > 1]) %>%
      str_subset(pattern = fixed('P_'))
    single_pids <- names(pids_table[pids_table <= 1]) %>%
      str_subset(pattern = fixed('P_'))
    ### Data frame containing only duplicate party ids
    dupe_df <- out_row %>%
      filter(pids %in% dupe_pids)
    ### Data frame containing only single party ids
    single_df <- out_row %>%
      filter(pids %in% single_pids)

    ### Data frame for single occurrence mfsa
    single_party_actor <- single_df %>%
      mutate(
        ids = str_c(pids,'_mfsa')
      )
    ### Data frame for single occurrence mfs
    single_party <- single_df %>%
      filter(!endsWith(ids, '_a')) %>%
      mutate(
        ids = str_c(pids,'_mfs')
      )
    out_row <- out_row %>%
      filter(startsWith(ids,'A_')) %>%
      bind_rows(., single_party, single_party_actor)
    ### For each of the party ids in the list above, aggregate to _mfs and _mfsa
    if (length(dupe_pids) > 0) {
      aggregate <- bind_rows(lapply(dupe_pids, aggregator, dupe_df = dupe_df))
      out_row <- bind_rows(out_row, aggregate)
    }

    ### Generating sentiment scores for article and actors
    if (is.null(sent_dict) == F) {
      ### Aggregated sentiment per article (over all sentences in article)
      text_sent <- summarise(ud_sent,
                             sent = sum(sent_sum)/sum(words),
                             sent_sum = sum(sent_sum),
                             sent_words = sum(sent_words),
                             words = sum(words),
                             arousal = sum(sent_words)/sum(words)
      )
      out_row <- bind_rows(lapply(seq(1,nrow(out_row),1),sent_scorer, out_row = out_row, ud_sent = ud_sent)) %>%
        cbind(., text = text_sent)
    }
    out_row <- out_row %>%
      mutate(
        year = strftime(`_source.publication_date`, format = '%Y'),
        yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
        yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
        yearweek = strftime(`_source.publication_date`, format = "%Y%V")
      ) %>%
      select(#-`_source.computerCodes.actorsDetail`,
        -`_score`,
        -`_index`,
        -`_type`,
        -pids)
    return(out_row)
  }
  saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
  return()
}
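
For reference, a minimal sketch of the sent_dict format documented above (the words and values are invented): either a lem_u column with lemma_upos pairs or a plain lemma column, plus a prox column holding word values or NA.

# Hypothetical dictionary entries; prox is the per-word sentiment value, NA
# where no value applies (treated as 0 by par_sent() above).
sent_dict <- data.frame(
  lem_u = c('good_ADJ', 'crisis_NOUN', 'increase_VERB'),
  prox  = c(1, -1, NA),
  stringsAsFactors = FALSE
)
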
@@ -0,0 +1,152 @@
#' Updater function for elasticizer: Conduct actor searches
#'
#' Updater function for elasticizer: Conduct actor searches
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ids List of actor ids
#' @param prefix Regex containing prefixes that should be excluded from hits
#' @param postfix Regex containing postfixes that should be excluded from hits
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
#' @examples
#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer_old <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) {
  plan(multiprocess, workers = cores)
  ### Function to filter out false positives using regex
  exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex, pre_tags, post_tags, prefix, postfix) {
    min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
    max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
    split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
    min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
    max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g. if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
    sentence <- paste0(' ',str_sub(doc$merged, min, max),' ') # Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "(")

    # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
    if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
      return(id)
    } else {
      return(NULL)
    }
  }
  ranger <- function(x, ud) {
    return(which((ud$start <= x) & (ud$end >= x)))
  }
  sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
    doc <- out[row,]
    if (nchar(doc$merged) > 990000) {
      return(
        data.frame(
          err = T,
          errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
        )
      )
    }
    # Extracting ud output from document
    ud <- doc$`_source.ud`[[1]] %>%
      select(-one_of('exists')) %>% # Removing ud.exists variable
      unnest() %>%
      mutate(doc_id = doc$`_id`)
    markers <- doc$markers[[1]][,'start'] # Extract list of markers
    # Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
    rows <- unlist(lapply(markers, ranger, ud = ud))

    # Setting up an actor variable
    ud$actor <- F
    ud$actor[rows] <- T

    sentence_count <- max(ud$sentence_id) # Number of sentences in article
    actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor

    # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
    if (!is.na(prefix) || !is.na(postfix)) {
      ### If no pre or postfixes, match *not nothing* i.e. anything
      if (is.na(prefix)) {
        prefix = '$^'
      }
      if (is.na(postfix)) {
        postfix = '$^'
      }
      sentence_ids <- unlist(lapply(actor_sentences,
                                    exceptionizer,
                                    ud = ud,
                                    doc = doc,
                                    markers = markers,
                                    pre_tags_regex = pre_tags_regex,
                                    pre_tags = pre_tags,
                                    post_tags_regex = post_tags_regex,
                                    post_tags = post_tags,
                                    prefix = prefix,
                                    postfix = postfix))
    } else {
      sentence_ids <- actor_sentences
    }
    if (length(sentence_ids) > 0) {
      # Generating nested sentence start and end positions for actor sentences
      ud <- ud %>%
        filter(sentence_id %in% sentence_ids)
      actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
      actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
      ud <- ud %>%
        group_by(sentence_id) %>%
        summarise(
          sentence_start = as.integer(min(start)),
          sentence_end = as.integer(max(end)),
          doc_id = first(doc_id)
        ) %>%
        group_by(doc_id) %>%
        summarise(
          sentence_id = list(as.integer(sentence_id)),
          sentence_start = list(sentence_start),
          sentence_end = list(sentence_end)
        )

      return(
        data.frame(ud, # Sentence id, start and end position for actor sentences
                   actor_start = I(list(actor_start)), # List of actor ud token start positions
                   actor_end = I(list(actor_end)), # List of actor ud token end positions
                   occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
                   prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurrences/total # sentences)
                   rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurrence at sentence level
                   first = min(sentence_ids), # First sentence in which actor is mentioned
                   ids = I(list(ids)) # List of actor ids
        )
      )
    } else {
      return(NULL)
    }

  }
  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
  offsetter <- function(x, pre_tags, post_tags) {
    return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
  }
  prefix[prefix==''] <- NA
  postfix[postfix==''] <- NA
  pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
  post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
  out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)

  # ids <- fromJSON(ids)
  updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer,
                                     out = out,
                                     ids = ids,
                                     postfix = postfix,
                                     prefix = prefix,
                                     pre_tags_regex = pre_tags_regex,
                                     pre_tags = pre_tags,
                                     post_tags_regex = post_tags_regex,
                                     post_tags = post_tags))
  if (nrow(updates) == 0) {
    print("Nothing to update for this batch")
    return(NULL)
  } else {
    bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
    bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
    return(elastic_update(bulk, es_super = es_super, localhost = localhost))
  }

}
@@ -0,0 +1,39 @@
#' Aggregator function, to aggregate actor results
#'
#' Aggregator function, to aggregate actor results
#' @param id Article id of the article for which actor aggregation should be done
#' @param actor_df The dataframe containing the actor data
#' @param merge_id The actorid that should be assigned to the merged result
#' @return A dataframe with the merged results
#' @export
#' @examples
#' aggregator(id, actor_df, merge_id)

aggregator <- function(id, actor_df, merge_id) {
  article <- filter(actor_df, `_id` == id) %>%
    unnest(sentence_id, .preserve = colnames(.))
  occ <- length(unlist(unique(article$sentence_id1)))
  sentence_count <- round(article$occ[[1]]/article$prom[[1]])
  prom <- occ/sentence_count
  rel_first <- 1-(min(article$sentence_id1)/sentence_count)
  actor_start <- sort(unique(unlist(article$actor_start)))
  actor_end <- sort(unique(unlist(article$actor_end)))
  sentence_start <- sort(unique(unlist(article$sentence_start)))
  sentence_end <- sort(unique(unlist(article$sentence_end)))
  sentence_id <- sort(unique(unlist(article$sentence_id)))

  return(data.frame(doc_id = first(article$`_id`),
                    sentence_id = I(list(as.integer(sentence_id))),
                    sentence_start = I(list(sentence_start)),
                    sentence_end = I(list(sentence_end)),
                    actor_start = I(list(actor_start)), # List of actor ud token start positions
                    actor_end = I(list(actor_end)), # List of actor ud token end positions
                    occ = occ, # Number of sentences in which actor occurs
                    prom = prom, # Relative prominence of actor in article (number of occurrences/total # sentences)
                    rel_first = rel_first, # Relative position of first occurrence at sentence level
                    first = min(article$sentence_id1), # First sentence in which actor is mentioned
                    ids = merge_id, # List of actor ids
                    stringsAsFactors = F
  )
  )
}
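
One step worth making explicit: aggregator() recovers the article's total sentence count from the stored occ and prom values, which works because actorizer defines prom = occ/sentence_count. A worked example with invented numbers:

# An actor occurring in 3 of 12 sentences is stored as occ = 3, prom = 0.25,
# so the sentence total can be recovered by inverting the ratio:
occ <- 3
prom <- 0.25
round(occ / prom)  # 12
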
@@ -0,0 +1,61 @@
### Notes:
# Do you want to search for either one OR other actorid, or both occurring in the same document?
# Do you want to keep only the occurrences of the actorids you are searching for, or all actor occurrences in the hits?
# Search by actorId, then aggregate by month
# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly
# Develop query generator for specific actors (i.e. combine actorId with start and end dates)


#' Generate and store aggregate actor measures to elasticsearch
#'
#' Generate and store aggregate actor measures to elasticsearch
#' @param out The output provided by elasticizer()
#' @param localhost Boolean indicating if the script should run locally, or remote
#' @param es_super Write password for ES
#' @param actorids List of actorids used in the search, should be the same as the actorids used for elasticizer()
#' @param ver String indicating the version of the update
#' @return Return value is based on output of elastic_update()
#' @export
#' @examples
#' aggregator_elastic(out, localhost = F, actorids, ver, es_super)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
aggregator_elastic <- function(out, localhost = F, actorids, ver, es_super) {
  ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
  partyid <- str_sub(actorids[1], end=-3)
  actor_df <- out %>%
    unnest() %>%
    unnest(ids, .preserve = colnames(.)) %>%
    filter(ids1 %in% actorids)

  agg_party_actors <- bind_rows(lapply(unique(actor_df$`_id`),
                                       mamlr:::aggregator,
                                       actor_df = actor_df,
                                       merge_id = paste0(partyid,'_mfsa')))

  party <- actor_df %>%
    filter(!endsWith(ids1, '_a'))
  agg_party <- bind_rows(lapply(unique(party$`_id`),
                                mamlr:::aggregator,
                                actor_df = party,
                                merge_id = paste0(partyid,'_mfs')))

  actors_only <- actor_df %>%
    filter(endsWith(ids1, '_a'))
  agg_actors <- bind_rows(lapply(unique(actors_only$`_id`),
                                 mamlr:::aggregator,
                                 actor_df = actors_only,
                                 merge_id = paste0(partyid,'_ma')))
  df_out <- bind_rows(agg_party_actors, agg_party, agg_actors)
  doc_ids <- df_out$doc_id
  df_out <- df_out %>%
    select(-1) %>%
    split(as.factor(doc_ids))
  df_out <- data.frame(doc_id = names(df_out), list = I(df_out))
  bulk <- apply(df_out, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
  return(elastic_update(bulk, es_super = es_super, localhost = localhost))
}
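
Usage sketch (illustration only; the party id, version string, and password object are placeholders): the actorids passed here should be the same _s/_f/_a variants used in the elasticizer() search, so they can be merged into the _mfs, _mfsa and _ma aggregates above.

# Hypothetical party P_123 with short, full and actor id variants.
actorids <- c('P_123_s', 'P_123_f', 'P_123_a')
aggregator_elastic(out, localhost = FALSE, actorids = actorids,
                   ver = 'actorsV1', es_super = es_super)
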
@@ -0,0 +1,47 @@
#' Merges list of lemmas back into a pseudo-document
#'
#' Merges list of lemmas back into a pseudo-document
#' @param row A row number from the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger(1, words = '999', out, text)
#################################################################################################
#################################### Reconstructing documents from lemmas########################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger_old <- function(row, out, text, clean) {
  df <- out[row,]
  # Merging lemmas into a single string
  if (text == 'lemmas') {
    lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
  }
  if (text == 'ud') {
    lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
  }
  if (text == 'ud_upos') {
    df <- unnest(df,`_source.ud`)
    lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
      # Regex removes all words consisting of or containing numbers, @#$%
      # Punctuation is not taken into account, as it is already filtered out, see above
      {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . }
    # In the very rare but obviously occurring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
    if (length(lemmas) == 0 ){
      lemmas <- ''
    }
    return(lemmas)
  }
  # Replacing $-marked punctuation with their regular forms
  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
    # Regex also used in out_parser
    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
    paste0(.,". ")
  return(lemmas)
}
@@ -0,0 +1,143 @@
#' Updater function for elasticizer: Conduct actor searches
#'
#' Updater function for elasticizer: Conduct actor searches
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ids List of actor ids
#' @param prefix Regex containing prefixes that should be excluded from hits
#' @param postfix Regex containing postfixes that should be excluded from hits
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
#' @examples
#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
  offsetter <- function(x, pre_tags, post_tags) {
    return(as.list(as.data.frame(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))))
  }

  out <- mamlr:::out_parser(out, field = 'highlight', clean = F) %>%
    ## Computing offset for first token position (some articles have a minimum token start position of 16, instead of 1 or 2)
    mutate( # Checking if the merged field starts with a whitespace character
      space = case_when(
        str_starts(merged, '\\s') ~ 1,
        T ~ 0)
    ) %>%
    unnest(cols = '_source.ud') %>%
    rowwise() %>%
    mutate(ud_min = min(unlist(start))-1-space) ## Create offset variable, subtract 1 for default token start position of 1, and subtract 1 if merged field starts with a whitespace

  print(str_c('Number of articles with minimum token start position higher than 2: ',sum(out$ud_min > 2)))
  print('Unique ud_min offset values in batch: ')
  print(unique(out$ud_min))
  prefix[prefix==''] <- NA
  postfix[postfix==''] <- NA
  pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
  post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)

  if (sum(nchar(out$merged) > 990000) > 0) {
    stop("One or more documents in this batch exceed 990000 characters")
  }
  # Extracting ud output from document
  ud <- out %>%
    select(`_id`,lemma,start,end, sentence_id,merged) %>%
    unnest(cols = colnames(.))

  sentences <- ud %>%
    group_by(`_id`, sentence_id) %>%
    summarise(
      sentence_start = min(start),
      sentence_end = max(end)
    ) %>%
    mutate(
      sentence_count = n()
    )

  out$markers <- lapply(str_locate_all(out$merged,coll(pre_tags)),offsetter, pre_tags = pre_tags, post_tags = post_tags)
  markers <- out %>%
    select(`_id`,markers, ud_min) %>%
    unnest_wider(markers) %>%
    rename(marker_start = start, marker_end = end) %>%
    unnest(colnames(.)) %>%
    ## Modifying marker start and end positions using the ud_min column (see above)
    mutate(marker_start = marker_start +(ud_min),
           marker_end = marker_end + (ud_min))


  hits <- as.data.table(ud)[as.data.table(markers), .(`_id`, lemma,x.start, start, end, x.end, sentence_id, merged), on =.(`_id` = `_id`, start <= marker_start, end >= marker_start)] %>%
    mutate(end = x.end,
           start = x.start) %>%
    select(`_id`, sentence_id, start, end,merged) %>%
    group_by(`_id`,sentence_id) %>%
    summarise(
      actor_start = I(list(start)),
      actor_end = I(list(end)),
      n_markers = length(start),
      merged = first(merged)
    ) %>%
    left_join(.,sentences, by=c('_id','sentence_id')) %>%
    ungroup %>%
    arrange(`_id`,sentence_id) %>%
    group_by(`_id`) %>%
    mutate(n_markers = cumsum(n_markers)) %>%
    mutate(
      sentence_start_tags = sentence_start+((nchar(pre_tags)+nchar(post_tags))*(lag(n_markers, default = 0))),
      sentence_end_tags = sentence_end+((nchar(pre_tags)+nchar(post_tags))*(n_markers))
    ) %>%
    mutate(
      sentence = paste0(' ',str_sub(merged, sentence_start_tags, sentence_end_tags),' ')
    ) %>%
    select(-merged) %>%
    ungroup()
  # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
  if (!is.na(prefix) || !is.na(postfix)) {
    ### If no pre or postfixes, match *not nothing* i.e. anything
    if (is.na(prefix)) {
      prefix = '$^'
    }
    if (is.na(postfix)) {
      postfix = '$^'
    }
    hits <- hits %>%
      filter(
        !str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) & !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))
      )
  }

  ### Checking and removing any na rows, and reporting them in the console
  nas <- hits %>% filter(is.na(sentence_id))
  hits <- hits %>% filter(!is.na(sentence_id))
  if (nrow(nas) > 0) {
    print(str_c('The following articles have not been searched correctly for actorId ',ids))
    print(str_c('id_na: ',nas$`_id`, collapse = '\n '))
  }
  if (nrow(hits) == 0) {
    print("Nothing to update for this batch")
    return(NULL)
  } else {
    hits <- hits %>%
      group_by(`_id`) %>%
      summarise(
        sentence_id = list(as.integer(sentence_id)),
        sentence_start = list(sentence_start),
        sentence_end = list(sentence_end),
        actor_start = I(list(unlist(actor_start))), # List of actor ud token start positions
        actor_end = I(list(unlist(actor_end))), # List of actor ud token end positions
        occ = length(unique(unlist(sentence_id))), # Number of sentences in which actor occurs
        first = min(unlist(sentence_id)), # First sentence in which actor is mentioned
        ids = I(list(ids)), # List of actor ids
        sentence_count = first(sentence_count)
      ) %>%
      mutate(
        prom = occ/sentence_count, # Relative prominence of actor in article (number of occurrences/total # sentences)
        rel_first = 1-(first/sentence_count) # Relative position of first occurrence at sentence level
      ) %>%
      select(`_id`:occ, prom,rel_first,first,ids)
    bulk <- apply(hits, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
    bulk <- c(bulk,apply(hits[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
    return(elastic_update(bulk, es_super = es_super, localhost = localhost))
  }

}
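
The offset arithmetic shared by both offsetter() variants may be easier to see with numbers (an illustration; the tag strings are hypothetical and stringr is assumed to be loaded): every earlier highlight inflates later match positions by nchar(pre_tags) + nchar(post_tags), so the n-th match has to be shifted back by (n-1) tag widths.

# With pre_tags = '<em>' (4 chars) and post_tags = '</em>' (5 chars), the
# second marker sits 9 characters further into the tagged text than in the
# untagged text; subtracting (row(x)-1) tag widths undoes the inflation.
x <- str_locate_all('a <em>X</em> b <em>Y</em>', coll('<em>'))[[1]]
x - ((row(x) - 1) * (nchar('<em>') + nchar('</em>')))  # starts become 3 and 7
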
@@ -1,19 +1,47 @@
 #' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
 #'
 #' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
+#' Type can be either one of three values:
+#' set: set the value of [varname] to x
+#' add: add x to the values of [varname]
+#' varname: When using ud, the ud field will be updated instead of a computerCodes field
 #' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
 #' @param index The name of the Elasticsearch index to update
-#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created)
+#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)
+#' @param type Type of updating to be done, can be either 'set', 'add', or 'addnested'
+#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
 #' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON
 #' @export
 #' @examples
-#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
+#' bulk_writer(x, index = 'maml')
 #################################################################################################
 #################################### Bulk update writer ################################
 #################################################################################################
-bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
-  return(
-    paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
-{ "script" : { "source": "ctx._source.',varname,' = params.code", "lang" : "painless","params" : {"code":',toJSON(x[-1], collapse = F),'}}}')
-  )
+bulk_writer <- function(x, index = 'maml', varname, type, ver) {
+  ### Create a json object if more than one variable besides _id, otherwise use value as-is
+  if (length(x) > 2) {
+    json <- toJSON(list(x[-1]), collapse = T)
+  } else {
+    names(x) <- NULL
+    json <- toJSON(x[-1], collapse = T)
+  }
+  if (varname == "ud") {
+    return(
+      paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
+{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; ctx._source.ud = params.code; ctx._source.remove(\\"tokens\\")", "lang" : "painless", "params": { "code": ',json,'}}}')
+    )
+  }
+  if (type == 'set') {
+    return(
+      paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
+{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,', "object": {"',varname,'": ',json,'} }}}')
+    )
+  }
+  if (type == "add") {
+    return(
+      paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
+{"script": {"source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null && ctx._source.computerCodes.containsKey(\\"',varname,'\\")) {ctx._source.computerCodes.',varname,'.addAll(params.code)} else if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,' , "object": {"',varname,'": ',json,'}}}}'
+      )
+    )
+  }
 }
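
A quick usage sketch for the new interface (made-up values, mirroring the apply() calls in dupe_detect further down): each call returns one action line plus one painless-script line of line-delimited JSON.

# x mimics a row passed in by apply(): element 1 is the document _id, the rest
# is the payload; the result is a single NDJSON update command for the 'maml' index.
x <- c('abc123', 1)
cat(bulk_writer(x, index = 'maml', varname = 'duplicates', type = 'set', ver = 'dupesV1'))
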
@@ -0,0 +1,62 @@
#' Generate CV folds for nested cross-validation
#'
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
#'
#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data
#' @param inner_k Number of inner CV (parameter optimization) folds
#' @param vec Vector containing the true values of the classification
#' @param grid Parameter grid for optimization
#' @param seed Integer used as seed for random number generation
#' @return A nested set of lists with row numbers
#' @export
#' @examples
#' cv_generator(outer_k, inner_k, dfm, class_type)
#################################################################################################
#################################### Generate CV folds ##########################################
#################################################################################################
cv_generator <- function(outer_k, inner_k, vec, grid, seed) {
  ### Generate inner folds for nested cv
  inner_loop <- function(i, folds, vec, inner_k, grid, seed) {
    # RNG needs to be set explicitly for each fold
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    inner_folds <- createFolds(as.factor(vec[-folds[[i]]]), k= inner_k)
    grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i])
    return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i]))
  }

  ### Generate outer folds for nested cv
  generate_folds <- function(outer_k, inner_k, vec, grid, seed){
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization
      inner_folds <- createFolds(as.factor(vec), k= inner_k)
      grid <- crossing(grid, inner_fold = names(inner_folds))
      return(list(grid = grid,
                  inner_folds = inner_folds))
    } else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k
      folds <- createDataPartition(as.factor(vec), p=outer_k)
    } else { # Do full nested CV
      folds <- createFolds(as.factor(vec), k= outer_k)
    }
    # Generate grid of hyperparameters for model optimization, and include inner folds row numbers
    grid_folds <- lapply(1:length(folds),
                         inner_loop,
                         folds = folds,
                         vec = vec,
                         inner_k = inner_k,
                         grid = grid,
                         seed = seed)

    # Extract grid dataframe from results
    grid <- grid_folds %>% purrr::map(1) %>% dplyr::bind_rows()

    # Extract row numbers for inner folds from results
    inner_folds <- grid_folds %>% purrr::map(2)

    # Extract the names of the inner folds from results
    names(inner_folds) <- grid_folds %>% purrr::map(3) %>% unlist(.)
    return(list(grid = grid,
                outer_folds = folds,
                inner_folds = inner_folds))
  }
  return(generate_folds(outer_k,inner_k = inner_k, vec = vec, grid = grid, seed = seed))
}
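
Usage sketch (the labels and grid values are invented; caret and tidyr are assumed to be loaded, as in the function itself): with 5 outer and 3 inner folds, the returned grid has one row per outer fold x inner fold x parameter combination.

# vec holds the true class labels; grid is the hyperparameter search space.
vec <- sample(c('junk', 'not_junk'), 200, replace = TRUE)
grid <- expand.grid(k = c(1, 5), prop = c(0.5, 0.8))
folds <- cv_generator(outer_k = 5, inner_k = 3, vec = vec, grid = grid, seed = 42)
nrow(folds$grid)  # 5 outer folds * 3 inner folds * 4 parameter combinations = 60
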
@ -0,0 +1,67 @@
|
|||||||
|
#' Get ids of duplicate documents whose cosine similarity score lies between [cutoff_lower] and [cutoff_upper]
#'
#' Get ids of duplicate documents whose cosine similarity score lies between [cutoff_lower] and [cutoff_upper]
#' @param row Row of grid to parse
#' @param grid A cross-table of all possible combinations of doctypes and dates
#' @param cutoff_lower Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive)
#' @param cutoff_upper Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive)
#' @param es_pwd Password for Elasticsearch read access
#' @param es_super Password for Elasticsearch write access
#' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so the document will be a little shorter than [words])
#' @param localhost Defaults to TRUE. When TRUE, connect to a local Elasticsearch instance on the default port (9200)
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @return dupe_objects.json and a data frame containing each id and all its duplicates. remove_ids.txt and a character vector with the list of ids to be removed. Files are written to the current working directory
#' @export
#' @examples
#' dupe_detect(1, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver = ver)

#################################################################################################
#################################### Duplicate detector ################################
#################################################################################################
dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) {
  params <- grid[row,]
  print(paste0('Parsing ',params$doctypes,' on ',params$dates))
  query <- paste0('doctype:\\"',params$doctypes,'\\" && publication_date:',params$dates,' && !computerCodes._delete:1')
  out <- elasticizer(query_string(query, fields = c('country','text','title','subtitle','teaser','preteaser')), es_pwd = es_pwd, localhost = localhost)
  if (class(out$hits$hits) != 'list') {
    dfm <- dfm_gen(out, text = "full", words = words, clean = T)
    if (sum(dfm[1,]) > 0) {
      simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
      diag(simil) <- NA
      duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)
      duplicates <- cbind(duplicates, rowid = rownames(duplicates))
      if (length(duplicates) > 0) {
        rownames(duplicates) <- seq_along(rownames(duplicates))
        df <- as.data.frame(duplicates, make.names = NA, stringsAsFactors = F) %>%
          # bind_cols(colid = colnames(simil)[.['col']]) %>%
          mutate(colid = colnames(simil)[as.numeric(col)]) %>%
          .[,c(3,4)] %>%
          group_by(colid) %>% summarise(rowid = list(rowid))
        text <- capture.output(stream_out(df))
        # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
        simil[upper.tri(simil)] <- NA
        # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
        #       file = paste0(getwd(),'/remove_ids.txt'),
        #       append=T)
        dupe_delete <- data.frame(id = unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
                                  dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
        bulk <- c(apply(df, 1, bulk_writer, varname = 'duplicates', type = 'set', ver = ver),
                  apply(dupe_delete, 1, bulk_writer, varname = '_delete', type = 'set', ver = ver))
        res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
        return(paste0('Checked ',params$doctypes,' on ',params$dates))
      } else {
        return(paste0('No duplicates for ',params$doctypes,' on ',params$dates))
      }
    } else {
      return(paste0('No results for ',params$doctypes,' on ',params$dates))
    }
  } else {
    return(paste0('No results for ',params$doctypes,' on ',params$dates))
  }
  ### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
  # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
  # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
  # simil_nodupes <- as.matrix(textstat_simil(dfm_nodupes, margin="documents", method="cosine"))
  # diag(simil_nodupes) <- NA
  # which(simil_nodupes >= cutoff, arr.ind = TRUE)
}
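
## Usage sketch for the rewritten dupe_detect(); the doctypes, dates and
## passwords below are hypothetical placeholders:
grid <- tidyr::crossing(doctypes = c('doctype_a', 'doctype_b'),
                        dates = seq(as.Date('2019-01-01'), as.Date('2019-01-31'), by = 'day'))
res <- lapply(seq_len(nrow(grid)), dupe_detect,
              grid = grid, cutoff_lower = .9, cutoff_upper = 1,
              es_pwd = 'read_pwd', es_super = 'write_pwd',
              words = 999, localhost = T, ver = 'dupes_v1')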
@@ -1,54 +0,0 @@
#' Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
#'
#' Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
#' @param row Row of grid to parse
#' @param grid A cross-table of all possible combinations of doctypes and dates
#' @param cutoff Cutoff value for cosine similarity above which documents are considered duplicates
#' @param es_pwd Password for Elasticsearch read access
#' @return dupe_objects.json (containing each id and all its duplicates) and remove_ids.txt (list of ids to be removed) in current working directory
#' @export
#' @examples
#' dupe_detect(1,grid,es_pwd)

#################################################################################################
#################################### Duplicate detector ################################
#################################################################################################

dupe_detect <- function(row, grid, cutoff, es_pwd) {
  params <- grid[row,]
  print(paste0('Parsing ',params$doctypes,' on ',params$dates))
  query <- paste0('{"query":
    {"bool": {"filter":[{"term":{"doctype": "',params$doctypes,'"}},
      {"range" : {
        "publication_date" : {
          "gte" : "',params$dates,'T00:00:00Z",
          "lt" : "',params$dates+1,'T00:00:00Z"
        }
      }}]
  } } }')

  out <- elasticizer(query, es_pwd = es_pwd)
  dfm <- dfm_gen(out, text = "full")
  simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
  diag(simil) <- NA
  simil_og <- simil
  df <- as.data.frame(which(simil >= cutoff, arr.ind = TRUE)) %>%
    rownames_to_column("rowid") %>%
    mutate(colid = colnames(simil)[col]) %>%
    .[,c(1,4)] %>%
    group_by(colid) %>% summarise(rowid = list(rowid))
  text <- capture.output(stream_out(df))
  write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
  simil[upper.tri(simil)] <- NA
  write(unique(rownames(which(simil >= cutoff, arr.ind = TRUE))),
        file = paste0(getwd(),'/remove_ids.txt'),
        append=T)
  ### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
  # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
  # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
  # simil_nodupes <- as.matrix(textstat_simil(dfm_nodupes, margin="documents", method="cosine"))
  # diag(simil_nodupes) <- NA
  # which(simil_nodupes >= cutoff, arr.ind = TRUE)
}
@@ -0,0 +1,100 @@
#' Generate models and get classifications on test sets
#'
#' Estimates a model for one row of the grid of outer folds, inner folds and parameter combinations, and classifies the corresponding test set
#'
#' @param row Row number of current item in grid
#' @param grid Grid with model parameters and CV folds
#' @param outer_folds List with row numbers for outer folds
#' @param inner_folds Nested list with row numbers for inner folds, per outer fold
#' @param dfm DFM containing labeled documents
#' @param class_type Name of column in docvars() containing the classes
#' @param model Model to use (currently "nb" or "svm")
#' @param we_vectors Matrix with word embedding vectors
#' @return Depending on mode: when folds are included, returns the true and predicted classes of the test set, together with the parameters, model and model idf; when there are no folds, returns the final model and its idf values
#' @export
#' @examples
#' estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors)
#################################################################################################
#################################### Generate models ############################################
#################################################################################################

### Classification function
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) {
  # Get parameters for current iteration
  params <- grid[row,]

  # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
  if ("inner_fold" %in% colnames(params) && "outer_fold" %in% colnames(params)) {
    dfm_train <- dfm[-outer_folds[[params$outer_fold]],] %>%
      .[-inner_folds[[params$outer_fold]][[params$inner_fold]],]
    dfm_test <- dfm[-outer_folds[[params$outer_fold]],] %>%
      .[inner_folds[[params$outer_fold]][[params$inner_fold]],]
  # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
  } else if ("outer_fold" %in% colnames(params)) {
    dfm_train <- dfm[-outer_folds[[params$outer_fold]],]
    dfm_test <- dfm[outer_folds[[params$outer_fold]],]
  # If only inner folds, validate performance directly on inner folds
  } else if ("inner_fold" %in% colnames(params)) {
    dfm_train <- dfm[-inner_folds[[params$inner_fold]],]
    dfm_test <- dfm[inner_folds[[params$inner_fold]],]
  # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
  } else {
    dfm_test <- NULL
    dfm_train <- dfm
  }

  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
  if (params$tfidf) {
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_weight(dfm_test, weights = idf)
    }
  } else {
    idf <- NULL
  }

  if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
    # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
    words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
                                  feat_select,
                                  dfm = dfm_train,
                                  class_type = params$class_type,
                                  percentile = params$feat_percentiles,
                                  measure = params$feat_measures
    )))
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F)
    }
  }

  if (model == "nb") {
    text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
  }
  if (model == "svm") {
    text_model <- svm(x=as.matrix(dfm_train), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
  }
  # if (model == 'nnet') {
  #   idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
  #   text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
  # }
  ### Add more if statements for different models

  # If training on whole dataset, return final model, and idf values from dataset
  if (is.null(dfm_test)) {
    return(list(text_model=text_model, idf=idf))
  } else { # Create a test set, and classify test items
    # Use force=T to keep only features present in both training and test set
    pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T)

    return(data.frame(
      tv = I(list(docvars(dfm_test, class_type))), # True values from test set
      pred = I(list(pred)), # Predictions of test set
      params, # Parameters used to generate classification model
      text_model = I(list(text_model)), # The classification model
      idf = I(list(idf)), # IDF of the training dataset used for model creation
      stringsAsFactors = F
    ))
  }
}
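
## Illustrative call pattern: one estimator() call per grid row, with folds as
## produced by the cv fold generation step (grid, outer_folds, inner_folds and
## dfm are assumed to exist). The calls can also be parallelized, e.g. with
## mclapply().
preds <- dplyr::bind_rows(
  lapply(seq_len(nrow(grid)), estimator,
         grid = grid, outer_folds = outer_folds, inner_folds = inner_folds,
         dfm = dfm, class_type = 'junk', model = 'nb', we_vectors = NULL)
)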
@@ -0,0 +1,28 @@
#' Select features using quanteda textstat_keyness
#'
#' Select features based on the textstat_keyness function and a percentile cutoff
#' Percentiles are based on absolute values, i.e. both on words that are key and *not* key to the topic
#'
#' @param topic The topic to determine keywords for
#' @param dfm The input dfm
#' @param class_type Name of the column in docvars containing the classification
#' @param percentile Cutoff for the list of words that should be returned
#' @param measure Measure to use in determining keyness, default = chi2; see textstat_keyness for other options
#' @return A vector of words that are key to the topic
#' @export
#' @examples
#' feat_select(topic, dfm, class_type, percentile, measure="chi2")
#################################################################################################
#################################### Feature selection ##########################################
#################################################################################################

feat_select <- function (topic, dfm, class_type, percentile, measure="chi2") {
  # Use quanteda textstat_keyness to determine feature importance
  keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
    na.omit()
  # Convert keyness values to absolute values, to take into account both positive and negative extremes
  keyness[,2] <- abs(keyness[,2])
  # Keep only the words whose absolute keyness value falls above the [percentile] quantile
  keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]), percentile))$feature
  return(keyness)
}
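
## Example: keep features whose absolute chi2 keyness is above the .90 quantile
## for at least one class (the dfm and the 'junk' docvar with numeric class
## codes are hypothetical):
topics <- unique(quanteda::docvars(dfm, 'junk'))
words <- unique(unlist(lapply(topics, feat_select, dfm = dfm,
                              class_type = 'junk', percentile = .90)))
dfm_selected <- quanteda::dfm_keep(dfm, words, valuetype = 'fixed')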
@@ -0,0 +1,42 @@
#' Generate text output files (without punctuation) for external applications, such as GloVe embeddings
#'
#' Generate text output files (without punctuation) for external applications, such as GloVe embeddings
#' @param out The elasticizer-generated data frame
#' @param file The file to write the output to (including path, when required). When documents = T, provide a path including the trailing /
#' @param documents Indicate whether the writer should output to a single file, or individual documents
#' @param lemma Indicate whether document output should be lemmas or the original document
#' @param cores Indicate the number of cores to use for parallel processing
#' @param localhost Unused, defaults to FALSE
#' @return No return value; output is written to the file(s) indicated by [file]
#' @export
#' @examples
#' lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)

#################################################################################################
#################################### Lemma text file generator #############################
#################################################################################################

lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1) {
  plan(multiprocess, workers = cores)
  par_writer <- function(row, out, lemma) {
    if (lemma == T) {
      cat(iconv(unlist(unnest(out[row,],`_source.ud`)$lemma), to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
    } else {
      cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
    }
  }

  if (documents == F) {
    out <- unnest(out,`_source.ud`)
    lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_')
    cat(lemma, file = file, append = T)
  }
  if (documents == T) {
    if (lemma == F) {
      out <- out_parser(out, field = '_source', clean = F)
    }
    future_lapply(1:nrow(out), par_writer, out = out, lemma = lemma)
  }
}
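
## Two illustrative invocations (paths and out are placeholders): a single
## corpus-level file of lemma_upos tokens, and one lemma file per document.
lemma_writer(out, file = '~/corpus_lemmas.txt')
lemma_writer(out, file = '~/lemmas_by_doc/', documents = T, lemma = T, cores = 4)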
@@ -1,28 +1,60 @@
 #' Merges list of lemmas back into a pseudo-document
 #'
 #' Merges list of lemmas back into a pseudo-document
-#' @param row A row number from the Elasticizer-generated data frame
-#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param out The elasticizer-generated data frame
+#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching a regex (see code)
 #' @return A documentified string of lemmas, one document at a time
 #' @export
 #' @examples
-#' merger(1, words = '999', out = out)
+#' merger(out, text, clean)
 #################################################################################################
 #################################### Reconstructing documents from lemmas########################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, words = '999', out = out) {
-  df <- out[row,]
-  # Merging lemmas into single string
-  lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
-  # Replacing $-marked punctuation with their regular forms
-  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
-    ### Removing numbers and non-words containing numbers
-    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
-    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
-    paste0(.,". ")
-  if (words != "999") {
-    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))}
-  return(lemmas)
+merger <- function(out, text, clean) {
+  df <- unnest(out, cols = '_source.ud') %>%
+    select(`_id`,lemma,upos) %>%
+    unnest(cols = c('lemma','upos')) %>%
+    # This line is added in the new merger function; in the old merger function its absence would result in the following:
+    # 1: when using ud, the string "NA" being present in place of the faulty lemma
+    # 2: when using ud_upos, the entire article becoming NA, because str_c() returns NA when any value is NA
+    filter(!is.na(lemma)) %>%
+    group_by(`_id`)
+  if (text == 'ud_upos') {
+    df <- df %>%
+      filter(upos != 'PUNCT') %>%
+      mutate(
+        lem_u = str_c(lemma,upos,sep="_")
+      ) %>%
+      summarise(
+        merged = str_c(c(lem_u), collapse= ' ')
+      ) %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is not taken into account, as it is already filtered out, see above
+      {if(clean == T) mutate(.,
+        merged = str_replace_all(merged,"\\S*?[0-9@#$%]+[^\\s]*", "")
+      )
+      else . }
+  }
+  if (text == 'ud') {
+    df <- df %>%
+      summarise(
+        merged = str_c(c(lemma), collapse= ' ')
+      ) %>%
+      mutate(
+        merged = str_replace_all(merged," \\$(.+?)", "\\1")
+      ) %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
+      # Regex also used in out_parser
+      # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
+      {if(clean == T) mutate(.,
+        merged = str_replace_all(merged,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")
+      )
+      else . } %>%
+      mutate(.,
+        merged = paste0(merged,'. '))
+  }
+  return(df)
 }
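
## The new merger() operates on the elasticizer output as a whole; a sketch
## (out is assumed to contain the nested _source.ud annotations):
docs <- merger(out, text = 'ud_upos', clean = T)
docs$merged[1]  # one merged pseudo-document per _id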
@@ -0,0 +1,47 @@
#' Generate performance statistics for models
#'
#' Generate performance statistics for models, based on their predictions and the true values
#'
#' @param x A data frame containing at least the columns "pred" and "tv"
#' @return x, with additional columns for performance metrics
#' @export
#' @examples
#' metric_gen(x)
#################################################################################################
############################# Performance metric generation #####################################
#################################################################################################

metric_gen <- function(x) {
  ### Fix for missing classes in multiclass classification
  ### Sorting u for easier interpretation of the confusion matrix
  u <- as.character(sort(as.numeric(union(unlist(x$pred), unlist(x$tv)))))
  # Create a crosstable with predictions and true values
  class_table <- table(prediction = factor(unlist(x$pred), u), trueValues = factor(unlist(x$tv), u))

  # When there are only two classes, set the positive class explicitly as the class with the highest value
  if (length(unique(u)) == 2) {
    conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
    weighted_measures <- as.data.frame(conf_mat$byClass)
    macro_measures <- as.data.frame(conf_mat$byClass)
  } else {
    # Create a confusion matrix
    conf_mat <- confusionMatrix(class_table, mode = "everything")
    # Set "positive" value to NA, because it is not applicable
    conf_mat$positive <- NA
    # Compute weighted performance measures
    weighted_measures <- colSums(conf_mat$byClass * colSums(conf_mat$table))/sum(colSums(conf_mat$table))
    # Compute unweighted performance measures (divide by number of classes, each class equally important)
    macro_measures <- colSums(conf_mat$byClass)/nrow(conf_mat$byClass)
    # Replace NaNs by 0 when they occur
    weighted_measures[is.nan(weighted_measures)] <- 0
    macro_measures[is.nan(macro_measures)] <- 0
  }
  return(cbind(x,
               as.data.frame(t(conf_mat$overall)),
               'weighted' = t(as.data.frame(weighted_measures)),
               'macro' = t(as.data.frame(macro_measures)),
               pos_cat = conf_mat$positive,
               conf_mat = I(list(conf_mat))
  )
  )
}
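
## Sketch: score the estimator results from a CV run (x needs the list-columns
## "pred" and "tv", one row per estimated model):
scores <- metric_gen(preds)
scores$Accuracy  # plus weighted and macro columns per performance measure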
@@ -0,0 +1,301 @@
#' Generate a classification model
#'
#' Generate a nested cross-validated classification model based on a dfm with class labels as docvars
#' Currently only supports Naïve Bayes using quanteda's textmodel_nb
#' Hyperparameter optimization is enabled through the grid parameter
#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command
#' For Naïve Bayes, the following parameters can be used:
#' - percentiles (cutoff point for tf-idf feature selection)
#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds)
#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)
#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)
#' @param cores_feats Number of cores to use for feature selection (multiplies with cores_outer, cores_grid and cores_inner)
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param outer_k Number of outer cross-validation folds (for performance estimation)
#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
#' @export
#' @examples
#' modelizer_old(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
#################################################################################################
#################################### Function to generate classification models #################
#################################################################################################

modelizer_old <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
  ### Functions ###
  feat_select <- function (topic, dfm, class_type, percentile, measure) {
    keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
      na.omit()
    keyness[,2] <- abs(keyness[,2])
    keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
    return(keyness)
  }
  ### Generate inner folds for nested cv
  inner_loop <- function(fold, dfm, inner_k, class_type) {
    # RNG needs to be set explicitly for each fold
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    ## Either createDataPartition for simple holdout parameter optimization
    ## Or createFolds for proper inner CV for nested CV
    # if (inner_k <= 1) {
    #   inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
    # } else {
    inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k = inner_k)
    # }
    return(c(outer_fold = list(fold), inner_folds))
  }

  ### Generate outer folds for nested cv
  generate_folds <- function(outer_k, inner_k, dfm, class_type){
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    folds <- createFolds(as.factor(docvars(dfm, class_type)), k = outer_k)
    return(lapply(folds, inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type))
  }

  ### Gets called for every parameter combination, and calls classifier for every inner cv fold
  inner_cv <- function(row, grid, outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
    print(str_c('params ',row))
    params <- grid[row,]
    # For each inner fold, cross validate the specified parameters
    res <-
      bind_rows(mclapply(inner_folds,
                         classifier,
                         outer_fold = outer_fold,
                         params = params,
                         dfm = dfm,
                         class_type = class_type,
                         model = model,
                         cores_feats = cores_feats,
                         mc.cores = cores_inner
      )
      )
    # print(res)
    # print(res[1,1])
    # print('inner_cv')
    return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))),params))
  }

  ### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
  outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
    print('outer cv')
    # If fold contains both inner folds and outer fold
    if (length(fold) == inner_k + 1) {
      inner_folds <- fold[-1]
      outer_fold <- fold$outer_fold
      # For each row in grid, cross-validate results
      res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
                                inner_cv,
                                cores_feats = cores_feats,
                                grid = grid,
                                dfm = dfm,
                                class_type = class_type,
                                model = model,
                                outer_fold = outer_fold,
                                inner_folds = inner_folds,
                                cores_inner = cores_inner,
                                mc.cores = cores_grid)
      )
      # print(res)
      # print(res[1,1])
      # print('outer_cv')
      # Determine optimum hyperparameters within outer fold training set
      optimum <- res[which.max(res[,opt_measure]),] %>%
        select(percentiles:ncol(.))
      # Validate performance of optimum hyperparameters on outer fold test set
      return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats))
    } else {
      # If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters
      inner_folds <- fold
      outer_fold <- NULL
      res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
                                inner_cv,
                                cores_feats = cores_feats,
                                grid = grid,
                                dfm = dfm,
                                class_type = class_type,
                                model = model,
                                outer_fold = outer_fold,
                                inner_folds = inner_folds,
                                cores_inner = cores_inner,
                                mc.cores = cores_grid)
      )
      # print(res)
      # print(res[1,1])
      # print('line 126, final model parameter optimization')
      return(res)
    }
  }

  # ### Custom tfidf function to allow same idf for different dfm's
  # custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
  #   if (!nfeat(x) || !ndoc(x)) return(x)
  #   tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
  #   if (nfeat(x) != length(dfreq))
  #     stop("missing some values in idf calculation")
  #   # get the document indexes
  #   j <- as(tfreq, "dgTMatrix")@j + 1
  #   # replace just the non-zero values by product with idf
  #   x@x <- tfreq@x * dfreq[j]
  #   # record attributes
  #
  #   ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2
  #
  #   # x@weightTf <- tfreq@weightTf
  #   # x@weightDf <- c(list(scheme = scheme_df, base = base), args)
  #   return(x)
  # }

  ### Classification function
  classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
    # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
    if (length(inner_fold) > 0 && length(outer_fold) > 0) {
      dfm_train <- dfm[-outer_fold,] %>%
        .[-inner_fold,]
      dfm_test <- dfm[-outer_fold,] %>%
        .[inner_fold,]
    # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
    } else if (length(outer_fold) > 0 ) {
      dfm_train <- dfm[-outer_fold,]
      dfm_test <- dfm[outer_fold,]
      validation_cv <- T
    # If only inner folds, validate performance directly on inner folds (is the same as above?)
    } else if (length(inner_fold) > 0 ) {
      dfm_train <- dfm[-inner_fold,]
      dfm_test <- dfm[inner_fold,]
    # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
    } else {
      final <- T ### Indicate final modeling run on whole dataset
      dfm_train <- dfm
    }
    if (model == 'nb') {
      scheme_tf <- 'count' # The 'old' way
    } else {
      scheme_tf <- 'prop' # The 'new' way
    }
    ### Getting features from training dataset
    # Getting idf from training data, and using it to normalize both training and testing feature occurrence
    dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    # Added unique to filter out duplicate words; these are caused when there are multiple categories, and a word scores higher
    # than the threshold on two or more of those categories
    words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
                                    feat_select,
                                    dfm = dfm_train,
                                    class_type = class_type,
                                    percentile = params$percentiles,
                                    measure = params$measures,
                                    mc.cores = cores_feats
    )))
    # dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)

    if (model == "nb") {
      text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
    }
    if (model == "svm") {
      text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
    }
    if (model == 'nnet') {
      idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
      text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
    }
    # if (model == 'neuralnet') {
    #   dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
    #   dfm_test <- custom_tfidf(dfm_test, scheme_tf = "prop", scheme_df = "inverse", base = 10, dfreq = dfreq[words])
    #
    #   idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
    #   colnames(idC) <- NULL
    #   nn_train <- cbind(idC, dfm_train)
    #   n <- colnames(nn_train)[3:length(colnames(nn_train))]
    #   f <- as.formula(paste0("feat1 + feat2 ~ `", paste0(n, collapse = "` + `"),"`"))
    #   idC_out <- class.ind(as.factor(docvars(dfm_test, class_type)))
    #   colnames(idC_out) <- NULL
    #   nn_test <- cbind(idC_out, dfm_test)
    #   nn <- neuralnet(f,data=nn_train,hidden=3,linear.output=F,act.fct = 'logistic',lifesign = 'full', threshold = .005)
    #   pr.nn <- compute(nn,nn_test[,3:length(colnames(nn_test))])
    #   class_table <- table(prediction = as.matrix(round(pr.nn$net.result[,2])), trueValues = as.matrix(idC_out[,2]))
    #   nn_conf_mat <- confusionMatrix(class_table, mode = "everything")
    # }
    ### Add more if statements for different models
    if (exists("final")) {
      return(list(text_model=text_model, idf=idf))
    } else {
      ### Removing all features not in training set from test set and weighting the remaining features according to training idf
      dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
      dfm_test <- dfm_weight(dfm_test, weights = idf)
      pred <- predict(text_model, newdata = dfm_test, type = 'class')
      ### Fix for single-class 'predictions' in borderline situations
      # if (length(unique(pred)) == 1 & class_type == 'junk') {
      #   if (unique(pred) == '0') {
      #     pred[1] <- '1'
      #   } else {
      #     pred[1] <- '0'
      #   }
      # }

      ### Fix for missing classes in multiclass classification
      u <- union(pred, docvars(dfm_test, class_type))

      class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u))
      if (length(unique(u)) == 2) {
        conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
      } else {
        conf_mat <- confusionMatrix(class_table, mode = "everything")
        conf_mat$positive <- NA
      }
      if (exists("validation_cv")) {
        return(data.frame(
          tv = docvars(dfm_test, class_type),
          pred = pred,
          params = params,
          pos_cat = conf_mat$positive,
          stringsAsFactors = F
        ))
      }
      if (is.matrix(conf_mat$byClass) == T) {
        return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params))
      } else {
        return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(conf_mat$byClass)),params, pos_cat = conf_mat$positive))
      }
    }
  }

  ### If outer_k is smaller than 1, do a holdout training run, with only cross-validation for parameter optimization; else, do nested CV
  ### If holdout, training/test distribution is the same as for inner CV
  if (outer_k < 1) {
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    outer_fold <- createDataPartition(as.factor(docvars(dfm, class_type)), p = outer_k)
    folds <- lapply(outer_fold, inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type)
  } else {
    ## Generate nested CV folds, based on number of inner and outer folds defined (see start of script)
    folds <- generate_folds(outer_k, inner_k = inner_k, dfm = dfm, class_type = class_type)
  }

  ## Get performance of each outer fold validation, and add row with mean scores (this is the final performance indicator)
  performance <- mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer)

  ## Set seed and generate folds for final hyperparameter optimization search (using CV)
  set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
  folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k = inner_k))

  ## Get the final hyperparameter performance for all value combinations in grid
  params_final <- bind_rows(mclapply(folds_final, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer))

  ## Select optimum final hyperparameters
  optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>%
    select(percentiles:ncol(.))

  ## Estimate final model on whole dataset, using optimum final hyperparameters determined above
  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer)))
  rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment())
  save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment())
  return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'))
}
@@ -0,0 +1,89 @@
#' Parse raw text into a single field
#'
#' Parse raw text from the MaML database into a single field
#' @param out The original output data frame
#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching a regex (see code)
#' @return A parsed output data frame including the additional column 'merged', containing the merged text
#' @export
#' @examples
#' out_parser(out, field)

#################################################################################################
#################################### Parser function for output fields ##########################
#################################################################################################
out_parser <- function(out, field, clean = F) {
  fncols <- function(data, cname) {
    add <- cname[!cname %in% names(data)]

    if (length(add) != 0) data[, (add) := (NA)]
    data
  }

  out <- fncols(data.table(out), c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
  par_parser <- function(row, out, field, clean) {
    doc <- out[row,]
    if (field == 'highlight') {

      doc <- doc %>%
        unnest(cols = starts_with("highlight")) %>%
        mutate(across(starts_with("highlight"), na_if, "NULL")) %>%
        mutate(highlight.title = coalesce(highlight.title, `_source.title`),
               highlight.subtitle = coalesce(highlight.subtitle, `_source.subtitle`),
               highlight.preteaser = coalesce(highlight.preteaser, `_source.preteaser`),
               highlight.teaser = coalesce(highlight.teaser, `_source.teaser`),
               highlight.text = coalesce(highlight.text, `_source.text`)
        ) %>%
        mutate(highlight.title = str_replace_na(highlight.title, replacement = ''),
               highlight.subtitle = str_replace_na(highlight.subtitle, replacement = ''),
               highlight.preteaser = str_replace_na(highlight.preteaser, replacement = ''),
               highlight.teaser = str_replace_na(highlight.teaser, replacement = ''),
               highlight.text = str_replace_na(highlight.text, replacement = '')
        ) %>%
        mutate(
          merged = str_c(highlight.title,
                         highlight.subtitle,
                         highlight.preteaser,
                         highlight.teaser,
                         highlight.text,
                         '',
                         sep = ". ")
        )
    }

    if (field == '_source') {
      doc <- doc %>%
        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = ''),
               `_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = ''),
               `_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = ''),
               `_source.teaser` = str_replace_na(`_source.teaser`, replacement = ''),
               `_source.text` = str_replace_na(`_source.text`, replacement = '')
        ) %>%
        mutate(
          merged = str_c(`_source.title`,
                         `_source.subtitle`,
                         `_source.preteaser`,
                         `_source.teaser`,
                         `_source.text`,
                         '',
                         sep = ". ")
        )
    }

    ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurrences
    # Remove html tags, and multiple consecutive whitespaces
    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
    # Regex also used in merger function
    ### Old regex, used for duplicate detection:
    # \\S*?[0-9@#$%]+[^\\s!?.,;:]*
    doc$merged <- doc$merged %>%
      str_replace_all("<.{0,20}?>", " ") %>%
      str_replace_all('(\\. ){2,}', '. ') %>%
      str_replace_all('([!?.])\\.','\\1') %>%
      str_replace_all("\\s+"," ") %>%
      {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . }
    return(doc)
  }
  return(par_parser(1:nrow(out), out=out, clean=clean, field=field))
}
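
## Sketch: merge the raw source fields of an elasticizer batch into a single
## 'merged' column, stripping words that contain numbers or @#$% symbols:
parsed <- out_parser(out, field = '_source', clean = T)
substr(parsed$merged[1], 1, 80)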
@@ -0,0 +1,71 @@
#' Preprocess dfm data for use in modeling procedure
#'
#' Process dfm according to parameters provided in params
#'
#' @param dfm_train Training dfm
#' @param dfm_test Testing dfm if applicable, otherwise NULL
#' @param params Row from grid with parameter optimization
#' @param we_vectors Data frame with word embedding vectors (first column, V1, holds the words)
#' @return List with dfm_train and dfm_test, processed according to the parameters in params (plus the document-level word embedding sums, when we_vectors is given)
#' @export
#' @examples
#' preproc(dfm_train, dfm_test = NULL, params, we_vectors)
#################################################################################################
#################################### Preprocess data ############################################
#################################################################################################

### CURRENTLY UNUSED!!! ###

preproc <- function(dfm_train, dfm_test = NULL, params, we_vectors) {
  # Remove non-existing features from training dfm
  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
  if (params$tfidf) {
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_weight(dfm_test, weights = idf)
    }
  } else {
    idf <- NULL
  }

  if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
    # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
    words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
                                  feat_select,
                                  dfm = dfm_train,
                                  class_type = params$class_type,
                                  percentile = params$feat_percentiles,
                                  measure = params$feat_measures
    )))
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
  }

  if (!is.null(we_vectors)) {
    shared_dict <- sort(intersect(dfm_train@Dimnames$features, we_vectors$V1))
    dfm_test_we_sum <- NULL # Stays NULL when there is no test set
    if (!is.null(dfm_test)) {
      shared_dict <- sort(intersect(dfm_test@Dimnames$features, shared_dict))
      dfm_test <- dfm_keep(dfm_test, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
        .[, sort(colnames(.))]
    }
    dfm_train <- dfm_keep(dfm_train, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
      .[, sort(colnames(.))]
    we_matrix <- filter(we_vectors, V1 %in% shared_dict) %>%
      arrange(V1) %>%
      as.data.table(.) %>%
      .[,2:ncol(.), with = F] %>%
      as.matrix(.)

    dfm_train_we_sum <- dfm_train %*% we_matrix
    # dfm_train_we_mean <- dfm_train_we_sum / as.vector(rowSums(dfm_train))

    if (!is.null(dfm_test)) {
      dfm_test_we_sum <- dfm_test %*% we_matrix
      # dfm_test_we_mean <- dfm_test_we_sum / as.vector(rowSums(dfm_test))
    }
    return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf, dfm_train_we = dfm_train_we_sum, dfm_test_we = dfm_test_we_sum))
  }
  return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf))
}
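
## The dfm %*% we_matrix step above produces, per document, the count-weighted
## sum of its word embedding vectors; the commented-out division by rowSums()
## would turn that sum into a mean. A toy illustration of the same algebra:
counts <- matrix(c(2, 1, 0,   # doc1 term counts
                   0, 3, 1),  # doc2 term counts
                 nrow = 2, byrow = TRUE)
we <- matrix(rnorm(3 * 5), nrow = 3)  # 3 terms x 5 embedding dimensions
doc_we_sum <- counts %*% we           # 2 documents x 5 dimensions
doc_we_mean <- doc_we_sum / rowSums(counts)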
@@ -0,0 +1,241 @@
|
|||||||
|
#' Generate actor search queries based on data in actor db
|
||||||
|
#'
|
||||||
|
#' Generate actor search queries based on data in actor db
|
||||||
|
#' @param actor A row from the output of elasticizer() when run on the 'actor' index
|
||||||
|
#' @param pre_tags Highlighter pre-tag
|
||||||
|
#' @param post_tags Highlighter post-tag
|
||||||
|
#' @return A data frame containing the queries, related actor ids and actor function
|
||||||
|
#' @export
|
||||||
|
#' @examples
|
||||||
|
#' query_gen_actors(actor,country)
|
||||||
|
|
||||||
|
#################################################################################################
|
||||||
|
#################################### Actor search query generator ###############################
|
||||||
|
#################################################################################################
|
||||||
|
query_gen_actors <- function(actor, pre_tags, post_tags) {
|
||||||
|
generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
|
||||||
|
return(paste0('{"_source": ["ud","title","subtitle","preteaser","teaser","text"],
|
||||||
|
"query":
|
||||||
|
{"bool": {
|
||||||
|
"filter":[
|
||||||
|
{"term":{"country":"',country,'"}},
|
||||||
|
{"range":{"publication_date":{"gte":"',startdate,'","lte":"',enddate,'"}}},
|
||||||
|
{"query_string" : {
|
||||||
|
"default_operator" : "OR",
|
||||||
|
"allow_leading_wildcard" : "false",
|
||||||
|
"fields": ["text","teaser","preteaser","title","subtitle"],
|
||||||
|
"query" : "', querystring,'"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"must_not":[
|
||||||
|
{"term":{"computerCodes.actors.keyword":"',actorid,'"}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"highlight" : {
|
||||||
|
"fields" : {
|
||||||
|
"text" : {},
|
||||||
|
"teaser" : {},
|
||||||
|
"preteaser" : {},
|
||||||
|
"title" : {},
|
||||||
|
"subtitle" : {}
|
||||||
|
},
|
||||||
|
"number_of_fragments": 0,
|
||||||
|
"order": "none",
|
||||||
|
"type":"unified",
|
||||||
|
"fragment_size":0,
|
||||||
|
"pre_tags":"', pre_tags,'",
|
||||||
|
"post_tags": "',post_tags,'"
|
||||||
|
}
|
||||||
|
}'))
|
||||||
|
}
|
||||||
|
prox_gen <- function(row, grid) {
|
||||||
|
return(
|
||||||
|
paste0('\\"',grid[row,]$first,' ',grid[row,]$last,'\\"~',grid[row,]$prox)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
country <- actor$`_source.country`
|
||||||
|
### Setting linguistic forms for each country ###
|
||||||
|
if (country == "no" | country == "dk") {
|
||||||
|
genitive <- 's'
|
||||||
|
definitive <- 'en'
|
||||||
|
} else if (country == 'uk') {
|
||||||
|
genitive <- '\'s'
|
||||||
|
} else if (country == 'nl' | country == 'be') {
|
||||||
|
genitive <- 's'
|
||||||
|
}
|
||||||
|
|
||||||
|
  ### Generating queries for individuals (ministers, PM, party leaders and MPs)
  if (actor$`_source.function` == "JunMin" | actor$`_source.function` == "Min" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader" | actor$`_source.function` == "MP") {
    ## Adding a separate AND clause for inclusion of only the last name, to highlight all occurrences of the last name,
    ## regardless of whether the last name hit comes from a minister name or a full-name proximity hit

    ### If the country is Belgium, check if there is an apostrophe in the middle names; if so, search for the last name both capitalized and lowercased
    if (country == 'be' && T %in% str_detect(actor$`_source.middleNames`,"'")) {
      last_list <- c(actor$`_source.lastName`, str_c(actor$`_source.lastName`,genitive), tolower(actor$`_source.lastName`), str_c(tolower(actor$`_source.lastName`),genitive))
      grid <- crossing(first = actor$`_source.firstName`, last = last_list, prox = 5)
      fullname <- lapply(1:nrow(grid), prox_gen, grid = grid)
      query_string <- paste0('((',
                             paste0(unlist(fullname), collapse = ' OR '),') AND ',
                             paste0('(',paste0(unlist(last_list), collapse = ' OR '),')'))
    } else {
      last_list <- c(actor$`_source.lastName`, str_c(actor$`_source.lastName`,genitive))
      grid <- crossing(first = actor$`_source.firstName`, last = last_list, prox = 5)
      fullname <- lapply(1:nrow(grid), prox_gen, grid = grid)
      query_string <- paste0('((',
                             paste0(unlist(fullname), collapse = ' OR '),') AND ',
                             paste0('(',paste0(unlist(last_list), collapse = ' OR '),')'))
    }

    ### If the actor is a minister, generate a minister search
    if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM") {
      if (country == "no" || country == "dk") {
        minister <- str_split(actor$`_source.ministerSearch`, pattern = '-| ') %>%
          map(1)

        capital <- unlist(str_to_title(minister))
        capital_def <- unlist(str_c(capital, definitive))
        def <- unlist(str_c(minister,definitive))
        minister <- unlist(c(minister,capital,capital_def,def))
      }
      if (country == "uk") {
        minister <- c(str_to_title(actor$`_source.ministerName`),
                      actor$`_source.ministerName`)
        if (actor$`_source.function` == "PM") {
          minister <- c(minister,
                        "PM")
        }
      }
      if (country == "nl" | country == "be") { # If the country is nl or be, add a requirement for Minister to the query
        minister <- c("Minister",
                      "minister")
        if (actor$`_source.function` == "PM") {
          minister <- c(minister,
                        "Premier",
                        "premier")
        }
        if (actor$`_source.function` == "JunMin") {
          minister <- c("Staatssecretaris",
                        "staatssecretaris")
        }
      }
      grid <- crossing(first = last_list, last = minister, prox = 5)
      ministername <- lapply(1:nrow(grid), prox_gen, grid = grid)
      query_string <- paste0(query_string,') OR ((',
                             paste0(unlist(ministername), collapse = ' OR '),') AND ',
                             paste0('(',paste0(unlist(last_list), collapse = ' OR '),'))'))
    } else { ### Else, generate a search for first/last name only (MPs and party leaders, currently)
      query_string <- paste0(query_string,')')
    }
    ids <- list(c(actor$`_source.actorId`,str_c(actor$`_source.partyId`,'_a')))
    actorid <- actor$`_source.actorId`
    query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
    return(data.frame(query = query, ids = I(ids), prefix = NA, postfix = NA, stringsAsFactors = F))
  }

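  ## Illustrative sketch (not part of the original code; the actor is made up):
  ## for a hypothetical UK PM "Jane Doe", the blocks above compose query_string roughly as
  ##   ((\"Jane Doe\"~5 OR \"Jane Doe's\"~5) AND (Doe OR Doe's)) OR
  ##   ((\"Doe Prime Minister\"~5 OR \"Doe PM\"~5 OR ...) AND (Doe OR Doe's))
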
  ### Query generation for party searches
  if (actor$`_source.function` == "Party") {
    # actor$`_source.startDate` <- "2000-01-01"
    # actor$`_source.endDate` <- "2099-01-01"
    if (nchar(actor$`_source.partyNameSearchShort`[[1]]) > 0) {
      # If uk, no or dk, search for both regular abbreviations and genitive forms
      if (country == "uk" | country == "no" | country == "dk") {
        gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive))
        # For no and dk, only keep genitive forms when the party abbreviation is longer than 1 character (2 including the genitive s itself);
        # this filter has to run before the names string is built, so that the dropped forms do not end up in the query
        if (country == "dk" | country == "no") {
          gen <- gen[which(nchar(gen) > 2)]
        }
        names <- paste(unlist(c(gen,actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"')
      } else {
        names <- paste(unlist(actor$`_source.partyNameSearchShort`), collapse = '\\" \\"')
      }
      query_string <- paste0('(\\"',unlist(names),'\\")')
      ids <- str_c(actor$`_source.partyId`,'_s')
      actorid <- str_c(actor$`_source.partyId`,'_s')
      query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
      df1 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
    }
    if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
      if (country == "uk" | country == "no" | country == "dk") {
        gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive))
        names <- paste(unlist(c(gen,actor$`_source.partyNameSearch`)), collapse = '\\" \\"')
      } else {
        names <- paste(unlist(actor$`_source.partyNameSearch`), collapse = '\\" \\"')
      }
      query_string <- paste0('(\\"',unlist(names),'\\")')
      ids <- str_c(actor$`_source.partyId`,'_f')
      actorid <- str_c(actor$`_source.partyId`,'_f')
      query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
      if (country == 'uk' | country == 'nl' | country == 'be') {
        df2 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
      } else {
        df2 <- data.frame(query = query, ids = I(ids), prefix = NA, postfix = NA, stringsAsFactors = F)
      }
    }
    if (exists('df1') & exists('df2')) {
      return(bind_rows(df1,df2))
    } else if (exists('df1')) {
      return(df1)
    } else if (exists('df2')) {
      return(df2)
    }
  }

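  ## Illustrative sketch (not part of the original code; the ids are made up): for a
  ## hypothetical Danish party P_123 abbreviated 'SD', the short-name branch yields
  ##   query_string = '(\"SDs\" \"SD\")'  and a df1 row with ids = 'P_123_s',
  ## with the notPrecededBy/notFollowedBy regexes passed along as prefix/postfix.
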
  ### Institution function currently not used
  # if (actor$`_source.function` == "Institution") {
  #   # uppercasing
  #   firstup <- function(x) {
  #     substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  #     x
  #   }
  #   actor$`_source.startDate` <- "2000-01-01"
  #   actor$`_source.endDate` <- "2099-01-01"
  #   if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
  #     upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
  #     upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
  #                unlist(lapply(upper, str_c, definitive)),
  #                unlist(lapply(upper, str_c, definitive_genitive)))
  #     capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
  #     capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
  #                  unlist(lapply(capital, str_c, definitive)),
  #                  unlist(lapply(capital, str_c, definitive_genitive)))
  #     base <- actor$`_source.institutionNameSearch`
  #     base <- c(base, unlist(lapply(base, str_c, genitive)),
  #               unlist(lapply(base, str_c, definitive)),
  #               unlist(lapply(base, str_c, definitive_genitive)))
  #     names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
  #     query_string <- paste0('(\\"',names,'\\")')
  #     ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
  #     actorid <- str_c(actor$`_source.institutionId`,'_f')
  #     query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
  #     df1 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
  #   }
  #   if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
  #     upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
  #     upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
  #                unlist(lapply(upper, str_c, definitive)),
  #                unlist(lapply(upper, str_c, definitive_genitive)))
  #     capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
  #     capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
  #                  unlist(lapply(capital, str_c, definitive)),
  #                  unlist(lapply(capital, str_c, definitive_genitive)))
  #     base <- actor$`_source.institutionNameSearchShort`
  #     base <- c(base, unlist(lapply(base, str_c, genitive)),
  #               unlist(lapply(base, str_c, definitive)),
  #               unlist(lapply(base, str_c, definitive_genitive)))
  #     names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
  #     query_string <- paste0('(\\"',names,'\\")')
  #     ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
  #     actorid <- str_c(actor$`_source.institutionId`,'_s')
  #     query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
  #     df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
  #   }
  #   if (exists('df1') & exists('df2')) {
  #     return(bind_rows(df1,df2))
  #   } else if (exists('df1')) {
  #     return(df1)
  #   } else if (exists('df2')) {
  #     return(df2)
  #   }
  # }
}
@ -0,0 +1,60 @@
#' Generate a query string query for ElasticSearch
#'
#' Generate a query string query for ElasticSearch
#' @param query Query string in ElasticSearch query string format
#' @param fields List of field names to return, defaults to all
#' @param random Return randomized results. Boolean, defaults to FALSE
#' @param default_operator Default Boolean operator ("AND" or "OR") between terms in the query string, defaults to "AND"
#' @return A formatted ElasticSearch query string query
#' @export
#' @examples
#' query_string(query)
#################################################################################################
#################################### Get data from ElasticSearch ################################
#################################################################################################

query_string <- function(query, fields = F, random = F, default_operator = "AND") {
  if (typeof(fields) == 'logical') {
    fields <- '*'
  }
  if (random == T) {
    return(paste0(
      '{
        "_source": ',toJSON(fields),',
        "query": {
          "function_score": {
            "query": {
              "bool":{
                "filter": [{
                  "query_string" : {
                    "query" : "',query,'",
                    "default_operator": "',default_operator,'",
                    "allow_leading_wildcard" : false
                  }
                }]
              }
            },
            "random_score": {},
            "boost_mode": "sum"
          }
        }
      }'
    ))
  } else {
    return(paste0(
      '{
        "_source": ',toJSON(fields),',
        "query": {
          "bool":{
            "filter": [{
              "query_string" : {
                "query" : "',query,'",
                "default_operator": "',default_operator,'",
                "allow_leading_wildcard" : false
              }
            }]
          }
        }
      }'
    ))
  }
}
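## Illustrative usage sketch (not part of the original code; assumes toJSON comes from jsonlite).
## Requesting two fields with the default operator produces a JSON body along these lines:
# query_string('title:parliament', fields = c('title','text'))
# '{"_source": ["title","text"], "query": {"bool": {"filter": [{"query_string": {
#    "query": "title:parliament", "default_operator": "AND", "allow_leading_wildcard": false}}]}}}'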
@ -0,0 +1,260 @@
#' Aggregate sentence-level dataset containing sentiment (from sentencizer())
#'
#' Aggregate sentence-level dataset containing sentiment (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Optional data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param actor_groups Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @param pos_cutoff Optional value above which sentence-level sentiment scores should be considered "positive"
#' @param neg_cutoff Optional value below which sentence-level sentiment scores should be considered "negative"
#' @param single_party Boolean to generate data only from sentences in which a single party is mentioned, defaults to FALSE
#' @return When no actor_groups are supplied, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When actor_groups are supplied, returns aggregations for each vector in the list
#' @export
#' @examples
#' sent_merger(df, actors_meta, actor_groups = NULL)
#################################################################################################
#################################### Generate actor-article dataset #############################
#################################################################################################

### NOTE: The exceptions for various partyId_a ids have been implemented because of an error with
### some individual actors, where the partyId of an individual actor doesn't match an actual
### partyId in the actor dataset

sent_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff = NULL, neg_cutoff = NULL, single_party = F) {
  grouper <- function(id2, df) {
    # Prevent usage of deprecated partyId_a ids, which are incorrect and should no longer be used
    if (any(str_ends(id2, '_a'))) {
      return("You're seemingly using a deprecated [partyId]_a id in your aggregations")
    }
    return(df[ids %in% id2,] %>%
             .[!duplicated(.,by = c('id','sentence_id')),.(
               actor.sent = sum(sent_binary_weighted)/sum(words),
               actor.sent_words = sum(sent_words),
               actor.words = sum(words),
               # actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
               actor.first = min(sentence_id),
               actor.occ = .N,
               publication_date = first(publication_date),
               ids = str_c(id2, collapse = '-')
             ), by = c('id')]
    )
  }

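  ## Illustrative sketch (not part of the original code; the ids are made up):
  ## grouper() pools all mentions of the ids in one vector, e.g.
  ## id2 = c('P_123_f','P_123_s'), into a single per-article row whose ids label
  ## becomes 'P_123_f-P_123_s'.
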
  ## Remove some of the metadata from the source df
  df <- data.table(df)[,.(
    (.SD),
    doctype = as.factor(`_source.doctype`),
    publication_date = as.Date(`_source.publication_date`),
    id = as.factor(`_id`)
  ), .SDcols = !c('_source.doctype','_source.publication_date','_id')]

  ## Create placeholder variables if sentiment is not scored
  if (!"sent_sum" %in% colnames(df)) {
    df <- df[,.(
      (.SD),
      sent_words = 0,
      sent_sum = 0
    )]
  }

  ## Unnest to sentence level
  ## Check if the raw sentiment data contains actor ids
  if ('ids' %in% colnames(df)) {
    df <- df[,lapply(.SD, unlist, recursive=F),
             .SDcols = c('sentence_id', 'sent_sum', 'words', 'sent_words','ids'),
             by = list(id,publication_date,doctype)]
  } else {
    df <- df[,lapply(.SD, unlist, recursive=F),
             .SDcols = c('sentence_id', 'sent_sum', 'words', 'sent_words'),
             by = list(id,publication_date,doctype)]
  }

  ## Compute sentence-level sentiment, classify it against the cutoffs, and weight by sentence length
  df <- df[,.(
    (.SD),
    sent = sent_sum/words
  )][,.(
    (.SD),
    sent_binary = case_when(
      sent > pos_cutoff ~ 1,
      sent == 0 ~ 0,
      sent >= neg_cutoff & sent <= pos_cutoff ~ 0,
      TRUE ~ -1
    )
  )][,.(
    (.SD),
    sent_binary_weighted = sent_binary*words
  )]

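  ## Illustrative sketch (not part of the original code; the cutoffs are made up):
  ## with pos_cutoff = 0.1 and neg_cutoff = -0.1, a 20-word sentence with sent = 0.15
  ## gets sent_binary = 1 and sent_binary_weighted = 20, while sent = -0.05 falls
  ## inside the neutral band and scores 0.
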
  text_sent <- df[,
                  .(text.sent = sum(sent_binary_weighted)/sum(words),
                    text.sent_words = sum(sent_words),
                    text.words = sum(words),
                    text.arousal = sum(sent_words)/sum(words),
                    text.sentences = .N,
                    doctype = first(doctype),
                    publication_date = first(publication_date)
                  ), by = list(id)]

  ## Create aggregations according to the list of actorId vectors in actor_groups
  if (!is.null(actor_groups)) {
    output <- lapply(actor_groups,grouper, df = df) %>%
      rbindlist(.) %>%
      left_join(text_sent, by=c("id","publication_date")) %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      ) %>%
      mutate(across(where(is.character), as.factor)) %>%
      mutate(across(where(is.Date), as.factor))
    return(output)
  } else if (!is.null(actors_meta)) {
    text_noactors <- df[lengths(ids) == 0L,
                        .(noactor.sent = sum(sent_binary_weighted)/sum(words),
                          noactor.sent_words = sum(sent_words),
                          noactor.words = sum(words),
                          noactor.arousal = sum(sent_words)/sum(words),
                          noactor.first = min(sentence_id),
                          noactor.occ = .N
                        ), by = list(id)]

    all <- df[lengths(ids) > 0L,
              .(actor.sent = sum(sent_binary_weighted)/sum(words),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = min(sentence_id),
                actor.occ = .N,
                publication_date = first(publication_date),
                ids = 'all'), by = list(id)]

    ## Unnest to actor level
    df <- df[,.(ids = as.character(unlist(ids))),
             by = list(id,publication_date,sentence_id, sent_sum, words, sent_words,sent_binary_weighted)
             ][ # Remove deprecated actor_partyids from the ES database
               !str_ends(ids, '_a')]

    ## Prepare actor metadata
    colnames(actors_meta) <- str_replace(colnames(actors_meta),'_source.','')
    actors_meta <- data.table(actors_meta)[,
                                           .((.SD),
                                             startDate = as.Date(startDate),
                                             endDate = as.Date(endDate),
                                             ids = ifelse(!is.na(actorId), actorId, partyId)
                                           ), .SDcols = -c('_id','startDate','endDate','_index','_type','_score')
                                           ]

    ## Create a table with partyIds by date and actorId to join by
    actors_party <- actors_meta %>%
      group_by(ids,partyId,startDate,endDate) %>%
      summarise() %>%
      na.omit() %>%
      ungroup() %>%
      data.table(.)

    ## Add partyId to each actorId without filtering parties out
    df <- df %>%
      # Fill the partyId column for actor mentions
      actors_party[., c(colnames(.),'partyId'), # Join by actorId, within the active period (start/endDate)
                   on = .(ids == ids, startDate <= publication_date, endDate >= publication_date),
                   with = F] %>%
      # Fill the partyId column for party mentions
      .[is.na(partyId), partyId:=str_sub(ids, start = 1, end = -3)] %>%
      # Some actors seemingly belong to different parties on the same day, hence unique rows are based on both (actor)ids and partyId
      .[!duplicated(.,by = c('id','ids','sentence_id','partyId')),] # Keep all unique rows

    ## Remove sentences containing more than one party
    if (single_party) {
      # Create a variable indicating the number of unique party ids per sentence, and keep only sentences where unique parties == 1
      df <- df %>%
        .[, upid := length(unique(partyId)), by = c('id','sentence_id')] %>%
        .[upid == 1,]
    }

    ## Create aggregate measures for individual actors
    actors_merged <- df[str_starts(ids, 'A_')] %>%
      # Remove duplicate rows when an actor is counted multiple times in the same sentence, because of multiple functions or parties
      .[!duplicated(.,by = c('id','ids','sentence_id')),] %>%
      .[,
        .(actor.sent = sum(sent_binary_weighted)/sum(words),
          actor.sent_words = sum(sent_words),
          actor.words = sum(words),
          actor.arousal = sum(sent_words)/sum(words),
          actor.first = min(sentence_id),
          actor.occ = .N,
          publication_date = first(publication_date)), by = list(id, ids)]

    ## Create an actor metadata data frame per active date (one row per day per actor)
    actors_merged <- actors_meta[actors_merged,
                                 c('x.startDate','x.endDate',colnames(actors_merged), 'lastName','firstName','function.','gender','yearOfBirth','parlPeriod','partyId','ministerName','ministryId','actorId','startDate','endDate'),
                                 on =.(ids = ids, startDate <= publication_date, endDate >= publication_date),
                                 mult = 'all',
                                 with = F][,.(
                                   startDate = x.startDate,
                                   endDate = x.endDate,
                                   (.SD)
                                 ), .SDcols = -c('x.startDate', 'x.endDate','startDate','endDate')]

    ## Generate party-actor aggregations (mfsa)
    # Create the party data table
    parties_actors <- df %>%
      .[!duplicated(.,by = c('id','partyId','sentence_id')),.( # Remove rows (sentences) where a party is counted multiple times
        actor.sent = sum(sent_binary_weighted)/sum(words),
        actor.sent_words = sum(sent_words),
        actor.words = sum(words),
        actor.arousal = sum(sent_words)/sum(words),
        actor.first = min(sentence_id),
        actor.occ = .N,
        publication_date = first(publication_date)
      ), by = c('id','partyId')] # Summarize by article and partyId
    # Add party metadata
    parties_actors <- actors_meta[str_starts(ids, 'P_')][parties_actors, on = c('partyId'), mult = 'first'][!is.na(id),.(ids = str_c(partyId,"_mfsa"), (.SD)), .SDcols = -c('ids')]

    ## Generate party aggregations (mfs)
    parties <- df[str_ends(ids,'_f') | str_ends(ids,'_s'),.(
      ids = str_sub(ids, start = 1, end = -3),
      (.SD)
    ),.SDcols = -c('ids')] %>%
      .[!duplicated(.,by = c('id','ids','sentence_id')),.(
        actor.sent = sum(sent_binary_weighted)/sum(words),
        actor.sent_words = sum(sent_words),
        actor.words = sum(words),
        actor.arousal = sum(sent_words)/sum(words),
        actor.first = min(sentence_id),
        actor.occ = .N,
        publication_date = first(publication_date)
      ), by = c('id','ids')]
    parties <- actors_meta[parties, on = c('ids'), mult = 'first'][!is.na(id),.(ids = str_c(ids,"_mfs"), (.SD)), .SDcols = -c('ids')]

    ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
    df <- bind_rows(actors_merged, parties, parties_actors, all) %>%
      left_join(.,text_sent, by=c("id","publication_date")) %>%
      left_join(.,text_noactors, by="id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      ) %>%
      ungroup() %>%
      select(-contains('Search'),-starts_with('not')) %>%
      mutate(across(where(is.character), as.factor)) %>%
      mutate(across(where(is.Date), as.factor))
    return(df)
  } else {
    df <- text_sent %>%
      mutate(
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      ) %>%
      ungroup() %>%
      mutate(across(where(is.character), as.factor)) %>%
      mutate(across(where(is.Date), as.factor))
    return(df)
  }
}

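## Illustrative usage sketch (not part of the original code; the ids and cutoffs are made up):
## aggregate two hypothetical parties into one group each, then build the dataset.
# groups <- list(c('P_123_f','P_123_s'), c('P_456_f','P_456_s'))
# sent_merger(df, actor_groups = groups, pos_cutoff = 0.1, neg_cutoff = -0.1)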
@ -0,0 +1,116 @@
#' Generate sentence-level dataset with sentiment and actor presence
#'
#' Generate sentence-level dataset with sentiment and actor presence
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional data frame containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.
#' @param localhost Unused, defaults to NULL
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' sentencizer(out, sent_dict = NULL, validation = F)
#################################################################################################
#################################### Generate sentence-level dataset ############################
#################################################################################################
sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) {
  ## Despite the function name, parallel processing is not used, because it is slower
  par_sent <- function(row, out, sent_dict = NULL) {
    out <- out[row,]
    ## Create a df with article metadata (fields that are included in the elasticizer function)
    metadata <- out %>%
      select(`_id`,`_source.publication_date`,`_source.doctype`) %>%
      mutate(`_source.publication_date` = as.factor(`_source.publication_date`),
             `_source.doctype` = as.factor(`_source.doctype`))

    ## Unnest documents into individual words
    ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
      unnest(cols = colnames(.)) %>%
      select(-one_of('exists')) %>%
      unnest(cols = colnames(.)) %>%
      filter(upos != 'PUNCT')

    ## If there is a dictionary, apply it
    if (!is.null(sent_dict)) {
      ## If the dictionary contains the column lem_u, assume lemma_upos format
      if ("lem_u" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          mutate(lem_u = str_c(lemma,'_',upos)) %>%
          left_join(sent_dict, by = 'lem_u')
      ## If the dictionary contains the column lemma, assume simple lemma format
      } else if ("lemma" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          left_join(sent_dict, by = 'lemma') %>%
          mutate(lem_u = lemma)
      }

      ## Group by sentences, and generate dictionary scores per sentence
      ud_sent <- ud_sent %>%
        group_by(`_id`,sentence_id) %>%
        mutate(
          prox = case_when(
            is.na(prox) == T ~ 0,
            TRUE ~ prox
          )
        ) %>%
        summarise(sent_sum = sum(prox),
                  words = length(lemma),
                  sent_words = sum(prox != 0)
                  # sent_lemmas = list(lem_u[prox != 0])
        )
    ## If there is no dictionary, create a ud_sent with just sentence ids and word counts per sentence
    } else {
      ud_sent <- ud_sent %>%
        group_by(`_id`,sentence_id) %>%
        summarise(words = length(lemma))
    }

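    ## Illustrative sketch (not part of the original code; the numbers are made up):
    ## after the dictionary join, ud_sent holds one row per (_id, sentence_id), e.g.
    ## sent_sum = 2, words = 12, sent_words = 3 for a 12-word sentence in which three
    ## scored lemmas sum to 2.
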
    ## Remove the ud output from the source before further processing
    out <- select(out, -`_source.ud`)

    ## If dictionary validation, return just the sentences that have been hand-coded
    if (validation == T) {
      codes_sent <- ud_sent %>%
        left_join(.,out, by='_id') %>%
        rowwise() %>%
        filter(sentence_id == `_source.codes.sentence.id`)
      return(codes_sent)
    }

    if ("_source.computerCodes.actorsDetail" %in% colnames(out)) {
      ## If actor details are in the source, create a vector of actor ids for each sentence
      out <- out %>%
        unnest(`_source.computerCodes.actorsDetail`) %>%
        # mutate(ids_list = ids) %>%
        unnest(ids) %>%
        unnest(sentence_id) %>%
        group_by(`_id`,sentence_id) %>%
        summarise(
          ids = list(ids)
        )
    } else {
      ## If no actor details, keep one row per article and add a placeholder sentence_id
      out <- out %>%
        group_by(`_id`) %>%
        summarise() %>%
        mutate(sentence_id = 1)
    }

    ## Combine ud_sent with the source dataset
    out <- out %>%
      left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
      group_by(`_id`)

    out <- out %>%
      summarise_all(list) %>%
      left_join(.,metadata,by='_id') %>%
      ungroup()
    return(out)
  }
  saveRDS(par_sent(1:nrow(out),out = out, sent_dict=sent_dict), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
  return()

  ### Keeping the option for parallel computation
  # microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out),out = out, sent_dict=sent_dict), times = 1)
  # plan(multiprocess, workers = cores)
  # chunks <- split(1:nrow(out), sort(1:nrow(out)%%cores))
  # microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks,par_sent, out=out, sent_dict=sent_dict)), times = 1)
}
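## Illustrative usage sketch (not part of the original code; the dictionary entries are made up):
## score sentences with a hypothetical lemma_upos dictionary; the batch is written to an RDS file.
# sent_dict <- data.frame(lem_u = c('goed_ADJ','slecht_ADJ'), prox = c(1,-1))
# sentencizer(out, sent_dict = sent_dict)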
@ -0,0 +1,56 @@
#' Generate UDpipe output from base text
#'
#' Generate UDpipe output from base text
#' @param file Filename of the file to read in, also used for generating the output file name
#' @param wd Working directory where *file*s can be found
#' @param ud_file Filename of the udpipe model to use, should be in *wd*
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @return No return value, the generated bulk update is saved as an RDS file named ud_[file] in *wd*
#' @export
#' @examples
#' ud_update(file, wd, ud_file, ver)
#'

# punct_check <- function(str) {
#   if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) {
#     return(str_c(str, '.'))
#   }
# }

ud_update <- function(file, wd, ud_file, ver) {
  out <- readRDS(str_c(wd,'/',file)) %>%
    out_parser(., field = '_source', clean = F)
  ud_model <- udpipe_load_model(file = str_c(wd,'/',ud_file))
  ud <- as.data.frame(udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
    group_by(doc_id) %>%
    summarise(
      sentence_id = list(as.integer(sentence_id)),
      token_id = list(as.integer(token_id)),
      lemma = list(as.character(lemma)),
      upos = list(as.character(upos)),
      feats = list(as.character(feats)),
      head_token_id = list(as.integer(head_token_id)),
      dep_rel = list(as.character(dep_rel)),
      start = list(as.integer(start)),
      end = list(as.integer(end)),
      exists = list(TRUE)
    )
  bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
  saveRDS(bulk, file = str_c(wd,'/ud_',file))
  # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
  return()
}

#### Old code ####
# Use | as separator (this is not done anymore, as all data is stored as actual lists instead of strings. Code kept for future reference)
# str_replace_all("\\|", "") %>%
# Remove VERY annoying single backslashes and replace them by whitespaces
# str_replace_all("\\\\", " ") %>%
# Replace any occurrence of (double) whitespace characters by a single regular whitespace
# t_id <- paste(ud[,5], collapse = '|')
# lemmatized <- paste(ud[,7], collapse = '|') %>%
# # Replacing double quotes with single quotes in text
#   str_replace_all("\"","\'")
# upos_tags <- paste(ud[,8], collapse = '|')
# head_t_id <- paste(ud[,11], collapse = '|')
# dep_rel <- paste(ud[,12], collapse = '|')
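## Illustrative usage sketch (not part of the original code; the paths and model filename are made up):
## parse one stored batch with a hypothetical Dutch udpipe model; output lands in wd as ud_<file>.
# ud_update('batch_1.Rds', wd = '~/data', ud_file = 'dutch-alpino-ud-2.4.udpipe', ver = 'udV2')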
@ -0,0 +1,44 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actorizer.R
\name{actorizer}
\alias{actorizer}
\title{Updater function for elasticizer: Conduct actor searches}
\usage{
actorizer(
  out,
  localhost = F,
  ids,
  prefix,
  postfix,
  pre_tags,
  post_tags,
  es_super,
  ver
)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

\item{localhost}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)}

\item{ids}{List of actor ids}

\item{prefix}{Regex containing prefixes that should be excluded from hits}

\item{postfix}{Regex containing postfixes that should be excluded from hits}

\item{pre_tags}{Highlighter pre-tag}

\item{post_tags}{Highlighter post-tag}

\item{es_super}{Password for write access to ElasticSearch}

\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
As this is a nested function used within elasticizer, there is no return output
}
\description{
Updater function for elasticizer: Conduct actor searches
}
\examples{
actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver)
}
@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cv_generator.R
\name{cv_generator}
\alias{cv_generator}
\title{Generate CV folds for nested cross-validation}
\usage{
cv_generator(outer_k, inner_k, vec, grid, seed)
}
\arguments{
\item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1, holdout sampling is used, with outer_k being the proportion of test data}

\item{inner_k}{Number of inner CV (parameter optimization) folds}

\item{vec}{Vector containing the true values of the classification}

\item{grid}{Parameter grid for optimization}

\item{seed}{Integer used as seed for random number generation}
}
\value{
A nested set of lists with row numbers
}
\description{
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
}
\examples{
cv_generator(outer_k, inner_k, vec, grid, seed)
}
@ -1,26 +1,46 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dupe_detect.R
\name{dupe_detect}
\alias{dupe_detect}
\title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
\usage{
dupe_detect(
  row,
  grid,
  cutoff_lower,
  cutoff_upper = 1,
  es_pwd,
  es_super,
  words,
  localhost = T,
  ver
)
}
\arguments{
\item{row}{Row of grid to parse}

\item{grid}{A cross-table of all possible combinations of doctypes and dates}

\item{cutoff_lower}{Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive)}

\item{cutoff_upper}{Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive)}

\item{es_pwd}{Password for Elasticsearch read access}

\item{es_super}{Password for write access to ElasticSearch}

\item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so the document will be a little shorter than [words])}

\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}

\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
dupe_objects.json and a data frame containing each id and all its duplicates. remove_ids.txt and a character vector with the list of ids to be removed. Files are written to the current working directory
}
\description{
Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
}
\examples{
dupe_detect(1, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T)
}
@ -0,0 +1,41 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimator.R
\name{estimator}
\alias{estimator}
\title{Generate models and get classifications on test sets}
\usage{
estimator(
  row,
  grid,
  outer_folds,
  inner_folds,
  dfm,
  class_type,
  model,
  we_vectors
)
}
\arguments{
\item{row}{Row number of current item in grid}

\item{grid}{Grid with model parameters and CV folds}

\item{outer_folds}{List with row numbers for outer folds}

\item{inner_folds}{List with row numbers for inner folds}

\item{dfm}{DFM containing labeled documents}

\item{class_type}{Name of column in docvars() containing the classes}

\item{model}{Model to use (currently only nb)}

\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
Depending on the mode: if folds are included, returns the true and predicted classes of the test set, with parameters, model and model idf. When there are no folds, returns the final model and idf values.
}
\description{
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
}
\examples{
estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors)
}
@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/feat_select.R
\name{feat_select}
\alias{feat_select}
\title{Select features using quanteda textstat_keyness}
\usage{
feat_select(topic, dfm, class_type, percentile, measure = "chi2")
}
\arguments{
\item{topic}{The topic to determine keywords for}

\item{dfm}{The input dfm}

\item{class_type}{Name of the column in docvars containing the classification}

\item{percentile}{Cutoff for the list of words that should be returned}

\item{measure}{Measure to use in determining keyness, default = chi2; see textstat_keyness for other options}
}
\value{
A vector of words that are key to the topic
}
\description{
Select features based on the textstat_keyness function and a percentile cutoff

Percentiles are based on absolute values, i.e. both on words that are key and *not* key to the topic
}
\examples{
feat_select(topic, dfm, class_type, percentile, measure = "chi2")
}
@ -0,0 +1,30 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lemma_writer.R
\name{lemma_writer}
\alias{lemma_writer}
\title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
\usage{
lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
}
\arguments{
\item{out}{The elasticizer-generated data frame}

\item{file}{The file to write the output to (including path, when required). When documents = T, provide a path including the trailing /}

\item{localhost}{Unused, but defaults to FALSE}

\item{documents}{Indicate whether the writer should output to a single file, or to individual documents}

\item{lemma}{Indicate whether document output should be lemmas or the original document}

\item{cores}{Indicate the number of cores to use for parallel processing}
}
\value{
No return value, the text output is written to [file] (or to individual documents when documents = T)
}
\description{
Generates text output files (without punctuation) for external applications, such as GloVe embeddings
}
\examples{
lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
}
@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/metric_gen.R
\name{metric_gen}
\alias{metric_gen}
\title{Generate performance statistics for models}
\usage{
metric_gen(x)
}
\arguments{
\item{x}{A data frame containing at least the columns "pred" and "tv"}
}
\value{
x, with additional columns for performance metrics
}
\description{
Generate performance statistics for models, based on their predictions and the true values
}
\examples{
metric_gen(x)
}
@ -0,0 +1,64 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/modelizer_old.R
\name{modelizer_old}
\alias{modelizer_old}
\title{Generate a classification model}
\usage{
modelizer_old(
  dfm,
  cores_outer,
  cores_grid,
  cores_inner,
  cores_feats,
  seed,
  outer_k,
  inner_k,
  model,
  class_type,
  opt_measure,
  country,
  grid
)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}

\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}

\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)}

\item{cores_inner}{Number of cores to use for the inner CV loop (cannot be more than the number of inner CV folds, multiplies with cores_outer and cores_grid)}

\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores_outer, cores_grid and cores_inner)}

\item{seed}{Integer to use as seed for random number generation, ensures replicability}

\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}

\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}

\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}

\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}

\item{opt_measure}{Label of the measure in the confusion matrix to use as performance indicator}

\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for the filename)}

\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
}
\value{
An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
}
\description{
Generate a nested cross-validated classification model based on a dfm with class labels as docvars

Currently only supports Naïve Bayes using quanteda's textmodel_nb

Hyperparameter optimization is enabled through the grid parameter

A grid should be generated from vectors with the labels as described for each model, using the crossing() command

For Naïve Bayes, the following parameters can be used:

- percentiles (cutoff point for tf-idf feature selection)

- measures (what measure to use for determining feature importance, see textstat_keyness for options)
}
\examples{
modelizer_old(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
}
@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/out_parser.R
\name{out_parser}
\alias{out_parser}
\title{Parse raw text into a single field}
\usage{
out_parser(out, field, clean = F)
}
\arguments{
\item{out}{The original output data frame}

\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}

\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching a regex (see code)}
}
\value{
A parsed output data frame including the additional column 'merged', containing the merged text
}
\description{
Parse raw text from the MaML database into a single field
}
\examples{
out_parser(out, field, clean = F)
}
@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preproc.R
\name{preproc}
\alias{preproc}
\title{Preprocess dfm data for use in modeling procedure}
\usage{
preproc(dfm_train, dfm_test = NULL, params, we_vectors)
}
\arguments{
\item{dfm_train}{Training dfm}

\item{dfm_test}{Testing dfm if applicable, otherwise NULL}

\item{params}{Row from grid with parameter optimization}

\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
List with dfm_train and dfm_test, processed according to the parameters in params
}
\description{
Process dfm according to the parameters provided in params
}
\examples{
preproc(dfm_train, dfm_test = NULL, params, we_vectors)
}
@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/query_gen_actors.R
\name{query_gen_actors}
\alias{query_gen_actors}
\title{Generate actor search queries based on data in actor db}
\usage{
query_gen_actors(actor, pre_tags, post_tags)
}
\arguments{
\item{actor}{A row from the output of elasticizer() when run on the 'actor' index}

\item{pre_tags}{Highlighter pre-tag}

\item{post_tags}{Highlighter post-tag}
}
\value{
A data frame containing the queries, related actor ids and actor function
}
\description{
Generate actor search queries based on data in actor db
}
\examples{
query_gen_actors(actor, pre_tags, post_tags)
}
@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/query_string.R
\name{query_string}
\alias{query_string}
\title{Generate a query string query for ElasticSearch}
\usage{
query_string(query, fields = F, random = F, default_operator = "AND")
}
\arguments{
\item{query}{Query string in ElasticSearch query string format}

\item{fields}{List of field names to return, defaults to all}

\item{random}{Return randomized results. Boolean, defaults to FALSE}

\item{default_operator}{Default Boolean operator ("AND" or "OR") between terms in the query string, defaults to "AND"}
}
\value{
A formatted ElasticSearch query string query
}
\description{
Generate a query string query for ElasticSearch
}
\examples{
query_string(query)
}
@ -0,0 +1,37 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sent_merger.R
\name{sent_merger}
\alias{sent_merger}
\title{Aggregate sentence-level dataset containing sentiment (from sentencizer())}
\usage{
sent_merger(
  df,
  actors_meta = NULL,
  actor_groups = NULL,
  pos_cutoff = NULL,
  neg_cutoff = NULL,
  single_party = F
)
}
\arguments{
\item{df}{Data frame with actor ids, produced by sentencizer}

\item{actors_meta}{Optional data frame containing actor metadata obtained using elasticizer(index="actors")}

\item{actor_groups}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}

\item{pos_cutoff}{Optional value above which sentence-level sentiment scores should be considered "positive"}

\item{neg_cutoff}{Optional value below which sentence-level sentiment scores should be considered "negative"}

\item{single_party}{Boolean to generate data only from sentences in which a single party is mentioned, defaults to FALSE}
}
\value{
When no actor_groups are supplied, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When actor_groups are supplied, returns aggregations for each vector in the list
}
\description{
Aggregate sentence-level dataset containing sentiment (from sentencizer())
}
\examples{
sent_merger(df, actors_meta, actor_groups = NULL)
}
@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sentencizer.R
\name{sentencizer}
\alias{sentencizer}
\title{Generate sentence-level dataset with sentiment and actor presence}
\usage{
sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
}
\arguments{
\item{out}{Data frame produced by elasticizer}

\item{sent_dict}{Optional data frame containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.}

\item{localhost}{Unused, defaults to NULL}

\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
}
\value{
No return value, data per batch is saved in an RDS file
}
\description{
Generate sentence-level dataset with sentiment and actor presence
}
\examples{
sentencizer(out, sent_dict = NULL, validation = F)
}
@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ud_update.R
\name{ud_update}
\alias{ud_update}
\title{Generate UDpipe output from base text}
\usage{
ud_update(file, wd, ud_file, ver)
}
\arguments{
\item{file}{Filename of the file to read in, also used for generating the output file name}

\item{wd}{Working directory where *file*s can be found}

\item{ud_file}{Filename of the udpipe model to use, should be in *wd*}

\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
No return value, the generated bulk update is saved as an RDS file named ud_[file] in *wd*
}
\description{
Generate UDpipe output from base text
}
\examples{
ud_update(file, wd, ud_file, ver)
}