#' Generate sentence-level actor data frames (with sentiment) from database
#'
#' Unnests the UD parse of every document in \code{out} into individual words
#' (punctuation removed), optionally scores the words against a sentiment
#' dictionary, aggregates the scores per sentence and per document, attaches
#' the actor ids detected in each sentence, and saves the result as an RDS
#' file in the current working directory.
#'
#' @param out Data frame produced by elasticizer. The code reads the columns
#'   \code{_id}, \code{_source.publication_date}, \code{_source.doctype},
#'   \code{_source.ud} and \code{_source.computerCodes.actorsDetail}.
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable. When \code{NULL}, no sentiment columns are computed and only the sentence/actor structure is saved.
#' @param localhost Currently not used by this function.
#' @param validation Currently not used by this function.
#' @return \code{NULL} (invisibly); data per batch is saved in an RDS file
#'   named \code{df_out<unix timestamp>.Rds}. Nothing is saved when \code{out}
#'   has no rows.
#' @export
#' @examples
#' \dontrun{
#' sentencizer(out, sent_dict = NULL)
#' }
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
sentencizer <- function(out, sent_dict = NULL, localhost = NULL,
                        validation = FALSE) {
  # Worker that processes the rows of `out` selected by `row`. Kept as a
  # separate function so that it can be mapped over chunks of rows (see the
  # note on parallel computation at the bottom).
  par_sent <- function(row, out, sent_dict = NULL) {
    has_dict <- !is.null(sent_dict)
    if (has_dict) {
      # Fail early with a clear message instead of an obscure
      # "object 'prox' not found" deep inside the pipeline.
      if (!any(c("lem_u", "lemma") %in% colnames(sent_dict))) {
        stop("sent_dict must contain either a \"lem_u\" or a \"lemma\" column",
             call. = FALSE)
      }
      if (!"prox" %in% colnames(sent_dict)) {
        stop("sent_dict must contain a \"prox\" column", call. = FALSE)
      }
    }

    out <- out[row, ]

    # Document-level metadata, re-attached to the result at the very end
    metadata <- out %>%
      select(`_id`, `_source.publication_date`, `_source.doctype`)

    # One row per word: unnest the UD parse twice (document -> parse fields ->
    # tokens), drop the optional 'exists' flag and remove punctuation
    ud_sent <- out %>%
      select(`_id`, `_source.ud`) %>%
      unnest(cols = colnames(.)) %>%
      select(-one_of("exists")) %>%
      unnest(cols = colnames(.)) %>%
      filter(upos != "PUNCT")

    if (has_dict) {
      # Match words to the dictionary, on lemma_UPOS pairs when available and
      # on bare lemmas otherwise
      if ("lem_u" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          mutate(lem_u = str_c(lemma, "_", upos)) %>%
          left_join(sent_dict, by = "lem_u")
      } else {
        ud_sent <- ud_sent %>%
          left_join(sent_dict, by = "lemma") %>%
          mutate(lem_u = lemma)
      }
      # Sentence-level scores; words without a dictionary entry count as 0.
      # replace() is used (rather than case_when) so that integer-valued
      # prox columns do not trigger a type error.
      ud_sent <- ud_sent %>%
        mutate(prox = replace(prox, is.na(prox), 0)) %>%
        group_by(`_id`, sentence_id) %>%
        summarise(
          sent_sum = sum(prox),
          words = n(),
          sent_words = sum(prox != 0),
          sent_lemmas = list(lem_u[prox != 0])
        ) %>%
        mutate(
          sent = sent_sum / words,
          arousal = sent_words / words
        )
    } else {
      # No dictionary: keep one row per sentence. `_id` has to stay in the
      # grouping because the actor join below is keyed on (_id, sentence_id).
      ud_sent <- ud_sent %>%
        group_by(`_id`, sentence_id) %>%
        summarise()
    }

    ### Unnest out_row to individual actor ids
    # Collect, per sentence, the list of actor ids found in it, and attach
    # that list to the sentence-level data (sentences without actors get NULL)
    out <- out %>%
      select(-`_source.ud`) %>%
      unnest(`_source.computerCodes.actorsDetail`) %>%
      unnest(ids) %>%
      unnest(sentence_id) %>%
      group_by(`_id`, sentence_id) %>%
      summarise(ids = list(ids)) %>%
      left_join(ud_sent, ., by = c("_id", "sentence_id")) %>%
      group_by(`_id`)

    # Collapse to one row per document, with every sentence-level variable
    # stored as a list column
    doc_level <- out %>%
      summarise_all(list)

    if (has_dict) {
      # Document-level sentiment, only defined when a dictionary was applied
      text_sent <- out %>%
        summarise(
          text.sent_sum = sum(sent_sum),
          text.words = sum(words),
          text.sent_words = sum(sent_words),
          text.sent_lemmas = I(list(unlist(sent_lemmas))),
          text.sentences = n()
        ) %>%
        mutate(
          text.sent = text.sent_sum / text.words,
          text.arousal = text.sent_words / text.words
        )
      doc_level <- left_join(doc_level, text_sent, by = "_id")
    }

    left_join(doc_level, metadata, by = "_id")
  }

  # Nothing to process: do not write an (invalid) output file
  if (nrow(out) == 0) {
    message("sentencizer: 'out' contains no rows, nothing saved")
    return(invisible(NULL))
  }

  result <- par_sent(seq_len(nrow(out)), out = out, sent_dict = sent_dict)
  # The file name carries the current unix timestamp to keep batches apart
  saveRDS(result, file = paste0("df_out", as.numeric(Sys.time()), ".Rds"))

  ### Keeping the option for parallel computation
  # microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out),out = out, sent_dict=sent_dict), times = 1)
  # plan(multiprocess, workers = cores)
  # chunks <- split(1:nrow(out), sort(1:nrow(out)%%cores))
  # microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks,par_sent, out=out, sent_dict=sent_dict)), times = 1)
  invisible(NULL)
}