mamlr/R/sentencizer.R

#' Generate sentence-level dataset with sentiment and actor presence
#'
#' Unnests the `_source.ud` column of every article into individual words,
#' drops punctuation, aggregates the words per sentence (word counts and,
#' when a dictionary is supplied, dictionary scores), attaches the actor ids
#' found per sentence when `_source.computerCodes.actorsDetail` is present,
#' and finally collapses everything to one row per article. The result is
#' written to an RDS file named `df_out<unix timestamp>.Rds` in the current
#' working directory.
#'
#' @param out Data frame produced by elasticizer. Must contain the columns
#'   `_id` and `_source.ud`.
#' @param sent_dict Optional dataframe containing the sentiment dictionary and
#'   values. Words should be either in the "lem_u" column when they consist of
#'   lemma_upos pairs, or in the "lemma" column when they are just lemmas. The
#'   "prox" column should either contain word values, or 0s if not applicable.
#' @param localhost Not used by this function; kept in the signature so that
#'   existing calls that pass it keep working.
#' @param validation Boolean indicating whether human validation should be
#'   performed on sentiment scoring. When `TRUE`, only the sentences whose id
#'   equals `_source.codes.sentence.id` are saved.
#' @return `NULL`, invisibly. Data per batch is saved in an RDS file. When
#'   `out` has no rows a warning is issued and no file is written.
#' @export
#' @examples
#' \dontrun{
#' sentencizer(out, sent_dict = NULL, validation = FALSE)
#' }
sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = FALSE) {
  if (!is.data.frame(out)) {
    stop("`out` must be a data frame", call. = FALSE)
  }

  ## Validate the dictionary up front, so that a malformed dictionary gives a
  ## clear error instead of an "object 'prox' not found" deep in the pipeline
  if (!is.null(sent_dict)) {
    dict_cols <- colnames(sent_dict)
    if (!any(c("lem_u", "lemma") %in% dict_cols)) {
      stop("`sent_dict` must contain either a \"lem_u\" or a \"lemma\" column",
           call. = FALSE)
    }
    if (!("prox" %in% dict_cols)) {
      stop("`sent_dict` must contain a \"prox\" column", call. = FALSE)
    }
  }

  ## An empty batch has nothing to unnest; `1:nrow(out)` would even select a
  ## non-existing row (1:0 is c(1, 0)), so bail out explicitly
  if (nrow(out) == 0L) {
    warning("`out` contains no rows; nothing was processed or saved",
            call. = FALSE)
    return(invisible(NULL))
  }

  ## Accept TRUE as well as truthy values such as 1, like `validation == T` did
  do_validation <- isTRUE(as.logical(validation))

  ## Despite the function name, parallel processing is not used, because it is slower
  ## `row` holds the row indices of `out` that should be processed
  par_sent <- function(row, out, sent_dict = NULL) {
    out <- out[row, ]

    ## Create df with article metadata (fields that are included in the elasticizer function)
    metadata <- out %>%
      select(`_id`, contains("_source"), -contains("computerCodes.actors"), -contains("ud"))

    ## Unnest documents into individual words, and drop punctuation
    ud_sent <- out %>%
      select(`_id`, `_source.ud`) %>%
      unnest(cols = colnames(.)) %>%
      select(-one_of('exists')) %>%
      unnest(cols = colnames(.)) %>%
      filter(upos != 'PUNCT')

    ## If there is a dictionary, apply it
    if (!is.null(sent_dict)) {
      if ("lem_u" %in% colnames(sent_dict)) {
        ## If the dictionary contains the column lem_u, assume lemma_upos format
        ud_sent <- ud_sent %>%
          mutate(lem_u = str_c(lemma, '_', upos)) %>%
          left_join(sent_dict, by = 'lem_u')
      } else if ("lemma" %in% colnames(sent_dict)) {
        ## If the dictionary contains the column lemma, assume simple lemma format
        ud_sent <- ud_sent %>%
          left_join(sent_dict, by = 'lemma') %>%
          mutate(lem_u = lemma)
      }

      ## Group by sentences, and generate dictionary scores per sentence.
      ## Words that are not in the dictionary get a score of 0.
      ud_sent <- ud_sent %>%
        group_by(`_id`, sentence_id) %>%
        mutate(
          prox = case_when(
            is.na(prox) ~ 0,
            TRUE ~ prox
          )
        ) %>%
        summarise(
          sent_sum = sum(prox),
          words = length(lemma),
          sent_words = sum(prox != 0),
          sent_lemmas = list(lem_u[prox != 0])
        ) %>%
        mutate(
          sent = sent_sum / words,
          arousal = sent_words / words
        )
    } else {
      ## If there is no dictionary, create a ud_sent, with just sentence ids and word counts per sentence
      ud_sent <- ud_sent %>%
        group_by(`_id`, sentence_id) %>%
        summarise(words = length(lemma))
    }

    ## Remove ud output from source before further processing
    out <- select(out, -`_source.ud`)

    ## If dictionary validation, return just the sentences that have been hand-coded
    if (do_validation) {
      codes_sent <- ud_sent %>%
        left_join(., out, by = '_id') %>%
        rowwise() %>%
        filter(sentence_id == `_source.codes.sentence.id`)
      return(codes_sent)
    }

    if ("_source.computerCodes.actorsDetail" %in% colnames(out)) {
      ## If actor details in source, create vector of actor ids for each sentence
      out <- out %>%
        unnest(`_source.computerCodes.actorsDetail`) %>%
        unnest(ids) %>%
        unnest(sentence_id) %>%
        group_by(`_id`, sentence_id) %>%
        summarise(
          ids = list(ids)
        )
    } else {
      ## If no actor details, keep one row per article and add a bogus sentence_id
      out <- out %>%
        group_by(`_id`) %>%
        summarise() %>%
        mutate(sentence_id = 1)
    }

    ## Combine ud_sent with the source dataset. ud_sent is the left-hand side,
    ## so every sentence is kept, whether or not actor ids were found for it.
    out <- out %>%
      left_join(ud_sent, ., by = c('_id', 'sentence_id')) %>%
      group_by(`_id`)

    if (!is.null(sent_dict)) {
      ## If there is a sent_dict, generate sentiment scores on article level
      text_sent <- out %>%
        summarise(
          text.sent_sum = sum(sent_sum),
          text.words = sum(words),
          text.sent_words = sum(sent_words),
          text.sent_lemmas = I(list(unlist(sent_lemmas))),
          text.sentences = n()
        ) %>%
        mutate(
          text.sent = text.sent_sum / text.words,
          text.arousal = text.sent_words / text.words
        )
    } else {
      ## Without a sent_dict, only collect word and sentence counts per article
      text_sent <- out %>%
        summarise(
          text.words = sum(words),
          text.sentences = n()
        )
    }

    ## Collapse the sentence-level columns into one list per article, and add
    ## the article-level scores and the metadata
    out <- out %>%
      summarise_all(list) %>%
      left_join(., text_sent, by = '_id') %>%
      left_join(., metadata, by = '_id') %>%
      ungroup()
    return(out)
  }

  saveRDS(
    par_sent(seq_len(nrow(out)), out = out, sent_dict = sent_dict),
    file = paste0('df_out', as.numeric(as.POSIXct(Sys.time())), '.Rds')
  )
  invisible(NULL)
  ### Keeping the option for parallel computation
  # microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out),out = out, sent_dict=sent_dict), times = 1)
  # plan(multiprocess, workers = cores)
  # chunks <- split(1:nrow(out), sort(1:nrow(out)%%cores))
  # microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks,par_sent, out=out, sent_dict=sent_dict)), times = 1)
}
sentencizer: commented code 4 years ago			`#' Generate sentence-level dataset with sentiment and actor presence`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`#'`
sentencizer: commented code 4 years ago			`#' Generate sentence-level dataset with sentiment and actor presence`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`#' @param out Data frame produced by elasticizer`
			`#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 0s if not applicable.`
sentencizer: minor updates 4 years ago			`#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`#' @return No return value, data per batch is saved in an RDS file`
			`#' @export`
			`#' @examples`
sentencizer: minor updates 4 years ago			`#' sentencizer(out, sent_dict = NULL, validation = F)`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`#################################################################################################`
sentencizer: commented code 4 years ago			`#################################### Generate sentence-level dataset#############################`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`#################################################################################################`
			`sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) {`
sentencizer: commented code 4 years ago			`## Despite the function name, parallel processing is not used, because it is slower`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`par_sent <- function(row, out, sent_dict = NULL) {`
			`out <- out[row,]`
sentencizer: commented code 4 years ago			`## Create df with article metadata (fields that are included in the elasticizer function)`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`metadata <- out %>%`
sentencizer: minor updates 4 years ago			select(`_id`,contains("_source"),-contains("computerCodes.actors"),-contains("ud"))
sentencizer: commented code 4 years ago
			`## Unnest documents into individual words`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
			`unnest(cols = colnames(.)) %>%`
			`select(-one_of('exists')) %>%`
			`unnest(cols = colnames(.)) %>%`
			`filter(upos != 'PUNCT')`

sentencizer: commented code 4 years ago			`## If there is a dictionary, apply it`
sentencizer: minor updates 4 years ago			`if (!is.null(sent_dict)) {`
sentencizer: commented code 4 years ago			`## If the dictionary contains the column lem_u, assume lemma_upos format`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`if ("lem_u" %in% colnames(sent_dict)) {`
			`ud_sent <- ud_sent %>%`
			`mutate(lem_u = str_c(lemma,'_',upos)) %>%`
			`left_join(sent_dict, by = 'lem_u')`
sentencizer: commented code 4 years ago			`## If the dictionary contains the column lemma, assume simple lemma format`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`} else if ("lemma" %in% colnames(sent_dict)) {`
			`ud_sent <- ud_sent %>%`
			`left_join(sent_dict, by = 'lemma') %>%`
			`mutate(lem_u = lemma)`
			`}`
sentencizer: commented code 4 years ago
			`## Group by sentences, and generate dictionary scores per sentence`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`ud_sent <- ud_sent %>%`
			group_by(`_id`,sentence_id) %>%
			`mutate(`
			`prox = case_when(`
			`is.na(prox) == T ~ 0,`
			`TRUE ~ prox`
			`)`
			`) %>%`
			`summarise(sent_sum = sum(prox),`
			`words = length(lemma),`
			`sent_words = sum(prox != 0),`
			`sent_lemmas = list(lem_u[prox != 0])) %>%`
			`mutate(`
			`sent = sent_sum/words,`
			`arousal = sent_words/words`
			`)`
sentencizer: updates to collect sentence word counts and number of sentences also when no sent_dict is provided 4 years ago			`## If there is no dictionary, create a ud_sent, with just sentence ids and word counts per sentence`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`} else {`
sentencizer: updates to collect sentence word counts and number of sentences also when no sent_dict is provided 4 years ago			`ud_sent <- ud_sent %>%`
			group_by(`_id`,sentence_id) %>%
			`summarise(words = length(lemma))`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`}`
sentencizer: commented code 4 years ago
			`## Remove ud ouptut from source before further processing`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			out <- select(out, -`_source.ud`)

sentencizer: commented code 4 years ago			`## If dictionary validation, return just the sentences that have been hand-coded`
sentencizer: minor updates 4 years ago			`if (validation == T) {`
			`codes_sent <- ud_sent %>%`
			`left_join(.,out, by='_id') %>%`
			`rowwise() %>%`
			filter(sentence_id == `_source.codes.sentence.id`)
			`return(codes_sent)`
			`}`

sentencizer: fixed actorsDetail coding error 4 years ago			`if("_source.computerCodes.actorsDetail" %in% colnames(out)) {`
sentencizer: commented code 4 years ago
			`## If actor details in source, create vector of actor ids for each sentence`
sentencizer: minor updates 4 years ago			`out <- out %>%`
			unnest(`_source.computerCodes.actorsDetail`) %>%
			`# mutate(ids_list = ids) %>%`
			`unnest(ids) %>%`
			`unnest(sentence_id) %>%`
			group_by(`_id`,sentence_id) %>%
			`summarise(`
			`ids = list(ids)`
			`)`
			`} else {`
sentencizer: commented code 4 years ago			`## If no actor details, keep one row per article and add a bogus sentence_id`
sentencizer: minor updates 4 years ago			`out <- out %>%`
			group_by(`_id`) %>%
			`summarise() %>%`
			`mutate(sentence_id = 1)`
			`}`

sentencizer: commented code 4 years ago			`## Combine ud_sent with the source dataset`
sentencizer: minor updates 4 years ago			`out <- out %>%`
			`left_join(ud_sent,.,by = c('_id','sentence_id')) %>%`
			group_by(`_id`)
sentencizer: commented code 4 years ago
			`## If there is a sent_dict, generate sentiment scores on article level`
sentencizer: minor updates 4 years ago			`if(!is.null(sent_dict)) {`
			`text_sent <- out %>%`
			`summarise(`
			`text.sent_sum = sum(sent_sum),`
			`text.words = sum(words),`
			`text.sent_words = sum(sent_words),`
			`text.sent_lemmas = I(list(unlist(sent_lemmas))),`
			`text.sentences = n()`
			`) %>%`
			`mutate(`
			`text.sent = text.sent_sum/text.words,`
			`text.arousal = text.sent_words/text.words`
			`)`
sentencizer: updates to collect sentence word counts and number of sentences also when no sent_dict is provided 4 years ago
sentencizer: minor updates 4 years ago			`} else {`
sentencizer: updates to collect sentence word counts and number of sentences also when no sent_dict is provided 4 years ago			`text_sent <- out %>%`
			`summarise(`
			`text.words = sum(words),`
			`text.sentences = n()`
			`)`
			`}`
sentencizer: minor updates 4 years ago			`out <- out %>%`
			`summarise_all(list) %>%`
sentencizer: updates to collect sentence word counts and number of sentences also when no sent_dict is provided 4 years ago			`left_join(.,text_sent,by='_id') %>%`
actor_merger: added ungroup() calls at the start and end of function, to speed up processing sentencizer: added ungroup() call at the end of the function to speed up processing 4 years ago			`left_join(.,metadata,by='_id') %>%`
			`ungroup()`
sentencizer: added new function for sentiment coding and actor collection 4 years ago			`return(out)`
			`}`
			`saveRDS(par_sent(1:nrow(out),out = out, sent_dict=sent_dict), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))`
			`return()`
			`### Keeping the option for parallel computation`
			`# microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out),out = out, sent_dict=sent_dict), times = 1)`
			`# plan(multiprocess, workers = cores)`
			`# chunks <- split(1:nrow(out), sort(1:nrow(out)%%cores))`
			`# microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks,par_sent, out=out, sent_dict=sent_dict)), times = 1)`
			`}`