parent 7f958bbc11
commit 98325bde8f

R/sentencizer.R
@@ -0,0 +1,94 @@
#' Generate actor data frames (with sentiment) from database
#'
#' Generate actor data frames (with sentiment) from database
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional data frame containing the sentiment dictionary and values. Words should be in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are plain lemmas. The "prox" column should contain the word values, or 0 where not applicable.
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' sentencizer(out, sent_dict = NULL)
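### Illustrative only (not part of the package logic): a minimal sketch of a
### sentiment dictionary in the shape described above, using made-up lemma_upos
### pairs and prox values; 'out' stands in for an elasticizer result.
# sent_dict_example <- data.frame(
#   lem_u = c("good_ADJ", "bad_ADJ"),  # lemma_upos pairs matched against the UD tokens
#   prox  = c(1, -1),                  # sentiment value per word (0 = no value)
#   stringsAsFactors = FALSE
# )
# sentencizer(out, sent_dict = sent_dict_example)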
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = FALSE) {
  par_sent <- function(row, out, sent_dict = NULL) {
    out <- out[row,]
    metadata <- out %>%
      select(`_id`, `_source.publication_date`, `_source.doctype`)
    # Unnest the UD parser output to one row per token, dropping punctuation
    ud_sent <- out %>% select(`_id`, `_source.ud`) %>%
      unnest(cols = colnames(.)) %>%
      select(-one_of('exists')) %>%
      unnest(cols = colnames(.)) %>%
      filter(upos != 'PUNCT')

    if (!is.null(sent_dict)) {
      # Match tokens against the sentiment dictionary, either on lemma_upos
      # pairs ("lem_u") or on plain lemmas ("lemma")
      if ("lem_u" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          mutate(lem_u = str_c(lemma, '_', upos)) %>%
          left_join(sent_dict, by = 'lem_u')
      } else if ("lemma" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          left_join(sent_dict, by = 'lemma') %>%
          mutate(lem_u = lemma)
      }
      # Aggregate to sentence level: summed sentiment, word counts and the
      # lemmas that carried a sentiment value
      ud_sent <- ud_sent %>%
        group_by(`_id`, sentence_id) %>%
        mutate(
          prox = case_when(
            is.na(prox) ~ 0,
            TRUE ~ prox
          )
        ) %>%
        summarise(sent_sum = sum(prox),
                  words = length(lemma),
                  sent_words = sum(prox != 0),
                  sent_lemmas = list(lem_u[prox != 0])) %>%
        mutate(
          sent = sent_sum / words,
          arousal = sent_words / words
        )
    } else {
      # Without a sentiment dictionary, keep one row per document/sentence so
      # that the actor join below still has both keys available
      ud_sent <- ud_sent %>% group_by(`_id`, sentence_id) %>% summarise()
    }
    out <- select(out, -`_source.ud`)

    ### Unnest out_row to individual actor ids
    out <- out %>%
      unnest(`_source.computerCodes.actorsDetail`) %>%
      mutate(ids_list = ids) %>%
      unnest(ids) %>%
      unnest(sentence_id) %>%
      group_by(`_id`, sentence_id) %>%
      summarise(
        ids = list(ids)
      ) %>%
      left_join(ud_sent, ., by = c('_id', 'sentence_id')) %>%
      group_by(`_id`)

    # Aggregate sentence-level results to text (document) level
    text_sent <- out %>%
      summarise(
        text.sent_sum = sum(sent_sum),
        text.words = sum(words),
        text.sent_words = sum(sent_words),
        text.sent_lemmas = I(list(unlist(sent_lemmas))),
        text.sentences = n()
      ) %>%
      mutate(
        text.sent = text.sent_sum / text.words,
        text.arousal = text.sent_words / text.words
      )

    # Collapse to one row per document and re-attach text-level sentiment and metadata
    out <- out %>%
      summarise_all(list) %>%
      left_join(., text_sent, by = '_id') %>%
      left_join(., metadata, by = '_id')
    return(out)
  }
  saveRDS(par_sent(1:nrow(out), out = out, sent_dict = sent_dict),
          file = paste0('df_out', as.numeric(as.POSIXct(Sys.time())), '.Rds'))
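  # Illustrative only: each batch written above can be read back with readRDS(),
  # using whichever timestamped filename appeared in the working directory,
  # e.g. readRDS('df_out1571230000.Rds') (the filename here is a made-up placeholder).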
  return()
  ### Keeping the option for parallel computation
  # microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out), out = out, sent_dict = sent_dict), times = 1)
  # plan(multiprocess, workers = cores)
  # chunks <- split(1:nrow(out), sort(1:nrow(out) %% cores))
  # microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks, par_sent, out = out, sent_dict = sent_dict)), times = 1)
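  ### Sketch only, not used by the package: if the parallel path above were
  ### re-enabled, a future/future.apply based setup might look like the lines
  ### below. 'cores' is assumed to be supplied by the caller, and 'multisession'
  ### replaces the deprecated 'multiprocess' backend.
  # future::plan(future::multisession, workers = cores)
  # chunks <- split(1:nrow(out), sort(1:nrow(out) %% cores))
  # out_par <- dplyr::bind_rows(
  #   future.apply::future_lapply(chunks, par_sent, out = out, sent_dict = sent_dict)
  # )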
}

man/sentencizer.Rd
@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sentencizer.R
\name{sentencizer}
\alias{sentencizer}
\title{Generate actor data frames (with sentiment) from database}
\usage{
sentencizer(out, sent_dict = NULL, localhost = NULL, validation = FALSE)
}
\arguments{
\item{out}{Data frame produced by elasticizer}

\item{sent_dict}{Optional data frame containing the sentiment dictionary and values. Words should be in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are plain lemmas. The "prox" column should contain the word values, or 0 where not applicable.}
}
\value{
No return value, data per batch is saved in an RDS file
}
\description{
Generate actor data frames (with sentiment) from database
}
\examples{
sentencizer(out, sent_dict = NULL)
}