actor_fetcher: removed from package; other: major update to documentation (branch: master)
parent 4e867214dd
commit f022312485
R/actor_merger.R
@@ -0,0 +1,178 @@
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#'
#' Aggregate sentence-level dataset containing actors (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param ids Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @return When ids is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids is supplied, returns aggregations for each vector in the list
#' @export
#' @examples
#' actor_merger(df, actors_meta, ids = NULL)
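#' # Illustrative only: the actor id values below are hypothetical; each vector
#' # in ids groups actor ids that should be merged into one aggregate
#' actor_merger(df, actors_meta, ids = list(c("A_123", "A_456"), c("A_789")))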
#################################################################################################
#################################### Generate actor-article dataset #############################
#################################################################################################

actor_merger <- function(df, actors_meta, ids = NULL) {
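  ## Helper: aggregate sentence-level rows for one vector of actor ids (`id`);
  ## inside the pipe, `ids` refers to the per-sentence list-column of actor ids,
  ## not to the `ids` argument of actor_merger()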
  grouper <- function(id, df) {
    return(df %>%
             rowwise() %>%
             filter(length(intersect(id,ids)) > 0) %>%
             group_by(`_id`) %>%
             summarise(actor.sent = sum(sent_sum)/sum(words),
                       actor.sent_sum = sum(sent_sum),
                       actor.sent_words = sum(sent_words),
                       actor.words = sum(words),
                       actor.arousal = sum(sent_words)/sum(words),
                       actor.first = first(sentence_id),
                       actor.occ = n(),
                       publication_date = as.Date(first(`_source.publication_date`)),
                       doctype = first(`_source.doctype`)) %>%
             mutate(
               ids = str_c(id, collapse = '-')
             )
    )
  }

  ## Remove some of the metadata from the source df
  text_sent <- df %>%
    select(`_id`,starts_with("text."),-ends_with("sent_lemmas"))
  df <- df %>%
    select(-ends_with("sent_lemmas"),-starts_with("text.")) %>%
    unnest(cols = colnames(.)) ## Unnest to sentence level

  ## Create bogus variables if sentiment is not scored
  if(!"sent_sum" %in% colnames(df)) {
    df <- df %>%
      mutate(
        sent_words = 0,
        sent_sum = 0
      )
  }
  ## Create aggregations according to list of actorId vectors in ids
  if(!is.null(ids)) {
    output <- lapply(ids,grouper, df = df) %>%
      bind_rows(.) %>%
      left_join(text_sent, by="_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(output)
  } else {
    all <- df %>%
      rowwise() %>%
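      ## `ids` below is the per-sentence list-column in df (not the NULL function
      ## argument): keep only sentences that mention at least one actor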
      filter(!is.null(unlist(ids))) %>%
      group_by(`_id`) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = as.Date(first(`_source.publication_date`)),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = "all"
      )

    df <- df %>%
      unnest(cols = ids) %>% ## Unnest to actor level
      mutate(
        `_source.publication_date` = as.Date(`_source.publication_date`)
      )
    ## Create aggregate measures for individual actors
    actors <- df %>%
      filter(str_starts(ids,"A_")) %>%
      group_by(`_id`,ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)
      )
    ## Create actor metadata dataframe per active date (one row per day per actor)
    colnames(actors_meta) <- str_replace(colnames(actors_meta),'_source.','')
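    ## Drop one specific metadata row by position (hard-coded index)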
    actors_meta <- actors_meta[-1128,]
    actors_meta_bydate <- actors_meta %>%
      mutate(
        startDate = as.Date(startDate),
        endDate = as.Date(endDate)
      ) %>%
      select(
        lastName,firstName,`function`,gender,yearOfBirth,parlPeriod,partyId,ministerName,ministryId,actorId,startDate,endDate
      ) %>%
      rowwise() %>%
      mutate(
        publication_date = list(seq(from=startDate, to=endDate,by="day")),
        ids = actorId
      ) %>%
      unnest(cols=publication_date)

    ## Join the actor metadata with the article data by actor id and date
    actors <- actors %>%
      left_join(.,actors_meta_bydate, by=c("ids","publication_date"))

    ## Generate party-actor aggregations (mfsa)
    parties_actors <- df %>%
      filter(str_starts(ids,"P_")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`,ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids,"_mfsa")
      )

    ## Generate party aggregations (mfs)
    parties <- df %>%
      filter(str_ends(ids,"_f") | str_ends(ids,"_s")) %>%
      mutate(
        ids = str_sub(ids, start = 1, end = -3)
      ) %>%
      group_by(`_id`,ids) %>%
      summarise(actor.sent = sum(sent_sum)/sum(words),
                actor.sent_sum = sum(sent_sum),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = first(sentence_id),
                actor.occ = n(),
                publication_date = first(`_source.publication_date`),
                doctype = first(`_source.doctype`)) %>%
      mutate(
        ids = str_c(ids,"_mfs")
      )

    ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
    df <- bind_rows(actors, parties, parties_actors, all) %>%
      left_join(text_sent, by="_id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      )
    return(df)
  }
}

man/actor_merger.Rd
@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actor_merger.R
\name{actor_merger}
\alias{actor_merger}
\title{Aggregate sentence-level dataset containing actors (from sentencizer())}
\usage{
actor_merger(df, actors_meta, ids = NULL)
}
\arguments{
\item{df}{Data frame with actor ids, produced by sentencizer}

\item{actors_meta}{Data frame containing actor metadata obtained using elasticizer(index="actors")}

\item{ids}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}
}
\value{
When ids is NULL, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When ids is supplied, returns aggregations for each vector in the list
}
\description{
Aggregate sentence-level dataset containing actors (from sentencizer())
}
\examples{
actor_merger(df, actors_meta, ids = NULL)
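# Illustrative only: the actor id values below are hypothetical; each vector
# in ids groups actor ids that should be merged into one aggregate
actor_merger(df, actors_meta, ids = list(c("A_123", "A_456"), c("A_789")))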
}