|
|
@ -22,29 +22,82 @@
|
|
|
|
#################################################################################################
|
|
|
|
#################################################################################################
|
|
|
|
#################################### Aggregate actor results ################################
|
|
|
|
#################################### Aggregate actor results ################################
|
|
|
|
#################################################################################################
|
|
|
|
#################################################################################################
|
|
|
|
actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') {
|
|
|
|
actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR', sent_dict = NULL, cores = detectCores()) {
|
|
|
|
### Functions
|
|
|
|
### Functions
|
|
|
|
aggregator <- function (id, duplicates) {
|
|
|
|
aggregator <- function (id, duplicates) {
|
|
|
|
article <- filter(duplicates, `_id` == id) %>%
|
|
|
|
df <- duplicates %>%
|
|
|
|
unnest(sentence_id, .preserve = colnames(.))
|
|
|
|
filter(`_id` == id) %>%
|
|
|
|
|
|
|
|
group_by(`_id`) %>%
|
|
|
|
|
|
|
|
summarise(
|
|
|
|
|
|
|
|
`_source.doctype` = first(`_source.doctype`),
|
|
|
|
|
|
|
|
`_source.publication_date` = first(`_source.publication_date`),
|
|
|
|
|
|
|
|
actor_end = list(sort(unique(unlist(actor_end)))),
|
|
|
|
|
|
|
|
prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
|
|
|
|
|
|
|
|
sentence_id = list(sort(unique(unlist(sentence_id)))),
|
|
|
|
|
|
|
|
rel_first = list(max(unlist(rel_first))),
|
|
|
|
|
|
|
|
sentence_end = list(sort(unique(unlist(sentence_end)))),
|
|
|
|
|
|
|
|
actor_start = list(sort(unique(unlist(actor_start)))),
|
|
|
|
|
|
|
|
ids = list(sort(unique(unlist(ids)))),
|
|
|
|
|
|
|
|
sentence_start = list(sort(unique(unlist(sentence_start)))),
|
|
|
|
|
|
|
|
occ = list(length(unique(unlist(sentence_id)))),
|
|
|
|
|
|
|
|
first = list(min(unlist(sentence_id)))
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
return(df)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
### Calculate sentiment scores for each actor-document
|
|
|
|
|
|
|
|
par_sent <- function(row, out, sent_dict) {
|
|
|
|
|
|
|
|
out_row <- out[row,]
|
|
|
|
|
|
|
|
### Contains sentiment per sentence for whole article
|
|
|
|
|
|
|
|
sentiment_ud <- out_row$`_source.ud`[[1]] %>%
|
|
|
|
|
|
|
|
select(-one_of('exists')) %>%
|
|
|
|
|
|
|
|
unnest() %>%
|
|
|
|
|
|
|
|
filter(upos != 'PUNCT') %>% # For getting proper word counts
|
|
|
|
|
|
|
|
mutate(V1 = str_c(lemma,'_',upos)) %>%
|
|
|
|
|
|
|
|
left_join(sent_dict, by = 'V1') %>%
|
|
|
|
|
|
|
|
### Setting binary sentiment as unit of analysis
|
|
|
|
|
|
|
|
mutate(V2 = V3) %>%
|
|
|
|
|
|
|
|
group_by(sentence_id) %>%
|
|
|
|
|
|
|
|
mutate(
|
|
|
|
|
|
|
|
V2 = case_when(
|
|
|
|
|
|
|
|
is.na(V2) == T ~ 0,
|
|
|
|
|
|
|
|
TRUE ~ V2
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
) %>%
|
|
|
|
|
|
|
|
summarise(sent_sum = sum(V2),
|
|
|
|
|
|
|
|
words = length(lemma),
|
|
|
|
|
|
|
|
sent_words = length(na.omit(V3))) %>%
|
|
|
|
|
|
|
|
mutate(
|
|
|
|
|
|
|
|
sent = sent_sum/words,
|
|
|
|
|
|
|
|
arousal = sent_words/words
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
out_row <- select(out_row, -`_source.ud`)
|
|
|
|
|
|
|
|
### Contains sentiment per sentence for actor
|
|
|
|
|
|
|
|
actor_tone <- filter(sentiment_ud, sentence_id %in% unlist(out_row$sentence_id))
|
|
|
|
|
|
|
|
|
|
|
|
occ <- length(unlist(unique(article$sentence_id1)))
|
|
|
|
### Aggregated sentiment per actor (over all sentences containing actor)
|
|
|
|
sentence_count <- round(article$occ[[1]]/article$prom[[1]])
|
|
|
|
actor <- summarise(actor_tone,
|
|
|
|
prom <- occ/sentence_count
|
|
|
|
sent = sum(sent_sum)/sum(words),
|
|
|
|
rel_first <- 1-(min(article$sentence_id1)/sentence_count)
|
|
|
|
sent_sum = sum(sent_sum),
|
|
|
|
return(bind_cols(as.list(article[1,1:6]), # Sentence id, start and end position for actor sentences
|
|
|
|
sent_words = sum(sent_words),
|
|
|
|
data.frame(occ = I(list(occ)), # Number of sentences in which actor occurs
|
|
|
|
words = sum(words),
|
|
|
|
prom = I(list(prom)), # Relative prominence of actor in article (number of occurences/total # sentences)
|
|
|
|
arousal = sum(sent_words)/sum(words)
|
|
|
|
rel_first = I(list(rel_first)), # Relative position of first occurence at sentence level
|
|
|
|
|
|
|
|
first = I(list(min(article$sentence_id1))) # First sentence in which actor is mentioned
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Aggregated sentiment per article (over all sentences in article)
|
|
|
|
|
|
|
|
text <- summarise(sentiment_ud,
|
|
|
|
|
|
|
|
sent = sum(sent_sum)/sum(words),
|
|
|
|
|
|
|
|
sent_sum = sum(sent_sum),
|
|
|
|
|
|
|
|
sent_words = sum(sent_words),
|
|
|
|
|
|
|
|
words = sum(words),
|
|
|
|
|
|
|
|
arousal = sum(sent_words)/sum(words)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
return(cbind(out_row,data.frame(actor = actor, text = text)))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Creating aggregate measuers at daily, weekly, monthly and yearly level
|
|
|
|
### Creating aggregate measuers at daily, weekly, monthly and yearly level
|
|
|
|
grouper <- function(level, actor_df, actorids) {
|
|
|
|
grouper <- function(level, out, actorids, sent = F) {
|
|
|
|
by_newspaper <- actor_df %>%
|
|
|
|
by_newspaper <- out %>%
|
|
|
|
mutate(
|
|
|
|
mutate(
|
|
|
|
sentence_count = round(unlist(occ)/unlist(prom))
|
|
|
|
sentence_count = round(unlist(occ)/unlist(prom))
|
|
|
|
) %>%
|
|
|
|
) %>%
|
|
|
@ -57,9 +110,9 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator =
|
|
|
|
sentence_count = sum(sentence_count),
|
|
|
|
sentence_count = sum(sentence_count),
|
|
|
|
articles = length(`_id`),
|
|
|
|
articles = length(`_id`),
|
|
|
|
level = level
|
|
|
|
level = level
|
|
|
|
)
|
|
|
|
) %>%
|
|
|
|
|
|
|
|
ungroup()
|
|
|
|
aggregate <- actor_df %>%
|
|
|
|
aggregate <- out %>%
|
|
|
|
mutate(
|
|
|
|
mutate(
|
|
|
|
sentence_count = round(unlist(occ)/unlist(prom))
|
|
|
|
sentence_count = round(unlist(occ)/unlist(prom))
|
|
|
|
) %>%
|
|
|
|
) %>%
|
|
|
@ -73,12 +126,64 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator =
|
|
|
|
articles = length(`_id`),
|
|
|
|
articles = length(`_id`),
|
|
|
|
`_source.doctype` = 'agg',
|
|
|
|
`_source.doctype` = 'agg',
|
|
|
|
level = level
|
|
|
|
level = level
|
|
|
|
)
|
|
|
|
) %>%
|
|
|
|
|
|
|
|
ungroup()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if(sent == T) {
|
|
|
|
|
|
|
|
by_newspaper_sent <- out %>%
|
|
|
|
|
|
|
|
group_by_at(vars(level, `_source.doctype`)) %>%
|
|
|
|
|
|
|
|
summarise(
|
|
|
|
|
|
|
|
actor.sent = mean(actor.sent),
|
|
|
|
|
|
|
|
actor.sent_sum = sum(actor.sent_sum),
|
|
|
|
|
|
|
|
actor.sent_words = sum(actor.sent_words),
|
|
|
|
|
|
|
|
actor.words = sum(actor.words),
|
|
|
|
|
|
|
|
actor.arousal = mean(actor.arousal),
|
|
|
|
|
|
|
|
text.sent = mean(text.sent),
|
|
|
|
|
|
|
|
text.sent_sum = sum(text.sent_sum),
|
|
|
|
|
|
|
|
text.sent_words = sum(text.sent_words),
|
|
|
|
|
|
|
|
text.words = sum(text.words),
|
|
|
|
|
|
|
|
text.arousal = mean(text.arousal)
|
|
|
|
|
|
|
|
) %>%
|
|
|
|
|
|
|
|
ungroup() %>%
|
|
|
|
|
|
|
|
select(-level,-`_source.doctype`)
|
|
|
|
|
|
|
|
aggregate_sent <- out %>%
|
|
|
|
|
|
|
|
group_by_at(vars(level)) %>%
|
|
|
|
|
|
|
|
summarise(
|
|
|
|
|
|
|
|
actor.sent = mean(actor.sent),
|
|
|
|
|
|
|
|
actor.sent_sum = sum(actor.sent_sum),
|
|
|
|
|
|
|
|
actor.sent_words = sum(actor.sent_words),
|
|
|
|
|
|
|
|
actor.words = sum(actor.words),
|
|
|
|
|
|
|
|
actor.arousal = mean(actor.arousal),
|
|
|
|
|
|
|
|
text.sent = mean(text.sent),
|
|
|
|
|
|
|
|
text.sent_sum = sum(text.sent_sum),
|
|
|
|
|
|
|
|
text.sent_words = sum(text.sent_words),
|
|
|
|
|
|
|
|
text.words = sum(text.words),
|
|
|
|
|
|
|
|
text.arousal = mean(text.arousal)
|
|
|
|
|
|
|
|
) %>%
|
|
|
|
|
|
|
|
ungroup() %>%
|
|
|
|
|
|
|
|
select(-level)
|
|
|
|
|
|
|
|
aggregate <- bind_cols(aggregate,aggregate_sent)
|
|
|
|
|
|
|
|
by_newspaper <- bind_cols(by_newspaper, by_newspaper_sent)
|
|
|
|
|
|
|
|
}
|
|
|
|
output <- bind_rows(by_newspaper, aggregate) %>%
|
|
|
|
output <- bind_rows(by_newspaper, aggregate) %>%
|
|
|
|
bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),])
|
|
|
|
bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),]) %>%
|
|
|
|
|
|
|
|
select(
|
|
|
|
|
|
|
|
-`_index`,
|
|
|
|
|
|
|
|
-`_type`,
|
|
|
|
|
|
|
|
-`_score`,
|
|
|
|
|
|
|
|
-`_id`,
|
|
|
|
|
|
|
|
-contains('Search'),
|
|
|
|
|
|
|
|
-contains('not')
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
colnames(output) <- gsub("_source.",'', colnames(output))
|
|
|
|
return(output)
|
|
|
|
return(output)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
###########################################################################################
|
|
|
|
###########################################################################################
|
|
|
|
|
|
|
|
if (is.null(sent_dict) == F) {
|
|
|
|
|
|
|
|
fields <- c('ud','computerCodes.actorsDetail', 'doctype', 'publication_date')
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
fields <- c('computerCodes.actorsDetail', 'doctype', 'publication_date')
|
|
|
|
|
|
|
|
}
|
|
|
|
actor <- actors[row,]
|
|
|
|
actor <- actors[row,]
|
|
|
|
if (actor$`_source.function` == "Party"){
|
|
|
|
if (actor$`_source.function` == "Party"){
|
|
|
|
years = seq(2000,2019,1)
|
|
|
|
years = seq(2000,2019,1)
|
|
|
@ -101,43 +206,66 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator =
|
|
|
|
|
|
|
|
|
|
|
|
actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {
|
|
|
|
actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {
|
|
|
|
if (year > 0) {
|
|
|
|
if (year > 0) {
|
|
|
|
query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31] && computerCodes.junk:0')
|
|
|
|
query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31]')
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,'] && computerCodes.junk:0')
|
|
|
|
query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,']')
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Temporary exception for UK missing junk coding
|
|
|
|
|
|
|
|
if (actor$`_source.country` != 'uk') {
|
|
|
|
|
|
|
|
query <- paste0(query,' && computerCodes.junk:0')
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query),
|
|
|
|
out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query),
|
|
|
|
fields = c('computerCodes.actorsDetail', 'doctype', 'publication_date'), default_operator = default_operator),
|
|
|
|
fields = fields, default_operator = default_operator),
|
|
|
|
localhost = localhost,
|
|
|
|
localhost = localhost,
|
|
|
|
es_pwd = es_pwd)
|
|
|
|
es_pwd = es_pwd)
|
|
|
|
if (length(out$`_id`) > 0 ) {
|
|
|
|
if (length(out$`_id`) > 0 ) {
|
|
|
|
actor_df <- out
|
|
|
|
if (is.null(sent_dict) == F) {
|
|
|
|
|
|
|
|
out_ud <- out %>% select(`_id`,`_source.ud`)
|
|
|
|
|
|
|
|
out <- out %>% select(-`_source.ud`)
|
|
|
|
|
|
|
|
}
|
|
|
|
### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
|
|
|
|
### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
|
|
|
|
actor_df <- actor_df %>%
|
|
|
|
out <- out %>%
|
|
|
|
unnest() %>%
|
|
|
|
unnest(`_source.computerCodes.actorsDetail`, .preserve = colnames(.)) %>%
|
|
|
|
unnest(ids, .preserve = colnames(.)) %>%
|
|
|
|
unnest(ids, .preserve = colnames(.)) %>%
|
|
|
|
filter(ids1 %in% actorids) %>%
|
|
|
|
filter(ids1 %in% actorids) # %>%
|
|
|
|
select(-ends_with('start')) %>%
|
|
|
|
# select(-ends_with('start')) %>%
|
|
|
|
select(-ends_with('end')) %>%
|
|
|
|
# select(-ends_with('end')) %>%
|
|
|
|
select(-starts_with('ids'))
|
|
|
|
# select(-starts_with('ids'))
|
|
|
|
|
|
|
|
|
|
|
|
### Only if there are more rows than articles, recalculate
|
|
|
|
### Only if there are more rows than articles, recalculate
|
|
|
|
if (length(unique(actor_df$`_id`)) != length(actor_df$`_id`)) {
|
|
|
|
if (length(unique(out$`_id`)) != length(out$`_id`)) {
|
|
|
|
duplicates <- actor_df[(duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = T)),]
|
|
|
|
duplicates <- out[(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),]
|
|
|
|
actor_single <- actor_df[!(duplicated(actor_df$`_id`) | duplicated(actor_df$`_id`, fromLast = T)),]
|
|
|
|
actor_single <- out[!(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),]
|
|
|
|
art_id <- unique(duplicates$`_id`)
|
|
|
|
art_id <- unique(duplicates$`_id`)
|
|
|
|
dupe_merged <- bind_rows(lapply(art_id, aggregator, duplicates = duplicates))
|
|
|
|
dupe_merged <- bind_rows(mclapply(art_id, aggregator, duplicates = duplicates, mc.cores = cores))
|
|
|
|
actor_df <- bind_rows(dupe_merged, actor_single)
|
|
|
|
out <- bind_rows(dupe_merged, actor_single)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (is.null(sent_dict) == F) {
|
|
|
|
|
|
|
|
out <- left_join(out, out_ud, by = '_id')
|
|
|
|
|
|
|
|
out <- bind_rows(mclapply(seq(1,nrow(out),1),par_sent, out = out, sent_dict = sent_dict, mc.cores = cores))
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
### Creating date grouping variables
|
|
|
|
### Creating date grouping variables
|
|
|
|
actor_df <- actor_df %>%
|
|
|
|
out <- out %>%
|
|
|
|
mutate(
|
|
|
|
mutate(
|
|
|
|
year = strftime(`_source.publication_date`, format = '%Y'),
|
|
|
|
year = strftime(`_source.publication_date`, format = '%Y'),
|
|
|
|
yearmonth = strftime(actor_df$`_source.publication_date`, format = '%Y%m'),
|
|
|
|
yearmonth = strftime(out$`_source.publication_date`, format = '%Y%m'),
|
|
|
|
yearmonthday = strftime(actor_df$`_source.publication_date`, format = '%Y%m%d'),
|
|
|
|
yearmonthday = strftime(out$`_source.publication_date`, format = '%Y%m%d'),
|
|
|
|
yearweek = strftime(actor_df$`_source.publication_date`, format = "%Y%V")
|
|
|
|
yearweek = strftime(out$`_source.publication_date`, format = "%Y%V")
|
|
|
|
|
|
|
|
) %>%
|
|
|
|
|
|
|
|
select(
|
|
|
|
|
|
|
|
-`_score`,
|
|
|
|
|
|
|
|
-`_index`,
|
|
|
|
|
|
|
|
-`_type`,
|
|
|
|
|
|
|
|
-`_score`,
|
|
|
|
|
|
|
|
-`_source.computerCodes.actorsDetail`,
|
|
|
|
|
|
|
|
-ids1
|
|
|
|
)
|
|
|
|
)
|
|
|
|
levels <- c('year','yearmonth','yearmonthday','yearweek')
|
|
|
|
levels <- c('year','yearmonth','yearmonthday','yearweek')
|
|
|
|
aggregate_data <- bind_rows(lapply(levels, grouper, actor_df = actor_df, actorids = actorids))
|
|
|
|
aggregate_data <- bind_rows(lapply(levels, grouper, out = out, actorids = actorids, sent = !is.null(sent_dict)))
|
|
|
|
return(aggregate_data)
|
|
|
|
return(aggregate_data)
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
return()
|
|
|
|
return()
|
|
|
@ -147,4 +275,3 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator =
|
|
|
|
print(paste0('Done with ',row,'/',nrow(actors),' actors'))
|
|
|
|
print(paste0('Done with ',row,'/',nrow(actors),' actors'))
|
|
|
|
return()
|
|
|
|
return()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|