actor_aggregation: small fixes to code

master
Erik de Vries 5 years ago
parent a29d04dacd
commit d9f28a46d8

@ -23,24 +23,6 @@
#################################### Aggregate actor results ################################ #################################### Aggregate actor results ################################
################################################################################################# #################################################################################################
actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') { actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR') {
actor <- actors[row,]
if (actor$`_source.function` == "Party"){
years = seq(2000,2019,1)
} else {
years = c(0)
}
if (actor$`_source.function` == 'Party' && actor$party_only == T) {
actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'))
} else if (actor$`_source.function` == 'Party') {
actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'), paste0(actor$`_source.partyId`,'_a'))
actor$party_only <- F
} else {
actorids <- actor$`_source.actorId`
actor$party_only <- NULL
}
actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {
### Functions ### Functions
aggregator <- function (id, duplicates) { aggregator <- function (id, duplicates) {
article <- filter(duplicates, `_id` == id) %>% article <- filter(duplicates, `_id` == id) %>%
@ -59,6 +41,52 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator =
) )
) )
} }
### Creating aggregate measuers at daily, weekly, monthly and yearly level
grouper <- function(level, actor_df, actorids) {
by_newspaper <- actor_df %>% group_by_at(vars(level, `_source.doctype`)) %>%
summarise(
occ = mean(unlist(occ)),
prom = mean(unlist(prom)),
rel_first = mean(unlist(rel_first)),
first = mean(unlist(first)),
articles = length(`_id`),
level = level
)
aggregate <- actor_df %>% group_by_at(vars(level)) %>%
summarise(
occ = mean(unlist(occ)),
prom = mean(unlist(prom)),
rel_first = mean(unlist(rel_first)),
first = mean(unlist(first)),
articles = length(`_id`),
`_source.doctype` = 'agg',
level = level
)
output <- bind_rows(by_newspaper, aggregate) %>%
bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),])
return(output)
}
###########################################################################################
actor <- actors[row,]
if (actor$`_source.function` == "Party"){
years = seq(2000,2019,1)
} else {
years = c(0)
}
if (actor$`_source.function` == 'Party' && actor$party_only == T) {
actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'))
} else if (actor$`_source.function` == 'Party') {
actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'), paste0(actor$`_source.partyId`,'_a'))
actor$party_only <- F
} else {
actorids <- actor$`_source.actorId`
actor$party_only <- NULL
}
actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {
if (year > 0) { if (year > 0) {
query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31] && computerCodes.junk:0') query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31] && computerCodes.junk:0')
} else { } else {
@ -69,8 +97,9 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator =
localhost = localhost, localhost = localhost,
es_pwd = es_pwd) es_pwd = es_pwd)
if (length(out$`_id`) > 0 ) { if (length(out$`_id`) > 0 ) {
actor_df <- out
### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids. ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
actor_df <- out %>% actor_df <- actor_df %>%
unnest() %>% unnest() %>%
unnest(ids, .preserve = colnames(.)) %>% unnest(ids, .preserve = colnames(.)) %>%
filter(ids1 %in% actorids) %>% filter(ids1 %in% actorids) %>%
@ -86,7 +115,6 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator =
dupe_merged <- bind_rows(lapply(art_id, aggregator, duplicates = duplicates)) dupe_merged <- bind_rows(lapply(art_id, aggregator, duplicates = duplicates))
actor_df <- bind_rows(dupe_merged, actor_single) actor_df <- bind_rows(dupe_merged, actor_single)
} }
### Creating date grouping variables ### Creating date grouping variables
actor_df <- actor_df %>% actor_df <- actor_df %>%
mutate( mutate(
@ -95,34 +123,8 @@ actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator =
yearmonthday = strftime(actor_df$`_source.publication_date`, format = '%Y%m%d'), yearmonthday = strftime(actor_df$`_source.publication_date`, format = '%Y%m%d'),
yearweek = strftime(actor_df$`_source.publication_date`, format = "%Y%V") yearweek = strftime(actor_df$`_source.publication_date`, format = "%Y%V")
) )
### Creating aggregate measuers at daily, weekly, monthly and yearly level
grouper <- function(level) {
by_newspaper <- actor_df %>% group_by_at(vars(level, `_source.doctype`)) %>%
summarise(
occ = mean(unlist(occ)),
prom = mean(unlist(prom)),
rel_first = mean(unlist(rel_first)),
first = mean(unlist(first)),
articles = length(`_id`),
level = level
)
aggregate <- actor_df %>% group_by_at(vars(level)) %>%
summarise(
occ = mean(unlist(occ)),
prom = mean(unlist(prom)),
rel_first = mean(unlist(rel_first)),
first = mean(unlist(first)),
articles = length(`_id`),
`_source.doctype` = 'agg',
level = level
)
output <- bind_rows(by_newspaper, aggregate) %>%
bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),])
return(output)
}
levels <- c('year','yearmonth','yearmonthday','yearweek') levels <- c('year','yearmonth','yearmonthday','yearweek')
aggregate_data <- bind_rows(lapply(levels, grouper)) aggregate_data <- bind_rows(lapply(levels, grouper, actor_df = actor_df, actorids = actorids))
return(aggregate_data) return(aggregate_data)
} else { } else {
return() return()

Loading…
Cancel
Save