Compare commits
218 Commits
@@ -1,3 +1,5 @@
 .Rproj.user
 .Rhistory
 .RData
+*.RData
+*.Rds
@@ -1,10 +1,24 @@
 # Generated by roxygen2: do not edit by hand
 
+export(actorizer)
 export(bulk_writer)
 export(class_update)
+export(cv_generator)
 export(dfm_gen)
 export(dupe_detect)
 export(elastic_update)
 export(elasticizer)
+export(estimator)
+export(feat_select)
+export(lemma_writer)
 export(merger)
+export(metric_gen)
 export(modelizer)
+export(modelizer_old)
+export(out_parser)
+export(preproc)
+export(query_gen_actors)
+export(query_string)
+export(sent_merger)
+export(sentencizer)
+export(ud_update)
@@ -0,0 +1,278 @@
### Notes:
# Do you want to search for either one OR other actorid, or both occurring in the same document?
# Do you want to keep only the occurrences of the actorids you are searching for, or all actor occurrences in the hits?
# Search by actorId, then aggregate by month
# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly
# Develop query generator for specific actors (i.e. combine actorId with start and end dates)


#' Generate aggregated actor measures from raw data
#'
#' Generate aggregated actor measures from raw data
#' @param row The row of the actors data frame used for aggregation
#' @param actors The data frame containing actor data
#' @param es_pwd The password for read access to ES
#' @param localhost Boolean indicating if the script is running locally or not
#' @param default_operator String indicating whether actor aggregations should be made by searching for the presence of any of the actor ids (OR), or all of them (AND). Defaults to OR
#' @return No return value, data per actor is saved in an RDS file
#' @export
#' @examples
#' actor_aggregation(row, actors, es_pwd, localhost, default_operator = 'OR')
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
actor_aggregation <- function(row, actors, es_pwd, localhost, default_operator = 'OR', sent_dict = NULL, cores = detectCores()) {
  ### Functions
  aggregator <- function(id, duplicates) {
    df <- duplicates %>%
      filter(`_id` == id) %>%
      group_by(`_id`) %>%
      summarise(
        `_source.doctype` = first(`_source.doctype`),
        `_source.publication_date` = first(`_source.publication_date`),
        # actor_end = list(sort(unique(unlist(actor_end)))),
        prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
        sentence_id = list(sort(unique(unlist(sentence_id)))),
        rel_first = list(max(unlist(rel_first))),
        # sentence_end = list(sort(unique(unlist(sentence_end)))),
        # actor_start = list(sort(unique(unlist(actor_start)))),
        ids = list(unique(unlist(ids))),
        # sentence_start = list(sort(unique(unlist(sentence_start)))),
        occ = list(length(unique(unlist(sentence_id)))),
        first = list(min(unlist(sentence_id)))
      )
    return(df)
  }
  ### Calculate sentiment scores for each actor-document
  par_sent <- function(row, out, sent_dict) {
    out_row <- out[row,]
    ### Contains sentiment per sentence for whole article
    sentiment_ud <- out_row$`_source.ud`[[1]] %>%
      select(-one_of('exists')) %>%
      unnest() %>%
      filter(upos != 'PUNCT') %>% # For getting proper word counts
      mutate(V1 = str_c(lemma,'_',upos)) %>%
      left_join(sent_dict, by = 'V1') %>%
      ### Setting binary sentiment as unit of analysis
      mutate(V2 = V3) %>%
      group_by(sentence_id) %>%
      mutate(
        V2 = case_when(
          is.na(V2) == T ~ 0,
          TRUE ~ V2
        )
      ) %>%
      summarise(sent_sum = sum(V2),
                words = length(lemma),
                sent_words = length(na.omit(V3))) %>%
      mutate(
        sent = sent_sum/words,
        arousal = sent_words/words
      )
    out_row <- select(out_row, -`_source.ud`)
    ### Contains sentiment per sentence for actor
    actor_tone <- filter(sentiment_ud, sentence_id %in% unlist(out_row$sentence_id))

    ### Aggregated sentiment per actor (over all sentences containing actor)
    actor <- summarise(actor_tone,
                       sent = sum(sent_sum)/sum(words),
                       sent_sum = sum(sent_sum),
                       sent_words = sum(sent_words),
                       words = sum(words),
                       arousal = sum(sent_words)/sum(words)
    )

    ### Aggregated sentiment per article (over all sentences in article)
    text <- summarise(sentiment_ud,
                      sent = sum(sent_sum)/sum(words),
                      sent_sum = sum(sent_sum),
                      sent_words = sum(sent_words),
                      words = sum(words),
                      arousal = sum(sent_words)/sum(words)
    )
    return(cbind(out_row,data.frame(actor = actor, text = text)))
  }

  ### Creating aggregate measures at daily, weekly, monthly and yearly level
  grouper <- function(level, out, actorids, sent = F) {
    by_newspaper <- out %>%
      mutate(
        sentence_count = round(unlist(occ)/unlist(prom))
      ) %>%
      group_by_at(vars(level, `_source.doctype`)) %>%
      summarise(
        occ = sum(unlist(occ)),
        prom_art = mean(unlist(prom)),
        rel_first_art = mean(unlist(rel_first)),
        first = mean(unlist(first)),
        sentence_count = sum(sentence_count),
        articles = length(`_id`),
        level = level
      ) %>%
      ungroup()
    aggregate <- out %>%
      mutate(
        sentence_count = round(unlist(occ)/unlist(prom))
      ) %>%
      group_by_at(vars(level)) %>%
      summarise(
        occ = sum(unlist(occ)),
        prom_art = mean(unlist(prom)),
        rel_first_art = mean(unlist(rel_first)),
        first = mean(unlist(first)),
        sentence_count = sum(sentence_count),
        articles = length(`_id`),
        `_source.doctype` = 'agg',
        level = level
      ) %>%
      ungroup()

    if(sent == T) {
      by_newspaper_sent <- out %>%
        group_by_at(vars(level, `_source.doctype`)) %>%
        summarise(
          actor.sent = mean(actor.sent),
          actor.sent_sum = sum(actor.sent_sum),
          actor.sent_words = sum(actor.sent_words),
          actor.words = sum(actor.words),
          actor.arousal = mean(actor.arousal),
          text.sent = mean(text.sent),
          text.sent_sum = sum(text.sent_sum),
          text.sent_words = sum(text.sent_words),
          text.words = sum(text.words),
          text.arousal = mean(text.arousal)
        ) %>%
        ungroup() %>%
        select(-level,-`_source.doctype`)
      aggregate_sent <- out %>%
        group_by_at(vars(level)) %>%
        summarise(
          actor.sent = mean(actor.sent),
          actor.sent_sum = sum(actor.sent_sum),
          actor.sent_words = sum(actor.sent_words),
          actor.words = sum(actor.words),
          actor.arousal = mean(actor.arousal),
          text.sent = mean(text.sent),
          text.sent_sum = sum(text.sent_sum),
          text.sent_words = sum(text.sent_words),
          text.words = sum(text.words),
          text.arousal = mean(text.arousal)
        ) %>%
        ungroup() %>%
        select(-level)
      aggregate <- bind_cols(aggregate,aggregate_sent)
      by_newspaper <- bind_cols(by_newspaper, by_newspaper_sent)
    }
    output <- bind_rows(by_newspaper, aggregate) %>%
      bind_cols(.,bind_rows(actor)[rep(seq_len(nrow(bind_rows(actor))), each=nrow(.)),]) %>%
      select(
        -`_index`,
        -`_type`,
        -`_score`,
        -`_id`,
        -contains('Search'),
        -contains('not')
      )
    colnames(output) <- gsub("_source.",'', colnames(output))
    return(output)
  }
  ###########################################################################################
  plan(multiprocess, workers = cores)
  if (is.null(sent_dict) == F) {
    fields <- c('ud','computerCodes.actorsDetail', 'doctype', 'publication_date')
  } else {
    fields <- c('computerCodes.actorsDetail', 'doctype', 'publication_date')
  }
  actor <- actors[row,]
  if (actor$`_source.function` == "Party"){
    years = seq(2000,2019,1)
    startDate <- '2000'
    endDate <- '2019'
  } else {
    years = c(0)
    startDate <- actor$`_source.startDate`
    endDate <- actor$`_source.endDate`
  }
  if (actor$`_source.function` == 'Party' && actor$party_only == T) {
    actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'))
  } else if (actor$`_source.function` == 'Party') {
    actorids <- c(paste0(actor$`_source.partyId`,'_s'), paste0(actor$`_source.partyId`,'_f'), paste0(actor$`_source.partyId`,'_a'))
    actor$party_only <- F
  } else {
    actorids <- actor$`_source.actorId`
    actor$party_only <- NULL
  }

  actor_aggregator <- function(year, query, actor, actorids, default_operator, localhost = F, es_pwd) {
    if (year > 0) {
      query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',year,'-01-01 TO ',year,'-12-31]')
    } else {
      query <- paste0('computerCodes.actors:(',paste(actorids, collapse = ' '),') && publication_date:[',actor$`_source.startDate`,' TO ',actor$`_source.endDate`,']')
    }

    ### Temporary exception for UK missing junk coding
    if (actor$`_source.country` != 'uk') {
      query <- paste0(query,' && computerCodes.junk:0')
    }

    out <- elasticizer(query_string(paste0('country:',actor$`_source.country`,' && ',query),
                                    fields = fields, default_operator = default_operator),
                       localhost = localhost,
                       es_pwd = es_pwd)
    if (length(out$`_id`) > 0 ) {
      if (is.null(sent_dict) == F) {
        out_ud <- out %>% select(`_id`,`_source.ud`)
        out <- out %>% select(-`_source.ud`)
      }
      ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
      out <- out %>%
        unnest(`_source.computerCodes.actorsDetail`, .preserve = colnames(.)) %>%
        unnest(ids, .preserve = colnames(.)) %>%
        filter(ids1 %in% actorids) # %>%
        # select(-ends_with('start')) %>%
        # select(-ends_with('end')) %>%
        # select(-starts_with('ids'))

      ### Only if there are more rows than articles, recalculate
      if (length(unique(out$`_id`)) != length(out$`_id`)) {
        duplicates <- out[(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),]
        actor_single <- out[!(duplicated(out$`_id`) | duplicated(out$`_id`, fromLast = T)),]
        art_id <- unique(duplicates$`_id`)
        dupe_merged <- bind_rows(future_lapply(art_id, aggregator, duplicates = duplicates))
        out <- bind_rows(dupe_merged, actor_single)
      }

      if (is.null(sent_dict) == F) {
        out <- left_join(out, out_ud, by = '_id')
        out <- bind_rows(future_lapply(seq(1,nrow(out),1),par_sent, out = out, sent_dict = sent_dict))
      }

      ### Creating date grouping variables
      out <- out %>%
        mutate(
          year = strftime(`_source.publication_date`, format = '%Y'),
          yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
          yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
          yearweek = strftime(`_source.publication_date`, format = "%Y%V")
        ) %>%
        select(
          -`_score`,
          -`_index`,
          -`_type`,
          -`_source.computerCodes.actorsDetail`,
          -ids1
        )
      levels <- c('year','yearmonth','yearmonthday','yearweek')
      aggregate_data <- bind_rows(lapply(levels, grouper, out = out, actorids = actorids, sent = !is.null(sent_dict)))
      return(aggregate_data)
    } else {
      return()
    }
  }
  saveRDS(bind_rows(lapply(years, actor_aggregator, query, actor, actorids, default_operator, localhost, es_pwd)), file = paste0(actor$`_source.country`,'_',paste0(actorids,collapse = ''),actor$`_source.function`,startDate,endDate,'.Rds'))
  print(paste0('Done with ',row,'/',nrow(actors),' actors'))
  return()
}
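
Usage sketch (illustration only; the `actors` data frame and `es_pwd` value are assumed to already exist in the session). Note that no `query` object needs to be defined before the lapply() call at the bottom of the function: R evaluates arguments lazily, and actor_aggregator() assigns `query` before ever reading it.

# Hypothetical driver loop: one call per row of the actors data frame; each
# call saves a <country>_<actorids><function><startDate><endDate>.Rds file in
# the working directory and prints its progress.
lapply(seq_len(nrow(actors)), actor_aggregation,
       actors = actors,
       es_pwd = es_pwd,
       localhost = FALSE,
       default_operator = 'OR')
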
@@ -0,0 +1,188 @@
#' Generate actor data frames (with sentiment) from database
#'
#' Generate actor data frames (with sentiment) from database
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional dataframe containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or NAs if not applicable.
#' @param actor_ids Optional vector containing the actor ids to be collected
#' @param cores Number of threads to use for parallel processing
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' actor_fetcher(out, sent_dict = NULL, cores = 1)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
actor_fetcher <- function(out, sent_dict = NULL, actor_ids = NULL, cores = 1, localhost = NULL, validation = F) {
  plan(multiprocess, workers = cores)
  ### Functions
  ### Calculate sentiment scores for each actor-document
  sent_scorer <- function(row, out_row, ud_sent) {
    ### Contains sentiment per sentence for actor
    actor_tone <- filter(ud_sent, sentence_id %in% unlist(out_row[row,]$sentence_id))

    ### Aggregated sentiment per actor (over all sentences containing actor)
    actor <- summarise(actor_tone,
                       sent = sum(sent_sum)/sum(words),
                       sent_sum = sum(sent_sum),
                       sent_words = sum(sent_words),
                       words = sum(words),
                       arousal = sum(sent_words)/sum(words)
    )
    return(cbind(out_row[row,],data.frame(actor = actor)))
  }

  aggregator <- function(pid, dupe_df) {
    ### Party ids excluding actors
    p_ids <- c(str_c(pid,'_f'),str_c(pid,'_s'))
    ### Party ids including actors
    p_ids_a <- c(p_ids,str_c(pid,'_a'))
    summarizer <- function(p_ids, dupe_df, merged_id) {
      id <- dupe_df$`_id`[[1]]
      dupe_df <- dupe_df %>%
        filter(ids %in% p_ids)
      if (nrow(dupe_df) > 0) {
        return(
          dupe_df %>% summarise(
            `_id` = first(`_id`),
            `_source.doctype` = first(`_source.doctype`),
            `_source.publication_date` = first(`_source.publication_date`),
            prom = list(length(unique(unlist(sentence_id)))/round(occ[[1]]/prom[[1]])),
            sentence_id = list(sort(unique(unlist(sentence_id)))),
            rel_first = list(max(unlist(rel_first))),
            ids = merged_id,
            occ = list(length(unique(unlist(sentence_id)))),
            first = list(min(unlist(sentence_id))),
            actor_start = list(sort(unique(unlist(actor_start)))),
            actor_end = list(sort(unique(unlist(actor_end)))),
            sentence_start = list(sort(unique(unlist(sentence_start)))),
            sentence_end = list(sort(unique(unlist(sentence_end))))
          )
        )
      } else {
        print(paste0('id:',id))
        return(NULL)
      }
    }
    party <- summarizer(p_ids, dupe_df, str_c(pid,'_mfs'))
    party_actor <- summarizer(p_ids_a, dupe_df, str_c(pid,'_mfsa'))
    return(bind_rows(party,party_actor))
  }

  par_sent <- function(row, out, sent_dict = NULL) {
    out_row <- out[row,]
    ### Generating sentence-level sentiment scores from ud
    if (is.null(sent_dict) == F) {
      ud_sent <- out_row$`_source.ud`[[1]] %>%
        select(-one_of('exists')) %>%
        unnest() %>%
        filter(upos != 'PUNCT') # For getting proper word counts
      if ("lem_u" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          mutate(lem_u = str_c(lemma,'_',upos)) %>%
          left_join(sent_dict, by = 'lem_u')
      } else if ("lemma" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          left_join(sent_dict, by = 'lemma') %>%
          mutate(lem_u = lemma)
      }
      ud_sent <- ud_sent %>%
        group_by(sentence_id) %>%
        mutate(
          prox = case_when(
            is.na(prox) == T ~ 0,
            TRUE ~ prox
          )
        ) %>%
        summarise(sent_sum = sum(prox),
                  words = length(lemma),
                  sent_words = sum(prox != 0),
                  sent_lemmas = list(lem_u[prox != 0])) %>%
        mutate(
          sent = sent_sum/words,
          arousal = sent_words/words
        )
      out_row <- select(out_row, -`_source.ud`)
    }

    if (validation == T) {
      codes_sent <- filter(ud_sent, sentence_id == out_row$`_source.codes.sentence.id`[1])
      return(cbind(out_row, codes = codes_sent))
    }

    ### Unnest out_row to individual actor ids
    out_row <- out_row %>%
      unnest(`_source.computerCodes.actorsDetail`) %>%
      mutate(ids_list = ids) %>%
      unnest(ids) %>%
      mutate(
        pids = str_sub(ids, start = 1, end = -3)
      )

    if (!is.null(actor_ids)) {
      out_row <- filter(out_row, ids %in% actor_ids)
    }

    ### Get list of party ids occurring more than once in the document
    pids_table <- table(out_row$pids)
    dupe_pids <- names(pids_table[pids_table > 1]) %>%
      str_subset(pattern = fixed('P_'))
    single_pids <- names(pids_table[pids_table <= 1]) %>%
      str_subset(pattern = fixed('P_'))
    ### Data frame containing only duplicate party ids
    dupe_df <- out_row %>%
      filter(pids %in% dupe_pids)
    ### Data frame containing only single party ids
    single_df <- out_row %>%
      filter(pids %in% single_pids)

    ### Data frame for single occurrence mfsa
    single_party_actor <- single_df %>%
      mutate(
        ids = str_c(pids,'_mfsa')
      )
    ### Data frame for single occurrence mfs
    single_party <- single_df %>%
      filter(!endsWith(ids, '_a')) %>%
      mutate(
        ids = str_c(pids,'_mfs')
      )
    out_row <- out_row %>%
      filter(startsWith(ids,'A_')) %>%
      bind_rows(., single_party, single_party_actor)
    ### For each of the party ids in the list above, aggregate to _mfs and _mfsa
    if (length(dupe_pids) > 0) {
      aggregate <- bind_rows(lapply(dupe_pids, aggregator, dupe_df = dupe_df))
      out_row <- bind_rows(out_row, aggregate)
    }

    ### Generating sentiment scores for article and actors
    if (is.null(sent_dict) == F) {
      ### Aggregated sentiment per article (over all sentences in article)
      text_sent <- summarise(ud_sent,
                             sent = sum(sent_sum)/sum(words),
                             sent_sum = sum(sent_sum),
                             sent_words = sum(sent_words),
                             words = sum(words),
                             arousal = sum(sent_words)/sum(words)
      )
      out_row <- bind_rows(lapply(seq(1,nrow(out_row),1),sent_scorer, out_row = out_row, ud_sent = ud_sent)) %>%
        cbind(., text = text_sent)
    }
    out_row <- out_row %>%
      mutate(
        year = strftime(`_source.publication_date`, format = '%Y'),
        yearmonth = strftime(`_source.publication_date`, format = '%Y%m'),
        yearmonthday = strftime(`_source.publication_date`, format = '%Y%m%d'),
        yearweek = strftime(`_source.publication_date`, format = "%Y%V")
      ) %>%
      select(#-`_source.computerCodes.actorsDetail`,
        -`_score`,
        -`_index`,
        -`_type`,
        -pids)
    return(out_row)
  }
  saveRDS(bind_rows(future_lapply(1:nrow(out), par_sent, out = out, sent_dict = sent_dict)), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
  return()
}
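
For reference, a minimal sketch of the sent_dict format documented above (the words and values are invented): either a lem_u column with lemma_upos pairs or a plain lemma column, plus a prox column holding word values or NA.

# Hypothetical dictionary entries; prox is the per-word sentiment value, NA
# where no value applies (treated as 0 by par_sent() above).
sent_dict <- data.frame(
  lem_u = c('good_ADJ', 'crisis_NOUN', 'increase_VERB'),
  prox  = c(1, -1, NA),
  stringsAsFactors = FALSE
)
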
@@ -0,0 +1,152 @@
#' Updater function for elasticizer: Conduct actor searches
#'
#' Updater function for elasticizer: Conduct actor searches
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ids List of actor ids
#' @param prefix Regex containing prefixes that should be excluded from hits
#' @param postfix Regex containing postfixes that should be excluded from hits
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @param cores Number of cores to use for parallel processing, defaults to cores (all cores available)
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
#' @examples
#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer_old <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver, cores = 1) {
  plan(multiprocess, workers = cores)
  ### Function to filter out false positives using regex
  exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex, pre_tags, post_tags, prefix, postfix) {
    min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
    max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
    split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
    min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
    max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g. if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
    sentence <- paste0(' ',str_sub(doc$merged, min, max),' ') # Extract sentence from text, adding whitespaces before and after for double negation (i.e. Con only when preceded by "(")

    # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
    if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
      return(id)
    } else {
      return(NULL)
    }
  }
  ranger <- function(x, ud) {
    return(which((ud$start <= x) & (ud$end >= x)))
  }
  sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
    doc <- out[row,]
    if (nchar(doc$merged) > 990000) {
      return(
        data.frame(
          err = T,
          errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
        )
      )
    }
    # Extracting ud output from document
    ud <- doc$`_source.ud`[[1]] %>%
      select(-one_of('exists')) %>% # Removing ud.exists variable
      unnest() %>%
      mutate(doc_id = doc$`_id`)
    markers <- doc$markers[[1]][,'start'] # Extract list of markers
    # Convert markers to udpipe rows (in some cases the start position doesn't align with the udpipe token start position (e.g. when anti-|||EU is treated as a single word))
    rows <- unlist(lapply(markers, ranger, ud = ud))

    # Setting up an actor variable
    ud$actor <- F
    ud$actor[rows] <- T

    sentence_count <- max(ud$sentence_id) # Number of sentences in article
    actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor

    # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
    if (!is.na(prefix) || !is.na(postfix)) {
      ### If no pre or postfixes, match *not nothing* i.e. anything
      if (is.na(prefix)) {
        prefix = '$^'
      }
      if (is.na(postfix)) {
        postfix = '$^'
      }
      sentence_ids <- unlist(lapply(actor_sentences,
                                    exceptionizer,
                                    ud = ud,
                                    doc = doc,
                                    markers = markers,
                                    pre_tags_regex = pre_tags_regex,
                                    pre_tags = pre_tags,
                                    post_tags_regex = post_tags_regex,
                                    post_tags = post_tags,
                                    prefix = prefix,
                                    postfix = postfix))
    } else {
      sentence_ids <- actor_sentences
    }
    if (length(sentence_ids) > 0) {
      # Generating nested sentence start and end positions for actor sentences
      ud <- ud %>%
        filter(sentence_id %in% sentence_ids)
      actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
      actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
      ud <- ud %>%
        group_by(sentence_id) %>%
        summarise(
          sentence_start = as.integer(min(start)),
          sentence_end = as.integer(max(end)),
          doc_id = first(doc_id)
        ) %>%
        group_by(doc_id) %>%
        summarise(
          sentence_id = list(as.integer(sentence_id)),
          sentence_start = list(sentence_start),
          sentence_end = list(sentence_end)
        )

      return(
        data.frame(ud, # Sentence id, start and end position for actor sentences
                   actor_start = I(list(actor_start)), # List of actor ud token start positions
                   actor_end = I(list(actor_end)), # List of actor ud token end positions
                   occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
                   prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurrences/total # sentences)
                   rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurrence at sentence level
                   first = min(sentence_ids), # First sentence in which actor is mentioned
                   ids = I(list(ids)) # List of actor ids
        )
      )
    } else {
      return(NULL)
    }

  }
  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
  offsetter <- function(x, pre_tags, post_tags) {
    return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
  }
  prefix[prefix==''] <- NA
  postfix[postfix==''] <- NA
  pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
  post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
  out$markers <- future_lapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)

  # ids <- fromJSON(ids)
  updates <- bind_rows(future_lapply(seq(1,length(out[[1]]),1), sentencizer,
                                     out = out,
                                     ids = ids,
                                     postfix = postfix,
                                     prefix = prefix,
                                     pre_tags_regex = pre_tags_regex,
                                     pre_tags = pre_tags,
                                     post_tags_regex = post_tags_regex,
                                     post_tags = post_tags))
  if (nrow(updates) == 0) {
    print("Nothing to update for this batch")
    return(NULL)
  } else {
    bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
    bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
    return(elastic_update(bulk, es_super = es_super, localhost = localhost))
  }

}
@@ -0,0 +1,39 @@
#' Aggregator function, to aggregate actor results
#'
#' Aggregator function, to aggregate actor results
#' @param id Article id of the article for which actor aggregation should be done
#' @param actor_df The dataframe containing the actor data
#' @param merge_id The actorid that should be assigned to the merged result
#' @return A dataframe with the merged results
#' @export
#' @examples
#' aggregator(id, actor_df, merge_id)

aggregator <- function(id, actor_df, merge_id) {
  article <- filter(actor_df, `_id` == id) %>%
    unnest(sentence_id, .preserve = colnames(.))
  occ <- length(unlist(unique(article$sentence_id1)))
  sentence_count <- round(article$occ[[1]]/article$prom[[1]])
  prom <- occ/sentence_count
  rel_first <- 1-(min(article$sentence_id1)/sentence_count)
  actor_start <- sort(unique(unlist(article$actor_start)))
  actor_end <- sort(unique(unlist(article$actor_end)))
  sentence_start <- sort(unique(unlist(article$sentence_start)))
  sentence_end <- sort(unique(unlist(article$sentence_end)))
  sentence_id <- sort(unique(unlist(article$sentence_id)))

  return(data.frame(doc_id = first(article$`_id`),
                    sentence_id = I(list(as.integer(sentence_id))),
                    sentence_start = I(list(sentence_start)),
                    sentence_end = I(list(sentence_end)),
                    actor_start = I(list(actor_start)), # List of actor ud token start positions
                    actor_end = I(list(actor_end)), # List of actor ud token end positions
                    occ = occ, # Number of sentences in which actor occurs
                    prom = prom, # Relative prominence of actor in article (number of occurrences/total # sentences)
                    rel_first = rel_first, # Relative position of first occurrence at sentence level
                    first = min(article$sentence_id1), # First sentence in which actor is mentioned
                    ids = merge_id, # List of actor ids
                    stringsAsFactors = F
  )
  )
}
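
One step worth making explicit: aggregator() recovers the article's total sentence count from the stored occ and prom values, which works because actorizer defines prom = occ/sentence_count. A worked example with invented numbers:

# An actor occurring in 3 of 12 sentences is stored as occ = 3, prom = 0.25,
# so the sentence total can be recovered by inverting the ratio:
occ <- 3
prom <- 0.25
round(occ / prom)  # 12
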
@@ -0,0 +1,61 @@
### Notes:
# Do you want to search for either one OR other actorid, or both occurring in the same document?
# Do you want to keep only the occurrences of the actorids you are searching for, or all actor occurrences in the hits?
# Search by actorId, then aggregate by month
# When actorId starts with P_, define what hits you want to get (short, full, actor), if more than one, aggregate properly
# Develop query generator for specific actors (i.e. combine actorId with start and end dates)


#' Generate and store aggregate actor measures to elasticsearch
#'
#' Generate and store aggregate actor measures to elasticsearch
#' @param out The output provided by elasticizer()
#' @param localhost Boolean indicating if the script should run locally, or remote
#' @param es_super Write password for ES
#' @param actorids List of actorids used in the search, should be the same as the actorids used for elasticizer()
#' @param ver String indicating the version of the update
#' @return Return value is based on output of elastic_update()
#' @export
#' @examples
#' aggregator_elastic(out, localhost = F, actorids, ver, es_super)
#################################################################################################
#################################### Aggregate actor results ################################
#################################################################################################
aggregator_elastic <- function(out, localhost = F, actorids, ver, es_super) {
  ### Generating actor dataframe, unnest by actorsDetail, then by actor ids. Filter out non-relevant actor ids.
  partyid <- str_sub(actorids[1], end=-3)
  actor_df <- out %>%
    unnest() %>%
    unnest(ids, .preserve = colnames(.)) %>%
    filter(ids1 %in% actorids)

  agg_party_actors <- bind_rows(lapply(unique(actor_df$`_id`),
                                       mamlr:::aggregator,
                                       actor_df = actor_df,
                                       merge_id = paste0(partyid,'_mfsa')))

  party <- actor_df %>%
    filter(!endsWith(ids1, '_a'))
  agg_party <- bind_rows(lapply(unique(party$`_id`),
                                mamlr:::aggregator,
                                actor_df = party,
                                merge_id = paste0(partyid,'_mfs')))

  actors_only <- actor_df %>%
    filter(endsWith(ids1, '_a'))
  agg_actors <- bind_rows(lapply(unique(actors_only$`_id`),
                                 mamlr:::aggregator,
                                 actor_df = actors_only,
                                 merge_id = paste0(partyid,'_ma')))
  df_out <- bind_rows(agg_party_actors, agg_party, agg_actors)
  doc_ids <- df_out$doc_id
  df_out <- df_out %>%
    select(-1) %>%
    split(as.factor(doc_ids))
  df_out <- data.frame(doc_id = names(df_out), list = I(df_out))
  bulk <- apply(df_out, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
  return(elastic_update(bulk, es_super = es_super, localhost = localhost))
}
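
Usage sketch (illustration only; the party id, version string, and password object are placeholders): the actorids passed here should be the same _s/_f/_a variants used in the elasticizer() search, so they can be merged into the _mfs, _mfsa and _ma aggregates above.

# Hypothetical party P_123 with short, full and actor id variants.
actorids <- c('P_123_s', 'P_123_f', 'P_123_a')
aggregator_elastic(out, localhost = FALSE, actorids = actorids,
                   ver = 'actorsV1', es_super = es_super)
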
@@ -0,0 +1,47 @@
#' Merges list of lemmas back into a pseudo-document
#'
#' Merges list of lemmas back into a pseudo-document
#' @param row A row number from the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger(1, words = '999', out, text)
#################################################################################################
#################################### Reconstructing documents from lemmas########################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger_old <- function(row, out, text, clean) {
  df <- out[row,]
  # Merging lemmas into a single string
  if (text == 'lemmas') {
    lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
  }
  if (text == 'ud') {
    lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
  }
  if (text == 'ud_upos') {
    df <- unnest(df,`_source.ud`)
    lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
      # Regex removes all words consisting of or containing numbers, @#$%
      # Punctuation is not taken into account, as it is already filtered out, see above
      {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "") else . }
    # In the very rare but obviously occurring (CxqrOmMB4Bzg6Uhtzw0P) case that a document consists only of punctuation, return an empty string
    if (length(lemmas) == 0 ){
      lemmas <- ''
    }
    return(lemmas)
  }
  # Replacing $-marked punctuation with their regular forms
  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
    # Regex also used in out_parser
    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . } %>%
    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
    paste0(.,". ")
  return(lemmas)
}
@@ -0,0 +1,143 @@
#' Updater function for elasticizer: Conduct actor searches
#'
#' Updater function for elasticizer: Conduct actor searches
#' @param out Does not need to be defined explicitly! (is already parsed in the elasticizer function)
#' @param localhost Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)
#' @param ids List of actor ids
#' @param prefix Regex containing prefixes that should be excluded from hits
#' @param postfix Regex containing postfixes that should be excluded from hits
#' @param identifier String used to mark highlights. Should be a lowercase string
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @param es_super Password for write access to ElasticSearch
#' @return As this is a nested function used within elasticizer, there is no return output
#' @export
#' @examples
#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
  offsetter <- function(x, pre_tags, post_tags) {
    return(as.list(as.data.frame(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))))
  }

  out <- mamlr:::out_parser(out, field = 'highlight', clean = F) %>%
    ## Computing offset for first token position (some articles have a minimum token start position of 16, instead of 1 or 2)
    mutate( # Checking if the merged field starts with a whitespace character
      space = case_when(
        str_starts(merged, '\\s') ~ 1,
        T ~ 0)
    ) %>%
    unnest(cols = '_source.ud') %>%
    rowwise() %>%
    mutate(ud_min = min(unlist(start))-1-space) ## Create offset variable, subtract 1 for default token start position of 1, and subtract 1 if merged field starts with a whitespace

  print(str_c('Number of articles with minimum token start position higher than 2: ',sum(out$ud_min > 2)))
  print('Unique ud_min offset values in batch: ')
  print(unique(out$ud_min))
  prefix[prefix==''] <- NA
  postfix[postfix==''] <- NA
  pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
  post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)

  if (sum(nchar(out$merged) > 990000) > 0) {
    stop("One or more documents in this batch exceed 990000 characters")
  }
  # Extracting ud output from document
  ud <- out %>%
    select(`_id`,lemma,start,end, sentence_id,merged) %>%
    unnest(cols = colnames(.))

  sentences <- ud %>%
    group_by(`_id`, sentence_id) %>%
    summarise(
      sentence_start = min(start),
      sentence_end = max(end)
    ) %>%
    mutate(
      sentence_count = n()
    )

  out$markers <- lapply(str_locate_all(out$merged,coll(pre_tags)),offsetter, pre_tags = pre_tags, post_tags = post_tags)
  markers <- out %>%
    select(`_id`,markers, ud_min) %>%
    unnest_wider(markers) %>%
    rename(marker_start = start, marker_end = end) %>%
    unnest(colnames(.)) %>%
    ## Modifying marker start and end positions using the ud_min column (see above)
    mutate(marker_start = marker_start +(ud_min),
           marker_end = marker_end + (ud_min))


  hits <- as.data.table(ud)[as.data.table(markers), .(`_id`, lemma,x.start, start, end, x.end, sentence_id, merged), on =.(`_id` = `_id`, start <= marker_start, end >= marker_start)] %>%
    mutate(end = x.end,
           start = x.start) %>%
    select(`_id`, sentence_id, start, end,merged) %>%
    group_by(`_id`,sentence_id) %>%
    summarise(
      actor_start = I(list(start)),
      actor_end = I(list(end)),
      n_markers = length(start),
      merged = first(merged)
    ) %>%
    left_join(.,sentences, by=c('_id','sentence_id')) %>%
    ungroup %>%
    arrange(`_id`,sentence_id) %>%
    group_by(`_id`) %>%
    mutate(n_markers = cumsum(n_markers)) %>%
    mutate(
      sentence_start_tags = sentence_start+((nchar(pre_tags)+nchar(post_tags))*(lag(n_markers, default = 0))),
      sentence_end_tags = sentence_end+((nchar(pre_tags)+nchar(post_tags))*(n_markers))
    ) %>%
    mutate(
      sentence = paste0(' ',str_sub(merged, sentence_start_tags, sentence_end_tags),' ')
    ) %>%
    select(-merged) %>%
    ungroup()
  # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
  if (!is.na(prefix) || !is.na(postfix)) {
    ### If no pre or postfixes, match *not nothing* i.e. anything
    if (is.na(prefix)) {
      prefix = '$^'
    }
    if (is.na(postfix)) {
      postfix = '$^'
    }
    hits <- hits %>%
      filter(
        !str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) & !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))
      )
  }

  ### Checking and removing any na rows, and reporting them in the console
  nas <- hits %>% filter(is.na(sentence_id))
  hits <- hits %>% filter(!is.na(sentence_id))
  if (nrow(nas) > 0) {
    print(str_c('The following articles have not been searched correctly for actorId ',ids))
    print(str_c('id_na: ',nas$`_id`, collapse = '\n '))
  }
  if (nrow(hits) == 0) {
    print("Nothing to update for this batch")
    return(NULL)
  } else {
    hits <- hits %>%
      group_by(`_id`) %>%
      summarise(
        sentence_id = list(as.integer(sentence_id)),
        sentence_start = list(sentence_start),
        sentence_end = list(sentence_end),
        actor_start = I(list(unlist(actor_start))), # List of actor ud token start positions
        actor_end = I(list(unlist(actor_end))), # List of actor ud token end positions
        occ = length(unique(unlist(sentence_id))), # Number of sentences in which actor occurs
        first = min(unlist(sentence_id)), # First sentence in which actor is mentioned
        ids = I(list(ids)), # List of actor ids
        sentence_count = first(sentence_count)
      ) %>%
      mutate(
        prom = occ/sentence_count, # Relative prominence of actor in article (number of occurrences/total # sentences)
        rel_first = 1-(first/sentence_count) # Relative position of first occurrence at sentence level
      ) %>%
      select(`_id`:occ, prom,rel_first,first,ids)
    bulk <- apply(hits, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
    bulk <- c(bulk,apply(hits[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
    return(elastic_update(bulk, es_super = es_super, localhost = localhost))
  }

}
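
The offset arithmetic shared by both offsetter() variants may be easier to see with numbers (an illustration; the tag strings are hypothetical and stringr is assumed to be loaded): every earlier highlight inflates later match positions by nchar(pre_tags) + nchar(post_tags), so the n-th match has to be shifted back by (n-1) tag widths.

# With pre_tags = '<em>' (4 chars) and post_tags = '</em>' (5 chars), the
# second marker sits 9 characters further into the tagged text than in the
# untagged text; subtracting (row(x)-1) tag widths undoes the inflation.
x <- str_locate_all('a <em>X</em> b <em>Y</em>', coll('<em>'))[[1]]
x - ((row(x) - 1) * (nchar('<em>') + nchar('</em>')))  # starts become 3 and 7
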
@@ -1,19 +1,47 @@
 #' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
 #'
 #' Generate a line-delimited JSON string for use in Elasticsearch bulk updates
+#' Type can be either one of three values:
+#' set: set the value of [varname] to x
+#' add: add x to the values of [varname]
+#' varname: When using ud, the ud field will be updated instead of a computerCodes field
 #' @param x A single-row data frame, or a string containing the variables and/or values that should be updated (a data frame is converted to a JSON object, strings are stored as-is)
 #' @param index The name of the Elasticsearch index to update
-#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created)
+#' @param varname String indicating the parent variable that should be updated (when it does not exist, it will be created, all varnames are prefixed by computerCodes)
+#' @param type Type of updating to be done, can be either 'set', 'add', or 'addnested'
+#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
 #' @return A string usable as Elasticsearch bulk update command, in line-delimited JSON
 #' @export
 #' @examples
-#' bulk_writer(x, index = 'maml', varname = 'updated_variable')
+#' bulk_writer(x, index = 'maml')
 #################################################################################################
 #################################### Bulk update writer ################################
 #################################################################################################
-bulk_writer <- function(x, index = 'maml', varname = 'updated_variable') {
-  return(
-    paste0('{"update": {"_index": "',index,'", "_type": "doc", "_id": "',x[1],'"}}
-{ "script" : { "source": "ctx._source.',varname,' = params.code", "lang" : "painless","params" : {"code":',toJSON(x[-1], collapse = F),'}}}')
-  )
+bulk_writer <- function(x, index = 'maml', varname, type, ver) {
+  ### Create a json object if more than one variable besides _id, otherwise use value as-is
+  if (length(x) > 2) {
+    json <- toJSON(list(x[-1]), collapse = T)
+  } else {
+    names(x) <- NULL
+    json <- toJSON(x[-1], collapse = T)
+  }
+  if (varname == "ud") {
+    return(
+      paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
+{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; ctx._source.ud = params.code; ctx._source.remove(\\"tokens\\")", "lang" : "painless", "params": { "code": ',json,'}}}')
+    )
+  }
+  if (type == 'set') {
+    return(
+      paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
+{ "script" : { "source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,', "object": {"',varname,'": ',json,'} }}}')
+    )
+  }
+  if (type == "add") {
+    return(
+      paste0('{"update": {"_index": "',index,'", "_type": "_doc", "_id": "',x[1],'"}}
+{"script": {"source": "ctx._source.version = \\"',ver,'\\"; if (ctx._source.computerCodes != null && ctx._source.computerCodes.containsKey(\\"',varname,'\\")) {ctx._source.computerCodes.',varname,'.addAll(params.code)} else if (ctx._source.computerCodes != null) {ctx._source.computerCodes.',varname,' = params.code} else {ctx._source.computerCodes = params.object}", "lang" : "painless", "params": { "code": ',json,' , "object": {"',varname,'": ',json,'}}}}'
+      )
+    )
+  }
 }
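
A quick usage sketch for the new interface (made-up values, mirroring the apply() calls in dupe_detect further down): each call returns one action line plus one painless-script line of line-delimited JSON.

# x mimics a row passed in by apply(): element 1 is the document _id, the rest
# is the payload; the result is a single NDJSON update command for the 'maml' index.
x <- c('abc123', 1)
cat(bulk_writer(x, index = 'maml', varname = 'duplicates', type = 'set', ver = 'dupesV1'))
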
@@ -0,0 +1,62 @@
#' Generate CV folds for nested cross-validation
#'
#' Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
#'
#' @param outer_k Number of outer CV (performance estimation) folds. If outer_k < 1 holdout sampling is used, with outer_k being the amount of test data
#' @param inner_k Number of inner CV (parameter optimization) folds
#' @param vec Vector containing the true values of the classification
#' @param grid Parameter grid for optimization
#' @param seed Integer used as seed for random number generation
#' @return A nested set of lists with row numbers
#' @export
#' @examples
#' cv_generator(outer_k, inner_k, dfm, class_type)
#################################################################################################
#################################### Generate CV folds ##########################################
#################################################################################################
cv_generator <- function(outer_k, inner_k, vec, grid, seed) {
  ### Generate inner folds for nested cv
  inner_loop <- function(i, folds, vec, inner_k, grid, seed) {
    # RNG needs to be set explicitly for each fold
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    inner_folds <- createFolds(as.factor(vec[-folds[[i]]]), k= inner_k)
    grid <- crossing(grid, inner_fold = names(inner_folds), outer_fold = names(folds)[i])
    return(list(grid = grid, inner_folds = inner_folds, outer_fold = names(folds)[i]))
  }

  ### Generate outer folds for nested cv
  generate_folds <- function(outer_k, inner_k, vec, grid, seed){
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    if (is.null(outer_k)) { # If no outer_k, use all data to generate inner_k folds for parameter optimization
      inner_folds <- createFolds(as.factor(vec), k= inner_k)
      grid <- crossing(grid, inner_fold = names(inner_folds))
      return(list(grid = grid,
                  inner_folds = inner_folds))
    } else if (outer_k < 1) { # Create holdout validation for model performance estimation, with test set equal to outer_k
      folds <- createDataPartition(as.factor(vec), p=outer_k)
    } else { # Do full nested CV
      folds <- createFolds(as.factor(vec), k= outer_k)
    }
    # Generate grid of hyperparameters for model optimization, and include inner folds row numbers
    grid_folds <- lapply(1:length(folds),
                         inner_loop,
                         folds = folds,
                         vec = vec,
                         inner_k = inner_k,
                         grid = grid,
                         seed = seed)

    # Extract grid dataframe from results
    grid <- grid_folds %>% purrr::map(1) %>% dplyr::bind_rows()

    # Extract row numbers for inner folds from results
    inner_folds <- grid_folds %>% purrr::map(2)

    # Extract the names of the inner folds from results
    names(inner_folds) <- grid_folds %>% purrr::map(3) %>% unlist(.)
    return(list(grid = grid,
                outer_folds = folds,
                inner_folds = inner_folds))
  }
  return(generate_folds(outer_k,inner_k = inner_k, vec = vec, grid = grid, seed = seed))
}
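
Usage sketch (the labels and grid values are invented; caret and tidyr are assumed to be loaded, as in the function itself): with 5 outer and 3 inner folds, the returned grid has one row per outer fold x inner fold x parameter combination.

# vec holds the true class labels; grid is the hyperparameter search space.
vec <- sample(c('junk', 'not_junk'), 200, replace = TRUE)
grid <- expand.grid(k = c(1, 5), prop = c(0.5, 0.8))
folds <- cv_generator(outer_k = 5, inner_k = 3, vec = vec, grid = grid, seed = 42)
nrow(folds$grid)  # 5 outer folds * 3 inner folds * 4 parameter combinations = 60
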
@ -0,0 +1,67 @@
|
|||||||
|
#' Get ids of duplicate documents whose cosine similarity score lies between [cutoff_lower] and [cutoff_upper]
#'
#' Get ids of duplicate documents whose cosine similarity score lies between [cutoff_lower] and [cutoff_upper]
#' @param row Row of grid to parse
#' @param grid A cross-table of all possible combinations of doctypes and dates
#' @param cutoff_lower Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive)
#' @param cutoff_upper Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive)
#' @param es_pwd Password for Elasticsearch read access
#' @param es_super Password for Elasticsearch write access
#' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so the document will be a little shorter than [words])
#' @param localhost Defaults to TRUE. When TRUE, connect to a local Elasticsearch instance on the default port (9200)
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')
#' @return dupe_objects.json and a data frame containing each id and all its duplicates. remove_ids.txt and a character vector with the list of ids to be removed. Files are written to the current working directory
#' @export
#' @examples
#' dupe_detect(1, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver = ver)

#################################################################################################
#################################### Duplicate detector ################################
#################################################################################################
dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T, ver) {
  params <- grid[row,]
  print(paste0('Parsing ',params$doctypes,' on ',params$dates))
  query <- paste0('doctype:\\"',params$doctypes,'\\" && publication_date:',params$dates,' && !computerCodes._delete:1')
  out <- elasticizer(query_string(query, fields = c('country','text','title','subtitle','teaser','preteaser')), es_pwd = es_pwd, localhost = localhost)
  if (class(out$hits$hits) != 'list') {
    dfm <- dfm_gen(out, text = "full", words = words, clean = T)
    if (sum(dfm[1,]) > 0) {
      simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
      diag(simil) <- NA
      duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)
      duplicates <- cbind(duplicates, rowid = rownames(duplicates))
      if (length(duplicates) > 0) {
        rownames(duplicates) <- seq_along(rownames(duplicates))
        df <- as.data.frame(duplicates, make.names = NA, stringsAsFactors = F) %>%
          # bind_cols(colid = colnames(simil)[.['col']]) %>%
          mutate(colid = colnames(simil)[as.numeric(col)]) %>%
          .[,c(3,4)] %>%
          group_by(colid) %>% summarise(rowid = list(rowid))
        text <- capture.output(stream_out(df))
        # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
        simil[upper.tri(simil)] <- NA
        # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
        #       file = paste0(getwd(),'/remove_ids.txt'),
        #       append=T)
        dupe_delete <- data.frame(id = unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
                                  dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
        bulk <- c(apply(df, 1, bulk_writer, varname = 'duplicates', type = 'set', ver = ver),
                  apply(dupe_delete, 1, bulk_writer, varname = '_delete', type = 'set', ver = ver))
        res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
        return(paste0('Checked ',params$doctypes,' on ',params$dates))
      } else {
        return(paste0('No duplicates for ',params$doctypes,' on ',params$dates))
      }
    } else {
      return(paste0('No results for ',params$doctypes,' on ',params$dates))
    }
  } else {
    return(paste0('No results for ',params$doctypes,' on ',params$dates))
  }
  ### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
  # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
  # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
  # simil_nodupes <- as.matrix(textstat_simil(dfm_nodupes, margin="documents", method="cosine"))
  # diag(simil_nodupes) <- NA
  # which(simil_nodupes >= cutoff, arr.ind = TRUE)
}
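
## Usage sketch for the rewritten dupe_detect(); the doctypes, dates and
## passwords below are hypothetical placeholders:
grid <- tidyr::crossing(doctypes = c('doctype_a', 'doctype_b'),
                        dates = seq(as.Date('2019-01-01'), as.Date('2019-01-31'), by = 'day'))
res <- lapply(seq_len(nrow(grid)), dupe_detect,
              grid = grid, cutoff_lower = .9, cutoff_upper = 1,
              es_pwd = 'read_pwd', es_super = 'write_pwd',
              words = 999, localhost = T, ver = 'dupes_v1')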
@@ -1,54 +0,0 @@
#' Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
#'
#' Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
#' @param row Row of grid to parse
#' @param grid A cross-table of all possible combinations of doctypes and dates
#' @param cutoff Cutoff value for cosine similarity above which documents are considered duplicates
#' @param es_pwd Password for Elasticsearch read access
#' @return dupe_objects.json (containing each id and all its duplicates) and remove_ids.txt (list of ids to be removed) in current working directory
#' @export
#' @examples
#' dupe_detect(1,grid,es_pwd)

#################################################################################################
#################################### Duplicate detector ################################
#################################################################################################

dupe_detect <- function(row, grid, cutoff, es_pwd) {
  params <- grid[row,]
  print(paste0('Parsing ',params$doctypes,' on ',params$dates))
  query <- paste0('{"query":
    {"bool": {"filter":[{"term":{"doctype": "',params$doctypes,'"}},
      {"range" : {
        "publication_date" : {
          "gte" : "',params$dates,'T00:00:00Z",
          "lt" : "',params$dates+1,'T00:00:00Z"
        }
      }}]
  } } }')

  out <- elasticizer(query, es_pwd = es_pwd)
  dfm <- dfm_gen(out, text = "full")
  simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
  diag(simil) <- NA
  simil_og <- simil
  df <- as.data.frame(which(simil >= cutoff, arr.ind = TRUE)) %>%
    rownames_to_column("rowid") %>%
    mutate(colid = colnames(simil)[col]) %>%
    .[,c(1,4)] %>%
    group_by(colid) %>% summarise(rowid = list(rowid))
  text <- capture.output(stream_out(df))
  write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
  simil[upper.tri(simil)] <- NA
  write(unique(rownames(which(simil >= cutoff, arr.ind = TRUE))),
        file = paste0(getwd(),'/remove_ids.txt'),
        append=T)
  ### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
  # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
  # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
  # simil_nodupes <- as.matrix(textstat_simil(dfm_nodupes, margin="documents", method="cosine"))
  # diag(simil_nodupes) <- NA
  # which(simil_nodupes >= cutoff, arr.ind = TRUE)
}
@@ -0,0 +1,100 @@
#' Generate models and get classifications on test sets
#'
#' Estimates a model for one row of the grid of outer folds, inner folds and parameter combinations, and classifies the corresponding test set
#'
#' @param row Row number of current item in grid
#' @param grid Grid with model parameters and CV folds
#' @param outer_folds List with row numbers for outer folds
#' @param inner_folds Nested list with row numbers for inner folds, per outer fold
#' @param dfm DFM containing labeled documents
#' @param class_type Name of column in docvars() containing the classes
#' @param model Model to use (currently "nb" or "svm")
#' @param we_vectors Matrix with word embedding vectors
#' @return Depending on mode: when folds are included, returns the true and predicted classes of the test set, together with the parameters, model and model idf; when there are no folds, returns the final model and its idf values
#' @export
#' @examples
#' estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors)
#################################################################################################
#################################### Generate models ############################################
#################################################################################################

### Classification function
estimator <- function (row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors) {
  # Get parameters for current iteration
  params <- grid[row,]

  # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
  if ("inner_fold" %in% colnames(params) && "outer_fold" %in% colnames(params)) {
    dfm_train <- dfm[-outer_folds[[params$outer_fold]],] %>%
      .[-inner_folds[[params$outer_fold]][[params$inner_fold]],]
    dfm_test <- dfm[-outer_folds[[params$outer_fold]],] %>%
      .[inner_folds[[params$outer_fold]][[params$inner_fold]],]
  # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
  } else if ("outer_fold" %in% colnames(params)) {
    dfm_train <- dfm[-outer_folds[[params$outer_fold]],]
    dfm_test <- dfm[outer_folds[[params$outer_fold]],]
  # If only inner folds, validate performance directly on inner folds
  } else if ("inner_fold" %in% colnames(params)) {
    dfm_train <- dfm[-inner_folds[[params$inner_fold]],]
    dfm_test <- dfm[inner_folds[[params$inner_fold]],]
  # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
  } else {
    dfm_test <- NULL
    dfm_train <- dfm
  }

  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
  if (params$tfidf) {
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_weight(dfm_test, weights = idf)
    }
  } else {
    idf <- NULL
  }

  if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
    # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
    words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
                                  feat_select,
                                  dfm = dfm_train,
                                  class_type = params$class_type,
                                  percentile = params$feat_percentiles,
                                  measure = params$feat_measures
    )))
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_keep(dfm_test, words, valuetype="fixed", verbose=F)
    }
  }

  if (model == "nb") {
    text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
  }
  if (model == "svm") {
    text_model <- svm(x=as.matrix(dfm_train), y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
  }
  # if (model == 'nnet') {
  #   idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
  #   text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
  # }
  ### Add more if statements for different models

  # If training on whole dataset, return final model, and idf values from dataset
  if (is.null(dfm_test)) {
    return(list(text_model=text_model, idf=idf))
  } else { # Create a test set, and classify test items
    # Use force=T to keep only features present in both training and test set
    pred <- predict(text_model, newdata = dfm_test, type = 'class', force = T)

    return(data.frame(
      tv = I(list(docvars(dfm_test, class_type))), # True values from test set
      pred = I(list(pred)), # Predictions of test set
      params, # Parameters used to generate classification model
      text_model = I(list(text_model)), # The classification model
      idf = I(list(idf)), # IDF of the training dataset used for model creation
      stringsAsFactors = F
    ))
  }
}
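
## Illustrative call pattern: one estimator() call per grid row, with folds as
## produced by the cv fold generation step (grid, outer_folds, inner_folds and
## dfm are assumed to exist). The calls can also be parallelized, e.g. with
## mclapply().
preds <- dplyr::bind_rows(
  lapply(seq_len(nrow(grid)), estimator,
         grid = grid, outer_folds = outer_folds, inner_folds = inner_folds,
         dfm = dfm, class_type = 'junk', model = 'nb', we_vectors = NULL)
)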
@@ -0,0 +1,28 @@
#' Select features using quanteda textstat_keyness
#'
#' Select features based on the textstat_keyness function and a percentile cutoff
#' Percentiles are based on absolute values, i.e. both on words that are key and *not* key to the topic
#'
#' @param topic The topic to determine keywords for
#' @param dfm The input dfm
#' @param class_type Name of the column in docvars containing the classification
#' @param percentile Cutoff for the list of words that should be returned
#' @param measure Measure to use in determining keyness, default = chi2; see textstat_keyness for other options
#' @return A vector of words that are key to the topic
#' @export
#' @examples
#' feat_select(topic, dfm, class_type, percentile, measure="chi2")
#################################################################################################
#################################### Feature selection ##########################################
#################################################################################################

feat_select <- function (topic, dfm, class_type, percentile, measure="chi2") {
  # Use quanteda textstat_keyness to determine feature importance
  keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
    na.omit()
  # Convert keyness values to absolute values, to take into account both positive and negative extremes
  keyness[,2] <- abs(keyness[,2])
  # Keep only the words whose absolute keyness value falls above the [percentile] quantile
  keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]), percentile))$feature
  return(keyness)
}
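
## Example: keep features whose absolute chi2 keyness is above the .90 quantile
## for at least one class (the dfm and the 'junk' docvar with numeric class
## codes are hypothetical):
topics <- unique(quanteda::docvars(dfm, 'junk'))
words <- unique(unlist(lapply(topics, feat_select, dfm = dfm,
                              class_type = 'junk', percentile = .90)))
dfm_selected <- quanteda::dfm_keep(dfm, words, valuetype = 'fixed')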
@@ -0,0 +1,42 @@
#' Generate text output files (without punctuation) for external applications, such as GloVe embeddings
#'
#' Generate text output files (without punctuation) for external applications, such as GloVe embeddings
#' @param out The elasticizer-generated data frame
#' @param file The file to write the output to (including path, when required). When documents = T, provide a path including the trailing /
#' @param documents Indicate whether the writer should output to a single file, or individual documents
#' @param lemma Indicate whether document output should be lemmas or the original document
#' @param cores Indicate the number of cores to use for parallel processing
#' @param localhost Unused, defaults to FALSE
#' @return No return value; output is written to the file(s) indicated by [file]
#' @export
#' @examples
#' lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)

#################################################################################################
#################################### Lemma text file generator #############################
#################################################################################################

lemma_writer <- function(out, file, localhost = F, documents = F, lemma = F, cores = 1) {
  plan(multiprocess, workers = cores)
  par_writer <- function(row, out, lemma) {
    if (lemma == T) {
      cat(iconv(unlist(unnest(out[row,],`_source.ud`)$lemma), to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
    } else {
      cat(iconv(out[row,]$merged, to = "UTF-8"), file = paste0(file,out[row,]$`_id`,'.txt'), append = F)
    }
  }

  if (documents == F) {
    out <- unnest(out,`_source.ud`)
    lemma <- str_c(unlist(out$lemma)[-which(unlist(out$upos) == 'PUNCT')], unlist(out$upos)[-which(unlist(out$upos) == 'PUNCT')], sep = '_')
    cat(lemma, file = file, append = T)
  }
  if (documents == T) {
    if (lemma == F) {
      out <- out_parser(out, field = '_source', clean = F)
    }
    future_lapply(1:nrow(out), par_writer, out = out, lemma = lemma)
  }
}
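
## Two illustrative invocations (paths and out are placeholders): a single
## corpus-level file of lemma_upos tokens, and one lemma file per document.
lemma_writer(out, file = '~/corpus_lemmas.txt')
lemma_writer(out, file = '~/lemmas_by_doc/', documents = T, lemma = T, cores = 4)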
@@ -1,28 +1,60 @@
 #' Merges list of lemmas back into a pseudo-document
 #'
 #' Merges list of lemmas back into a pseudo-document
-#' @param row A row number from the Elasticizer-generated data frame
-#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param out The elasticizer-generated data frame
+#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching a regex (see code)
 #' @return A documentified string of lemmas, one document at a time
 #' @export
 #' @examples
-#' merger(1, words = '999', out = out)
+#' merger(out, text, clean)
 #################################################################################################
 #################################### Reconstructing documents from lemmas########################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, words = '999', out = out) {
-  df <- out[row,]
-  # Merging lemmas into single string
-  lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
-  # Replacing $-marked punctuation with their regular forms
-  lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
-    ### Removing numbers and non-words containing numbers
-    str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
-    # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
-    paste0(.,". ")
-  if (words != "999") {
-    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))}
-  return(lemmas)
+merger <- function(out, text, clean) {
+  df <- unnest(out, cols = '_source.ud') %>%
+    select(`_id`,lemma,upos) %>%
+    unnest(cols = c('lemma','upos')) %>%
+    # This line is added in the new merger function; in the old merger function its absence would result in the following:
+    # 1: when using ud, the string "NA" being present in place of the faulty lemma
+    # 2: when using ud_upos, the entire article becoming NA, because str_c() returns NA when any value is NA
+    filter(!is.na(lemma)) %>%
+    group_by(`_id`)
+  if (text == 'ud_upos') {
+    df <- df %>%
+      filter(upos != 'PUNCT') %>%
+      mutate(
+        lem_u = str_c(lemma,upos,sep="_")
+      ) %>%
+      summarise(
+        merged = str_c(c(lem_u), collapse= ' ')
+      ) %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is not taken into account, as it is already filtered out, see above
+      {if(clean == T) mutate(.,
+        merged = str_replace_all(merged,"\\S*?[0-9@#$%]+[^\\s]*", "")
+      )
+      else . }
+  }
+  if (text == 'ud') {
+    df <- df %>%
+      summarise(
+        merged = str_c(c(lemma), collapse= ' ')
+      ) %>%
+      mutate(
+        merged = str_replace_all(merged," \\$(.+?)", "\\1")
+      ) %>%
+      # Regex removes all words consisting of or containing numbers, @#$%
+      # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
+      # Regex also used in out_parser
+      # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
+      {if(clean == T) mutate(.,
+        merged = str_replace_all(merged,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")
+      )
+      else . } %>%
+      mutate(.,
+        merged = paste0(merged,'. '))
+  }
+  return(df)
 }
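
## The new merger() operates on the elasticizer output as a whole; a sketch
## (out is assumed to contain the nested _source.ud annotations):
docs <- merger(out, text = 'ud_upos', clean = T)
docs$merged[1]  # one merged pseudo-document per _id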
@@ -0,0 +1,47 @@
#' Generate performance statistics for models
#'
#' Generate performance statistics for models, based on their predictions and the true values
#'
#' @param x A data frame containing at least the columns "pred" and "tv"
#' @return x, with additional columns for performance metrics
#' @export
#' @examples
#' metric_gen(x)
#################################################################################################
############################# Performance metric generation #####################################
#################################################################################################

metric_gen <- function(x) {
  ### Fix for missing classes in multiclass classification
  ### Sorting u for easier interpretation of the confusion matrix
  u <- as.character(sort(as.numeric(union(unlist(x$pred), unlist(x$tv)))))
  # Create a crosstable with predictions and true values
  class_table <- table(prediction = factor(unlist(x$pred), u), trueValues = factor(unlist(x$tv), u))

  # When there are only two classes, set the positive class explicitly as the class with the highest value
  if (length(unique(u)) == 2) {
    conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
    weighted_measures <- as.data.frame(conf_mat$byClass)
    macro_measures <- as.data.frame(conf_mat$byClass)
  } else {
    # Create a confusion matrix
    conf_mat <- confusionMatrix(class_table, mode = "everything")
    # Set "positive" value to NA, because it is not applicable
    conf_mat$positive <- NA
    # Compute weighted performance measures
    weighted_measures <- colSums(conf_mat$byClass * colSums(conf_mat$table))/sum(colSums(conf_mat$table))
    # Compute unweighted performance measures (divide by number of classes, each class equally important)
    macro_measures <- colSums(conf_mat$byClass)/nrow(conf_mat$byClass)
    # Replace NaNs by 0 when they occur
    weighted_measures[is.nan(weighted_measures)] <- 0
    macro_measures[is.nan(macro_measures)] <- 0
  }
  return(cbind(x,
               as.data.frame(t(conf_mat$overall)),
               'weighted' = t(as.data.frame(weighted_measures)),
               'macro' = t(as.data.frame(macro_measures)),
               pos_cat = conf_mat$positive,
               conf_mat = I(list(conf_mat))
  )
  )
}
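
## Sketch: score the estimator results from a CV run (x needs the list-columns
## "pred" and "tv", one row per estimated model):
scores <- metric_gen(preds)
scores$Accuracy  # plus weighted and macro columns per performance measure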
@@ -0,0 +1,301 @@
#' Generate a classification model
#'
#' Generate a nested cross-validated classification model based on a dfm with class labels as docvars
#' Currently only supports Naïve Bayes using quanteda's textmodel_nb
#' Hyperparameter optimization is enabled through the grid parameter
#' A grid should be generated from vectors with the labels as described for each model, using the crossing() command
#' For Naïve Bayes, the following parameters can be used:
#' - percentiles (cutoff point for tf-idf feature selection)
#' - measures (what measure to use for determining feature importance, see textstat_keyness for options)
#' @param dfm A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars
#' @param cores_outer Number of cores to use for outer CV (cannot be more than the number of outer folds)
#' @param cores_grid Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)
#' @param cores_inner Number of cores to use for inner CV loop (cannot be more than number of inner CV folds, multiplies with cores_outer and cores_grid)
#' @param cores_feats Number of cores to use for feature selection (multiplies with cores_outer, cores_grid and cores_inner)
#' @param seed Integer to use as seed for random number generation, ensures replicability
#' @param outer_k Number of outer cross-validation folds (for performance estimation)
#' @param inner_k Number of inner cross-validation folds (for hyperparameter optimization and feature selection)
#' @param model Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)
#' @param class_type Type of classification to model ("junk", "aggregate", or "codes")
#' @param opt_measure Label of measure in confusion matrix to use as performance indicator
#' @param country Two-letter country abbreviation of the country the model is estimated for (used for filename)
#' @param grid Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)
#' @return An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
#' @export
#' @examples
#' modelizer_old(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
#################################################################################################
#################################### Function to generate classification models #################
#################################################################################################

modelizer_old <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
  ### Functions ###
  feat_select <- function (topic, dfm, class_type, percentile, measure) {
    keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
      na.omit()
    keyness[,2] <- abs(keyness[,2])
    keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
    return(keyness)
  }
  ### Generate inner folds for nested cv
  inner_loop <- function(fold, dfm, inner_k, class_type) {
    # RNG needs to be set explicitly for each fold
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    ## Either createDataPartition for simple holdout parameter optimization
    ## Or createFolds for proper inner CV for nested CV
    # if (inner_k <= 1) {
    #   inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
    # } else {
    inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k = inner_k)
    # }
    return(c(outer_fold = list(fold), inner_folds))
  }

  ### Generate outer folds for nested cv
  generate_folds <- function(outer_k, inner_k, dfm, class_type){
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    folds <- createFolds(as.factor(docvars(dfm, class_type)), k = outer_k)
    return(lapply(folds, inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type))
  }

  ### Gets called for every parameter combination, and calls classifier for every inner cv fold
  inner_cv <- function(row, grid, outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
    print(str_c('params ',row))
    params <- grid[row,]
    # For each inner fold, cross validate the specified parameters
    res <-
      bind_rows(mclapply(inner_folds,
                         classifier,
                         outer_fold = outer_fold,
                         params = params,
                         dfm = dfm,
                         class_type = class_type,
                         model = model,
                         cores_feats = cores_feats,
                         mc.cores = cores_inner
      )
      )
    # print(res)
    # print(res[1,1])
    # print('inner_cv')
    return(cbind(as.data.frame(t(colMeans(select(res, 1:`Balanced Accuracy`)))),params))
  }

  ### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
  outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
    print('outer cv')
    # If fold contains both inner folds and outer fold
    if (length(fold) == inner_k + 1) {
      inner_folds <- fold[-1]
      outer_fold <- fold$outer_fold
      # For each row in grid, cross-validate results
      res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
                                inner_cv,
                                cores_feats = cores_feats,
                                grid = grid,
                                dfm = dfm,
                                class_type = class_type,
                                model = model,
                                outer_fold = outer_fold,
                                inner_folds = inner_folds,
                                cores_inner = cores_inner,
                                mc.cores = cores_grid)
      )
      # print(res)
      # print(res[1,1])
      # print('outer_cv')
      # Determine optimum hyperparameters within outer fold training set
      optimum <- res[which.max(res[,opt_measure]),] %>%
        select(percentiles:ncol(.))
      # Validate performance of optimum hyperparameters on outer fold test set
      return(classifier(NULL, outer_fold = outer_fold, params = optimum, dfm = dfm, class_type = class_type, model = model, cores_feats = cores_feats))
    } else {
      # If no outer fold, go directly to parameter optimization using inner folds, and return performance of hyperparameters
      inner_folds <- fold
      outer_fold <- NULL
      res <- bind_rows(mclapply(seq(1,length(grid[[1]]),1),
                                inner_cv,
                                cores_feats = cores_feats,
                                grid = grid,
                                dfm = dfm,
                                class_type = class_type,
                                model = model,
                                outer_fold = outer_fold,
                                inner_folds = inner_folds,
                                cores_inner = cores_inner,
                                mc.cores = cores_grid)
      )
      # print(res)
      # print(res[1,1])
      # print('line 126, final model parameter optimization')
      return(res)
    }
  }

  # ### Custom tfidf function to allow same idf for different dfm's
  # custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
  #   if (!nfeat(x) || !ndoc(x)) return(x)
  #   tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
  #   if (nfeat(x) != length(dfreq))
  #     stop("missing some values in idf calculation")
  #   # get the document indexes
  #   j <- as(tfreq, "dgTMatrix")@j + 1
  #   # replace just the non-zero values by product with idf
  #   x@x <- tfreq@x * dfreq[j]
  #   # record attributes
  #
  #   ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2
  #
  #   # x@weightTf <- tfreq@weightTf
  #   # x@weightDf <- c(list(scheme = scheme_df, base = base), args)
  #   return(x)
  # }

  ### Classification function
  classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
    # If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
    if (length(inner_fold) > 0 && length(outer_fold) > 0) {
      dfm_train <- dfm[-outer_fold,] %>%
        .[-inner_fold,]
      dfm_test <- dfm[-outer_fold,] %>%
        .[inner_fold,]
    # If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
    } else if (length(outer_fold) > 0 ) {
      dfm_train <- dfm[-outer_fold,]
      dfm_test <- dfm[outer_fold,]
      validation_cv <- T
    # If only inner folds, validate performance directly on inner folds (is the same as above?)
    } else if (length(inner_fold) > 0 ) {
      dfm_train <- dfm[-inner_fold,]
      dfm_test <- dfm[inner_fold,]
    # If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
    } else {
      final <- T ### Indicate final modeling run on whole dataset
      dfm_train <- dfm
    }
    if (model == 'nb') {
      scheme_tf <- 'count' # The 'old' way
    } else {
      scheme_tf <- 'prop' # The 'new' way
    }
    ### Getting features from training dataset
    # Getting idf from training data, and using it to normalize both training and testing feature occurrence
    dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    # Added unique to filter out duplicate words; these are caused when there are multiple categories, and a word scores higher
    # than the threshold on two or more of those categories
    words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
                                    feat_select,
                                    dfm = dfm_train,
                                    class_type = class_type,
                                    percentile = params$percentiles,
                                    measure = params$measures,
                                    mc.cores = cores_feats
    )))
    # dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)

    if (model == "nb") {
      text_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, class_type), smooth = .001, prior = "uniform", distribution = "multinomial")
    }
    if (model == "svm") {
      text_model <- svm(x=dfm_train, y=as.factor(docvars(dfm_train, class_type)), type = "C-classification", kernel = params$kernel, gamma = params$gamma, cost = params$cost, epsilon = params$epsilon)
    }
    if (model == 'nnet') {
      idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
      text_model <- nnet(dfm_train, idC, decay = params$decay, size=params$size, maxit=params$maxit, softmax=T, reltol = params$reltol, MaxNWts = params$size*(length(dfm_train@Dimnames$features)+1)+(params$size*2)+2)
    }
    # if (model == 'neuralnet') {
    #   dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
    #   dfm_test <- custom_tfidf(dfm_test, scheme_tf = "prop", scheme_df = "inverse", base = 10, dfreq = dfreq[words])
    #
    #   idC <- class.ind(as.factor(docvars(dfm_train, class_type)))
    #   colnames(idC) <- NULL
    #   nn_train <- cbind(idC, dfm_train)
    #   n <- colnames(nn_train)[3:length(colnames(nn_train))]
    #   f <- as.formula(paste0("feat1 + feat2 ~ `", paste0(n, collapse = "` + `"),"`"))
    #   idC_out <- class.ind(as.factor(docvars(dfm_test, class_type)))
    #   colnames(idC_out) <- NULL
    #   nn_test <- cbind(idC_out, dfm_test)
    #   nn <- neuralnet(f,data=nn_train,hidden=3,linear.output=F,act.fct = 'logistic',lifesign = 'full', threshold = .005)
    #   pr.nn <- compute(nn,nn_test[,3:length(colnames(nn_test))])
    #   class_table <- table(prediction = as.matrix(round(pr.nn$net.result[,2])), trueValues = as.matrix(idC_out[,2]))
    #   nn_conf_mat <- confusionMatrix(class_table, mode = "everything")
    # }
    ### Add more if statements for different models
    if (exists("final")) {
      return(list(text_model=text_model, idf=idf))
    } else {
      ### Removing all features not in training set from test set and weighting the remaining features according to training idf
      dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
      dfm_test <- dfm_weight(dfm_test, weights = idf)
      pred <- predict(text_model, newdata = dfm_test, type = 'class')
      ### Fix for single-class 'predictions' in borderline situations
      # if (length(unique(pred)) == 1 & class_type == 'junk') {
      #   if (unique(pred) == '0') {
      #     pred[1] <- '1'
      #   } else {
      #     pred[1] <- '0'
      #   }
      # }

      ### Fix for missing classes in multiclass classification
      u <- union(pred, docvars(dfm_test, class_type))

      class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u))
      if (length(unique(u)) == 2) {
        conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
      } else {
        conf_mat <- confusionMatrix(class_table, mode = "everything")
        conf_mat$positive <- NA
      }
      if (exists("validation_cv")) {
        return(data.frame(
          tv = docvars(dfm_test, class_type),
          pred = pred,
          params = params,
          pos_cat = conf_mat$positive,
          stringsAsFactors = F
        ))
      }
      if (is.matrix(conf_mat$byClass) == T) {
        return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params))
      } else {
        return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(conf_mat$byClass)),params, pos_cat = conf_mat$positive))
      }
    }
  }

  ### If outer_k is smaller than 1, do a holdout training run, with only cross-validation for parameter optimization; else, do nested CV
  ### If holdout, training/test distribution is the same as for inner CV
  if (outer_k < 1) {
    set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
    outer_fold <- createDataPartition(as.factor(docvars(dfm, class_type)), p = outer_k)
    folds <- lapply(outer_fold, inner_loop, dfm = dfm, inner_k = inner_k, class_type = class_type)
  } else {
    ## Generate nested CV folds, based on number of inner and outer folds defined (see start of script)
    folds <- generate_folds(outer_k, inner_k = inner_k, dfm = dfm, class_type = class_type)
  }

  ## Get performance of each outer fold validation, and add row with mean scores (this is the final performance indicator)
  performance <- mclapply(folds, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer)

  ## Set seed and generate folds for final hyperparameter optimization search (using CV)
  set.seed(seed, kind = "Mersenne-Twister", normal.kind = "Inversion")
  folds_final <- list(createFolds(as.factor(docvars(dfm, class_type)), k = inner_k))

  ## Get the final hyperparameter performance for all value combinations in grid
  params_final <- bind_rows(mclapply(folds_final, outer_cv, grid=grid, dfm=dfm, class_type=class_type, model=model, cores_grid=cores_grid, cores_inner=cores_inner, cores_feats=cores_feats, mc.cores = cores_outer))

  ## Select optimum final hyperparameters
  optimum_final <- params_final[which.max(params_final[,opt_measure]),] %>%
    select(percentiles:ncol(.))

  ## Estimate final model on whole dataset, using optimum final hyperparameters determined above
  model_final <- classifier(NULL, outer_fold = NULL, params = optimum_final, dfm = dfm, class_type = class_type, model = model, cores_feats = max(c(cores_feats,cores_grid,cores_inner,cores_outer)))
  rm(list=setdiff(ls(), c("model_final", "optimum_final","params_final","performance","grid","folds","folds_final","country","model","class_type","opt_measure")), envir = environment())
  save(list = ls(all.names = TRUE), file = paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'), envir = environment())
  return(paste0(getwd(),'/',country,'_',model,'_',class_type,'_',opt_measure,'_',Sys.time(),'.RData'))
}
@@ -0,0 +1,89 @@
#' Parse raw text into a single field
#'
#' Parse raw text from the MaML database into a single field
#' @param out The original output data frame
#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching a regex (see code)
#' @return A parsed output data frame including the additional column 'merged', containing the merged text
#' @export
#' @examples
#' out_parser(out, field)

#################################################################################################
#################################### Parser function for output fields ##########################
#################################################################################################
out_parser <- function(out, field, clean = F) {
  fncols <- function(data, cname) {
    add <- cname[!cname %in% names(data)]

    if (length(add) != 0) data[, (add) := (NA)]
    data
  }

  out <- fncols(data.table(out), c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
  par_parser <- function(row, out, field, clean) {
    doc <- out[row,]
    if (field == 'highlight') {

      doc <- doc %>%
        unnest(cols = starts_with("highlight")) %>%
        mutate(across(starts_with("highlight"), na_if, "NULL")) %>%
        mutate(highlight.title = coalesce(highlight.title, `_source.title`),
               highlight.subtitle = coalesce(highlight.subtitle, `_source.subtitle`),
               highlight.preteaser = coalesce(highlight.preteaser, `_source.preteaser`),
               highlight.teaser = coalesce(highlight.teaser, `_source.teaser`),
               highlight.text = coalesce(highlight.text, `_source.text`)
        ) %>%
        mutate(highlight.title = str_replace_na(highlight.title, replacement = ''),
               highlight.subtitle = str_replace_na(highlight.subtitle, replacement = ''),
               highlight.preteaser = str_replace_na(highlight.preteaser, replacement = ''),
               highlight.teaser = str_replace_na(highlight.teaser, replacement = ''),
               highlight.text = str_replace_na(highlight.text, replacement = '')
        ) %>%
        mutate(
          merged = str_c(highlight.title,
                         highlight.subtitle,
                         highlight.preteaser,
                         highlight.teaser,
                         highlight.text,
                         '',
                         sep = ". ")
        )
    }

    if (field == '_source') {
      doc <- doc %>%
        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = ''),
               `_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = ''),
               `_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = ''),
               `_source.teaser` = str_replace_na(`_source.teaser`, replacement = ''),
               `_source.text` = str_replace_na(`_source.text`, replacement = '')
        ) %>%
        mutate(
          merged = str_c(`_source.title`,
                         `_source.subtitle`,
                         `_source.preteaser`,
                         `_source.teaser`,
                         `_source.text`,
                         '',
                         sep = ". ")
        )
    }

    ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurrences
    # Remove html tags, and multiple consecutive whitespaces
    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
    # Regex also used in merger function
    ### Old regex, used for duplicate detection:
    # \\S*?[0-9@#$%]+[^\\s!?.,;:]*
    doc$merged <- doc$merged %>%
      str_replace_all("<.{0,20}?>", " ") %>%
      str_replace_all('(\\. ){2,}', '. ') %>%
      str_replace_all('([!?.])\\.','\\1') %>%
      str_replace_all("\\s+"," ") %>%
      {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "") else . }
    return(doc)
  }
  return(par_parser(1:nrow(out), out=out, clean=clean, field=field))
}
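
## Sketch: merge the raw source fields of an elasticizer batch into a single
## 'merged' column, stripping words that contain numbers or @#$% symbols:
parsed <- out_parser(out, field = '_source', clean = T)
substr(parsed$merged[1], 1, 80)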
@@ -0,0 +1,71 @@
#' Preprocess dfm data for use in modeling procedure
#'
#' Process dfm according to parameters provided in params
#'
#' @param dfm_train Training dfm
#' @param dfm_test Testing dfm if applicable, otherwise NULL
#' @param params Row from grid with parameter optimization
#' @param we_vectors Data frame with word embedding vectors (first column, V1, holds the words)
#' @return List with dfm_train and dfm_test, processed according to the parameters in params (plus the document-level word embedding sums, when we_vectors is given)
#' @export
#' @examples
#' preproc(dfm_train, dfm_test = NULL, params, we_vectors)
#################################################################################################
#################################### Preprocess data ############################################
#################################################################################################

### CURRENTLY UNUSED!!! ###

preproc <- function(dfm_train, dfm_test = NULL, params, we_vectors) {
  # Remove non-existing features from training dfm
  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
  if (params$tfidf) {
    idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_weight(dfm_test, weights = idf)
    }
  } else {
    idf <- NULL
  }

  if (!is.null(params$feat_percentiles) && !is.null(params$feat_measures)) {
    # Keeping unique words that are important to one or more categories (see textstat_keyness and feat_select)
    words <- unique(unlist(lapply(unique(docvars(dfm_train, params$class_type)),
                                  feat_select,
                                  dfm = dfm_train,
                                  class_type = params$class_type,
                                  percentile = params$feat_percentiles,
                                  measure = params$feat_measures
    )))
    dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=F)
  }

  if (!is.null(we_vectors)) {
    shared_dict <- sort(intersect(dfm_train@Dimnames$features, we_vectors$V1))
    dfm_test_we_sum <- NULL # Stays NULL when there is no test set
    if (!is.null(dfm_test)) {
      shared_dict <- sort(intersect(dfm_test@Dimnames$features, shared_dict))
      dfm_test <- dfm_keep(dfm_test, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
        .[, sort(colnames(.))]
    }
    dfm_train <- dfm_keep(dfm_train, pattern = shared_dict, valuetype = "fixed", case_insensitive=F) %>%
      .[, sort(colnames(.))]
    we_matrix <- filter(we_vectors, V1 %in% shared_dict) %>%
      arrange(V1) %>%
      as.data.table(.) %>%
      .[,2:ncol(.), with = F] %>%
      as.matrix(.)

    dfm_train_we_sum <- dfm_train %*% we_matrix
    # dfm_train_we_mean <- dfm_train_we_sum / as.vector(rowSums(dfm_train))

    if (!is.null(dfm_test)) {
      dfm_test_we_sum <- dfm_test %*% we_matrix
      # dfm_test_we_mean <- dfm_test_we_sum / as.vector(rowSums(dfm_test))
    }
    return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf, dfm_train_we = dfm_train_we_sum, dfm_test_we = dfm_test_we_sum))
  }
  return(list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf))
}
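
## The dfm %*% we_matrix step above produces, per document, the count-weighted
## sum of its word embedding vectors; the commented-out division by rowSums()
## would turn that sum into a mean. A toy illustration of the same algebra:
counts <- matrix(c(2, 1, 0,   # doc1 term counts
                   0, 3, 1),  # doc2 term counts
                 nrow = 2, byrow = TRUE)
we <- matrix(rnorm(3 * 5), nrow = 3)  # 3 terms x 5 embedding dimensions
doc_we_sum <- counts %*% we           # 2 documents x 5 dimensions
doc_we_mean <- doc_we_sum / rowSums(counts)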
@@ -0,0 +1,241 @@
|
|||||||
|
#' Generate actor search queries based on data in actor db
|
||||||
|
#'
|
||||||
|
#' Generate actor search queries based on data in actor db
|
||||||
|
#' @param actor A row from the output of elasticizer() when run on the 'actor' index
|
||||||
|
#' @param pre_tags Highlighter pre-tag
|
||||||
|
#' @param post_tags Highlighter post-tag
|
||||||
|
#' @return A data frame containing the queries, related actor ids and actor function
|
||||||
|
#' @export
|
||||||
|
#' @examples
|
||||||
|
#' query_gen_actors(actor,country)
|
||||||
|
|
||||||
|
#################################################################################################
|
||||||
|
#################################### Actor search query generator ###############################
|
||||||
|
#################################################################################################
|
||||||
|
query_gen_actors <- function(actor, pre_tags, post_tags) {
|
||||||
|
generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
|
||||||
|
return(paste0('{"_source": ["ud","title","subtitle","preteaser","teaser","text"],
|
||||||
|
"query":
|
||||||
|
{"bool": {
|
||||||
|
"filter":[
|
||||||
|
{"term":{"country":"',country,'"}},
|
||||||
|
{"range":{"publication_date":{"gte":"',startdate,'","lte":"',enddate,'"}}},
|
||||||
|
{"query_string" : {
|
||||||
|
"default_operator" : "OR",
|
||||||
|
"allow_leading_wildcard" : "false",
|
||||||
|
"fields": ["text","teaser","preteaser","title","subtitle"],
|
||||||
|
"query" : "', querystring,'"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"must_not":[
|
||||||
|
{"term":{"computerCodes.actors.keyword":"',actorid,'"}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"highlight" : {
|
||||||
|
"fields" : {
|
||||||
|
"text" : {},
|
||||||
|
"teaser" : {},
|
||||||
|
"preteaser" : {},
|
||||||
|
"title" : {},
|
||||||
|
"subtitle" : {}
|
||||||
|
},
|
||||||
|
"number_of_fragments": 0,
|
||||||
|
"order": "none",
|
||||||
|
"type":"unified",
|
||||||
|
"fragment_size":0,
|
||||||
|
"pre_tags":"', pre_tags,'",
|
||||||
|
"post_tags": "',post_tags,'"
|
||||||
|
}
|
||||||
|
}'))
|
||||||
|
}
|
||||||
|
prox_gen <- function(row, grid) {
|
||||||
|
return(
|
||||||
|
paste0('\\"',grid[row,]$first,' ',grid[row,]$last,'\\"~',grid[row,]$prox)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
country <- actor$`_source.country`
|
||||||
|
### Setting linguistic forms for each country ###
|
||||||
|
if (country == "no" | country == "dk") {
|
||||||
|
genitive <- 's'
|
||||||
|
definitive <- 'en'
|
||||||
|
} else if (country == 'uk') {
|
||||||
|
genitive <- '\'s'
|
||||||
|
} else if (country == 'nl' | country == 'be') {
|
||||||
|
genitive <- 's'
|
||||||
|
}
|
||||||
|
|
||||||
|
  ### Generating queries for individuals (ministers, PM, party leaders and MPs)
  if (actor$`_source.function` == "JunMin" | actor$`_source.function` == "Min" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader" | actor$`_source.function` == "MP") {
    ## Adding a separate AND clause for inclusion of only the last name, to highlight all occurrences of the last name,
    ## regardless of whether the last name hit comes from a minister name or a full-name proximity hit

    ### If the country is Belgium, check if there is an apostrophe in the middle names; if so, search for the last name both capitalized and lowercased
    if (country == 'be' && T %in% str_detect(actor$`_source.middleNames`,"'")) {
      last_list <- c(actor$`_source.lastName`, str_c(actor$`_source.lastName`,genitive), tolower(actor$`_source.lastName`), str_c(tolower(actor$`_source.lastName`),genitive))
      grid <- crossing(first = actor$`_source.firstName`, last = last_list, prox = 5)
      fullname <- lapply(1:nrow(grid), prox_gen, grid = grid)
      query_string <- paste0('((',
                             paste0(unlist(fullname), collapse = ' OR '),') AND ',
                             paste0('(',paste0(unlist(last_list), collapse = ' OR '),')'))
    } else {
      last_list <- c(actor$`_source.lastName`, str_c(actor$`_source.lastName`,genitive))
      grid <- crossing(first = actor$`_source.firstName`, last = last_list, prox = 5)
      fullname <- lapply(1:nrow(grid), prox_gen, grid = grid)
      query_string <- paste0('((',
                             paste0(unlist(fullname), collapse = ' OR '),') AND ',
                             paste0('(',paste0(unlist(last_list), collapse = ' OR '),')'))
    }

    ### If the actor is a minister, generate a minister search
    if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM") {
      if (country == "no" || country == "dk") {
        minister <- str_split(actor$`_source.ministerSearch`, pattern = '-| ') %>%
          map(1)

        capital <- unlist(str_to_title(minister))
        capital_def <- unlist(str_c(capital, definitive))
        def <- unlist(str_c(minister,definitive))
        minister <- unlist(c(minister,capital,capital_def,def))
      }
      if (country == "uk") {
        minister <- c(str_to_title(actor$`_source.ministerName`),
                      actor$`_source.ministerName`)
        if (actor$`_source.function` == "PM") {
          minister <- c(minister,
                        "PM")
        }
      }
      if (country == "nl" | country == "be") { # If the country is nl or be, add a requirement for Minister to the query
        minister <- c("Minister",
                      "minister")
        if (actor$`_source.function` == "PM") {
          minister <- c(minister,
                        "Premier",
                        "premier")
        }
        if (actor$`_source.function` == "JunMin") {
          minister <- c("Staatssecretaris",
                        "staatssecretaris")
        }
      }
      grid <- crossing(first = last_list, last = minister, prox = 5)
      ministername <- lapply(1:nrow(grid), prox_gen, grid = grid)
      query_string <- paste0(query_string,') OR ((',
                             paste0(unlist(ministername), collapse = ' OR '),') AND ',
                             paste0('(',paste0(unlist(last_list), collapse = ' OR '),'))'))
    } else { ### Else, generate a search for first/last name only (MPs and party leaders, currently)
      query_string <- paste0(query_string,')')
    }
    ids <- list(c(actor$`_source.actorId`,str_c(actor$`_source.partyId`,'_a')))
    actorid <- actor$`_source.actorId`
    query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
    return(data.frame(query = query, ids = I(ids), prefix = NA, postfix = NA, stringsAsFactors = F))
  }

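  ## Illustrative sketch (not part of the original code; the actor is made up):
  ## for a hypothetical UK PM "Jane Doe", the blocks above compose query_string roughly as
  ##   ((\"Jane Doe\"~5 OR \"Jane Doe's\"~5) AND (Doe OR Doe's)) OR
  ##   ((\"Doe Prime Minister\"~5 OR \"Doe PM\"~5 OR ...) AND (Doe OR Doe's))
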
  ### Query generation for party searches
  if (actor$`_source.function` == "Party") {
    # actor$`_source.startDate` <- "2000-01-01"
    # actor$`_source.endDate` <- "2099-01-01"
    if (nchar(actor$`_source.partyNameSearchShort`[[1]]) > 0) {
      # If uk, no or dk, search for both regular abbreviations and genitive forms
      if (country == "uk" | country == "no" | country == "dk") {
        gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive))
        # For no and dk, only keep genitive forms when the party abbreviation is longer than 1 character (2 including the genitive s itself);
        # this filter has to run before the names string is built, so that the dropped forms do not end up in the query
        if (country == "dk" | country == "no") {
          gen <- gen[which(nchar(gen) > 2)]
        }
        names <- paste(unlist(c(gen,actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"')
      } else {
        names <- paste(unlist(actor$`_source.partyNameSearchShort`), collapse = '\\" \\"')
      }
      query_string <- paste0('(\\"',unlist(names),'\\")')
      ids <- str_c(actor$`_source.partyId`,'_s')
      actorid <- str_c(actor$`_source.partyId`,'_s')
      query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
      df1 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
    }
    if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
      if (country == "uk" | country == "no" | country == "dk") {
        gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive))
        names <- paste(unlist(c(gen,actor$`_source.partyNameSearch`)), collapse = '\\" \\"')
      } else {
        names <- paste(unlist(actor$`_source.partyNameSearch`), collapse = '\\" \\"')
      }
      query_string <- paste0('(\\"',unlist(names),'\\")')
      ids <- str_c(actor$`_source.partyId`,'_f')
      actorid <- str_c(actor$`_source.partyId`,'_f')
      query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
      if (country == 'uk' | country == 'nl' | country == 'be') {
        df2 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
      } else {
        df2 <- data.frame(query = query, ids = I(ids), prefix = NA, postfix = NA, stringsAsFactors = F)
      }
    }
    if (exists('df1') & exists('df2')) {
      return(bind_rows(df1,df2))
    } else if (exists('df1')) {
      return(df1)
    } else if (exists('df2')) {
      return(df2)
    }
  }

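  ## Illustrative sketch (not part of the original code; the ids are made up): for a
  ## hypothetical Danish party P_123 abbreviated 'SD', the short-name branch yields
  ##   query_string = '(\"SDs\" \"SD\")'  and a df1 row with ids = 'P_123_s',
  ## with the notPrecededBy/notFollowedBy regexes passed along as prefix/postfix.
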
  ### Institution function currently not used
  # if (actor$`_source.function` == "Institution") {
  #   # uppercasing
  #   firstup <- function(x) {
  #     substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  #     x
  #   }
  #   actor$`_source.startDate` <- "2000-01-01"
  #   actor$`_source.endDate` <- "2099-01-01"
  #   if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
  #     upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
  #     upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
  #                unlist(lapply(upper, str_c, definitive)),
  #                unlist(lapply(upper, str_c, definitive_genitive)))
  #     capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
  #     capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
  #                  unlist(lapply(capital, str_c, definitive)),
  #                  unlist(lapply(capital, str_c, definitive_genitive)))
  #     base <- actor$`_source.institutionNameSearch`
  #     base <- c(base, unlist(lapply(base, str_c, genitive)),
  #               unlist(lapply(base, str_c, definitive)),
  #               unlist(lapply(base, str_c, definitive_genitive)))
  #     names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
  #     query_string <- paste0('(\\"',names,'\\")')
  #     ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
  #     actorid <- str_c(actor$`_source.institutionId`,'_f')
  #     query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
  #     df1 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
  #   }
  #   if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
  #     upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
  #     upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
  #                unlist(lapply(upper, str_c, definitive)),
  #                unlist(lapply(upper, str_c, definitive_genitive)))
  #     capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
  #     capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
  #                  unlist(lapply(capital, str_c, definitive)),
  #                  unlist(lapply(capital, str_c, definitive_genitive)))
  #     base <- actor$`_source.institutionNameSearchShort`
  #     base <- c(base, unlist(lapply(base, str_c, genitive)),
  #               unlist(lapply(base, str_c, definitive)),
  #               unlist(lapply(base, str_c, definitive_genitive)))
  #     names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
  #     query_string <- paste0('(\\"',names,'\\")')
  #     ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
  #     actorid <- str_c(actor$`_source.institutionId`,'_s')
  #     query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
  #     df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
  #   }
  #   if (exists('df1') & exists('df2')) {
  #     return(bind_rows(df1,df2))
  #   } else if (exists('df1')) {
  #     return(df1)
  #   } else if (exists('df2')) {
  #     return(df2)
  #   }
  # }
}
@ -0,0 +1,60 @@
#' Generate a query string query for ElasticSearch
#'
#' Generate a query string query for ElasticSearch
#' @param query Query string in ElasticSearch query string format
#' @param fields List of field names to return, defaults to all
#' @param random Return randomized results. Boolean, defaults to FALSE
#' @param default_operator Default Boolean operator ("AND" or "OR") between terms in the query string, defaults to "AND"
#' @return A formatted ElasticSearch query string query
#' @export
#' @examples
#' query_string(query)
#################################################################################################
#################################### Get data from ElasticSearch ################################
#################################################################################################

query_string <- function(query, fields = F, random = F, default_operator = "AND") {
  if (typeof(fields) == 'logical') {
    fields <- '*'
  }
  if (random == T) {
    return(paste0(
      '{
        "_source": ',toJSON(fields),',
        "query": {
          "function_score": {
            "query": {
              "bool":{
                "filter": [{
                  "query_string" : {
                    "query" : "',query,'",
                    "default_operator": "',default_operator,'",
                    "allow_leading_wildcard" : false
                  }
                }]
              }
            },
            "random_score": {},
            "boost_mode": "sum"
          }
        }
      }'
    ))
  } else {
    return(paste0(
      '{
        "_source": ',toJSON(fields),',
        "query": {
          "bool":{
            "filter": [{
              "query_string" : {
                "query" : "',query,'",
                "default_operator": "',default_operator,'",
                "allow_leading_wildcard" : false
              }
            }]
          }
        }
      }'
    ))
  }
}
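## Illustrative usage sketch (not part of the original code; assumes toJSON comes from jsonlite).
## Requesting two fields with the default operator produces a JSON body along these lines:
# query_string('title:parliament', fields = c('title','text'))
# '{"_source": ["title","text"], "query": {"bool": {"filter": [{"query_string": {
#    "query": "title:parliament", "default_operator": "AND", "allow_leading_wildcard": false}}]}}}'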
@ -0,0 +1,260 @@
#' Aggregate sentence-level dataset containing sentiment (from sentencizer())
#'
#' Aggregate sentence-level dataset containing sentiment (from sentencizer())
#' @param df Data frame with actor ids, produced by sentencizer
#' @param actors_meta Optional data frame containing actor metadata obtained using elasticizer(index="actors")
#' @param actor_groups Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)
#' @param pos_cutoff Optional value above which sentence-level sentiment scores should be considered "positive"
#' @param neg_cutoff Optional value below which sentence-level sentiment scores should be considered "negative"
#' @param single_party Boolean to generate data only from sentences in which a single party is mentioned, defaults to FALSE
#' @return When no actor_groups are supplied, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When actor_groups are supplied, returns aggregations for each vector in the list
#' @export
#' @examples
#' sent_merger(df, actors_meta, actor_groups = NULL)
#################################################################################################
#################################### Generate actor-article dataset #############################
#################################################################################################

### NOTE: The exceptions for various partyId_a ids have been implemented because of an error with
### some individual actors, where the partyId of an individual actor doesn't match an actual
### partyId in the actor dataset

sent_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff = NULL, neg_cutoff = NULL, single_party = F) {
  grouper <- function(id2, df) {
    # Prevent usage of deprecated partyId_a ids, which are incorrect and should no longer be used
    if (any(str_ends(id2, '_a'))) {
      return("You're seemingly using a deprecated [partyId]_a id in your aggregations")
    }
    return(df[ids %in% id2,] %>%
             .[!duplicated(.,by = c('id','sentence_id')),.(
               actor.sent = sum(sent_binary_weighted)/sum(words),
               actor.sent_words = sum(sent_words),
               actor.words = sum(words),
               # actor.arousal = sum(abs(sent_binary_weighted))/sum(words),
               actor.first = min(sentence_id),
               actor.occ = .N,
               publication_date = first(publication_date),
               ids = str_c(id2, collapse = '-')
             ), by = c('id')]
    )
  }

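  ## Illustrative sketch (not part of the original code; the ids are made up):
  ## grouper() pools all mentions of the ids in one vector, e.g.
  ## id2 = c('P_123_f','P_123_s'), into a single per-article row whose ids label
  ## becomes 'P_123_f-P_123_s'.
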
  ## Remove some of the metadata from the source df
  df <- data.table(df)[,.(
    (.SD),
    doctype = as.factor(`_source.doctype`),
    publication_date = as.Date(`_source.publication_date`),
    id = as.factor(`_id`)
  ), .SDcols = !c('_source.doctype','_source.publication_date','_id')]

  ## Create placeholder variables if sentiment is not scored
  if (!"sent_sum" %in% colnames(df)) {
    df <- df[,.(
      (.SD),
      sent_words = 0,
      sent_sum = 0
    )]
  }

  ## Unnest to sentence level
  ## Check if the raw sentiment data contains actor ids
  if ('ids' %in% colnames(df)) {
    df <- df[,lapply(.SD, unlist, recursive=F),
             .SDcols = c('sentence_id', 'sent_sum', 'words', 'sent_words','ids'),
             by = list(id,publication_date,doctype)]
  } else {
    df <- df[,lapply(.SD, unlist, recursive=F),
             .SDcols = c('sentence_id', 'sent_sum', 'words', 'sent_words'),
             by = list(id,publication_date,doctype)]
  }

  ## Compute sentence-level sentiment, classify it against the cutoffs, and weight by sentence length
  df <- df[,.(
    (.SD),
    sent = sent_sum/words
  )][,.(
    (.SD),
    sent_binary = case_when(
      sent > pos_cutoff ~ 1,
      sent == 0 ~ 0,
      sent >= neg_cutoff & sent <= pos_cutoff ~ 0,
      TRUE ~ -1
    )
  )][,.(
    (.SD),
    sent_binary_weighted = sent_binary*words
  )]

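  ## Illustrative sketch (not part of the original code; the cutoffs are made up):
  ## with pos_cutoff = 0.1 and neg_cutoff = -0.1, a 20-word sentence with sent = 0.15
  ## gets sent_binary = 1 and sent_binary_weighted = 20, while sent = -0.05 falls
  ## inside the neutral band and scores 0.
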
  text_sent <- df[,
                  .(text.sent = sum(sent_binary_weighted)/sum(words),
                    text.sent_words = sum(sent_words),
                    text.words = sum(words),
                    text.arousal = sum(sent_words)/sum(words),
                    text.sentences = .N,
                    doctype = first(doctype),
                    publication_date = first(publication_date)
                  ), by = list(id)]

  ## Create aggregations according to the list of actorId vectors in actor_groups
  if (!is.null(actor_groups)) {
    output <- lapply(actor_groups,grouper, df = df) %>%
      rbindlist(.) %>%
      left_join(text_sent, by=c("id","publication_date")) %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      ) %>%
      mutate(across(where(is.character), as.factor)) %>%
      mutate(across(where(is.Date), as.factor))
    return(output)
  } else if (!is.null(actors_meta)) {
    text_noactors <- df[lengths(ids) == 0L,
                        .(noactor.sent = sum(sent_binary_weighted)/sum(words),
                          noactor.sent_words = sum(sent_words),
                          noactor.words = sum(words),
                          noactor.arousal = sum(sent_words)/sum(words),
                          noactor.first = min(sentence_id),
                          noactor.occ = .N
                        ), by = list(id)]

    all <- df[lengths(ids) > 0L,
              .(actor.sent = sum(sent_binary_weighted)/sum(words),
                actor.sent_words = sum(sent_words),
                actor.words = sum(words),
                actor.arousal = sum(sent_words)/sum(words),
                actor.first = min(sentence_id),
                actor.occ = .N,
                publication_date = first(publication_date),
                ids = 'all'), by = list(id)]

    ## Unnest to actor level
    df <- df[,.(ids = as.character(unlist(ids))),
             by = list(id,publication_date,sentence_id, sent_sum, words, sent_words,sent_binary_weighted)
             ][ # Remove deprecated actor_partyids from the ES database
               !str_ends(ids, '_a')]

    ## Prepare actor metadata
    colnames(actors_meta) <- str_replace(colnames(actors_meta),'_source.','')
    actors_meta <- data.table(actors_meta)[,
                                           .((.SD),
                                             startDate = as.Date(startDate),
                                             endDate = as.Date(endDate),
                                             ids = ifelse(!is.na(actorId), actorId, partyId)
                                           ), .SDcols = -c('_id','startDate','endDate','_index','_type','_score')
                                           ]

    ## Create a table with partyIds by date and actorId to join by
    actors_party <- actors_meta %>%
      group_by(ids,partyId,startDate,endDate) %>%
      summarise() %>%
      na.omit() %>%
      ungroup() %>%
      data.table(.)

    ## Add partyId to each actorId without filtering parties out
    df <- df %>%
      # Fill the partyId column for actor mentions
      actors_party[., c(colnames(.),'partyId'), # Join by actorId, within the active period (start/endDate)
                   on = .(ids == ids, startDate <= publication_date, endDate >= publication_date),
                   with = F] %>%
      # Fill the partyId column for party mentions
      .[is.na(partyId), partyId:=str_sub(ids, start = 1, end = -3)] %>%
      # Some actors seemingly belong to different parties on the same day, hence unique rows are based on both (actor)ids and partyId
      .[!duplicated(.,by = c('id','ids','sentence_id','partyId')),] # Keep all unique rows

    ## Remove sentences containing more than one party
    if (single_party) {
      # Create a variable indicating the number of unique party ids per sentence, and keep only sentences where unique parties == 1
      df <- df %>%
        .[, upid := length(unique(partyId)), by = c('id','sentence_id')] %>%
        .[upid == 1,]
    }

    ## Create aggregate measures for individual actors
    actors_merged <- df[str_starts(ids, 'A_')] %>%
      # Remove duplicate rows when an actor is counted multiple times in the same sentence, because of multiple functions or parties
      .[!duplicated(.,by = c('id','ids','sentence_id')),] %>%
      .[,
        .(actor.sent = sum(sent_binary_weighted)/sum(words),
          actor.sent_words = sum(sent_words),
          actor.words = sum(words),
          actor.arousal = sum(sent_words)/sum(words),
          actor.first = min(sentence_id),
          actor.occ = .N,
          publication_date = first(publication_date)), by = list(id, ids)]

    ## Create an actor metadata data frame per active date (one row per day per actor)
    actors_merged <- actors_meta[actors_merged,
                                 c('x.startDate','x.endDate',colnames(actors_merged), 'lastName','firstName','function.','gender','yearOfBirth','parlPeriod','partyId','ministerName','ministryId','actorId','startDate','endDate'),
                                 on =.(ids = ids, startDate <= publication_date, endDate >= publication_date),
                                 mult = 'all',
                                 with = F][,.(
                                   startDate = x.startDate,
                                   endDate = x.endDate,
                                   (.SD)
                                 ), .SDcols = -c('x.startDate', 'x.endDate','startDate','endDate')]

    ## Generate party-actor aggregations (mfsa)
    # Create the party data table
    parties_actors <- df %>%
      .[!duplicated(.,by = c('id','partyId','sentence_id')),.( # Remove rows (sentences) where a party is counted multiple times
        actor.sent = sum(sent_binary_weighted)/sum(words),
        actor.sent_words = sum(sent_words),
        actor.words = sum(words),
        actor.arousal = sum(sent_words)/sum(words),
        actor.first = min(sentence_id),
        actor.occ = .N,
        publication_date = first(publication_date)
      ), by = c('id','partyId')] # Summarize by article and partyId
    # Add party metadata
    parties_actors <- actors_meta[str_starts(ids, 'P_')][parties_actors, on = c('partyId'), mult = 'first'][!is.na(id),.(ids = str_c(partyId,"_mfsa"), (.SD)), .SDcols = -c('ids')]

    ## Generate party aggregations (mfs)
    parties <- df[str_ends(ids,'_f') | str_ends(ids,'_s'),.(
      ids = str_sub(ids, start = 1, end = -3),
      (.SD)
    ),.SDcols = -c('ids')] %>%
      .[!duplicated(.,by = c('id','ids','sentence_id')),.(
        actor.sent = sum(sent_binary_weighted)/sum(words),
        actor.sent_words = sum(sent_words),
        actor.words = sum(words),
        actor.arousal = sum(sent_words)/sum(words),
        actor.first = min(sentence_id),
        actor.occ = .N,
        publication_date = first(publication_date)
      ), by = c('id','ids')]
    parties <- actors_meta[parties, on = c('ids'), mult = 'first'][!is.na(id),.(ids = str_c(ids,"_mfs"), (.SD)), .SDcols = -c('ids')]

    ## Join all aggregations into a single data frame, compute derived actor-level measures, and add date dummies
    df <- bind_rows(actors_merged, parties, parties_actors, all) %>%
      left_join(.,text_sent, by=c("id","publication_date")) %>%
      left_join(.,text_noactors, by="id") %>%
      mutate(
        actor.prom = actor.occ/text.sentences,
        actor.rel_first = 1-(actor.first/text.sentences),
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      ) %>%
      ungroup() %>%
      select(-contains('Search'),-starts_with('not')) %>%
      mutate(across(where(is.character), as.factor)) %>%
      mutate(across(where(is.Date), as.factor))
    return(df)
  } else {
    df <- text_sent %>%
      mutate(
        year = strftime(publication_date, format = '%Y'),
        yearmonth = strftime(publication_date, format = '%Y%m'),
        yearmonthday = strftime(publication_date, format = '%Y%m%d'),
        yearweek = strftime(publication_date, format = "%Y%V")
      ) %>%
      ungroup() %>%
      mutate(across(where(is.character), as.factor)) %>%
      mutate(across(where(is.Date), as.factor))
    return(df)
  }
}

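## Illustrative usage sketch (not part of the original code; the ids and cutoffs are made up):
## aggregate two hypothetical parties into one group each, then build the dataset.
# groups <- list(c('P_123_f','P_123_s'), c('P_456_f','P_456_s'))
# sent_merger(df, actor_groups = groups, pos_cutoff = 0.1, neg_cutoff = -0.1)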
@ -0,0 +1,116 @@
#' Generate sentence-level dataset with sentiment and actor presence
#'
#' Generate sentence-level dataset with sentiment and actor presence
#' @param out Data frame produced by elasticizer
#' @param sent_dict Optional data frame containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.
#' @param localhost Unused, defaults to NULL
#' @param validation Boolean indicating whether human validation should be performed on sentiment scoring
#' @return No return value, data per batch is saved in an RDS file
#' @export
#' @examples
#' sentencizer(out, sent_dict = NULL, validation = F)
#################################################################################################
#################################### Generate sentence-level dataset ############################
#################################################################################################
sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F) {
  ## Despite the function name, parallel processing is not used, because it is slower
  par_sent <- function(row, out, sent_dict = NULL) {
    out <- out[row,]
    ## Create a df with article metadata (fields that are included in the elasticizer function)
    metadata <- out %>%
      select(`_id`,`_source.publication_date`,`_source.doctype`) %>%
      mutate(`_source.publication_date` = as.factor(`_source.publication_date`),
             `_source.doctype` = as.factor(`_source.doctype`))

    ## Unnest documents into individual words
    ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
      unnest(cols = colnames(.)) %>%
      select(-one_of('exists')) %>%
      unnest(cols = colnames(.)) %>%
      filter(upos != 'PUNCT')

    ## If there is a dictionary, apply it
    if (!is.null(sent_dict)) {
      ## If the dictionary contains the column lem_u, assume lemma_upos format
      if ("lem_u" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          mutate(lem_u = str_c(lemma,'_',upos)) %>%
          left_join(sent_dict, by = 'lem_u')
      ## If the dictionary contains the column lemma, assume simple lemma format
      } else if ("lemma" %in% colnames(sent_dict)) {
        ud_sent <- ud_sent %>%
          left_join(sent_dict, by = 'lemma') %>%
          mutate(lem_u = lemma)
      }

      ## Group by sentences, and generate dictionary scores per sentence
      ud_sent <- ud_sent %>%
        group_by(`_id`,sentence_id) %>%
        mutate(
          prox = case_when(
            is.na(prox) == T ~ 0,
            TRUE ~ prox
          )
        ) %>%
        summarise(sent_sum = sum(prox),
                  words = length(lemma),
                  sent_words = sum(prox != 0)
                  # sent_lemmas = list(lem_u[prox != 0])
        )
    ## If there is no dictionary, create a ud_sent with just sentence ids and word counts per sentence
    } else {
      ud_sent <- ud_sent %>%
        group_by(`_id`,sentence_id) %>%
        summarise(words = length(lemma))
    }

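    ## Illustrative sketch (not part of the original code; the numbers are made up):
    ## after the dictionary join, ud_sent holds one row per (_id, sentence_id), e.g.
    ## sent_sum = 2, words = 12, sent_words = 3 for a 12-word sentence in which three
    ## scored lemmas sum to 2.
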
    ## Remove the ud output from the source before further processing
    out <- select(out, -`_source.ud`)

    ## If dictionary validation, return just the sentences that have been hand-coded
    if (validation == T) {
      codes_sent <- ud_sent %>%
        left_join(.,out, by='_id') %>%
        rowwise() %>%
        filter(sentence_id == `_source.codes.sentence.id`)
      return(codes_sent)
    }

    if ("_source.computerCodes.actorsDetail" %in% colnames(out)) {
      ## If actor details are in the source, create a vector of actor ids for each sentence
      out <- out %>%
        unnest(`_source.computerCodes.actorsDetail`) %>%
        # mutate(ids_list = ids) %>%
        unnest(ids) %>%
        unnest(sentence_id) %>%
        group_by(`_id`,sentence_id) %>%
        summarise(
          ids = list(ids)
        )
    } else {
      ## If no actor details, keep one row per article and add a placeholder sentence_id
      out <- out %>%
        group_by(`_id`) %>%
        summarise() %>%
        mutate(sentence_id = 1)
    }

    ## Combine ud_sent with the source dataset
    out <- out %>%
      left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
      group_by(`_id`)

    out <- out %>%
      summarise_all(list) %>%
      left_join(.,metadata,by='_id') %>%
      ungroup()
    return(out)
  }
  saveRDS(par_sent(1:nrow(out),out = out, sent_dict=sent_dict), file = paste0('df_out',as.numeric(as.POSIXct(Sys.time())),'.Rds'))
  return()

  ### Keeping the option for parallel computation
  # microbenchmark::microbenchmark(out_normal <- par_sent(1:nrow(out),out = out, sent_dict=sent_dict), times = 1)
  # plan(multiprocess, workers = cores)
  # chunks <- split(1:nrow(out), sort(1:nrow(out)%%cores))
  # microbenchmark::microbenchmark(out_par <- bind_rows(future_lapply(chunks,par_sent, out=out, sent_dict=sent_dict)), times = 1)
}
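## Illustrative usage sketch (not part of the original code; the dictionary entries are made up):
## score sentences with a hypothetical lemma_upos dictionary; the batch is written to an RDS file.
# sent_dict <- data.frame(lem_u = c('goed_ADJ','slecht_ADJ'), prox = c(1,-1))
# sentencizer(out, sent_dict = sent_dict)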
@ -0,0 +1,56 @@
#' Generate UDpipe output from base text
#'
#' Generate UDpipe output from base text
#' @param file Filename of the file to read in, also used for generating the output file name
#' @param wd Working directory where *file*s can be found
#' @param ud_file Filename of the udpipe model to use, should be in *wd*
#' @param ver Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')
#' @return No return value, the generated bulk update is saved as an RDS file named ud_[file] in *wd*
#' @export
#' @examples
#' ud_update(file, wd, ud_file, ver)
#'

# punct_check <- function(str) {
#   if (!(stri_sub(str, from = -1)) %in% c('.','!','?')) {
#     return(str_c(str, '.'))
#   }
# }

ud_update <- function(file, wd, ud_file, ver) {
  out <- readRDS(str_c(wd,'/',file)) %>%
    out_parser(., field = '_source', clean = F)
  ud_model <- udpipe_load_model(file = str_c(wd,'/',ud_file))
  ud <- as.data.frame(udpipe(ud_model, x = out$merged, parser = "default", doc_id = out$`_id`)) %>%
    group_by(doc_id) %>%
    summarise(
      sentence_id = list(as.integer(sentence_id)),
      token_id = list(as.integer(token_id)),
      lemma = list(as.character(lemma)),
      upos = list(as.character(upos)),
      feats = list(as.character(feats)),
      head_token_id = list(as.integer(head_token_id)),
      dep_rel = list(as.character(dep_rel)),
      start = list(as.integer(start)),
      end = list(as.integer(end)),
      exists = list(TRUE)
    )
  bulk <- apply(ud, 1, bulk_writer, varname = 'ud', type = 'set', ver = ver)
  saveRDS(bulk, file = str_c(wd,'/ud_',file))
  # res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
  return()
}

#### Old code ####
# Use | as separator (this is not done anymore, as all data is stored as actual lists instead of strings. Code kept for future reference)
# str_replace_all("\\|", "") %>%
# Remove VERY annoying single backslashes and replace them by whitespaces
# str_replace_all("\\\\", " ") %>%
# Replace any occurrence of (double) whitespace characters by a single regular whitespace
# t_id <- paste(ud[,5], collapse = '|')
# lemmatized <- paste(ud[,7], collapse = '|') %>%
# # Replacing double quotes with single quotes in text
#   str_replace_all("\"","\'")
# upos_tags <- paste(ud[,8], collapse = '|')
# head_t_id <- paste(ud[,11], collapse = '|')
# dep_rel <- paste(ud[,12], collapse = '|')
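## Illustrative usage sketch (not part of the original code; the paths and model filename are made up):
## parse one stored batch with a hypothetical Dutch udpipe model; output lands in wd as ud_<file>.
# ud_update('batch_1.Rds', wd = '~/data', ud_file = 'dutch-alpino-ud-2.4.udpipe', ver = 'udV2')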
@ -0,0 +1,44 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/actorizer.R
\name{actorizer}
\alias{actorizer}
\title{Updater function for elasticizer: Conduct actor searches}
\usage{
actorizer(
  out,
  localhost = F,
  ids,
  prefix,
  postfix,
  pre_tags,
  post_tags,
  es_super,
  ver
)
}
\arguments{
\item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}

\item{localhost}{Defaults to false. When true, connect to a local Elasticsearch instance on the default port (9200)}

\item{ids}{List of actor ids}

\item{prefix}{Regex containing prefixes that should be excluded from hits}

\item{postfix}{Regex containing postfixes that should be excluded from hits}

\item{pre_tags}{Highlighter pre-tag}

\item{post_tags}{Highlighter post-tag}

\item{es_super}{Password for write access to ElasticSearch}

\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
As this is a nested function used within elasticizer, there is no return output
}
\description{
Updater function for elasticizer: Conduct actor searches
}
\examples{
actorizer(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver)
}
@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cv_generator.R
\name{cv_generator}
\alias{cv_generator}
\title{Generate CV folds for nested cross-validation}
\usage{
cv_generator(outer_k, inner_k, vec, grid, seed)
}
\arguments{
\item{outer_k}{Number of outer CV (performance estimation) folds. If outer_k < 1, holdout sampling is used, with outer_k being the proportion of test data}

\item{inner_k}{Number of inner CV (parameter optimization) folds}

\item{vec}{Vector containing the true values of the classification}

\item{grid}{Parameter grid for optimization}

\item{seed}{Integer used as seed for random number generation}
}
\value{
A nested set of lists with row numbers
}
\description{
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
}
\examples{
cv_generator(outer_k, inner_k, vec, grid, seed)
}
@ -1,26 +1,46 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dupe_detect.R
\name{dupe_detect}
\alias{dupe_detect}
\title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
\usage{
dupe_detect(
  row,
  grid,
  cutoff_lower,
  cutoff_upper = 1,
  es_pwd,
  es_super,
  words,
  localhost = T,
  ver
)
}
\arguments{
\item{row}{Row of grid to parse}

\item{grid}{A cross-table of all possible combinations of doctypes and dates}

\item{cutoff_lower}{Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive)}

\item{cutoff_upper}{Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive)}

\item{es_pwd}{Password for Elasticsearch read access}

\item{es_super}{Password for write access to ElasticSearch}

\item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so the document will be a little shorter than [words])}

\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}

\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
dupe_objects.json and a data frame containing each id and all its duplicates. remove_ids.txt and a character vector with the list of ids to be removed. Files are written to the current working directory
}
\description{
Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
}
\examples{
dupe_detect(1, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T)
}
@ -0,0 +1,41 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimator.R
\name{estimator}
\alias{estimator}
\title{Generate models and get classifications on test sets}
\usage{
estimator(
  row,
  grid,
  outer_folds,
  inner_folds,
  dfm,
  class_type,
  model,
  we_vectors
)
}
\arguments{
\item{row}{Row number of current item in grid}

\item{grid}{Grid with model parameters and CV folds}

\item{outer_folds}{List with row numbers for outer folds}

\item{inner_folds}{List with row numbers for inner folds}

\item{dfm}{DFM containing labeled documents}

\item{class_type}{Name of column in docvars() containing the classes}

\item{model}{Model to use (currently only nb)}

\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
Depending on the mode: if folds are included, returns the true and predicted classes of the test set, with parameters, model and model idf. When there are no folds, returns the final model and idf values.
}
\description{
Creates a grid of models to be estimated for each outer fold, inner fold and parameter combination
}
\examples{
estimator(row, grid, outer_folds, inner_folds, dfm, class_type, model, we_vectors)
}
@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/feat_select.R
\name{feat_select}
\alias{feat_select}
\title{Select features using quanteda textstat_keyness}
\usage{
feat_select(topic, dfm, class_type, percentile, measure = "chi2")
}
\arguments{
\item{topic}{The topic to determine keywords for}

\item{dfm}{The input dfm}

\item{class_type}{Name of the column in docvars containing the classification}

\item{percentile}{Cutoff for the list of words that should be returned}

\item{measure}{Measure to use in determining keyness, default = chi2; see textstat_keyness for other options}
}
\value{
A vector of words that are key to the topic
}
\description{
Select features based on the textstat_keyness function and a percentile cutoff

Percentiles are based on absolute values, i.e. both on words that are key and *not* key to the topic
}
\examples{
feat_select(topic, dfm, class_type, percentile, measure = "chi2")
}
@ -0,0 +1,30 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lemma_writer.R
\name{lemma_writer}
\alias{lemma_writer}
\title{Generates text output files (without punctuation) for external applications, such as GloVe embeddings}
\usage{
lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
}
\arguments{
\item{out}{The elasticizer-generated data frame}

\item{file}{The file to write the output to (including path, when required). When documents = T, provide a path including the trailing /}

\item{localhost}{Unused, but defaults to FALSE}

\item{documents}{Indicate whether the writer should output to a single file, or to individual documents}

\item{lemma}{Indicate whether document output should be lemmas or the original document}

\item{cores}{Indicate the number of cores to use for parallel processing}
}
\value{
No return value, the text output is written to [file] (or to individual documents when documents = T)
}
\description{
Generates text output files (without punctuation) for external applications, such as GloVe embeddings
}
\examples{
lemma_writer(out, file, localhost = F, documents = F, lemma = F, cores = 1)
}
@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/metric_gen.R
\name{metric_gen}
\alias{metric_gen}
\title{Generate performance statistics for models}
\usage{
metric_gen(x)
}
\arguments{
\item{x}{A data frame containing at least the columns "pred" and "tv"}
}
\value{
x, with additional columns for performance metrics
}
\description{
Generate performance statistics for models, based on their predictions and the true values
}
\examples{
metric_gen(x)
}
@ -0,0 +1,64 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/modelizer_old.R
\name{modelizer_old}
\alias{modelizer_old}
\title{Generate a classification model}
\usage{
modelizer_old(
  dfm,
  cores_outer,
  cores_grid,
  cores_inner,
  cores_feats,
  seed,
  outer_k,
  inner_k,
  model,
  class_type,
  opt_measure,
  country,
  grid
)
}
\arguments{
\item{dfm}{A quanteda dfm used to train and evaluate the model, should contain the vector with class labels in docvars}

\item{cores_outer}{Number of cores to use for outer CV (cannot be more than the number of outer folds)}

\item{cores_grid}{Number of cores to use for grid search (cannot be more than the number of grid rows (i.e. possible parameter combinations), multiplies with cores_outer)}

\item{cores_inner}{Number of cores to use for the inner CV loop (cannot be more than the number of inner CV folds, multiplies with cores_outer and cores_grid)}

\item{cores_feats}{Number of cores to use for feature selection (multiplies with cores_outer, cores_grid and cores_inner)}

\item{seed}{Integer to use as seed for random number generation, ensures replicability}

\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}

\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}

\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}

\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}

\item{opt_measure}{Label of the measure in the confusion matrix to use as performance indicator}

\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for the filename)}

\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
}
\value{
An .RData file in the current working directory (getwd()) containing the final model, performance estimates and the parameters used for grid search and cross-validation
}
\description{
Generate a nested cross-validated classification model based on a dfm with class labels as docvars

Currently only supports Naïve Bayes using quanteda's textmodel_nb

Hyperparameter optimization is enabled through the grid parameter

A grid should be generated from vectors with the labels as described for each model, using the crossing() command

For Naïve Bayes, the following parameters can be used:

- percentiles (cutoff point for tf-idf feature selection)

- measures (what measure to use for determining feature importance, see textstat_keyness for options)
}
\examples{
modelizer_old(dfm, cores_outer = 1, cores_grid = 1, cores_inner = 1, cores_feats = 1, seed = 42, outer_k = 3, inner_k = 5, model = model, class_type = class_type, opt_measure = opt_measure, country = country, grid = grid)
}
@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/out_parser.R
\name{out_parser}
\alias{out_parser}
\title{Parse raw text into a single field}
\usage{
out_parser(out, field, clean = F)
}
\arguments{
\item{out}{The original output data frame}

\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}

\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching a regex (see code)}
}
\value{
A parsed output data frame including the additional column 'merged', containing the merged text
}
\description{
Parse raw text from the MaML database into a single field
}
\examples{
out_parser(out, field, clean = F)
}
@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preproc.R
\name{preproc}
\alias{preproc}
\title{Preprocess dfm data for use in modeling procedure}
\usage{
preproc(dfm_train, dfm_test = NULL, params, we_vectors)
}
\arguments{
\item{dfm_train}{Training dfm}

\item{dfm_test}{Testing dfm if applicable, otherwise NULL}

\item{params}{Row from grid with parameter optimization}

\item{we_vectors}{Matrix with word embedding vectors}
}
\value{
List with dfm_train and dfm_test, processed according to the parameters in params
}
\description{
Process dfm according to the parameters provided in params
}
\examples{
preproc(dfm_train, dfm_test = NULL, params, we_vectors)
}
@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/query_gen_actors.R
\name{query_gen_actors}
\alias{query_gen_actors}
\title{Generate actor search queries based on data in actor db}
\usage{
query_gen_actors(actor, pre_tags, post_tags)
}
\arguments{
\item{actor}{A row from the output of elasticizer() when run on the 'actor' index}

\item{pre_tags}{Highlighter pre-tag}

\item{post_tags}{Highlighter post-tag}
}
\value{
A data frame containing the queries, related actor ids and actor function
}
\description{
Generate actor search queries based on data in actor db
}
\examples{
query_gen_actors(actor, pre_tags, post_tags)
}
@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/query_string.R
\name{query_string}
\alias{query_string}
\title{Generate a query string query for ElasticSearch}
\usage{
query_string(query, fields = F, random = F, default_operator = "AND")
}
\arguments{
\item{query}{Query string in ElasticSearch query string format}

\item{fields}{List of field names to return, defaults to all}

\item{random}{Return randomized results. Boolean, defaults to FALSE}

\item{default_operator}{Default Boolean operator ("AND" or "OR") between terms in the query string, defaults to "AND"}
}
\value{
A formatted ElasticSearch query string query
}
\description{
Generate a query string query for ElasticSearch
}
\examples{
query_string(query)
}
@ -0,0 +1,37 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sent_merger.R
\name{sent_merger}
\alias{sent_merger}
\title{Aggregate sentence-level dataset containing sentiment (from sentencizer())}
\usage{
sent_merger(
  df,
  actors_meta = NULL,
  actor_groups = NULL,
  pos_cutoff = NULL,
  neg_cutoff = NULL,
  single_party = F
)
}
\arguments{
\item{df}{Data frame with actor ids, produced by sentencizer}

\item{actors_meta}{Optional data frame containing actor metadata obtained using elasticizer(index="actors")}

\item{actor_groups}{Optional list of vectors, where each vector contains actor ids to be merged (e.g. merge all left-wing parties)}

\item{pos_cutoff}{Optional value above which sentence-level sentiment scores should be considered "positive"}

\item{neg_cutoff}{Optional value below which sentence-level sentiment scores should be considered "negative"}

\item{single_party}{Boolean to generate data only from sentences in which a single party is mentioned, defaults to FALSE}
}
\value{
When no actor_groups are supplied, returns an actor-article dataset with individual actors, party aggregations, party-actor aggregations and overall actor sentiment (regardless of specific actors). When actor_groups are supplied, returns aggregations for each vector in the list
}
\description{
Aggregate sentence-level dataset containing sentiment (from sentencizer())
}
\examples{
sent_merger(df, actors_meta, actor_groups = NULL)
}
@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sentencizer.R
\name{sentencizer}
\alias{sentencizer}
\title{Generate sentence-level dataset with sentiment and actor presence}
\usage{
sentencizer(out, sent_dict = NULL, localhost = NULL, validation = F)
}
\arguments{
\item{out}{Data frame produced by elasticizer}

\item{sent_dict}{Optional data frame containing the sentiment dictionary and values. Words should be either in the "lem_u" column when they consist of lemma_upos pairs, or in the "lemma" column when they are just lemmas. The "prox" column should either contain word values, or 1 for all words if there are no values.}

\item{localhost}{Unused, defaults to NULL}

\item{validation}{Boolean indicating whether human validation should be performed on sentiment scoring}
}
\value{
No return value, data per batch is saved in an RDS file
}
\description{
Generate sentence-level dataset with sentiment and actor presence
}
\examples{
sentencizer(out, sent_dict = NULL, validation = F)
}
@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ud_update.R
\name{ud_update}
\alias{ud_update}
\title{Generate UDpipe output from base text}
\usage{
ud_update(file, wd, ud_file, ver)
}
\arguments{
\item{file}{Filename of the file to read in, also used for generating the output file name}

\item{wd}{Working directory where *file*s can be found}

\item{ud_file}{Filename of the udpipe model to use, should be in *wd*}

\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (i.e. for a udpipe update this string might be 'udV2')}
}
\value{
No return value, the generated bulk update is saved as an RDS file named ud_[file] in *wd*
}
\description{
Generate UDpipe output from base text
}
\examples{
ud_update(file, wd, ud_file, ver)
}