#' Get ids of duplicate documents with a cosine similarity score above [cutoff_lower]
#'
#' Compares all documents of a given doctype and date to each other and marks as duplicates those pairs whose cosine similarity lies between [cutoff_lower] and [cutoff_upper] (both inclusive)
#' @param row Row of grid to parse
#' @param grid A cross-table of all possible combinations of doctypes and dates
#' @param cutoff_lower Minimum cosine similarity at which documents are considered duplicates (inclusive)
#' @param cutoff_upper Maximum cosine similarity at which documents are still considered duplicates (inclusive); pairs above this value are ignored, which is useful for debugging and manual parameter tuning
#' @param es_pwd Password for Elasticsearch read access
#' @param es_super Password for Elasticsearch write access
#' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff, so they end up slightly shorter than [words]
#' @param localhost Defaults to TRUE. When TRUE, connects to a local Elasticsearch instance on the default port (9200)
#' @return A status message naming the doctype and date that were checked. Duplicate ids are written back to Elasticsearch via a bulk update: every document involved in a duplicate pair gets a `duplicates` field listing its duplicates, and one document out of each pair gets a `_delete` flag. (The dupe_objects.json and remove_ids.txt outputs to the working directory are currently commented out in the code.)
#' @export
#' @examples
#' dupe_detect(1, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T)
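#' # Hypothetical driver over the whole grid (all names illustrative only):
#' # lapply(seq_len(nrow(grid)), dupe_detect, grid = grid, cutoff_lower = 0.9,
#' #        cutoff_upper = 1, es_pwd = es_pwd, es_super = es_super,
#' #        words = words, localhost = T)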
#################################################################################################
#################################### Duplicate detector #########################################
#################################################################################################
dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T) {
  params <- grid[row, ]
  print(paste0('Parsing ', params$doctypes, ' on ', params$dates))
  query <- paste0('{"query":
    {"bool": {"filter": [{"term": {"doctype": "', params$doctypes, '"}},
      {"range": {
        "publication_date": {
          "gte": "', params$dates, 'T00:00:00Z",
          "lt": "', params$dates + 1, 'T00:00:00Z"
        }
      }}]
    }}}')
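  # Illustration (hypothetical values): for a row with doctypes == "newspaper"
  # and dates == as.Date("2018-01-01"), the rendered query filters on that
  # doctype and on a one-day publication_date window:
  #   {"query": {"bool": {"filter": [
  #     {"term": {"doctype": "newspaper"}},
  #     {"range": {"publication_date": {"gte": "2018-01-01T00:00:00Z",
  #                                     "lt": "2018-01-02T00:00:00Z"}}}]}}}
  # Note that `params$dates + 1` only yields the next day if `dates` is a Date.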
  out <- elasticizer(query, es_pwd = es_pwd, localhost = localhost)
  if (class(out$hits$hits) != 'list') { # a list here means the query returned no hits
    dfm <- dfm_gen(out, text = "full", words = words)
    if (sum(dfm[1, ]) > 0) { # only proceed when the dfm actually contains tokens
      simil <- as.matrix(textstat_simil(dfm, margin = "documents", method = "cosine"))
      diag(simil) <- NA # a document is always identical to itself, so mask the diagonal
      df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
        rownames_to_column("rowid") %>%
        mutate(colid = colnames(simil)[col]) %>%
        .[, c(1, 4)] %>%
        group_by(colid) %>%
        summarise(rowid = list(rowid))
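      # Illustration (hypothetical ids) of the shape of df at this point: one
      # row per document that has duplicates, with the ids of those duplicates
      # collected in a list column, e.g.
      #   colid   rowid
      #   doc_2   c("doc_1")
      #   doc_5   c("doc_3", "doc_4")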
      text <- capture.output(stream_out(df))
      # write(text[-length(text)], file = paste0(getwd(), '/dupe_objects.json'), append = T)
      simil[upper.tri(simil)] <- NA # keep only the lower triangle, so one document of every duplicate pair survives
      # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
      #       file = paste0(getwd(), '/remove_ids.txt'),
      #       append = T)
      remove_ids <- unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))
      dupe_delete <- data.frame(id = remove_ids,
                                dupe_delete = rep(1, length(remove_ids)))
      bulk <- c(apply(df, 1, bulk_writer, varname = 'duplicates', type = 'set'),
                apply(dupe_delete, 1, bulk_writer, varname = '_delete', type = 'set'))
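      # The bulk request thus contains two kinds of updates: every document
      # involved in a duplicate pair gets the ids of its duplicates stored in
      # a 'duplicates' field, and one document out of each pair (taken from
      # the lower triangle of the matrix) gets a '_delete' flag of 1,
      # presumably to be picked up by a later cleanup step.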
      if (length(bulk) > 0) {
        res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
      }
      return(paste0('Checked ', params$doctypes, ' on ', params$dates))
    } else {
      return(paste0('No results for ', params$doctypes, ' on ', params$dates))
    }
  } else {
    return(paste0('No results for ', params$doctypes, ' on ', params$dates))
  }
### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
# id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
# dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
# simil_nodupes <- as.matrix(textstat_simil(dfm_nodupes, margin="documents", method="cosine"))
# diag(simil_nodupes) <- NA
# which(simil_nodupes >= cutoff, arr.ind = TRUE)
}
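
#################################################################################################
#################################### Toy verification sketch ###################################
#################################################################################################
# Illustrative only: this shows, on a made-up toy corpus, how the similarity
# thresholding above flags duplicates independently of Elasticsearch. All names
# and the 0.9 cutoff are hypothetical; depending on the quanteda version,
# textstat_simil() may live in the quanteda.textstats package instead.
# library(quanteda)
# toy <- c(d1 = "the cat sat on the mat",
#          d2 = "the cat sat on the mat", # exact duplicate of d1
#          d3 = "an entirely different text about elections")
# toy_simil <- as.matrix(textstat_simil(dfm(tokens(toy)), margin = "documents", method = "cosine"))
# diag(toy_simil) <- NA                 # mask self-similarity
# toy_simil[upper.tri(toy_simil)] <- NA # keep the lower triangle only
# unique(rownames(which(toy_simil >= 0.9, arr.ind = TRUE))) # "d2": d1 is kept, d2 is flagged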