@@ -39,26 +39,28 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
     diag(simil) <- NA
     duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)
     duplicates <- cbind(duplicates, rowid= rownames(duplicates))
-    rownames(duplicates) <- seq(1:length(rownames(duplicates)))
-    df <- as.data.frame(duplicates, make.names = NA, stringsAsFactors = F) %>%
-      # bind_cols(colid = colnames(simil)[.['col']]) %>%
-      mutate(colid = colnames(simil)[as.numeric(col)]) %>%
-      .[,c(3,4)] %>%
-      group_by(colid) %>% summarise(rowid=list(rowid))
-    text <- capture.output(stream_out(df))
-    # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
-    simil[upper.tri(simil)] <- NA
-    # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
-    #       file = paste0(getwd(),'/remove_ids.txt'),
-    #       append=T)
-    dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
-                              dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
-    bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver),
-              apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver))
-    if (length(bulk) > 0) {
+    if (length(duplicates) > 0) {
+      rownames(duplicates) <- seq(1:length(rownames(duplicates)))
+      df <- as.data.frame(duplicates, make.names = NA, stringsAsFactors = F) %>%
+        # bind_cols(colid = colnames(simil)[.['col']]) %>%
+        mutate(colid = colnames(simil)[as.numeric(col)]) %>%
+        .[,c(3,4)] %>%
+        group_by(colid) %>% summarise(rowid=list(rowid))
+      text <- capture.output(stream_out(df))
+      # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
+      simil[upper.tri(simil)] <- NA
+      # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+      #       file = paste0(getwd(),'/remove_ids.txt'),
+      #       append=T)
+      dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+                                dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
+      bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver),
+                apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver))
       res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
+      return(paste0('Checked ',params$doctypes,' on ',params$dates ))
+    } else {
+      return(paste0('No duplicates for ',params$doctypes,' on ',params$dates ))
     }
-    return(paste0('Checked ',params$doctypes,' on ',params$dates ))
   } else {
     return(paste0('No results for ',params$doctypes,' on ',params$dates ))
   }