From 7218f6b8d094c01fdbbed17c831596dc3e6c5204 Mon Sep 17 00:00:00 2001
From: Erik de Vries
Date: Fri, 11 Jan 2019 15:38:19 +0100
Subject: [PATCH] dupe_detect: fixed error on no duplicates

---
 R/dupe_detect.R | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/R/dupe_detect.R b/R/dupe_detect.R
index 15ff360..993e2d5 100644
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@@ -39,26 +39,28 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
     diag(simil) <- NA
     duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)
     duplicates <- cbind(duplicates, rowid= rownames(duplicates))
-    rownames(duplicates) <- seq(1:length(rownames(duplicates)))
-    df <- as.data.frame(duplicates, make.names = NA, stringsAsFactors = F) %>%
-      # bind_cols(colid = colnames(simil)[.['col']]) %>%
-      mutate(colid = colnames(simil)[as.numeric(col)]) %>%
-      .[,c(3,4)] %>%
-      group_by(colid) %>% summarise(rowid=list(rowid))
-    text <- capture.output(stream_out(df))
-    # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
-    simil[upper.tri(simil)] <- NA
-    # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
-    #       file = paste0(getwd(),'/remove_ids.txt'),
-    #       append=T)
-    dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
-                              dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
-    bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver),
-              apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver))
-    if (length(bulk) > 0) {
+    if (length(duplicates) > 0) {
+      rownames(duplicates) <- seq(1:length(rownames(duplicates)))
+      df <- as.data.frame(duplicates, make.names = NA, stringsAsFactors = F) %>%
+        # bind_cols(colid = colnames(simil)[.['col']]) %>%
+        mutate(colid = colnames(simil)[as.numeric(col)]) %>%
+        .[,c(3,4)] %>%
+        group_by(colid) %>% summarise(rowid=list(rowid))
+      text <- capture.output(stream_out(df))
+      # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
+      simil[upper.tri(simil)] <- NA
+      # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+      #       file = paste0(getwd(),'/remove_ids.txt'),
+      #       append=T)
+      dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+                                dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
+      bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set', ver = ver),
+                apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set', ver = ver))
       res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
+      return(paste0('Checked ',params$doctypes,' on ',params$dates ))
+    } else {
+      return(paste0('No duplicates for ',params$doctypes,' on ',params$dates ))
     }
-    return(paste0('Checked ',params$doctypes,' on ',params$dates ))
   } else {
     return(paste0('No results for ',params$doctypes,' on ',params$dates ))
   }