diff --git a/R/duplicate_detection.R b/R/dupe_detect.R similarity index 90% rename from R/duplicate_detection.R rename to R/dupe_detect.R index ff69b81..5ced5fb 100644 --- a/R/duplicate_detection.R +++ b/R/dupe_detect.R @@ -5,7 +5,7 @@ #' @param grid A cross-table of all possible combinations of doctypes and dates #' @param cutoff Cutoff value for cosine similarity above which documents are considered duplicates #' @param es_pwd Password for Elasticsearch read access -#' @return dupe_objects.json (containing each id and all its duplicates) and remove_ids.txt (list of ids to be removed) in current working directory +#' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory #' @export #' @examples #' dupe_detect(1,grid,es_pwd) @@ -45,6 +45,7 @@ dupe_detect <- function(row, grid, cutoff, es_pwd) { write(unique(rownames(which(simil >= cutoff, arr.ind = TRUE))), file = paste0(getwd(),'/remove_ids.txt'), append=T) + return(list(df,unique(rownames(which(simil >= cutoff, arr.ind = TRUE))))) ### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE))) # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter)) diff --git a/man/dupe_detect.Rd b/man/dupe_detect.Rd index 13209bd..03710b5 100644 --- a/man/dupe_detect.Rd +++ b/man/dupe_detect.Rd @@ -16,7 +16,7 @@ dupe_detect(row, grid, cutoff, es_pwd) \item{es_pwd}{Password for Elasticsearch read access} } \value{ -dupe_objects.json (containing each id and all its duplicates) and remove_ids.txt (list of ids to be removed) in current working directory +dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory } \description{ Get ids of duplicate documents that have a cosine similarity score higher than [threshold]