Renamed dupe_detect and added return output

master
Erik de Vries 6 years ago
parent db418d7396
commit 65f8c26ec6

@@ -5,7 +5,7 @@
 #' @param grid A cross-table of all possible combinations of doctypes and dates
 #' @param cutoff Cutoff value for cosine similarity above which documents are considered duplicates
 #' @param es_pwd Password for Elasticsearch read access
-#' @return dupe_objects.json (containing each id and all its duplicates) and remove_ids.txt (list of ids to be removed) in current working directory
+#' @return A data frame containing each id and all its duplicates, and a character vector of ids to be removed; both are also written to dupe_objects.json and remove_ids.txt in the current working directory
 #' @export
 #' @examples
 #' dupe_detect(1, grid, cutoff, es_pwd)
@@ -45,6 +45,7 @@ dupe_detect <- function(row, grid, cutoff, es_pwd) {
   write(unique(rownames(which(simil >= cutoff, arr.ind = TRUE))),
         file = paste0(getwd(),'/remove_ids.txt'),
         append=T)
+  return(list(df, unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))))
   ### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
   # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
   # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
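
For reference, a minimal sketch of how a caller could consume the new return value; the calling context (an existing quanteda dfm named dfm, a cutoff of 0.9, and the result variable) is assumed for illustration and is not part of this commit:

# Assumed calling context, for illustration only: dupe_detect() now returns
# a list holding the duplicate data frame and the character vector of ids
# to remove, in addition to writing both files to the working directory.
library(quanteda)

result <- dupe_detect(1, grid, 0.9, es_pwd)  # cutoff of 0.9 is illustrative
dupe_df <- result[[1]]     # data frame: each id and all its duplicates
remove_ids <- result[[2]]  # character vector of ids to be removed

# Drop the flagged duplicates from a dfm, mirroring the commented
# verification code above
dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% remove_ids))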

@@ -16,7 +16,7 @@ dupe_detect(row, grid, cutoff, es_pwd)
 \item{es_pwd}{Password for Elasticsearch read access}
 }
 \value{
-dupe_objects.json (containing each id and all its duplicates) and remove_ids.txt (list of ids to be removed) in current working directory
+A data frame containing each id and all its duplicates, and a character vector of ids to be removed; both are also written to dupe_objects.json and remove_ids.txt in the current working directory
 }
 \description{
 Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
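
Note that remove_ids.txt is written with append = T, so ids accumulate one per line as dupe_detect() runs over successive grid rows. A downstream step could therefore also recover the full removal list from disk rather than from the return values; a minimal sketch, assuming the working-directory location documented above:

# Recover the accumulated removal list from disk (one id per line);
# unique() guards against ids appended by more than one grid row
remove_ids <- unique(readLines(paste0(getwd(), '/remove_ids.txt')))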
