Renamed dupe_detect, and added return output

master
Erik de Vries 6 years ago
parent db418d7396
commit 65f8c26ec6

@ -5,7 +5,7 @@
#' @param grid A cross-table of all possible combinations of doctypes and dates
#' @param cutoff Cutoff value for cosine similarity above which documents are considered duplicates
#' @param es_pwd Password for Elasticsearch read access
#' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
#' @export
#' @examples
#' dupe_detect(1,grid,es_pwd)
@ -45,6 +45,7 @@ dupe_detect <- function(row, grid, cutoff, es_pwd) {
write(unique(rownames(which(simil >= cutoff, arr.ind = TRUE))),
      file = paste0(getwd(),'/remove_ids.txt'),
      append=T)
return(list(df,unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))))
### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
# id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
# dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))

@ -16,7 +16,7 @@ dupe_detect(row, grid, cutoff, es_pwd)
\item{es_pwd}{Password for Elasticsearch read access}
}
\value{
dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
}
\description{
Get ids of duplicate documents that have a cosine similarity score higher than [threshold]

Loading…
Cancel
Save