parent 1f06b0b716
commit c815dc7f2b
R/duplicate_detection.R
@@ -0,0 +1,54 @@
#' Get ids of duplicate documents that have a cosine similarity score higher than [cutoff]
#'
#' Get ids of duplicate documents that have a cosine similarity score higher than [cutoff]
#' @param row Row of grid to parse
#' @param grid A cross-table of all possible combinations of doctypes and dates
#' @param cutoff Cutoff value for cosine similarity above which documents are considered duplicates
#' @param es_pwd Password for Elasticsearch read access
#' @return dupe_objects.json (containing each id and all its duplicates) and remove_ids.txt (list of ids to be removed) in the current working directory
#' @export
#' @examples
#' dupe_detect(1, grid, cutoff, es_pwd)

#################################################################################################
#################################### Duplicate detector #########################################
#################################################################################################

dupe_detect <- function(row, grid, cutoff, es_pwd) {
  # Select the doctype/date combination to process from the grid
  params <- grid[row, ]
  print(paste0('Parsing ', params$doctypes, ' on ', params$dates))
  # Build an Elasticsearch query matching all documents of this doctype published on this date
  query <- paste0('{"query":
    {"bool": {"filter":[{"term":{"doctype": "', params$doctypes, '"}},
      {"range" : {
        "publication_date" : {
          "gte" : "', params$dates, 'T00:00:00Z",
          "lt" : "', params$dates + 1, 'T00:00:00Z"
        }
      }}]
  } } }')
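  ### For illustration (hypothetical values): with params$doctypes == "volkskrant" and
  ### params$dates == as.Date("2018-01-01"), the rendered query body would be
  # {"query": {"bool": {"filter": [
  #   {"term": {"doctype": "volkskrant"}},
  #   {"range": {"publication_date": {"gte": "2018-01-01T00:00:00Z",
  #                                   "lt": "2018-01-02T00:00:00Z"}}}
  # ]}}}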

  # Retrieve the matching documents and generate a document-feature matrix from their full text
  out <- elasticizer(query, es_pwd = es_pwd)
  dfm <- dfm_gen(out, text = "full")
  # Pairwise cosine similarity between all documents; blank out the diagonal (self-similarity)
  simil <- as.matrix(textstat_simil(dfm, margin = "documents", method = "cosine"))
  diag(simil) <- NA
  simil_og <- simil # keep an unmodified copy of the similarity matrix
  # Collect every document pair above the cutoff and list, per document id, all of its duplicates
  df <- as.data.frame(which(simil >= cutoff, arr.ind = TRUE)) %>%
    rownames_to_column("rowid") %>%
    mutate(colid = colnames(simil)[col]) %>%
    select(rowid, colid) %>%
    group_by(colid) %>%
    summarise(rowid = list(rowid))
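  ### Toy illustration (assumed document names): if only "doc_1" and "doc_2" exceed the
  ### cutoff, which() returns both symmetric matrix entries, so df pairs each id with the
  ### list of its duplicates:
  #   colid   rowid
  #   doc_1   "doc_2"
  #   doc_2   "doc_1"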
  # Write one JSON object per document id, listing the ids of all its duplicates
  # (the trailing line of the captured output is dropped)
  text <- capture.output(stream_out(df))
  write(text[-length(text)], file = paste0(getwd(), '/dupe_objects.json'), append = TRUE)
  # Mask the upper triangle so each duplicate pair is counted only once: of every
  # remaining pair, the row-side id is marked for removal, keeping one copy
  simil[upper.tri(simil)] <- NA
  write(unique(rownames(which(simil >= cutoff, arr.ind = TRUE))),
        file = paste0(getwd(), '/remove_ids.txt'),
        append = TRUE)
  ### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
  # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
  # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
  # simil_nodupes <- as.matrix(textstat_simil(dfm_nodupes, margin = "documents", method = "cosine"))
  # diag(simil_nodupes) <- NA
  # which(simil_nodupes >= cutoff, arr.ind = TRUE)
}
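
### Example usage (not run): build the doctype/date grid and deduplicate every combination.
### Doctype names, date range, cutoff value and password lookup are all placeholders.
# grid <- expand.grid(doctypes = c('volkskrant', 'nrc'),
#                     dates = seq(as.Date('2018-01-01'), as.Date('2018-01-31'), by = 'day'),
#                     stringsAsFactors = FALSE)
# for (i in seq_len(nrow(grid))) {
#   dupe_detect(row = i, grid = grid, cutoff = 0.9, es_pwd = Sys.getenv('ES_PASSWORD'))
# }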
man/dupe_detect.Rd
@@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/duplicate_detection.R
\name{dupe_detect}
\alias{dupe_detect}
\title{Get ids of duplicate documents that have a cosine similarity score higher than [cutoff]}
\usage{
dupe_detect(row, grid, cutoff, es_pwd)
}
\arguments{
\item{row}{Row of grid to parse}

\item{grid}{A cross-table of all possible combinations of doctypes and dates}

\item{cutoff}{Cutoff value for cosine similarity above which documents are considered duplicates}

\item{es_pwd}{Password for Elasticsearch read access}
}
\value{
dupe_objects.json (containing each id and all its duplicates) and remove_ids.txt (list of ids to be removed) in the current working directory
}
\description{
Get ids of duplicate documents that have a cosine similarity score higher than [cutoff]
}
\examples{
dupe_detect(1, grid, cutoff, es_pwd)
}