#' Get ids of duplicate documents with a cosine similarity score above [cutoff_lower]
#'
#' Compares all documents of a given doctype and date to each other and marks as duplicates those pairs whose cosine similarity lies between [cutoff_lower] and [cutoff_upper] (both inclusive)
#' @param row Row of grid to parse
#' @param grid A cross-table of all possible combinations of doctypes and dates
#' @param cutoff_lower Minimum cosine similarity at which documents are considered duplicates (inclusive)
#' @param cutoff_upper Maximum cosine similarity at which documents are still considered duplicates (inclusive); pairs above this value are ignored, which is useful for debugging and manual parameter tuning
#' @param es_pwd Password for Elasticsearch read access
#' @param es_super Password for Elasticsearch write access
#' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff, so they end up slightly shorter than [words]
#' @param localhost Defaults to TRUE. When TRUE, connects to a local Elasticsearch instance on the default port (9200)
#' @return A status message naming the doctype and date that were checked. Duplicate ids are written back to Elasticsearch via a bulk update: every document involved in a duplicate pair gets a `duplicates` field listing its duplicates, and one document out of each pair gets a `_delete` flag. (The dupe_objects.json and remove_ids.txt outputs to the working directory are currently commented out in the code.)
#' @export
#' @examples
#' dupe_detect(1, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T)
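#' # Hypothetical driver over the whole grid (all names illustrative only):
#' # lapply(seq_len(nrow(grid)), dupe_detect, grid = grid, cutoff_lower = 0.9,
#' #        cutoff_upper = 1, es_pwd = es_pwd, es_super = es_super,
#' #        words = words, localhost = T)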
#################################################################################################
#################################### Duplicate detector #########################################
#################################################################################################
dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T) {
  params <- grid[row, ]
  print(paste0('Parsing ', params$doctypes, ' on ', params$dates))
  query <- paste0('{"query":
    {"bool": {"filter": [{"term": {"doctype": "', params$doctypes, '"}},
      {"range": {
        "publication_date": {
          "gte": "', params$dates, 'T00:00:00Z",
          "lt": "', params$dates + 1, 'T00:00:00Z"
        }
      }}]
    }}}')
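  # Illustration (hypothetical values): for a row with doctypes == "newspaper"
  # and dates == as.Date("2018-01-01"), the rendered query filters on that
  # doctype and on a one-day publication_date window:
  #   {"query": {"bool": {"filter": [
  #     {"term": {"doctype": "newspaper"}},
  #     {"range": {"publication_date": {"gte": "2018-01-01T00:00:00Z",
  #                                     "lt": "2018-01-02T00:00:00Z"}}}]}}}
  # Note that `params$dates + 1` only yields the next day if `dates` is a Date.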
  out <- elasticizer(query, es_pwd = es_pwd, localhost = localhost)
  if (class(out$hits$hits) != 'list') { # a list here means the query returned no hits
    dfm <- dfm_gen(out, text = "full", words = words)
    if (sum(dfm[1, ]) > 0) { # only proceed when the dfm actually contains tokens
      simil <- as.matrix(textstat_simil(dfm, margin = "documents", method = "cosine"))
      diag(simil) <- NA # a document is always identical to itself, so mask the diagonal
      df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
        rownames_to_column("rowid") %>%
        mutate(colid = colnames(simil)[col]) %>%
        .[, c(1, 4)] %>%
        group_by(colid) %>%
        summarise(rowid = list(rowid))
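      # Illustration (hypothetical ids) of the shape of df at this point: one
      # row per document that has duplicates, with the ids of those duplicates
      # collected in a list column, e.g.
      #   colid   rowid
      #   doc_2   c("doc_1")
      #   doc_5   c("doc_3", "doc_4")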
      text <- capture.output(stream_out(df))
      # write(text[-length(text)], file = paste0(getwd(), '/dupe_objects.json'), append = T)
      simil[upper.tri(simil)] <- NA # keep only the lower triangle, so one document of every duplicate pair survives
      # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
      #       file = paste0(getwd(), '/remove_ids.txt'),
      #       append = T)
      remove_ids <- unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))
      dupe_delete <- data.frame(id = remove_ids,
                                dupe_delete = rep(1, length(remove_ids)))
      bulk <- c(apply(df, 1, bulk_writer, varname = 'duplicates', type = 'set'),
                apply(dupe_delete, 1, bulk_writer, varname = '_delete', type = 'set'))
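      # The bulk request thus contains two kinds of updates: every document
      # involved in a duplicate pair gets the ids of its duplicates stored in
      # a 'duplicates' field, and one document out of each pair (taken from
      # the lower triangle of the matrix) gets a '_delete' flag of 1,
      # presumably to be picked up by a later cleanup step.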
      if (length(bulk) > 0) {
        res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
      }
      return(paste0('Checked ', params$doctypes, ' on ', params$dates))
    } else {
      return(paste0('No results for ', params$doctypes, ' on ', params$dates))
    }
  } else {
    return(paste0('No results for ', params$doctypes, ' on ', params$dates))
  }
### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
# id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
# dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
# simil_nodupes <- as.matrix(textstat_simil(dfm_nodupes, margin="documents", method="cosine"))
# diag(simil_nodupes) <- NA
# which(simil_nodupes >= cutoff, arr.ind = TRUE)
}
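
#################################################################################################
#################################### Toy verification sketch ###################################
#################################################################################################
# Illustrative only: this shows, on a made-up toy corpus, how the similarity
# thresholding above flags duplicates independently of Elasticsearch. All names
# and the 0.9 cutoff are hypothetical; depending on the quanteda version,
# textstat_simil() may live in the quanteda.textstats package instead.
# library(quanteda)
# toy <- c(d1 = "the cat sat on the mat",
#          d2 = "the cat sat on the mat", # exact duplicate of d1
#          d3 = "an entirely different text about elections")
# toy_simil <- as.matrix(textstat_simil(dfm(tokens(toy)), margin = "documents", method = "cosine"))
# diag(toy_simil) <- NA                 # mask self-similarity
# toy_simil[upper.tri(toy_simil)] <- NA # keep the lower triangle only
# unique(rownames(which(toy_simil >= 0.9, arr.ind = TRUE))) # "d2": d1 is kept, d2 is flagged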