diff --git a/R/dupe_detect.R b/R/dupe_detect.R
index 5c09296..9232045 100644
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@@ -30,30 +30,34 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
       }}]
     }
   }
 }')
-  out <- elasticizer(query, es_pwd = es_pwd, localhost=T)
+  out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost)
   if (class(out$hits$hits) != 'list') {
     dfm <- dfm_gen(out, text = "full", words = words)
-    simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
-    diag(simil) <- NA
-    df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
-      rownames_to_column("rowid") %>%
-      mutate(colid = colnames(simil)[col]) %>%
-      .[,c(1,4)] %>%
-      group_by(colid) %>% summarise(rowid=list(rowid))
-    text <- capture.output(stream_out(df))
-    # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
-    simil[upper.tri(simil)] <- NA
-    # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
-    #       file = paste0(getwd(),'/remove_ids.txt'),
-    #       append=T)
-    dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
-                              dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
-    bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'),
-              apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set'))
-    if (length(bulk) > 0) {
-      res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
+    if (sum(dfm[1,]) > 0) {
+      simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
+      diag(simil) <- NA
+      df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
+        rownames_to_column("rowid") %>%
+        mutate(colid = colnames(simil)[col]) %>%
+        .[,c(1,4)] %>%
+        group_by(colid) %>% summarise(rowid=list(rowid))
+      text <- capture.output(stream_out(df))
+      # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
+      simil[upper.tri(simil)] <- NA
+      # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+      #       file = paste0(getwd(),'/remove_ids.txt'),
+      #       append=T)
+      dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+                                dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
+      bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'),
+                apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set'))
+      if (length(bulk) > 0) {
+        res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
+      }
+      return(paste0('Checked ',params$doctypes,' on ',params$dates ))
+    } else {
+      return(paste0('No results for ',params$doctypes,' on ',params$dates ))
     }
-    return(paste0('Checked ',params$doctypes,' on ',params$dates ))
   } else {
     return(paste0('No results for ',params$doctypes,' on ',params$dates ))
   }
diff --git a/man/dupe_detect.Rd b/man/dupe_detect.Rd
index 0d458e6..ee2b699 100644
--- a/man/dupe_detect.Rd
+++ b/man/dupe_detect.Rd
@@ -4,7 +4,8 @@
 \alias{dupe_detect}
 \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
 \usage{
-dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words)
+dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
+  words, localhost = T)
 }
 \arguments{
 \item{row}{Row of grid to parse}
@@ -17,7 +18,11 @@ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words)
 
 \item{es_pwd}{Password for Elasticsearch read access}
 
+\item{es_super}{Password for write access to ElasticSearch}
+
 \item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])}
+
+\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}
 }
 \value{
 dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
@@ -26,5 +31,5 @@ dupe_objects.json and data frame containing each id and all its duplicates. remo
 Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
 }
 \examples{
-dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, words)
+dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T)
 }
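
Minimal usage sketch of the updated signature, assuming grid, es_pwd and es_super are already defined as in the package's existing workflow; the lapply wrapper and the cutoff_lower/words values below are illustrative examples, not part of this change.

# Illustrative only: run the updated dupe_detect() over every row of the
# (assumed) date/doctype grid, marking near-duplicates between the cutoffs
# against a remote Elasticsearch instance (localhost = FALSE).
# cutoff_lower = 0.9 and words = 600 are example values, not defaults.
results <- lapply(seq_len(nrow(grid)), function(i) {
  dupe_detect(row = i, grid = grid,
              cutoff_lower = 0.9, cutoff_upper = 1,
              es_pwd = es_pwd, es_super = es_super,
              words = 600, localhost = FALSE)
})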