Fixed dupe_detect error on documents with one sentence or less, and added a maximum number of words in dfm_gen

master DupeDetect
Erik de Vries 6 years ago
parent 0e8c127b86
commit ef51ce60a9

@@ -30,30 +30,34 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T) {
     }}]
   } } }')
-  out <- elasticizer(query, es_pwd = es_pwd, localhost=T)
+  out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost)
   if (class(out$hits$hits) != 'list') {
     dfm <- dfm_gen(out, text = "full", words = words)
-    simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
-    diag(simil) <- NA
-    df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
-      rownames_to_column("rowid") %>%
-      mutate(colid = colnames(simil)[col]) %>%
-      .[,c(1,4)] %>%
-      group_by(colid) %>% summarise(rowid=list(rowid))
-    text <- capture.output(stream_out(df))
-    # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
-    simil[upper.tri(simil)] <- NA
-    # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
-    #       file = paste0(getwd(),'/remove_ids.txt'),
-    #       append=T)
-    dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
-                              dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
-    bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'),
-              apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set'))
-    if (length(bulk) > 0) {
-      res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
-    }
-    return(paste0('Checked ',params$doctypes,' on ',params$dates ))
+    if (sum(dfm[1,]) > 0) {
+      simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
+      diag(simil) <- NA
+      df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
+        rownames_to_column("rowid") %>%
+        mutate(colid = colnames(simil)[col]) %>%
+        .[,c(1,4)] %>%
+        group_by(colid) %>% summarise(rowid=list(rowid))
+      text <- capture.output(stream_out(df))
+      # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
+      simil[upper.tri(simil)] <- NA
+      # write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+      #       file = paste0(getwd(),'/remove_ids.txt'),
+      #       append=T)
+      dupe_delete <- data.frame(id=unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+                                dupe_delete = rep(1,length(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))))
+      bulk <- c(apply(df, 1, bulk_writer, varname='duplicates', type = 'set'),
+                apply(dupe_delete, 1, bulk_writer, varname='_delete', type = 'set'))
+      if (length(bulk) > 0) {
+        res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
+      }
+      return(paste0('Checked ',params$doctypes,' on ',params$dates ))
+    } else {
+      return(paste0('No results for ',params$doctypes,' on ',params$dates ))
+    }
   } else {
     return(paste0('No results for ',params$doctypes,' on ',params$dates ))
   }
 }
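The heart of the fix is the new guard around the similarity step: a document of one sentence or less can leave dfm_gen() with an all-zero feature row, and cosine similarities against a zero vector are undefined, which apparently triggered the error this commit fixes. A minimal sketch of the pattern, outside the package (assumes a quanteda version where textstat_simil() still lives in the core package; the toy dfm stands in for the one built by dfm_gen):

library(quanteda)

# Toy corpus: the first document has no features left, mimicking a
# one-sentence document that dfm_gen() truncated away entirely.
texts <- c(d1 = "", d2 = "Some words here.", d3 = "Some words here too.")
dfm <- dfm(texts)

if (sum(dfm[1, ]) > 0) {
  # Safe to compute document-by-document cosine similarities
  simil <- as.matrix(textstat_simil(dfm, margin = "documents", method = "cosine"))
  diag(simil) <- NA
} else {
  # Mirrors the new else branch: report and skip instead of erroring out
  message("No usable features in the first document, skipping duplicate detection")
}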

@@ -4,7 +4,8 @@
 \alias{dupe_detect}
 \title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
 \usage{
-dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words)
+dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
+  words, localhost = T)
 }
 \arguments{
 \item{row}{Row of grid to parse}
@@ -17,7 +18,11 @@ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words)
 \item{es_pwd}{Password for Elasticsearch read access}
 
+\item{es_super}{Password for write access to ElasticSearch}
+
 \item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])}
 
+\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}
+
 }
 \value{
 dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
@@ -26,5 +31,5 @@ dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
 Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
 }
 \examples{
-dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, words)
+dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T)
 }
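Under the regenerated usage, callers must now pass the write password and may point the connection at a remote cluster via localhost. A hypothetical driver call over the whole grid (the 0.9 cut-off and 600-word limit are illustrative values, not package defaults; grid, es_pwd and es_super are assumed to come from the surrounding pipeline):

# Check every date/doctype combination in grid for duplicates;
# lapply feeds each row index into the first argument (row).
res <- lapply(seq_len(nrow(grid)), dupe_detect,
              grid = grid,
              cutoff_lower = 0.9,   # pairs with cosine similarity >= 0.9 count as duplicates
              cutoff_upper = 1,
              es_pwd = es_pwd,      # Elasticsearch read password
              es_super = es_super,  # Elasticsearch write password
              words = 600,          # cut documents off after ~600 words
              localhost = TRUE)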
