|
|
@ -35,18 +35,24 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words
|
|
|
|
dfm <- dfm_gen(out, text = "full", words = words)
|
|
|
|
dfm <- dfm_gen(out, text = "full", words = words)
|
|
|
|
simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
|
|
|
|
simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
|
|
|
|
diag(simil) <- NA
|
|
|
|
diag(simil) <- NA
|
|
|
|
df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
|
|
|
|
df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))
|
|
|
|
rownames_to_column("rowid") %>%
|
|
|
|
|
|
|
|
mutate(colid = colnames(simil)[col]) %>%
|
|
|
|
if (length(rownames(df)) > 0) {
|
|
|
|
.[,c(1,4)] %>%
|
|
|
|
df <- df %>%
|
|
|
|
group_by(colid) %>% summarise(rowid=list(rowid))
|
|
|
|
rownames_to_column("rowid") %>%
|
|
|
|
text <- capture.output(stream_out(df))
|
|
|
|
mutate(colid = colnames(simil)[col]) %>%
|
|
|
|
write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
|
|
|
|
.[,c(1,4)] %>%
|
|
|
|
simil[upper.tri(simil)] <- NA
|
|
|
|
group_by(colid) %>% summarise(rowid=list(rowid))
|
|
|
|
write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
|
|
|
|
text <- capture.output(stream_out(df))
|
|
|
|
file = paste0(getwd(),'/remove_ids.txt'),
|
|
|
|
write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
|
|
|
|
append=T)
|
|
|
|
simil[upper.tri(simil)] <- NA
|
|
|
|
return(list(df,unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))))
|
|
|
|
write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
|
|
|
|
|
|
|
|
file = paste0(getwd(),'/remove_ids.txt'),
|
|
|
|
|
|
|
|
append=T)
|
|
|
|
|
|
|
|
return(list(df,unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))))
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
return(NULL)
|
|
|
|
|
|
|
|
}
|
|
|
|
### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
|
|
|
|
### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
|
|
|
|
# id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
|
|
|
|
# id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
|
|
|
|
# dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
|
|
|
|
# dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
|
|
|
|