dupe_detect: return NULL when the results data frame is empty (no duplicates found for the given date and newspaper)

master
Erik de Vries 6 years ago
parent 993f39957a
commit 887f1aa774

@ -35,7 +35,10 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words
dfm <- dfm_gen(out, text = "full", words = words) dfm <- dfm_gen(out, text = "full", words = words)
simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine")) simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
diag(simil) <- NA diag(simil) <- NA
df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>% df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))
if (length(rownames(df)) > 0) {
df <- df %>%
rownames_to_column("rowid") %>% rownames_to_column("rowid") %>%
mutate(colid = colnames(simil)[col]) %>% mutate(colid = colnames(simil)[col]) %>%
.[,c(1,4)] %>% .[,c(1,4)] %>%
@ -47,6 +50,9 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words
file = paste0(getwd(),'/remove_ids.txt'), file = paste0(getwd(),'/remove_ids.txt'),
append=T) append=T)
return(list(df,unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))))) return(list(df,unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))))
} else {
return(NULL)
}
### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works ### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
# id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE))) # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
# dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter)) # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))

Loading…
Cancel
Save