dupe_detect: fix for empty results dataframe (no duplicates for given date and newspaper)

7 years ago · 887f1aa774
parent 993f39957a
commit 887f1aa774
1 changed files with 18 additions and 12 deletions
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@ -35,18 +35,24 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words
  dfm <- dfm_gen(out, text = "full", words = words)
  simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
  diag(simil) <- NA
-  df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
+  df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))
-    rownames_to_column("rowid") %>%
+
-    mutate(colid = colnames(simil)[col]) %>%
+  if (length(rownames(df)) > 0) {
-    .[,c(1,4)] %>%
+    df <- df %>%
-    group_by(colid) %>% summarise(rowid=list(rowid))
+      rownames_to_column("rowid") %>%
-  text <- capture.output(stream_out(df))
+      mutate(colid = colnames(simil)[col]) %>%
-  write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
+      .[,c(1,4)] %>%
-  simil[upper.tri(simil)] <- NA
+      group_by(colid) %>% summarise(rowid=list(rowid))
-  write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
+    text <- capture.output(stream_out(df))
-        file = paste0(getwd(),'/remove_ids.txt'),
+    write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
-        append=T)
+    simil[upper.tri(simil)] <- NA
-  return(list(df,unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))))
+    write(unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE))),
          file = paste0(getwd(),'/remove_ids.txt'),
          append=T)
    return(list(df,unique(rownames(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)))))
  } else {
    return(NULL)
  }
  ### Dummy code to verify that filtering out duplicate ids using only the lower triangle of the similarity matrix actually works
  # id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
  # dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))