Fixed dupe_detect error on documents with one sentence or less, and added a maximum # of words in dfm_gen

master DupeDetect
Erik de Vries 6 years ago
parent 0e8c127b86
commit ef51ce60a9

@@ -30,9 +30,10 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
}}]
} } }')
- out <- elasticizer(query, es_pwd = es_pwd, localhost=T)
+ out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost)
+ if (class(out$hits$hits) != 'list') {
dfm <- dfm_gen(out, text = "full", words = words)
if (sum(dfm[1,]) > 0) {
simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
diag(simil) <- NA
df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
@@ -57,6 +58,9 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
} else {
return(paste0('No results for ',params$doctypes,' on ',params$dates ))
}
+ } else {
+ return(paste0('No results for ',params$doctypes,' on ',params$dates ))
+ }
### Dummy code to verify that filtering out unique ids using the bottom half of the matrix actually works
# id_filter <- unique(rownames(which(simil >= cutoff, arr.ind = TRUE)))
# dfm_nodupes <- dfm_subset(dfm, !(docnames(dfm) %in% id_filter))
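
For context, a minimal standalone sketch of the cosine-similarity check this hunk guards, using quanteda (in quanteda >= 3, textstat_simil() lives in the companion package quanteda.textstats). The toy corpus and the 0.9 cutoff are illustrative assumptions; the real function builds its dfm from the Elasticsearch hits via dfm_gen().

library(quanteda)
library(quanteda.textstats)

# Toy documents standing in for the Elasticsearch hits
texts <- c(doc1 = "The cat sat on the mat.",
           doc2 = "The cat sat on the mat!",   # near-duplicate of doc1
           doc3 = "A completely different text about elections.")
dfmat <- dfm(tokens(texts))

# Guard against a degenerate dfm (e.g. an empty first document),
# mirroring the sum(dfm[1,]) > 0 check above
if (sum(dfmat[1, ]) > 0) {
  simil <- as.matrix(textstat_simil(dfmat, margin = "documents", method = "cosine"))
  diag(simil) <- NA   # every document is trivially identical to itself
  # pairs scoring within [cutoff_lower, cutoff_upper] count as duplicates
  which(simil >= 0.9 & simil <= 1, arr.ind = TRUE)
}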

@@ -4,7 +4,8 @@
\alias{dupe_detect}
\title{Get ids of duplicate documents that have a cosine similarity score higher than [threshold]}
\usage{
- dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words)
+ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super,
+   words, localhost = T)
}
\arguments{
\item{row}{Row of grid to parse}
@@ -17,7 +18,11 @@ dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words)
\item{es_pwd}{Password for Elasticsearch read access}
+ \item{es_super}{Password for write access to Elasticsearch}
\item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so the document will be a little shorter than [words])}
+ \item{localhost}{Defaults to TRUE. When TRUE, connect to a local Elasticsearch instance on the default port (9200)}
}
\value{
dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
@@ -26,5 +31,5 @@ dupe_objects.json and data frame containing each id and all its duplicates. remo
Get ids of duplicate documents that have a cosine similarity score higher than [threshold]
}
\examples{
- dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, words)
+ dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words, localhost = T)
}
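
As a usage sketch, the updated signature can be driven over the whole grid like this. The grid column names follow the params$doctypes and params$dates references in the function body, but the concrete doctype values, cutoffs, and word limit below are illustrative assumptions, and es_pwd/es_super are assumed to hold valid credentials:

# Hypothetical driver: run dupe_detect() once per doctype/date cell
grid <- expand.grid(doctypes = c("doctype_a", "doctype_b"),
                    dates = as.character(seq(as.Date("2018-01-01"),
                                             as.Date("2018-01-07"), by = "day")),
                    stringsAsFactors = FALSE)
out <- lapply(seq_len(nrow(grid)), function(i) {
  dupe_detect(row = i, grid = grid,
              cutoff_lower = 0.9, cutoff_upper = 1,
              es_pwd = es_pwd, es_super = es_super,
              words = 600, localhost = T)
})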
