dfm_gen, out_parser: updated documentation

dupe_detect: major fix to function, no longer using rownames for article ids
6 years ago · 1955692346
parent 34531b0da8
commit 1955692346
5 changed files with 11 additions and 9 deletions
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -4,7 +4,7 @@
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
-#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!
 #' @return A Quanteda dfm
 #' @export
 #' @examples
--- a/R/dupe_detect.R
+++ b/R/dupe_detect.R
@@ -33,15 +33,17 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
                  } } }')
  out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost)
  if (class(out$hits$hits) != 'list') {
-    dfm <- dfm_gen(out, text = "full", words = words)
+    dfm <- dfm_gen(out, text = "full", words = words, clean = T)
    if (sum(dfm[1,]) > 0) {
      simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
      diag(simil) <- NA
-      df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
+      duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)
-        rownames_to_column("rowid") %>%
+      duplicates <- cbind(duplicates, rowid= rownames(duplicates))
+      rownames(duplicates) <- seq(1:length(rownames(duplicates)))
+      df <- as.data.frame(duplicates, make.names = NA) %>%
        mutate(colid = colnames(simil)[col]) %>%
-        .[,c(1,4)] %>%
+        .[,c(3,4)] %>%
-        group_by(colid) %>% summarise(rowid=list(rowid))
+        group_by(rowid) %>% summarise(colid=list(colid))
      text <- capture.output(stream_out(df))
      # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
      simil[upper.tri(simil)] <- NA
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -3,7 +3,7 @@
 #' Parse raw text into a single field
 #' @param out The original output data frame
 #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
-#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
 #' @return a parsed output data frame including the additional column 'merged', containing the merged text
 #' @examples
 #' out_parser(out,field)
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@@ -13,7 +13,7 @@ dfm_gen(out, words = "999", text = "lemmas", clean)
 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
-\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!}
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!}
 }
 \value{
 A Quanteda dfm
--- a/man/out_parser.Rd
+++ b/man/out_parser.Rd
@@ -11,7 +11,7 @@ out_parser(out, field, clean = F)
 \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
-\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*}
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)}
 }
 \value{
 a parsed output data frame including the additional column 'merged', containing the merged text