diff --git a/R/dfm_gen.R b/R/dfm_gen.R index 91e6e46..1eb9bd9 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -4,7 +4,7 @@ #' @param out The elasticizer-generated data frame #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" -#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned! +#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned! #' @return A Quanteda dfm #' @export #' @examples diff --git a/R/dupe_detect.R b/R/dupe_detect.R index 63e4e9e..f03a58f 100644 --- a/R/dupe_detect.R +++ b/R/dupe_detect.R @@ -33,15 +33,17 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su } } }') out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost) if (class(out$hits$hits) != 'list') { - dfm <- dfm_gen(out, text = "full", words = words) + dfm <- dfm_gen(out, text = "full", words = words, clean = T) if (sum(dfm[1,]) > 0) { simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine")) diag(simil) <- NA - df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>% - rownames_to_column("rowid") %>% + duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE) + duplicates <- cbind(duplicates, rowid= rownames(duplicates)) + rownames(duplicates) <- seq(1:length(rownames(duplicates))) + df <- as.data.frame(duplicates, make.names = NA) %>% mutate(colid = colnames(simil)[col]) %>% - .[,c(1,4)] %>% - group_by(colid) %>% summarise(rowid=list(rowid)) + .[,c(3,4)] %>% + group_by(rowid) %>% summarise(colid=list(colid)) text <- capture.output(stream_out(df)) # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T) simil[upper.tri(simil)] <- NA diff --git a/R/out_parser.R b/R/out_parser.R index 369e114..eee720b 100644 --- a/R/out_parser.R +++ b/R/out_parser.R @@ -3,7 +3,7 @@ #' Parse raw text into a single field #' @param out The original output data frame #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text -#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]* +#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code) #' @return a parsed output data frame including the additional column 'merged', containing the merged text #' @examples #' out_parser(out,field) diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd index 1ef1ea3..4bed478 100644 --- a/man/dfm_gen.Rd +++ b/man/dfm_gen.Rd @@ -13,7 +13,7 @@ dfm_gen(out, words = "999", text = "lemmas", clean) \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} -\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!} +\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!} } \value{ A Quanteda dfm diff --git a/man/out_parser.Rd b/man/out_parser.Rd index a67a904..cb13609 100644 --- a/man/out_parser.Rd +++ b/man/out_parser.Rd @@ -11,7 +11,7 @@ out_parser(out, field, clean = F) \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text} -\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*} +\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)} } \value{ a parsed output data frame including the additional column 'merged', containing the merged text