dfm_gen, out_parser: updated documentation

dupe_detect: major fix to the function; rownames are no longer used for article IDs
master
Erik de Vries 6 years ago
parent 34531b0da8
commit 1955692346

@ -4,7 +4,7 @@
#' @param out The elasticizer-generated data frame #' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned! #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!
#' @return A Quanteda dfm #' @return A Quanteda dfm
#' @export #' @export
#' @examples #' @examples

@ -33,15 +33,17 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_su
} } }') } } }')
out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost) out <- elasticizer(query, es_pwd = es_pwd, localhost= localhost)
if (class(out$hits$hits) != 'list') { if (class(out$hits$hits) != 'list') {
dfm <- dfm_gen(out, text = "full", words = words) dfm <- dfm_gen(out, text = "full", words = words, clean = T)
if (sum(dfm[1,]) > 0) { if (sum(dfm[1,]) > 0) {
simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine")) simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
diag(simil) <- NA diag(simil) <- NA
df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>% duplicates <- which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)
rownames_to_column("rowid") %>% duplicates <- cbind(duplicates, rowid= rownames(duplicates))
rownames(duplicates) <- seq(1:length(rownames(duplicates)))
df <- as.data.frame(duplicates, make.names = NA) %>%
mutate(colid = colnames(simil)[col]) %>% mutate(colid = colnames(simil)[col]) %>%
.[,c(1,4)] %>% .[,c(3,4)] %>%
group_by(colid) %>% summarise(rowid=list(rowid)) group_by(rowid) %>% summarise(colid=list(colid))
text <- capture.output(stream_out(df)) text <- capture.output(stream_out(df))
# write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T) # write(text[-length(text)], file = paste0(getwd(),'/dupe_objects.json'), append=T)
simil[upper.tri(simil)] <- NA simil[upper.tri(simil)] <- NA

@ -3,7 +3,7 @@
#' Parse raw text into a single field #' Parse raw text into a single field
#' @param out The original output data frame #' @param out The original output data frame
#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]* #' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
#' @return a parsed output data frame including the additional column 'merged', containing the merged text #' @return a parsed output data frame including the additional column 'merged', containing the merged text
#' @examples #' @examples
#' out_parser(out,field) #' out_parser(out,field)

@ -13,7 +13,7 @@ dfm_gen(out, words = "999", text = "lemmas", clean)
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*. Lemmatized output is always cleaned!} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!}
} }
\value{ \value{
A Quanteda dfm A Quanteda dfm

@ -11,7 +11,7 @@ out_parser(out, field, clean = F)
\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text} \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \S*?[0-9@#$%]+[^\s!?.,;:]*} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code)}
} }
\value{ \value{
a parsed output data frame including the additional column 'merged', containing the merged text a parsed output data frame including the additional column 'merged', containing the merged text

Loading…
Cancel
Save