diff --git a/R/dfm_gen.R b/R/dfm_gen.R index fee5173..bc603e7 100644 --- a/R/dfm_gen.R +++ b/R/dfm_gen.R @@ -16,13 +16,13 @@ # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack -dfm_gen <- function(out,words = '999', text = "lemmas") { +dfm_gen <- function(out, words = '999', text = "lemmas") { # Create subset with just ids, codes and text out <- out %>% select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field fields <- length(names(out)) if (text == "lemmas") { - out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores())) + out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, mc.cores = detectCores())) } if (text == "full") { out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "), @@ -35,6 +35,12 @@ dfm_gen <- function(out,words = '999', text = "lemmas") { str_replace_all("<.*?>", " ") %>% str_replace_all("\\s+"," ") } + if (words != "999") { + ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary + # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? 
){0,",words,"}[\\s\\S]*?[.!?])\\s+?")) + out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>% + str_extract('.*[.?!]')) + } if ('_source.codes.majorTopic' %in% colnames(out)) { out <- out %>% mutate(codes = as.numeric(case_when( diff --git a/R/dupe_detect.R b/R/dupe_detect.R index cf01a0f..4010ed8 100644 --- a/R/dupe_detect.R +++ b/R/dupe_detect.R @@ -6,16 +6,17 @@ #' @param cutoff_lower Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive) #' @param cutoff_upper Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive) #' @param es_pwd Password for Elasticsearch read access +#' @param words Document cutoff point in number of words. Each document is truncated to its first `words` whitespace-separated words and then cut back to the last sentence-ending character (`.`, `?` or `!`) within that span, so the result contains at most `words` words #' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. 
Files are in current working directory #' @export #' @examples -#' dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd) +#' dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, words) ################################################################################################# #################################### Duplicate detector ################################ ################################################################################################# -dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd) { +dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words) { params <- grid[row,] print(paste0('Parsing ',params$doctypes,' on ',params$dates )) query <- paste0('{"query": @@ -31,7 +32,7 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd) { out <- elasticizer(query, es_pwd = es_pwd) - dfm <- dfm_gen(out, text = "full") + dfm <- dfm_gen(out, text = "full", words = words) simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine")) diag(simil) <- NA df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>% diff --git a/R/merger.R b/R/merger.R index 8d17318..975d32a 100644 --- a/R/merger.R +++ b/R/merger.R @@ -12,7 +12,7 @@ #################################### Reconstructing documents from lemmas######################## ################################################################################################# ## Only merging lemmas for now, feature selection has no impact on junk classification -merger <- function(row, words = '999', out = out) { +merger <- function(row, out = out) { df <- out[row,] # Mergin lemmas into single string lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ') @@ -22,7 +22,5 @@ merger <- function(row, words = '999', out = out) { str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>% # Adding extra . 
at end of string to allow for strings that contain less than 150 words and do not end on ". " paste0(.,". ") - if (words != "999") { - lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))} return(lemmas) -} \ No newline at end of file +}