dfm_gen & merger: Changed word cutoff point to be a general setting in dfm_gen. Cuts off at the last [.?!] before the cutoff point, so documents end on a sentence boundary and come out slightly shorter than the cutoff.

master
Erik de Vries 6 years ago
parent 4a713ddc23
commit 02b8a8c1da

@@ -16,13 +16,13 @@
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
-dfm_gen <- function(out,words = '999', text = "lemmas") {
+dfm_gen <- function(out, words = '999', text = "lemmas") {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
   if (text == "lemmas") {
-    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, mc.cores = detectCores()))
   }
   if (text == "full") {
     out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
@@ -35,6 +35,12 @@ dfm_gen <- function(out,words = '999', text = "lemmas") {
       str_replace_all("<.*?>", " ") %>%
       str_replace_all("\\s+"," ")
   }
+  if (words != "999") {
+    ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary
+    # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
+    out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>%
+                                          str_extract('.*[.?!]'))
+  }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
       mutate(codes = as.numeric(case_when(

@@ -6,16 +6,17 @@
 #' @param cutoff_lower Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive)
 #' @param cutoff_upper Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive)
 #' @param es_pwd Password for Elasticsearch read access
+#' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])
 #' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
 #' @export
 #' @examples
-#' dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd)
+#' dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, words)
 #################################################################################################
 #################################### Duplicate detector ################################
 #################################################################################################
-dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd) {
+dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words) {
   params <- grid[row,]
   print(paste0('Parsing ',params$doctypes,' on ',params$dates ))
   query <- paste0('{"query":
@@ -31,7 +32,7 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd) {
   out <- elasticizer(query, es_pwd = es_pwd)
-  dfm <- dfm_gen(out, text = "full")
+  dfm <- dfm_gen(out, text = "full", words = words)
   simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
   diag(simil) <- NA
   df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
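
Because words now has no default in dupe_detect, callers must pass the cutoff through explicitly. A usage sketch following the updated @examples line (the grid object, es_pwd value, and the '150' cutoff are illustrative assumptions):

# One grid row, duplicates at cosine similarity >= 0.90, documents cut at ~150 words
dupe_detect(1, grid, cutoff_lower = 0.90, cutoff_upper = 1, es_pwd, words = '150')

# Or across every row of the parameter grid
lapply(seq_len(nrow(grid)), dupe_detect, grid = grid,
       cutoff_lower = 0.90, cutoff_upper = 1, es_pwd = es_pwd, words = '150')

Passing words = '999' keeps the old untruncated behaviour, since dfm_gen only applies the cutoff when words != "999".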

@@ -12,7 +12,7 @@
 #################################### Reconstructing documents from lemmas########################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, words = '999', out = out) {
+merger <- function(row, out = out) {
   df <- out[row,]
   # Merging lemmas into single string
   lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
@@ -22,7 +22,5 @@ merger <- function(row, words = '999', out = out) {
     str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
     # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
     paste0(.,". ")
-  if (words != "999") {
-    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))}
   return(lemmas)
 }
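
For contrast, a sketch of the behavioural difference between the regex removed here and the new cutoff in dfm_gen: the old pattern ran on to the next sentence boundary after the word limit, while the new code trims back to the last boundary before it (stringr assumed; sample_text and max_words as in the earlier sketch):

library(stringr)

sample_text <- "One two three four. Five six seven eight! Nine ten eleven."
max_words <- 6

# Removed merger behaviour: up to max_words words, then on to the NEXT [.!?]
str_extract(sample_text, str_c("^(([\\s\\S]*? ){0,", max_words, "}[\\s\\S]*?[.!?])\\s+?"))
# [1] "One two three four. Five six seven eight! "   (overshoots the limit)

# New dfm_gen behaviour: cut at max_words words, then back to the LAST [.!?]
str_extract(paste0(str_split(sample_text, '\\s')[[1]][1:max_words], collapse = ' '), '.*[.?!]')
# [1] "One two three four."   (undershoots the limit)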