dfm_gen & merger: Changed word cutoff point to be a general setting in dfm_gen. Cuts off at the last [.?!] before the cutoff point, so documents end on a sentence boundary and come out slightly shorter than the cutoff.

master
Erik de Vries 6 years ago
parent 4a713ddc23
commit 02b8a8c1da

@@ -16,13 +16,13 @@
 # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
-dfm_gen <- function(out,words = '999', text = "lemmas") {
+dfm_gen <- function(out, words = '999', text = "lemmas") {
   # Create subset with just ids, codes and text
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
   if (text == "lemmas") {
-    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, words = words, out = out, mc.cores = detectCores()))
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, mc.cores = detectCores()))
   }
   if (text == "full") {
     out$merged <- str_c(str_replace_na(out$`_source.title`, replacement = " "),
@@ -35,6 +35,12 @@ dfm_gen <- function(out,words = '999', text = "lemmas") {
       str_replace_all("<.*?>", " ") %>%
       str_replace_all("\\s+"," ")
   }
+  if (words != "999") {
+    ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary
+    # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
+    out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>%
+                                          str_extract('.*[.?!]'))
+  }
   if ('_source.codes.majorTopic' %in% colnames(out)) {
     out <- out %>%
       mutate(codes = as.numeric(case_when(

@@ -6,16 +6,17 @@
 #' @param cutoff_lower Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive)
 #' @param cutoff_upper Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive)
 #' @param es_pwd Password for Elasticsearch read access
+#' @param words Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])
 #' @return dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
 #' @export
 #' @examples
-#' dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd)
+#' dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd, words)
 #################################################################################################
 #################################### Duplicate detector ################################
 #################################################################################################
-dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd) {
+dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd, words) {
   params <- grid[row,]
   print(paste0('Parsing ',params$doctypes,' on ',params$dates ))
   query <- paste0('{"query":
@@ -31,7 +32,7 @@ dupe_detect <- function(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd) {
   out <- elasticizer(query, es_pwd = es_pwd)
-  dfm <- dfm_gen(out, text = "full")
+  dfm <- dfm_gen(out, text = "full", words = words)
   simil <- as.matrix(textstat_simil(dfm, margin="documents", method="cosine"))
   diag(simil) <- NA
   df <- as.data.frame(which(simil >= cutoff_lower & simil <= cutoff_upper, arr.ind = TRUE)) %>%
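
Because words now has no default in dupe_detect, callers must pass the cutoff through explicitly. A usage sketch following the updated @examples line (the grid object, es_pwd value, and the '150' cutoff are illustrative assumptions):

# One grid row, duplicates at cosine similarity >= 0.90, documents cut at ~150 words
dupe_detect(1, grid, cutoff_lower = 0.90, cutoff_upper = 1, es_pwd, words = '150')

# Or across every row of the parameter grid
lapply(seq_len(nrow(grid)), dupe_detect, grid = grid,
       cutoff_lower = 0.90, cutoff_upper = 1, es_pwd = es_pwd, words = '150')

Passing words = '999' keeps the old untruncated behaviour, since dfm_gen only applies the cutoff when words != "999".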

@@ -12,7 +12,7 @@
 #################################### Reconstructing documents from lemmas########################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, words = '999', out = out) {
+merger <- function(row, out = out) {
   df <- out[row,]
   # Merging lemmas into single string
   lemmas <- paste(str_split(df$`_source.tokens.lemmas`, "\\|")[[1]],collapse = ' ')
@@ -22,7 +22,5 @@ merger <- function(row, words = '999', out = out) {
     str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>%
     # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
     paste0(.,". ")
-  if (words != "999") {
-    lemmas <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))}
   return(lemmas)
 }
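
For contrast, a sketch of the behavioural difference between the regex removed here and the new cutoff in dfm_gen: the old pattern ran on to the next sentence boundary after the word limit, while the new code trims back to the last boundary before it (stringr assumed; sample_text and max_words as in the earlier sketch):

library(stringr)

sample_text <- "One two three four. Five six seven eight! Nine ten eleven."
max_words <- 6

# Removed merger behaviour: up to max_words words, then on to the NEXT [.!?]
str_extract(sample_text, str_c("^(([\\s\\S]*? ){0,", max_words, "}[\\s\\S]*?[.!?])\\s+?"))
# [1] "One two three four. Five six seven eight! "   (overshoots the limit)

# New dfm_gen behaviour: cut at max_words words, then back to the LAST [.!?]
str_extract(paste0(str_split(sample_text, '\\s')[[1]][1:max_words], collapse = ' '), '.*[.?!]')
# [1] "One two three four."   (undershoots the limit)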