out_parser: added option to clean output using regex to remove numbers and non-words

dfm_gen, ud_update: updated functions to make use of out_parser cleaning option
merger: updated regex for cleaning lemmatized output
master
Erik de Vries 6 years ago
parent 5851c56369
commit 34531b0da8

@ -4,6 +4,7 @@
#' @param out The elasticizer-generated data frame #' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud" #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \\S*?[0-9@#$\%]+[^\\s!?.,;:]*. Lemmatized output is always cleaned!
#' @return A Quanteda dfm #' @return A Quanteda dfm
#' @export #' @export
#' @examples #' @examples
@ -16,16 +17,17 @@
# filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack # filter(`_source.codes.timeSpent` != -1) %>% ### Exclude Norwegian summer sample hack
dfm_gen <- function(out, words = '999', text = "lemmas") { dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
# Create subset with just ids, codes and text # Create subset with just ids, codes and text
out <- out %>% out <- out %>%
select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
fields <- length(names(out)) fields <- length(names(out))
if (text == "lemmas" || text == 'ud') { if (text == "lemmas" || text == 'ud') {
out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores())) out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
} }
if (text == "full") { if (text == "full") {
out <- out_parser(out, field = '_source') out <- out_parser(out, field = '_source' , clean = clean)
} }
if ('_source.codes.majorTopic' %in% colnames(out)) { if ('_source.codes.majorTopic' %in% colnames(out)) {
out <- out %>% out <- out %>%

@ -25,7 +25,7 @@ merger <- function(row, out, text) {
# Replacing $-marked punctuation with their regular forms # Replacing $-marked punctuation with their regular forms
lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>% lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
### Removing numbers and non-words containing numbers ### Removing numbers and non-words containing numbers
str_replace_all("\\S*?[0-9@#]+(\\S*?)([:;.,?!\\s])+?", "\\2") %>% str_replace_all("\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") %>%
# Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". " # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
paste0(.,". ") paste0(.,". ")
return(lemmas) return(lemmas)

@ -3,6 +3,7 @@
#' Parse raw text into a single field #' Parse raw text into a single field
#' @param out The original output data frame #' @param out The original output data frame
#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text #' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching the regex \\S*?[0-9@#$\%]+[^\\s!?.,;:]*
#' @return a parsed output data frame including the additional column 'merged', containing the merged text #' @return a parsed output data frame including the additional column 'merged', containing the merged text
#' @examples #' @examples
#' out_parser(out,field) #' out_parser(out,field)
@ -10,7 +11,7 @@
################################################################################################# #################################################################################################
#################################### Parser function for output fields ########################## #################################### Parser function for output fields ##########################
################################################################################################# #################################################################################################
out_parser <- function(out, field) { out_parser <- function(out, field, clean = F) {
fncols <- function(data, cname) { fncols <- function(data, cname) {
add <-cname[!cname%in%names(data)] add <-cname[!cname%in%names(data)]
@ -62,6 +63,7 @@ out_parser <- function(out, field) {
### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
# Remove html tags, and multiple consequent whitespaces # Remove html tags, and multiple consequent whitespaces
out$merged <- out$merged %>% out$merged <- out$merged %>%
{if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") else . } %>%
str_replace_all("<.{0,20}?>", " ") %>% str_replace_all("<.{0,20}?>", " ") %>%
str_replace_all('(\\. ){2,}', '. ') %>% str_replace_all('(\\. ){2,}', '. ') %>%
str_replace_all('([!?.])\\.','\\1') %>% str_replace_all('([!?.])\\.','\\1') %>%

@ -20,7 +20,7 @@
# } # }
ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) { ud_update <- function(out, localhost = T, udmodel, es_super = .rs.askForPassword("ElasticSearch WRITE"), cores = detectCores(), ver) {
out <- out_parser(out, field = '_source') out <- out_parser(out, field = '_source', clean = F)
par_proc <- function(row, out, udmodel) { par_proc <- function(row, out, udmodel) {
doc <- out[row,] doc <- out[row,]
ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>% ud <- as.data.frame(udpipe_annotate(udmodel, x = doc$merged, parser = "default", doc_id = doc$`_id`)) %>%

@ -4,7 +4,7 @@
\alias{dfm_gen} \alias{dfm_gen}
\title{Generates dfm from ElasticSearch output} \title{Generates dfm from ElasticSearch output}
\usage{ \usage{
dfm_gen(out, words = "999", text = "lemmas") dfm_gen(out, words = "999", text = "lemmas", clean)
} }
\arguments{ \arguments{
\item{out}{The elasticizer-generated data frame} \item{out}{The elasticizer-generated data frame}
@ -12,6 +12,8 @@ dfm_gen(out, words = "999", text = "lemmas")
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document} \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"} \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \\S*?[0-9@#$\%]+[^\\s!?.,;:]*. Lemmatized output is always cleaned!} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \\S*?[0-9@#$\%]+[^\\s!?.,;:]*. Lemmatized output is always cleaned!}
} }
\value{ \value{
A Quanteda dfm A Quanteda dfm

@ -4,12 +4,14 @@
\alias{out_parser} \alias{out_parser}
\title{Parse raw text into a single field} \title{Parse raw text into a single field}
\usage{ \usage{
out_parser(out, field) out_parser(out, field, clean = F)
} }
\arguments{ \arguments{
\item{out}{The original output data frame} \item{out}{The original output data frame}
\item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text} \item{field}{Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \\S*?[0-9@#$\%]+[^\\s!?.,;:]*} \item{clean}{Boolean indicating whether the results should be cleaned by removing words matching the regex \\S*?[0-9@#$\%]+[^\\s!?.,;:]*}
} }
\value{ \value{
a parsed output data frame including the additional column 'merged', containing the merged text a parsed output data frame including the additional column 'merged', containing the merged text

Loading…
Cancel
Save