# mamlr/R/out_parser.R

#' Parse raw text into a single field
#'
#' Parse raw text into a single field
#' @param out The original output data frame
#' @param field Either 'highlight' or '_source', for parsing of the highlighted search result text, or the original source text
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code)
#' @param cores Number of cores to use for parallel processing, defaults to detectCores() (all cores available)
#' @return a parsed output data frame including the additional column 'merged', containing the merged text
#' @examples
#' out_parser(out,field)

#################################################################################################
#################################### Parser function for output fields ##########################
#################################################################################################
out_parser <- function(out, field, clean = F, cores = detectCores()) {
  fncols <- function(data, cname) {
    add <-cname[!cname%in%names(data)]

    if(length(add)!=0) data[add] <- NA
    data
  }

  out <- fncols(out, c("highlight.text","highlight.title","highlight.teaser", "highlight.subtitle", "highlight.preteaser", '_source.text', '_source.title','_source.teaser','_source.subtitle','_source.preteaser'))
  par_parser <- function(row, out, field, clean) {
    doc <- out[row,]
    if (field == 'highlight') {
      doc <- replace(doc, doc=="NULL", NA)
      ### Replacing empty highlights with source text (to have the exact same text for udpipe to process)
      doc$highlight.title[is.na(doc$highlight.title)] <- doc$`_source.title`[is.na(doc$highlight.title)]
      doc$highlight.text[is.na(doc$highlight.text)] <- doc$`_source.text`[is.na(doc$highlight.text)]
      doc$highlight.teaser[is.na(doc$highlight.teaser)] <- doc$`_source.teaser`[is.na(doc$highlight.teaser)]
      doc$highlight.subtitle[is.na(doc$highlight.subtitle)] <- doc$`_source.subtitle`[is.na(doc$highlight.subtitle)]
      doc$highlight.preteaser[is.na(doc$highlight.preteaser)] <- doc$`_source.preteaser`[is.na(doc$highlight.preteaser)]

      doc <- doc %>%
        mutate(highlight.title = str_replace_na(highlight.title, replacement = '')) %>%
        mutate(highlight.subtitle = str_replace_na(highlight.subtitle, replacement = '')) %>%
        mutate(highlight.preteaser = str_replace_na(highlight.preteaser, replacement = '')) %>%
        mutate(highlight.teaser = str_replace_na(highlight.teaser, replacement = '')) %>%
        mutate(highlight.text = str_replace_na(highlight.text, replacement = ''))
      doc$merged <- str_c(doc$highlight.title,
                          doc$highlight.subtitle,
                          doc$highlight.preteaser,
                          doc$highlight.teaser,
                          doc$highlight.text,
                          '',
                          sep = ". ")
    }

    if (field == '_source') {
      doc <- doc %>%
        mutate(`_source.title` = str_replace_na(`_source.title`, replacement = '')) %>%
        mutate(`_source.subtitle` = str_replace_na(`_source.subtitle`, replacement = '')) %>%
        mutate(`_source.preteaser` = str_replace_na(`_source.preteaser`, replacement = '')) %>%
        mutate(`_source.teaser` = str_replace_na(`_source.teaser`, replacement = '')) %>%
        mutate(`_source.text` = str_replace_na(`_source.text`, replacement = ''))
      doc$merged <- str_c(doc$`_source.title`,
                          doc$`_source.subtitle`,
                          doc$`_source.preteaser`,
                          doc$`_source.teaser`,
                          doc$`_source.text`,
                          '',
                          sep = ". ")
    }

    ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
    # Remove html tags, and multiple consequent whitespaces
    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is only filtered out when not followed by a whitespace character, and when the word contains any of the characters above
    # Regex also used in merger function
    ### Old regex, used for duplicate detection:
    # \\S*?[0-9@#$%]+[^\\s!?.,;:]*
    doc$merged <- doc$merged %>%
      str_replace_all("<.{0,20}?>", " ") %>%
      str_replace_all('(\\. ){2,}', '. ') %>%
      str_replace_all('([!?.])\\.','\\1') %>%
      str_replace_all("\\s+"," ") %>%
      {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . }
    return(doc)
  }
  if (Sys.info()[['sysname']] == "Windows") {
    cores <- 1
  } else {
    cores <- cores
  }
  out <- bind_rows(mclapply(seq(1,length(out[[1]]),1), par_parser, out = out, clean = clean, field = field, mc.cores = cores))
}
# Change history for this file (recovered from version-control annotations,
# all roughly 6 years old):
# - actorizer, dfm_gen, ud_update: unified output parsing from _source and
#   highlight fields into a single function (out_parser).
#   out_parser: function to parse raw text output into a single field, either
#   from _source or highlight fields.
#   dupe_detect: updated function to use 'ver' parameter for versioning.
# - dfm_gen, out_parser: updated documentation.
#   dupe_detect: major fix to function, no longer using rownames for article
#   ids.
# - actorizer, dfm_gen, modelizer, out_parser: replaced all instances of
#   detectCores by cores parameter (which defaults to detectCores).
# - out_parser: parallelized when not on Windows.