You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mamlr/R/merger.R

60 lines
2.8 KiB

# Reconstructing documents from lemmas ----

#' Merge lemmas back into one pseudo-document per article
#'
#' Unnests the \code{_source.ud} field into one row per token, drops tokens
#' without a lemma, and collapses the remaining tokens of every \code{_id}
#' into a single space-separated string.
#'
#' @param out The elasticizer-generated data frame. The code reads the columns
#'   \code{_id} and \code{_source.ud}; the latter must unnest to the columns
#'   \code{lemma} and \code{upos}.
#' @param text String selecting the output format. "ud" joins the lemmas,
#'   removes every space-plus-dollar-sign that precedes another character (so
#'   that what follows is glued onto the preceding token) and appends ". " to
#'   each document. "ud_upos" drops tokens tagged PUNCT and joins the remaining
#'   tokens as lemma_upos pairs. For any other value no merging takes place and
#'   the token-level data frame, grouped by \code{_id}, is returned as is.
#' @param clean Boolean indicating whether the results should be cleaned by
#'   removing words that consist of or contain digits or one of the symbols
#'   at sign, hash, dollar sign or percent sign (see the regular expressions in
#'   the code).
#' @return For "ud" and "ud_upos": a data frame with one row per \code{_id} and
#'   the pseudo-document in the column \code{merged}.
#' @export
#' @examples
#' \dontrun{
#' merger(out, text = "ud", clean = TRUE)
#' }
merger <- function(out, text, clean) {
  # Only merging lemmas for now, feature selection has no impact on junk
  # classification.

  # One row per token. Tokens with a missing lemma are dropped up front:
  # 1: with "ud" they would otherwise show up as the literal string "NA"
  # 2: with "ud_upos" they would turn the entire article into NA, because
  #    str_c() returns NA when any value is NA
  df <- out %>%
    unnest(cols = "_source.ud") %>%
    unnest(cols = c("lemma", "upos")) %>%
    filter(!is.na(lemma)) %>%
    group_by(`_id`)

  if (text == "ud_upos") {
    df <- df %>%
      filter(upos != "PUNCT") %>%
      mutate(lem_u = str_c(lemma, upos, sep = "_")) %>%
      summarise(merged = str_c(lem_u, collapse = " "))
    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is not taken into account, as it is already filtered out,
    # see above
    if (clean == TRUE) {
      df <- df %>%
        mutate(
          merged = str_replace_all(merged, "\\S*?[0-9@#$%]+[^\\s]*", "")
        )
    }
  } else if (text == "ud") {
    df <- df %>%
      summarise(merged = str_c(lemma, collapse = " ")) %>%
      # Drop " $" in front of any character, attaching what follows to the
      # preceding token (presumably "$"-prefixed lemmas mark punctuation;
      # confirm against the UD output)
      mutate(merged = str_replace_all(merged, " \\$(.+?)", "\\1"))
    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is only filtered out when not followed by a whitespace
    # character, and when the word contains any of the characters above
    # Regex also used in out_parser
    if (clean == TRUE) {
      df <- df %>%
        mutate(
          merged = str_replace_all(
            merged,
            "\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*",
            ""
          )
        )
    }
    # Adding extra . at end of string to allow for strings that contain less
    # than 150 words and do not end on ". "
    df <- df %>%
      mutate(merged = paste0(merged, ". "))
  }

  # Any other value of text falls through: the grouped token-level data frame
  # is returned without a merged column.
  df
}