You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
60 lines
2.8 KiB
60 lines
2.8 KiB
#' Merges list of lemmas back into a pseudo-document
#'
#' Unnests the `_source.ud` column of an elasticizer-generated data frame,
#' drops tokens that have no lemma, and collapses the remaining tokens into a
#' single space-separated string per `_id`.
#'
#' @param out The elasticizer-generated data frame. The code reads an `_id`
#'   column and a nested `_source.ud` column whose contents provide `lemma`
#'   and `upos` columns.
#' @param text String selecting how the "merged" field is built: "ud" joins
#'   the lemmas, "ud_upos" joins `lemma_UPOS` tokens after dropping tokens
#'   whose `upos` is "PUNCT". Any other value skips the merge step.
#'   NOTE(review): earlier documentation also listed "full" and old-style
#'   "lemmas" (to be deprecated); this function has no branch for either.
#' @param clean Boolean indicating whether the results should be cleaned by
#'   removing words matching regex (see code).
#' @return A data frame with one row per `_id` and a `merged` column holding
#'   the pseudo-document. When `text` is neither "ud" nor "ud_upos", the
#'   unnested token-level data frame, still grouped by `_id`, is returned
#'   unchanged.
#' @export
#' @examples
#' merger(out, text, clean)
#################################################################################################
#################################### Reconstructing documents from lemmas #######################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function(out, text, clean) {
  # Flatten to one row per token. Tokens with a missing lemma are dropped up
  # front; without this filter:
  # 1: "ud" would put the string "NA" in place of the faulty lemma
  # 2: "ud_upos" would turn the entire article into NA, because str_c()
  #    returns NA when any value is NA
  df <- unnest(out, cols = "_source.ud") %>%
    unnest(cols = c("lemma", "upos")) %>%
    filter(!is.na(lemma)) %>%
    group_by(`_id`)

  if (text == "ud_upos") {
    df <- df %>%
      filter(upos != "PUNCT") %>%
      mutate(lem_u = str_c(lemma, upos, sep = "_")) %>%
      summarise(merged = str_c(lem_u, collapse = " "))

    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is not taken into account, as it is already filtered out,
    # see above
    if (clean == TRUE) {
      df <- mutate(
        df,
        merged = str_replace_all(merged, "\\S*?[0-9@#$%]+[^\\s]*", "")
      )
    }
  } else if (text == "ud") {
    df <- df %>%
      summarise(merged = str_c(lemma, collapse = " ")) %>%
      # Drops a space plus "$" and keeps the character that follows, so a
      # token written as " $x" is glued onto the preceding token as "x".
      mutate(merged = str_replace_all(merged, " \\$(.+?)", "\\1"))

    # Regex removes all words consisting of or containing numbers, @#$%
    # Punctuation is only filtered out when not followed by a whitespace
    # character, and when the word contains any of the characters above
    # Regex also used in out_parser
    if (clean == TRUE) {
      df <- mutate(
        df,
        merged = str_replace_all(
          merged,
          "\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*",
          ""
        )
      )
    }

    # Adding extra . at end of string to allow for strings that contain less
    # than 150 words and do not end on ". "
    df <- mutate(df, merged = paste0(merged, ". "))
  }

  df
}
|