@@ -4,18 +4,24 @@
#' @param row A row number from the Elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param out The elasticizer-generated data frame
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), or new-style "ud"
#' @return A documentified string of lemmas, one document at a time
#' @export
#' @examples
#' merger(1, words = '999', out = out)
#' merger(1, words = '999', out, text)
#################################################################################################
#################################### Reconstructing documents from lemmas########################
#################################################################################################
## Only merging lemmas for now, feature selection has no impact on junk classification
merger <- function ( row , out = ou t) {
merger <- function ( row , out , tex t) {
df <- out [row , ]
# Merging lemmas into a single string
lemmas <- paste ( str_split ( df $ `_source.tokens.lemmas` , " \\|" ) [ [1 ] ] , collapse = ' ' )
if ( text == ' lemmas' ) {
lemmas <- paste ( str_split ( df $ `_source.tokens.lemmas` , " \\|" ) [ [1 ] ] , collapse = ' ' )
}
if ( text == ' ud' ) {
lemmas <- paste0 ( df $ `_source.ud` [ [1 ] ] $ lemma [ [1 ] ] , collapse = ' ' )
}
# Replacing $-marked punctuation with their regular forms
lemmas <- str_replace_all ( lemmas , " \\$(.+?)" , " \\1" ) %>%
### Removing numbers and non-words containing numbers