merger: Added custom clean option (sometimes not cleaning is preferred, even with lemmas)
merger, out_parser: Updated the regex that filters out non-words so that it also matches email addresses (tokens containing both @ and .)
#' @param out The elasticizer-generated data frame
#' @param out The elasticizer-generated data frame
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or "ud_upos" (lemmas combined with UPOS tags)
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!
#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
#' @return A Quanteda dfm
#' @return A Quanteda dfm
#' @export
#' @export
#' @examples
#' @examples
@@ -22,9 +22,8 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
out<-out%>%
out<-out%>%
select(`_id`,matches("_source.*"))### Keep only the id and anything belonging to the source field
select(`_id`,matches("_source.*"))### Keep only the id and anything belonging to the source field
@@ -11,9 +11,9 @@ dfm_gen(out, words = "999", text = "lemmas", clean)
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or "ud_upos" (lemmas combined with UPOS tags)}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!}
\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}