diff --git a/R/dfm_gen.R b/R/dfm_gen.R
index 1eb9bd9..3b9ff2e 100644
--- a/R/dfm_gen.R
+++ b/R/dfm_gen.R
@@ -3,8 +3,8 @@
 #' Generates dfm from ElasticSearch output
 #' @param out The elasticizer-generated data frame
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
-#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
-#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!
+#' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or "ud_upos" (lemmas combined with their UPOS tags)
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
 #' @return A Quanteda dfm
 #' @export
 #' @examples
@@ -22,9 +22,8 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
   out <- out %>%
     select(`_id`, matches("_source.*")) ### Keep only the id and anything belonging to the source field
   fields <- length(names(out))
-  if (text == "lemmas" || text == 'ud') {
-    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, mc.cores = detectCores()))
-
+  if (text == "lemmas" || text == 'ud' || text == 'ud_upos') {
+    out$merged <- unlist(mclapply(seq(1,length(out[[1]]),1),merger, out = out, text = text, clean = clean, mc.cores = detectCores()))
   }
   if (text == "full") {
     out <- out_parser(out, field = '_source' , clean = clean)
@@ -57,8 +56,11 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean) {
   if (words != "999") {
     ### Former word count regex, includes words up until the next sentence boundary, instead of cutting to the last sentence boundary
     # out$merged2 <- str_extract(lemmas, str_c("^(([\\s\\S]*? ){0,",words,"}[\\s\\S]*?[.!?])\\s+?"))
-    out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' ') %>%
-                                          str_extract('.*[.?!]'))
+    out <- out %>% rowwise() %>% mutate(merged = paste0(str_split(merged, '\\s')[[1]][1:words], collapse = ' '))
+
+    if(text != 'ud_upos') {
+      out$merged <- str_extract(out$merged,'.*[.?!]')
+    }
   }
   dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
     dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
diff --git a/R/merger.R b/R/merger.R
index a8f8706..440c462 100644
--- a/R/merger.R
+++ b/R/merger.R
@@ -5,6 +5,7 @@
 #' @param words String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document
 #' @param out The elasticizer-generated data frame
 #' @param text String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"
+#' @param clean Boolean indicating whether the results should be cleaned by removing words matching regex (see code).
 #' @return A documentified string of lemmas, one document at a time
 #' @export
 #' @examples
@@ -13,7 +14,7 @@
 #################################### Reconstructing documents from lemmas########################
 #################################################################################################
 ## Only merging lemmas for now, feature selection has no impact on junk classification
-merger <- function(row, out, text) {
+merger <- function(row, out, text, clean) {
   df <- out[row,]
   # Mergin lemmas into single string
   if (text == 'lemmas') {
@@ -22,10 +23,20 @@ merger <- function(row, out, text) {
   if (text == 'ud') {
     lemmas <- paste0(df$`_source.ud`[[1]]$lemma[[1]], collapse = ' ')
   }
+  if (text == 'ud_upos') {
+    df <- unnest(df,`_source.ud`)
+    lemmas <- str_c(unlist(df$lemma)[which(unlist(df$upos) != 'PUNCT')], unlist(df$upos)[which(unlist(df$upos) != 'PUNCT')], sep = '_', collapse = ' ') %>%
+      # Regex removes all words consisting of or containing digits or any of the characters @#$%
+      # Punctuation needs no special handling here: tokens tagged PUNCT were already dropped above
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s]*", "")  else . }
+    return(lemmas)
+  }
   # Replacing $-marked punctuation with their regular forms
   lemmas <- str_replace_all(lemmas," \\$(.+?)", "\\1") %>%
-    ### Removing numbers and non-words containing numbers
-    str_replace_all("\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "") %>%
+    # Regex removes all words consisting of or containing digits or any of the characters @#$%
+    # Punctuation is removed only when it is part of such a word and directly followed by a non-whitespace character, so trailing punctuation is kept
+    # Regex also used in out_parser
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . } %>%
     # Adding extra . at end of string to allow for strings that contain less than 150 words and do not end on ". "
     paste0(.,". ")
   return(lemmas)
diff --git a/R/out_parser.R b/R/out_parser.R
index eee720b..a93e3ad 100644
--- a/R/out_parser.R
+++ b/R/out_parser.R
@@ -62,8 +62,11 @@ out_parser <- function(out, field, clean = F) {
 
   ### Use correct interpunction, by inserting a '. ' at the end of every text field, then removing any duplicate occurences
   # Remove html tags, and multiple consequent whitespaces
+  # Regex removes all words consisting of or containing digits or any of the characters @#$%
+  # Punctuation is removed only when it is part of such a word and directly followed by a non-whitespace character, so trailing punctuation is kept
+  # Regex also used in merger function
   out$merged <- out$merged %>%
-    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+[^\\s!?.,;:]*", "")  else . } %>%
+    {if(clean == T) str_replace_all(.,"\\S*?[0-9@#$%]+([^\\s!?.,;:]|[!?.,:;]\\S)*", "")  else . } %>%
     str_replace_all("<.{0,20}?>", " ") %>%
     str_replace_all('(\\. ){2,}', '. ') %>%
     str_replace_all('([!?.])\\.','\\1') %>%
diff --git a/man/dfm_gen.Rd b/man/dfm_gen.Rd
index 4bed478..5ed3679 100644
--- a/man/dfm_gen.Rd
+++ b/man/dfm_gen.Rd
@@ -11,9 +11,9 @@ dfm_gen(out, words = "999", text = "lemmas", clean)
 
 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
 
-\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
+\item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud", or "ud_upos" (lemmas combined with their UPOS tags)}
 
-\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code). Lemmatized output is always cleaned!}
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
 }
 \value{
 A Quanteda dfm
diff --git a/man/merger.Rd b/man/merger.Rd
index 4efe147..85f280a 100644
--- a/man/merger.Rd
+++ b/man/merger.Rd
@@ -4,7 +4,7 @@
 \alias{merger}
 \title{Merges list of lemmas back into a pseudo-document}
 \usage{
-merger(row, out, text)
+merger(row, out, text, clean)
 }
 \arguments{
 \item{row}{A row number form the Elasticizer-generated data frame}
@@ -13,6 +13,8 @@ merger(row, out, text)
 
 \item{text}{String indicating whether the "merged" field will contain the "full" text, old-style "lemmas" (will be deprecated), new-style "ud"}
 
+\item{clean}{Boolean indicating whether the results should be cleaned by removing words matching regex (see code).}
+
 \item{words}{String indicating the number of words to keep from each document (maximum document length), 999 indicates the whole document}
 }
 \value{