You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mamlr/R/feat_select.R

29 lines
1.6 KiB

#' Select features using quanteda textstat_keyness
#'
#' Select features based on the textstat_keyness function and a percentile cutoff.
#' The cutoff is applied to absolute keyness values, i.e. the selection covers both
#' words that are strongly key and words that are strongly *not* key to the topic.
#'
#' @param topic The topic to determine keywords for; it is coerced with as.numeric() and compared with the values of the class_type docvar
#' @param dfm The input dfm
#' @param class_type Name of the column in docvars containing the classification
#' @param percentile Cutoff passed to quantile() as a probability between 0 and 1 (e.g. 0.9); only words whose absolute keyness exceeds that quantile are returned
#' @param measure Measure to use in determining keyness, default = chi2; see textstat_keyness for other options
#' @return A vector of words whose absolute keyness exceeds the percentile cutoff, i.e. words that are strongly key or strongly *not* key to the topic
#' @export
#' @examples
#' \dontrun{
#' feat_select(topic, dfm, class_type, percentile, measure = "chi2")
#' }
#################################################################################################
#################################### Feature selection ##########################################
#################################################################################################
feat_select <- function(topic, dfm, class_type, percentile, measure = "chi2") {
  # Target group: documents whose class_type docvar equals the numeric topic code
  in_topic <- docvars(dfm, class_type) == as.numeric(topic)

  # Score every feature with quanteda's textstat_keyness and drop rows with NAs
  scores <- na.omit(
    textstat_keyness(dfm, measure = measure, target = in_topic)
  )

  # The second column holds the keyness statistic; taking absolute values lets
  # strongly negative features compete with strongly positive ones
  strength <- abs(scores[, 2])

  # Threshold at the requested quantile of the absolute scores
  cutoff <- quantile(as.matrix(strength), percentile)

  # Keep only the features whose absolute keyness lies strictly above the cutoff
  scores$feature[strength > cutoff]
}