class_update: check if there are idf values associated with model, before applying weights
estimator: make use of preproc() function for data preprocessing
preproc: function containing all logic with regards to text data preprocessing and weighting
master
parent
a3b6e19646
commit
9eae486a80
@ -0,0 +1,42 @@
|
|||||||
|
#' Preprocess dfm data for use in modeling procedure
#'
#' Process dfm according to parameters provided in params: features that do
#' not occur in the training dfm are dropped, inverse document frequency
#' weights estimated on the training dfm are optionally applied to both dfms,
#' and the training dfm is optionally reduced to the features returned by
#' \code{feat_select} for one or more classes.
#'
#' @param dfm_train Training dfm
#' @param dfm_test Testing dfm if applicable, otherwise NULL
#' @param params Row from grid with parameter optimization. \code{tfidf} is
#'   always read; \code{class_type}, \code{feat_percentiles} and
#'   \code{feat_measures} are read only when both \code{feat_percentiles} and
#'   \code{feat_measures} are present as columns.
#' @return List with \code{dfm_train} and \code{dfm_test}, processed according
#'   to parameters in params, and \code{idf}: the inverse document frequencies
#'   estimated on the training dfm, or NULL when tf-idf weighting was not
#'   requested.
#' @export
#' @examples
#' \dontrun{
#' preproc(dfm_train, dfm_test = NULL, params)
#' }
#################################################################################################
#################################### Preprocess data ############################################
#################################################################################################
preproc <- function(dfm_train, dfm_test = NULL, params) {
  # Remove non-existing features from training dfm
  dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)

  # idf stays NULL unless tf-idf weighting is requested
  idf <- NULL
  if (params$tfidf) {
    # Inverse document frequencies are estimated on the training dfm only and
    # the same weights are then applied to the test dfm (when one is supplied)
    idf <- docfreq(
      dfm_train,
      scheme = "inverse",
      base = 10,
      smoothing = 0,
      k = 0,
      threshold = 0
    )
    dfm_train <- dfm_weight(dfm_train, weights = idf)
    if (!is.null(dfm_test)) {
      dfm_test <- dfm_weight(dfm_test, weights = idf)
    }
  }

  # Feature selection only runs when the grid row carries both settings
  has_feat_params <- "feat_percentiles" %in% colnames(params) &&
    "feat_measures" %in% colnames(params)

  if (has_feat_params) {
    # Keeping unique words that are important to one or more categories
    # (see textstat_keyness and feat_select)
    classes <- unique(docvars(dfm_train, params$class_type))
    words <- unique(unlist(lapply(
      classes,
      feat_select,
      dfm = dfm_train,
      class_type = params$class_type,
      percentile = params$feat_percentiles,
      measure = params$feat_measures
    )))

    # Only the training dfm is reduced here; dfm_test is returned with its
    # feature set unchanged
    dfm_train <- dfm_keep(
      dfm_train,
      words,
      valuetype = "fixed",
      verbose = FALSE
    )
  }

  list(dfm_train = dfm_train, dfm_test = dfm_test, idf = idf)
}
|
@ -0,0 +1,24 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/preproc.R
|
||||||
|
\name{preproc}
|
||||||
|
\alias{preproc}
|
||||||
|
\title{Preprocess dfm data for use in modeling procedure}
|
||||||
|
\usage{
|
||||||
|
preproc(dfm_train, dfm_test = NULL, params)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{dfm_train}{Training dfm}
|
||||||
|
|
||||||
|
\item{dfm_test}{Testing dfm if applicable, otherwise NULL}
|
||||||
|
|
||||||
|
\item{params}{Row from grid with parameter optimization}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
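List with dfm_train and dfm_test, processed according to parameters in params, and idf (the inverse document frequencies estimated on the training dfm, or NULL when tf-idf weighting is not applied)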
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Process dfm according to parameters provided in params
|
||||||
|
}
|
||||||
|
\examples{
|
||||||
|
preproc(dfm_train, dfm_test = NULL, params)
|
||||||
|
}
|
Loading…
Reference in new issue