|
|
|
% Generated by roxygen2: do not edit by hand
|
|
|
|
% Please edit documentation in R/modelizer.R
|
|
|
|
\name{modelizer}
|
|
|
|
\alias{modelizer}
|
|
|
|
\title{Generate a classification model}
|
|
|
|
\usage{
|
|
|
|
modelizer(
|
|
|
|
dfm,
|
|
|
|
outer_k,
|
|
|
|
inner_k,
|
|
|
|
class_type,
|
|
|
|
opt_measure,
|
|
|
|
country,
|
|
|
|
grid,
|
|
|
|
seed,
|
|
|
|
model,
|
|
|
|
we_vectors,
|
|
|
|
cores = 1
|
|
|
|
)
|
|
|
|
}
|
|
|
|
\arguments{
|
|
|
|
\item{dfm}{A quanteda dfm used to train and evaluate the model; it should contain the vector with class labels in its docvars}
|
|
|
|
|
|
|
|
\item{outer_k}{Number of outer cross-validation folds (for performance estimation)}
|
|
|
|
|
|
|
|
\item{inner_k}{Number of inner cross-validation folds (for hyperparameter optimization and feature selection)}
|
|
|
|
|
|
|
|
\item{class_type}{Type of classification to model ("junk", "aggregate", or "codes")}
|
|
|
|
|
|
|
|
\item{opt_measure}{Label of measure in confusion matrix to use as performance indicator}
|
|
|
|
|
|
|
|
\item{country}{Two-letter country abbreviation of the country the model is estimated for (used for filename)}
|
|
|
|
|
|
|
|
\item{grid}{Data frame providing all possible combinations of hyperparameters and feature selection parameters for a given model (grid search)}
|
|
|
|
|
|
|
|
\item{seed}{Integer to use as seed for random number generation, ensures replicability}
|
|
|
|
|
|
|
|
\item{model}{Classification algorithm to use (currently only "nb" for Naïve Bayes using textmodel_nb)}
|
|
|
|
|
|
|
|
\item{we_vectors}{Matrix with word embedding vectors}
|
|
|
|
|
|
|
|
\item{cores}{Number of threads used for parallel processing using future_lapply, defaults to 1}
|
|
|
|
}
|
|
|
|
\value{
|
|
|
|
A list containing all relevant output
|
|
|
|
}
|
|
|
|
\description{
|
|
|
|
Generate a nested cross-validated classification model based on a dfm with class labels as docvars.
|
|
|
|
Currently only supports Naïve Bayes using quanteda's textmodel_nb.
|
|
|
|
Hyperparameter optimization is enabled through the grid parameter.
|
|
|
|
A grid should be generated from vectors with the labels as described for each model, using the crossing() command.
|
|
|
|
For Naïve Bayes, the following parameters can be used:
|
|
|
|
- percentiles (cutoff point for tf-idf feature selection)
|
|
|
|
- measures (what measure to use for determining feature importance, see textstat_keyness for options)
|
|
|
|
}
|
|
|
|
\examples{
|
|
|
|
modelizer(dfm, outer_k, inner_k, class_type, opt_measure, country, grid, seed, model, we_vectors, cores = 1)
|
|
|
|
}
|