modelizer: updated the tf-idf application; the final model now also includes the idf values from the training set; the positive category is now set explicitly for confusion matrices in binary classification; minor code fixes

dfm_gen: added old junk codes to the recoding step and removed the deprecated ngrams parameter from the dfm() call
class_update: removed the dfm_words parameter, which is superseded by force = T in predict(); the idf from the training set (stored with the model) is now applied to unseen data
DESCRIPTION: added quanteda.textmodels as a new dependency, since the text models have been split off from base quanteda as of version 2.0.0
master
Your Name 5 years ago
parent 06bfec71bc
commit d9f936c566
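
In practice, the change means the final model object now travels together with the idf vector computed on its training data, and unseen documents get exactly that weighting before prediction. A minimal sketch of the intended flow, using made-up documents and assuming a Naive Bayes text model purely for illustration (the model family fitted by modelizer() is not shown in this diff):

```r
library(quanteda)
library(quanteda.textmodels)

# Hypothetical stand-in for the dfm produced by dfm_gen() on training data
train_dfm <- dfm(tokens(c(d1 = "economy budget tax growth",
                          d2 = "football match goal referee")))
idf <- docfreq(train_dfm, scheme = "inverse", base = 10)          # training-set idf
text_model <- textmodel_nb(dfm_weight(train_dfm, weights = idf),  # tf-idf weighted training dfm
                           y = c(1, 0))

# What the final modeling run now returns: the fitted model plus its training idf
model_final <- list(text_model = text_model, idf = idf)

# class_update(): weight unseen documents with the *training* idf, then let
# force = TRUE align the feature set with the one seen during fitting
new_dfm <- dfm(tokens(c(n1 = "budget deficit tax economy")))
new_dfm <- dfm_weight(new_dfm, weights = model_final$idf)  # quanteda may warn about idf entries with no matching feature
predict(model_final$text_model, newdata = new_dfm, type = "class", force = TRUE)
```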

@@ -9,6 +9,7 @@ Depends: R (>= 3.3.1),
parallel,
tidyverse,
quanteda,
quanteda.textmodels,
httr,
caret,
e1071,
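
For reference, the textmodel_*() family (e.g. textmodel_nb(), textmodel_svm()) was moved out of quanteda into quanteda.textmodels with the 2.0 release, so the extra Depends entry is what keeps those functions available:

```r
# Pre-2.0 these were exported by quanteda itself; now they come from quanteda.textmodels
library(quanteda.textmodels)
exists("textmodel_nb")   # TRUE once quanteda.textmodels is attached
exists("textmodel_svm")  # TRUE
```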

@@ -18,11 +18,11 @@
#################################################################################################
#################################### Update any kind of classification ##########################
#################################################################################################
class_update <- function(out, localhost = T, model_final, dfm_words, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
print('updating')
dfm <- dfm_gen(out, text = text, words = words, clean = clean) %>%
dfm_keep(dfm_words, valuetype="fixed", verbose=T)
pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm))
dfm_weight(weights = model_final$idf)
pred <- data.frame(id = out$`_id`, pred = predict(model_final$text_model, newdata = dfm, type = "class", force = T))
bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set', ver = ver)
res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
}

@@ -36,6 +36,9 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1) {
.$codes == 92 ~ 1,
.$codes == 91 ~ 1,
.$codes == 93 ~ 1,
.$codes == 2301 ~ 1,
.$codes == 3101 ~ 1,
.$codes == 34 ~ 1,
TRUE ~ 0
)
) %>%
@@ -59,6 +62,6 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1) {
}
}
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex")
return(dfm)
}
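
The dropped ngrams argument was deprecated in dfm(); with ngrams = 1 its removal changes nothing, and if n-grams were ever needed again the idiomatic route is tokens_ngrams() before dfm(). A small sketch with a made-up sentence:

```r
library(quanteda)
toks <- tokens("a short example sentence for the document feature matrix", remove_punct = TRUE)
dfm(toks)                        # unigrams, equivalent to the old ngrams = 1
dfm(tokens_ngrams(toks, n = 2))  # bigrams would now be built on the tokens object first
```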

@@ -31,7 +31,7 @@
modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
### Functions ###
feat_select <- function (topic, dfm, class_type, percentile,measure) {
keyness <- textstat_keyness(dfm, measure = measure, docvars(dfm, class_type) == as.numeric(topic)) %>%
keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
na.omit()
keyness[,2] <- abs(keyness[,2])
keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
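
Naming target = only makes explicit what the old call left to positional matching: the logical vector that selects the target group of documents for the keyness comparison. A minimal sketch with hypothetical docvars (note that from quanteda 3.0 onwards textstat_keyness() lives in quanteda.textstats):

```r
library(quanteda)
m <- dfm(tokens(c(a = "tax budget economy economy",
                  b = "film cinema actor",
                  c = "cinema film festival")))
docvars(m, "class") <- c(1, 0, 0)
# Documents coded 1 form the target group; chi2 is one of the supported measures
textstat_keyness(m, measure = "chi2", target = docvars(m, "class") == 1)
```
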
@@ -46,7 +46,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
# if (inner_k <= 1) {
# inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
# } else {
inner_folds <- createFolds(as.factor(docvars(dfm[-fold], class_type)), k= inner_k)
inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k= inner_k)
# }
return(c(outer_fold = list(fold),inner_folds))
}
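
The added comma is the matrix-style document subsetting a quanteda dfm expects (m[i, ]); caret::createFolds() just returns the held-out document indices per fold. A small sketch with hypothetical documents and a binary docvar:

```r
library(quanteda)
library(caret)
m <- dfm(tokens(c(a = "one two", b = "two three", c = "three four", d = "four five")))
docvars(m, "class") <- c(0, 1, 0, 1)
folds <- createFolds(as.factor(docvars(m, "class")), k = 2)  # list of held-out indices
train <- m[-folds[[1]], ]  # documents outside the first fold
test  <- m[ folds[[1]], ]  # documents inside the first fold
ndoc(train); ndoc(test)
```
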
@@ -60,6 +60,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
### Gets called for every parameter combination, and calls classifier for every inner cv fold
inner_cv <- function(row,grid,outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
print(str_c('params ',row))
params <- grid[row,]
# For each inner fold, cross validate the specified parameters
res <-
@@ -82,6 +83,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
print('outer cv')
# If fold contains both inner folds and outer fold
if (length(fold) == inner_k + 1) {
inner_folds <- fold[-1]
@@ -130,39 +132,42 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
}
}
### Custom tfidf function to allow same idf for different dfm's
custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
if (!nfeat(x) || !ndoc(x)) return(x)
tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
if (nfeat(x) != length(dfreq))
stop("missing some values in idf calculation")
# get the document indexes
j <- as(tfreq, "dgTMatrix")@j + 1
# replace just the non-zero values by product with idf
x@x <- tfreq@x * dfreq[j]
# record attributes
x@weightTf <- tfreq@weightTf
x@weightDf <- c(list(scheme = scheme_df, base = base), args)
return(x)
}
# ### Custom tfidf function to allow same idf for different dfm's
# custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
# if (!nfeat(x) || !ndoc(x)) return(x)
# tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
# if (nfeat(x) != length(dfreq))
# stop("missing some values in idf calculation")
# # get the document indexes
# j <- as(tfreq, "dgTMatrix")@j + 1
# # replace just the non-zero values by product with idf
# x@x <- tfreq@x * dfreq[j]
# # record attributes
#
# ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2
#
# # x@weightTf <- tfreq@weightTf
# # x@weightDf <- c(list(scheme = scheme_df, base = base), args)
# return(x)
# }
### Classification function
classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
# If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
if (length(inner_fold) > 0 && length(outer_fold) > 0) {
dfm_train <- dfm[-outer_fold] %>%
.[-inner_fold]
dfm_test <- dfm[-outer_fold] %>%
.[inner_fold]
dfm_train <- dfm[-outer_fold,] %>%
.[-inner_fold,]
dfm_test <- dfm[-outer_fold,] %>%
.[inner_fold,]
# If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
} else if (length(outer_fold) > 0 ) {
dfm_train <- dfm[-outer_fold]
dfm_test <- dfm[outer_fold]
dfm_train <- dfm[-outer_fold,]
dfm_test <- dfm[outer_fold,]
validation_cv <- T
# If only inner folds, validate performance directly on inner folds (is the same as above?)
} else if (length(inner_fold) > 0 ) {
dfm_train <- dfm[-inner_fold]
dfm_test <- dfm[inner_fold]
dfm_train <- dfm[-inner_fold,]
dfm_test <- dfm[inner_fold,]
# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
} else {
final <- T ### Indicate final modeling run on whole dataset
@@ -176,8 +181,8 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
### Getting features from training dataset
# Getting idf from training data, and using it to normalize both training and testing feature occurence
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
dfreq <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0, use.names=T)
dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
# Added unique to filter out duplicate words, these are caused when there are multiple categories, and a words scores higher
# than the threshold on two or more of those categories
words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
@@ -188,6 +193,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
measure = params$measures,
mc.cores = cores_feats
)))
# dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
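
With custom_tfidf() retired, the weighting above is two stock quanteda calls: docfreq(scheme = "inverse") returns a named idf vector (log10 of the number of documents over each feature's document frequency, given the zero smoothing/threshold arguments), and dfm_weight(weights = ...) multiplies each feature column by it. A small worked sketch:

```r
library(quanteda)
m <- dfm(tokens(c(d1 = "alpha beta", d2 = "alpha gamma", d3 = "alpha beta beta")))
idf <- docfreq(m, scheme = "inverse", base = 10)
idf                           # alpha: log10(3/3) = 0, beta: log10(3/2), gamma: log10(3/1)
dfm_weight(m, weights = idf)  # counts multiplied column-wise by the idf above
# Because idf is a plain named vector, the very same object can later be applied
# to a test dfm (as done for dfm_test below), which is what custom_tfidf() was for.
```
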
@@ -220,20 +226,12 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
# }
### Add more if statements for different models
if (exists("final")) {
return(text_model)
return(list(text_model=text_model, idf=idf))
} else {
### Removing all features not in training set from test set and weighting the remaining features according to training idf
dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
dfm_test <- custom_tfidf(dfm_test, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq[words])
dfm_test <- dfm_weight(dfm_test, weights = idf)
pred <- predict(text_model, newdata = dfm_test, type = 'class')
if (exists("validation_cv")) {
return(data.frame(
tv = docvars(dfm_test, class_type),
pred = pred,
params = params,
stringsAsFactors = F
))
}
### Fix for single-class 'predictions' in borderline situations
# if (length(unique(pred)) == 1 & class_type == 'junk') {
# if (unique(pred) == '0') {
@@ -247,7 +245,21 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
u <- union(pred, docvars(dfm_test, class_type))
class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u))
conf_mat <- confusionMatrix(class_table, mode = "everything")
if (length(unique(u)) == 2) {
conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
} else {
conf_mat <- confusionMatrix(class_table, mode = "everything")
conf_mat$positive <- NA
}
if (exists("validation_cv")) {
return(data.frame(
tv = docvars(dfm_test, class_type),
pred = pred,
params = params,
pos_cat = conf_mat$positive,
stringsAsFactors = F
))
}
if (is.matrix(conf_mat$byClass) == T) {
return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params))
} else {
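
Finally, on the confusion-matrix change in the last hunk: caret otherwise treats the first factor level as the positive class, so in the binary case sensitivity, precision and F1 would describe the wrong category; positive = max(u) pins them to the higher-coded class instead. A standalone sketch with made-up predictions (here max(u) would be "1"):

```r
library(caret)
pred  <- factor(c(1, 0, 1, 1), levels = c(0, 1))
truth <- factor(c(1, 0, 0, 1), levels = c(0, 1))
class_table <- table(prediction = pred, trueValues = truth)
# Without positive =, "0" (the first level) would be scored as the positive class
conf_mat <- confusionMatrix(class_table, mode = "everything", positive = "1")
conf_mat$byClass[["F1"]]
```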
