modelizer: updated the tf-idf application; the final model now also includes the idf values from the training set; the positive category for binary classification is now set explicitly for the confusion matrices; minor code fixes

dfm_gen: added old junk codes to the recoding step, and removed the deprecated ngrams parameter from the dfm() call
class_update: removed the dfm_words parameter, which is replaced by force = T in predict(); the training/model idf is now applied to unseen data before prediction
DESCRIPTION: added quanteda.textmodels as a new dependency, since the text models have been split out of base quanteda from version 2.0.0 onwards
master
Your Name 5 years ago
parent 06bfec71bc
commit d9f936c566

@@ -9,6 +9,7 @@ Depends: R (>= 3.3.1),
parallel,
tidyverse,
quanteda,
quanteda.textmodels,
httr,
caret,
e1071,
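
From quanteda 2.0.0 onwards the text model estimators live in the separate quanteda.textmodels package, so it has to be attached explicitly alongside quanteda. A minimal sketch of what that looks like for callers, assuming a Naive Bayes classifier and a placeholder training dfm with a 'junk' outcome docvar:

library(quanteda)
library(quanteda.textmodels)  # textmodel_nb(), textmodel_svm() etc. were split out of base quanteda in 2.0.0
# 'dfm_train' is a placeholder for the weighted training dfm built elsewhere in this package
nb_model <- textmodel_nb(dfm_train, y = docvars(dfm_train, "junk"))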

@@ -18,11 +18,11 @@
#################################################################################################
#################################### Update any kind of classification ##########################
#################################################################################################
class_update <- function(out, localhost = T, model_final, dfm_words, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
class_update <- function(out, localhost = T, model_final, varname, text, words, clean, ver, es_super = .rs.askForPassword('ElasticSearch WRITE')) {
print('updating')
dfm <- dfm_gen(out, text = text, words = words, clean = clean) %>%
dfm_keep(dfm_words, valuetype="fixed", verbose=T)
pred <- data.frame(id = out$`_id`, pred = predict(model_final, newdata = dfm))
dfm_weight(weights = model_final$idf)
pred <- data.frame(id = out$`_id`, pred = predict(model_final$text_model, newdata = dfm, type = "class", force = T))
bulk <- apply(pred, 1, bulk_writer, varname = varname, type = 'set', ver = ver)
res <- elastic_update(bulk, es_super = es_super, localhost = localhost)
}
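
Put together, the updated prediction path in class_update() amounts to the sketch below (model_final being the list of text_model and idf returned by the final modelizer run): the unseen documents are weighted with the stored training idf, and force = T lets predict() conform the feature set of the new data to the model, which replaces the earlier dfm_keep(dfm_words, ...) step.

dfm_new <- dfm_gen(out, text = text, words = words, clean = clean) %>%
  dfm_weight(weights = model_final$idf)  # apply the training idf to unseen data
pred <- predict(model_final$text_model, newdata = dfm_new, type = "class", force = T)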

@@ -36,6 +36,9 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1) {
.$codes == 92 ~ 1,
.$codes == 91 ~ 1,
.$codes == 93 ~ 1,
.$codes == 2301 ~ 1,
.$codes == 3101 ~ 1,
.$codes == 34 ~ 1,
TRUE ~ 0
)
) %>%
@@ -59,6 +62,6 @@ dfm_gen <- function(out, words = '999', text = "lemmas", clean, cores = 1) {
}
}
dfm <- corpus(out$merged, docnames = out$`_id`, docvars = vardoc) %>%
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex", ngrams = 1)
dfm(tolower = T, stem = F, remove_punct = T, valuetype = "regex")
return(dfm)
}
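
Dropping ngrams = 1 reflects its deprecation in the dfm() constructor from quanteda 2.x onwards: unigrams are the default, and higher-order ngrams are built at the tokens stage instead. A hedged sketch reusing the names from dfm_gen() above:

toks <- tokens(corpus(out$merged, docnames = out$`_id`, docvars = vardoc), remove_punct = T)
dfm_bi <- dfm(tokens_ngrams(toks, n = 2))  # bigram dfm, should ngrams ever be needed again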

@@ -31,7 +31,7 @@
modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, seed, outer_k, inner_k, model, class_type, opt_measure, country, grid) {
### Functions ###
feat_select <- function (topic, dfm, class_type, percentile,measure) {
keyness <- textstat_keyness(dfm, measure = measure, docvars(dfm, class_type) == as.numeric(topic)) %>%
keyness <- textstat_keyness(dfm, measure = measure, target = docvars(dfm, class_type) == as.numeric(topic)) %>%
na.omit()
keyness[,2] <- abs(keyness[,2])
keyness <- filter(keyness, keyness[,2] > quantile(as.matrix(keyness[,2]),percentile))$feature
@@ -46,7 +46,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
# if (inner_k <= 1) {
# inner_folds <- createDataPartition(as.factor(docvars(dfm[-fold], class_type)), times = 1, p = 1-0.8)
# } else {
inner_folds <- createFolds(as.factor(docvars(dfm[-fold], class_type)), k= inner_k)
inner_folds <- createFolds(as.factor(docvars(dfm[-fold,], class_type)), k= inner_k)
# }
return(c(outer_fold = list(fold),inner_folds))
}
@@ -60,6 +60,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
### Gets called for every parameter combination, and calls classifier for every inner cv fold
inner_cv <- function(row,grid,outer_fold, inner_folds, dfm, class_type, model, cores_inner, cores_feats) {
print(str_c('params ',row))
params <- grid[row,]
# For each inner fold, cross validate the specified parameters
res <-
@@ -82,6 +83,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
### Gets called for every outer cv fold, and calls inner_cv for all parameter combinations in grid
outer_cv <- function(fold, grid, dfm, class_type, model, cores_grid, cores_inner, cores_feats) {
print('outer cv')
# If fold contains both inner folds and outer fold
if (length(fold) == inner_k + 1) {
inner_folds <- fold[-1]
@@ -130,39 +132,42 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
}
}
### Custom tfidf function to allow same idf for different dfm's
custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
if (!nfeat(x) || !ndoc(x)) return(x)
tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
if (nfeat(x) != length(dfreq))
stop("missing some values in idf calculation")
# get the document indexes
j <- as(tfreq, "dgTMatrix")@j + 1
# replace just the non-zero values by product with idf
x@x <- tfreq@x * dfreq[j]
# record attributes
x@weightTf <- tfreq@weightTf
x@weightDf <- c(list(scheme = scheme_df, base = base), args)
return(x)
}
# ### Custom tfidf function to allow same idf for different dfm's
# custom_tfidf <- function(x, scheme_tf = "count", scheme_df = "inverse", base = 10, dfreq = dfreq) {
# if (!nfeat(x) || !ndoc(x)) return(x)
# tfreq <- dfm_weight(x, scheme = scheme_tf, base = base)
# if (nfeat(x) != length(dfreq))
# stop("missing some values in idf calculation")
# # get the document indexes
# j <- as(tfreq, "dgTMatrix")@j + 1
# # replace just the non-zero values by product with idf
# x@x <- tfreq@x * dfreq[j]
# # record attributes
#
# ### Not setting weighting parameters in dfm to avoid "grouping after weighting" errors that occur since quanteda 1.4.2
#
# # x@weightTf <- tfreq@weightTf
# # x@weightDf <- c(list(scheme = scheme_df, base = base), args)
# return(x)
# }
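
The commented-out routine is superseded by quanteda's own docfreq() and dfm_weight(): with scheme = "inverse" (and zero smoothing and threshold) the idf of a feature is log10(N / df), so a feature occurring in 2 of 4 training documents gets idf = log10(4/2) ≈ 0.30, and applying that same named weight vector to both the training and the test dfm keeps the two on one scale. A minimal sketch, assuming dfm_train and dfm_test as in classifier() below:

idf <- docfreq(dfm_train, scheme = "inverse", base = 10)    # idf taken from training data only
dfm_train <- dfm_weight(dfm_train, weights = idf)           # counts * idf on the training set
dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed") %>%
  dfm_weight(weights = idf)                                 # same idf applied to held-out data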
### Classification function
classifier <- function (inner_fold, outer_fold, params, dfm, class_type, model, cores_feats) {
# If both inner and outer folds, subset dfm to outer_fold training set, then create train and test sets according to inner fold. Evaluate performance
if (length(inner_fold) > 0 && length(outer_fold) > 0) {
dfm_train <- dfm[-outer_fold] %>%
.[-inner_fold]
dfm_test <- dfm[-outer_fold] %>%
.[inner_fold]
dfm_train <- dfm[-outer_fold,] %>%
.[-inner_fold,]
dfm_test <- dfm[-outer_fold,] %>%
.[inner_fold,]
# If only outer folds, but no inner folds, validate performance of outer fold training data on outer fold test data
} else if (length(outer_fold) > 0 ) {
dfm_train <- dfm[-outer_fold]
dfm_test <- dfm[outer_fold]
dfm_train <- dfm[-outer_fold,]
dfm_test <- dfm[outer_fold,]
validation_cv <- T
# If only inner folds, validate performance directly on inner folds (is the same as above?)
} else if (length(inner_fold) > 0 ) {
dfm_train <- dfm[-inner_fold]
dfm_test <- dfm[inner_fold]
dfm_train <- dfm[-inner_fold,]
dfm_test <- dfm[inner_fold,]
# If both inner and outer folds are NULL, set training set to whole dataset, estimate model and return final model
} else {
final <- T ### Indicate final modeling run on whole dataset
@@ -176,8 +181,8 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
### Getting features from training dataset
# Getting idf from training data, and using it to normalize both training and testing feature occurence
dfm_train <- dfm_trim(dfm_train, min_termfreq = 1, min_docfreq = 0)
dfreq <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0, use.names=T)
dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
idf <- docfreq(dfm_train, scheme = "inverse", base = 10, smoothing = 0, k = 0, threshold = 0)
dfm_train <- dfm_weight(dfm_train, weights = idf)
# Added unique to filter out duplicate words, these are caused when there are multiple categories, and a words scores higher
# than the threshold on two or more of those categories
words <- unique(unlist(mclapply(unique(docvars(dfm_train, class_type)),
@@ -188,6 +193,7 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
measure = params$measures,
mc.cores = cores_feats
)))
# dfm_train <- custom_tfidf(dfm_train, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq)
dfm_train <- dfm_keep(dfm_train, words, valuetype="fixed", verbose=T)
@@ -220,20 +226,12 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
# }
### Add more if statements for different models
if (exists("final")) {
return(text_model)
return(list(text_model=text_model, idf=idf))
} else {
### Removing all features not in training set from test set and weighting the remaining features according to training idf
dfm_test <- dfm_keep(dfm_test, pattern = dfm_train, valuetype = "fixed", verbose = T)
dfm_test <- custom_tfidf(dfm_test, scheme_tf = scheme_tf, scheme_df = "inverse", base = 10, dfreq = dfreq[words])
dfm_test <- dfm_weight(dfm_test, weights = idf)
pred <- predict(text_model, newdata = dfm_test, type = 'class')
if (exists("validation_cv")) {
return(data.frame(
tv = docvars(dfm_test, class_type),
pred = pred,
params = params,
stringsAsFactors = F
))
}
### Fix for single-class 'predictions' in borderline situations
# if (length(unique(pred)) == 1 & class_type == 'junk') {
# if (unique(pred) == '0') {
@@ -247,7 +245,21 @@ modelizer <- function(dfm, cores_outer, cores_grid, cores_inner, cores_feats, se
u <- union(pred, docvars(dfm_test, class_type))
class_table <- table(prediction = factor(pred, u), trueValues = factor(docvars(dfm_test, class_type), u))
if (length(unique(u)) == 2) {
conf_mat <- confusionMatrix(class_table, mode = "everything", positive = max(u))
} else {
conf_mat <- confusionMatrix(class_table, mode = "everything")
conf_mat$positive <- NA
}
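
Setting the positive category explicitly matters because caret's confusionMatrix() otherwise treats the first factor level as the positive class, so sensitivity, precision and F1 could silently refer to the '0' category. A small self-contained sketch with an assumed binary 0/1 outcome:

library(caret)
pred <- factor(c("1", "0", "1", "1"), levels = c("0", "1"))
truth <- factor(c("1", "0", "0", "1"), levels = c("0", "1"))
cm <- confusionMatrix(table(prediction = pred, trueValues = truth),
                      mode = "everything", positive = "1")
cm$positive                                      # "1"
cm$byClass[c("Sensitivity", "Precision", "F1")]  # scores refer to the positive ("1") class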
if (exists("validation_cv")) {
return(data.frame(
tv = docvars(dfm_test, class_type),
pred = pred,
params = params,
pos_cat = conf_mat$positive,
stringsAsFactors = F
))
}
if (is.matrix(conf_mat$byClass) == T) {
return(cbind(as.data.frame(t(conf_mat$overall)),as.data.frame(t(colMeans(conf_mat$byClass))),params))
} else {

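The closing is.matrix(conf_mat$byClass) branch handles a difference in caret's return shape: for a binary outcome byClass is a single named vector, whereas with more than two classes it is a matrix with one row per class, which is why the multi-class branch macro-averages it with colMeans(). A brief sketch with made-up labels:

library(caret)
bin <- confusionMatrix(factor(c("0", "1", "1")), factor(c("0", "1", "0")), mode = "everything")
is.matrix(bin$byClass)   # FALSE: named vector in the two-class case
mc <- confusionMatrix(factor(c("a", "b", "c", "a", "b", "c")),
                      factor(c("a", "b", "c", "b", "c", "a")), mode = "everything")
is.matrix(mc$byClass)    # TRUE: one row per class
colMeans(mc$byClass)     # macro-averaged performance across classes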