library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
bc_data <- read.table("breast-cancer-wisconsin.data.txt",
                      header = FALSE,
                      sep = ",")
colnames(bc_data) <- c("sample_code_number",
                       "clump_thickness",
                       "uniformity_of_cell_size",
                       "uniformity_of_cell_shape",
                       "marginal_adhesion",
                       "single_epithelial_cell_size",
                       "bare_nuclei",
                       "bland_chromatin",
                       "normal_nucleoli",
                       "mitosis",
                       "classes")

# recode the response: 2 = benign, 4 = malignant
bc_data$classes <- ifelse(bc_data$classes == "2", "benign",
                          ifelse(bc_data$classes == "4", "malignant", NA))
bc_data$classes <- as.factor(bc_data$classes)

# missing values are coded as "?" in the raw data
bc_data[bc_data == "?"] <- NA
# how many benign and malignant cases are there?
summary(bc_data$classes)
## benign malignant
## 458 241
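So there are roughly twice as many benign as malignant cases. Expressed as proportions (a quick base R check):
# class proportions: roughly 65% benign vs. 35% malignant
round(prop.table(table(bc_data$classes)), 2)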
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
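Before imputing, it is worth knowing where the missing values actually are; in this dataset they are confined to bare_nuclei (16 in all). A quick check:
# count missing values per column (only bare_nuclei should have any)
colSums(is.na(bc_data))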
# convert the features to numeric (they were read as character because of the "?")
bc_data[, 2:10] <- apply(bc_data[, 2:10], 2, function(x) as.numeric(as.character(x)))

# impute the missing values with mice and keep the first completed dataset
dataset_impute <- mice(bc_data[, 2:10], print = FALSE)
bc_data <- cbind(bc_data[, 11, drop = FALSE], mice::complete(dataset_impute, 1))
bc_data$classes <- as.factor(bc_data$classes)
# sanity check: no missing values left after imputation
sum(is.na(bc_data))
## [1] 0
set.seed(42)
index <- createDataPartition(bc_data$classes, p = 0.7, list = FALSE)
train_data <- bc_data[index, ]
test_data <- bc_data[-index, ]
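createDataPartition() samples within each class, so the ~2:1 class ratio carries over to both partitions; this can be verified directly:
# the stratified split preserves the class distribution
round(rbind(train = prop.table(table(train_data$classes)),
            test  = prop.table(table(test_data$classes))), 2)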
set.seed(42)
model_rf <- caret::train(classes ~ .,
                         data = train_data,
                         method = "rf",
                         preProcess = c("scale", "center"),
                         trControl = trainControl(method = "repeatedcv",
                                                  number = 10,
                                                  repeats = 10,
                                                  verboseIter = FALSE))
final <- data.frame(actual = test_data$classes,
                    predict(model_rf, newdata = test_data, type = "prob"))
final$predict <- as.factor(ifelse(final$benign > 0.5, "benign", "malignant"))
cm_original <- confusionMatrix(final$predict, final$actual)
cm_original
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 133 2
## malignant 4 70
##
## Accuracy : 0.9713
## 95% CI : (0.9386, 0.9894)
## No Information Rate : 0.6555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9369
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9708
## Specificity : 0.9722
## Pos Pred Value : 0.9852
## Neg Pred Value : 0.9459
## Prevalence : 0.6555
## Detection Rate : 0.6364
## Detection Prevalence : 0.6459
## Balanced Accuracy : 0.9715
##
## 'Positive' Class : benign
##
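Under-sampling balances the classes by randomly discarding majority-class samples until both classes are the same size. With sampling = "down", caret does this inside every cross-validation fold, which keeps the discarded rows from leaking into the resampling estimates. To see the effect in isolation, caret also exports a standalone downSample() helper (a sketch; the column indexing assumes classes is the first column, as above):
set.seed(42)
down_train <- caret::downSample(x = train_data[, -1],
                                y = train_data$classes,
                                yname = "classes")
table(down_train$classes)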
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 10,
                     verboseIter = FALSE,
                     sampling = "down")

set.seed(42)
model_rf_under <- caret::train(classes ~ .,
                               data = train_data,
                               method = "rf",
                               preProcess = c("scale", "center"),
                               trControl = ctrl)

final_under <- data.frame(actual = test_data$classes,
                          predict(model_rf_under, newdata = test_data, type = "prob"))
final_under$predict <- as.factor(ifelse(final_under$benign > 0.5, "benign", "malignant"))
cm_under <- confusionMatrix(final_under$predict, final_under$actual)
cm_under
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 133 1
## malignant 4 71
##
## Accuracy : 0.9761
## 95% CI : (0.9451, 0.9922)
## No Information Rate : 0.6555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9475
##
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.9708
## Specificity : 0.9861
## Pos Pred Value : 0.9925
## Neg Pred Value : 0.9467
## Prevalence : 0.6555
## Detection Rate : 0.6364
## Detection Prevalence : 0.6411
## Balanced Accuracy : 0.9785
##
## 'Positive' Class : benign
##
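Over-sampling instead duplicates randomly drawn minority-class samples until the classes are balanced; again, caret applies it within each fold. The standalone equivalent is caret::upSample() (same assumptions as the downSample() sketch above):
set.seed(42)
up_train <- caret::upSample(x = train_data[, -1],
                            y = train_data$classes,
                            yname = "classes")
table(up_train$classes)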
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 10,
                     verboseIter = FALSE,
                     sampling = "up")

set.seed(42)
model_rf_over <- caret::train(classes ~ .,
                              data = train_data,
                              method = "rf",
                              preProcess = c("scale", "center"),
                              trControl = ctrl)

final_over <- data.frame(actual = test_data$classes,
                         predict(model_rf_over, newdata = test_data, type = "prob"))
final_over$predict <- as.factor(ifelse(final_over$benign > 0.5, "benign", "malignant"))
cm_over <- confusionMatrix(final_over$predict, final_over$actual)
cm_over
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 133 2
## malignant 4 70
##
## Accuracy : 0.9713
## 95% CI : (0.9386, 0.9894)
## No Information Rate : 0.6555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9369
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9708
## Specificity : 0.9722
## Pos Pred Value : 0.9852
## Neg Pred Value : 0.9459
## Prevalence : 0.6555
## Detection Rate : 0.6364
## Detection Prevalence : 0.6459
## Balanced Accuracy : 0.9715
##
## 'Positive' Class : benign
##
Besides plain over- and under-sampling, there are hybrid methods that combine under-sampling of the majority class with the generation of synthetic minority examples. Two of the most popular are ROSE (Random Over-Sampling Examples) and SMOTE (Synthetic Minority Over-sampling Technique).
library(ROSE)
## Loaded ROSE 0.0-3
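ROSE generates an entirely synthetic, balanced training set by drawing new examples from a smoothed (kernel-density) estimate around existing samples of both classes. Outside of caret it can be applied directly to the training data (a minimal sketch using ROSE::ROSE(); by default the new set has the original number of rows, split about 50/50):
rose_train <- ROSE(classes ~ ., data = train_data, seed = 42)$data
table(rose_train$classes)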
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 10,
                     verboseIter = FALSE,
                     sampling = "rose")

set.seed(42)
model_rf_rose <- caret::train(classes ~ .,
                              data = train_data,
                              method = "rf",
                              preProcess = c("scale", "center"),
                              trControl = ctrl)

final_rose <- data.frame(actual = test_data$classes,
                         predict(model_rf_rose, newdata = test_data, type = "prob"))
final_rose$predict <- as.factor(ifelse(final_rose$benign > 0.5, "benign", "malignant"))
cm_rose <- confusionMatrix(final_rose$predict, final_rose$actual)
cm_rose
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 131 1
## malignant 6 71
##
## Accuracy : 0.9665
## 95% CI : (0.9322, 0.9864)
## No Information Rate : 0.6555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.927
##
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.9562
## Specificity : 0.9861
## Pos Pred Value : 0.9924
## Neg Pred Value : 0.9221
## Prevalence : 0.6555
## Detection Rate : 0.6268
## Detection Prevalence : 0.6316
## Balanced Accuracy : 0.9712
##
## 'Positive' Class : benign
##
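SMOTE over-samples the minority class by interpolating between each minority sample and its k nearest minority-class neighbors, creating synthetic points along those line segments (and typically under-sampling the majority class as well). At the time of writing, caret's sampling = "smote" calls DMwR::SMOTE() under the hood; a standalone sketch, assuming the DMwR package is installed (it has since been archived on CRAN, with themis as a maintained alternative):
library(DMwR)
set.seed(42)
# perc.over = 100 doubles the minority class; perc.under = 200 then keeps
# twice as many majority samples as newly generated minority samples
smote_train <- SMOTE(classes ~ ., data = train_data,
                     perc.over = 100, perc.under = 200)
table(smote_train$classes)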
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 10,
                     verboseIter = FALSE,
                     sampling = "smote")

set.seed(42)
model_rf_smote <- caret::train(classes ~ .,
                               data = train_data,
                               method = "rf",
                               preProcess = c("scale", "center"),
                               trControl = ctrl)
## Loading required package: grid
final_smote <- data.frame(actual = test_data$classes,
                          predict(model_rf_smote, newdata = test_data, type = "prob"))
final_smote$predict <- as.factor(ifelse(final_smote$benign > 0.5, "benign", "malignant"))
cm_smote <- confusionMatrix(final_smote$predict, final_smote$actual)
Now let’s compare the predictions of all these models. The loop below looks up each confusion matrix via its cm_ object, so we first collect the trained models in a named list:
models <- list(original = model_rf,
               under = model_rf_under,
               over = model_rf_over,
               rose = model_rf_rose,
               smote = model_rf_smote)
library(dplyr)
comparison <- data.frame(model = names(models),
                         Sensitivity = rep(NA, length(models)),
                         Specificity = rep(NA, length(models)),
                         Precision = rep(NA, length(models)),
                         Recall = rep(NA, length(models)),
                         F1 = rep(NA, length(models)))

for (name in names(models)) {
  cm <- get(paste0("cm_", name))
  comparison[comparison$model == name, ] <- comparison %>%
    filter(model == name) %>%
    mutate(Sensitivity = cm$byClass[["Sensitivity"]],
           Specificity = cm$byClass[["Specificity"]],
           Precision = cm$byClass[["Precision"]],
           Recall = cm$byClass[["Recall"]],
           F1 = cm$byClass[["F1"]])
}
library(tidyr)
comparison %>%
  gather(x, y, Sensitivity:F1) %>%
  ggplot(aes(x = x, y = y, color = model)) +
  geom_jitter(width = 0.2, alpha = 0.5, size = 3)
get_cms <- function(ctrl_method = "repeatedcv",
                    ctrl_sampling,
                    target,
                    data,
                    train_method = "rf",
                    preProcess = c("scale", "center"),
                    test) {
  ctrl <- trainControl(method = ctrl_method,
                       number = 10,
                       repeats = 10,
                       verboseIter = FALSE,
                       sampling = ctrl_sampling)
  model <- caret::train(target,
                        data = data,
                        method = train_method,
                        preProcess = preProcess,
                        trControl = ctrl)
  final <- data.frame(actual = test$classes,
                      predict(model, newdata = test, type = "prob"))
  final$predict <- as.factor(ifelse(final$benign > 0.5, "benign", "malignant"))
  cm <- confusionMatrix(final$predict, final$actual)
  return(cm)
}
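For a single sampling strategy the helper can be called on its own, e.g. (reusing train_data and test_data from above):
cm_down <- get_cms(ctrl_sampling = "down",
                   target = classes ~ .,
                   data = train_data,
                   test = test_data)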
# sampling strategies to compare
sampling_list <- c("down", "up", "rose", "smote")
plot_cms <- function(ctrl_method = "repeatedcv",
                     ctrl_sampling,
                     target,
                     train,
                     train_method = "rf",
                     preProcess = c("scale", "center"),
                     test) {
  # fit one model per sampling strategy and collect the confusion matrices
  cms <- lapply(ctrl_sampling, function(x) get_cms(ctrl_method = ctrl_method,
                                                   ctrl_sampling = x,
                                                   target = target,
                                                   data = train,
                                                   train_method = train_method,
                                                   preProcess = preProcess,
                                                   test = test))
  names(cms) <- ctrl_sampling

  comparison <- data.frame(model = names(cms),
                           Sensitivity = rep(NA, length(cms)),
                           Specificity = rep(NA, length(cms)),
                           Precision = rep(NA, length(cms)),
                           Recall = rep(NA, length(cms)),
                           F1 = rep(NA, length(cms)))

  for (name in names(cms)) {
    cm <- cms[[name]]
    comparison[comparison$model == name, ] <- comparison %>%
      filter(model == name) %>%
      mutate(Sensitivity = cm$byClass[["Sensitivity"]],
             Specificity = cm$byClass[["Specificity"]],
             Precision = cm$byClass[["Precision"]],
             Recall = cm$byClass[["Recall"]],
             F1 = cm$byClass[["F1"]])
  }

  comparison %>%
    gather(x, y, Sensitivity:F1) %>%
    ggplot(aes(x = x, y = y, color = model)) +
    geom_jitter(width = 0.2, alpha = 0.5, size = 3)
}
set.seed(42)
index <- createDataPartition(bc_data$classes, p = 0.7, list = FALSE)
train_data <- bc_data[index, ]
test_data <- bc_data[-index, ]
plot_cms(ctrl_sampling = sampling_list,
         target = as.formula("classes ~ ."),
         train = train_data,
         test = test_data)