library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
bc_data <- read.table("breast-cancer-wisconsin.data.txt",
                      header = FALSE,
                      sep = ",")
colnames(bc_data) <- c("sample_code_number",
                       "clump_thickness",
                       "uniformity_of_cell_size",
                       "uniformity_of_cell_shape",
                       "marginal_adhesion",
                       "single_epithelial_cell_size",
                       "bare_nuclei",
                       "bland_chromatin",
                       "normal_nucleoli",
                       "mitosis",
                       "classes")

# recode the response: 2 = benign, 4 = malignant
bc_data$classes <- ifelse(bc_data$classes == "2", "benign",
                          ifelse(bc_data$classes == "4", "malignant", NA))
bc_data$classes <- as.factor(bc_data$classes)

# missing values are coded as "?" in the raw data
bc_data[bc_data == "?"] <- NA
# how many benign and malignant cases are there?
summary(bc_data$classes)
## benign malignant
## 458 241
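So there are roughly twice as many benign as malignant cases. Expressed as proportions (a quick base R check):
# class proportions: roughly 65% benign vs. 35% malignant
round(prop.table(table(bc_data$classes)), 2)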
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
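Before imputing, it is worth knowing where the missing values actually are; in this dataset they are confined to bare_nuclei (16 in all). A quick check:
# count missing values per column (only bare_nuclei should have any)
colSums(is.na(bc_data))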
# convert the features to numeric (they were read as character because of the "?")
bc_data[, 2:10] <- apply(bc_data[, 2:10], 2, function(x) as.numeric(as.character(x)))

# impute the missing values with mice and keep the first completed dataset
dataset_impute <- mice(bc_data[, 2:10], print = FALSE)
bc_data <- cbind(bc_data[, 11, drop = FALSE], mice::complete(dataset_impute, 1))
bc_data$classes <- as.factor(bc_data$classes)
# sanity check: no missing values left after imputation
sum(is.na(bc_data))
## [1] 0
set.seed(42)
index <- createDataPartition(bc_data$classes, p = 0.7, list = FALSE)
train_data <- bc_data[index, ]
test_data <- bc_data[-index, ]
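createDataPartition() samples within each class, so the ~2:1 class ratio carries over to both partitions; this can be verified directly:
# the stratified split preserves the class distribution
round(rbind(train = prop.table(table(train_data$classes)),
            test  = prop.table(table(test_data$classes))), 2)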
set.seed(42)
model_rf <- caret::train(classes ~ .,
                         data = train_data,
                         method = "rf",
                         preProcess = c("scale", "center"),
                         trControl = trainControl(method = "repeatedcv",
                                                  number = 10,
                                                  repeats = 10,
                                                  verboseIter = FALSE))
final <- data.frame(actual = test_data$classes,
                    predict(model_rf, newdata = test_data, type = "prob"))
final$predict <- as.factor(ifelse(final$benign > 0.5, "benign", "malignant"))
cm_original <- confusionMatrix(final$predict, final$actual)
cm_original
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 133 2
## malignant 4 70
##
## Accuracy : 0.9713
## 95% CI : (0.9386, 0.9894)
## No Information Rate : 0.6555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9369
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9708
## Specificity : 0.9722
## Pos Pred Value : 0.9852
## Neg Pred Value : 0.9459
## Prevalence : 0.6555
## Detection Rate : 0.6364
## Detection Prevalence : 0.6459
## Balanced Accuracy : 0.9715
##
## 'Positive' Class : benign
##
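Under-sampling balances the classes by randomly discarding majority-class samples until both classes are the same size. With sampling = "down", caret does this inside every cross-validation fold, which keeps the discarded rows from leaking into the resampling estimates. To see the effect in isolation, caret also exports a standalone downSample() helper (a sketch; the column indexing assumes classes is the first column, as above):
set.seed(42)
down_train <- caret::downSample(x = train_data[, -1],
                                y = train_data$classes,
                                yname = "classes")
table(down_train$classes)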
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 10,
                     verboseIter = FALSE,
                     sampling = "down")

set.seed(42)
model_rf_under <- caret::train(classes ~ .,
                               data = train_data,
                               method = "rf",
                               preProcess = c("scale", "center"),
                               trControl = ctrl)

final_under <- data.frame(actual = test_data$classes,
                          predict(model_rf_under, newdata = test_data, type = "prob"))
final_under$predict <- as.factor(ifelse(final_under$benign > 0.5, "benign", "malignant"))
cm_under <- confusionMatrix(final_under$predict, final_under$actual)
cm_under
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 133 1
## malignant 4 71
##
## Accuracy : 0.9761
## 95% CI : (0.9451, 0.9922)
## No Information Rate : 0.6555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9475
##
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.9708
## Specificity : 0.9861
## Pos Pred Value : 0.9925
## Neg Pred Value : 0.9467
## Prevalence : 0.6555
## Detection Rate : 0.6364
## Detection Prevalence : 0.6411
## Balanced Accuracy : 0.9785
##
## 'Positive' Class : benign
##
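Over-sampling instead duplicates randomly drawn minority-class samples until the classes are balanced; again, caret applies it within each fold. The standalone equivalent is caret::upSample() (same assumptions as the downSample() sketch above):
set.seed(42)
up_train <- caret::upSample(x = train_data[, -1],
                            y = train_data$classes,
                            yname = "classes")
table(up_train$classes)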
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 10,
                     verboseIter = FALSE,
                     sampling = "up")

set.seed(42)
model_rf_over <- caret::train(classes ~ .,
                              data = train_data,
                              method = "rf",
                              preProcess = c("scale", "center"),
                              trControl = ctrl)

final_over <- data.frame(actual = test_data$classes,
                         predict(model_rf_over, newdata = test_data, type = "prob"))
final_over$predict <- as.factor(ifelse(final_over$benign > 0.5, "benign", "malignant"))
cm_over <- confusionMatrix(final_over$predict, final_over$actual)
cm_over
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 133 2
## malignant 4 70
##
## Accuracy : 0.9713
## 95% CI : (0.9386, 0.9894)
## No Information Rate : 0.6555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9369
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9708
## Specificity : 0.9722
## Pos Pred Value : 0.9852
## Neg Pred Value : 0.9459
## Prevalence : 0.6555
## Detection Rate : 0.6364
## Detection Prevalence : 0.6459
## Balanced Accuracy : 0.9715
##
## 'Positive' Class : benign
##
Besides plain over- and under-sampling, there are hybrid methods that combine under-sampling of the majority class with the generation of synthetic minority examples. Two of the most popular are ROSE (Random Over-Sampling Examples) and SMOTE (Synthetic Minority Over-sampling Technique).
library(ROSE)
## Loaded ROSE 0.0-3
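ROSE generates an entirely synthetic, balanced training set by drawing new examples from a smoothed (kernel-density) estimate around existing samples of both classes. Outside of caret it can be applied directly to the training data (a minimal sketch using ROSE::ROSE(); by default the new set has the original number of rows, split about 50/50):
rose_train <- ROSE(classes ~ ., data = train_data, seed = 42)$data
table(rose_train$classes)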
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 10,
                     verboseIter = FALSE,
                     sampling = "rose")

set.seed(42)
model_rf_rose <- caret::train(classes ~ .,
                              data = train_data,
                              method = "rf",
                              preProcess = c("scale", "center"),
                              trControl = ctrl)

final_rose <- data.frame(actual = test_data$classes,
                         predict(model_rf_rose, newdata = test_data, type = "prob"))
final_rose$predict <- as.factor(ifelse(final_rose$benign > 0.5, "benign", "malignant"))
cm_rose <- confusionMatrix(final_rose$predict, final_rose$actual)
cm_rose
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 131 1
## malignant 6 71
##
## Accuracy : 0.9665
## 95% CI : (0.9322, 0.9864)
## No Information Rate : 0.6555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.927
##
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.9562
## Specificity : 0.9861
## Pos Pred Value : 0.9924
## Neg Pred Value : 0.9221
## Prevalence : 0.6555
## Detection Rate : 0.6268
## Detection Prevalence : 0.6316
## Balanced Accuracy : 0.9712
##
## 'Positive' Class : benign
##
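SMOTE over-samples the minority class by interpolating between each minority sample and its k nearest minority-class neighbors, creating synthetic points along those line segments (and typically under-sampling the majority class as well). At the time of writing, caret's sampling = "smote" calls DMwR::SMOTE() under the hood; a standalone sketch, assuming the DMwR package is installed (it has since been archived on CRAN, with themis as a maintained alternative):
library(DMwR)
set.seed(42)
# perc.over = 100 doubles the minority class; perc.under = 200 then keeps
# twice as many majority samples as newly generated minority samples
smote_train <- SMOTE(classes ~ ., data = train_data,
                     perc.over = 100, perc.under = 200)
table(smote_train$classes)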
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 10,
                     verboseIter = FALSE,
                     sampling = "smote")

set.seed(42)
model_rf_smote <- caret::train(classes ~ .,
                               data = train_data,
                               method = "rf",
                               preProcess = c("scale", "center"),
                               trControl = ctrl)
## Loading required package: grid
final_smote <- data.frame(actual = test_data$classes,
                          predict(model_rf_smote, newdata = test_data, type = "prob"))
final_smote$predict <- as.factor(ifelse(final_smote$benign > 0.5, "benign", "malignant"))
cm_smote <- confusionMatrix(final_smote$predict, final_smote$actual)
Now let’s compare the predictions of all these models. The loop below looks up each confusion matrix via its cm_ object, so we first collect the trained models in a named list:
models <- list(original = model_rf,
               under = model_rf_under,
               over = model_rf_over,
               rose = model_rf_rose,
               smote = model_rf_smote)
library(dplyr)
comparison <- data.frame(model = names(models),
                         Sensitivity = rep(NA, length(models)),
                         Specificity = rep(NA, length(models)),
                         Precision = rep(NA, length(models)),
                         Recall = rep(NA, length(models)),
                         F1 = rep(NA, length(models)))

for (name in names(models)) {
  cm <- get(paste0("cm_", name))
  comparison[comparison$model == name, ] <- comparison %>%
    filter(model == name) %>%
    mutate(Sensitivity = cm$byClass[["Sensitivity"]],
           Specificity = cm$byClass[["Specificity"]],
           Precision = cm$byClass[["Precision"]],
           Recall = cm$byClass[["Recall"]],
           F1 = cm$byClass[["F1"]])
}
library(tidyr)
comparison %>%
  gather(x, y, Sensitivity:F1) %>%
  ggplot(aes(x = x, y = y, color = model)) +
  geom_jitter(width = 0.2, alpha = 0.5, size = 3)
get_cms <- function(ctrl_method = "repeatedcv",
                    ctrl_sampling,
                    target,
                    data,
                    train_method = "rf",
                    preProcess = c("scale", "center"),
                    test) {
  ctrl <- trainControl(method = ctrl_method,
                       number = 10,
                       repeats = 10,
                       verboseIter = FALSE,
                       sampling = ctrl_sampling)
  model <- caret::train(target,
                        data = data,
                        method = train_method,
                        preProcess = preProcess,
                        trControl = ctrl)
  final <- data.frame(actual = test$classes,
                      predict(model, newdata = test, type = "prob"))
  final$predict <- as.factor(ifelse(final$benign > 0.5, "benign", "malignant"))
  cm <- confusionMatrix(final$predict, final$actual)
  return(cm)
}
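For a single sampling strategy the helper can be called on its own, e.g. (reusing train_data and test_data from above):
cm_down <- get_cms(ctrl_sampling = "down",
                   target = classes ~ .,
                   data = train_data,
                   test = test_data)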
# sampling strategies to compare
sampling_list <- c("down", "up", "rose", "smote")
plot_cms <- function(ctrl_method = "repeatedcv",
                     ctrl_sampling,
                     target,
                     train,
                     train_method = "rf",
                     preProcess = c("scale", "center"),
                     test) {
  # fit one model per sampling strategy and collect the confusion matrices
  cms <- lapply(ctrl_sampling, function(x) get_cms(ctrl_method = ctrl_method,
                                                   ctrl_sampling = x,
                                                   target = target,
                                                   data = train,
                                                   train_method = train_method,
                                                   preProcess = preProcess,
                                                   test = test))
  names(cms) <- ctrl_sampling

  comparison <- data.frame(model = names(cms),
                           Sensitivity = rep(NA, length(cms)),
                           Specificity = rep(NA, length(cms)),
                           Precision = rep(NA, length(cms)),
                           Recall = rep(NA, length(cms)),
                           F1 = rep(NA, length(cms)))

  for (name in names(cms)) {
    cm <- cms[[name]]
    comparison[comparison$model == name, ] <- comparison %>%
      filter(model == name) %>%
      mutate(Sensitivity = cm$byClass[["Sensitivity"]],
             Specificity = cm$byClass[["Specificity"]],
             Precision = cm$byClass[["Precision"]],
             Recall = cm$byClass[["Recall"]],
             F1 = cm$byClass[["F1"]])
  }

  comparison %>%
    gather(x, y, Sensitivity:F1) %>%
    ggplot(aes(x = x, y = y, color = model)) +
    geom_jitter(width = 0.2, alpha = 0.5, size = 3)
}
set.seed(42)
index <- createDataPartition(bc_data$classes, p = 0.7, list = FALSE)
train_data <- bc_data[index, ]
test_data <- bc_data[-index, ]
plot_cms(ctrl_sampling = sampling_list,
         target = as.formula("classes ~ ."),
         train = train_data,
         test = test_data)