# Read the training and test tables with data.table's reader. Both calls are
# namespace-qualified so they work whether or not data.table is attached.
train <- data.table::fread(file = "train.csv", showProgress = TRUE)
test <- data.table::fread(file = "test.csv")
# Preview the first 10 rows and first 14 columns of the training table.
train[1:10, 1:14]
## ID_code target var_0 var_1 var_2 var_3 var_4 var_5 var_6
## 1: train_0 0 8.9255 -6.7863 11.9081 5.0930 11.4607 -9.2834 5.1187
## 2: train_1 0 11.5006 -4.1473 13.8588 5.3890 12.3622 7.0433 5.6208
## 3: train_2 0 8.6093 -2.7457 12.0805 7.8928 10.5825 -9.0837 6.9427
## 4: train_3 0 11.0604 -2.1518 8.9522 7.1957 12.5846 -1.8361 5.8428
## 5: train_4 0 9.8369 -1.4834 12.8746 6.6375 12.2772 2.4486 5.9405
## 6: train_5 0 11.4763 -2.3182 12.6080 8.6264 10.9621 3.5609 4.5322
## 7: train_6 0 11.8091 -0.0832 9.3494 4.2916 11.1355 -8.0198 6.1961
## 8: train_7 0 13.5580 -7.9881 13.8776 7.5985 8.6543 0.8310 5.6890
## 9: train_8 0 16.1071 2.4426 13.9307 5.6327 8.8014 6.1630 4.4514
## 10: train_9 0 12.5088 1.9743 8.8960 5.4508 13.6043 -16.2859 6.0637
## var_7 var_8 var_9 var_10 var_11
## 1: 18.6266 -4.9200 5.7470 2.9252 3.1821
## 2: 16.5338 3.1468 8.0851 -0.4032 8.0585
## 3: 14.6155 -4.9193 5.9525 -0.3249 -11.2648
## 4: 14.9250 -5.8609 8.2450 2.3061 2.8102
## 5: 19.2514 6.2654 7.6784 -9.4458 -12.1419
## 6: 15.2255 3.5855 5.9790 0.8010 -0.6192
## 7: 12.0771 -4.3781 7.9232 -5.1288 -7.5271
## 8: 22.3262 5.0647 7.1971 1.4532 -6.7033
## 9: 10.1854 -3.1882 9.0827 0.9501 1.7982
## 10: 16.8410 0.1287 7.9682 0.8787 3.0537
# Set the label and the row identifiers aside so that only the var_* predictor
# columns remain in the tables while they are scaled.
trainremoveCols <- c("target", "ID_code")
testremoveCols <- c("ID_code")
target <- train$target
ID_code <- test$ID_code
train[, (trainremoveCols) := NULL]
test[, (testremoveCols) := NULL]

# Do scaling
# Train and test rows are stacked, so each column is centred and scaled with
# statistics computed over both sets together.
# NOTE(review): this lets test-set statistics shape the training features;
# confirm that this is intended.
dt <- rbind(train, test)
scale.cols <- colnames(dt)
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.vector() strips them so every column stays a plain numeric
# vector instead of a matrix-typed column.
dt[, (scale.cols) := lapply(.SD, function(x) as.vector(scale(x))), .SDcols = scale.cols]

# Split the stacked table back apart (train rows first, test rows last) and
# re-attach the label / identifier as the leading column.
train <- cbind(target, head(dt, nrow(train)))
test <- cbind(ID_code, tail(dt, nrow(test)))
rm(dt)
gc() # Calling gc() after removing a large object may prompt R to return memory to the operating system.
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 1938007 103.6 3204075 171.2 NA 3097054 165.5
## Vcells 92619550 706.7 326116329 2488.1 102400 339622435 2591.2
## ID_code var_0 var_1 var_2 var_3 var_4
## 1: test_0 0.13042211 2.32511591 0.8501873 1.2874960 0.2193683211
## 2: test_1 -0.70396487 0.71199928 0.2249866 -0.7844479 -1.1606129525
## 3: test_2 -1.70702624 -2.15860918 -0.2163583 0.1247680 -0.5028792931
## 4: test_3 -0.70166102 0.07508334 0.4969597 -0.1061858 -1.3776761726
## 5: test_4 0.34112523 0.36913011 1.2960442 0.4678785 -1.2185829081
## 6: test_5 -1.54131393 -0.16448012 -0.7983345 0.1348265 1.9564914401
## 7: test_6 -0.72634508 -1.10760517 -1.2705811 0.7179239 0.5760162797
## 8: test_7 2.18344456 -0.19659166 1.0190282 0.7848663 0.0002060824
## 9: test_8 -1.21239063 0.60963294 1.1392985 -0.9850799 -1.4810219719
## 10: test_9 -0.09486106 -1.31236259 1.5006022 1.1238753 0.5114405891
## var_5 var_6 var_7 var_8 var_9 var_10
## 1: 0.3403692 0.5051034 0.5056589 0.5559708 1.0066564 -0.437375876
## 2: 0.1330029 0.7018338 0.6120808 -1.4082645 -1.2926337 -0.320403823
## 3: 1.8894762 -0.5973030 1.0862008 0.3728326 0.6290218 -0.924393079
## 4: 1.0465358 -0.5456656 1.1774823 0.9285478 -0.0896026 -0.067821213
## 5: -0.4483523 1.6720854 -1.7340544 0.8125863 -0.3442505 0.857382492
## 6: -0.4599589 -1.3462185 -0.5186051 0.4795532 -0.2303440 -0.963214213
## 7: -0.2732377 -1.4382879 -1.0103207 -1.3744511 -1.0135830 0.547358411
## 8: 1.8690981 0.6325218 0.3829565 -1.5263863 -0.0843329 0.663549318
## 9: -1.1063386 -1.2400557 -0.5708953 0.6923643 -0.2811763 1.633532666
## 10: -0.7301884 -2.2656414 1.0943556 -0.5076050 -0.6714581 -0.001841636
## var_11 var_12
## 1: -0.184102128 -0.2826372
## 2: 0.490470259 0.4713123
## 3: 0.036017259 -1.8321008
## 4: -0.303196572 0.1540538
## 5: -0.004295498 0.4102808
## 6: -0.993961110 0.1829911
## 7: 0.207275237 -1.3422703
## 8: 0.817246641 0.6428319
## 9: 1.954736870 1.6251234
## 10: -0.156679396 0.3308347
## <160001/39999/200000>
We can retrieve the training and testing sets from the split object using rsample's training() and testing() functions.
# Extract the two partitions of the split object: the training portion goes to
# train_8 and the held-out portion to test_2.
train_8 <- rsample::training(x = train_test_split)
test_2 <- rsample::testing(x = train_test_split)
# Preview the first 10 rows and first 14 columns of the training partition.
train_8[1:10, 1:14]
## target var_0 var_1 var_2 var_3 var_4
## 1: 0 0.2735896 -0.6232870 1.1934049 -0.6852305 0.79320297
## 2: 0 -0.6779972 -0.2768087 0.5191407 0.5373111 -0.30550981
## 3: 0 0.1287107 -0.1299954 -0.6669927 0.1969350 0.93050348
## 4: 0 0.2655920 -0.1711299 0.7191488 0.8955093 -0.07116055
## 5: 0 0.9507227 -1.5727404 1.2005332 0.3936120 -1.49590031
## 6: 0 1.7896845 1.0057494 1.2206667 -0.5662380 -1.40508689
## 7: 0 -1.8427886 0.2672829 -0.4251630 -1.2175962 0.81098289
## 8: 0 0.6745246 -1.5695021 -0.1272552 1.0828602 1.09867192
## 9: 0 -0.6260620 -0.7390023 -0.3742794 0.3085057 -1.26679860
## 10: 1 1.8761774 0.7958255 2.2856577 0.2639751 0.65911272
## var_5 var_6 var_7 var_8 var_9 var_10
## 1: 1.5383697 0.2411404 -0.001080387 0.8599310 0.4189636 -0.14279305
## 2: -0.5117750 1.7681979 -0.561775966 -1.5601396 -1.3099832 -0.12856893
## 3: 0.4095761 0.4975946 -0.471312909 -1.8426477 0.5485982 0.34938335
## 4: 1.0956696 -1.0164092 -0.383480442 0.9915541 -1.2884990 0.07596413
## 5: 0.7486311 0.3199250 1.691967139 1.4353582 -0.3009580 0.19444397
## 6: 1.4264616 -1.1097493 -1.856639899 -1.0407580 1.2277404 0.10304990
## 7: -1.7556930 0.7233205 -0.630697416 -0.2255469 -0.2014823 0.77705163
## 8: -0.8933407 0.2707135 -1.373897462 0.3044258 -0.6153561 -1.18678543
## 9: 0.8241052 1.0043804 -1.234388522 1.6248870 -1.2555026 0.88461360
## 10: 1.3930785 -0.6814015 1.296414136 0.2477501 0.9156933 0.35348891
## var_11 var_12
## 1: 1.8967234 0.003053396
## 2: -1.3422600 0.892219255
## 3: 1.0170001 -0.931359957
## 4: 0.4421620 -2.027296338
## 5: -0.5776586 1.413091563
## 6: 0.8473681 0.221398858
## 7: -1.8099215 -0.617784305
## 8: -0.3747035 -1.852093835
## 9: -0.2110555 0.921682739
## 10: -1.3550159 -1.175485968
# Encode both partitions for xgboost. Every column other than `target` becomes
# a predictor in a sparse design matrix, and `target` is attached as the label.
#train_8$ID_code <- NULL
train_8_sparse <- sparse.model.matrix(
  target ~ .,
  data = train_8
)
dtrain_8 <- xgb.DMatrix(
  data = train_8_sparse,
  label = train_8[["target"]]
)

#test_2$ID_code <- NULL
test_2_sparse <- sparse.model.matrix(
  target ~ .,
  data = test_2
)
dtest_2 <- xgb.DMatrix(
  data = test_2_sparse,
  label = test_2[["target"]]
)
Here we can see after how many boosting rounds the cross-validated test AUC was highest.
# Booster settings passed to the cross-validation run below.
params <- list(
  booster = "gbtree",
  tree_method = "auto",
  objective = "binary:logistic",
  eval_metric = "auc",     # area under the ROC curve; larger is better
  max_depth = 2,           # shallower than the default of 6 to keep training light
  eta = 0.01,              # learning rate
  subsample = 0.5,         # each tree is grown on a random half of the training rows
  colsample_bytree = 0.1,  # fraction of columns sampled for each tree
  # Use every logical core reported for this machine.
  nthread = parallel::detectCores(all.tests = FALSE, logical = TRUE)
)

# Record the start time of the cross-validation run.
tme <- Sys.time()
cv_model <- xgb.cv(
  data = dtrain_8,
  params = params,
  nfold = 5,
  nrounds = 30,
  early_stopping_rounds = 5,  # stop when the metric has not improved for 5 rounds
  maximize = TRUE,            # a larger metric value counts as an improvement
  prediction = TRUE,          # keep the predictions made on the CV folds
  verbose = TRUE,             # print the metric while training
  print_every_n = 5           # ...on every 5th round
)
## [1] train-auc:0.554845+0.020936 test-auc:0.547460+0.021108
## Multiple eval metrics are present. Will use test_auc for early stopping.
## Will train until test_auc hasn't improved in 5 rounds.
##
## [6] train-auc:0.672023+0.003630 test-auc:0.663031+0.005170
## [11] train-auc:0.706164+0.004200 test-auc:0.696083+0.005594
## [16] train-auc:0.727476+0.007101 test-auc:0.718789+0.009358
## [21] train-auc:0.740644+0.006604 test-auc:0.731419+0.007655
## [26] train-auc:0.746494+0.004003 test-auc:0.737233+0.006636
## [30] train-auc:0.752478+0.005963 test-auc:0.743094+0.008043
## Time difference of 2.665371 mins
# Fit the model on the training partition for the number of rounds selected by
# cross-validation, reporting the metric on both partitions after every round.
watchlist <- list(
  train = dtrain_8,
  eval = dtest_2
)
# Record the start time of the training run.
tme <- Sys.time()
xgboost_tree <- xgb.train(
  params = params,
  data = dtrain_8,
  nrounds = cv_model$best_iteration,
  watchlist = watchlist,
  verbose = TRUE,
  print_every_n = 1
)
## [1] train-auc:0.558381 eval-auc:0.553793
## [2] train-auc:0.608049 eval-auc:0.598934
## [3] train-auc:0.634050 eval-auc:0.629809
## [4] train-auc:0.647444 eval-auc:0.643795
## [5] train-auc:0.653049 eval-auc:0.647573
## [6] train-auc:0.669031 eval-auc:0.659467
## [7] train-auc:0.683516 eval-auc:0.674042
## [8] train-auc:0.686785 eval-auc:0.676482
## [9] train-auc:0.703510 eval-auc:0.690051
## [10] train-auc:0.711103 eval-auc:0.699285
## [11] train-auc:0.720605 eval-auc:0.708173
## [12] train-auc:0.731126 eval-auc:0.717686
## [13] train-auc:0.737943 eval-auc:0.726292
## [14] train-auc:0.736494 eval-auc:0.724213
## [15] train-auc:0.740324 eval-auc:0.728974
## [16] train-auc:0.737052 eval-auc:0.725547
## [17] train-auc:0.744481 eval-auc:0.733439
## [18] train-auc:0.745041 eval-auc:0.734170
## [19] train-auc:0.747679 eval-auc:0.735256
## [20] train-auc:0.747321 eval-auc:0.734147
## [21] train-auc:0.742333 eval-auc:0.729227
## [22] train-auc:0.744022 eval-auc:0.731914
## [23] train-auc:0.744383 eval-auc:0.732373
## [24] train-auc:0.742866 eval-auc:0.730232
## [25] train-auc:0.743158 eval-auc:0.730232
## [26] train-auc:0.744609 eval-auc:0.731327
## [27] train-auc:0.744701 eval-auc:0.731782
## [28] train-auc:0.744458 eval-auc:0.731217
## [29] train-auc:0.743945 eval-auc:0.730663
## [30] train-auc:0.746816 eval-auc:0.732649
## Time difference of 36.41271 secs
#test <- data.table::fread(file = "test.csv")
#ID_code <- test$ID_code
#test$ID_code <- NULL
# Build the xgboost input for the unlabeled test table.
# ID_code is placed on the left-hand side of the formula only so that `.`
# expands to the remaining (predictor) columns; it is not a model response.
# The resulting columns, intercept included, therefore have the same layout as
# the matrices built with `target ~ .`.
test_sparse <- sparse.model.matrix(ID_code ~ ., data = test)
# No `label` is attached: the test table has no known targets. Passing the
# character ID_code vector as the label forced a numeric coercion, which
# produced an all-NA label and an "NAs introduced by coercion" warning.
dtest <- xgb.DMatrix(data = test_sparse)
## Warning in setinfo.xgb.DMatrix(dmat, names(p), p[[1]]): NAs introduced by
## coercion
## [1] 0.4019109 0.3954378 0.4001880 0.3940031 0.3980342 0.3940164
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3928 0.3943 0.3957 0.3963 0.3977 0.4176
## ID_code target
## test_0 : 1 Min. :0.3928
## test_1 : 1 1st Qu.:0.3943
## test_10 : 1 Median :0.3957
## test_100 : 1 Mean :0.3963
## test_1000 : 1 3rd Qu.:0.3977
## test_10000: 1 Max. :0.4176
## (Other) :199994
# r <- 8
# c <- 10
# m0 <- matrix(0, r, c)
# features<-apply(m0, c(1,2), function(x) sample(c(0,1),1))
# folds<-CreateFolds(features,4)
#
# Subtrain <- train[1:10, 1:10]
# Subtrain[,ID_code := NULL]
# Subtrain[2, `:=`(target =3)]
# Subtrain_bkp <- Subtrain
# scale.cols <- colnames(Subtrain)
# Subtrain[, (scale.cols) := lapply(.SD, scale), .SDcols = scale.cols]