train <- read_rds("datasets/train.rds")
train_labels <- read_rds("datasets/train_labes.rds")
test <- read_rds("datasets/test.rds")
#specs <- read_rds("datasets/specs.rds")
sample_submission <- read_rds("datasets/sample_submission.rds")
The description of the data shows some variables are grouped into other variables. for example, instllation_id groups all installed Games in each device. Each game has several game_session. Each game_session has game_time. During Game_time several Events are programmed. Each Event has data collapsed in event_data.
library(DiagrammeR)
create_graph() %>%
add_node(label = "installation_id", type = 'person') %>%
add_node(label = "Games") %>%
add_node(label = "Title") %>%
add_node(label = "Type") %>%
add_node(label = "Game_session") %>%
add_node(label = "Game_time") %>%
add_node(label = "Events") %>%
add_node(label = "Events_data") %>%
add_node(label = "Event_id") %>%
add_node(label = "Event_code") %>%
add_node(label = "Event_count") %>%
add_node(label = "Assessement") %>%
add_edge(from = 1,to = 2) %>%
add_edge(from = 2,to = 3) %>%
add_edge(from = 3,to = 4) %>%
add_edge(from = 4,to = 5) %>%
add_edge(from = 5,to = 6) %>%
add_edge(from = 6,to = 7) %>%
add_edge(from = 7,to = 8) %>%
add_edge(from = 8,to = 9) %>%
add_edge(from = 9,to = 10) %>%
add_edge(from = 9,to = 11) %>%
add_edge(from = 9,to = 12) %>%
render_graph(layout = "kk")
We grouped the accuracy using, first only event_code and the second event_code and Assessment type.
# p1 <- train %>%
# filter(event_code == 4100| event_code == 4110)%>% #select(installation_id) %>% n_distinct()
# mutate(event_data = gsub('"', '', event_data)) %>%
# mutate(Status = ifelse(str_detect(event_data, pattern = 'correct:true'), 'correct',
# ifelse(str_detect(event_data, pattern = 'correct:false'), 'incorrect',
# NA))) %>%
# filter(!is.na(Status)) %>%
# group_by(installation_id) %>%
# summarise(num_correct = sum(Status %in% 'correct'),
# num_incorrect = sum(Status %in% 'incorrect'),
# num_NA = sum(is.na(Status))) %>%
# ungroup %>%
# mutate(accuracy = num_correct/(num_correct+num_incorrect+num_NA)) %>%
# mutate(accuracy_group = ifelse(accuracy == 0, 0,
# ifelse(accuracy> 0 & accuracy < 0.5, 1,
# ifelse(accuracy >= 0.5 & accuracy < 1, 2,
# 3
# )))) %>%
# group_by(accuracy_group) %>%
# summarise(Frequencies = n()) %>%
# ggplot() +
# aes(x = accuracy_group, y = Frequencies) +
# geom_col()+
# geom_text(aes(label = Frequencies), vjust = -0.5) +
# #theme( axis.text.x = element_text(angle=45, hjust=1, vjust=0.9)) +
# labs(title = paste0("Accuracy distribution of sub-train data: event_code == 4100|4110"))
#
#
# p2 <- train %>%
# filter(type == 'Assessment') %>%
# filter(event_code == 4100| event_code == 4110) %>%
# mutate(event_data = gsub('"', '', event_data)) %>%
# mutate(Status = ifelse(str_detect(event_data, pattern = 'correct:true'), 'correct',
# ifelse(str_detect(event_data, pattern = 'correct:false'), 'incorrect',
# NA))) %>%
# #filter(!is.na(Status)) %>%
# group_by(installation_id) %>%
# summarise(num_correct = sum(Status %in% 'correct'),
# num_incorrect = sum(Status%in% 'incorrect'),
# num_NA = sum(is.na(Status))) %>%
# ungroup %>%
# mutate(accuracy = num_correct/(num_correct+num_incorrect+num_NA)) %>%
# mutate(accuracy_group = ifelse(accuracy == 0, 0,
# ifelse(accuracy> 0 & accuracy < 0.5, 1,
# ifelse(accuracy >= 0.5 & accuracy < 1, 2,
# 3
# )))) %>%
# group_by(accuracy_group) %>%
# summarise(Frequencies = n()) %>%
# ggplot() +
# aes(x = accuracy_group, y = Frequencies) +
# geom_col()+
# geom_text(aes(label = Frequencies), vjust = -0.5) +
# #theme( axis.text.x = element_text(angle=45, hjust=1, vjust=0.9)) +
# labs(title = paste0("Accuracy distribution of sub-train data: type == Assessment & event_code == 4100|4110"))
#
#
# gridExtra::grid.arrange(p1, p2, nrow = 2)
not_req <- setdiff(unique(train$installation_id), unique(train_labels$installation_id))
new_train <- train %>%
filter(!installation_id %in% not_req)
paste0("Is new_train and train_lables have the same list of installation_id? ", all_equal(unique(new_train$installation_id), unique(train_labels$installation_id)))
## [1] "Is new_train and train_lables have the same list of installation_id? TRUE"
new_train <- new_train %>%
mutate(timestamp = gsub( "T", " ", timestamp)) %>%
mutate(timestamp = gsub( "\\..*Z$", "", timestamp)) %>%
mutate(Date = format(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"),"%Y-%m-%d")) %>%
mutate(Months = lubridate::month(ymd(Date))) %>%
mutate(game_time = format( as.POSIXct(timestamp)+game_time/1000, "%Y-%m-%d %H:%M:%S")) %>% #
#arrange(Date) %>%
mutate(timing = ymd_hms(game_time) - ymd_hms(timestamp))
test <- test %>%
mutate(timestamp = gsub( "T", " ", timestamp)) %>%
mutate(timestamp = gsub( "\\..*Z$", "", timestamp)) %>%
mutate(Date = format(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"),"%Y-%m-%d")) %>%
mutate(Months = lubridate::month(ymd(Date)))%>%
mutate(game_time = format( as.POSIXct(timestamp)+game_time/1000, "%Y-%m-%d %H:%M:%S")) %>% #
#arrange(Date) %>%
mutate(timing = ymd_hms(game_time) - ymd_hms(timestamp))
#
# p1 <- new_train %>%
# group_by(Months, installation_id, title) %>%
# summarise(Installation = n()) %>%
# ggplot() +
# aes(x = Months, y = Installation, fill = title) +
# geom_col() +
# theme( axis.text.x = element_text(angle=45, hjust=1, vjust=0.9)) +
# labs(title = "Train data")
#
#
# p2 <- test %>%
# group_by(Months, installation_id, title) %>%
# summarise(Installation = n()) %>%
# ggplot() +
# aes(x = Months, y = Installation, fill = title) +
# geom_col() +
# theme( axis.text.x = element_text(angle=45, hjust=1, vjust=0.9)) +
# labs(title = "Test data")
#
# gridExtra::grid.arrange(p1, p2, nrow = 2)
# p <- new_train %>%
# mutate(Months = lubridate::month(ymd(Date))) %>%
# group_by(Months, title, world) %>%
# summarise(frequency = n()) %>%
# ggplot()+
# facet_grid(world~ .) +
# aes(x = Months, y = frequency, color = title) +
# geom_point()
#
# ggplotly(p)
# p <- test %>%
# group_by(Months, title, world) %>%
# summarise(frequency = n()) %>%
# ggplot()+
# facet_grid(world~ .) +
# aes(x = Months, y = frequency, color = title) +
# geom_point()
#
# ggplotly(p)
p1 <- new_train %>%
group_by(Months, event_id, game_session) %>%
summarise(Events = sum(event_count)) %>%
ggplot() +
aes(x = Months, y = Events) +
geom_col() +
labs(title = 'Train Data')
p2 <- test %>%
group_by(Months, event_id, game_session) %>%
summarise(Events = sum(event_count)) %>%
ggplot() +
aes(x = Months, y = Events) +
geom_col() +
labs(title = 'Test Data')
gridExtra::grid.arrange(p1,p2, nrow = 2)
get_accuracy <- function(df, game ,code){
tmp <- df %>%
filter(str_detect(title , pattern = game)) %>%
filter(event_code == code) %>%
mutate(event_data = gsub('"', '', event_data)) %>%
mutate(Status = ifelse(str_detect(event_data, pattern = 'correct:true'), 'correct',
ifelse(str_detect(event_data, pattern = 'correct:false'), 'incorrect',
NA))) %>%
group_by(installation_id, game_session, title) %>%
summarise(num_correct = sum(Status %in% 'correct'),
num_incorrect = sum(Status %in% 'incorrect')) %>%
ungroup %>%
mutate(accuracy = num_correct/(num_correct+num_incorrect)) %>%
mutate(accuracy_group = ifelse(accuracy == 0, 0L,
ifelse(accuracy> 0 & accuracy < 0.5, 1L,
ifelse(accuracy >= 0.5 & accuracy < 1, 2L,
3L
))))
return(tmp)
}
bird_accuracy_ <- get_accuracy(test ,'Bird Measurer' , '4110')
cart_accuracy_ <- get_accuracy(test, "Cart Balancer", '4100')
caudron_accuracy_ <- get_accuracy(test, "Cauldron Filler", "4100")
chest_accuracy_ <- get_accuracy(test, "Chest Sorter", "4100")
mushroom_accuracy_ <- get_accuracy(test, "Mushroom Sorter", "4100")
test_labels <- dplyr::bind_rows(bird_accuracy_, cart_accuracy_, caudron_accuracy_, chest_accuracy_, mushroom_accuracy_)
test_labels %>% head()
## # A tibble: 6 x 7
## installation_id game_session title num_correct num_incorrect accuracy
## <chr> <chr> <chr> <int> <int> <dbl>
## 1 01242218 597a8839a5a… Bird… 1 2 0.333
## 2 027e7ce5 5079b43514b… Bird… 1 0 1
## 3 02a29f99 c043d48e6b0… Bird… 0 11 0
## 4 02a29f99 dc1c979f6c9… Bird… 0 2 0
## 5 04d31500 2747d8022de… Bird… 1 0 1
## 6 05771bba a9ee75f6507… Bird… 0 4 0
## # … with 1 more variable: accuracy_group <int>
## get the last assessment for each game_session
subtest_with_accuracy <- test %>%
left_join(test_labels, by = c("installation_id", "game_session", "title")) %>%
filter(!is.na(accuracy_group)) %>%
#filter(type == "Assessment") %>%
arrange(desc(timestamp)) %>%
distinct(installation_id, .keep_all = TRUE)
subtest_with_accuracy %>% head()
## event_id game_session timestamp
## 1 3393b68b 9dd2ffd1f297124d 2019-10-14 21:00:19
## 2 ecaab346 ae4b5b3b3e61455c 2019-10-14 19:20:04
## 3 2b058fe3 db9dd5f9faebf692 2019-10-14 15:08:09
## 4 ecaab346 b9fd73b70b629c6d 2019-10-14 14:50:21
## 5 5c2f29ca 33cc05ed9b92a7c1 2019-10-14 13:56:14
## 6 ecaab346 7bd2e14dcc3c3ccd 2019-10-14 13:13:57
## event_data
## 1 {""session_duration"":73009,""event_count"":57,""game_time"":73009,""event_code"":2010}
## 2 {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7430,""event_count"":13,""game_time"":17661,""event_code"":3121}
## 3 {""session_duration"":40812,""event_count"":50,""game_time"":40812,""event_code"":2010}
## 4 {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7319,""event_count"":31,""game_time"":28524,""event_code"":3121}
## 5 {""crystal_id"":""gem06"",""weight"":4,""coordinates"":{""x"":782,""y"":276,""stage_width"":1015,""stage_height"":762},""side"":""right"",""left"":[{""id"":""gem05"",""weight"":4}],""duration"":435,""right"":[{""id"":""gem06"",""weight"":4}],""crystals"":[{""id"":""gem07"",""weight"":6},{""id"":""gem01"",""weight"":1},{""id"":""gem03"",""weight"":3},{""id"":""gem02"",""weight"":1},{""id"":""gem08"",""weight"":6},{""id"":""gem04"",""weight"":3}],""source"":""resources"",""event_count"":35,""game_time"":15403,""event_code"":4020}
## 6 {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7333,""event_count"":71,""game_time"":85719,""event_code"":3121}
## installation_id event_count event_code game_time
## 1 101d16f5 57 2010 2019-10-14 21:01:32
## 2 532002a8 13 3121 2019-10-14 19:20:21
## 3 1962067f 50 2010 2019-10-14 15:08:49
## 4 1ed372f8 31 3121 2019-10-14 14:50:49
## 5 7973d319 35 4020 2019-10-14 13:56:29
## 6 7fe9ca96 71 3121 2019-10-14 13:15:22
## title type world Date Months
## 1 Bird Measurer (Assessment) Assessment TREETOPCITY 2019-10-14 10
## 2 Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14 10
## 3 Cauldron Filler (Assessment) Assessment MAGMAPEAK 2019-10-14 10
## 4 Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14 10
## 5 Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14 10
## 6 Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14 10
## timing num_correct num_incorrect accuracy accuracy_group
## 1 73 secs 1 2 0.3333333 1
## 2 17 secs 1 0 1.0000000 3
## 3 40 secs 1 0 1.0000000 3
## 4 28 secs 1 0 1.0000000 3
## 5 15 secs 1 1 0.5000000 2
## 6 85 secs 1 2 0.3333333 1
## subtest without accuracy
## filter only last assessment for each instllation_id in test dataset
subtest_without_accuracy <- test %>%
anti_join( subtest_with_accuracy , by = c("installation_id")) %>%
filter(type == "Assessment") %>%
arrange(desc(timestamp)) %>%
distinct(installation_id, .keep_all = TRUE)
subtest_without_accuracy %>% head()
## event_id game_session timestamp
## 1 3bfd1a65 5b010743b38ab0b6 2019-10-14 18:17:18
## 2 3bfd1a65 9f7493288742d987 2019-10-14 17:43:07
## 3 7ad3efc6 b98699251de79a27 2019-10-14 17:13:02
## 4 5b49460a 9c9f677f5b66d2c7 2019-10-14 13:06:39
## 5 3bfd1a65 b679dd2215f0dba8 2019-10-14 03:10:00
## 6 90d848e0 f3885a783437995b 2019-10-14 01:16:00
## event_data
## 1 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 2 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 3 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 4 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 5 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 6 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## installation_id event_count event_code game_time
## 1 d1e82789 1 2000 2019-10-14 18:17:18
## 2 73a78f04 1 2000 2019-10-14 17:43:07
## 3 46486950 1 2000 2019-10-14 17:13:02
## 4 c64ad87e 1 2000 2019-10-14 13:06:39
## 5 dbb1a1d3 1 2000 2019-10-14 03:10:00
## 6 6a30ec55 1 2000 2019-10-14 01:16:00
## title type world Date Months
## 1 Mushroom Sorter (Assessment) Assessment TREETOPCITY 2019-10-14 10
## 2 Mushroom Sorter (Assessment) Assessment TREETOPCITY 2019-10-14 10
## 3 Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14 10
## 4 Chest Sorter (Assessment) Assessment CRYSTALCAVES 2019-10-14 10
## 5 Mushroom Sorter (Assessment) Assessment TREETOPCITY 2019-10-14 10
## 6 Cauldron Filler (Assessment) Assessment MAGMAPEAK 2019-10-14 10
## timing
## 1 0 secs
## 2 0 secs
## 3 0 secs
## 4 0 secs
## 5 0 secs
## 6 0 secs
#convert_timestamp <- function(df, game, code){
# tmp <- df %>%
# filter(str_detect(title , pattern = game)) %>%
# filter(event_code %in% code) %>%
#mutate(timestamp = gsub( "T", " ", timestamp)) %>%
#mutate(timestamp = gsub( "\\..*Z$", "", timestamp)) %>%
# mutate(Date = as_datetime(format(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"),"%Y-%m-%d %H:%M:%S"))) %>%
# mutate(game_time = format( as.POSIXct(Date)+game_time/1000, "%Y-%m-%d %H:%M:%S")) %>%
# arrange(Date) %>%
# mutate(timing = ymd_hms(game_time) - ymd_hms(Date))
# return(tmp)
#}
featuring <- function(df){
tmp <- df
#tmp <- convert_timestamp(df, game, code)
## How long time has the game session to get the correct answer?
longtime <- tmp %>%
group_by(installation_id, game_session) %>%
summarise(longtime = as.numeric(sum(timing))) %>%
ungroup()
#right_join(tmp, by= c("installation_id", "game_session"))
## `Mean`, `median` and `sd` and timing
timing_feat <- tmp %>%
group_by(installation_id) %>%
summarise(mean_timing = as.numeric(mean(timing, na.rm = TRUE)),
median_timing = as.numeric(median(timing, na.rm = TRUE)),
sd_timing = as.numeric(sd(timing, na.rm = TRUE))) %>%
ungroup()
# Event count and attempts featuring
## How many attempts per instllation_id to get the succes the game_session?
attempts <- tmp %>%
group_by(installation_id) %>%
summarise(attempts =as.numeric(n())) %>%
ungroup()
#right_join(tmp , by = "installation_id")
## mean, median, sd of event_count
event_count_feat <- tmp %>%
group_by(installation_id) %>%
summarise(mean_event_count = as.numeric(mean(event_count, na.rm = TRUE)),
median_event_count = as.numeric(median(mean_event_count, na.rm = TRUE)),
sd_event_count = as.numeric(sd(event_count,na.rm = TRUE))) %>%
ungroup()
# encode title
tmp %>%
left_join(longtime, by= c("installation_id", "game_session")) %>%
left_join(timing_feat, by = "installation_id") %>%
left_join(attempts, by = "installation_id") %>%
left_join(event_count_feat, by = "installation_id") %>%
# Datetime , months, week, day , weekend, features
mutate(datetime = as.numeric(log1p(as.numeric(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"))))) %>%
mutate(Date = as.Date(Date)) %>%
mutate(Months = lubridate::month(ymd(Date))) %>%
mutate(Week = lubridate::week(ymd(Date))) %>%
mutate(wday = lubridate::wday(ymd(Date))) %>%
mutate(weekend = ifelse(wday == 1 | wday == 6, 1, 0)) %>%
mutate(title = as.numeric(as.factor(title))) %>%
mutate(type = as.numeric(as.factor(type))) %>%
mutate(world = as.numeric(as.factor(world))) %>%
select( -Date, -game_session, -event_id, -timing,-game_time, -timestamp, -event_data ) -> tmp
return(tmp)
}
#new_train <- convert_timestamp(new_train, game = paste(c("Bird Measurer","Cart Balancer", "Mushroom Sorter",
# "Cauldron Filler", "Chest Sorter"), collapse = "|") ,
# code = c("4100", "4110"))
Train <- train_labels %>%
select(installation_id, game_session, title, accuracy_group) %>%
right_join(new_train, by = c("installation_id", "game_session", "title")) %>%
filter(!is.na(accuracy_group))
Train <- featuring(Train) %>% select(-installation_id)
#Train <- Train[,colSums(is.na(Train))<nrow(Train)]
#subtest_without_accuracy <- convert_timestamp(subtest_without_accuracy, game = "Assessment", code = "2000")
Test <- featuring(subtest_without_accuracy)
#Test <- Test[,colSums(is.na(Test))<nrow(Test)]
Test %>% head()
## installation_id event_count event_code title type world Months longtime
## 1 d1e82789 1 2000 5 1 3 10 0
## 2 73a78f04 1 2000 5 1 3 10 0
## 3 46486950 1 2000 2 1 1 10 0
## 4 c64ad87e 1 2000 4 1 1 10 0
## 5 dbb1a1d3 1 2000 5 1 3 10 0
## 6 6a30ec55 1 2000 3 1 2 10 0
## mean_timing median_timing sd_timing attempts mean_event_count
## 1 0 0 NA 1 1
## 2 0 0 NA 1 1
## 3 0 0 NA 1 1
## 4 0 0 NA 1 1
## 5 0 0 NA 1 1
## 6 0 0 NA 1 1
## median_event_count sd_event_count datetime Week wday weekend
## 1 1 NA 21.17502 41 2 0
## 2 1 NA 21.17502 41 2 0
## 3 1 NA 21.17502 41 2 0
## 4 1 NA 21.17501 41 2 0
## 5 1 NA 21.17499 41 2 0
## 6 1 NA 21.17499 41 2 0
## title accuracy_group event_count event_code type world Months longtime
## 1 5 3 1 2000 1 3 8 821
## 2 5 3 2 2025 1 3 8 821
## 3 5 3 3 3010 1 3 8 821
## 4 5 3 4 3110 1 3 8 821
## 5 5 3 5 3010 1 3 8 821
## 6 5 3 6 4025 1 3 8 821
## mean_timing median_timing sd_timing attempts mean_event_count
## 1 25.92623 18 23.86197 244 28.95492
## 2 25.92623 18 23.86197 244 28.95492
## 3 25.92623 18 23.86197 244 28.95492
## 4 25.92623 18 23.86197 244 28.95492
## 5 25.92623 18 23.86197 244 28.95492
## 6 25.92623 18 23.86197 244 28.95492
## median_event_count sd_event_count datetime Week wday weekend
## 1 28.95492 21.14933 21.17119 32 3 0
## 2 28.95492 21.14933 21.17119 32 3 0
## 3 28.95492 21.14933 21.17119 32 3 0
## 4 28.95492 21.14933 21.17119 32 3 0
## 5 28.95492 21.14933 21.17119 32 3 0
## 6 28.95492 21.14933 21.17119 32 3 0
rm( new_train, train_labels)
invisible(gc())
tmp <- rsample::initial_split(Train, prop = 3/4)
train <- rsample::training(tmp)
valid <- rsample::testing(tmp)
#x_train <- sample_frac(Train, 0.75)
#x_valid <- x_train %>% anti_join(x_train1, by = "installation_id")
y_train <- train$accuracy_group
x_train <- train %>% select(-accuracy_group)
y_valid <- valid$accuracy_group
x_valid <- valid %>% select(-accuracy_group)
rm(Train)
invisible(gc())
dtrain <- xgb.DMatrix(data = as.matrix(x_train), label = y_train)
dval <- xgb.DMatrix(data = as.matrix(x_valid), label = y_valid)
eval_kappa<-function(preds, dtrain) {
labels <- y_train #getinfo(dtrain, 'label')
err <- Metrics::ScoreQuadraticWeightedKappa(rater.a = labels,
rater.b = as.integer(round(preds)),
min.rating = 0,
max.rating = 3)
return(list(name = "kappa", value = err, higher_better = TRUE))
}
xgb_params <- list(objective = "multi:softprob",
eval_metric = "mlogloss",
num_class = 4,
colsample_bytree= 0.5,
eta = 0.1,
max_depth= 10,
subsample= 0.5
)
cv_model <- xgb.cv(params = xgb_params,
data = dtrain,
#eval_metric = list(dtrain, dval), #eval_kappa,
nrounds = 30,
verbose = TRUE,
maximize = FALSE,
nfold = 5,
early_stopping_round = 10,
print_every_n = 10,
prediction = TRUE)
## [1] train-mlogloss:1.326928+0.006018 test-mlogloss:1.327354+0.006169
## Multiple eval metrics are present. Will use test_mlogloss for early stopping.
## Will train until test_mlogloss hasn't improved in 10 rounds.
##
## [11] train-mlogloss:0.925071+0.007549 test-mlogloss:0.929192+0.007771
## [21] train-mlogloss:0.716480+0.006871 test-mlogloss:0.722792+0.006446
## [30] train-mlogloss:0.601111+0.006551 test-mlogloss:0.608601+0.005855
dvalid = xgb.DMatrix( as.matrix(x_valid))
pred_valid <- predict(xgb_model, dvalid)
Metrics::ScoreQuadraticWeightedKappa(as.integer(round(pred_valid)),
y_valid,
min.rating = 0,
max.rating = 3)
## [1] -0.0001701649
#evaluate the default model
valid_prediction <- matrix(pred_valid, nrow = 4, ncol=length(pred_valid)/4) %>%
t() %>%
data.frame() %>%
mutate(label = y_valid , max_prob = max.col(., "last") - 1)
caret::confusionMatrix(factor(valid_prediction$max_prob), factor(valid_prediction$label), mode = "everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3
## 0 56102 3257 2827 1593
## 1 2348 44270 2055 1856
## 2 440 380 14404 221
## 3 5023 1231 8156 72198
##
## Overall Statistics
##
## Accuracy : 0.8642
## 95% CI : (0.8627, 0.8656)
## No Information Rate : 0.3507
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8088
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity 0.8778 0.9009 0.52489 0.9516
## Specificity 0.9496 0.9626 0.99449 0.8974
## Pos Pred Value 0.8796 0.8761 0.93260 0.8336
## Neg Pred Value 0.9488 0.9706 0.93511 0.9717
## Precision 0.8796 0.8761 0.93260 0.8336
## Recall 0.8778 0.9009 0.52489 0.9516
## F1 0.8787 0.8884 0.67172 0.8887
## Prevalence 0.2954 0.2271 0.12683 0.3507
## Detection Rate 0.2593 0.2046 0.06657 0.3337
## Detection Prevalence 0.2948 0.2335 0.07139 0.4003
## Balanced Accuracy 0.9137 0.9318 0.75969 0.9245
# get the feature real names
names <- colnames(x_train)
# compute feature importance matrix
importance_matrix = xgb.importance(feature_names = names, model = xgb_model)
importance_matrix
## Feature Gain Cover Frequency
## 1: longtime 0.311657669 0.179654669 0.12249884
## 2: mean_event_count 0.076179673 0.104020149 0.08651550
## 3: title 0.075033526 0.076192120 0.02471078
## 4: datetime 0.066318722 0.052715334 0.12259139
## 5: sd_event_count 0.065085917 0.083657376 0.08027765
## 6: mean_timing 0.062558779 0.066672920 0.10613605
## 7: attempts 0.060334951 0.085545368 0.08623785
## 8: sd_timing 0.058752235 0.082715567 0.08977325
## 9: median_timing 0.048336943 0.058460970 0.07524294
## 10: event_count 0.040619109 0.041563834 0.03287367
## 11: world 0.034495075 0.043189469 0.01436372
## 12: median_event_count 0.032103852 0.046870422 0.04233225
## 13: event_code 0.032013751 0.058088050 0.02502545
## 14: wday 0.014309550 0.008121158 0.03577973
## 15: Week 0.011420599 0.006978564 0.02709857
## 16: Months 0.006712531 0.002345595 0.01797316
## 17: weekend 0.004067118 0.003208435 0.01056918
test_installation_id = Test$installation_id
x_test <- Test %>% select(-installation_id)
x_test <- x_test %>% select(colnames(x_train))
dtest <- xgb.DMatrix(as.matrix(x_test))
rm(Test)
invisible(gc())
pred_test <- predict(xgb_model, dtest,reshape=TRUE)
## reshape dataframe and get which class
pred_test_class <- matrix(pred_test, nrow = 4, ncol = length(pred_test) / 4) %>%
t() %>%
data.frame() %>%
mutate(max = max.col(., ties.method = "last") - 1, label = test_installation_id)
submission2 <- data.frame(
installation_id = as.character(pred_test_class$label),
accuracy_group = pred_test_class$max #pred_test_class[, unique(pred_test_class$max)]
)
submission3 <- dplyr::bind_rows(submission2, subtest_with_accuracy) %>% select(installation_id, accuracy_group)
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
submission <- submission3 %>%
right_join(sample_submission, by = "installation_id") %>%
select(installation_id, accuracy_group.x) %>%
rename(accuracy_group = accuracy_group.x)
fwrite(submission, "submission.csv")
submission %>% head()
## installation_id accuracy_group
## 1 00abaee7 3
## 2 01242218 1
## 3 017c5718 2
## 4 01a44906 2
## 5 01bc6cb6 1
## 6 02256298 3