train <- read_rds("datasets/train.rds")
train_labels <- read_rds("datasets/train_labes.rds")
test <- read_rds("datasets/test.rds")
#specs <- read_rds("datasets/specs.rds")
sample_submission <- read_rds("datasets/sample_submission.rds")

0.1 Stack Overview of variables

The description of the data shows some variables are grouped into other variables. for example, instllation_id groups all installed Games in each device. Each game has several game_session. Each game_session has game_time. During Game_time several Events are programmed. Each Event has data collapsed in event_data.

library(DiagrammeR)
create_graph() %>%
  add_node(label = "installation_id", type = 'person') %>%
  add_node(label = "Games") %>%
  add_node(label = "Title") %>%
  add_node(label = "Type") %>%
  add_node(label = "Game_session") %>%
  add_node(label = "Game_time") %>%
  add_node(label = "Events") %>%
  add_node(label = "Events_data") %>%
  add_node(label = "Event_id") %>%
  add_node(label = "Event_code") %>%
  add_node(label = "Event_count") %>%
  add_node(label = "Assessement") %>%
  add_edge(from = 1,to = 2) %>%
  add_edge(from = 2,to = 3) %>%
    add_edge(from = 3,to = 4) %>%
  add_edge(from = 4,to = 5) %>%
    add_edge(from = 5,to = 6) %>%
  add_edge(from = 6,to = 7) %>%
    add_edge(from = 7,to = 8) %>%
  add_edge(from = 8,to = 9) %>%
    add_edge(from = 9,to = 10) %>%
    add_edge(from = 9,to = 11) %>%
  add_edge(from = 9,to = 12) %>%
render_graph(layout = "kk")

1 Accuracy group distribution

We grouped the accuracy using, first only event_code and the second event_code and Assessment type.

# p1 <- train %>%
#   filter(event_code == 4100| event_code == 4110)%>% #select(installation_id) %>% n_distinct()
#     mutate(event_data = gsub('"', '', event_data)) %>%
#     mutate(Status =  ifelse(str_detect(event_data, pattern = 'correct:true'),  'correct',
#                       ifelse(str_detect(event_data, pattern = 'correct:false'),  'incorrect',
#                              NA))) %>%
#   filter(!is.na(Status)) %>%
#   group_by(installation_id) %>%
#   summarise(num_correct = sum(Status %in% 'correct'),
#             num_incorrect = sum(Status %in% 'incorrect'),
#             num_NA = sum(is.na(Status))) %>%
#     ungroup %>%
#   mutate(accuracy = num_correct/(num_correct+num_incorrect+num_NA)) %>%
#   mutate(accuracy_group = ifelse(accuracy == 0,  0,
#           ifelse(accuracy> 0 & accuracy < 0.5,  1,
#           ifelse(accuracy >= 0.5 & accuracy < 1,  2,
#                   3
#           )))) %>%
#   group_by(accuracy_group) %>%
#   summarise(Frequencies = n()) %>%
#   ggplot() +
#   aes(x = accuracy_group, y = Frequencies) +
#   geom_col()+
#   geom_text(aes(label = Frequencies), vjust = -0.5) +
#   #theme( axis.text.x  = element_text(angle=45, hjust=1, vjust=0.9)) +
#   labs(title = paste0("Accuracy distribution of sub-train data: event_code == 4100|4110"))
# 
# 
#   p2 <- train %>%
#   filter(type == 'Assessment') %>% 
#   filter(event_code == 4100| event_code == 4110) %>%
#     mutate(event_data = gsub('"', '', event_data)) %>%
#     mutate(Status =  ifelse(str_detect(event_data, pattern = 'correct:true'),  'correct',
#                       ifelse(str_detect(event_data, pattern = 'correct:false'),  'incorrect',
#                              NA))) %>%
#   #filter(!is.na(Status)) %>%
#   group_by(installation_id) %>%
#   summarise(num_correct = sum(Status %in% 'correct'),
#             num_incorrect = sum(Status%in% 'incorrect'),
#             num_NA = sum(is.na(Status))) %>%
#     ungroup %>%
#   mutate(accuracy = num_correct/(num_correct+num_incorrect+num_NA)) %>%
#   mutate(accuracy_group = ifelse(accuracy == 0,  0,
#           ifelse(accuracy> 0 & accuracy < 0.5,  1,
#           ifelse(accuracy >= 0.5 & accuracy < 1,  2,
#                   3
#           )))) %>%
#   group_by(accuracy_group) %>%
#   summarise(Frequencies = n()) %>%
#   ggplot() +
#   aes(x = accuracy_group, y = Frequencies) +
#   geom_col()+
#   geom_text(aes(label = Frequencies), vjust = -0.5) +
#   #theme( axis.text.x  = element_text(angle=45, hjust=1, vjust=0.9)) +
#   labs(title = paste0("Accuracy distribution of sub-train data: type == Assessment & event_code == 4100|4110"))
#   
#   
# gridExtra::grid.arrange(p1, p2, nrow = 2)

The plots are not the same, what is the right way to reproduce accuracy group? (How to reproduce train_labels).

2 Reduce the size of the train data to only installation_id woth accuracy_group

not_req <- setdiff(unique(train$installation_id), unique(train_labels$installation_id))

new_train <- train %>%
  filter(!installation_id %in% not_req)


paste0("Is new_train and train_lables have the same list of installation_id? ", all_equal(unique(new_train$installation_id), unique(train_labels$installation_id)))

## [1] "Is new_train and train_lables have the same list of installation_id? TRUE"

rm(train)
invisible(gc())

3 Convert timestamp

   new_train <- new_train %>% 
  mutate(timestamp = gsub( "T", " ", timestamp)) %>%
  mutate(timestamp = gsub( "\\..*Z$", "", timestamp)) %>%
  mutate(Date = format(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"),"%Y-%m-%d")) %>%
  mutate(Months = lubridate::month(ymd(Date))) %>%
  mutate(game_time = format( as.POSIXct(timestamp)+game_time/1000, "%Y-%m-%d %H:%M:%S")) %>% #
  #arrange(Date) %>%
  mutate(timing = ymd_hms(game_time) - ymd_hms(timestamp))

test <-  test %>%
  mutate(timestamp = gsub( "T", " ", timestamp)) %>%
  mutate(timestamp = gsub( "\\..*Z$", "", timestamp)) %>%
  mutate(Date = format(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"),"%Y-%m-%d")) %>%
  mutate(Months = lubridate::month(ymd(Date)))%>%
  mutate(game_time = format( as.POSIXct(timestamp)+game_time/1000, "%Y-%m-%d %H:%M:%S")) %>% #
  #arrange(Date) %>%
  mutate(timing = ymd_hms(game_time) - ymd_hms(timestamp))

4 ggplot EDA

4.1 How many installation_id per title?

#   
# p1 <- new_train %>%
#   group_by(Months, installation_id, title) %>%
#   summarise(Installation = n()) %>%
#   ggplot() +
#   aes(x = Months, y = Installation, fill = title) +
#   geom_col() +
#   theme( axis.text.x  = element_text(angle=45, hjust=1, vjust=0.9)) +
#   labs(title = "Train data")
#  
# 
# p2 <- test %>%
#   group_by(Months, installation_id, title) %>%
#   summarise(Installation = n()) %>%
#   ggplot() +
#   aes(x = Months, y = Installation, fill = title) +
#   geom_col() +
#   theme( axis.text.x  = element_text(angle=45, hjust=1, vjust=0.9)) +
#   labs(title = "Test data")
# 
# gridExtra::grid.arrange(p1, p2, nrow = 2)

# rm(p1, p2)
# invisible(gc())

4.2 What id the relationships between Title and World

# p <- new_train %>%
#   mutate(Months = lubridate::month(ymd(Date))) %>%
#   group_by(Months, title, world) %>%
#   summarise(frequency = n()) %>%
#   ggplot()+
#    facet_grid(world~ .) +
#   aes(x = Months, y = frequency, color = title) +
#   geom_point()
# 
# ggplotly(p)

# p <- test %>%
#   group_by(Months, title, world) %>%
#   summarise(frequency = n()) %>%
#   ggplot()+
#    facet_grid(world~ .) +
#   aes(x = Months, y = frequency, color = title) +
#   geom_point()
# 
# ggplotly(p)

Some worlds are specific to a specific title.

4.3 event_count is the count of event_id?

event_count is an incremental counter of events within a game session (offset at 1). Extracted from event_data.

p1 <- new_train %>% 
group_by(Months, event_id, game_session) %>%
summarise(Events = sum(event_count)) %>%
ggplot() +
  aes(x = Months, y = Events) +
  geom_col() +
  labs(title = 'Train Data')

p2 <- test %>% 
group_by(Months, event_id, game_session) %>%
summarise(Events = sum(event_count)) %>%
ggplot() +
  aes(x = Months, y = Events) +
  geom_col() +
  labs(title = 'Test Data')

gridExtra::grid.arrange(p1,p2, nrow = 2)

5 Compute accuracy of test data if exist

   get_accuracy <- function(df, game ,code){
tmp <- df %>%
  filter(str_detect(title , pattern = game)) %>%
  filter(event_code == code) %>%
  mutate(event_data = gsub('"', '', event_data)) %>%
  mutate(Status =  ifelse(str_detect(event_data, pattern = 'correct:true'),  'correct',
                    ifelse(str_detect(event_data, pattern = 'correct:false'),  'incorrect',
                           NA))) %>%
  group_by(installation_id, game_session, title) %>%
  summarise(num_correct = sum(Status %in% 'correct'),
            num_incorrect = sum(Status %in% 'incorrect')) %>%
  ungroup %>%
  mutate(accuracy = num_correct/(num_correct+num_incorrect)) %>%
  mutate(accuracy_group = ifelse(accuracy == 0,  0L,
          ifelse(accuracy> 0 & accuracy < 0.5,  1L,
          ifelse(accuracy >= 0.5 & accuracy < 1,  2L,
                  3L
          )))) 
  return(tmp)
}
bird_accuracy_ <- get_accuracy(test ,'Bird Measurer' , '4110')
cart_accuracy_ <- get_accuracy(test, "Cart Balancer", '4100')
caudron_accuracy_ <- get_accuracy(test, "Cauldron Filler", "4100")
chest_accuracy_ <- get_accuracy(test, "Chest Sorter", "4100")
mushroom_accuracy_ <- get_accuracy(test, "Mushroom Sorter", "4100")

test_labels <- dplyr::bind_rows(bird_accuracy_, cart_accuracy_, caudron_accuracy_, chest_accuracy_, mushroom_accuracy_)

test_labels %>% head()

## # A tibble: 6 x 7
##   installation_id game_session title num_correct num_incorrect accuracy
##   <chr>           <chr>        <chr>       <int>         <int>    <dbl>
## 1 01242218        597a8839a5a… Bird…           1             2    0.333
## 2 027e7ce5        5079b43514b… Bird…           1             0    1    
## 3 02a29f99        c043d48e6b0… Bird…           0            11    0    
## 4 02a29f99        dc1c979f6c9… Bird…           0             2    0    
## 5 04d31500        2747d8022de… Bird…           1             0    1    
## 6 05771bba        a9ee75f6507… Bird…           0             4    0    
## # … with 1 more variable: accuracy_group <int>

5.1 subtest without accuracy

## get the last assessment for each game_session
 subtest_with_accuracy <- test %>%
  left_join(test_labels, by = c("installation_id", "game_session", "title")) %>%
  filter(!is.na(accuracy_group)) %>%
  #filter(type == "Assessment") %>% 
  arrange(desc(timestamp)) %>% 
  distinct(installation_id, .keep_all = TRUE)                                                                                                              
 
 subtest_with_accuracy %>% head()

##   event_id     game_session           timestamp
## 1 3393b68b 9dd2ffd1f297124d 2019-10-14 21:00:19
## 2 ecaab346 ae4b5b3b3e61455c 2019-10-14 19:20:04
## 3 2b058fe3 db9dd5f9faebf692 2019-10-14 15:08:09
## 4 ecaab346 b9fd73b70b629c6d 2019-10-14 14:50:21
## 5 5c2f29ca 33cc05ed9b92a7c1 2019-10-14 13:56:14
## 6 ecaab346 7bd2e14dcc3c3ccd 2019-10-14 13:13:57
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              event_data
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                               {""session_duration"":73009,""event_count"":57,""game_time"":73009,""event_code"":2010}
## 2                                                                                                                                                                                                                                                                                         {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7430,""event_count"":13,""game_time"":17661,""event_code"":3121}
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                               {""session_duration"":40812,""event_count"":50,""game_time"":40812,""event_code"":2010}
## 4                                                                                                                                                                                                                                                                                         {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7319,""event_count"":31,""game_time"":28524,""event_code"":3121}
## 5 {""crystal_id"":""gem06"",""weight"":4,""coordinates"":{""x"":782,""y"":276,""stage_width"":1015,""stage_height"":762},""side"":""right"",""left"":[{""id"":""gem05"",""weight"":4}],""duration"":435,""right"":[{""id"":""gem06"",""weight"":4}],""crystals"":[{""id"":""gem07"",""weight"":6},{""id"":""gem01"",""weight"":1},{""id"":""gem03"",""weight"":3},{""id"":""gem02"",""weight"":1},{""id"":""gem08"",""weight"":6},{""id"":""gem04"",""weight"":3}],""source"":""resources"",""event_count"":35,""game_time"":15403,""event_code"":4020}
## 6                                                                                                                                                                                                                                                                                         {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7333,""event_count"":71,""game_time"":85719,""event_code"":3121}
##   installation_id event_count event_code           game_time
## 1        101d16f5          57       2010 2019-10-14 21:01:32
## 2        532002a8          13       3121 2019-10-14 19:20:21
## 3        1962067f          50       2010 2019-10-14 15:08:49
## 4        1ed372f8          31       3121 2019-10-14 14:50:49
## 5        7973d319          35       4020 2019-10-14 13:56:29
## 6        7fe9ca96          71       3121 2019-10-14 13:15:22
##                          title       type        world       Date Months
## 1   Bird Measurer (Assessment) Assessment  TREETOPCITY 2019-10-14     10
## 2   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 3 Cauldron Filler (Assessment) Assessment    MAGMAPEAK 2019-10-14     10
## 4   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 5   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 6   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
##    timing num_correct num_incorrect  accuracy accuracy_group
## 1 73 secs           1             2 0.3333333              1
## 2 17 secs           1             0 1.0000000              3
## 3 40 secs           1             0 1.0000000              3
## 4 28 secs           1             0 1.0000000              3
## 5 15 secs           1             1 0.5000000              2
## 6 85 secs           1             2 0.3333333              1

## subtest without accuracy

## filter only last assessment for each instllation_id in test dataset
subtest_without_accuracy <- test %>%
   anti_join( subtest_with_accuracy , by = c("installation_id")) %>%
   filter(type == "Assessment") %>% 
   arrange(desc(timestamp)) %>% 
   distinct(installation_id, .keep_all = TRUE)  
                                                                                                                  
subtest_without_accuracy %>% head()

##   event_id     game_session           timestamp
## 1 3bfd1a65 5b010743b38ab0b6 2019-10-14 18:17:18
## 2 3bfd1a65 9f7493288742d987 2019-10-14 17:43:07
## 3 7ad3efc6 b98699251de79a27 2019-10-14 17:13:02
## 4 5b49460a 9c9f677f5b66d2c7 2019-10-14 13:06:39
## 5 3bfd1a65 b679dd2215f0dba8 2019-10-14 03:10:00
## 6 90d848e0 f3885a783437995b 2019-10-14 01:16:00
##                                                                    event_data
## 1 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 2 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 3 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 4 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 5 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 6 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
##   installation_id event_count event_code           game_time
## 1        d1e82789           1       2000 2019-10-14 18:17:18
## 2        73a78f04           1       2000 2019-10-14 17:43:07
## 3        46486950           1       2000 2019-10-14 17:13:02
## 4        c64ad87e           1       2000 2019-10-14 13:06:39
## 5        dbb1a1d3           1       2000 2019-10-14 03:10:00
## 6        6a30ec55           1       2000 2019-10-14 01:16:00
##                          title       type        world       Date Months
## 1 Mushroom Sorter (Assessment) Assessment  TREETOPCITY 2019-10-14     10
## 2 Mushroom Sorter (Assessment) Assessment  TREETOPCITY 2019-10-14     10
## 3   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 4    Chest Sorter (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 5 Mushroom Sorter (Assessment) Assessment  TREETOPCITY 2019-10-14     10
## 6 Cauldron Filler (Assessment) Assessment    MAGMAPEAK 2019-10-14     10
##   timing
## 1 0 secs
## 2 0 secs
## 3 0 secs
## 4 0 secs
## 5 0 secs
## 6 0 secs

rm(test)

6 Feature Engineering

6.1 useful Functions

#convert_timestamp <- function(df, game, code){
#  tmp <- df %>%
#  filter(str_detect(title , pattern = game)) %>%
#  filter(event_code %in% code) %>%
  #mutate(timestamp = gsub( "T", " ", timestamp)) %>%
  #mutate(timestamp = gsub( "\\..*Z$", "", timestamp)) %>%
#  mutate(Date = as_datetime(format(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"),"%Y-%m-%d %H:%M:%S"))) %>%
#  mutate(game_time = format( as.POSIXct(Date)+game_time/1000, "%Y-%m-%d %H:%M:%S")) %>%
#  arrange(Date) %>%
#  mutate(timing = ymd_hms(game_time) - ymd_hms(Date))
  
#  return(tmp)
#}



featuring <- function(df){
  tmp <- df
  #tmp <- convert_timestamp(df, game, code)
  
## How long time has the game session to get the correct answer?
  longtime <- tmp %>%
    group_by(installation_id, game_session) %>%
    summarise(longtime = as.numeric(sum(timing))) %>%
    ungroup()
    #right_join(tmp, by= c("installation_id", "game_session"))
  
## `Mean`, `median` and `sd` and timing
  timing_feat <- tmp  %>%
  group_by(installation_id) %>%
  summarise(mean_timing = as.numeric(mean(timing, na.rm = TRUE)),
            median_timing = as.numeric(median(timing, na.rm = TRUE)),
            sd_timing = as.numeric(sd(timing, na.rm = TRUE))) %>%
    ungroup()
  

# Event count and attempts featuring
## How many attempts per instllation_id to get the succes the game_session?
  attempts <- tmp %>%
  group_by(installation_id) %>%
  summarise(attempts =as.numeric(n())) %>%
    ungroup()
  #right_join(tmp , by = "installation_id")
  
## mean, median, sd of event_count
  event_count_feat <- tmp %>%
  group_by(installation_id) %>%
  summarise(mean_event_count = as.numeric(mean(event_count, na.rm = TRUE)),
            median_event_count = as.numeric(median(mean_event_count, na.rm = TRUE)),
            sd_event_count = as.numeric(sd(event_count,na.rm = TRUE))) %>%
    ungroup()
  
  # encode title

  
  tmp %>%
    left_join(longtime, by= c("installation_id", "game_session")) %>%
    left_join(timing_feat, by = "installation_id") %>%
    left_join(attempts, by = "installation_id") %>%
    left_join(event_count_feat, by = "installation_id") %>%
    # Datetime , months, week, day , weekend, features
    mutate(datetime = as.numeric(log1p(as.numeric(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"))))) %>%
    mutate(Date = as.Date(Date)) %>%
    mutate(Months = lubridate::month(ymd(Date))) %>%
    mutate(Week = lubridate::week(ymd(Date))) %>%
    mutate(wday = lubridate::wday(ymd(Date))) %>%
    mutate(weekend = ifelse(wday == 1 | wday == 6, 1, 0)) %>%
    mutate(title = as.numeric(as.factor(title))) %>%
    mutate(type = as.numeric(as.factor(type))) %>%
    mutate(world = as.numeric(as.factor(world))) %>%
    select( -Date, -game_session, -event_id,  -timing,-game_time, -timestamp, -event_data ) -> tmp
  
  
  
  return(tmp)
    
}

7 Train & Test Pools

#new_train <- convert_timestamp(new_train, game = paste(c("Bird Measurer","Cart Balancer", "Mushroom Sorter",
 #                                     "Cauldron Filler", "Chest Sorter"), collapse = "|") ,
  #                                        code = c("4100", "4110"))                                                                                                                

Train <- train_labels %>%
  select(installation_id, game_session, title, accuracy_group) %>%
  right_join(new_train, by = c("installation_id", "game_session", "title")) %>%
  filter(!is.na(accuracy_group))

Train <- featuring(Train) %>% select(-installation_id)
#Train <- Train[,colSums(is.na(Train))<nrow(Train)]                                                                                                                  
#subtest_without_accuracy <- convert_timestamp(subtest_without_accuracy, game = "Assessment", code = "2000")
Test <- featuring(subtest_without_accuracy)

#Test <- Test[,colSums(is.na(Test))<nrow(Test)]

Test %>% head()

##   installation_id event_count event_code title type world Months longtime
## 1        d1e82789           1       2000     5    1     3     10        0
## 2        73a78f04           1       2000     5    1     3     10        0
## 3        46486950           1       2000     2    1     1     10        0
## 4        c64ad87e           1       2000     4    1     1     10        0
## 5        dbb1a1d3           1       2000     5    1     3     10        0
## 6        6a30ec55           1       2000     3    1     2     10        0
##   mean_timing median_timing sd_timing attempts mean_event_count
## 1           0             0        NA        1                1
## 2           0             0        NA        1                1
## 3           0             0        NA        1                1
## 4           0             0        NA        1                1
## 5           0             0        NA        1                1
## 6           0             0        NA        1                1
##   median_event_count sd_event_count datetime Week wday weekend
## 1                  1             NA 21.17502   41    2       0
## 2                  1             NA 21.17502   41    2       0
## 3                  1             NA 21.17502   41    2       0
## 4                  1             NA 21.17501   41    2       0
## 5                  1             NA 21.17499   41    2       0
## 6                  1             NA 21.17499   41    2       0

Train %>% head(1000) %>% head()

##   title accuracy_group event_count event_code type world Months longtime
## 1     5              3           1       2000    1     3      8      821
## 2     5              3           2       2025    1     3      8      821
## 3     5              3           3       3010    1     3      8      821
## 4     5              3           4       3110    1     3      8      821
## 5     5              3           5       3010    1     3      8      821
## 6     5              3           6       4025    1     3      8      821
##   mean_timing median_timing sd_timing attempts mean_event_count
## 1    25.92623            18  23.86197      244         28.95492
## 2    25.92623            18  23.86197      244         28.95492
## 3    25.92623            18  23.86197      244         28.95492
## 4    25.92623            18  23.86197      244         28.95492
## 5    25.92623            18  23.86197      244         28.95492
## 6    25.92623            18  23.86197      244         28.95492
##   median_event_count sd_event_count datetime Week wday weekend
## 1           28.95492       21.14933 21.17119   32    3       0
## 2           28.95492       21.14933 21.17119   32    3       0
## 3           28.95492       21.14933 21.17119   32    3       0
## 4           28.95492       21.14933 21.17119   32    3       0
## 5           28.95492       21.14933 21.17119   32    3       0
## 6           28.95492       21.14933 21.17119   32    3       0

7.1 Data processiong

rm( new_train, train_labels)
invisible(gc())

tmp <- rsample::initial_split(Train, prop = 3/4)

train <- rsample::training(tmp)
valid <- rsample::testing(tmp)

#x_train <- sample_frac(Train, 0.75)
#x_valid <- x_train %>% anti_join(x_train1, by = "installation_id")

y_train <- train$accuracy_group
x_train <- train %>% select(-accuracy_group)
y_valid <- valid$accuracy_group
x_valid <- valid %>% select(-accuracy_group)


rm(Train)
invisible(gc())

8 xgboost modeling

8.1 mlogloss Cross validation

dtrain <- xgb.DMatrix(data = as.matrix(x_train), label = y_train)                                                                                                                
dval <- xgb.DMatrix(data = as.matrix(x_valid), label = y_valid)


eval_kappa<-function(preds, dtrain) {

  labels <- y_train #getinfo(dtrain, 'label')

  err <- Metrics::ScoreQuadraticWeightedKappa(rater.a = labels,
                                              rater.b = as.integer(round(preds)),
                                              min.rating = 0, 
                                              max.rating = 3)
    
  return(list(name = "kappa", value = err, higher_better = TRUE))
}

                                                                                                                  
                                                                                                                  
xgb_params <- list(objective = "multi:softprob",
                   eval_metric = "mlogloss",
                   num_class = 4,
                   colsample_bytree= 0.5,                 
                   eta = 0.1,
                   max_depth= 10,
                   subsample= 0.5
                  )
                
cv_model <- xgb.cv(params = xgb_params,
                   data = dtrain,
                   #eval_metric = list(dtrain, dval),  #eval_kappa,
                   nrounds = 30,
                   verbose = TRUE,
                   maximize = FALSE,
                   nfold = 5,
                   early_stopping_round = 10,
                   print_every_n = 10,
                   prediction = TRUE)

## [1]  train-mlogloss:1.326928+0.006018    test-mlogloss:1.327354+0.006169 
## Multiple eval metrics are present. Will use test_mlogloss for early stopping.
## Will train until test_mlogloss hasn't improved in 10 rounds.
## 
## [11] train-mlogloss:0.925071+0.007549    test-mlogloss:0.929192+0.007771 
## [21] train-mlogloss:0.716480+0.006871    test-mlogloss:0.722792+0.006446 
## [30] train-mlogloss:0.601111+0.006551    test-mlogloss:0.608601+0.005855

                                                                                                                  #Best Number of iterations
    num_iterations = cv_model$best_iteration 
                                                                                                                  
    xgb_model = xgb.train(params = xgb_params,
                            data = dtrain,
                            nrounds= num_iterations,
                            verbose_eval= FALSE
                 )

8.2 kappa evaluation

dvalid = xgb.DMatrix( as.matrix(x_valid))

pred_valid <- predict(xgb_model, dvalid)

Metrics::ScoreQuadraticWeightedKappa(as.integer(round(pred_valid)), 
                                     y_valid, 
                                     min.rating = 0, 
                                     max.rating = 3)

## [1] -0.0001701649

8.3 Confusion matrix

#evaluate the default model
valid_prediction <- matrix(pred_valid, nrow = 4, ncol=length(pred_valid)/4) %>%
                   t() %>% 
                  data.frame() %>% 
                  mutate(label = y_valid , max_prob = max.col(., "last") - 1)

caret::confusionMatrix(factor(valid_prediction$max_prob), factor(valid_prediction$label), mode = "everything")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3
##          0 56102  3257  2827  1593
##          1  2348 44270  2055  1856
##          2   440   380 14404   221
##          3  5023  1231  8156 72198
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8642          
##                  95% CI : (0.8627, 0.8656)
##     No Information Rate : 0.3507          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8088          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity            0.8778   0.9009  0.52489   0.9516
## Specificity            0.9496   0.9626  0.99449   0.8974
## Pos Pred Value         0.8796   0.8761  0.93260   0.8336
## Neg Pred Value         0.9488   0.9706  0.93511   0.9717
## Precision              0.8796   0.8761  0.93260   0.8336
## Recall                 0.8778   0.9009  0.52489   0.9516
## F1                     0.8787   0.8884  0.67172   0.8887
## Prevalence             0.2954   0.2271  0.12683   0.3507
## Detection Rate         0.2593   0.2046  0.06657   0.3337
## Detection Prevalence   0.2948   0.2335  0.07139   0.4003
## Balanced Accuracy      0.9137   0.9318  0.75969   0.9245

8.4 Variable Importance

# get the feature real names
names <- colnames(x_train) 
# compute feature importance matrix
importance_matrix = xgb.importance(feature_names = names, model = xgb_model)

importance_matrix

##                Feature        Gain       Cover  Frequency
##  1:           longtime 0.311657669 0.179654669 0.12249884
##  2:   mean_event_count 0.076179673 0.104020149 0.08651550
##  3:              title 0.075033526 0.076192120 0.02471078
##  4:           datetime 0.066318722 0.052715334 0.12259139
##  5:     sd_event_count 0.065085917 0.083657376 0.08027765
##  6:        mean_timing 0.062558779 0.066672920 0.10613605
##  7:           attempts 0.060334951 0.085545368 0.08623785
##  8:          sd_timing 0.058752235 0.082715567 0.08977325
##  9:      median_timing 0.048336943 0.058460970 0.07524294
## 10:        event_count 0.040619109 0.041563834 0.03287367
## 11:              world 0.034495075 0.043189469 0.01436372
## 12: median_event_count 0.032103852 0.046870422 0.04233225
## 13:         event_code 0.032013751 0.058088050 0.02502545
## 14:               wday 0.014309550 0.008121158 0.03577973
## 15:               Week 0.011420599 0.006978564 0.02709857
## 16:             Months 0.006712531 0.002345595 0.01797316
## 17:            weekend 0.004067118 0.003208435 0.01056918

8.5 Plot features importance

# plot
xgboost::xgb.ggplot.importance(importance_matrix, top_n = 10) +
ggplot2::theme_minimal()

9 xgboost prediction

test_installation_id = Test$installation_id
x_test <- Test %>% select(-installation_id)
x_test <- x_test %>% select(colnames(x_train))
dtest <- xgb.DMatrix(as.matrix(x_test))

rm(Test)
invisible(gc())

pred_test <- predict(xgb_model, dtest,reshape=TRUE) 
          
## reshape dataframe and get which class                                                                                                                  
pred_test_class <- matrix(pred_test, nrow = 4, ncol = length(pred_test) / 4) %>% 
               t() %>%
               data.frame() %>%
               mutate(max = max.col(., ties.method = "last") - 1, label = test_installation_id) 


submission2 <- data.frame(
  installation_id = as.character(pred_test_class$label),
  accuracy_group = pred_test_class$max #pred_test_class[, unique(pred_test_class$max)]
) 

submission3 <- dplyr::bind_rows(submission2, subtest_with_accuracy) %>% select(installation_id, accuracy_group)

## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

submission <-  submission3 %>%                                                                                              
right_join(sample_submission, by = "installation_id") %>%
  select(installation_id, accuracy_group.x) %>%
  rename(accuracy_group = accuracy_group.x)
                                                                                                                  
                                                                                                                  
fwrite(submission, "submission.csv")

submission %>% head()

##   installation_id accuracy_group
## 1        00abaee7              3
## 2        01242218              1
## 3        017c5718              2
## 4        01a44906              2
## 5        01bc6cb6              1
## 6        02256298              3

Predict the number of attempts

2019-12-09