train <- read_rds("datasets/train.rds")
train_labels <- read_rds("datasets/train_labes.rds")
test <- read_rds("datasets/test.rds")
#specs <- read_rds("datasets/specs.rds")
sample_submission <- read_rds("datasets/sample_submission.rds")

0.1 Stack Overview of variables

The description of the data shows some variables are grouped into other variables. for example, instllation_id groups all installed Games in each device. Each game has several game_session. Each game_session has game_time. During Game_time several Events are programmed. Each Event has data collapsed in event_data.

1 Accuracy group distribution

We grouped the accuracy using, first only event_code and the second event_code and Assessment type.

# p1 <- train %>%
#   filter(event_code == 4100| event_code == 4110)%>% #select(installation_id) %>% n_distinct()
#     mutate(event_data = gsub('"', '', event_data)) %>%
#     mutate(Status =  ifelse(str_detect(event_data, pattern = 'correct:true'),  'correct',
#                       ifelse(str_detect(event_data, pattern = 'correct:false'),  'incorrect',
#                              NA))) %>%
#   filter(!is.na(Status)) %>%
#   group_by(installation_id) %>%
#   summarise(num_correct = sum(Status %in% 'correct'),
#             num_incorrect = sum(Status %in% 'incorrect'),
#             num_NA = sum(is.na(Status))) %>%
#     ungroup %>%
#   mutate(accuracy = num_correct/(num_correct+num_incorrect+num_NA)) %>%
#   mutate(accuracy_group = ifelse(accuracy == 0,  0,
#           ifelse(accuracy> 0 & accuracy < 0.5,  1,
#           ifelse(accuracy >= 0.5 & accuracy < 1,  2,
#                   3
#           )))) %>%
#   group_by(accuracy_group) %>%
#   summarise(Frequencies = n()) %>%
#   ggplot() +
#   aes(x = accuracy_group, y = Frequencies) +
#   geom_col()+
#   geom_text(aes(label = Frequencies), vjust = -0.5) +
#   #theme( axis.text.x  = element_text(angle=45, hjust=1, vjust=0.9)) +
#   labs(title = paste0("Accuracy distribution of sub-train data: event_code == 4100|4110"))
# 
# 
#   p2 <- train %>%
#   filter(type == 'Assessment') %>% 
#   filter(event_code == 4100| event_code == 4110) %>%
#     mutate(event_data = gsub('"', '', event_data)) %>%
#     mutate(Status =  ifelse(str_detect(event_data, pattern = 'correct:true'),  'correct',
#                       ifelse(str_detect(event_data, pattern = 'correct:false'),  'incorrect',
#                              NA))) %>%
#   #filter(!is.na(Status)) %>%
#   group_by(installation_id) %>%
#   summarise(num_correct = sum(Status %in% 'correct'),
#             num_incorrect = sum(Status%in% 'incorrect'),
#             num_NA = sum(is.na(Status))) %>%
#     ungroup %>%
#   mutate(accuracy = num_correct/(num_correct+num_incorrect+num_NA)) %>%
#   mutate(accuracy_group = ifelse(accuracy == 0,  0,
#           ifelse(accuracy> 0 & accuracy < 0.5,  1,
#           ifelse(accuracy >= 0.5 & accuracy < 1,  2,
#                   3
#           )))) %>%
#   group_by(accuracy_group) %>%
#   summarise(Frequencies = n()) %>%
#   ggplot() +
#   aes(x = accuracy_group, y = Frequencies) +
#   geom_col()+
#   geom_text(aes(label = Frequencies), vjust = -0.5) +
#   #theme( axis.text.x  = element_text(angle=45, hjust=1, vjust=0.9)) +
#   labs(title = paste0("Accuracy distribution of sub-train data: type == Assessment & event_code == 4100|4110"))
#   
#   
# gridExtra::grid.arrange(p1, p2, nrow = 2)

2 Reduce the size of the train data to only installation_id woth accuracy_group

## [1] "Is new_train and train_lables have the same list of installation_id? TRUE"

3 Convert timestamp

4 ggplot EDA

5 Compute accuracy of test data if exist

## # A tibble: 6 x 7
##   installation_id game_session title num_correct num_incorrect accuracy
##   <chr>           <chr>        <chr>       <int>         <int>    <dbl>
## 1 01242218        597a8839a5a… Bird…           1             2    0.333
## 2 027e7ce5        5079b43514b… Bird…           1             0    1    
## 3 02a29f99        c043d48e6b0… Bird…           0            11    0    
## 4 02a29f99        dc1c979f6c9… Bird…           0             2    0    
## 5 04d31500        2747d8022de… Bird…           1             0    1    
## 6 05771bba        a9ee75f6507… Bird…           0             4    0    
## # … with 1 more variable: accuracy_group <int>

5.1 subtest without accuracy

##   event_id     game_session           timestamp
## 1 3393b68b 9dd2ffd1f297124d 2019-10-14 21:00:19
## 2 ecaab346 ae4b5b3b3e61455c 2019-10-14 19:20:04
## 3 2b058fe3 db9dd5f9faebf692 2019-10-14 15:08:09
## 4 ecaab346 b9fd73b70b629c6d 2019-10-14 14:50:21
## 5 5c2f29ca 33cc05ed9b92a7c1 2019-10-14 13:56:14
## 6 ecaab346 7bd2e14dcc3c3ccd 2019-10-14 13:13:57
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              event_data
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                               {""session_duration"":73009,""event_count"":57,""game_time"":73009,""event_code"":2010}
## 2                                                                                                                                                                                                                                                                                         {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7430,""event_count"":13,""game_time"":17661,""event_code"":3121}
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                               {""session_duration"":40812,""event_count"":50,""game_time"":40812,""event_code"":2010}
## 4                                                                                                                                                                                                                                                                                         {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7319,""event_count"":31,""game_time"":28524,""event_code"":3121}
## 5 {""crystal_id"":""gem06"",""weight"":4,""coordinates"":{""x"":782,""y"":276,""stage_width"":1015,""stage_height"":762},""side"":""right"",""left"":[{""id"":""gem05"",""weight"":4}],""duration"":435,""right"":[{""id"":""gem06"",""weight"":4}],""crystals"":[{""id"":""gem07"",""weight"":6},{""id"":""gem01"",""weight"":1},{""id"":""gem03"",""weight"":3},{""id"":""gem02"",""weight"":1},{""id"":""gem08"",""weight"":6},{""id"":""gem04"",""weight"":3}],""source"":""resources"",""event_count"":35,""game_time"":15403,""event_code"":4020}
## 6                                                                                                                                                                                                                                                                                         {""description"":""You did it! The weight is equal on both sides. It's balanced! SFX_completedtask"",""identifier"":""Dot_YouDidIt,SFX_completedtask"",""media_type"":""audio"",""duration"":7333,""event_count"":71,""game_time"":85719,""event_code"":3121}
##   installation_id event_count event_code           game_time
## 1        101d16f5          57       2010 2019-10-14 21:01:32
## 2        532002a8          13       3121 2019-10-14 19:20:21
## 3        1962067f          50       2010 2019-10-14 15:08:49
## 4        1ed372f8          31       3121 2019-10-14 14:50:49
## 5        7973d319          35       4020 2019-10-14 13:56:29
## 6        7fe9ca96          71       3121 2019-10-14 13:15:22
##                          title       type        world       Date Months
## 1   Bird Measurer (Assessment) Assessment  TREETOPCITY 2019-10-14     10
## 2   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 3 Cauldron Filler (Assessment) Assessment    MAGMAPEAK 2019-10-14     10
## 4   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 5   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 6   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
##    timing num_correct num_incorrect  accuracy accuracy_group
## 1 73 secs           1             2 0.3333333              1
## 2 17 secs           1             0 1.0000000              3
## 3 40 secs           1             0 1.0000000              3
## 4 28 secs           1             0 1.0000000              3
## 5 15 secs           1             1 0.5000000              2
## 6 85 secs           1             2 0.3333333              1

## subtest without accuracy

##   event_id     game_session           timestamp
## 1 3bfd1a65 5b010743b38ab0b6 2019-10-14 18:17:18
## 2 3bfd1a65 9f7493288742d987 2019-10-14 17:43:07
## 3 7ad3efc6 b98699251de79a27 2019-10-14 17:13:02
## 4 5b49460a 9c9f677f5b66d2c7 2019-10-14 13:06:39
## 5 3bfd1a65 b679dd2215f0dba8 2019-10-14 03:10:00
## 6 90d848e0 f3885a783437995b 2019-10-14 01:16:00
##                                                                    event_data
## 1 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 2 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 3 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 4 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 5 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
## 6 {""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}
##   installation_id event_count event_code           game_time
## 1        d1e82789           1       2000 2019-10-14 18:17:18
## 2        73a78f04           1       2000 2019-10-14 17:43:07
## 3        46486950           1       2000 2019-10-14 17:13:02
## 4        c64ad87e           1       2000 2019-10-14 13:06:39
## 5        dbb1a1d3           1       2000 2019-10-14 03:10:00
## 6        6a30ec55           1       2000 2019-10-14 01:16:00
##                          title       type        world       Date Months
## 1 Mushroom Sorter (Assessment) Assessment  TREETOPCITY 2019-10-14     10
## 2 Mushroom Sorter (Assessment) Assessment  TREETOPCITY 2019-10-14     10
## 3   Cart Balancer (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 4    Chest Sorter (Assessment) Assessment CRYSTALCAVES 2019-10-14     10
## 5 Mushroom Sorter (Assessment) Assessment  TREETOPCITY 2019-10-14     10
## 6 Cauldron Filler (Assessment) Assessment    MAGMAPEAK 2019-10-14     10
##   timing
## 1 0 secs
## 2 0 secs
## 3 0 secs
## 4 0 secs
## 5 0 secs
## 6 0 secs

6 Feature Engineering

6.1 useful Functions

#convert_timestamp <- function(df, game, code){
#  tmp <- df %>%
#  filter(str_detect(title , pattern = game)) %>%
#  filter(event_code %in% code) %>%
  #mutate(timestamp = gsub( "T", " ", timestamp)) %>%
  #mutate(timestamp = gsub( "\\..*Z$", "", timestamp)) %>%
#  mutate(Date = as_datetime(format(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"),"%Y-%m-%d %H:%M:%S"))) %>%
#  mutate(game_time = format( as.POSIXct(Date)+game_time/1000, "%Y-%m-%d %H:%M:%S")) %>%
#  arrange(Date) %>%
#  mutate(timing = ymd_hms(game_time) - ymd_hms(Date))
  
#  return(tmp)
#}



featuring <- function(df){
  tmp <- df
  #tmp <- convert_timestamp(df, game, code)
  
## How long time has the game session to get the correct answer?
  longtime <- tmp %>%
    group_by(installation_id, game_session) %>%
    summarise(longtime = as.numeric(sum(timing))) %>%
    ungroup()
    #right_join(tmp, by= c("installation_id", "game_session"))
  
## `Mean`, `median` and `sd` and timing
  timing_feat <- tmp  %>%
  group_by(installation_id) %>%
  summarise(mean_timing = as.numeric(mean(timing, na.rm = TRUE)),
            median_timing = as.numeric(median(timing, na.rm = TRUE)),
            sd_timing = as.numeric(sd(timing, na.rm = TRUE))) %>%
    ungroup()
  

# Event count and attempts featuring
## How many attempts per instllation_id to get the succes the game_session?
  attempts <- tmp %>%
  group_by(installation_id) %>%
  summarise(attempts =as.numeric(n())) %>%
    ungroup()
  #right_join(tmp , by = "installation_id")
  
## mean, median, sd of event_count
  event_count_feat <- tmp %>%
  group_by(installation_id) %>%
  summarise(mean_event_count = as.numeric(mean(event_count, na.rm = TRUE)),
            median_event_count = as.numeric(median(mean_event_count, na.rm = TRUE)),
            sd_event_count = as.numeric(sd(event_count,na.rm = TRUE))) %>%
    ungroup()
  
  # encode title

  
  tmp %>%
    left_join(longtime, by= c("installation_id", "game_session")) %>%
    left_join(timing_feat, by = "installation_id") %>%
    left_join(attempts, by = "installation_id") %>%
    left_join(event_count_feat, by = "installation_id") %>%
    # Datetime , months, week, day , weekend, features
    mutate(datetime = as.numeric(log1p(as.numeric(as.POSIXct(timestamp ,format="%Y-%m-%d %H:%M:%S"))))) %>%
    mutate(Date = as.Date(Date)) %>%
    mutate(Months = lubridate::month(ymd(Date))) %>%
    mutate(Week = lubridate::week(ymd(Date))) %>%
    mutate(wday = lubridate::wday(ymd(Date))) %>%
    mutate(weekend = ifelse(wday == 1 | wday == 6, 1, 0)) %>%
    mutate(title = as.numeric(as.factor(title))) %>%
    mutate(type = as.numeric(as.factor(type))) %>%
    mutate(world = as.numeric(as.factor(world))) %>%
    select( -Date, -game_session, -event_id,  -timing,-game_time, -timestamp, -event_data ) -> tmp
  
  
  
  return(tmp)
    
}

7 Train & Test Pools

##   installation_id event_count event_code title type world Months longtime
## 1        d1e82789           1       2000     5    1     3     10        0
## 2        73a78f04           1       2000     5    1     3     10        0
## 3        46486950           1       2000     2    1     1     10        0
## 4        c64ad87e           1       2000     4    1     1     10        0
## 5        dbb1a1d3           1       2000     5    1     3     10        0
## 6        6a30ec55           1       2000     3    1     2     10        0
##   mean_timing median_timing sd_timing attempts mean_event_count
## 1           0             0        NA        1                1
## 2           0             0        NA        1                1
## 3           0             0        NA        1                1
## 4           0             0        NA        1                1
## 5           0             0        NA        1                1
## 6           0             0        NA        1                1
##   median_event_count sd_event_count datetime Week wday weekend
## 1                  1             NA 21.17502   41    2       0
## 2                  1             NA 21.17502   41    2       0
## 3                  1             NA 21.17502   41    2       0
## 4                  1             NA 21.17501   41    2       0
## 5                  1             NA 21.17499   41    2       0
## 6                  1             NA 21.17499   41    2       0
##   title accuracy_group event_count event_code type world Months longtime
## 1     5              3           1       2000    1     3      8      821
## 2     5              3           2       2025    1     3      8      821
## 3     5              3           3       3010    1     3      8      821
## 4     5              3           4       3110    1     3      8      821
## 5     5              3           5       3010    1     3      8      821
## 6     5              3           6       4025    1     3      8      821
##   mean_timing median_timing sd_timing attempts mean_event_count
## 1    25.92623            18  23.86197      244         28.95492
## 2    25.92623            18  23.86197      244         28.95492
## 3    25.92623            18  23.86197      244         28.95492
## 4    25.92623            18  23.86197      244         28.95492
## 5    25.92623            18  23.86197      244         28.95492
## 6    25.92623            18  23.86197      244         28.95492
##   median_event_count sd_event_count datetime Week wday weekend
## 1           28.95492       21.14933 21.17119   32    3       0
## 2           28.95492       21.14933 21.17119   32    3       0
## 3           28.95492       21.14933 21.17119   32    3       0
## 4           28.95492       21.14933 21.17119   32    3       0
## 5           28.95492       21.14933 21.17119   32    3       0
## 6           28.95492       21.14933 21.17119   32    3       0

8 xgboost modeling

8.3 Confusion matrix

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1     2     3
##          0 56102  3257  2827  1593
##          1  2348 44270  2055  1856
##          2   440   380 14404   221
##          3  5023  1231  8156 72198
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8642          
##                  95% CI : (0.8627, 0.8656)
##     No Information Rate : 0.3507          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8088          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity            0.8778   0.9009  0.52489   0.9516
## Specificity            0.9496   0.9626  0.99449   0.8974
## Pos Pred Value         0.8796   0.8761  0.93260   0.8336
## Neg Pred Value         0.9488   0.9706  0.93511   0.9717
## Precision              0.8796   0.8761  0.93260   0.8336
## Recall                 0.8778   0.9009  0.52489   0.9516
## F1                     0.8787   0.8884  0.67172   0.8887
## Prevalence             0.2954   0.2271  0.12683   0.3507
## Detection Rate         0.2593   0.2046  0.06657   0.3337
## Detection Prevalence   0.2948   0.2335  0.07139   0.4003
## Balanced Accuracy      0.9137   0.9318  0.75969   0.9245

8.4 Variable Importance

##                Feature        Gain       Cover  Frequency
##  1:           longtime 0.311657669 0.179654669 0.12249884
##  2:   mean_event_count 0.076179673 0.104020149 0.08651550
##  3:              title 0.075033526 0.076192120 0.02471078
##  4:           datetime 0.066318722 0.052715334 0.12259139
##  5:     sd_event_count 0.065085917 0.083657376 0.08027765
##  6:        mean_timing 0.062558779 0.066672920 0.10613605
##  7:           attempts 0.060334951 0.085545368 0.08623785
##  8:          sd_timing 0.058752235 0.082715567 0.08977325
##  9:      median_timing 0.048336943 0.058460970 0.07524294
## 10:        event_count 0.040619109 0.041563834 0.03287367
## 11:              world 0.034495075 0.043189469 0.01436372
## 12: median_event_count 0.032103852 0.046870422 0.04233225
## 13:         event_code 0.032013751 0.058088050 0.02502545
## 14:               wday 0.014309550 0.008121158 0.03577973
## 15:               Week 0.011420599 0.006978564 0.02709857
## 16:             Months 0.006712531 0.002345595 0.01797316
## 17:            weekend 0.004067118 0.003208435 0.01056918

9 xgboost prediction

## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
##   installation_id accuracy_group
## 1        00abaee7              3
## 2        01242218              1
## 3        017c5718              2
## 4        01a44906              2
## 5        01bc6cb6              1
## 6        02256298              3