1 Introduction to the dataset

The dataset for this competition is a relational set of files describing customers’ orders over time. The goal of the competition is to predict which products will be in a user’s next order. The dataset is anonymized and contains a sample of over 3 million grocery orders from more than 200,000 Instacart users. For each user, we provide between 4 and 100 of their orders, with the sequence of products purchased in each order. We also provide the week and hour of day the order was placed, and a relative measure of time between orders. For more information, see the blog post accompanying its public release.

1.1 Load Packages

1.2 Function for multiple plot

# Define multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

1.3 Load files

aisles <- data.table::fread("aisles.csv")
departments <- data.table::fread("departments.csv")
order_prior <- data.table::fread("order_products__prior.csv")
order_train <- data.table::fread("order_products__train.csv")
orders <- data.table::fread("orders.csv")
products <- data.table::fread("products.csv")
sample_sub <- data.table::fread("sample_submission.csv")

1.4 Files glimpse

The list of aisles aisles

The list of departments Departments

Example of sample submission Sample submission

The list of orders: order_dow’ is the day of week. user 1 has 11 orders, 1 of which is in the train set, and 10 of which are prior orders. Orders

The list of Products Products

order_Prior: Contains previous order contents for all customers ‘reordered’ indicates that the customer has a previous order that contains the product Orders prior

Order_Train: in each order, products were added de the card by priority. some products were reordered

Orders Train

Orders Train

1.5 Reformating Datasets

aisles <- aisles %>%
          mutate(aisle = as.factor(aisle))

departments <- departments %>%
               mutate(department = as.factor(department))


order_prior <- order_prior %>%
               mutate(reordered = as.logical(reordered)) #%>%
               #mutate(product_id = as.factor(product_id))

order_train <- order_train %>%
               mutate(reordered = as.logical(reordered))

orders <- orders %>%
          mutate(eval_set = as.factor(eval_set)) %>%
          mutate(w_day = wday(order_dow , label = TRUE)) %>% # +1: weekdays have the interval [0:6] an dnot [1:7]
          mutate(user_id = as.factor(user_id))

products <- products %>%
            mutate(product_name = as.factor(product_name))

2 Market Basket analysis

In this first section, we would try explore the of details of the orders, the content of baskets, the best sold items.

2.1 View the distribution of orders / transactions (hours and week days)

## time of ordering
p1 <- orders %>%
  ggplot(aes(x = order_hour_of_day)) +
    geom_histogram( stat="count", color= "blue") # ,bins = 24

## days of ordering
p2 <- orders %>%
  ggplot(aes(x = w_day)) +
  geom_histogram( stat= "count", color = "green")

## interval of days before Reordering
p3 <- orders %>%
  ggplot(aes(x = days_since_prior_order)) +
  geom_histogram(bins = 30, color = "yellow")


 # plot P1, P2, P3, p4 in the same plot
 layout <- matrix(c(1,2,3,3), 2, 2, byrow = TRUE)
 multiplot(p1, p2, p3, layout=layout)

we find: The main orders were done during the working hours of the day (8:17h). We assume that NA corresponds to saturday. A clear effect of the orders number is shown durinr the weekend. We observe a interval of reordering of 30 days. During this cycle we observe a picks at the dat 7, 15 and 30. We can observe more ordering during weekends during a month. But we can aloso observe diffrence of the number of orderinf between weekends.

2.2 plot the number of prior , train, and test orders

## count the number of prior orders
p3 <- orders %>%
  filter(eval_set == 'prior') %>%
  ggplot(aes(order_number)) +
  geom_histogram(stat = "count", color = "red")

p2 <- orders %>%
  filter(eval_set == 'train') %>%
  ggplot(aes(order_number)) +
  geom_histogram(stat = "count", color = "green")

p1 <- orders %>%
  filter(eval_set == 'test') %>%
  ggplot(aes(order_number)) +
  geom_histogram(stat = "count", color = "blue")

 # plot P1, P2, P3 in the same plot
 layout <- matrix(c(1,2,3,3), 2, 2, byrow = TRUE)
 multiplot(p1, p2, p3, layout=layout)

We have more prior (200000) orders, than traiun (15000) than test (7500). We observe a pick at 100 orders number for test and train samples.

2.3 plot the number of items per order

prior <- order_prior %>%
  group_by(order_id) %>%
  dplyr::summarise(n_orders = n()) %>%
  ggplot(aes(x= n_orders)) +
  geom_histogram(bins = 50, color = "yellow")+
  xlim(0,50) +
  labs(title = "Prior orders") +
  xlab("number of items per order") +
  ylab("n° orders")
  
train <- order_train %>%
  group_by(order_id) %>%
  dplyr::summarise(n_orders = n()) %>%
  ggplot(aes(x= n_orders)) +
  geom_histogram(bins = 50, color = "orange") +
  xlim(0,50) +
  labs(title = "Train orders") +
  xlab("number of items per order") +
  ylab("n° orders")

 # plot P1, P2, P3 in the same plot
 layout <- matrix(c(1, 2), 1, 2, byrow = TRUE)
 multiplot(prior, train, layout=layout)

We find: The most frequent nbr of order is about 5-6 items for the Prior and Train datasets. Ordering 25 items seems to be an exception.

2.4 Top costomers for Top products

# top costumers that purchased the largest number of items 
#the dataset is limited to 100 items maximum per costumer
top_costumers_items <- orders %>%
  group_by(user_id) %>%
  dplyr::summarise(n_orders = length(order_id)) %>%
  filter(n_orders < 100) %>%
  top_n(50, wt = n_orders) %>%
  #arrange(freq) %>%
  ggplot(aes(x = reorder(user_id, - n_orders), y = n_orders)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_point() +
  labs(title = "Top costumers which buy more items") +
  xlab("user_id") +
  ylab("nbr items")


top_costumers_visits <- orders %>%
  #filter(user_id == 123) %>%
  group_by(user_id) %>%
  dplyr::summarise(n_visits = last(order_number)) %>%
  filter(n_visits < 100) %>%
  top_n(20, n_visits) %>%
  ggplot(aes(x = user_id, y = n_visits)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col() +
  labs(title = "Number of visits per costumers") +
  xlab("User_id") +
  ylab("Order Number")
  

 top20_item_prior <- order_prior %>%
   group_by(product_id) %>%
   dplyr::summarise(n = n()) %>%
  top_n(20, wt = n) %>%
  left_join(products, by = 'product_id') %>%
  ggplot(aes(x = reorder(product_name, - n) , y = n )) +
  #coord_flip() +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col( color = "green") +
  labs(title = "Top 20 products for Prior orders") +
  xlab("product_id") +
  ylab("n° orders")


  top20_item_train <- order_train %>%
  group_by(product_id) %>%
   dplyr::summarise(n = n()) %>%
  top_n(20, wt = n) %>%
  left_join(products, by = 'product_id') %>%
  ggplot(aes(x = reorder(product_name, - n) , y = n )) +
  #coord_flip() +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col( color = "red") +
  labs(title = "Top 20 products for train orders") +
  xlab("product_id") +
  ylab("nbr orders")
  
  
  # plot P1, P2, P3, p4 in the same plot
 layout <- matrix(c(1,2,3,4), 2, 2, byrow = TRUE)
 multiplot(top_costumers_visits, top_costumers_visits,  top20_item_prior,top20_item_train, layout=layout)

2.5 Top reordered items

reordered_train <- order_train %>% 
  filter(reordered == TRUE) %>%
  group_by(product_id) %>% 
  dplyr::summarise(freq = n()) %>% 
  top_n(10, wt = freq) %>%
  left_join(products, by = 'product_id') %>%
    ggplot(aes(x = reorder(product_name, - freq) , y = freq )) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col( color = "red") +
  labs(title = "Top reordered items for train dataset") +
  xlab("product name") +
  ylab("Reordered frequency")
  
  
reordered_prior <- order_prior %>% 
  filter(reordered == TRUE) %>%
  group_by(product_id) %>% 
  dplyr::summarise(freq = n()) %>% 
  top_n(10, wt = freq) %>%
  left_join(products, by = 'product_id') %>%
    ggplot(aes(x = reorder(product_name, - freq) , y = freq )) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col( color = "blue") +
  labs(title = "Top reordered items for prior dataset") +
  xlab("product name") +
  ylab("Reordered frequency")


  # plot P1, P2, P3, p4 in the same plot
 layout <- matrix(c(1,2), 1, 2, byrow = TRUE)
 multiplot(reordered_train, reordered_prior, layout=layout)

We find: The main difference is in Organic Whole Milk which is not in the top 10 products in train orders.

2.6 Which most items added the first to the card

first_item_to_cart_prior <-
order_prior %>%
  filter(add_to_cart_order == 1) %>%
  #filter(product_id == "345") %>%
  group_by(product_id, reordered) %>%
  dplyr::summarise(n_first = n()) %>%
  arrange(desc(n_first)) %>%
  head(10) %>%
 # dplyr::top_n(10, wt = n_first) %>% doesn't work
  left_join(products, by = 'product_id') %>%
  ggplot(aes(x = product_name, y =  n_first)) +
   theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col(aes(fill = reordered)) +
  labs(title = "Top first added items to the cart (Prior)") +
  xlab("product name") +
  ylab("frequency of added the first to the cart")


first_item_to_cart_train <-
order_train %>%
  filter(add_to_cart_order == 1) %>%
  group_by(product_id, reordered) %>%
  #summarize(proportion_reordered = mean(reordered), n=n())
  dplyr::summarise(n_first = n()) %>%
  arrange(desc(n_first)) %>%
  head(10) %>%
 # dplyr::top_n(10, wt = n_first) %>% doesn't work
  left_join(products, by = 'product_id') %>%
  ggplot(aes(x = product_name, y =  n_first)) +
   theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col(aes(fill = reordered)) +
  labs(title = "Top first added items to the cart (Train)") +
  xlab("product name") +
  ylab("frequency of added the first to the cart")



  # plot P1, P2, P3, p4 in the same plot
 layout <- matrix(c(1,2), 1, 2, byrow = TRUE)
 multiplot(first_item_to_cart_prior, first_item_to_cart_train, layout=layout)

we find: In general only the reordered items are added at the bigening of shopping. Only few cases with banana are not ordered for the first time and added the first to the cart.

2.7 reordering rate versus added rank to the cart

order_train %>% 
  group_by(product_id, add_to_cart_order) %>% 
  summarize(n_items_basket = n(), reordered_rate_basket = mean(reordered)) %>% 
  group_by(add_to_cart_order) %>%
  summarise(reordered_rate_all = mean(reordered_rate_basket))%>%
  ggplot(aes(x= add_to_cart_order, y = reordered_rate_all)) +
  geom_line()

2.8 Which most pourcentage of items added the first to the card

first_pct_item_to_cart_train <- order_train %>% 
  group_by(product_id, add_to_cart_order) %>% 
  summarize(count = n()) %>% 
  mutate(pct=count/sum(count)) %>% 
  filter(add_to_cart_order == 1, count>10) %>% 
  arrange(desc(pct)) %>% 
  left_join(products,by='product_id') %>%
  ungroup() %>% 
  select(product_name, pct, count) %>% 
  top_n(10, wt=pct) %>%
  ggplot(aes(x = reorder(product_name,-pct), y = pct)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col() +
   labs(title = "Top first % added items to the cart (Train)") +
  xlab("Product name") +
  ylab("Pourcentage (%) of item added the first to the cart")



first_pct_item_to_cart_prior <- order_prior %>% 
  group_by(product_id, add_to_cart_order) %>% 
  summarize(count = n()) %>% 
  mutate(pct=count/sum(count)) %>% 
  filter(add_to_cart_order == 1, count>10) %>% 
  arrange(desc(pct)) %>% 
  left_join(products,by='product_id') %>%
  ungroup() %>% 
  select(product_name, pct, count) %>% 
  top_n(10, wt=pct) %>%
  ggplot(aes(x = reorder(product_name,-pct), y = pct)) +
   geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)+
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col() +
   labs(title = "Top first % added items to the cart (Prior)") +
  xlab("Product name") +
  ylab("Pourcentage (%) of item added the first to the cart")

  # plot P1, P2, P3, p4 in the same plot
 layout <- matrix(c(1,2), 1, 2, byrow = TRUE)
 multiplot(first_pct_item_to_cart_prior, first_pct_item_to_cart_train, layout=layout)

2.9 Top90 sellers of Banana and Strawberries

banana_id <- products %>%
             filter(product_name == "Banana") %>%
             #select(product_id) %>%
            .$product_id
strawberries_id <- products %>%
             filter(product_name == "Strawberries") %>%
             #select(product_id) %>%
            .$product_id

spring_water_id <- products %>%
             filter(product_name == "Spring Water") %>%
             #select(product_id) %>%
            .$product_id

asparagus_id <- products %>%
             filter(product_name == "Asparagus") %>%
             #select(product_id) %>%
            .$product_id

# filter the orders with banana
order_train %>%
  filter(product_id %in% c(banana_id, strawberries_id)) %>%
  left_join(orders, by = "order_id") %>%
  group_by( order_id, user_id) %>%
   dplyr::summarise(n_orders = last(order_number)) %>%
  filter(n_orders == 90) 
FALSE # A tibble: 6 x 3
FALSE # Groups:   order_id [6]
FALSE   order_id user_id n_orders
FALSE      <int> <fct>      <int>
FALSE 1   271953 145628        90
FALSE 2  1578927 24195         90
FALSE 3  1589791 188446        90
FALSE 4  1854209 166449        90
FALSE 5  2803296 179192        90
FALSE 6  2975947 195993        90

2.10 Explore Days interval of reordering items

interval_item_reordered_train <- order_train %>%
  left_join(orders, by = "order_id") %>%
  group_by(days_since_prior_order) %>%
  summarize(mean_reorder = mean(reordered), n = n()) %>%
  ggplot(aes(days_since_prior_order, y = mean_reorder)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col( color = "red") +
  labs(title = "Mean Interval (days) of reordered items (Train)") +
  xlab("Days") +
  ylab("Mean reordered (%)")


interval_item_reordered_prior <- order_prior %>%
  left_join(orders, by = "order_id") %>%
  group_by(days_since_prior_order) %>%
  summarize(mean_reorder = mean(reordered), n = n()) %>%
  ggplot(aes(days_since_prior_order, y = mean_reorder)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_col( color = "blue") +
  labs(title = "Mean Interval (days) of reordered items (Prior)") +
  xlab("Days") +
  ylab("Mean reordered (%)")


  # plot P1, P2, P3, p4 in the same plot
 layout <- matrix(c(1,2), 1, 2, byrow = TRUE)
 multiplot(interval_item_reordered_train, interval_item_reordered_prior, layout=layout)

With 0 days items are reordered at 0.85%. After 30 days the same items are ordered at 0.45%

2.11 Explore number of orders and reordering items

grp_pdt_train <- order_train %>%
  #left_join(orders, by = "order_id") %>%
  group_by(product_id) %>%
  summarize(mean_reorder = mean(reordered), n = n()) %>%
  ggplot(aes(n, y = mean_reorder)) +
  #theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_point(size = 0.1, alpha = 0.3) +
  geom_smooth(color="red")+
  labs(title = "Number of reordered items per product_id (Train)") +
  xlab("Number of orders") +
  ylab("Mean reordered (%) per product_id") +
  coord_cartesian(xlim=c(0,5000))

grp_order_train <- order_train %>%
  #left_join(orders, by = "order_id") %>%
  group_by(order_id) %>%
  summarize(mean_reorder = mean(reordered), n = n()) %>%
  ggplot(aes(n, y = mean_reorder)) +
  #theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_point(color= "red", size = 0.1, alpha = 0.3) +
  labs(title = "Number of reordered items per order_id (Train)") +
  xlab("Number of orders") +
  ylab("Mean reordered (%) per order_id") 

grp_pdt_prior <- order_prior %>%
  #left_join(orders, by = "order_id") %>%
  group_by(product_id) %>%
  summarize(mean_reorder = mean(reordered), n = n()) %>%
  ggplot(aes(n, y = mean_reorder)) +
  #theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_point( size = 0.1, alpha = 0.3) +
  geom_smooth(color="blue")+
  labs(title = "Number of reordered items per product_id (Prior)") +
  xlab("Number of orders") +
  ylab("Mean reordered (%) per product_id") +
  coord_cartesian(xlim=c(0,10000))

grp_order_prior <- order_prior %>%
  #left_join(orders, by = "order_id") %>%
  group_by(order_id) %>%
  summarize(mean_reorder = mean(reordered), n = n()) %>%
  ggplot(aes(n, y = mean_reorder)) +
  #theme(axis.text.x = element_text(angle=45, hjust=1)) +
  geom_point(color="blue", size = 0.1, alpha = 0.3) +
  labs(title = "Number of reordered items per order_id (Prior)") +
  xlab("Number of orders") +
  ylab("Mean reordered (%) per order_id") 

  # plot P1, P2, P3, p4 in the same plot
 layout <- matrix(c(1,2, 3, 4), 2, 2, byrow = TRUE)
 multiplot(grp_pdt_train, grp_order_train,grp_pdt_prior, grp_order_prior , layout=layout)
FALSE `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
FALSE `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

2.12 Visualize Departments and Aisles

library(treemap)

items_per_aisle <- products %>% 
  group_by(department_id, aisle_id) %>% 
  summarize(n_items = n()) %>%
  left_join(departments,by="department_id") %>%
  left_join(aisles,by="aisle_id")

tree_aisle <- order_train %>%
  group_by(product_id) %>%
  dplyr::summarise(count_ordered_item = n()) %>%
  left_join(products, by = "product_id") %>%
  ungroup() %>%
  group_by(department_id, aisle_id) %>%
  summarize(sumcount = sum(count_ordered_item)) %>%
  left_join(items_per_aisle, by = c("department_id", "aisle_id")) %>% 
  mutate(onesize = 1)

treemap(tree_aisle,index = c("department","aisle"),
             vSize = "onesize", 
             vColor = "department",
             palette = "Set3",
             title = "super market Map", 
             sortID = "-sumcount",
             border.col = "#FFFFFF",
             type = "categorical",
             fontsize.legend = 0,
             bg.labels = "#FFFFFF")

treemap(tree_aisle,index = c("department","aisle"),
             vSize = "sumcount", 
             vColor = "department",
             palette = "Set3",
             title = "super market Map", 
           #  sortID = "-sumcount",
             border.col = "#FFFFFF",
             type = "categorical",
             fontsize.legend = 0,
             bg.labels = "#FFFFFF")

2.13 Look for proportion of costumers that reorders the same products

order_number_upper2 <- order_prior %>%
  group_by(order_id) %>%
  dplyr::summarise(mean_redordered_item_per_basket = mean(reordered), n_items_per_basket = n()) %>%
  left_join(orders, by = "order_id") %>%
  filter(order_number > 2)

order_number_upper2 %>%
  #filter(eval_set =="prior") %>% 
  group_by(user_id) %>%
  dplyr::summarise(sum_orders = sum(mean_redordered_item_per_basket == 1, na.rm = TRUE), ratio_reordered_items = sum_orders/n()) %>%
  filter(ratio_reordered_items == 1) %>%
  arrange(desc(sum_orders)) %>%
  head(10)
FALSE # A tibble: 10 x 3
FALSE    user_id sum_orders ratio_reordered_items
FALSE    <fct>        <int>                 <dbl>
FALSE  1 99753           97                     1
FALSE  2 55331           49                     1
FALSE  3 106510          49                     1
FALSE  4 111365          47                     1
FALSE  5 74656           46                     1
FALSE  6 170174          45                     1
FALSE  7 12025           43                     1
FALSE  8 164779          37                     1
FALSE  9 37075           34                     1
FALSE 10 110225          33                     1

Here i look for customers who just reorder the same products again all the time. To search those I look at all orders (excluding the first order), where the percentage of reordered items is exactly 1 (This can easily be adapted to look at more lenient thresholds). We can see there are in fact 3,487 customers, just always reordering products. user_id 99753 reorded the same items (same basket countain) for 97 visits/orders.

2.14 the basket for the most fidel user_id

order_number_upper2 %>%
filter(user_id == 99753) %>%
  left_join(order_prior, by = "order_id") %>%
  left_join(products, by = "product_id") %>%
  select(product_name, user_id, order_id, w_day, days_since_prior_order) %>%
  arrange(order_id) %>%
  head(10)
## # A tibble: 10 x 5
##    product_name             user_id order_id w_day days_since_prior_order
##    <fct>                    <fct>      <int> <ord>                  <dbl>
##  1 Organic Whole Milk       99753      46614 Tue                        2
##  2 Organic Reduced Fat Milk 99753      46614 Tue                        2
##  3 Organic Whole Milk       99753      67223 Wed                        2
##  4 Organic Reduced Fat Milk 99753      67223 Wed                        2
##  5 Organic Whole Milk       99753     214506 Sun                        5
##  6 Organic Reduced Fat Milk 99753     214506 Sun                        5
##  7 Organic Whole Milk       99753     240832 Tue                        2
##  8 Organic Reduced Fat Milk 99753     240832 Tue                        2
##  9 Organic Whole Milk       99753     260804 Sun                        4
## 10 Organic Reduced Fat Milk 99753     260804 Sun                        4

The user_id buy always the same two items Organic milk, maybe for a baby.

orders %>%
   left_join(sample_sub, "order_id") %>%
  head(20)
##    order_id user_id eval_set order_number order_dow order_hour_of_day
## 1   2539329       1    prior            1         2                 8
## 2   2398795       1    prior            2         3                 7
## 3    473747       1    prior            3         3                12
## 4   2254736       1    prior            4         4                 7
## 5    431534       1    prior            5         4                15
## 6   3367565       1    prior            6         2                 7
## 7    550135       1    prior            7         1                 9
## 8   3108588       1    prior            8         1                14
## 9   2295261       1    prior            9         1                16
## 10  2550362       1    prior           10         4                 8
## 11  1187899       1    train           11         4                 8
## 12  2168274       2    prior            1         2                11
## 13  1501582       2    prior            2         5                10
## 14  1901567       2    prior            3         1                10
## 15   738281       2    prior            4         2                10
## 16  1673511       2    prior            5         3                11
## 17  1199898       2    prior            6         2                 9
## 18  3194192       2    prior            7         2                12
## 19   788338       2    prior            8         1                15
## 20  1718559       2    prior            9         2                 9
##    days_since_prior_order w_day products
## 1                      NA   Mon     <NA>
## 2                      15   Tue     <NA>
## 3                      21   Tue     <NA>
## 4                      29   Wed     <NA>
## 5                      28   Wed     <NA>
## 6                      19   Mon     <NA>
## 7                      20   Sun     <NA>
## 8                      14   Sun     <NA>
## 9                       0   Sun     <NA>
## 10                     30   Wed     <NA>
## 11                     14   Wed     <NA>
## 12                     NA   Mon     <NA>
## 13                     10   Thu     <NA>
## 14                      3   Sun     <NA>
## 15                      8   Mon     <NA>
## 16                      8   Tue     <NA>
## 17                     13   Mon     <NA>
## 18                     14   Mon     <NA>
## 19                     27   Sun     <NA>
## 20                      8   Mon     <NA>
orders %>%
   inner_join(sample_sub, "order_id") %>%
   head(10)
##    order_id user_id eval_set order_number order_dow order_hour_of_day
## 1   2774568       3     test           13         5                15
## 2    329954       4     test            6         3                12
## 3   1528013       6     test            4         3                16
## 4   1376945      11     test            8         6                11
## 5   1356845      12     test            6         1                20
## 6   2161313      15     test           23         1                 9
## 7   1416320      16     test            7         0                13
## 8   1735923      19     test           10         6                17
## 9   1980631      20     test            5         1                11
## 10   139655      22     test           16         5                 6
##    days_since_prior_order w_day    products
## 1                      11   Thu 39276 29259
## 2                      30   Tue 39276 29259
## 3                      22   Tue 39276 29259
## 4                       8   Fri 39276 29259
## 5                      30   Sun 39276 29259
## 6                       7   Sun 39276 29259
## 7                       7  <NA> 39276 29259
## 8                       8   Fri 39276 29259
## 9                      30   Sun 39276 29259
## 10                      1   Thu 39276 29259

3 Deploy arules algorithme to the orders

3.1 Regroup items per basket

In each orders, costumers bougth multiple items forming a basket. We will regroup the orders per basket.

We need to join orders by order_id and then by products_id. This code take a while.

#ransactions <- orders %>%
#  left_join(order_train, by = "order_id") %>%
#  left_join(products, by = "product_id")

# baskets <- transactions %>%
#            plyr::ddply(c("order_id", "user_id"),
#               function(df1) paste(df1$product_name,
#                                   collapse = ","))
# 
# colnames(baskets) <- c("Order_id","user_id","Baskets")
# baskets <- readRDS("baskets_prior.RDS")
# tibble::glimpse(baskets)
# saveRDS(object = baskets, file = "baskets_train.RDS")
# # Write the Baskets list to a file
# write.csv(baskets$V1,"transactions_list.csv", quote = FALSE, row.names = TRUE)
# 
 library(arules)
FALSE Loading required package: Matrix
FALSE 
FALSE Attaching package: 'arules'
FALSE The following object is masked from 'package:dplyr':
FALSE 
FALSE     recode
FALSE The following objects are masked from 'package:base':
FALSE 
FALSE     abbreviate, write
# # load the Baskets list as a basket format using arules package
tr_list <- read.transactions('transactions_list.csv', format = 'basket', sep=',')
summary(tr_list)
## transactions as itemMatrix in sparse format with
##  3421084 rows (elements/itemsets/transactions) and
##  3702915 columns (items) and a density of 2.788542e-06 
## 
## most frequent items:
##                 Banana Bag of Organic Bananas   Organic Strawberries 
##                 460485                 367637                 252559 
##   Organic Baby Spinach   Organic Hass Avocado                (Other) 
##                 232235                 205061               33807223 
## 
## element (itemset/transaction) length distribution:
## sizes
##      1      2      3      4      5      6      7      8      9     10 
## 206210 173996 198879 213972 224853 227409 225230 216030 200441 182058 
##     11     12     13     14     15     16     17     18     19     20 
## 163733 145878 129947 115507 101932  89900  79491  69553  61292  53686 
##     21     22     23     24     25     26     27     28     29     30 
##  46683  40789  35385  30654  26430  22919  19838  17001  14613  12638 
##     31     32     33     34     35     36     37     38     39     40 
##  10863   9192   7861   6780   5783   5026   4267   3586   3090   2604 
##     41     42     43     44     45     46     47     48     49     50 
##   2184   1828   1556   1371   1141   1014    898    729    645    513 
##     51     52     53     54     55     56     57     58     59     60 
##    459    376    336    242    226    205    176    166    116    109 
##     61     62     63     64     65     66     67     68     69     70 
##     84     83     78     69     54     51     44     37     30     33 
##     71     72     73     74     75     76     77     78     79     80 
##     27     14     13     26     16     17     11     11     11      8 
##     81     82     83     84     85     86     87     88     89     90 
##      6      9      2      3      4      5      2      2      4      5 
##     91     92     93     94     95     96     97     98     99    100 
##      1      2      1      1      2      3      1      1      4      2 
##    101    102    103    106    111    112    113    114    115    120 
##      1      1      4      2      1      2      1      1      1      2 
##    140    151 
##      1      1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00    9.00   10.33   14.00  151.00 
## 
## includes extended item information - examples:
##                                          labels
## 1                          .5\\ Waterproof Tape
## 2 (70% Juice!) Mountain Raspberry Juice Squeeze
## 3                \\Constant Comment\\ Black Tea
There are 3421084 transactions and 3702915 purchased items (One item can be purchased multiple times). Each transaction is a collection of items.

The Density is 2.788542e-06 tells the percentage of non-zero cells in a sparse matrix. It is the total number of items that are purchased and divided by a possible number of items in the matrix.

The multiplication of the three values is the total of products were purchased.
round( 3421084 * 3702915 * 2.788542e-06)
## [1] 35325203
  • The most frequent items are Banana (46048), Bag of Organic Bananas (367637), Organic Strawberries (252559), Organic Baby Spinach (232235).
summary(tr_list)@lengths
## sizes
##      1      2      3      4      5      6      7      8      9     10 
## 206210 173996 198879 213972 224853 227409 225230 216030 200441 182058 
##     11     12     13     14     15     16     17     18     19     20 
## 163733 145878 129947 115507 101932  89900  79491  69553  61292  53686 
##     21     22     23     24     25     26     27     28     29     30 
##  46683  40789  35385  30654  26430  22919  19838  17001  14613  12638 
##     31     32     33     34     35     36     37     38     39     40 
##  10863   9192   7861   6780   5783   5026   4267   3586   3090   2604 
##     41     42     43     44     45     46     47     48     49     50 
##   2184   1828   1556   1371   1141   1014    898    729    645    513 
##     51     52     53     54     55     56     57     58     59     60 
##    459    376    336    242    226    205    176    166    116    109 
##     61     62     63     64     65     66     67     68     69     70 
##     84     83     78     69     54     51     44     37     30     33 
##     71     72     73     74     75     76     77     78     79     80 
##     27     14     13     26     16     17     11     11     11      8 
##     81     82     83     84     85     86     87     88     89     90 
##      6      9      2      3      4      5      2      2      4      5 
##     91     92     93     94     95     96     97     98     99    100 
##      1      2      1      1      2      3      1      1      4      2 
##    101    102    103    106    111    112    113    114    115    120 
##      1      1      4      2      1      2      1      1      1      2 
##    140    151 
##      1      1
  • The element length distribution is formulated as number of products / number of transactions.
  • For example, there are 182058 transactions with 10 items, and 2 transactions with 100 items.

4 Plot the most 10 frequent purchased items

library(RColorBrewer)
#par(mfrow=c(2,1))
arules::itemFrequencyPlot(tr_list,topN=10,type="absolute",col=brewer.pal(8,'Pastel2'), main="Absolute Product Frequency Plot")

arules::itemFrequencyPlot(tr_list,topN=10,type="relative",col=brewer.pal(8,'Pastel2'), main="Relative Product Frequency Plot")

5 Applying of `APRIORI algorithme

The next step is to mine the rules using the APRIORI algorithm. The function apriori() is from arules package.

We can set and optimize the parameter of the minimum support of 0.0001, the `minimum confidence of 0.8, maximum of 10 items (maxlen).

#Min Support as 0.0001, confidence as 0.8 and maximum of 10 products.
association.rules <- arules::apriori(tr_list, parameter = list(supp=0.001, conf=0.8))  #, maxlen=10
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 3421 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[3702915 item(s), 3421084 transaction(s)] done [27.23s].
## sorting and recoding items ... [1635 item(s)] done [0.95s].
## creating transaction tree ... done [4.82s].
## checking subsets of size 1 2 3 4 done [1.26s].
## writing ... [209 rule(s)] done [0.01s].
## creating S4 object  ... done [2.06s].

Definition * Itemset: Collection of one or more items. K-item-set means a set of k items. * Support Count: Frequency of occurrence of an item-set * Support(s): Fraction of transactions that contain the item-set

6 Inspect the top 10 rules

inspect(association.rules[1:10])
##      lhs                                                 rhs                               support confidence     lift count
## [1]  {Paper Towels Choose-A-Sheet}                    => {One-Ply}                     0.001009914  1.0000000 990.1835  3455
## [2]  {One-Ply}                                        => {Paper Towels Choose-A-Sheet} 0.001009914  1.0000000 990.1835  3455
## [3]  {Paper Towels Choose-A-Sheet}                    => {Mega Rolls}                  0.001009914  1.0000000 971.3470  3455
## [4]  {Mega Rolls}                                     => {Paper Towels Choose-A-Sheet} 0.001009914  0.9809767 971.3470  3455
## [5]  {One-Ply}                                        => {Mega Rolls}                  0.001009914  1.0000000 971.3470  3455
## [6]  {Mega Rolls}                                     => {One-Ply}                     0.001009914  0.9809767 971.3470  3455
## [7]  {Medium Pulp}                                    => {Country Stand Juice}         0.001077729  1.0000000 927.8774  3687
## [8]  {Country Stand Juice}                            => {Medium Pulp}                 0.001077729  1.0000000 927.8774  3687
## [9]  {2 Huge Rolls = 5 Regular Rolls  Towels/Napkins} => {Select-A-Size Paper Towels}  0.001146420  1.0000000 644.2719  3922
## [10] {2 Huge Rolls = 5 Regular Rolls  Towels/Napkins} => {White}                       0.001146420  1.0000000 504.6591  3922

Interpretation * 100% of the customers who bought Paper Towels Choose-A-Sheet also bought One-Ply. * 100% of the customers who bought Medium Pulp also bought Country Stand Juice.

7 Extract rules for Olives

Olives.association.rules <- apriori(tr_list, parameter = list(supp = 0.001, conf = 0.8), appearance = list(default="lhs", rhs = "Olives"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 3421 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[3702915 item(s), 3421084 transaction(s)] done [22.90s].
## sorting and recoding items ... [1635 item(s)] done [1.03s].
## creating transaction tree ... done [5.23s].
## checking subsets of size 1 2 3 4 done [1.35s].
## writing ... [6 rule(s)] done [0.01s].
## creating S4 object  ... done [2.32s].

8 Inspect the top 10 rules for Olives

# lhs= canned beer because you want to find out the probability of that in how many customers buy canned beer along with other items
inspect(Olives.association.rules)
##     lhs                          rhs      support     confidence lift    
## [1] {Kalamata}                => {Olives} 0.001097898 1.0000000  694.4953
## [2] {Pitted}                  => {Olives} 0.001094682 0.9067797  629.7542
## [3] {Kalamata,Pitted}         => {Olives} 0.001094390 1.0000000  694.4953
## [4] {Kalamata,Organic}        => {Olives} 0.001094390 1.0000000  694.4953
## [5] {Organic,Pitted}          => {Olives} 0.001094390 0.9306488  646.3312
## [6] {Kalamata,Organic,Pitted} => {Olives} 0.001094390 1.0000000  694.4953
##     count
## [1] 3756 
## [2] 3745 
## [3] 3744 
## [4] 3744 
## [5] 3744 
## [6] 3744
#inspect(association.rules[1:30])

Olives (RHS) was Bought as Kalamata or Pitted.

9 Visualizing Association Rules

A straight-forward visualization of association rules is to use a scatter plot using plot() of the arulesViz package. It uses Support and Confidence on the axes.

library(arulesViz)
# Filter rules with confidence greater than 0.9 or 90%
subRules <- association.rules[quality(association.rules)$confidence > 0.8]
#Plot SubRules
plot(subRules, jitter = 0.3)

10 Two-key Plot

plot(subRules,method="two-key plot", jitter = 0.3)

11 Interactive visualisation of 30 Rules

#top10subRules <- head(subRules, n = 10, by = "confidence")
plot(subRules[1:30], method = "graph",  engine = "htmlwidget", control = list(type = "items"))
## Warning: Unknown control parameters: type
## Available control parameters (with default values):
## itemCol   =  #CBD2FC
## nodeCol   =  c("#EE0000", "#EE0303", "#EE0606", "#EE0909", "#EE0C0C", "#EE0F0F", "#EE1212", "#EE1515", "#EE1818", "#EE1B1B", "#EE1E1E", "#EE2222", "#EE2525", "#EE2828", "#EE2B2B", "#EE2E2E", "#EE3131", "#EE3434", "#EE3737", "#EE3A3A", "#EE3D3D", "#EE4040", "#EE4444", "#EE4747", "#EE4A4A", "#EE4D4D", "#EE5050", "#EE5353", "#EE5656", "#EE5959", "#EE5C5C", "#EE5F5F", "#EE6262", "#EE6666", "#EE6969", "#EE6C6C", "#EE6F6F", "#EE7272", "#EE7575", "#EE7878", "#EE7B7B", "#EE7E7E", "#EE8181", "#EE8484", "#EE8888", "#EE8B8B",  "#EE8E8E", "#EE9191", "#EE9494", "#EE9797", "#EE9999", "#EE9B9B", "#EE9D9D", "#EE9F9F", "#EEA0A0", "#EEA2A2", "#EEA4A4", "#EEA5A5", "#EEA7A7", "#EEA9A9", "#EEABAB", "#EEACAC", "#EEAEAE", "#EEB0B0", "#EEB1B1", "#EEB3B3", "#EEB5B5", "#EEB7B7", "#EEB8B8", "#EEBABA", "#EEBCBC", "#EEBDBD", "#EEBFBF", "#EEC1C1", "#EEC3C3", "#EEC4C4", "#EEC6C6", "#EEC8C8", "#EEC9C9", "#EECBCB", "#EECDCD", "#EECFCF", "#EED0D0", "#EED2D2", "#EED4D4", "#EED5D5", "#EED7D7", "#EED9D9", "#EEDBDB", "#EEDCDC", "#EEDEDE", "#EEE0E0",  "#EEE1E1", "#EEE3E3", "#EEE5E5", "#EEE7E7", "#EEE8E8", "#EEEAEA", "#EEECEC", "#EEEEEE")
## precision     =  3
## igraphLayout  =  layout_nicely
## interactive   =  TRUE
## engine    =  visNetwork
## max   =  100
## selection_menu    =  TRUE
## degree_highlight  =  1
## verbose   =  FALSE

12 Individual Rule Representation

The Parallel Coordinates Plot is useful to visualized which items were purchased with an other one.

# Filter top 16 rules with highest lift
subRules2<-head(subRules, n=16, by="lift")
plot(subRules2[1:16], method="paracoord", control = list(reorder = TRUE))