arules
algorithme to the ordersOlives
Olives
The dataset for this competition is a relational set of files describing customers’ orders over time. The goal of the competition is to predict which products will be in a user’s next order. The dataset is anonymized and contains a sample of over 3 million grocery orders from more than 200,000 Instacart users. For each user, we provide between 4 and 100 of their orders, with the sequence of products purchased in each order. We also provide the week and hour of day the order was placed, and a relative measure of time between orders. For more information, see the blog post accompanying its public release.
# Define multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
aisles <- data.table::fread("aisles.csv")
departments <- data.table::fread("departments.csv")
order_prior <- data.table::fread("order_products__prior.csv")
order_train <- data.table::fread("order_products__train.csv")
orders <- data.table::fread("orders.csv")
products <- data.table::fread("products.csv")
sample_sub <- data.table::fread("sample_submission.csv")
The list of aisles
The list of departments
Example of sample submission
The list of orders: order_dow’ is the day of week. user 1 has 11 orders, 1 of which is in the train set, and 10 of which are prior orders.
The list of Products
order_Prior: Contains previous order contents for all customers ‘reordered’ indicates that the customer has a previous order that contains the product
Order_Train: in each order, products were added de the card by priority. some products were reordered
aisles <- aisles %>%
mutate(aisle = as.factor(aisle))
departments <- departments %>%
mutate(department = as.factor(department))
order_prior <- order_prior %>%
mutate(reordered = as.logical(reordered)) #%>%
#mutate(product_id = as.factor(product_id))
order_train <- order_train %>%
mutate(reordered = as.logical(reordered))
orders <- orders %>%
mutate(eval_set = as.factor(eval_set)) %>%
mutate(w_day = wday(order_dow , label = TRUE)) %>% # +1: weekdays have the interval [0:6] an dnot [1:7]
mutate(user_id = as.factor(user_id))
products <- products %>%
mutate(product_name = as.factor(product_name))
In this first section, we would try explore the of details of the orders, the content of baskets, the best sold items.
## time of ordering
p1 <- orders %>%
ggplot(aes(x = order_hour_of_day)) +
geom_histogram( stat="count", color= "blue") # ,bins = 24
## days of ordering
p2 <- orders %>%
ggplot(aes(x = w_day)) +
geom_histogram( stat= "count", color = "green")
## interval of days before Reordering
p3 <- orders %>%
ggplot(aes(x = days_since_prior_order)) +
geom_histogram(bins = 30, color = "yellow")
# plot P1, P2, P3, p4 in the same plot
layout <- matrix(c(1,2,3,3), 2, 2, byrow = TRUE)
multiplot(p1, p2, p3, layout=layout)
we find: The main orders were done during the working hours of the day (8:17h). We assume that NA
corresponds to saturday
. A clear effect of the orders number is shown durinr the weekend. We observe a interval of reordering of 30 days. During this cycle we observe a picks at the dat 7, 15 and 30. We can observe more ordering during weekends during a month. But we can aloso observe diffrence of the number of orderinf between weekends.
## count the number of prior orders
p3 <- orders %>%
filter(eval_set == 'prior') %>%
ggplot(aes(order_number)) +
geom_histogram(stat = "count", color = "red")
p2 <- orders %>%
filter(eval_set == 'train') %>%
ggplot(aes(order_number)) +
geom_histogram(stat = "count", color = "green")
p1 <- orders %>%
filter(eval_set == 'test') %>%
ggplot(aes(order_number)) +
geom_histogram(stat = "count", color = "blue")
# plot P1, P2, P3 in the same plot
layout <- matrix(c(1,2,3,3), 2, 2, byrow = TRUE)
multiplot(p1, p2, p3, layout=layout)
We have more prior (200000) orders, than traiun (15000) than test (7500). We observe a pick at 100 orders number for test and train samples.
prior <- order_prior %>%
group_by(order_id) %>%
dplyr::summarise(n_orders = n()) %>%
ggplot(aes(x= n_orders)) +
geom_histogram(bins = 50, color = "yellow")+
xlim(0,50) +
labs(title = "Prior orders") +
xlab("number of items per order") +
ylab("n° orders")
train <- order_train %>%
group_by(order_id) %>%
dplyr::summarise(n_orders = n()) %>%
ggplot(aes(x= n_orders)) +
geom_histogram(bins = 50, color = "orange") +
xlim(0,50) +
labs(title = "Train orders") +
xlab("number of items per order") +
ylab("n° orders")
# plot P1, P2, P3 in the same plot
layout <- matrix(c(1, 2), 1, 2, byrow = TRUE)
multiplot(prior, train, layout=layout)
We find: The most frequent nbr of order is about 5-6 items for the Prior and Train datasets. Ordering 25 items seems to be an exception.
# top costumers that purchased the largest number of items
#the dataset is limited to 100 items maximum per costumer
top_costumers_items <- orders %>%
group_by(user_id) %>%
dplyr::summarise(n_orders = length(order_id)) %>%
filter(n_orders < 100) %>%
top_n(50, wt = n_orders) %>%
#arrange(freq) %>%
ggplot(aes(x = reorder(user_id, - n_orders), y = n_orders)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_point() +
labs(title = "Top costumers which buy more items") +
xlab("user_id") +
ylab("nbr items")
top_costumers_visits <- orders %>%
#filter(user_id == 123) %>%
group_by(user_id) %>%
dplyr::summarise(n_visits = last(order_number)) %>%
filter(n_visits < 100) %>%
top_n(20, n_visits) %>%
ggplot(aes(x = user_id, y = n_visits)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col() +
labs(title = "Number of visits per costumers") +
xlab("User_id") +
ylab("Order Number")
top20_item_prior <- order_prior %>%
group_by(product_id) %>%
dplyr::summarise(n = n()) %>%
top_n(20, wt = n) %>%
left_join(products, by = 'product_id') %>%
ggplot(aes(x = reorder(product_name, - n) , y = n )) +
#coord_flip() +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col( color = "green") +
labs(title = "Top 20 products for Prior orders") +
xlab("product_id") +
ylab("n° orders")
top20_item_train <- order_train %>%
group_by(product_id) %>%
dplyr::summarise(n = n()) %>%
top_n(20, wt = n) %>%
left_join(products, by = 'product_id') %>%
ggplot(aes(x = reorder(product_name, - n) , y = n )) +
#coord_flip() +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col( color = "red") +
labs(title = "Top 20 products for train orders") +
xlab("product_id") +
ylab("nbr orders")
# plot P1, P2, P3, p4 in the same plot
layout <- matrix(c(1,2,3,4), 2, 2, byrow = TRUE)
multiplot(top_costumers_visits, top_costumers_visits, top20_item_prior,top20_item_train, layout=layout)
reordered_train <- order_train %>%
filter(reordered == TRUE) %>%
group_by(product_id) %>%
dplyr::summarise(freq = n()) %>%
top_n(10, wt = freq) %>%
left_join(products, by = 'product_id') %>%
ggplot(aes(x = reorder(product_name, - freq) , y = freq )) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col( color = "red") +
labs(title = "Top reordered items for train dataset") +
xlab("product name") +
ylab("Reordered frequency")
reordered_prior <- order_prior %>%
filter(reordered == TRUE) %>%
group_by(product_id) %>%
dplyr::summarise(freq = n()) %>%
top_n(10, wt = freq) %>%
left_join(products, by = 'product_id') %>%
ggplot(aes(x = reorder(product_name, - freq) , y = freq )) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col( color = "blue") +
labs(title = "Top reordered items for prior dataset") +
xlab("product name") +
ylab("Reordered frequency")
# plot P1, P2, P3, p4 in the same plot
layout <- matrix(c(1,2), 1, 2, byrow = TRUE)
multiplot(reordered_train, reordered_prior, layout=layout)
We find: The main difference is in Organic Whole Milk
which is not in the top 10 products in train orders.
first_item_to_cart_prior <-
order_prior %>%
filter(add_to_cart_order == 1) %>%
#filter(product_id == "345") %>%
group_by(product_id, reordered) %>%
dplyr::summarise(n_first = n()) %>%
arrange(desc(n_first)) %>%
head(10) %>%
# dplyr::top_n(10, wt = n_first) %>% doesn't work
left_join(products, by = 'product_id') %>%
ggplot(aes(x = product_name, y = n_first)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col(aes(fill = reordered)) +
labs(title = "Top first added items to the cart (Prior)") +
xlab("product name") +
ylab("frequency of added the first to the cart")
first_item_to_cart_train <-
order_train %>%
filter(add_to_cart_order == 1) %>%
group_by(product_id, reordered) %>%
#summarize(proportion_reordered = mean(reordered), n=n())
dplyr::summarise(n_first = n()) %>%
arrange(desc(n_first)) %>%
head(10) %>%
# dplyr::top_n(10, wt = n_first) %>% doesn't work
left_join(products, by = 'product_id') %>%
ggplot(aes(x = product_name, y = n_first)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col(aes(fill = reordered)) +
labs(title = "Top first added items to the cart (Train)") +
xlab("product name") +
ylab("frequency of added the first to the cart")
# plot P1, P2, P3, p4 in the same plot
layout <- matrix(c(1,2), 1, 2, byrow = TRUE)
multiplot(first_item_to_cart_prior, first_item_to_cart_train, layout=layout)
we find: In general only the reordered items are added at the bigening of shopping. Only few cases with banana are not ordered for the first time and added the first to the cart.
order_train %>%
group_by(product_id, add_to_cart_order) %>%
summarize(n_items_basket = n(), reordered_rate_basket = mean(reordered)) %>%
group_by(add_to_cart_order) %>%
summarise(reordered_rate_all = mean(reordered_rate_basket))%>%
ggplot(aes(x= add_to_cart_order, y = reordered_rate_all)) +
geom_line()
first_pct_item_to_cart_train <- order_train %>%
group_by(product_id, add_to_cart_order) %>%
summarize(count = n()) %>%
mutate(pct=count/sum(count)) %>%
filter(add_to_cart_order == 1, count>10) %>%
arrange(desc(pct)) %>%
left_join(products,by='product_id') %>%
ungroup() %>%
select(product_name, pct, count) %>%
top_n(10, wt=pct) %>%
ggplot(aes(x = reorder(product_name,-pct), y = pct)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col() +
labs(title = "Top first % added items to the cart (Train)") +
xlab("Product name") +
ylab("Pourcentage (%) of item added the first to the cart")
first_pct_item_to_cart_prior <- order_prior %>%
group_by(product_id, add_to_cart_order) %>%
summarize(count = n()) %>%
mutate(pct=count/sum(count)) %>%
filter(add_to_cart_order == 1, count>10) %>%
arrange(desc(pct)) %>%
left_join(products,by='product_id') %>%
ungroup() %>%
select(product_name, pct, count) %>%
top_n(10, wt=pct) %>%
ggplot(aes(x = reorder(product_name,-pct), y = pct)) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)+
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col() +
labs(title = "Top first % added items to the cart (Prior)") +
xlab("Product name") +
ylab("Pourcentage (%) of item added the first to the cart")
# plot P1, P2, P3, p4 in the same plot
layout <- matrix(c(1,2), 1, 2, byrow = TRUE)
multiplot(first_pct_item_to_cart_prior, first_pct_item_to_cart_train, layout=layout)
banana_id <- products %>%
filter(product_name == "Banana") %>%
#select(product_id) %>%
.$product_id
strawberries_id <- products %>%
filter(product_name == "Strawberries") %>%
#select(product_id) %>%
.$product_id
spring_water_id <- products %>%
filter(product_name == "Spring Water") %>%
#select(product_id) %>%
.$product_id
asparagus_id <- products %>%
filter(product_name == "Asparagus") %>%
#select(product_id) %>%
.$product_id
# filter the orders with banana
order_train %>%
filter(product_id %in% c(banana_id, strawberries_id)) %>%
left_join(orders, by = "order_id") %>%
group_by( order_id, user_id) %>%
dplyr::summarise(n_orders = last(order_number)) %>%
filter(n_orders == 90)
FALSE # A tibble: 6 x 3
FALSE # Groups: order_id [6]
FALSE order_id user_id n_orders
FALSE <int> <fct> <int>
FALSE 1 271953 145628 90
FALSE 2 1578927 24195 90
FALSE 3 1589791 188446 90
FALSE 4 1854209 166449 90
FALSE 5 2803296 179192 90
FALSE 6 2975947 195993 90
interval_item_reordered_train <- order_train %>%
left_join(orders, by = "order_id") %>%
group_by(days_since_prior_order) %>%
summarize(mean_reorder = mean(reordered), n = n()) %>%
ggplot(aes(days_since_prior_order, y = mean_reorder)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col( color = "red") +
labs(title = "Mean Interval (days) of reordered items (Train)") +
xlab("Days") +
ylab("Mean reordered (%)")
interval_item_reordered_prior <- order_prior %>%
left_join(orders, by = "order_id") %>%
group_by(days_since_prior_order) %>%
summarize(mean_reorder = mean(reordered), n = n()) %>%
ggplot(aes(days_since_prior_order, y = mean_reorder)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_col( color = "blue") +
labs(title = "Mean Interval (days) of reordered items (Prior)") +
xlab("Days") +
ylab("Mean reordered (%)")
# plot P1, P2, P3, p4 in the same plot
layout <- matrix(c(1,2), 1, 2, byrow = TRUE)
multiplot(interval_item_reordered_train, interval_item_reordered_prior, layout=layout)
With 0 days items are reordered at 0.85%. After 30 days the same items are ordered at 0.45%
grp_pdt_train <- order_train %>%
#left_join(orders, by = "order_id") %>%
group_by(product_id) %>%
summarize(mean_reorder = mean(reordered), n = n()) %>%
ggplot(aes(n, y = mean_reorder)) +
#theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_point(size = 0.1, alpha = 0.3) +
geom_smooth(color="red")+
labs(title = "Number of reordered items per product_id (Train)") +
xlab("Number of orders") +
ylab("Mean reordered (%) per product_id") +
coord_cartesian(xlim=c(0,5000))
grp_order_train <- order_train %>%
#left_join(orders, by = "order_id") %>%
group_by(order_id) %>%
summarize(mean_reorder = mean(reordered), n = n()) %>%
ggplot(aes(n, y = mean_reorder)) +
#theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_point(color= "red", size = 0.1, alpha = 0.3) +
labs(title = "Number of reordered items per order_id (Train)") +
xlab("Number of orders") +
ylab("Mean reordered (%) per order_id")
grp_pdt_prior <- order_prior %>%
#left_join(orders, by = "order_id") %>%
group_by(product_id) %>%
summarize(mean_reorder = mean(reordered), n = n()) %>%
ggplot(aes(n, y = mean_reorder)) +
#theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_point( size = 0.1, alpha = 0.3) +
geom_smooth(color="blue")+
labs(title = "Number of reordered items per product_id (Prior)") +
xlab("Number of orders") +
ylab("Mean reordered (%) per product_id") +
coord_cartesian(xlim=c(0,10000))
grp_order_prior <- order_prior %>%
#left_join(orders, by = "order_id") %>%
group_by(order_id) %>%
summarize(mean_reorder = mean(reordered), n = n()) %>%
ggplot(aes(n, y = mean_reorder)) +
#theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_point(color="blue", size = 0.1, alpha = 0.3) +
labs(title = "Number of reordered items per order_id (Prior)") +
xlab("Number of orders") +
ylab("Mean reordered (%) per order_id")
# plot P1, P2, P3, p4 in the same plot
layout <- matrix(c(1,2, 3, 4), 2, 2, byrow = TRUE)
multiplot(grp_pdt_train, grp_order_train,grp_pdt_prior, grp_order_prior , layout=layout)
FALSE `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
FALSE `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
library(treemap)
items_per_aisle <- products %>%
group_by(department_id, aisle_id) %>%
summarize(n_items = n()) %>%
left_join(departments,by="department_id") %>%
left_join(aisles,by="aisle_id")
tree_aisle <- order_train %>%
group_by(product_id) %>%
dplyr::summarise(count_ordered_item = n()) %>%
left_join(products, by = "product_id") %>%
ungroup() %>%
group_by(department_id, aisle_id) %>%
summarize(sumcount = sum(count_ordered_item)) %>%
left_join(items_per_aisle, by = c("department_id", "aisle_id")) %>%
mutate(onesize = 1)
treemap(tree_aisle,index = c("department","aisle"),
vSize = "onesize",
vColor = "department",
palette = "Set3",
title = "super market Map",
sortID = "-sumcount",
border.col = "#FFFFFF",
type = "categorical",
fontsize.legend = 0,
bg.labels = "#FFFFFF")
treemap(tree_aisle,index = c("department","aisle"),
vSize = "sumcount",
vColor = "department",
palette = "Set3",
title = "super market Map",
# sortID = "-sumcount",
border.col = "#FFFFFF",
type = "categorical",
fontsize.legend = 0,
bg.labels = "#FFFFFF")
order_number_upper2 <- order_prior %>%
group_by(order_id) %>%
dplyr::summarise(mean_redordered_item_per_basket = mean(reordered), n_items_per_basket = n()) %>%
left_join(orders, by = "order_id") %>%
filter(order_number > 2)
order_number_upper2 %>%
#filter(eval_set =="prior") %>%
group_by(user_id) %>%
dplyr::summarise(sum_orders = sum(mean_redordered_item_per_basket == 1, na.rm = TRUE), ratio_reordered_items = sum_orders/n()) %>%
filter(ratio_reordered_items == 1) %>%
arrange(desc(sum_orders)) %>%
head(10)
FALSE # A tibble: 10 x 3
FALSE user_id sum_orders ratio_reordered_items
FALSE <fct> <int> <dbl>
FALSE 1 99753 97 1
FALSE 2 55331 49 1
FALSE 3 106510 49 1
FALSE 4 111365 47 1
FALSE 5 74656 46 1
FALSE 6 170174 45 1
FALSE 7 12025 43 1
FALSE 8 164779 37 1
FALSE 9 37075 34 1
FALSE 10 110225 33 1
Here i look for customers who just reorder the same products again all the time. To search those I look at all orders (excluding the first order), where the percentage of reordered items is exactly 1 (This can easily be adapted to look at more lenient thresholds). We can see there are in fact 3,487 customers, just always reordering products. user_id 99753 reorded the same items (same basket countain) for 97 visits/orders.
order_number_upper2 %>%
filter(user_id == 99753) %>%
left_join(order_prior, by = "order_id") %>%
left_join(products, by = "product_id") %>%
select(product_name, user_id, order_id, w_day, days_since_prior_order) %>%
arrange(order_id) %>%
head(10)
## # A tibble: 10 x 5
## product_name user_id order_id w_day days_since_prior_order
## <fct> <fct> <int> <ord> <dbl>
## 1 Organic Whole Milk 99753 46614 Tue 2
## 2 Organic Reduced Fat Milk 99753 46614 Tue 2
## 3 Organic Whole Milk 99753 67223 Wed 2
## 4 Organic Reduced Fat Milk 99753 67223 Wed 2
## 5 Organic Whole Milk 99753 214506 Sun 5
## 6 Organic Reduced Fat Milk 99753 214506 Sun 5
## 7 Organic Whole Milk 99753 240832 Tue 2
## 8 Organic Reduced Fat Milk 99753 240832 Tue 2
## 9 Organic Whole Milk 99753 260804 Sun 4
## 10 Organic Reduced Fat Milk 99753 260804 Sun 4
The user_id buy always the same two items Organic milk
, maybe for a baby.
orders %>%
left_join(sample_sub, "order_id") %>%
head(20)
## order_id user_id eval_set order_number order_dow order_hour_of_day
## 1 2539329 1 prior 1 2 8
## 2 2398795 1 prior 2 3 7
## 3 473747 1 prior 3 3 12
## 4 2254736 1 prior 4 4 7
## 5 431534 1 prior 5 4 15
## 6 3367565 1 prior 6 2 7
## 7 550135 1 prior 7 1 9
## 8 3108588 1 prior 8 1 14
## 9 2295261 1 prior 9 1 16
## 10 2550362 1 prior 10 4 8
## 11 1187899 1 train 11 4 8
## 12 2168274 2 prior 1 2 11
## 13 1501582 2 prior 2 5 10
## 14 1901567 2 prior 3 1 10
## 15 738281 2 prior 4 2 10
## 16 1673511 2 prior 5 3 11
## 17 1199898 2 prior 6 2 9
## 18 3194192 2 prior 7 2 12
## 19 788338 2 prior 8 1 15
## 20 1718559 2 prior 9 2 9
## days_since_prior_order w_day products
## 1 NA Mon <NA>
## 2 15 Tue <NA>
## 3 21 Tue <NA>
## 4 29 Wed <NA>
## 5 28 Wed <NA>
## 6 19 Mon <NA>
## 7 20 Sun <NA>
## 8 14 Sun <NA>
## 9 0 Sun <NA>
## 10 30 Wed <NA>
## 11 14 Wed <NA>
## 12 NA Mon <NA>
## 13 10 Thu <NA>
## 14 3 Sun <NA>
## 15 8 Mon <NA>
## 16 8 Tue <NA>
## 17 13 Mon <NA>
## 18 14 Mon <NA>
## 19 27 Sun <NA>
## 20 8 Mon <NA>
orders %>%
inner_join(sample_sub, "order_id") %>%
head(10)
## order_id user_id eval_set order_number order_dow order_hour_of_day
## 1 2774568 3 test 13 5 15
## 2 329954 4 test 6 3 12
## 3 1528013 6 test 4 3 16
## 4 1376945 11 test 8 6 11
## 5 1356845 12 test 6 1 20
## 6 2161313 15 test 23 1 9
## 7 1416320 16 test 7 0 13
## 8 1735923 19 test 10 6 17
## 9 1980631 20 test 5 1 11
## 10 139655 22 test 16 5 6
## days_since_prior_order w_day products
## 1 11 Thu 39276 29259
## 2 30 Tue 39276 29259
## 3 22 Tue 39276 29259
## 4 8 Fri 39276 29259
## 5 30 Sun 39276 29259
## 6 7 Sun 39276 29259
## 7 7 <NA> 39276 29259
## 8 8 Fri 39276 29259
## 9 30 Sun 39276 29259
## 10 1 Thu 39276 29259
arules
algorithme to the ordersIn each orders, costumers bougth multiple items forming a basket. We will regroup the orders per basket.
We need to join orders by order_id and then by products_id. This code take a while.
#ransactions <- orders %>%
# left_join(order_train, by = "order_id") %>%
# left_join(products, by = "product_id")
# baskets <- transactions %>%
# plyr::ddply(c("order_id", "user_id"),
# function(df1) paste(df1$product_name,
# collapse = ","))
#
# colnames(baskets) <- c("Order_id","user_id","Baskets")
# baskets <- readRDS("baskets_prior.RDS")
# tibble::glimpse(baskets)
# saveRDS(object = baskets, file = "baskets_train.RDS")
# # Write the Baskets list to a file
# write.csv(baskets$V1,"transactions_list.csv", quote = FALSE, row.names = TRUE)
#
library(arules)
FALSE Loading required package: Matrix
FALSE
FALSE Attaching package: 'arules'
FALSE The following object is masked from 'package:dplyr':
FALSE
FALSE recode
FALSE The following objects are masked from 'package:base':
FALSE
FALSE abbreviate, write
# # load the Baskets list as a basket format using arules package
tr_list <- read.transactions('transactions_list.csv', format = 'basket', sep=',')
summary(tr_list)
## transactions as itemMatrix in sparse format with
## 3421084 rows (elements/itemsets/transactions) and
## 3702915 columns (items) and a density of 2.788542e-06
##
## most frequent items:
## Banana Bag of Organic Bananas Organic Strawberries
## 460485 367637 252559
## Organic Baby Spinach Organic Hass Avocado (Other)
## 232235 205061 33807223
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10
## 206210 173996 198879 213972 224853 227409 225230 216030 200441 182058
## 11 12 13 14 15 16 17 18 19 20
## 163733 145878 129947 115507 101932 89900 79491 69553 61292 53686
## 21 22 23 24 25 26 27 28 29 30
## 46683 40789 35385 30654 26430 22919 19838 17001 14613 12638
## 31 32 33 34 35 36 37 38 39 40
## 10863 9192 7861 6780 5783 5026 4267 3586 3090 2604
## 41 42 43 44 45 46 47 48 49 50
## 2184 1828 1556 1371 1141 1014 898 729 645 513
## 51 52 53 54 55 56 57 58 59 60
## 459 376 336 242 226 205 176 166 116 109
## 61 62 63 64 65 66 67 68 69 70
## 84 83 78 69 54 51 44 37 30 33
## 71 72 73 74 75 76 77 78 79 80
## 27 14 13 26 16 17 11 11 11 8
## 81 82 83 84 85 86 87 88 89 90
## 6 9 2 3 4 5 2 2 4 5
## 91 92 93 94 95 96 97 98 99 100
## 1 2 1 1 2 3 1 1 4 2
## 101 102 103 106 111 112 113 114 115 120
## 1 1 4 2 1 2 1 1 1 2
## 140 151
## 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 5.00 9.00 10.33 14.00 151.00
##
## includes extended item information - examples:
## labels
## 1 .5\\ Waterproof Tape
## 2 (70% Juice!) Mountain Raspberry Juice Squeeze
## 3 \\Constant Comment\\ Black Tea
There are 3421084 transactions and 3702915 purchased items (One item can be purchased multiple times). Each transaction is a collection of items.
The Density is 2.788542e-06 tells the percentage of non-zero cells in a sparse matrix. It is the total number of items that are purchased and divided by a possible number of items in the matrix.
The multiplication of the three values is the total of products were purchased.
round( 3421084 * 3702915 * 2.788542e-06)
## [1] 35325203
summary(tr_list)@lengths
## sizes
## 1 2 3 4 5 6 7 8 9 10
## 206210 173996 198879 213972 224853 227409 225230 216030 200441 182058
## 11 12 13 14 15 16 17 18 19 20
## 163733 145878 129947 115507 101932 89900 79491 69553 61292 53686
## 21 22 23 24 25 26 27 28 29 30
## 46683 40789 35385 30654 26430 22919 19838 17001 14613 12638
## 31 32 33 34 35 36 37 38 39 40
## 10863 9192 7861 6780 5783 5026 4267 3586 3090 2604
## 41 42 43 44 45 46 47 48 49 50
## 2184 1828 1556 1371 1141 1014 898 729 645 513
## 51 52 53 54 55 56 57 58 59 60
## 459 376 336 242 226 205 176 166 116 109
## 61 62 63 64 65 66 67 68 69 70
## 84 83 78 69 54 51 44 37 30 33
## 71 72 73 74 75 76 77 78 79 80
## 27 14 13 26 16 17 11 11 11 8
## 81 82 83 84 85 86 87 88 89 90
## 6 9 2 3 4 5 2 2 4 5
## 91 92 93 94 95 96 97 98 99 100
## 1 2 1 1 2 3 1 1 4 2
## 101 102 103 106 111 112 113 114 115 120
## 1 1 4 2 1 2 1 1 1 2
## 140 151
## 1 1
products / number of transactions
.library(RColorBrewer)
#par(mfrow=c(2,1))
arules::itemFrequencyPlot(tr_list,topN=10,type="absolute",col=brewer.pal(8,'Pastel2'), main="Absolute Product Frequency Plot")
arules::itemFrequencyPlot(tr_list,topN=10,type="relative",col=brewer.pal(8,'Pastel2'), main="Relative Product Frequency Plot")
The next step is to mine the rules using the APRIORI algorithm. The function apriori() is from arules package.
We can set and optimize the parameter of the minimum
support of 0.0001, the `minimum confidence of 0.8, maximum of 10 items (maxlen).
#Min Support as 0.0001, confidence as 0.8 and maximum of 10 products.
association.rules <- arules::apriori(tr_list, parameter = list(supp=0.001, conf=0.8)) #, maxlen=10
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 3421
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[3702915 item(s), 3421084 transaction(s)] done [27.23s].
## sorting and recoding items ... [1635 item(s)] done [0.95s].
## creating transaction tree ... done [4.82s].
## checking subsets of size 1 2 3 4 done [1.26s].
## writing ... [209 rule(s)] done [0.01s].
## creating S4 object ... done [2.06s].
Definition * Itemset: Collection of one or more items. K-item-set means a set of k items. * Support Count: Frequency of occurrence of an item-set * Support(s): Fraction of transactions that contain the item-set
inspect(association.rules[1:10])
## lhs rhs support confidence lift count
## [1] {Paper Towels Choose-A-Sheet} => {One-Ply} 0.001009914 1.0000000 990.1835 3455
## [2] {One-Ply} => {Paper Towels Choose-A-Sheet} 0.001009914 1.0000000 990.1835 3455
## [3] {Paper Towels Choose-A-Sheet} => {Mega Rolls} 0.001009914 1.0000000 971.3470 3455
## [4] {Mega Rolls} => {Paper Towels Choose-A-Sheet} 0.001009914 0.9809767 971.3470 3455
## [5] {One-Ply} => {Mega Rolls} 0.001009914 1.0000000 971.3470 3455
## [6] {Mega Rolls} => {One-Ply} 0.001009914 0.9809767 971.3470 3455
## [7] {Medium Pulp} => {Country Stand Juice} 0.001077729 1.0000000 927.8774 3687
## [8] {Country Stand Juice} => {Medium Pulp} 0.001077729 1.0000000 927.8774 3687
## [9] {2 Huge Rolls = 5 Regular Rolls Towels/Napkins} => {Select-A-Size Paper Towels} 0.001146420 1.0000000 644.2719 3922
## [10] {2 Huge Rolls = 5 Regular Rolls Towels/Napkins} => {White} 0.001146420 1.0000000 504.6591 3922
Interpretation * 100% of the customers who bought Paper Towels Choose-A-Sheet
also bought One-Ply
. * 100% of the customers who bought Medium Pulp
also bought Country Stand Juice
.
Olives
Olives.association.rules <- apriori(tr_list, parameter = list(supp = 0.001, conf = 0.8), appearance = list(default="lhs", rhs = "Olives"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 3421
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[3702915 item(s), 3421084 transaction(s)] done [22.90s].
## sorting and recoding items ... [1635 item(s)] done [1.03s].
## creating transaction tree ... done [5.23s].
## checking subsets of size 1 2 3 4 done [1.35s].
## writing ... [6 rule(s)] done [0.01s].
## creating S4 object ... done [2.32s].
Olives
# lhs= canned beer because you want to find out the probability of that in how many customers buy canned beer along with other items
inspect(Olives.association.rules)
## lhs rhs support confidence lift
## [1] {Kalamata} => {Olives} 0.001097898 1.0000000 694.4953
## [2] {Pitted} => {Olives} 0.001094682 0.9067797 629.7542
## [3] {Kalamata,Pitted} => {Olives} 0.001094390 1.0000000 694.4953
## [4] {Kalamata,Organic} => {Olives} 0.001094390 1.0000000 694.4953
## [5] {Organic,Pitted} => {Olives} 0.001094390 0.9306488 646.3312
## [6] {Kalamata,Organic,Pitted} => {Olives} 0.001094390 1.0000000 694.4953
## count
## [1] 3756
## [2] 3745
## [3] 3744
## [4] 3744
## [5] 3744
## [6] 3744
#inspect(association.rules[1:30])
Olives
(RHS) was Bought as Kalamata
or Pitted
.
A straight-forward visualization of association rules is to use a scatter plot using plot() of the arulesViz package. It uses Support and Confidence on the axes.
library(arulesViz)
# Filter rules with confidence greater than 0.9 or 90%
subRules <- association.rules[quality(association.rules)$confidence > 0.8]
#Plot SubRules
plot(subRules, jitter = 0.3)
plot(subRules,method="two-key plot", jitter = 0.3)
#top10subRules <- head(subRules, n = 10, by = "confidence")
plot(subRules[1:30], method = "graph", engine = "htmlwidget", control = list(type = "items"))
## Warning: Unknown control parameters: type
## Available control parameters (with default values):
## itemCol = #CBD2FC
## nodeCol = c("#EE0000", "#EE0303", "#EE0606", "#EE0909", "#EE0C0C", "#EE0F0F", "#EE1212", "#EE1515", "#EE1818", "#EE1B1B", "#EE1E1E", "#EE2222", "#EE2525", "#EE2828", "#EE2B2B", "#EE2E2E", "#EE3131", "#EE3434", "#EE3737", "#EE3A3A", "#EE3D3D", "#EE4040", "#EE4444", "#EE4747", "#EE4A4A", "#EE4D4D", "#EE5050", "#EE5353", "#EE5656", "#EE5959", "#EE5C5C", "#EE5F5F", "#EE6262", "#EE6666", "#EE6969", "#EE6C6C", "#EE6F6F", "#EE7272", "#EE7575", "#EE7878", "#EE7B7B", "#EE7E7E", "#EE8181", "#EE8484", "#EE8888", "#EE8B8B", "#EE8E8E", "#EE9191", "#EE9494", "#EE9797", "#EE9999", "#EE9B9B", "#EE9D9D", "#EE9F9F", "#EEA0A0", "#EEA2A2", "#EEA4A4", "#EEA5A5", "#EEA7A7", "#EEA9A9", "#EEABAB", "#EEACAC", "#EEAEAE", "#EEB0B0", "#EEB1B1", "#EEB3B3", "#EEB5B5", "#EEB7B7", "#EEB8B8", "#EEBABA", "#EEBCBC", "#EEBDBD", "#EEBFBF", "#EEC1C1", "#EEC3C3", "#EEC4C4", "#EEC6C6", "#EEC8C8", "#EEC9C9", "#EECBCB", "#EECDCD", "#EECFCF", "#EED0D0", "#EED2D2", "#EED4D4", "#EED5D5", "#EED7D7", "#EED9D9", "#EEDBDB", "#EEDCDC", "#EEDEDE", "#EEE0E0", "#EEE1E1", "#EEE3E3", "#EEE5E5", "#EEE7E7", "#EEE8E8", "#EEEAEA", "#EEECEC", "#EEEEEE")
## precision = 3
## igraphLayout = layout_nicely
## interactive = TRUE
## engine = visNetwork
## max = 100
## selection_menu = TRUE
## degree_highlight = 1
## verbose = FALSE
The Parallel Coordinates Plot is useful to visualized which items were purchased with an other one.
# Filter top 16 rules with highest lift
subRules2<-head(subRules, n=16, by="lift")
plot(subRules2[1:16], method="paracoord", control = list(reorder = TRUE))