order_id
customer_id
and customer_unique_id
?Customer_unique_id
and list paires with
Custumer_id
Customer_unique_id
and their
zip_code_prefix
customers_id
equal the duplicates in Lat/lon for the same
Zip code?This dataset was generously provided by Olist, the largest department store in Brazilian marketplaces. Olist connects small businesses from all over Brazil to channels without hassle and with a single contract. Those merchants are able to sell their products through the Olist Store and ship them directly to the customers using Olist logistics partners. See more on our website: www.olist.com.
The data set comes from kaggle competition named Brazilian E-Commerce Public Dataset by Olist.
# Set python environment and version in RStudio ;-)
reticulate::use_python("/usr/bin/python3.10", required = TRUE)
reticulate::py_config()
## python: /usr/bin/python3.10
## libpython: /usr/lib/python3.10/config-3.10-x86_64-linux-gnu/libpython3.10.so
## pythonhome: //usr://usr
## version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
## numpy: /home/kirus/.local/lib/python3.10/site-packages/numpy
## numpy_version: 1.26.0
##
## NOTE: Python version was forced by use_python() function
# Load needed R packages
library(tidyverse)
library(data.table)
library(leaflet)
library(stringr)
library(gridExtra)
#require(maps)
library(ggplot2)
library(lubridate)
#require(mapview)
#require(scales)
#require(RColorBrewer)
require(lubridate)
#library(plotly)
#require(gganimate)
#require(gifski)
#library(caret)
library(DT)
library(kableExtra)
customers <- fread("data/olist_customers_dataset.csv")
geolocation <- fread("data/olist_geolocation_dataset.csv")
order_items <- fread("data/olist_order_items_dataset.csv")
order_payments <- fread("data/olist_order_payments_dataset.csv")
order_reviews <- fread("data/olist_order_reviews_dataset.csv")
orders <- fread("data/olist_orders_dataset.csv")
products <- fread("data/olist_products_dataset.csv")
sellers <- fread("data/olist_sellers_dataset.csv")
category_translation <- fread("data/product_category_name_translation.csv")
Compute:
\[Volume = product\_length\_cm * product\_height\_cm * product\_width\_cm / 1000\]
Mutate volume range
Mutate weight range
## check for missing value in specific columns
product_id_with_missed_data <-
products |>
select(product_id,product_category_name, product_weight_g,
product_length_cm, product_height_cm, product_width_cm) |>
filter_all(any_vars(is.na(.)))
product_id_with_missed_data |>
knitr::kable() |> kable_styling() |>
scroll_box(width = "900px", height = "300px")
product_id | product_category_name | product_weight_g | product_length_cm | product_height_cm | product_width_cm |
---|---|---|---|---|---|
09ff539a621711667c43eba6a3bd8466 | bebes | NA | NA | NA | NA |
5eb564652db742ff8f28759cd8d2652a | NA | NA | NA | NA |
sellers_missing_data <-
order_items |>
filter(product_id %in% product_id_with_missed_data$product_id) %>%
group_by(product_id, seller_id, shipping_limit_date, price, freight_value) %>%
count() %>%
arrange(shipping_limit_date)
sellers_missing_data %>%
datatable()
sellers_missing_data %>%
#mutate(total= price+ freight_value) %>%
pivot_longer(-c(product_id, seller_id,n,shipping_limit_date)) %>%
ggplot(aes(x = shipping_limit_date, y= value, fill = name)) +
geom_area()+
facet_wrap(~product_id, scales = "free_y")+
geom_point(aes(x = shipping_limit_date, y= value, color = name))
* Increase of the price during the period of study.
## add english name and weight range catagories
products <- products |>
left_join(y = category_translation, by = "product_category_name") |>
mutate(`volume dm³` = product_length_cm * product_height_cm * product_width_cm/1000) %>%
rename(weight = product_weight_g) %>%
mutate(`weight kg` = weight/1000)
products |>
head(1000) |>
datatable()
products <- products |>
mutate(weight_range = if_else(`weight kg` <= 1,"0-1 kg",
if_else(`weight kg`> 1 & `weight kg` <= 5, "1-5 kg",
if_else(`weight kg` > 5 &`weight kg` <=10, "5-10 kg",
if_else(`weight kg` > 10 & `weight kg` <= 20, "10-20 kg",
if_else(`weight kg` > 20 & `weight kg` <= 30, "20-30 kg",
if_else(`weight kg` > 30 & `weight kg` <= 42, "30-42 kg", NA )))))))
products |>
head(1000) |>
datatable()
## glimpse the density of products volumes
products |>
drop_na(`volume dm³`, weight_range) |>
ggplot() +
aes(x=`volume dm³`, color= weight_range, fill= weight_range, na.rm = TRUE)+
geom_density(adjust= 10, alpha=0.1)
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## add volume range
products <- products |>
mutate(volume_range = if_else(`volume dm³` <= 1,"0-1 dm³",
if_else(`volume dm³`> 1 & `volume dm³` <= 5, "1-5 dm³",
if_else(`volume dm³` > 5 &`volume dm³` <=10, "5-10 dm³",
if_else(`volume dm³` > 10 & `volume dm³` <= 20, "10-20 dm³",
if_else(`volume dm³` > 20 & `volume dm³` <= 30, "20-30 dm³",
if_else(`volume dm³` > 30 , ">30 dm3", NA )))))))
products |> head() |>
datatable()
## list categories
products %>%
#group_by(product_category_name_english, product_name_length) |>
#mutate(Qt=n()) |>
pull(product_category_name_english) |>
unique() |> as.data.frame() |>
rownames_to_column(var = "index") |>
knitr::kable() |> kable_styling() |>
scroll_box(width = "300px", height = "500px")
index | unique(products %>% pull(product_category_name_english)) |
---|---|
1 | perfumery |
2 | art |
3 | sports_leisure |
4 | baby |
5 | housewares |
6 | musical_instruments |
7 | cool_stuff |
8 | furniture_decor |
9 | home_appliances |
10 | toys |
11 | bed_bath_table |
12 | construction_tools_safety |
13 | computers_accessories |
14 | health_beauty |
15 | luggage_accessories |
16 | garden_tools |
17 | office_furniture |
18 | auto |
19 | electronics |
20 | fashion_shoes |
21 | telephony |
22 | stationery |
23 | fashion_bags_accessories |
24 | computers |
25 | home_construction |
26 | watches_gifts |
27 | construction_tools_construction |
28 | pet_shop |
29 | small_appliances |
30 | agro_industry_and_commerce |
31 | NA |
32 | furniture_living_room |
33 | signaling_and_security |
34 | air_conditioning |
35 | consoles_games |
36 | books_general_interest |
37 | costruction_tools_tools |
38 | fashion_underwear_beach |
39 | fashion_male_clothing |
40 | kitchen_dining_laundry_garden_furniture |
41 | industry_commerce_and_business |
42 | fixed_telephony |
43 | construction_tools_lights |
44 | books_technical |
45 | home_appliances_2 |
46 | party_supplies |
47 | drinks |
48 | market_place |
49 | la_cuisine |
50 | costruction_tools_garden |
51 | fashio_female_clothing |
52 | home_confort |
53 | audio |
54 | food_drink |
55 | music |
56 | food |
57 | tablets_printing_image |
58 | books_imported |
59 | small_appliances_home_oven_and_coffee |
60 | fashion_sport |
61 | christmas_supplies |
62 | fashion_childrens_clothes |
63 | dvds_blu_ray |
64 | arts_and_craftmanship |
65 | furniture_bedroom |
66 | cine_photo |
67 | diapers_and_hygiene |
68 | flowers |
69 | home_comfort_2 |
70 | security_and_services |
71 | furniture_mattress_and_upholstery |
72 | cds_dvds_musicals |
## count how many products by category and count how many products with the same description length
products %>%
#filter(product_category_name== "cama_mesa_banho") |>
#mutate(product_description_lenght= as.factor(product_description_lenght)) |>
group_by(product_category_name_english, product_name_lenght, product_description_lenght) |>
#mutate(Qt= n()) |>
summarise(products_with_same_name_lenght = n(), .groups='keep') |>
group_by(product_description_lenght) |>
#filter(product_description_lenght=="93") |>
mutate(products_with_same_description_lenght = n()) |>
#head(1000) |>
#arrange(desc(Qt)) |>
#filter(product_category_name_english %in% c("baby", "housewares")) |>
arrange(desc(products_with_same_name_lenght)) |>
head(100) |>
datatable()
There are 610 products without description
We assume that products from the same category and have different name lenght are different.
We assume that products with the same name length and different description length are same bu from different sellers.
group_pdt_sellers <-
order_items |>
group_by(product_id, seller_id) |>
#mutate(Freq = n()) |>
summarise(Freq= n(), .groups="keep") |>
arrange(desc(Freq))
group_pdt_sellers |>
head(100) |>
datatable()
best_selling <- products |>
left_join(order_items , by = "product_id") |>
group_by(product_category_name_english, seller_id) |>
mutate(n_sold = n()) |>
distinct(product_category_name_english, seller_id, n_sold, .keep_all = FALSE) |>
#select( product_category_name_english,product_name_lenght, seller_id, product_id, n_sold )|>
arrange(desc(n_sold))
best_selling |>
filter(n_sold >620)|>
ggplot() +
aes(x = reorder(product_category_name_english, desc(n_sold), sum), y = n_sold, fill= seller_id) +
geom_col() +
geom_text(aes(label = n_sold), position = position_stack(vjust = .5))+
theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 7)
)+
ggtitle("Which caterogies are the best-selling by seller?")
products |>
left_join(order_items , by = "product_id") |>
group_by( seller_id, product_category_name_english, product_name_lenght) |>
summarise(products_with_same_name_lenght= n(), .groups = "keep") |>
#mutate(products_with_same_name_lenght= n())|>
#select(product_id, product_category_name_english, product_name_lenght, seller_id, products_with_same_name_lenght) |>
unique() |>
arrange(!is.na(product_name_lenght)) |>
arrange(desc(products_with_same_name_lenght)) |>
head(500) |>
datatable()
We find which seller has more missing product name:
c826c40d7b19f62a09e2d7c5e7295ee2
. We can find his
geolocation.
Several items with the same product length name exist for the same seller. It seems the same product but with different color or version or option.
It is possible that products belonging to different categories have the same name length and sold by different sellers.
CONCLUSION: The product_name_lenght
is
not specific to a specific product or seller.
products |>
drop_na(`weight kg`, `volume dm³`, `product_category_name_english`) |>
ggplot() +
aes(y = `weight kg` , x = `product_category_name_english`, color= weight_range) +#, size=`volume dm³`
geom_point() +
theme(legend.position = "left", #axis.text.y = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 14)
) +
#facet_wrap(~weight_range, nrow = 6, scales = "free_y")+
facet_wrap(~volume_range, nrow = 6, scales= "free_y")
We can note that some products with small volume but with high weight and vis-versa.
The must sold product have volume upper than 30 dm³.
## join items and orders and products
detail_orders <- order_items |>
left_join(orders, by = "order_id") |>
left_join(products, by = c("product_id")) |>
select(order_id, product_category_name_english, product_id, seller_id, customer_id, order_item_id, everything())
detail_orders|>
head(1000) |>
datatable()
## group by order and check items
orders_customer <-
detail_orders |>
select(order_id, customer_id, product_id, product_category_name_english) |>
group_by( order_id, customer_id) |>
mutate(n_orders= n()) |>
#summarise(n_items= n(), .groups = "keep") |>
group_by( order_id, customer_id, product_id) |>
mutate(n_items= n()) |>
ungroup() |>
arrange(desc(n_orders))
orders_customer |>
head(1000) |>
datatable()
The maximum number of orders per customer is 21.
The maximum number of items per orders is 20.
## check how many customers
n_customer_id <-
customers |>
select(customer_id) |>
unique() |>
nrow()
n_customer_id_with_order <-
orders_customer |>
select(customer_id) |>
unique() |>
nrow()
n_customer_unique_id <-
customers |>
select(customer_unique_id) |>
unique() |>
nrow()
print(paste0("there are ", n_customer_id, " Customers in this study"))
## [1] "there are 99441 Customers in this study"
print(paste0("there are ", n_customer_id_with_order, " Customers that purchased items in this study"))
## [1] "there are 98666 Customers that purchased items in this study"
## [1] "there are 96096 UNIQUE customer in this study"
order_payments %>%
group_by(order_id) %>%
mutate(Total = sum(payment_value)) %>%
head(1000) %>%
datatable()
orders_items
order_payments %>%
#filter(payment_type == "voucher") %>%
group_by(order_id) %>%
reframe(sequential = paste0(payment_sequential, collapse= ", "),
types= paste0(payment_type, collapse= ", "),
installments= paste0( payment_installments, collapse = ", "),
Total = sum(payment_value)) %>%
filter(sequential != "1") %>%
mutate(Sequential = map_chr(strsplit(sequential, ","), ~
toString(sort(as.numeric(.x))))) %>%
mutate(Types = map_chr(strsplit(types, ","), ~
toString(sort(as.character(.x))))) %>%
mutate(Installments = map_chr(strsplit(installments, ","), ~
toString(sort(as.character(.x))))) %>%
# mutate(Sequential= as.factor(Sequential), Types = as.factor(Types)) %>%
group_by(Sequential, Types) %>%
reframe(Freq= n(), median_total = median(Total)) %>%
mutate(credit_card = lengths(regmatches(Types, gregexpr("credit_card", Types)))) %>%
mutate(voucher = purrr::map_int(Types, ~sum(gregexpr("voucher", .x, fixed=TRUE)[[1]] > 0))) %>%
mutate(debit_card = lengths(regmatches(Types, gregexpr("debit_card", Types)))) %>%
mutate(boleto = str_count(Types, 'boleto')) %>%
#mutate(number_of_drugs = map(str_split(gsub('credit_card|voucher', ',', str_trim(Types)), '\\s+'), ~ length(.x))) %>%
mutate(credit_card=(paste0(credit_card, ":credit"))) %>%
mutate(voucher = paste0(voucher, ":voucher")) %>%
mutate(debit_card= paste0(debit_card, ":debit")) %>%
mutate(boleto= paste0(boleto, ":boleto")) %>%
unite(col = "Types", credit_card:boleto, sep = ", ", remove = TRUE) %>%
mutate(Sequential = map_chr(strsplit(Sequential, ","), ~
toString(paste0(min(as.numeric(.x), na.rm=TRUE),":" ,max(as.numeric(.x), na.rm=TRUE))
)
)
) %>%
#select(order_id, Sequential, Types, Installments, Total, Freq,everything()) %>%
#arrange(Sequential)
ggplot() +
aes(x = reorder(Sequential, desc(Freq)), y = Freq, fill = Types) +
geom_col(na.rm = TRUE) +
ggforce::facet_zoom(ylim = c(0, 75), zoom.data = ifelse(a >= 75, NA, FALSE))+
#ggplot2::scale_y_continuous(trans = squash_axis(5, 95, 10)) +
geom_text(aes(label = median_total), position = position_stack(vjust = .5))+
theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 7)
)+
ggtitle("Which payment type used in sequential?")
order_payments %>%
#filter(payment_type == "voucher") %>%
group_by(order_id) %>%
reframe(sequential = paste0(payment_sequential, collapse= ", "),
types= paste0(payment_type, collapse= ", "),
installments= paste0( payment_installments, collapse = ", "),
Total = sum(payment_value)) %>%
filter(sequential != "1") %>%
mutate(Sequential = map_chr(strsplit(sequential, ","), ~
toString(sort(as.numeric(.x))))) %>%
mutate(Types = map_chr(strsplit(types, ","), ~
toString(sort(as.character(.x))))) %>%
mutate(Installments = map_chr(strsplit(installments, ","), ~
toString(sort(as.character(.x))))) %>%
# mutate(Sequential= as.factor(Sequential), Types = as.factor(Types)) %>%
group_by(Sequential, Types) %>%
reframe(Freq= n(), median_total = median(Total)) %>%
mutate(credit_card = lengths(regmatches(Types, gregexpr("credit_card", Types)))) %>%
mutate(voucher = purrr::map_int(Types, ~sum(gregexpr("voucher", .x, fixed=TRUE)[[1]] > 0))) %>%
mutate(debit_card = lengths(regmatches(Types, gregexpr("debit_card", Types)))) %>%
mutate(boleto = str_count(Types, 'boleto')) %>%
#mutate(number_of_drugs = map(str_split(gsub('credit_card|voucher', ',', str_trim(Types)), '\\s+'), ~ length(.x))) %>%
mutate(credit_card=(paste0(credit_card, ":credit"))) %>%
mutate(voucher = paste0(voucher, ":voucher")) %>%
mutate(debit_card= paste0(debit_card, ":debit")) %>%
mutate(boleto= paste0(boleto, ":boleto")) %>%
unite(col = "Types", credit_card:boleto, sep = ", ", remove = TRUE) %>%
mutate(Sequential = map_chr(strsplit(Sequential, ","), ~
toString(paste0(min(as.numeric(.x), na.rm=TRUE),":" ,max(as.numeric(.x), na.rm=TRUE))
)
)
) %>%
ggplot()+
aes(x = reorder(Sequential, desc(median_total)), y = median_total, fill= Types) +
geom_col()+
geom_text(aes(label = Freq), position = position_stack(vjust = .5))+
theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 7)
)+
ggtitle("What is the average amount of each type used sequentially?")
order_id
detail_orders %>%
group_by(order_id, customer_id, product_category_name_english) %>%
summarise(n_items = n(), .groups = "keep",
n_sellers= paste0(seller_id, collapse = ", ")) %>%
#arrange(match(order_id, unique(order_id)))
arrange(desc(n_items)) %>%
select(order_id, customer_id, n_sellers, product_category_name_english, everything()) %>%
head(1000) %>%
datatable()
detail_orders %>%
#filter(seller_id == "001cca7ae9ae17fb1caed9dfb1094831") %>%
#left_join(order_reviews, by = "order_id")
group_by(seller_id, order_id) %>%
summarise(n_items=n(),.groups = "keep",
unique_cat= unique(product_category_name_english),
Total_prise= sum(price)) %>%
#filter(seller_id =="2709af9587499e95e803a6498a5a56e9")
#arrange(desc(n_items)) %>%
left_join(order_reviews, by = "order_id") %>%
group_by(seller_id) %>%
summarise(n_reviews= n(), .groups = "keep",
median_score= median(review_score, na.rm=TRUE),
total_items= sum(n_items),
reviews_ids = paste0(review_id, collapse = ", ")) %>%
mutate(median_score= as.factor(median_score)) %>%
group_by(median_score) %>%
reframe(n_seller_id= n(),
n_reviews = sum(n_reviews)) %>%
ggplot()+
aes(x = reorder(median_score, desc(n_seller_id)), y = n_seller_id, fill= n_reviews) +
geom_col()+
geom_text(aes(label = n_reviews), position = position_stack(vjust = .5), color= "red")+
theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 7)
)+
ggtitle("How many seller_ids by review score")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 222 of `x` matches multiple rows in `y`.
## ℹ Row 57578 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
best_categories_reviews <-
detail_orders %>%
#filter(seller_id == "001cca7ae9ae17fb1caed9dfb1094831") %>%
#left_join(order_reviews, by = "order_id")
group_by(product_category_name_english, order_id) %>%
summarise(n_items=n(),.groups = "keep",
Total_prise= sum(price),
unique_cat= unique(product_category_name_english)) %>%
#filter(seller_id =="2709af9587499e95e803a6498a5a56e9")
#arrange(desc(n_items)) %>%
left_join(order_reviews, by = "order_id") %>%
group_by(product_category_name_english) %>%
summarise(n_reviews= n(), .groups = "keep",
median_score= median(review_score, na.rm=TRUE),
total_items= sum(n_items),
reviews_ids = paste0(review_id, collapse = ", ")) %>%
mutate(median_score= as.factor(median_score))
## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 709 of `x` matches multiple rows in `y`.
## ℹ Row 51688 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
best_categories_reviews %>%
ggplot()+
aes(x = median_score, y = n_reviews, color= product_category_name_english)+
geom_point() +
#geom_text(aes(label = n_reviews), position = position_stack(vjust = .5), color= "red")+
theme(legend.position = "bottom",legend.box="horizontal", legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 7)
)+
ggtitle("Which categories have best scores?")+
guides(color=guide_legend(nrow=7, byrow=TRUE))
order_reviews %>%
group_by(order_id) %>%
filter(n()>1) %>%
mutate(reviews_id = paste0(review_id, collapse= ", "),
comment_title= paste0(review_comment_title, collapse= ", "),
comment_message= paste0(review_comment_message, collapse= ", "),
mean_score= round(mean(review_score, na.rm = TRUE), digits = 0)) %>%
select(order_id,mean_score, comment_title, comment_message, reviews_id ) %>%
head(1000) %>%
datatable()
foo <-
detail_orders %>%
group_by(order_id, seller_id) %>%
summarise(n_orders = n(), .groups = "keep") %>%
#arrange(match(seller_id, unique(seller_id)))
left_join(order_reviews, by = "order_id") %>%
group_by(seller_id) %>%
summarise(mean_score= round(mean(review_score, na.rm = TRUE), digits = 0),
n_sellers= n(),
n_reviews= length(review_id)) %>%
#arrange(desc(n_reviews)) %>%
mutate(mean_score= as.factor(mean_score)) %>%
arrange(desc(n_sellers))
## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 87 of `x` matches multiple rows in `y`.
## ℹ Row 11574 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
foo %>%
ggplot() +
aes(x = mean_score, y = n_sellers)+
geom_col(na.rm=TRUE) +
#geom_text(aes(label = Freq), position = position_stack(vjust = .5))+
theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 7)
)+
ggtitle("Arranage seller_id by review score for multiple reviews")
customers_without_order <-
customers |>
anti_join(orders_customer, by = "customer_id") |>
group_by(customer_unique_id) |>
mutate(Freq = n()) |>
arrange(desc(Freq))
customers_without_order |>
head(100) |>
datatable()
print(paste0("There are ",
customers_without_order |>
distinct(customer_unique_id)|>
nrow(),
" customers without orders"))
## [1] "There are 761 customers without orders"
## visualize categories that were purchased by customers with more than 6 items per order
orders_customer |>
left_join(customers, by = "customer_id") |>
group_by(customer_unique_id, product_category_name_english) |>
summarise(n_purchased= n(), .groups = "keep") |>
#arrange(desc(n_purchased)) |>
filter(n_purchased > 10) |>
ggplot() +
aes(x = reorder(product_category_name_english, desc(n_purchased), sum), y = n_purchased, fill= customer_unique_id) +
geom_col() +
geom_text(aes(label = n_purchased), position = position_stack(vjust = .5))+
theme(legend.position = "bottom", axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 14)
)+
ggtitle("The best-selling categories using unique customer id")
orders_customer |>
#left_join(customers, by = "customer_id") |>
group_by(customer_id, product_category_name_english) |>
summarise(n_purchased= n(), .groups = "keep") |>
#arrange(desc(n_purchased)) |>
filter(n_purchased > 10) |>
ggplot() +
aes(x = reorder(product_category_name_english, desc(n_purchased), sum), y = n_purchased, fill= customer_id) +
geom_col() +
geom_text(aes(label=n_purchased), position= position_stack(vjust = .5)) +
theme(legend.position = "bottom", axis.title.x = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
plot.title = element_text(size = 14)
)+
ggtitle("The best-selling categories using customer id")
We set limit upper than 10 purchased for these two plots.
customer_unique_id
sums purchased items for multiple
customer_id
.
For this reason that appears more customer_unique_id
than customer_id
in the legend.
#
# orders_seller <- detail_orders |>
# group_by( order_id, seller_id, product_category_name_english) |>
# summarise(n_items= n(), .groups = "keep") |>
# group_by( order_id,seller_id, product_category_name_english, n_items) |>
# summarise(n_orders= n(), .groups = "keep") |>
# #arrange(desc(n_items))
# arrange(desc(n_orders))
#
# orders_seller |>
# head(1000) |>
# datatable()
# orders_seller |>
# filter(n_items > 6) |>
# ggplot() +
# aes(x = reorder(product_category_name_english, desc(n_items), sum), y = n_items, fill= seller_id) +
# geom_col() +
# theme(legend.position = "bottom", axis.title.x = element_blank(),
# axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
# plot.title = element_text(size = 14)
# )
detail_orders |>
#filter(order_id== "088683f795a3d30bfd61152c4fabdfb2")
select(order_id, seller_id, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date,
order_delivered_customer_date, order_estimated_delivery_date, shipping_limit_date) |>
mutate(delay_approving = (order_approved_at - order_purchase_timestamp)) |>
mutate(delay_delivery_carrier= (order_delivered_carrier_date - order_approved_at)) |>
mutate(delay_delivery_customer= (order_delivered_customer_date - order_delivered_carrier_date)) |>
mutate(delta_estimation = (order_estimated_delivery_date - order_delivered_customer_date)) |>
mutate(shipping_days = ((order_delivered_customer_date - order_approved_at))) |>
mutate(shipping_days_limit = (shipping_limit_date - order_approved_at)) |>
gather("step", "Dates", 3:8) |>
filter(seller_id %in% c("5b51032eddd242adc84c38acab88f23d","48436dade18ac8b2bce089ec2a041202", "dd7ddc04e1b6c2c614352b383efe2d36", "c826c40d7b19f62a09e2d7c5e7295ee2", "955fee9216a65b617aa5c0531780ce60")) |> # "
pivot_longer(-c(order_id,seller_id, delta_estimation, shipping_days, shipping_days_limit, step, Dates)) |>
rename(Seconds= value, Delays = name) |>
pivot_longer(-c(order_id,seller_id, shipping_days, step, Dates, Delays,Seconds)) |>
rename(Thresholds= name) |>
#mutate(Time = seconds_to_period(Seconds)) |>
ggplot(aes(x = Dates, y= Seconds, fill = Delays)) +
geom_area(na.rm=TRUE)+
#scale_x_datetime( breacks= date_breaks("12 hours") , labels= date_format("%H:%M %d-%b"))+
#scale_y_time(date_label= "%m")+
geom_line(aes(x=Dates, y =value, color=Thresholds), na.rm = TRUE)+
scale_color_manual(values=c('orange', 'red'))+
facet_wrap(~seller_id, nrow = 5, scales = "free_y") +
ggtitle("Delay for each step: from purchase to delivery")
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.
If delta_estimation
is negative means that the order
is delivered after estimated date
The shinpping_days_limits
is the limit date-time of
the delivery. Higher date is considered not acceptble.
The main delay is shown during delivery from the carriers to customers (Blue area).
c826c40d7b19f62a09e2d7c5e7295ee2
is the seller that
has more unamed products.
955fee9216a65b617aa5c0531780ce60
is the best seller
of one specific product.
customer_id
and
customer_unique_id
?## count customer_id
paste0("there is ",
customers |>
distinct(customer_id) |>
nrow(),
" Customer_id in this data")
## [1] "there is 99441 Customer_id in this data"
paste0("there is ",
customers |> distinct(customer_unique_id) |> nrow(),
" Customer_unique_id in this data")
## [1] "there is 96096 Customer_unique_id in this data"
paste0("There is about ",
customers |> distinct(customer_id) |> nrow() - customers |> distinct(customer_unique_id) |> nrow(),
" rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id " )
## [1] "There is about 3345 rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id "
Customer_unique_id
and list paires with
Custumer_id
## What is the difference between customer_id and customer_unique_id
list_of_unique_id_with_multiple_customer_id <- customers |>
group_by(customer_unique_id) |>
## filter only duplicated
filter(n()>1) |>
arrange(desc(customer_unique_id))
paste0("Here is a sample of table with ",
list_of_unique_id_with_multiple_customer_id |> nrow(),
" rows. ")
## [1] "Here is a sample of table with 6342 rows. "
customer_unique_id
corresponding
to multiple customer_id
.Customer_unique_id
and their
zip_code_prefix
customers_id
equal the duplicates in Lat/lon for the same
Zip code?Conpare with the last table.
## Filter only Zip code for customer_unique_id
customers |>
filter(customer_unique_id %in% duplicated_customer_unique_id) |>
group_by(customer_id, customer_unique_id, zip_code_prefix) |>
#distinct(customer_unique_id) |>
arrange(desc(customer_unique_id)) |>
filter(zip_code_prefix == 99750) |>
datatable()
NO!
Only zip_code_prefix is not enough to distinct between geolocations of customers and sellers
For example there is only one customer with the zip_code equal to 99750.
But in geolicalisation we have 14 Positions related to this zip_code
paste0("There are ",
sellers |>
group_by(seller_id) |>
#summarise(N= n()) |>
filter(n()>1),
"Duplited seller_id.")
## [1] "There are character(0)Duplited seller_id."
## [2] "There are integer(0)Duplited seller_id."
## [3] "There are character(0)Duplited seller_id."
## [4] "There are character(0)Duplited seller_id."
Zip Code is not enough to filter only seller adresses. There is also customers with the same zip code
sellers_geolocation <- geolocation |>
filter(zip_code_prefix %in% sellers$zip_code_prefix) %>% ## IS NOT ENOUTH there are the same zip code for multiple sellers and customers.
left_join(y = sellers, by = "zip_code_prefix")
lng1 <- min(sellers_geolocation$Longitude)
lng2 <- max(sellers_geolocation$Longitude)
lat1 <- min(sellers_geolocation$Latitude)
lat2 <- max(sellers_geolocation$Latitude)
leaflet(sellers_geolocation) %>%
addTiles() %>%
setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
addRectangles(
lng1= lng1, lat1= lat1,
lng2= lng2, lat2=lat2,
fillColor = "transparent"
) %>%
addMarkers(~Longitude, ~Latitude,
popup = ~seller_city, label = ~seller_state,
clusterOptions = markerClusterOptions())
customers_geolocation <- geolocation |>
#rename(zip_code_prefix= geolocation_zip_code_prefix) |>
filter(zip_code_prefix %in% unique(customers$zip_code_prefix)) |>
left_join(y = customers, by = "zip_code_prefix")
lng1 <- min(customers_geolocation$Longitude)
lng2 <- max(customers_geolocation$Longitude)
lat1 <- min(customers_geolocation$Latitude)
lat2 <- max(customers_geolocation$Latitude)
# leaflet(customers_geolocation) %>%
# addTiles() %>%
# setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
# addRectangles(
# lng1= lng1, lat1= lat1,
# lng2= lng2, lat2=lat2,
# fillColor = "transparent"
# ) %>%
# addMarkers(~Longitude, ~Latitude,
# popup = ~customer_city, label = ~customer_state,
# clusterOptions = markerClusterOptions())