0.1 Contexte

This dataset was generously provided by Olist, the largest department store in Brazilian marketplaces. Olist connects small businesses from all over Brazil to channels without hassle and with a single contract. Those merchants are able to sell their products through the Olist Store and ship them directly to the customers using Olist logistics partners. See more on our website: www.olist.com.

The data set comes from kaggle competition named Brazilian E-Commerce Public Dataset by Olist.

0.2 Set and load packages for Python and R

# Set python environment and version in RStudio ;-)
reticulate::use_python("/usr/bin/python3.10", required = TRUE)
reticulate::py_config()
## python:         /usr/bin/python3.10
## libpython:      /usr/lib/python3.10/config-3.10-x86_64-linux-gnu/libpython3.10.so
## pythonhome:     //usr://usr
## version:        3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
## numpy:          /home/kirus/.local/lib/python3.10/site-packages/numpy
## numpy_version:  1.26.0
## 
## NOTE: Python version was forced by use_python() function
# Load needed R packages
library(tidyverse)
library(data.table)
library(leaflet)
library(stringr)
library(gridExtra)
#require(maps)
library(ggplot2)
library(lubridate)
#require(mapview)
#require(scales)
#require(RColorBrewer)
require(lubridate)
#library(plotly)
#require(gganimate)
#require(gifski)
#library(caret)
library(DT)
library(kableExtra)

1 Load Datasets

customers <- fread("data/olist_customers_dataset.csv")
geolocation <- fread("data/olist_geolocation_dataset.csv")
order_items <- fread("data/olist_order_items_dataset.csv")
order_payments <- fread("data/olist_order_payments_dataset.csv")
order_reviews <- fread("data/olist_order_reviews_dataset.csv")
orders <- fread("data/olist_orders_dataset.csv")
products <- fread("data/olist_products_dataset.csv")
sellers <- fread("data/olist_sellers_dataset.csv")
category_translation <- fread("data/product_category_name_translation.csv")

1.1 Customers

customers <- customers |>  
   rename(zip_code_prefix = customer_zip_code_prefix)

customers |> head(1000) |>
datatable()

1.2 Geolocation

geolocation <- geolocation |> 
  rename(Longitude = geolocation_lng) |>
  rename(Latitude = geolocation_lat) |>
  rename(zip_code_prefix= geolocation_zip_code_prefix)

geolocation |> 
  head(1000) |>
  datatable()

1.3 Items

order_items |> 
  head(1000) |>
  datatable()

1.4 Payments

order_payments |>
  head(1000) |>
  datatable()

1.5 Reviews

order_reviews |> 
  head(1000) |>
  datatable()

1.6 Orders

orders |> 
  head(1000) |>
  datatable()

1.7 Categories

category_translation |> 
  head(1000) |>
  datatable()

1.8 Products

products |> 
  head(100) |>
  datatable()

1.9 Sellers

sellers <- sellers |>
          rename(zip_code_prefix = seller_zip_code_prefix)

sellers |> 
  head(1000) |>
  datatable()

2 Dataset Engineering

Compute:

\[Volume = product\_length\_cm * product\_height\_cm * product\_width\_cm / 1000\]

2.1 check for missing value in dimensions used to compute Volumes

## check for  missing value in specific columns
product_id_with_missed_data <-
products |>
select(product_id,product_category_name, product_weight_g,
       product_length_cm, product_height_cm, product_width_cm) |>
 filter_all(any_vars(is.na(.)))

product_id_with_missed_data |>
   knitr::kable() |>   kable_styling() |>
  scroll_box(width = "900px", height = "300px") 
product_id product_category_name product_weight_g product_length_cm product_height_cm product_width_cm
09ff539a621711667c43eba6a3bd8466 bebes NA NA NA NA
5eb564652db742ff8f28759cd8d2652a NA NA NA NA

2.2 Which sellers have product_id with missing data?

sellers_missing_data <-
order_items |>
  filter(product_id %in% product_id_with_missed_data$product_id) %>%
  group_by(product_id, seller_id, shipping_limit_date, price, freight_value) %>%
  count() %>%
  arrange(shipping_limit_date)

sellers_missing_data %>%
  datatable()

2.2.1 Trend of the price of products with missed informations

sellers_missing_data %>%
  #mutate(total= price+ freight_value) %>%
  pivot_longer(-c(product_id, seller_id,n,shipping_limit_date)) %>%
  ggplot(aes(x = shipping_limit_date, y= value, fill = name)) +
  geom_area()+
    facet_wrap(~product_id, scales = "free_y")+
geom_point(aes(x = shipping_limit_date, y= value, color = name))

* Increase of the price during the period of study.

2.2.2 Compute the Volumes of Products

## add english name and weight range catagories

products <- products |>
            left_join(y = category_translation, by = "product_category_name") |>
            mutate(`volume dm³` = product_length_cm * product_height_cm * product_width_cm/1000) %>%
           rename(weight = product_weight_g) %>%
           mutate(`weight kg` = weight/1000)



products |> 
  head(1000)  |>
  datatable()

2.2.3 Mutate the Weight range of products

products <- products |>
    mutate(weight_range = if_else(`weight kg` <= 1,"0-1 kg", 
                                if_else(`weight kg`> 1 &  `weight kg` <= 5, "1-5 kg",
                                        if_else(`weight kg` > 5 &`weight kg` <=10, "5-10 kg",
                                                if_else(`weight kg` > 10 & `weight kg` <= 20, "10-20 kg",
                                                        if_else(`weight kg` > 20 & `weight kg` <= 30, "20-30 kg",
                                                                if_else(`weight kg` > 30 & `weight kg` <= 42, "30-42 kg", NA )))))))

products |> 
  head(1000)  |>
  datatable()
## glimpse the density of products volumes
products |>
  drop_na(`volume dm³`, weight_range) |>
  ggplot() +
  aes(x=`volume dm³`, color= weight_range, fill= weight_range, na.rm = TRUE)+
  geom_density(adjust= 10, alpha=0.1)
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

  • The main volume of products are less than 5 dm³

2.2.4 Mutate Volume Range

## add volume range
products <- products |>
  mutate(volume_range = if_else(`volume dm³` <= 1,"0-1 dm³", 
                                if_else(`volume dm³`> 1 &  `volume dm³` <= 5, "1-5 dm³",
                                        if_else(`volume dm³` > 5 &`volume dm³` <=10, "5-10 dm³",
                                                if_else(`volume dm³` > 10 & `volume dm³` <= 20, "10-20 dm³",
                                                        if_else(`volume dm³` > 20 & `volume dm³` <= 30, "20-30 dm³",
                                                             if_else(`volume dm³` > 30 , ">30 dm3", NA ))))))) 


products |> head() |>
  datatable()
  #knitr::kable() |>   kable_styling() |>
  #scroll_box(width = "900px", height = "300px")

3 Explore Products

3.1 list categories and count same description length

## list categories
products %>%
  #group_by(product_category_name_english, product_name_length) |>
  #mutate(Qt=n()) |>
  pull(product_category_name_english) |> 
  unique() |> as.data.frame() |>
  rownames_to_column(var = "index") |> 
  knitr::kable() |>  kable_styling() |>
  scroll_box(width = "300px", height = "500px")
index unique(products %>% pull(product_category_name_english))
1 perfumery
2 art
3 sports_leisure
4 baby
5 housewares
6 musical_instruments
7 cool_stuff
8 furniture_decor
9 home_appliances
10 toys
11 bed_bath_table
12 construction_tools_safety
13 computers_accessories
14 health_beauty
15 luggage_accessories
16 garden_tools
17 office_furniture
18 auto
19 electronics
20 fashion_shoes
21 telephony
22 stationery
23 fashion_bags_accessories
24 computers
25 home_construction
26 watches_gifts
27 construction_tools_construction
28 pet_shop
29 small_appliances
30 agro_industry_and_commerce
31 NA
32 furniture_living_room
33 signaling_and_security
34 air_conditioning
35 consoles_games
36 books_general_interest
37 costruction_tools_tools
38 fashion_underwear_beach
39 fashion_male_clothing
40 kitchen_dining_laundry_garden_furniture
41 industry_commerce_and_business
42 fixed_telephony
43 construction_tools_lights
44 books_technical
45 home_appliances_2
46 party_supplies
47 drinks
48 market_place
49 la_cuisine
50 costruction_tools_garden
51 fashio_female_clothing
52 home_confort
53 audio
54 food_drink
55 music
56 food
57 tablets_printing_image
58 books_imported
59 small_appliances_home_oven_and_coffee
60 fashion_sport
61 christmas_supplies
62 fashion_childrens_clothes
63 dvds_blu_ray
64 arts_and_craftmanship
65 furniture_bedroom
66 cine_photo
67 diapers_and_hygiene
68 flowers
69 home_comfort_2
70 security_and_services
71 furniture_mattress_and_upholstery
72 cds_dvds_musicals

3.2 List product_id by categories

## count how many products by category and count how many products with the same description length
products %>%
  #filter(product_category_name== "cama_mesa_banho") |>
  #mutate(product_description_lenght= as.factor(product_description_lenght)) |>
  group_by(product_category_name_english, product_name_lenght, product_description_lenght) |>
  #mutate(Qt= n()) |>
  summarise(products_with_same_name_lenght = n(), .groups='keep') |>
  group_by(product_description_lenght) |>
  #filter(product_description_lenght=="93") |>
  mutate(products_with_same_description_lenght = n()) |>
    #head(1000) |>
  #arrange(desc(Qt)) |>
  #filter(product_category_name_english %in% c("baby", "housewares")) |>
 arrange(desc(products_with_same_name_lenght)) |>
  head(100) |>
  datatable()
  • There are 610 products without description

  • We assume that products from the same category and have different name lenght are different.

  • We assume that products with the same name length and different description length are same bu from different sellers.

3.3 Check if products with the same name lenght and different description lenght are same but from different seller

3.3.1 Group products by sellers {best_seller}

group_pdt_sellers <- 
  order_items |>
  group_by(product_id, seller_id) |>
  #mutate(Freq = n()) |>
  summarise(Freq= n(), .groups="keep") |>
  arrange(desc(Freq))

group_pdt_sellers |>
  head(100) |>
  datatable()
  • This table lists the best-selling products and their sellers

3.4 Which caterogies are the best-selling by seller?

best_selling <- products |>
  left_join(order_items , by = "product_id") |>
  group_by(product_category_name_english, seller_id) |>
  mutate(n_sold = n()) |>
  distinct(product_category_name_english, seller_id, n_sold, .keep_all = FALSE) |>
  #select( product_category_name_english,product_name_lenght, seller_id, product_id, n_sold )|>
  arrange(desc(n_sold))


best_selling |>
  filter(n_sold >620)|>
  ggplot() +
  aes(x = reorder(product_category_name_english, desc(n_sold), sum), y = n_sold, fill= seller_id) +
  geom_col() +
  geom_text(aes(label = n_sold), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("Which caterogies are the best-selling by seller?")

3.5 Name length parameter is not specific to product or seller

 products |>
           left_join(order_items , by = "product_id") |>
      group_by(  seller_id, product_category_name_english, product_name_lenght) |>
  summarise(products_with_same_name_lenght= n(), .groups = "keep") |>
  #mutate(products_with_same_name_lenght= n())|>
     #select(product_id, product_category_name_english, product_name_lenght, seller_id, products_with_same_name_lenght) |>
  unique() |>
  arrange(!is.na(product_name_lenght)) |>
      arrange(desc(products_with_same_name_lenght)) |>
 head(500) |>
  datatable()
  • We find which seller has more missing product name: c826c40d7b19f62a09e2d7c5e7295ee2. We can find his geolocation.

  • Several items with the same product length name exist for the same seller. It seems the same product but with different color or version or option.

  • It is possible that products belonging to different categories have the same name length and sold by different sellers.

CONCLUSION: The product_name_lenght is not specific to a specific product or seller.

3.6 Display the ratio weight/volume of products

products |>
  drop_na(`weight kg`, `volume dm³`, `product_category_name_english`) |>
  ggplot() + 
  aes(y = `weight kg` , x = `product_category_name_english`, color= weight_range) +#, size=`volume dm³` 
  geom_point() +
    theme(legend.position = "left", #axis.text.y = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 14)
        ) +
  #facet_wrap(~weight_range, nrow = 6, scales = "free_y")+
  facet_wrap(~volume_range, nrow = 6, scales= "free_y")

  • We can note that some products with small volume but with high weight and vis-versa.

  • The must sold product have volume upper than 30 dm³.

4 Explore Orders

## join items and orders and products
detail_orders <- order_items |>
  left_join(orders, by = "order_id") |>
  left_join(products, by = c("product_id")) |>
  select(order_id, product_category_name_english, product_id,  seller_id, customer_id,   order_item_id, everything()) 


detail_orders|>
  head(1000) |>
  datatable()

4.1 How many orders by customer and How many items by order

## group by order and check items
orders_customer <- 
  detail_orders |>
  select(order_id, customer_id, product_id, product_category_name_english) |>
  group_by( order_id, customer_id) |>
  mutate(n_orders= n()) |>
  #summarise(n_items= n(), .groups = "keep") |>
  group_by( order_id, customer_id, product_id) |>
  mutate(n_items= n()) |>
  ungroup() |>
  arrange(desc(n_orders))

orders_customer |>
  head(1000) |>
  datatable()
  • The maximum number of orders per customer is 21.

  • The maximum number of items per orders is 20.

4.2 How many Custumer_id / Unique_customer_id / Customer that purchased items?

## check how many customers
n_customer_id <-
customers |>
   select(customer_id) |>
  unique() |>
  nrow()


n_customer_id_with_order <-
orders_customer |>
 select(customer_id) |>
  unique() |>
  nrow()

n_customer_unique_id <-
customers |>
  select(customer_unique_id) |>
  unique() |>
  nrow()

print(paste0("there are ", n_customer_id, " Customers in this study"))
## [1] "there are 99441 Customers in this study"
print(paste0("there are ", n_customer_id_with_order, " Customers that purchased items in this study"))
## [1] "there are 98666 Customers that purchased items in this study"
print(paste0("there are ", n_customer_unique_id, " UNIQUE customer in this study"))
## [1] "there are 96096 UNIQUE customer in this study"

4.3 Orders Payment

order_payments %>%
  group_by(order_id) %>%
  mutate(Total = sum(payment_value)) %>%
  head(1000) %>%
  datatable()

4.3.1 missing orders payment in orders_items

missing_orders <- order_items %>%
  group_by(order_id) %>%
  summarise(price_value = sum(price), total_freight = sum(freight_value) ) %>%
  anti_join(order_payments, by="order_id")

missing_orders %>%
  datatable()

4.3.2 Which payment type used in sequential?

order_payments  %>%
  #filter(payment_type == "voucher") %>%
  group_by(order_id) %>%
  reframe(sequential = paste0(payment_sequential, collapse= ", "), 
         types= paste0(payment_type, collapse= ", "),
         installments= paste0( payment_installments, collapse = ", "),
         Total = sum(payment_value)) %>% 
  filter(sequential != "1") %>%
  mutate(Sequential = map_chr(strsplit(sequential, ","), ~ 
                  toString(sort(as.numeric(.x))))) %>%
  mutate(Types = map_chr(strsplit(types, ","), ~ 
                  toString(sort(as.character(.x))))) %>%
  mutate(Installments = map_chr(strsplit(installments, ","), ~ 
                  toString(sort(as.character(.x))))) %>%
 # mutate(Sequential= as.factor(Sequential), Types = as.factor(Types)) %>%
  group_by(Sequential, Types) %>%
  reframe(Freq= n(), median_total = median(Total)) %>%
  mutate(credit_card = lengths(regmatches(Types, gregexpr("credit_card", Types)))) %>%
  mutate(voucher = purrr::map_int(Types, ~sum(gregexpr("voucher", .x, fixed=TRUE)[[1]] > 0))) %>%
  mutate(debit_card = lengths(regmatches(Types, gregexpr("debit_card", Types)))) %>%
  mutate(boleto = str_count(Types, 'boleto')) %>%
  #mutate(number_of_drugs = map(str_split(gsub('credit_card|voucher', ',', str_trim(Types)), '\\s+'), ~ length(.x))) %>%
  mutate(credit_card=(paste0(credit_card, ":credit"))) %>%
  mutate(voucher = paste0(voucher, ":voucher")) %>%
  mutate(debit_card= paste0(debit_card, ":debit")) %>%
  mutate(boleto= paste0(boleto, ":boleto")) %>%
  unite(col = "Types", credit_card:boleto, sep = ", ", remove = TRUE) %>%
  mutate(Sequential = map_chr(strsplit(Sequential, ","), ~ 
                  toString(paste0(min(as.numeric(.x), na.rm=TRUE),":" ,max(as.numeric(.x), na.rm=TRUE))
                           )
                  )
  ) %>%
  #select(order_id, Sequential, Types, Installments, Total, Freq,everything()) %>%
  #arrange(Sequential)
  ggplot() +
  aes(x = reorder(Sequential, desc(Freq)), y = Freq, fill = Types) +
    geom_col(na.rm = TRUE) +
    ggforce::facet_zoom(ylim = c(0, 75), zoom.data = ifelse(a >= 75, NA, FALSE))+
    #ggplot2::scale_y_continuous(trans = squash_axis(5, 95, 10)) +
  geom_text(aes(label = median_total), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("Which payment type used in sequential?")

order_payments  %>%
  #filter(payment_type == "voucher") %>%
  group_by(order_id) %>%
  reframe(sequential = paste0(payment_sequential, collapse= ", "), 
         types= paste0(payment_type, collapse= ", "),
         installments= paste0( payment_installments, collapse = ", "),
         Total = sum(payment_value)) %>% 
  filter(sequential != "1") %>%
  mutate(Sequential = map_chr(strsplit(sequential, ","), ~ 
                  toString(sort(as.numeric(.x))))) %>%
  mutate(Types = map_chr(strsplit(types, ","), ~ 
                  toString(sort(as.character(.x))))) %>%
  mutate(Installments = map_chr(strsplit(installments, ","), ~ 
                  toString(sort(as.character(.x))))) %>%
 # mutate(Sequential= as.factor(Sequential), Types = as.factor(Types)) %>%
  group_by(Sequential, Types) %>%
  reframe(Freq= n(), median_total = median(Total)) %>%
  mutate(credit_card = lengths(regmatches(Types, gregexpr("credit_card", Types)))) %>%
  mutate(voucher = purrr::map_int(Types, ~sum(gregexpr("voucher", .x, fixed=TRUE)[[1]] > 0))) %>%
  mutate(debit_card = lengths(regmatches(Types, gregexpr("debit_card", Types)))) %>%
  mutate(boleto = str_count(Types, 'boleto')) %>%
  #mutate(number_of_drugs = map(str_split(gsub('credit_card|voucher', ',', str_trim(Types)), '\\s+'), ~ length(.x))) %>%
  mutate(credit_card=(paste0(credit_card, ":credit"))) %>%
  mutate(voucher = paste0(voucher, ":voucher")) %>%
  mutate(debit_card= paste0(debit_card, ":debit")) %>%
  mutate(boleto= paste0(boleto, ":boleto")) %>%
  unite(col = "Types", credit_card:boleto, sep = ", ", remove = TRUE) %>%
  mutate(Sequential = map_chr(strsplit(Sequential, ","), ~ 
                  toString(paste0(min(as.numeric(.x), na.rm=TRUE),":" ,max(as.numeric(.x), na.rm=TRUE))
                           )
                  )
  ) %>%
  ggplot()+
  aes(x = reorder(Sequential, desc(median_total)), y = median_total, fill= Types) +
  geom_col()+
  geom_text(aes(label = Freq), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("What is the average amount of each type used sequentially?")

5 Explore Orders Reviews

5.1 There are multiple Reviews for same order_id

order_reviews %>%
  group_by(order_id) %>% 
  filter(n()>1) %>%
  arrange(order_id) %>%
  #head(500) %>%
  datatable()

5.2 Some orders are composed by items purchased from different seller_id.

detail_orders %>%
   group_by(order_id, customer_id, product_category_name_english) %>%
  summarise(n_items = n(), .groups = "keep",
            n_sellers= paste0(seller_id, collapse = ", ")) %>%
  #arrange(match(order_id, unique(order_id)))
  arrange(desc(n_items)) %>%
  select(order_id, customer_id, n_sellers, product_category_name_english,  everything()) %>%
    head(1000) %>%
    datatable()

5.3 Classify seller_id by review scores

detail_orders %>%
  #filter(seller_id == "001cca7ae9ae17fb1caed9dfb1094831") %>%
  #left_join(order_reviews, by = "order_id") 
  group_by(seller_id, order_id) %>%
  summarise(n_items=n(),.groups = "keep",
            unique_cat= unique(product_category_name_english),
            Total_prise= sum(price)) %>%
  #filter(seller_id =="2709af9587499e95e803a6498a5a56e9")
  #arrange(desc(n_items)) %>%
  left_join(order_reviews, by = "order_id") %>%
  group_by(seller_id) %>%
  summarise(n_reviews= n(), .groups = "keep",
            median_score=  median(review_score, na.rm=TRUE),
            total_items= sum(n_items),
            reviews_ids = paste0(review_id, collapse = ", ")) %>%
  mutate(median_score= as.factor(median_score))  %>%
  group_by(median_score) %>%
  reframe(n_seller_id= n(),
            n_reviews = sum(n_reviews)) %>%
  ggplot()+
  aes(x = reorder(median_score, desc(n_seller_id)), y = n_seller_id, fill= n_reviews) +
  geom_col()+
  geom_text(aes(label = n_reviews), position = position_stack(vjust =  .5), color= "red")+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("How many seller_ids by review score")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 222 of `x` matches multiple rows in `y`.
## ℹ Row 57578 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

5.4 Which categories have best scores?

best_categories_reviews <- 
detail_orders %>%
  #filter(seller_id == "001cca7ae9ae17fb1caed9dfb1094831") %>%
  #left_join(order_reviews, by = "order_id") 
  group_by(product_category_name_english, order_id) %>%
  summarise(n_items=n(),.groups = "keep",
            Total_prise= sum(price),
            unique_cat= unique(product_category_name_english)) %>%
  #filter(seller_id =="2709af9587499e95e803a6498a5a56e9")
  #arrange(desc(n_items)) %>%
  left_join(order_reviews, by = "order_id") %>%
  group_by(product_category_name_english) %>%
  summarise(n_reviews= n(), .groups = "keep",
            median_score=  median(review_score, na.rm=TRUE),
            total_items= sum(n_items),
            reviews_ids = paste0(review_id, collapse = ", ")) %>%
  mutate(median_score= as.factor(median_score))
## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 709 of `x` matches multiple rows in `y`.
## ℹ Row 51688 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
best_categories_reviews %>%
  select(-reviews_ids) %>%
datatable()
best_categories_reviews %>%
  ggplot()+
  aes(x = median_score, y = n_reviews, color= product_category_name_english)+
  geom_point() +
  #geom_text(aes(label = n_reviews), position = position_stack(vjust =  .5), color= "red")+
  theme(legend.position = "bottom",legend.box="horizontal", legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("Which categories have best scores?")+
guides(color=guide_legend(nrow=7, byrow=TRUE)) 

5.5 Group reviews per order_id

order_reviews %>%
  group_by(order_id) %>% 
    filter(n()>1) %>%
  mutate(reviews_id = paste0(review_id, collapse= ", "),
         comment_title= paste0(review_comment_title, collapse= ", "),
         comment_message= paste0(review_comment_message, collapse= ", "),
         mean_score= round(mean(review_score, na.rm = TRUE), digits = 0)) %>%
  select(order_id,mean_score, comment_title, comment_message, reviews_id )  %>%
  head(1000) %>%
  datatable()

5.6 Arranage seller_id by review scores

foo <-
  detail_orders %>%
   group_by(order_id, seller_id) %>%
  summarise(n_orders = n(), .groups = "keep") %>%
  #arrange(match(seller_id, unique(seller_id)))
  left_join(order_reviews, by = "order_id") %>%
  group_by(seller_id) %>%
  summarise(mean_score= round(mean(review_score, na.rm = TRUE), digits = 0),
            n_sellers= n(),
            n_reviews= length(review_id)) %>%
  #arrange(desc(n_reviews)) %>%
  mutate(mean_score= as.factor(mean_score)) %>%
  arrange(desc(n_sellers))
## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 87 of `x` matches multiple rows in `y`.
## ℹ Row 11574 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
foo %>% 
  ggplot() +
  aes(x = mean_score, y = n_sellers)+
  geom_col(na.rm=TRUE) +
  #geom_text(aes(label = Freq), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("Arranage seller_id by review score for multiple reviews")

#grid.arrange(p1, p2, ncol = 2, main = "Main title")

5.7 List all Customers without purchase

customers_without_order <-
customers |>
  anti_join(orders_customer, by = "customer_id") |>
  group_by(customer_unique_id) |>
  mutate(Freq = n()) |>
  arrange(desc(Freq))


customers_without_order |>
  head(100) |>
  datatable()
print(paste0("There are ",
customers_without_order |>
  distinct(customer_unique_id)|>
  nrow(),
" customers without orders"))
## [1] "There are 761 customers without orders"

5.8 Visualize the best-selling categories by customer

## visualize  categories that were purchased by customers with more than 6 items per order
orders_customer |>
  left_join(customers, by = "customer_id") |>
  group_by(customer_unique_id, product_category_name_english) |>
  summarise(n_purchased= n(), .groups = "keep") |>
  #arrange(desc(n_purchased)) |>
  filter(n_purchased > 10) |>
  ggplot() +
  aes(x = reorder(product_category_name_english, desc(n_purchased), sum), y = n_purchased, fill= customer_unique_id) +
  geom_col() +
    geom_text(aes(label = n_purchased), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom", axis.title.x = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
         
        plot.title = element_text(size = 14)
        )+
  ggtitle("The best-selling categories using  unique customer id")

orders_customer |>
  #left_join(customers, by = "customer_id") |>
  group_by(customer_id, product_category_name_english) |>
  summarise(n_purchased= n(), .groups = "keep") |>
  #arrange(desc(n_purchased)) |>
  filter(n_purchased > 10) |>
  ggplot() +
  aes(x = reorder(product_category_name_english, desc(n_purchased), sum), y = n_purchased, fill= customer_id) +
  geom_col() +
  geom_text(aes(label=n_purchased), position= position_stack(vjust = .5)) +
  theme(legend.position = "bottom", axis.title.x = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
         
        plot.title = element_text(size = 14)
        )+
    ggtitle("The best-selling categories using customer id")

  • We set limit upper than 10 purchased for these two plots.

  • customer_unique_id sums purchased items for multiple customer_id.

  • For this reason that appears more customer_unique_id than customer_id in the legend.

# 
# orders_seller <- detail_orders |>
#   group_by( order_id, seller_id, product_category_name_english) |>
#   summarise(n_items= n(), .groups = "keep") |>
#   group_by( order_id,seller_id, product_category_name_english, n_items) |>
#   summarise(n_orders= n(), .groups = "keep") |>
#   #arrange(desc(n_items)) 
#   arrange(desc(n_orders))
# 
# orders_seller |>
#   head(1000) |>
#   datatable()
# orders_seller |>
#   filter(n_items > 6) |>
#   ggplot() +
#   aes(x = reorder(product_category_name_english, desc(n_items), sum), y = n_items, fill= seller_id) +
#   geom_col() +
#   theme(legend.position = "bottom", axis.title.x = element_blank(),
#           axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
#         plot.title = element_text(size = 14)
#         )

5.9 How much delay in each step for selected Sellers

detail_orders |>
    #filter(order_id== "088683f795a3d30bfd61152c4fabdfb2")
  select(order_id, seller_id, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, 
         order_delivered_customer_date, order_estimated_delivery_date, shipping_limit_date) |>
  mutate(delay_approving = (order_approved_at - order_purchase_timestamp)) |>
  mutate(delay_delivery_carrier= (order_delivered_carrier_date - order_approved_at)) |>
  mutate(delay_delivery_customer= (order_delivered_customer_date - order_delivered_carrier_date)) |>
  mutate(delta_estimation = (order_estimated_delivery_date - order_delivered_customer_date)) |>
  mutate(shipping_days = ((order_delivered_customer_date - order_approved_at))) |>
  mutate(shipping_days_limit = (shipping_limit_date - order_approved_at)) |>
  gather("step", "Dates", 3:8) |>
  filter(seller_id %in% c("5b51032eddd242adc84c38acab88f23d","48436dade18ac8b2bce089ec2a041202", "dd7ddc04e1b6c2c614352b383efe2d36", "c826c40d7b19f62a09e2d7c5e7295ee2", "955fee9216a65b617aa5c0531780ce60")) |> # "
  pivot_longer(-c(order_id,seller_id, delta_estimation, shipping_days, shipping_days_limit, step, Dates)) |>
  rename(Seconds= value, Delays = name) |>
  pivot_longer(-c(order_id,seller_id,  shipping_days, step, Dates, Delays,Seconds)) |>
  rename(Thresholds= name) |>
  #mutate(Time = seconds_to_period(Seconds)) |>
     ggplot(aes(x = Dates, y= Seconds, fill = Delays)) +
  geom_area(na.rm=TRUE)+
  #scale_x_datetime( breacks= date_breaks("12 hours") , labels= date_format("%H:%M %d-%b"))+
  #scale_y_time(date_label= "%m")+
  geom_line(aes(x=Dates, y =value, color=Thresholds), na.rm = TRUE)+
  scale_color_manual(values=c('orange', 'red'))+
       facet_wrap(~seller_id, nrow = 5, scales = "free_y") +
  ggtitle("Delay for each step: from purchase to delivery")
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.

  • If delta_estimation is negative means that the order is delivered after estimated date

  • The shinpping_days_limits is the limit date-time of the delivery. Higher date is considered not acceptble.

  • The main delay is shown during delivery from the carriers to customers (Blue area).

  • c826c40d7b19f62a09e2d7c5e7295ee2 is the seller that has more unamed products.

  • 955fee9216a65b617aa5c0531780ce60 is the best seller of one specific product.

6 Distinct geolocalisation of Sellers/Customers

6.1 What is the difference between customer_id and customer_unique_id?

## count customer_id
paste0("there is ",
customers |> 
  distinct(customer_id) |>
  nrow(),
"  Customer_id in this data")
## [1] "there is 99441  Customer_id in this data"
paste0("there is ",
customers |> distinct(customer_unique_id) |> nrow(),
  "  Customer_unique_id in this data")
## [1] "there is 96096  Customer_unique_id in this data"
paste0("There is about  ",
   customers |> distinct(customer_id) |> nrow() - customers |> distinct(customer_unique_id) |> nrow(),    
      " rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id " )
## [1] "There is about  3345 rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id "

6.2 Group Customer_unique_id and list paires with Custumer_id

## What is the difference between customer_id and customer_unique_id
list_of_unique_id_with_multiple_customer_id <- customers |>
                                        group_by(customer_unique_id) |>
                                        ## filter only duplicated
                                        filter(n()>1) |>
                                        arrange(desc(customer_unique_id))
paste0("Here is a sample of table with  ", 
list_of_unique_id_with_multiple_customer_id |> nrow(),
"  rows. ")
## [1] "Here is a sample of table with  6342  rows. "
list_of_unique_id_with_multiple_customer_id |>
  head(1000) |>
  datatable()
  • This table lists the customer_unique_id corresponding to multiple customer_id.

6.3 Extract Customer_unique_id and their zip_code_prefix

duplicated_customer_unique_id <-
  list_of_unique_id_with_multiple_customer_id |>
  distinct(customer_unique_id) |>
  pull()
  
zip_code_prefix_of_duplicated_customer_unique_id <-
  list_of_unique_id_with_multiple_customer_id |>
  distinct(zip_code_prefix) |>
  pull()

6.4 What about the same Zip code prefix for multiple Latitude/Longiture

geolocation |> 
  filter(zip_code_prefix %in% zip_code_prefix_of_duplicated_customer_unique_id) |>
  arrange(desc(zip_code_prefix)) |>
    head(1000) |>
    datatable()

6.5 If duplicates in customers_id equal the duplicates in Lat/lon for the same Zip code?

Conpare with the last table.

## Filter only Zip code for customer_unique_id
  customers |>
  filter(customer_unique_id %in% duplicated_customer_unique_id) |>
  group_by(customer_id, customer_unique_id, zip_code_prefix) |>
  #distinct(customer_unique_id) |>
  arrange(desc(customer_unique_id)) |>
  filter(zip_code_prefix == 99750) |>
  datatable()

NO!

Only zip_code_prefix is not enough to distinct between geolocations of customers and sellers

For example there is only one customer with the zip_code equal to 99750.

But in geolicalisation we have 14 Positions related to this zip_code

6.6 What about Seller_id

6.6.1 Check for duplicated seller_id

paste0("There are  ",
sellers |>
  group_by(seller_id) |>
  #summarise(N= n()) |>
  filter(n()>1),
"Duplited seller_id.")
## [1] "There are  character(0)Duplited seller_id."
## [2] "There are  integer(0)Duplited seller_id."  
## [3] "There are  character(0)Duplited seller_id."
## [4] "There are  character(0)Duplited seller_id."

6.6.2 Zip code sellers grouping

sellers %>%
  group_by(zip_code_prefix) %>%
  mutate(Freq= n()) %>%
  arrange(desc(Freq)) %>%
  head(1000) %>%
  datatable()

6.7 Explore Sellers localisation

Zip Code is not enough to filter only seller adresses. There is also customers with the same zip code

sellers_geolocation <- geolocation |> 
  filter(zip_code_prefix %in% sellers$zip_code_prefix) %>% ## IS NOT ENOUTH there are the same zip code for multiple sellers and customers.
  left_join(y = sellers, by = "zip_code_prefix")

lng1 <- min(sellers_geolocation$Longitude)
lng2 <- max(sellers_geolocation$Longitude)

lat1 <- min(sellers_geolocation$Latitude)
lat2 <- max(sellers_geolocation$Latitude)
  
leaflet(sellers_geolocation) %>%
  addTiles() %>%
  setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
    addRectangles(
    lng1= lng1, lat1= lat1,
    lng2= lng2, lat2=lat2,
    fillColor = "transparent"
  ) %>%
    addMarkers(~Longitude, ~Latitude,
             popup = ~seller_city, label = ~seller_state,
             clusterOptions = markerClusterOptions())
customers_geolocation <- geolocation |> 
  #rename(zip_code_prefix= geolocation_zip_code_prefix) |>
  filter(zip_code_prefix %in% unique(customers$zip_code_prefix)) |> 
  left_join(y = customers, by = "zip_code_prefix")

lng1 <- min(customers_geolocation$Longitude)
lng2 <- max(customers_geolocation$Longitude)

lat1 <- min(customers_geolocation$Latitude)
lat2 <- max(customers_geolocation$Latitude)
  
# leaflet(customers_geolocation) %>%
#   addTiles() %>%
#   setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
#     addRectangles(
#     lng1= lng1, lat1= lat1,
#     lng2= lng2, lat2=lat2,
#     fillColor = "transparent"
#   ) %>%
#     addMarkers(~Longitude, ~Latitude,
#              popup = ~customer_city, label = ~customer_state,
#              clusterOptions = markerClusterOptions())