0.1 Contexte
0.2 Set and load packages for Python and R
1 Load Datasets
2 Dataset Engineering
- 2.1 check for missing value in dimensions used to compute Volumes
- 2.2 Which sellers have product_id with missing data?
3 Explore Products
4 Explore Orders
5 Explore Orders Reviews
6 Distinct geolocalisation of Sellers/Customers

0.1 Contexte

This dataset was generously provided by Olist, the largest department store in Brazilian marketplaces. Olist connects small businesses from all over Brazil to channels without hassle and with a single contract. Those merchants are able to sell their products through the Olist Store and ship them directly to the customers using Olist logistics partners. See more on our website: www.olist.com.

The data set comes from kaggle competition named Brazilian E-Commerce Public Dataset by Olist.

0.2 Set and load packages for Python and R

# Set python environment and version in RStudio ;-)
reticulate::use_python("/usr/bin/python3.10", required = TRUE)
reticulate::py_config()

## python:         /usr/bin/python3.10
## libpython:      /usr/lib/python3.10/config-3.10-x86_64-linux-gnu/libpython3.10.so
## pythonhome:     //usr://usr
## version:        3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
## numpy:          /home/kirus/.local/lib/python3.10/site-packages/numpy
## numpy_version:  1.26.0
## 
## NOTE: Python version was forced by use_python() function

# Load needed R packages
library(tidyverse)
library(data.table)
library(leaflet)
library(stringr)
library(gridExtra)
#require(maps)
library(ggplot2)
library(lubridate)
#require(mapview)
#require(scales)
#require(RColorBrewer)
require(lubridate)
#library(plotly)
#require(gganimate)
#require(gifski)
#library(caret)
library(DT)
library(kableExtra)

1 Load Datasets

customers <- fread("data/olist_customers_dataset.csv")
geolocation <- fread("data/olist_geolocation_dataset.csv")
order_items <- fread("data/olist_order_items_dataset.csv")
order_payments <- fread("data/olist_order_payments_dataset.csv")
order_reviews <- fread("data/olist_order_reviews_dataset.csv")
orders <- fread("data/olist_orders_dataset.csv")
products <- fread("data/olist_products_dataset.csv")
sellers <- fread("data/olist_sellers_dataset.csv")
category_translation <- fread("data/product_category_name_translation.csv")

1.1 Customers

customers <- customers |>  
   rename(zip_code_prefix = customer_zip_code_prefix)

customers |> head(1000) |>
datatable()

1.2 Geolocation

geolocation <- geolocation |> 
  rename(Longitude = geolocation_lng) |>
  rename(Latitude = geolocation_lat) |>
  rename(zip_code_prefix= geolocation_zip_code_prefix)

geolocation |> 
  head(1000) |>
  datatable()

1.3 Items

order_items |> 
  head(1000) |>
  datatable()

1.4 Payments

order_payments |>
  head(1000) |>
  datatable()

1.5 Reviews

order_reviews |> 
  head(1000) |>
  datatable()

1.6 Orders

orders |> 
  head(1000) |>
  datatable()

1.7 Categories

category_translation |> 
  head(1000) |>
  datatable()

1.8 Products

products |> 
  head(100) |>
  datatable()

1.9 Sellers

sellers <- sellers |>
          rename(zip_code_prefix = seller_zip_code_prefix)

sellers |> 
  head(1000) |>
  datatable()

2 Dataset Engineering

Compute:

Volume of products:

\[Volume = product\_length\_cm * product\_height\_cm * product\_width\_cm / 1000\]

Mutate volume range
Mutate weight range

2.1 check for missing value in dimensions used to compute Volumes

## check for  missing value in specific columns
product_id_with_missed_data <-
products |>
select(product_id,product_category_name, product_weight_g,
       product_length_cm, product_height_cm, product_width_cm) |>
 filter_all(any_vars(is.na(.)))

product_id_with_missed_data |>
   knitr::kable() |>   kable_styling() |>
  scroll_box(width = "900px", height = "300px")

product_id	product_category_name	product_weight_g	product_length_cm	product_height_cm	product_width_cm
09ff539a621711667c43eba6a3bd8466	bebes	NA	NA	NA	NA
5eb564652db742ff8f28759cd8d2652a		NA	NA	NA	NA

2.2 Which sellers have product_id with missing data?

sellers_missing_data <-
order_items |>
  filter(product_id %in% product_id_with_missed_data$product_id) %>%
  group_by(product_id, seller_id, shipping_limit_date, price, freight_value) %>%
  count() %>%
  arrange(shipping_limit_date)

sellers_missing_data %>%
  datatable()

2.2.1 Trend of the price of products with missed informations

sellers_missing_data %>%
  #mutate(total= price+ freight_value) %>%
  pivot_longer(-c(product_id, seller_id,n,shipping_limit_date)) %>%
  ggplot(aes(x = shipping_limit_date, y= value, fill = name)) +
  geom_area()+
    facet_wrap(~product_id, scales = "free_y")+
geom_point(aes(x = shipping_limit_date, y= value, color = name))

* Increase of the price during the period of study.

2.2.2 Compute the Volumes of Products

## add english name and weight range catagories

products <- products |>
            left_join(y = category_translation, by = "product_category_name") |>
            mutate(`volume dm³` = product_length_cm * product_height_cm * product_width_cm/1000) %>%
           rename(weight = product_weight_g) %>%
           mutate(`weight kg` = weight/1000)



products |> 
  head(1000)  |>
  datatable()

2.2.3 Mutate the Weight range of products

products <- products |>
    mutate(weight_range = if_else(`weight kg` <= 1,"0-1 kg", 
                                if_else(`weight kg`> 1 &  `weight kg` <= 5, "1-5 kg",
                                        if_else(`weight kg` > 5 &`weight kg` <=10, "5-10 kg",
                                                if_else(`weight kg` > 10 & `weight kg` <= 20, "10-20 kg",
                                                        if_else(`weight kg` > 20 & `weight kg` <= 30, "20-30 kg",
                                                                if_else(`weight kg` > 30 & `weight kg` <= 42, "30-42 kg", NA )))))))

products |> 
  head(1000)  |>
  datatable()

## glimpse the density of products volumes
products |>
  drop_na(`volume dm³`, weight_range) |>
  ggplot() +
  aes(x=`volume dm³`, color= weight_range, fill= weight_range, na.rm = TRUE)+
  geom_density(adjust= 10, alpha=0.1)

## Warning: Groups with fewer than two data points have been dropped.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

The main volume of products are less than 5 dm³

2.2.4 Mutate Volume Range

## add volume range
products <- products |>
  mutate(volume_range = if_else(`volume dm³` <= 1,"0-1 dm³", 
                                if_else(`volume dm³`> 1 &  `volume dm³` <= 5, "1-5 dm³",
                                        if_else(`volume dm³` > 5 &`volume dm³` <=10, "5-10 dm³",
                                                if_else(`volume dm³` > 10 & `volume dm³` <= 20, "10-20 dm³",
                                                        if_else(`volume dm³` > 20 & `volume dm³` <= 30, "20-30 dm³",
                                                             if_else(`volume dm³` > 30 , ">30 dm3", NA ))))))) 


products |> head() |>
  datatable()

  #knitr::kable() |>   kable_styling() |>
  #scroll_box(width = "900px", height = "300px")

3 Explore Products

3.1 list categories and count same description length

## list categories
products %>%
  #group_by(product_category_name_english, product_name_length) |>
  #mutate(Qt=n()) |>
  pull(product_category_name_english) |> 
  unique() |> as.data.frame() |>
  rownames_to_column(var = "index") |> 
  knitr::kable() |>  kable_styling() |>
  scroll_box(width = "300px", height = "500px")

index	unique(products %>% pull(product_category_name_english))
1	perfumery
2	art
3	sports_leisure
4	baby
5	housewares
6	musical_instruments
7	cool_stuff
8	furniture_decor
9	home_appliances
10	toys
11	bed_bath_table
12	construction_tools_safety
13	computers_accessories
14	health_beauty
15	luggage_accessories
16	garden_tools
17	office_furniture
18	auto
19	electronics
20	fashion_shoes
21	telephony
22	stationery
23	fashion_bags_accessories
24	computers
25	home_construction
26	watches_gifts
27	construction_tools_construction
28	pet_shop
29	small_appliances
30	agro_industry_and_commerce
31	NA
32	furniture_living_room
33	signaling_and_security
34	air_conditioning
35	consoles_games
36	books_general_interest
37	costruction_tools_tools
38	fashion_underwear_beach
39	fashion_male_clothing
40	kitchen_dining_laundry_garden_furniture
41	industry_commerce_and_business
42	fixed_telephony
43	construction_tools_lights
44	books_technical
45	home_appliances_2
46	party_supplies
47	drinks
48	market_place
49	la_cuisine
50	costruction_tools_garden
51	fashio_female_clothing
52	home_confort
53	audio
54	food_drink
55	music
56	food
57	tablets_printing_image
58	books_imported
59	small_appliances_home_oven_and_coffee
60	fashion_sport
61	christmas_supplies
62	fashion_childrens_clothes
63	dvds_blu_ray
64	arts_and_craftmanship
65	furniture_bedroom
66	cine_photo
67	diapers_and_hygiene
68	flowers
69	home_comfort_2
70	security_and_services
71	furniture_mattress_and_upholstery
72	cds_dvds_musicals

3.2 List product_id by categories

## count how many products by category and count how many products with the same description length
products %>%
  #filter(product_category_name== "cama_mesa_banho") |>
  #mutate(product_description_lenght= as.factor(product_description_lenght)) |>
  group_by(product_category_name_english, product_name_lenght, product_description_lenght) |>
  #mutate(Qt= n()) |>
  summarise(products_with_same_name_lenght = n(), .groups='keep') |>
  group_by(product_description_lenght) |>
  #filter(product_description_lenght=="93") |>
  mutate(products_with_same_description_lenght = n()) |>
    #head(1000) |>
  #arrange(desc(Qt)) |>
  #filter(product_category_name_english %in% c("baby", "housewares")) |>
 arrange(desc(products_with_same_name_lenght)) |>
  head(100) |>
  datatable()

There are 610 products without description
We assume that products from the same category and have different name lenght are different.
We assume that products with the same name length and different description length are same bu from different sellers.

3.3 Check if products with the same name lenght and different description lenght are same but from different seller

3.3.1 Group products by sellers {best_seller}

group_pdt_sellers <- 
  order_items |>
  group_by(product_id, seller_id) |>
  #mutate(Freq = n()) |>
  summarise(Freq= n(), .groups="keep") |>
  arrange(desc(Freq))

group_pdt_sellers |>
  head(100) |>
  datatable()

This table lists the best-selling products and their sellers

3.4 Which caterogies are the best-selling by seller?

best_selling <- products |>
  left_join(order_items , by = "product_id") |>
  group_by(product_category_name_english, seller_id) |>
  mutate(n_sold = n()) |>
  distinct(product_category_name_english, seller_id, n_sold, .keep_all = FALSE) |>
  #select( product_category_name_english,product_name_lenght, seller_id, product_id, n_sold )|>
  arrange(desc(n_sold))


best_selling |>
  filter(n_sold >620)|>
  ggplot() +
  aes(x = reorder(product_category_name_english, desc(n_sold), sum), y = n_sold, fill= seller_id) +
  geom_col() +
  geom_text(aes(label = n_sold), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("Which caterogies are the best-selling by seller?")

3.5 Name length parameter is not specific to product or seller

 products |>
           left_join(order_items , by = "product_id") |>
      group_by(  seller_id, product_category_name_english, product_name_lenght) |>
  summarise(products_with_same_name_lenght= n(), .groups = "keep") |>
  #mutate(products_with_same_name_lenght= n())|>
     #select(product_id, product_category_name_english, product_name_lenght, seller_id, products_with_same_name_lenght) |>
  unique() |>
  arrange(!is.na(product_name_lenght)) |>
      arrange(desc(products_with_same_name_lenght)) |>
 head(500) |>
  datatable()

We find which seller has more missing product name: c826c40d7b19f62a09e2d7c5e7295ee2. We can find his geolocation.
Several items with the same product length name exist for the same seller. It seems the same product but with different color or version or option.
It is possible that products belonging to different categories have the same name length and sold by different sellers.

CONCLUSION: The product_name_lenght is not specific to a specific product or seller.

3.6 Display the ratio weight/volume of products

products |>
  drop_na(`weight kg`, `volume dm³`, `product_category_name_english`) |>
  ggplot() + 
  aes(y = `weight kg` , x = `product_category_name_english`, color= weight_range) +#, size=`volume dm³` 
  geom_point() +
    theme(legend.position = "left", #axis.text.y = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 14)
        ) +
  #facet_wrap(~weight_range, nrow = 6, scales = "free_y")+
  facet_wrap(~volume_range, nrow = 6, scales= "free_y")

We can note that some products with small volume but with high weight and vis-versa.
The must sold product have volume upper than 30 dm³.

4 Explore Orders

## join items and orders and products
detail_orders <- order_items |>
  left_join(orders, by = "order_id") |>
  left_join(products, by = c("product_id")) |>
  select(order_id, product_category_name_english, product_id,  seller_id, customer_id,   order_item_id, everything()) 


detail_orders|>
  head(1000) |>
  datatable()

4.1 How many orders by customer and How many items by order

## group by order and check items
orders_customer <- 
  detail_orders |>
  select(order_id, customer_id, product_id, product_category_name_english) |>
  group_by( order_id, customer_id) |>
  mutate(n_orders= n()) |>
  #summarise(n_items= n(), .groups = "keep") |>
  group_by( order_id, customer_id, product_id) |>
  mutate(n_items= n()) |>
  ungroup() |>
  arrange(desc(n_orders))

orders_customer |>
  head(1000) |>
  datatable()

The maximum number of orders per customer is 21.
The maximum number of items per orders is 20.

4.2 How many Custumer_id / Unique_customer_id / Customer that purchased items?

## check how many customers
n_customer_id <-
customers |>
   select(customer_id) |>
  unique() |>
  nrow()


n_customer_id_with_order <-
orders_customer |>
 select(customer_id) |>
  unique() |>
  nrow()

n_customer_unique_id <-
customers |>
  select(customer_unique_id) |>
  unique() |>
  nrow()

print(paste0("there are ", n_customer_id, " Customers in this study"))

## [1] "there are 99441 Customers in this study"

print(paste0("there are ", n_customer_id_with_order, " Customers that purchased items in this study"))

## [1] "there are 98666 Customers that purchased items in this study"

print(paste0("there are ", n_customer_unique_id, " UNIQUE customer in this study"))

## [1] "there are 96096 UNIQUE customer in this study"

4.3 Orders Payment

order_payments %>%
  group_by(order_id) %>%
  mutate(Total = sum(payment_value)) %>%
  head(1000) %>%
  datatable()

4.3.1 missing orders payment in `orders_items`

missing_orders <- order_items %>%
  group_by(order_id) %>%
  summarise(price_value = sum(price), total_freight = sum(freight_value) ) %>%
  anti_join(order_payments, by="order_id")

missing_orders %>%
  datatable()

4.3.2 Which payment type used in sequential?

order_payments  %>%
  #filter(payment_type == "voucher") %>%
  group_by(order_id) %>%
  reframe(sequential = paste0(payment_sequential, collapse= ", "), 
         types= paste0(payment_type, collapse= ", "),
         installments= paste0( payment_installments, collapse = ", "),
         Total = sum(payment_value)) %>% 
  filter(sequential != "1") %>%
  mutate(Sequential = map_chr(strsplit(sequential, ","), ~ 
                  toString(sort(as.numeric(.x))))) %>%
  mutate(Types = map_chr(strsplit(types, ","), ~ 
                  toString(sort(as.character(.x))))) %>%
  mutate(Installments = map_chr(strsplit(installments, ","), ~ 
                  toString(sort(as.character(.x))))) %>%
 # mutate(Sequential= as.factor(Sequential), Types = as.factor(Types)) %>%
  group_by(Sequential, Types) %>%
  reframe(Freq= n(), median_total = median(Total)) %>%
  mutate(credit_card = lengths(regmatches(Types, gregexpr("credit_card", Types)))) %>%
  mutate(voucher = purrr::map_int(Types, ~sum(gregexpr("voucher", .x, fixed=TRUE)[[1]] > 0))) %>%
  mutate(debit_card = lengths(regmatches(Types, gregexpr("debit_card", Types)))) %>%
  mutate(boleto = str_count(Types, 'boleto')) %>%
  #mutate(number_of_drugs = map(str_split(gsub('credit_card|voucher', ',', str_trim(Types)), '\\s+'), ~ length(.x))) %>%
  mutate(credit_card=(paste0(credit_card, ":credit"))) %>%
  mutate(voucher = paste0(voucher, ":voucher")) %>%
  mutate(debit_card= paste0(debit_card, ":debit")) %>%
  mutate(boleto= paste0(boleto, ":boleto")) %>%
  unite(col = "Types", credit_card:boleto, sep = ", ", remove = TRUE) %>%
  mutate(Sequential = map_chr(strsplit(Sequential, ","), ~ 
                  toString(paste0(min(as.numeric(.x), na.rm=TRUE),":" ,max(as.numeric(.x), na.rm=TRUE))
                           )
                  )
  ) %>%
  #select(order_id, Sequential, Types, Installments, Total, Freq,everything()) %>%
  #arrange(Sequential)
  ggplot() +
  aes(x = reorder(Sequential, desc(Freq)), y = Freq, fill = Types) +
    geom_col(na.rm = TRUE) +
    ggforce::facet_zoom(ylim = c(0, 75), zoom.data = ifelse(a >= 75, NA, FALSE))+
    #ggplot2::scale_y_continuous(trans = squash_axis(5, 95, 10)) +
  geom_text(aes(label = median_total), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("Which payment type used in sequential?")

order_payments  %>%
  #filter(payment_type == "voucher") %>%
  group_by(order_id) %>%
  reframe(sequential = paste0(payment_sequential, collapse= ", "), 
         types= paste0(payment_type, collapse= ", "),
         installments= paste0( payment_installments, collapse = ", "),
         Total = sum(payment_value)) %>% 
  filter(sequential != "1") %>%
  mutate(Sequential = map_chr(strsplit(sequential, ","), ~ 
                  toString(sort(as.numeric(.x))))) %>%
  mutate(Types = map_chr(strsplit(types, ","), ~ 
                  toString(sort(as.character(.x))))) %>%
  mutate(Installments = map_chr(strsplit(installments, ","), ~ 
                  toString(sort(as.character(.x))))) %>%
 # mutate(Sequential= as.factor(Sequential), Types = as.factor(Types)) %>%
  group_by(Sequential, Types) %>%
  reframe(Freq= n(), median_total = median(Total)) %>%
  mutate(credit_card = lengths(regmatches(Types, gregexpr("credit_card", Types)))) %>%
  mutate(voucher = purrr::map_int(Types, ~sum(gregexpr("voucher", .x, fixed=TRUE)[[1]] > 0))) %>%
  mutate(debit_card = lengths(regmatches(Types, gregexpr("debit_card", Types)))) %>%
  mutate(boleto = str_count(Types, 'boleto')) %>%
  #mutate(number_of_drugs = map(str_split(gsub('credit_card|voucher', ',', str_trim(Types)), '\\s+'), ~ length(.x))) %>%
  mutate(credit_card=(paste0(credit_card, ":credit"))) %>%
  mutate(voucher = paste0(voucher, ":voucher")) %>%
  mutate(debit_card= paste0(debit_card, ":debit")) %>%
  mutate(boleto= paste0(boleto, ":boleto")) %>%
  unite(col = "Types", credit_card:boleto, sep = ", ", remove = TRUE) %>%
  mutate(Sequential = map_chr(strsplit(Sequential, ","), ~ 
                  toString(paste0(min(as.numeric(.x), na.rm=TRUE),":" ,max(as.numeric(.x), na.rm=TRUE))
                           )
                  )
  ) %>%
  ggplot()+
  aes(x = reorder(Sequential, desc(median_total)), y = median_total, fill= Types) +
  geom_col()+
  geom_text(aes(label = Freq), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("What is the average amount of each type used sequentially?")

5 Explore Orders Reviews

5.1 There are multiple Reviews for same `order_id`

order_reviews %>%
  group_by(order_id) %>% 
  filter(n()>1) %>%
  arrange(order_id) %>%
  #head(500) %>%
  datatable()

5.2 Some orders are composed by items purchased from different seller_id.

detail_orders %>%
   group_by(order_id, customer_id, product_category_name_english) %>%
  summarise(n_items = n(), .groups = "keep",
            n_sellers= paste0(seller_id, collapse = ", ")) %>%
  #arrange(match(order_id, unique(order_id)))
  arrange(desc(n_items)) %>%
  select(order_id, customer_id, n_sellers, product_category_name_english,  everything()) %>%
    head(1000) %>%
    datatable()

5.3 Classify seller_id by review scores

detail_orders %>%
  #filter(seller_id == "001cca7ae9ae17fb1caed9dfb1094831") %>%
  #left_join(order_reviews, by = "order_id") 
  group_by(seller_id, order_id) %>%
  summarise(n_items=n(),.groups = "keep",
            unique_cat= unique(product_category_name_english),
            Total_prise= sum(price)) %>%
  #filter(seller_id =="2709af9587499e95e803a6498a5a56e9")
  #arrange(desc(n_items)) %>%
  left_join(order_reviews, by = "order_id") %>%
  group_by(seller_id) %>%
  summarise(n_reviews= n(), .groups = "keep",
            median_score=  median(review_score, na.rm=TRUE),
            total_items= sum(n_items),
            reviews_ids = paste0(review_id, collapse = ", ")) %>%
  mutate(median_score= as.factor(median_score))  %>%
  group_by(median_score) %>%
  reframe(n_seller_id= n(),
            n_reviews = sum(n_reviews)) %>%
  ggplot()+
  aes(x = reorder(median_score, desc(n_seller_id)), y = n_seller_id, fill= n_reviews) +
  geom_col()+
  geom_text(aes(label = n_reviews), position = position_stack(vjust =  .5), color= "red")+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("How many seller_ids by review score")

## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 222 of `x` matches multiple rows in `y`.
## ℹ Row 57578 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

5.4 Which categories have best scores?

best_categories_reviews <- 
detail_orders %>%
  #filter(seller_id == "001cca7ae9ae17fb1caed9dfb1094831") %>%
  #left_join(order_reviews, by = "order_id") 
  group_by(product_category_name_english, order_id) %>%
  summarise(n_items=n(),.groups = "keep",
            Total_prise= sum(price),
            unique_cat= unique(product_category_name_english)) %>%
  #filter(seller_id =="2709af9587499e95e803a6498a5a56e9")
  #arrange(desc(n_items)) %>%
  left_join(order_reviews, by = "order_id") %>%
  group_by(product_category_name_english) %>%
  summarise(n_reviews= n(), .groups = "keep",
            median_score=  median(review_score, na.rm=TRUE),
            total_items= sum(n_items),
            reviews_ids = paste0(review_id, collapse = ", ")) %>%
  mutate(median_score= as.factor(median_score))

## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 709 of `x` matches multiple rows in `y`.
## ℹ Row 51688 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

best_categories_reviews %>%
  select(-reviews_ids) %>%
datatable()

best_categories_reviews %>%
  ggplot()+
  aes(x = median_score, y = n_reviews, color= product_category_name_english)+
  geom_point() +
  #geom_text(aes(label = n_reviews), position = position_stack(vjust =  .5), color= "red")+
  theme(legend.position = "bottom",legend.box="horizontal", legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("Which categories have best scores?")+
guides(color=guide_legend(nrow=7, byrow=TRUE))

5.5 Group reviews per order_id

order_reviews %>%
  group_by(order_id) %>% 
    filter(n()>1) %>%
  mutate(reviews_id = paste0(review_id, collapse= ", "),
         comment_title= paste0(review_comment_title, collapse= ", "),
         comment_message= paste0(review_comment_message, collapse= ", "),
         mean_score= round(mean(review_score, na.rm = TRUE), digits = 0)) %>%
  select(order_id,mean_score, comment_title, comment_message, reviews_id )  %>%
  head(1000) %>%
  datatable()

5.6 Arranage seller_id by review scores

foo <-
  detail_orders %>%
   group_by(order_id, seller_id) %>%
  summarise(n_orders = n(), .groups = "keep") %>%
  #arrange(match(seller_id, unique(seller_id)))
  left_join(order_reviews, by = "order_id") %>%
  group_by(seller_id) %>%
  summarise(mean_score= round(mean(review_score, na.rm = TRUE), digits = 0),
            n_sellers= n(),
            n_reviews= length(review_id)) %>%
  #arrange(desc(n_reviews)) %>%
  mutate(mean_score= as.factor(mean_score)) %>%
  arrange(desc(n_sellers))

## Warning in left_join(., order_reviews, by = "order_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 87 of `x` matches multiple rows in `y`.
## ℹ Row 11574 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

foo %>% 
  ggplot() +
  aes(x = mean_score, y = n_sellers)+
  geom_col(na.rm=TRUE) +
  #geom_text(aes(label = Freq), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom",legend.title=element_text(size=7), legend.text=element_text(size=6), axis.title.x = element_blank(), 
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
        plot.title = element_text(size = 7)
        )+
  ggtitle("Arranage seller_id by review score for multiple reviews")

#grid.arrange(p1, p2, ncol = 2, main = "Main title")

5.7 List all Customers without purchase

customers_without_order <-
customers |>
  anti_join(orders_customer, by = "customer_id") |>
  group_by(customer_unique_id) |>
  mutate(Freq = n()) |>
  arrange(desc(Freq))


customers_without_order |>
  head(100) |>
  datatable()

print(paste0("There are ",
customers_without_order |>
  distinct(customer_unique_id)|>
  nrow(),
" customers without orders"))

## [1] "There are 761 customers without orders"

5.8 Visualize the best-selling categories by customer

## visualize  categories that were purchased by customers with more than 6 items per order
orders_customer |>
  left_join(customers, by = "customer_id") |>
  group_by(customer_unique_id, product_category_name_english) |>
  summarise(n_purchased= n(), .groups = "keep") |>
  #arrange(desc(n_purchased)) |>
  filter(n_purchased > 10) |>
  ggplot() +
  aes(x = reorder(product_category_name_english, desc(n_purchased), sum), y = n_purchased, fill= customer_unique_id) +
  geom_col() +
    geom_text(aes(label = n_purchased), position = position_stack(vjust = .5))+
  theme(legend.position = "bottom", axis.title.x = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
         
        plot.title = element_text(size = 14)
        )+
  ggtitle("The best-selling categories using  unique customer id")

orders_customer |>
  #left_join(customers, by = "customer_id") |>
  group_by(customer_id, product_category_name_english) |>
  summarise(n_purchased= n(), .groups = "keep") |>
  #arrange(desc(n_purchased)) |>
  filter(n_purchased > 10) |>
  ggplot() +
  aes(x = reorder(product_category_name_english, desc(n_purchased), sum), y = n_purchased, fill= customer_id) +
  geom_col() +
  geom_text(aes(label=n_purchased), position= position_stack(vjust = .5)) +
  theme(legend.position = "bottom", axis.title.x = element_blank(),
          axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
         
        plot.title = element_text(size = 14)
        )+
    ggtitle("The best-selling categories using customer id")

We set limit upper than 10 purchased for these two plots.
customer_unique_id sums purchased items for multiple customer_id.
For this reason that appears more customer_unique_id than customer_id in the legend.

# 
# orders_seller <- detail_orders |>
#   group_by( order_id, seller_id, product_category_name_english) |>
#   summarise(n_items= n(), .groups = "keep") |>
#   group_by( order_id,seller_id, product_category_name_english, n_items) |>
#   summarise(n_orders= n(), .groups = "keep") |>
#   #arrange(desc(n_items)) 
#   arrange(desc(n_orders))
# 
# orders_seller |>
#   head(1000) |>
#   datatable()

# orders_seller |>
#   filter(n_items > 6) |>
#   ggplot() +
#   aes(x = reorder(product_category_name_english, desc(n_items), sum), y = n_items, fill= seller_id) +
#   geom_col() +
#   theme(legend.position = "bottom", axis.title.x = element_blank(),
#           axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9, size = 10),
#         plot.title = element_text(size = 14)
#         )

5.9 How much delay in each step for selected Sellers

detail_orders |>
    #filter(order_id== "088683f795a3d30bfd61152c4fabdfb2")
  select(order_id, seller_id, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, 
         order_delivered_customer_date, order_estimated_delivery_date, shipping_limit_date) |>
  mutate(delay_approving = (order_approved_at - order_purchase_timestamp)) |>
  mutate(delay_delivery_carrier= (order_delivered_carrier_date - order_approved_at)) |>
  mutate(delay_delivery_customer= (order_delivered_customer_date - order_delivered_carrier_date)) |>
  mutate(delta_estimation = (order_estimated_delivery_date - order_delivered_customer_date)) |>
  mutate(shipping_days = ((order_delivered_customer_date - order_approved_at))) |>
  mutate(shipping_days_limit = (shipping_limit_date - order_approved_at)) |>
  gather("step", "Dates", 3:8) |>
  filter(seller_id %in% c("5b51032eddd242adc84c38acab88f23d","48436dade18ac8b2bce089ec2a041202", "dd7ddc04e1b6c2c614352b383efe2d36", "c826c40d7b19f62a09e2d7c5e7295ee2", "955fee9216a65b617aa5c0531780ce60")) |> # "
  pivot_longer(-c(order_id,seller_id, delta_estimation, shipping_days, shipping_days_limit, step, Dates)) |>
  rename(Seconds= value, Delays = name) |>
  pivot_longer(-c(order_id,seller_id,  shipping_days, step, Dates, Delays,Seconds)) |>
  rename(Thresholds= name) |>
  #mutate(Time = seconds_to_period(Seconds)) |>
     ggplot(aes(x = Dates, y= Seconds, fill = Delays)) +
  geom_area(na.rm=TRUE)+
  #scale_x_datetime( breacks= date_breaks("12 hours") , labels= date_format("%H:%M %d-%b"))+
  #scale_y_time(date_label= "%m")+
  geom_line(aes(x=Dates, y =value, color=Thresholds), na.rm = TRUE)+
  scale_color_manual(values=c('orange', 'red'))+
       facet_wrap(~seller_id, nrow = 5, scales = "free_y") +
  ggtitle("Delay for each step: from purchase to delivery")

## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.

If delta_estimation is negative means that the order is delivered after estimated date
The shinpping_days_limits is the limit date-time of the delivery. Higher date is considered not acceptble.
The main delay is shown during delivery from the carriers to customers (Blue area).
c826c40d7b19f62a09e2d7c5e7295ee2 is the seller that has more unamed products.
955fee9216a65b617aa5c0531780ce60 is the best seller of one specific product.

6 Distinct geolocalisation of Sellers/Customers

6.1 What is the difference between `customer_id` and `customer_unique_id`?

## count customer_id
paste0("there is ",
customers |> 
  distinct(customer_id) |>
  nrow(),
"  Customer_id in this data")

## [1] "there is 99441  Customer_id in this data"

paste0("there is ",
customers |> distinct(customer_unique_id) |> nrow(),
  "  Customer_unique_id in this data")

## [1] "there is 96096  Customer_unique_id in this data"

paste0("There is about  ",
   customers |> distinct(customer_id) |> nrow() - customers |> distinct(customer_unique_id) |> nrow(),    
      " rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id " )

## [1] "There is about  3345 rows with duplicated id. But we have to think about one custmer_unique_id have multiple customer_id "

6.2 Group `Customer_unique_id` and list paires with `Custumer_id`

## What is the difference between customer_id and customer_unique_id
list_of_unique_id_with_multiple_customer_id <- customers |>
                                        group_by(customer_unique_id) |>
                                        ## filter only duplicated
                                        filter(n()>1) |>
                                        arrange(desc(customer_unique_id))
paste0("Here is a sample of table with  ", 
list_of_unique_id_with_multiple_customer_id |> nrow(),
"  rows. ")

## [1] "Here is a sample of table with  6342  rows. "

list_of_unique_id_with_multiple_customer_id |>
  head(1000) |>
  datatable()

This table lists the customer_unique_id corresponding to multiple customer_id.

6.3 Extract `Customer_unique_id` and their `zip_code_prefix`

duplicated_customer_unique_id <-
  list_of_unique_id_with_multiple_customer_id |>
  distinct(customer_unique_id) |>
  pull()
  
zip_code_prefix_of_duplicated_customer_unique_id <-
  list_of_unique_id_with_multiple_customer_id |>
  distinct(zip_code_prefix) |>
  pull()

6.4 What about the same Zip code prefix for multiple Latitude/Longiture

geolocation |> 
  filter(zip_code_prefix %in% zip_code_prefix_of_duplicated_customer_unique_id) |>
  arrange(desc(zip_code_prefix)) |>
    head(1000) |>
    datatable()

6.5 If duplicates in `customers_id` equal the duplicates in Lat/lon for the same Zip code?

Conpare with the last table.

## Filter only Zip code for customer_unique_id
  customers |>
  filter(customer_unique_id %in% duplicated_customer_unique_id) |>
  group_by(customer_id, customer_unique_id, zip_code_prefix) |>
  #distinct(customer_unique_id) |>
  arrange(desc(customer_unique_id)) |>
  filter(zip_code_prefix == 99750) |>
  datatable()

NO!

Only zip_code_prefix is not enough to distinct between geolocations of customers and sellers

For example there is only one customer with the zip_code equal to 99750.

But in geolicalisation we have 14 Positions related to this zip_code

6.6 What about Seller_id

6.6.1 Check for duplicated seller_id

paste0("There are  ",
sellers |>
  group_by(seller_id) |>
  #summarise(N= n()) |>
  filter(n()>1),
"Duplited seller_id.")

## [1] "There are  character(0)Duplited seller_id."
## [2] "There are  integer(0)Duplited seller_id."  
## [3] "There are  character(0)Duplited seller_id."
## [4] "There are  character(0)Duplited seller_id."

6.6.2 Zip code sellers grouping

sellers %>%
  group_by(zip_code_prefix) %>%
  mutate(Freq= n()) %>%
  arrange(desc(Freq)) %>%
  head(1000) %>%
  datatable()

6.7 Explore Sellers localisation

Zip Code is not enough to filter only seller adresses. There is also customers with the same zip code

sellers_geolocation <- geolocation |> 
  filter(zip_code_prefix %in% sellers$zip_code_prefix) %>% ## IS NOT ENOUTH there are the same zip code for multiple sellers and customers.
  left_join(y = sellers, by = "zip_code_prefix")

lng1 <- min(sellers_geolocation$Longitude)
lng2 <- max(sellers_geolocation$Longitude)

lat1 <- min(sellers_geolocation$Latitude)
lat2 <- max(sellers_geolocation$Latitude)
  
leaflet(sellers_geolocation) %>%
  addTiles() %>%
  setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
    addRectangles(
    lng1= lng1, lat1= lat1,
    lng2= lng2, lat2=lat2,
    fillColor = "transparent"
  ) %>%
    addMarkers(~Longitude, ~Latitude,
             popup = ~seller_city, label = ~seller_state,
             clusterOptions = markerClusterOptions())

customers_geolocation <- geolocation |> 
  #rename(zip_code_prefix= geolocation_zip_code_prefix) |>
  filter(zip_code_prefix %in% unique(customers$zip_code_prefix)) |> 
  left_join(y = customers, by = "zip_code_prefix")

lng1 <- min(customers_geolocation$Longitude)
lng2 <- max(customers_geolocation$Longitude)

lat1 <- min(customers_geolocation$Latitude)
lat2 <- max(customers_geolocation$Latitude)
  
# leaflet(customers_geolocation) %>%
#   addTiles() %>%
#   setView(lng = (lng1+lng2)/2, lat = (lat1+lat2)/2, zoom = 3) %>%
#     addRectangles(
#     lng1= lng1, lat1= lat1,
#     lng2= lng2, lat2=lat2,
#     fillColor = "transparent"
#   ) %>%
#     addMarkers(~Longitude, ~Latitude,
#              popup = ~customer_city, label = ~customer_state,
#              clusterOptions = markerClusterOptions())

Brazilian E Commerce by Olist

Karim Mezhoud

2023-10-11