# Set python environment and version in RStudio ;-)
# Pin reticulate to one specific interpreter. With `required = TRUE`
# reticulate reports an error instead of falling back to another Python
# when this interpreter cannot be used.
# NOTE(review): the path is an absolute, user-specific Anaconda location;
# it has to be adapted before running on another machine.
reticulate::use_python("/Users/Mezhoud/anaconda3/bin/python3", required = TRUE)
# Print the interpreter, libpython and numpy that reticulate resolved.
reticulate::py_config()
## python: /Users/Mezhoud/anaconda3/bin/python3
## libpython: /Users/Mezhoud/anaconda3/lib/libpython3.7m.dylib
## pythonhome: /Users/Mezhoud/anaconda3:/Users/Mezhoud/anaconda3
## version: 3.7.5 (default, Oct 25 2019, 10:52:18) [Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy: /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/numpy
## numpy_version: 1.18.1
##
## NOTE: Python version was forced by use_python function
# Load the raw training table and tidy its identifying columns.
Train <- fread("Train.csv") %>%
  # Descriptive names for the elevation and coordinate columns.
  rename(
    `Altitude (m)` = elevation,
    Longitude = X,
    Latitude = Y
  ) %>%
  # Split the dash-separated id: the first piece stays in Square_ID, the
  # remaining four pieces are glued back together into `other`.
  separate(
    Square_ID,
    into = c("Square_ID", "v1", "v2", "v3", "v4"),
    sep = "-",
    remove = FALSE
  ) %>%
  unite("other", v1, v2, v3, v4, sep = "-") %>%
  # Location and id columns first, everything else after them.
  select(
    Longitude, Latitude, `Altitude (m)`, LC_Type1_mode, Square_ID, other,
    everything()
  )
#filter(row_number()==1 )

# Scrollable preview of the first rows.
Train %>%
  head() %>%
  knitr::kable() %>%
  kable_styling() %>%
  scroll_box(width = "1000px", height = "1000px")
Longitude | Latitude | Altitude (m) | LC_Type1_mode | Square_ID | other | target_2015 | precip 2014-11-16 - 2014-11-23 | precip 2014-11-23 - 2014-11-30 | precip 2014-11-30 - 2014-12-07 | precip 2014-12-07 - 2014-12-14 | precip 2014-12-14 - 2014-12-21 | precip 2014-12-21 - 2014-12-28 | precip 2014-12-28 - 2015-01-04 | precip 2015-01-04 - 2015-01-11 | precip 2015-01-11 - 2015-01-18 | precip 2015-01-18 - 2015-01-25 | precip 2015-01-25 - 2015-02-01 | precip 2015-02-01 - 2015-02-08 | precip 2015-02-08 - 2015-02-15 | precip 2015-02-15 - 2015-02-22 | precip 2015-02-22 - 2015-03-01 | precip 2015-03-01 - 2015-03-08 | precip 2015-03-08 - 2015-03-15 | precip 2019-01-20 - 2019-01-27 | precip 2019-01-27 - 2019-02-03 | precip 2019-02-03 - 2019-02-10 | precip 2019-02-10 - 2019-02-17 | precip 2019-02-17 - 2019-02-24 | precip 2019-02-24 - 2019-03-03 | precip 2019-03-03 - 2019-03-10 | precip 2019-03-10 - 2019-03-17 | precip 2019-03-17 - 2019-03-24 | precip 2019-03-24 - 2019-03-31 | precip 2019-03-31 - 2019-04-07 | precip 2019-04-07 - 2019-04-14 | precip 2019-04-14 - 2019-04-21 | precip 2019-04-21 - 2019-04-28 | precip 2019-04-28 - 2019-05-05 | precip 2019-05-05 - 2019-05-12 | precip 2019-05-12 - 2019-05-19 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
34.26 | -15.91 | 887.7642 | 9 | 4e3c3896 | 14ce-11ea-bce5-f49634744a41 | 0 | 0 | 0 | 0 | 14.84403 | 14.55282 | 12.23777 | 57.45136 | 30.12705 | 30.44947 | 1.521829 | 29.39 | 32.87832 | 8.179804 | 0.9639814 | 16.6591 | 3.304466 | 0 | 12.99262 | 4.582856 | 35.03753 | 4.796012 | 28.08331 | 0 | 58.36246 | 18.26469 | 17.53749 | 0.8963228 | 1.68 | 0 | 0 | 0 | 0 | 0 | 0 |
34.26 | -15.90 | 743.4039 | 9 | 4e3c3897 | 14ce-11ea-bce5-f49634744a41 | 0 | 0 | 0 | 0 | 14.84403 | 14.55282 | 12.23777 | 57.45136 | 30.12705 | 30.44947 | 1.521829 | 29.39 | 32.87832 | 8.179804 | 0.9639814 | 16.6591 | 3.304466 | 0 | 12.99262 | 4.582856 | 35.03753 | 4.796012 | 28.08331 | 0 | 58.36246 | 18.26469 | 17.53749 | 0.8963228 | 1.68 | 0 | 0 | 0 | 0 | 0 | 0 |
34.26 | -15.89 | 565.7283 | 9 | 4e3c3898 | 14ce-11ea-bce5-f49634744a41 | 0 | 0 | 0 | 0 | 14.84403 | 14.55282 | 12.23777 | 57.45136 | 30.12705 | 30.44947 | 1.521829 | 29.39 | 32.87832 | 8.179804 | 0.9639814 | 16.6591 | 3.304466 | 0 | 12.99262 | 4.582856 | 35.03753 | 4.796012 | 28.08331 | 0 | 58.36246 | 18.26469 | 17.53749 | 0.8963228 | 1.68 | 0 | 0 | 0 | 0 | 0 | 0 |
34.26 | -15.88 | 443.3928 | 10 | 4e3c3899 | 14ce-11ea-bce5-f49634744a41 | 0 | 0 | 0 | 0 | 14.84403 | 14.55282 | 12.23777 | 57.45136 | 30.12705 | 30.44947 | 1.521829 | 29.39 | 32.87832 | 8.179804 | 0.9639814 | 16.6591 | 3.304466 | 0 | 12.99262 | 4.582856 | 35.03753 | 4.796012 | 28.08331 | 0 | 58.36246 | 18.26469 | 17.53749 | 0.8963228 | 1.68 | 0 | 0 | 0 | 0 | 0 | 0 |
34.26 | -15.87 | 437.4434 | 10 | 4e3c389a | 14ce-11ea-bce5-f49634744a41 | 0 | 0 | 0 | 0 | 14.84403 | 14.55282 | 12.23777 | 57.45136 | 30.12705 | 30.44947 | 1.521829 | 29.39 | 32.87832 | 8.179804 | 0.9639814 | 16.6591 | 3.304466 | 0 | 12.99262 | 4.582856 | 35.03753 | 4.796012 | 28.08331 | 0 | 58.36246 | 18.26469 | 17.53749 | 0.8963228 | 1.68 | 0 | 0 | 0 | 0 | 0 | 0 |
34.26 | -15.86 | 405.6317 | 10 | 4e3c389b | 14ce-11ea-bce5-f49634744a41 | 0 | 0 | 0 | 0 | 14.84403 | 14.55282 | 12.23777 | 57.45136 | 30.12705 | 30.44947 | 1.521829 | 29.39 | 32.87832 | 8.179804 | 0.9639814 | 16.6591 | 3.304466 | 0 | 12.99262 | 4.582856 | 35.03753 | 4.796012 | 28.08331 | 0 | 58.36246 | 18.26469 | 17.53749 | 0.8963228 | 1.68 | 0 | 0 | 0 | 0 | 0 | 0 |
# Corners of the bounding box that contains every training square.
lng1 <- min(Train$Longitude)
lat1 <- min(Train$Latitude)
lng2 <- max(Train$Longitude)
lat2 <- max(Train$Latitude)

# Interactive base map centred on the box, with the box outlined.
leaflet(Train) %>%
  addTiles() %>%
  setView(
    lng = (lng1 + lng2) / 2,
    lat = (lat1 + lat2) / 2,
    zoom = 8.4
  ) %>%
  addRectangles(
    lng1 = lng1, lat1 = lat1,
    lng2 = lng2, lat2 = lat2,
    fillColor = "transparent"
  )
# Read the four reference maps taken from the bibliography.
Hydrology <- EBImage::readImage("Hydrology_bib.png")
Elevation <- EBImage::readImage("elevation_bib.png")
Landcover <- EBImage::readImage("LandCover_bib.png")
Population <- EBImage::readImage("Population_bib.png")

# Show them on a 2 x 2 grid. par() returns the settings it replaces; they
# are restored once the grid is drawn so that later base-graphics plots on
# the same device are not silently squeezed into a quarter of the page.
old_par <- par(mfrow = c(2, 2))
plot(Elevation)
title("Elevation")
plot(Landcover)
title("Landscovers")
plot(Hydrology)
title("Hydrology")
plot(Population)
title("Population")
par(old_par)
# Land-cover lookup table: one row per LC_Type1_mode code with its colour,
# name and description.
#LC_type1_palette <- c("#05450a", "#086a10", "#54a708", "#78d203", "#009900", "#c6b044", "#dcd159", "#dade48", "#fbff13", "#b6ff05", "#27ff87", "#c24f44", "#a5a5a5", "#ff6d4c", "#69fff8", "#f9ffa4", "#1c0dff")
Soil_pal <- fread("LC_Type1_mode.csv") %>%
  # `Value` is the join key used against the training table.
  rename(LC_Type1_mode = Value) %>%
  # Text before the ":" becomes `Soil Name`; the text after it replaces
  # `Description`.
  separate(
    Description,
    into = c("Soil Name", "Description"),
    sep = ":",
    remove = FALSE
  ) %>%
  mutate(`Soil Name` = as.factor(`Soil Name`))

# Scrollable view of the whole lookup table.
Soil_pal %>%
  knitr::kable() %>%
  kable_styling() %>%
  scroll_box(width = "1000px", height = "1000px")
LC_Type1_mode | Color | Soil Name | Description |
---|---|---|---|
1 | #05450a | Evergreen Needleleaf Forests | dominated by evergreen conifer trees (canopy >2m). Tree cover >60%. |
2 | #086a10 | Evergreen Broadleaf Forests | dominated by evergreen broadleaf and palmate trees (canopy >2m). Tree cover >60%. |
3 | #54a708 | Deciduous Needleleaf Forests | dominated by deciduous needleleaf (larch) trees (canopy >2m). Tree cover >60%. |
4 | #78d203 | Deciduous Broadleaf Forests | dominated by deciduous broadleaf trees (canopy >2m). Tree cover >60%. |
5 | #009900 | Mixed Forests | dominated by neither deciduous nor evergreen (40-60% of each) tree type (canopy >2m). Tree cover >60%. |
6 | #c6b044 | Closed Shrublands | dominated by woody perennials (1-2m height) >60% cover. |
7 | #dcd159 | Open Shrublands | dominated by woody perennials (1-2m height) 10-60% cover. |
8 | #dade48 | Woody Savannas | tree cover 30-60% (canopy >2m). |
9 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). |
10 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). |
11 | #27ff87 | Permanent Wetlands | permanently inundated lands with 30-60% water cover and >10% vegetated cover. |
12 | #c24f44 | Croplands | at least 60% of area is cultivated cropland. |
13 | #a5a5a5 | Urban and Built-up Lands | at least 30% impervious surface area including building materials, asphalt and vehicles. |
14 | #ff6d4c | Cropland/Natural Vegetation Mosaics | mosaics of small-scale cultivation 40-60% with natural tree, shrub, or herbaceous vegetation. |
15 | #69fff8 | Permanent Snow and Ice | at least 60% of area is covered by snow and ice for at least 10 months of the year. |
16 | #f9ffa4 | Barren | at least 60% of area is non-vegetated barren (sand, rock, soil) areas with less than 10% vegetation. |
17 | #1c0dff | Water Bodies | at least 60% of area is covered by permanent water bodies. |
## Color palette of Land Cover Type 1 (LC_Type1_mode)
# library() stops with an error when a package is missing; require() only
# warns and returns FALSE, so the failure would surface later and further
# away from its cause.
library(scales)
scales::show_col(Soil_pal$Color)
# preserve the order of levels as in the Soil_pal dataframe
#levels(Soil_pal$`Soil Name`) <- Soil_pal$`Soil Name`
#Create a custom color scale
# NOTE(review): nothing in this section calls an RColorBrewer function;
# the package is still attached in case later code relies on it.
library(RColorBrewer)
# Named vector (name = Soil Name, value = hex colour) used as the manual
# colour scale for every land-cover plot.
colors <- Soil_pal$Color
names(colors) <- Soil_pal$`Soil Name`
colScale <- scale_colour_manual(name = "Soil Name", values = colors)
# Print the name -> colour mapping.
as.data.frame(colors)
## colors
## Evergreen Needleleaf Forests #05450a
## Evergreen Broadleaf Forests #086a10
## Deciduous Needleleaf Forests #54a708
## Deciduous Broadleaf Forests #78d203
## Mixed Forests #009900
## Closed Shrublands #c6b044
## Open Shrublands #dcd159
## Woody Savannas #dade48
## Savannas #fbff13
## Grasslands #b6ff05
## Permanent Wetlands #27ff87
## Croplands #c24f44
## Urban and Built-up Lands #a5a5a5
## Cropland/Natural Vegetation Mosaics #ff6d4c
## Permanent Snow and Ice #69fff8
## Barren #f9ffa4
## Water Bodies #1c0dff
# Altitude map: one point per square, coloured by altitude.
p1 <- Train %>%
  distinct(Square_ID, .keep_all = TRUE) %>%
  ggplot() +
  aes(x = Longitude, y = Latitude, colour = `Altitude (m)`) +
  geom_point(size = 5) +
  scale_colour_gradientn(colours = terrain.colors(10))

## add box to urban zone
# Corners of the five boxes, one row per box.
urban_zones <- data.frame(
  xmin = c(34.95, 35.05, 34.8, 35.39, 35.38),
  xmax = c(35.1, 35.15, 34.88, 35.41, 35.4),
  ymin = c(-15.71, -16.52, -15.98, -15.77, -15.24),
  ymax = c(-15.87, -16.64, -16.29, -15.85, -15.3)
)
# A single layer with its own five-row data set. Writing the corners as
# constants inside aes() on a layer that inherits the plot data draws each
# box once per data row; giving the layer its own data draws each box once.
# inherit.aes = FALSE is required because `urban_zones` has none of the
# columns used by the plot-level mapping.
urban_zone_boxes <- geom_rect(
  data = urban_zones,
  mapping = aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
  inherit.aes = FALSE,
  fill = "transparent", color = "black", size = 0.5
)

p1bis <- p1 +
  urban_zone_boxes +
  ggtitle("Area Altitude distibution and Urban zone localisation")

# Land-cover map: one point per square, coloured by Soil Name through the
# shared manual colour scale.
p2 <- Train %>%
  distinct(Square_ID, .keep_all = TRUE) %>%
  left_join(Soil_pal, by = "LC_Type1_mode") %>%
  #filter(LC_Type1_mode == 13) %>%
  ggplot() +
  aes(x = Longitude, y = Latitude, colour = `Soil Name`) +
  geom_point(size = 2) +
  colScale +
  #scale_colour_manual(values = unique(full$Color)) +
  urban_zone_boxes +
  ggtitle("Soil Name distibution and Urban zone localisation")

# NOTE(review): only p1bis is printed in this section; p2 is built but never
# displayed here -- confirm whether it should be shown next to p1bis.
p1bis
We obtain plots similar to the Elevation
and Landcover
plots from the bibliography.
The altitude plot shows lower altitudes in the dark green area in the south of Malawi.
The right plot shows the Soil Name distribution. The urban zones are shown in grey. The largest urban zone is in the centre of the region, located in Croplands
Soil Name and at relatively high altitude (more than 1000 m, yellow). The four other urban zones are smaller and are located in green areas, which can be more exposed to flooding.
## omit all text, axes background from plot for image segmentation if any
# Theme that strips borders, grid, axes, legend and title so that only the
# plotted points remain.
omit <- theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    legend.position = "none",
    plot.title = element_blank()
  )

# Bare altitude raster-like picture: dark blue (low) to white (high).
soil <- ggplot(Train) +
  aes(x = Longitude, y = Latitude, colour = `Altitude (m)`) +
  geom_point(size = 6) +
  scale_colour_gradient(low = "darkblue", high = "white") +
  omit

# Save image
# No width/height is given, so ggsave() uses the size of the current device.
invisible(ggsave("Soil.jpeg", plot = soil, dpi = 300))
## Saving 14 x 8 in image
# Reshape the weekly precipitation columns to long format (one row per
# square and week) and derive the per-row features used downstream.
new_train <- Train %>%
  # Select the weekly columns by name instead of by position: a positional
  # range such as 8:ncol(Train) silently picks the wrong columns as soon as
  # the column order changes, and it reads the global `Train` rather than
  # the data flowing through the pipe.
  # gather() stacks the columns in their original order, so rows stay
  # chronological within each square as long as the weekly columns are
  # chronological in `Train`; cumsum() below relies on that.
  gather(key = `Week of`, value = Pluviometry, starts_with("precip")) %>%
  mutate(
    # The first date in the column name is the start of the week.
    `Week of` = as.Date(
      str_extract(`Week of`, "\\d{4}-\\d{2}-\\d{2}"),
      format = "%Y-%m-%d"
    ),
    Week = lubridate::week(`Week of`),
    # Weeks up to 2015-03-15 belong to the 2015 flood season, the rest to 2019.
    Year = if_else(`Week of` <= as.Date("2015-03-15"), 2015, 2019)
  ) %>%
  # Running rainfall total per square within each season.
  group_by(Square_ID, Year) %>%
  mutate(cum_Pluvio = cumsum(Pluviometry)) %>%
  ungroup() %>%
  # The flood target is only known for 2015; 2019 rows get 0 as placeholder.
  mutate(Target = if_else(Year == 2015, target_2015, 0)) %>%
  left_join(Soil_pal, by = "LC_Type1_mode") %>%
  rename(Soil_type = LC_Type1_mode) %>%
  mutate(
    `Target Range` = cut(
      Target,
      breaks = c(-Inf, 0, 0.25, 0.5, 0.75, 1),
      labels = c("0", "Low", "Middle", "High", "Sure")
    ),
    # NOTE(review): altitudes outside (44, 2804] become NA here -- confirm
    # these bounds cover the whole data set.
    Height = cut(
      `Altitude (m)`,
      breaks = c(44, 100, 400, 650, 750, 1000, 2300, 2804),
      labels = c("R1", "L1", "L2", "L3", "L4", "L5", "L6")
    )
  ) %>%
  select(-target_2015)

# Scrollable preview of the first rows.
new_train %>%
  head() %>%
  knitr::kable() %>%
  kable_styling() %>%
  scroll_box(width = "1000px", height = "1000px")
Longitude | Latitude | Altitude (m) | Soil_type | Square_ID | other | Week of | Pluviometry | Week | Year | cum_Pluvio | Target | Color | Soil Name | Description | Target Range | Height |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
34.26 | -15.91 | 887.7642 | 9 | 4e3c3896 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L4 |
34.26 | -15.90 | 743.4039 | 9 | 4e3c3897 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L3 |
34.26 | -15.89 | 565.7283 | 9 | 4e3c3898 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L2 |
34.26 | -15.88 | 443.3928 | 10 | 4e3c3899 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L2 |
34.26 | -15.87 | 437.4434 | 10 | 4e3c389a | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L2 |
34.26 | -15.86 | 405.6317 | 10 | 4e3c389b | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L2 |
# Replace the first altitude binning with a finer one that isolates the
# narrow "River" altitude bands, then override it for water-related land
# cover and for L2 squares that were flooded in 2015.
new_train <- new_train %>%
  mutate(
    Height = as.character(cut(
      `Altitude (m)`,
      breaks = c(44, 100, 465, 480, 700, 715, 750, 1000, 2300, 2804),
      labels = c("River1", "L1", "River2", "L2", "L3", "L4", "L5", "L6", "L7")
    ))
  ) %>%
  mutate(
    # %in% never yields NA, so a row whose land-cover code has no match in
    # Soil_pal (NA `Soil Name`) keeps its altitude class. With `==` inside
    # if_else() such a row would have its Height replaced by NA.
    Height = case_when(
      `Soil Name` %in% "Permanent Wetlands" ~ "Wetlands",
      `Soil Name` %in% "Water Bodies" ~ "Water",
      Height == "L2" & Target >= 0.7 ~ "River3",
      TRUE ~ Height
    )
  )

## Add River3 to Height L2 for 2019
# Target is only set for 2015 rows, so the squares tagged River3 in 2015 are
# collected and the same tag is copied onto their 2019 rows.
SquareID_River3 <- new_train %>%
  filter(Year == 2015, Height == "River3") %>%
  distinct(Square_ID, .keep_all = FALSE)
new_train <- new_train %>%
  mutate(
    Height = if_else(
      Year == 2019 & Height == "L2" &
        Square_ID %in% SquareID_River3$Square_ID,
      "River3",
      Height
    )
  )

# Scrollable preview of the first rows.
new_train %>%
  head() %>%
  knitr::kable() %>%
  kable_styling() %>%
  scroll_box(width = "1000px", height = "1000px")
Longitude | Latitude | Altitude (m) | Soil_type | Square_ID | other | Week of | Pluviometry | Week | Year | cum_Pluvio | Target | Color | Soil Name | Description | Target Range | Height |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
34.26 | -15.91 | 887.7642 | 9 | 4e3c3896 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L5 |
34.26 | -15.90 | 743.4039 | 9 | 4e3c3897 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L4 |
34.26 | -15.89 | 565.7283 | 9 | 4e3c3898 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L2 |
34.26 | -15.88 | 443.3928 | 10 | 4e3c3899 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 |
34.26 | -15.87 | 437.4434 | 10 | 4e3c389a | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 |
34.26 | -15.86 | 405.6317 | 10 | 4e3c389b | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 |
# Weekly rainfall (blue), cumulative rainfall (red) and a smoothed
# cumulative trend (black), one panel per season.
ggplot(new_train, aes(x = `Week of`)) +
  geom_line(aes(y = Pluviometry), color = "blue") +
  geom_line(aes(y = cum_Pluvio), color = "red") +
  geom_smooth(
    aes(y = cum_Pluvio),
    method = "gam",
    formula = y ~ s(x, bs = "cs"),
    color = "black"
  ) +
  ylim(0, 400) +
  facet_wrap(Year ~ ., ncol = 2, scales = "free")
# Log-count of 2015 rows per (land cover, altitude class), split into rows
# with and without flood risk.
new_train %>%
  #distinct(Square_ID, .keep_all = TRUE) %>%
  filter(Year == 2015) %>%
  mutate(
    wrap_mode = as.factor(
      if_else(Target == 0, "No Risk 2015", "With Risk 2015")
    )
  ) %>%
  group_by(Soil_type, Height, `Soil Name`, wrap_mode) %>%
  summarise(Frequency = log(n())) %>%
  ungroup() %>%
  # One x-axis category per altitude class / land-cover code pair.
  mutate(Soil_Height = as.factor(paste0(Height, Soil_type))) %>%
  ggplot() +
  aes(x = Soil_Height, y = Frequency, fill = `Soil Name`) +
  geom_col() +
  facet_grid(wrap_mode ~ .) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(
      color = "black", size = 10, hjust = .5, vjust = .5, angle = 50
    )
  )
# Weight of each risk class inside every (altitude class, land cover) pair:
# the log-count of the class divided by the sum of the log-counts of the
# pair.
SoilHeight_weight <- new_train %>%
  #filter(Year == 2015) %>%
  mutate(
    wrap_mode = as.factor(
      if_else(Target == 0, "No Risk 2015", "With Risk 2015")
    )
  ) %>%
  group_by(Soil_type, Height, `Soil Name`, wrap_mode) %>%
  summarise(Frequency = log(n())) %>%
  ungroup() %>%
  mutate(Soil_Height = as.factor(paste0(Height, Soil_type))) %>%
  group_by(Soil_Height) %>%
  #group_by(wrap_mode, add=TRUE) %>%
  mutate(SoilHeight_Weight = Frequency / sum(Frequency)) %>%
  ungroup() %>%
  select(Height, Soil_type, SoilHeight_Weight, wrap_mode) %>%
  arrange(Height)

# Attach the weight to every row through its altitude class, land-cover
# code and risk class.
new_train <- new_train %>%
  mutate(
    wrap_mode = as.factor(
      if_else(Target == 0, "No Risk 2015", "With Risk 2015")
    )
  ) %>%
  left_join(SoilHeight_weight, by = c("Height", "Soil_type", "wrap_mode"))

# Scrollable preview of the first rows.
new_train %>%
  head() %>%
  knitr::kable() %>%
  kable_styling() %>%
  scroll_box(width = "1000px", height = "1000px")
Longitude | Latitude | Altitude (m) | Soil_type | Square_ID | other | Week of | Pluviometry | Week | Year | cum_Pluvio | Target | Color | Soil Name | Description | Target Range | Height | wrap_mode | SoilHeight_Weight |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
34.26 | -15.91 | 887.7642 | 9 | 4e3c3896 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L5 | No Risk 2015 | 0.6162409 |
34.26 | -15.90 | 743.4039 | 9 | 4e3c3897 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L4 | No Risk 2015 | 0.6578095 |
34.26 | -15.89 | 565.7283 | 9 | 4e3c3898 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L2 | No Risk 2015 | 0.5901199 |
34.26 | -15.88 | 443.3928 | 10 | 4e3c3899 | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 | No Risk 2015 | 0.5949141 |
34.26 | -15.87 | 437.4434 | 10 | 4e3c389a | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 | No Risk 2015 | 0.5949141 |
34.26 | -15.86 | 405.6317 | 10 | 4e3c389b | 14ce-11ea-bce5-f49634744a41 | 2014-11-16 | 0 | 46 | 2015 | 0 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 | No Risk 2015 | 0.5949141 |
# Map of the altitude classes (one point per square, 2015) with the rows
# that carry a flood risk drawn on top as small dots.
new_train %>%
  filter(Year == 2015) %>%
  distinct(Square_ID, .keep_all = TRUE) %>%
  ggplot(aes(x = Longitude, y = Latitude)) +
  geom_point(aes(color = Height)) +
  geom_point(
    data = subset(new_train, Target > 0),
    aes(colour = `Target Range`, shape = `Target Range`),
    size = 0.8, stroke = 0, shape = 16
  ) +
  #scale_shape_manual(values=c(3, 16, 17, 23), labels = c("0.25", "0.5", "0.75", "1")) +
  # NOTE(review): `values` and `labels` are given without names and are
  # therefore tied to the order of the scale's levels -- confirm the order
  # whenever a Height or Target Range level is added or missing.
  scale_colour_manual(
    values = c(
      "blue", "green", "lightgreen", "pink", "gold", "orange",
      "darkorange1", "chocolate", "cyan", "lightblue", "darkseagreen1",
      "darkseagreen1", "darkorchid", "red", "azure2", "burlywood1"
    ),
    labels = c(
      "0.75", "L1", "L2", "L3", "L4", "L5", "L6", "L7", "0.25", "0.5",
      "River1", "River2", "River3", "1", "Water", "Wetlands"
    ),
    name = "Height/Target"
  ) +
  ggtitle("Map Altitudes and Flood Risks: 7 Height levels and 4 Probabilities range risks")
# Number of squares per Target Range in 2015, with the no-risk class shown
# in its own panel because of its much larger count.
new_train %>%
  filter(Year == 2015) %>%
  distinct(Square_ID, .keep_all = TRUE) %>%
  mutate(
    wrap_mode = as.factor(
      if_else(Target == 0, "No Risk 2015", "With Risk 2015")
    )
  ) %>%
  # count() is group_by() + summarise(n = n()) + ungroup() in one step.
  count(`Target Range`, wrap_mode) %>%
  rename(Frequency = n) %>%
  ggplot(aes(x = `Target Range`, y = Frequency, colour = `Target Range`)) +
  geom_point() +
  facet_wrap(wrap_mode ~ ., ncol = 2, scales = "free") +
  scale_colour_manual(
    values = terrain.colors(6),
    labels = c(0, 0.25, 0.5, 0.75, 1)
  ) +
  ggtitle("Flood Risk Distribution")
We note:
The dataset is unbalanced: class 0
(without risk) dominates. We can reduce the size of class 0
by omitting the Water Bodies
area and the highest altitudes (>2300 m).
The area in the High risk range [0.75, 1] is the smallest class, followed by Middle, with probability in [0.5, 0.75].
The area with no risk covers the largest surface during the 2015 flood.
We expect these values to increase for the areas with risk if the pluviometry is higher during the 2019 flood.
# balanced_train <- new_train %>%
# filter(Year == 2015) %>%
# #filter(Target== 0) %>%
# group_by(`Target Range`, Height) %>%
# sample_n(1000, replace = TRUE) %>%
# ungroup() %>%
# select(Longitude, Latitude, `Target Range`, `Altitude (m)`, Soil_type, Pluviometry, cum_Pluvio, Target, Height, SoilHeight_Weight)
# #filter(Target > 0) %>%
# # group_by(Height) %>%
# #summarise(n())
#
#
# balanced_train %>%
# mutate(wrap_mode = as.factor(if_else(Target == 0, "No Risk 2015", "With Risk 2015"))) %>%
# group_by(`Target Range`, wrap_mode) %>%
# summarise(Frequency = n()) %>%
# ungroup() %>%
# ggplot +
# aes(x = `Target Range`, y = Frequency, colour = `Target Range`) +
# geom_point()+
# facet_wrap(wrap_mode~ ., ncol = 2 , scales = "free") +
# scale_colour_manual(values = terrain.colors(6),labels = c( 0, 0.25, 0.5, 0.75, 1))+
# ggtitle("Flood Risk Distribution (Balanced)")
# Squares that have at least one 2014-2015 record.
Square_ID_F <- new_train %>%
  filter(`Week of` <= as.Date("2015-03-15")) %>%
  #filter(Target > 0) %>%
  select(Square_ID) %>%
  unique()

# Corners of the five urban-zone boxes, one row per box.
urban_zones <- data.frame(
  xmin = c(34.95, 35.05, 34.8, 35.39, 35.38),
  xmax = c(35.1, 35.15, 34.88, 35.41, 35.4),
  ymin = c(-15.71, -16.52, -15.98, -15.77, -15.24),
  ymax = c(-15.87, -16.64, -16.29, -15.85, -15.3)
)
# One layer with its own data: constants inside aes() on a layer that
# inherits the plot data would redraw every box once per data row.
# The data has no Year column, so the boxes appear in both panels.
urban_zone_boxes <- geom_rect(
  data = urban_zones,
  mapping = aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
  inherit.aes = FALSE,
  fill = "transparent", color = "black", size = 0.5
)

# End-of-season cumulative rainfall per square (last week of each year),
# with point size showing the 2015 flood target.
new_train %>%
  filter(Square_ID %in% Square_ID_F$Square_ID) %>%
  group_by(Year) %>%
  filter(`Week of` == max(`Week of`)) %>%
  ggplot() +
  aes(x = Longitude, y = Latitude) +
  geom_point(aes(colour = cum_Pluvio, size = Target)) + # , size = target_2015
  # `values` needs one position per colour; the original passed six
  # positions for five colours. Five evenly spaced descending positions keep
  # the same reversed linear gradient (highest value red, lowest white).
  scale_colour_gradientn(
    colours = c("red", "darkblue", "blue", "lightblue", "white"),
    values = c(1, 0.75, 0.5, 0.25, 0)
  ) +
  #geom_point(data = subset(new_train, is.na(Target)), aes(size = 0, colour = cum_Pluvio)) +
  scale_size(name = "Target", breaks = c(0, 0.5, 1, 2)) +
  facet_wrap(Year ~ ., ncol = 2, scales = "free") +
  urban_zone_boxes +
  ggtitle("Flood Zone during 2015 and Cumulative Pluviometry Maps ")
Based on cumulative pluviometry alone, we expect a larger flooded surface in 2019 than in 2015.
The cumulative pluviometry of the same zone in 2019 is about twice that of 2015.
The flood appears more invasive: the water will reach higher altitudes and cover more area.
# library(randomForest)
# train_2015 <- balanced_train %>%
# #filter(Year == 2015) %>%
# rename(Altitude = `Altitude (m)`) %>%
# mutate(Altitude = log(Altitude)) %>%
# mutate(Height = as.numeric(as.factor(Height))) %>%
# select(Longitude, Latitude, Altitude, Target, Soil_type, cum_Pluvio, Pluviometry, Height, SoilHeight_Weight)
#
#
# rf_model <- randomForest(Target ~ Longitude +Latitude + Soil_type + cum_Pluvio + Altitude + Pluviometry + Height + SoilHeight_Weight +
# Target_Range ,
# data = train_2015,
# ntree=500)
#
# load("rf_model.RData")
# save(rf_model, file ="rf_model.RData")
#
# summary(rf_model)
The flood zone probability is highly correlated with Cumulative Pluviometry, Soil Name, and Altitude.
Predict the flood zone probability for 2019.
# train_2019 <- new_train %>%
# filter(Year == 2019) %>%
# rename(Altitude = `Altitude (m)`) %>%
# mutate(Altitude = log(Altitude)) %>%
# mutate(Height = as.numeric(as.factor(Height))) %>%
# select(Longitude, Latitude,Altitude, -Target, Soil_type, cum_Pluvio, Pluviometry, Height, SoilHeight_Weight)
# # select(-Target, Altitude, Soil_type, cum_Pluvio, Pluviometry)
#
# target_2019 <- round(predict(rf_model, train_2019, type="response"), digits = 2)
#
#
#
# summary(target_2019)
#
#
# new_train[new_train$Year == 2019,]$Target <- target_2019
# new_train %>%
# ggplot() +
# aes(x = Longitude, y = Latitude) + #, colour = cum_Pluvio
# geom_point(aes(colour = Target, size = Target)) +
# #scale_colour_gradient(low = "blue", high = "red") +
# scale_colour_gradientn(colours = c("red","darkblue","blue","lightblue","white"),
# values = c(1.0,0.75,0.5,0.25,0)) +
# scale_size(name = "Target",
# breaks = c(0, 0.25, 0.5, 0.75, 1)) +
# facet_wrap(Year ~., ncol = 2) +
# geom_rect(aes(xmin = 34.95, xmax = 35.1, ymin = -15.71, ymax =-15.87),
# fill = "transparent", color = "black", size = 0.5) +
# geom_rect(aes(xmin = 35.05, xmax = 35.15, ymin = -16.52, ymax =-16.64),
# fill = "transparent", color = "black", size = 0.5) +
# geom_rect(aes(xmin = 34.8, xmax = 34.88, ymin = -15.98, ymax =-16.29),
# fill = "transparent", color = "black", size = 0.5) +
# geom_rect(aes(xmin = 35.39, xmax = 35.41, ymin = -15.77, ymax =-15.85),
# fill = "transparent", color = "black", size = 0.5) +
# geom_rect(aes(xmin = 35.38, xmax = 35.4, ymin = -15.24, ymax =-15.3),
# fill = "transparent", color = "black", size = 0.5)+
# ggtitle("Flood zone prediction (RF) during 2019: Prob = ft(Cumulative Pluviometry, Altitude, Pluviometry, Soil_type)")
The Random Forest model does not give a good prediction. We expect a larger flood zone than in the 2015 flood.
The disaster appears more invasive: almost all of the south will be flooded, except zones with an altitude above 2000 m.
Very time consuming!
# NOTE(review): in this section caret is referenced only by the
# commented-out training code below (createDataPartition, caret::train,
# trainControl); require() merely warns if the package is missing.
require(caret)
# set.seed(42)
# index <- createDataPartition(train_2015$Target, p = 0.7, list = FALSE)
# train_data <- train_2015[index, ]
# test_data <- train_2015[-index, ]
#
# set.seed(42)
# model_rf <- caret::train(Target ~ .,
# data = train_data,
# method = "rf",
# metric = "RMSE",
# preProcess = c("scale", "center"),
# trControl = trainControl(method = "repeatedcv",
# number = 2,
# repeats = 1,
# verboseIter = TRUE)
# )
#
#
# save(model_rf, file ="model_caret_rf.RData")
#
# final <- data.frame(actual = as.factor(as.character(test_data$Target)),
# pred = predict(model_rf, newdata = test_data, type = "raw"))
#
# #final$predict <- as.factor(ifelse(final$pred >= 0.75, 1, 0))
#
# #cm_original <- confusionMatrix(final$predict, final$actual)
#
# target_2019 <- round(predict(model_rf, train_2019, type="raw"), digits = 2)
#
#
#
# summary(target_2019)
#
#
# #new_train[new_train$Year == 2019,]$Target <- target_2019
# Add a binary flood label (1 when Target >= 0.5) and per-square rainfall
# summaries computed over all of the square's weekly rows.
new_train01 <- new_train %>%
  mutate(binTarget = as.factor(if_else(Target >= 0.5, 1, 0))) %>%
  group_by(Square_ID) %>%
  mutate(
    meanPluv = mean(Pluviometry),
    medianPluv = median(Pluviometry),
    maxPluv = max(Pluviometry)
  ) %>%
  ungroup()

# Class balance of the binary label among the 2015 rows.
new_train01 %>%
  filter(Year == 2015) %>%
  group_by(binTarget) %>%
  summarise(nbr = n()) %>%
  ggplot(aes(x = binTarget, y = nbr, fill = binTarget)) +
  geom_col()
# Map of the altitude classes (one point per square, 2015) with the binary
# flood label drawn on top as a small dot.
new_train01 %>%
  #mutate(Target = as.factor(Target)) %>%
  filter(Year == 2015) %>%
  distinct(Square_ID, .keep_all = TRUE) %>%
  ggplot(aes(x = Longitude, y = Latitude)) +
  geom_point(aes(color = Height)) +
  geom_point(
    #data = subset(new_train, Target == 1),
    aes(color = binTarget, shape = binTarget),
    size = 0.6, stroke = 0, shape = 19
  ) +
  # NOTE(review): `values` and `labels` are given without names and are
  # therefore tied to the order of the scale's levels -- confirm the order
  # whenever a level is added or missing.
  scale_colour_manual(
    values = c(
      "lightblue", "red", "green", "lightgreen", "pink", "gold", "orange",
      "darkorange1", "chocolate", "darkseagreen1", "darkseagreen1",
      "darkorchid", "azure2", "burlywood1"
    ),
    labels = c(
      "0", "1", "L1", "L2", "L3", "L4", "L5", "L6", "L7",
      "River1", "River2", "River3", "Water", "Wetlands"
    ),
    name = "Height/Target"
  )
# Copy each square's 2015 `Target Range` onto its 2019 rows.
# Step 1: lookup table with one row per square holding its 2015 range
# (distinct() keeps the first 2015 row of every Square_ID).
Target_Range_vs_XY_2015 <- new_train01 %>%
filter(Year == 2015) %>%
distinct(Square_ID, .keep_all = TRUE) %>%
select(Square_ID,`Target Range`)
# Step 2: the 2019 rows without their own `Target Range` column, so the join
# below does not produce suffixed duplicate columns.
Train_2019 <- new_train01 %>%
filter(Year == 2019) %>%
select(-`Target Range`)
# Step 3: attach the 2015 range to every 2019 row by square id.
Train_2019_TR <- Train_2019 %>%
left_join(Target_Range_vs_XY_2015, by = "Square_ID")
# Step 4: positional write-back. This relies on Train_2019_TR having exactly
# the 2019 rows of new_train01 in the same order: the left join keeps the
# left-hand row order and the lookup has a single row per Square_ID, so no
# rows are added or reordered.
new_train01[new_train01$Year == 2019,]$`Target Range` <- Train_2019_TR$`Target Range`
# Scrollable preview of the first 2019 rows.
new_train01[new_train01$Year == 2019,] %>% head %>% knitr::kable() %>% kable_styling() %>%
scroll_box(width = "1000px", height = "1000px")
Longitude | Latitude | Altitude (m) | Soil_type | Square_ID | other | Week of | Pluviometry | Week | Year | cum_Pluvio | Target | Color | Soil Name | Description | Target Range | Height | wrap_mode | SoilHeight_Weight | binTarget | meanPluv | medianPluv | maxPluv |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
34.26 | -15.91 | 887.7642 | 9 | 4e3c3896 | 14ce-11ea-bce5-f49634744a41 | 2019-01-20 | 12.99262 | 3 | 2019 | 12.99262 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L5 | No Risk 2015 | 0.6162409 | 0 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.90 | 743.4039 | 9 | 4e3c3897 | 14ce-11ea-bce5-f49634744a41 | 2019-01-20 | 12.99262 | 3 | 2019 | 12.99262 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L4 | No Risk 2015 | 0.6578095 | 0 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.89 | 565.7283 | 9 | 4e3c3898 | 14ce-11ea-bce5-f49634744a41 | 2019-01-20 | 12.99262 | 3 | 2019 | 12.99262 | 0 | #fbff13 | Savannas | tree cover 10-30% (canopy >2m). | 0 | L2 | No Risk 2015 | 0.5901199 | 0 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.88 | 443.3928 | 10 | 4e3c3899 | 14ce-11ea-bce5-f49634744a41 | 2019-01-20 | 12.99262 | 3 | 2019 | 12.99262 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 | No Risk 2015 | 0.5949141 | 0 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.87 | 437.4434 | 10 | 4e3c389a | 14ce-11ea-bce5-f49634744a41 | 2019-01-20 | 12.99262 | 3 | 2019 | 12.99262 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 | No Risk 2015 | 0.5949141 | 0 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.86 | 405.6317 | 10 | 4e3c389b | 14ce-11ea-bce5-f49634744a41 | 2019-01-20 | 12.99262 | 3 | 2019 | 12.99262 | 0 | #b6ff05 | Grasslands | dominated by herbaceous annuals (<2m). | 0 | L1 | No Risk 2015 | 0.5949141 | 0 | 12.78804 | 4.689434 | 58.36246 |
# Modelling table for the 2014-2015 season, written to disk as CSV.
dataset2015 <- new_train01 %>%
  # Keep the modelling columns, renaming while selecting (the column order
  # is the order listed here).
  select(
    Longitude, Latitude,
    weeks = `Week of`,
    Week,
    Altitude = `Altitude (m)`,
    Soil_type, Square_ID, Pluviometry, cum_Pluvio, Target, binTarget,
    Target_Range = `Target Range`,
    Height, SoilHeight_Weight, meanPluv, medianPluv, maxPluv
  ) %>%
  filter(weeks < "2015-03-30") %>%
  mutate(
    Altitude = log(Altitude),
    # Factor codes shifted to start at 0.
    Target_Range = as.numeric(Target_Range) - 1,
    # Height is re-encoded from the labels present in this subset, again
    # starting at 0.
    Height = as.numeric(as.factor(Height)) - 1
  )

fwrite(dataset2015, "dataset2015.csv")

# Scrollable preview of the first rows.
dataset2015 %>%
  head() %>%
  knitr::kable() %>%
  kable_styling() %>%
  scroll_box(width = "1000px", height = "1000px")
Longitude | Latitude | weeks | Week | Altitude | Soil_type | Square_ID | Pluviometry | cum_Pluvio | Target | binTarget | Target_Range | Height | SoilHeight_Weight | meanPluv | medianPluv | maxPluv |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
34.26 | -15.91 | 2014-11-16 | 46 | 6.788706 | 9 | 4e3c3896 | 0 | 0 | 0 | 0 | 0 | 4 | 0.6162409 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.90 | 2014-11-16 | 46 | 6.611239 | 9 | 4e3c3897 | 0 | 0 | 0 | 0 | 0 | 3 | 0.6578095 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.89 | 2014-11-16 | 46 | 6.338114 | 9 | 4e3c3898 | 0 | 0 | 0 | 0 | 0 | 1 | 0.5901199 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.88 | 2014-11-16 | 46 | 6.094456 | 10 | 4e3c3899 | 0 | 0 | 0 | 0 | 0 | 0 | 0.5949141 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.87 | 2014-11-16 | 46 | 6.080947 | 10 | 4e3c389a | 0 | 0 | 0 | 0 | 0 | 0 | 0.5949141 | 12.78804 | 4.689434 | 58.36246 |
34.26 | -15.86 | 2014-11-16 | 46 | 6.005446 | 10 | 4e3c389b | 0 | 0 | 0 | 0 | 0 | 0 | 0.5949141 | 12.78804 | 4.689434 | 58.36246 |
# Map of cumulative pluviometry (cum_Pluvio) at each Longitude/Latitude point,
# one facet per `Week of`, restricted to weeks up to 2015-04-15.
new_train %>%
  filter(`Week of` <= "2015-04-15") %>%
  ggplot() +
  aes(x = Longitude, y = Latitude, colour = cum_Pluvio) +
  geom_point(size = 4) +
  # geom_point(aes(x = Longitude, y = Latitude, colour = Target)) + # data = subset(new_train[new_train$Target > 0.75,]),
  # scale_colour_gradient(low = "blue", high = "red") +
  # scale_colour_gradientn(colours = rainbow(3)) +
  # `values` must supply exactly one position (on the rescaled 0-1 range) per
  # entry of `colours`: five colours, five evenly spaced anchors, white at
  # the lowest cum_Pluvio and red at the highest.
  scale_colour_gradientn(
    colours = c("red", "darkblue", "blue", "lightblue", "white"),
    values = c(1, 0.75, 0.5, 0.25, 0)
  ) +
  # scale_colour_gradientn(colours = c("blue", "lightblue", "white"),
  #                        values = c(1.0, 0.5, 0)) +
  facet_wrap(`Week of` ~ ., ncol = 4) +
  ggtitle("Cumulative Pluviometry of 2015 before Flood")
# Map of cumulative pluviometry (cum_Pluvio) at each Longitude/Latitude point,
# one facet per `Week of`, restricted to weeks after 2015-03-15.
new_train %>%
  filter(`Week of` > "2015-03-15") %>%
  ggplot() +
  aes(x = Longitude, y = Latitude, colour = cum_Pluvio) +
  geom_point(size = 4) +
  # scale_colour_gradient(low = "blue", high = "red") +
  # `values` must supply exactly one position (on the rescaled 0-1 range) per
  # entry of `colours`: five colours, five evenly spaced anchors, white at
  # the lowest cum_Pluvio and red at the highest.
  scale_colour_gradientn(
    colours = c("red", "darkblue", "blue", "lightblue", "white"),
    values = c(1, 0.75, 0.5, 0.25, 0)
  ) +
  facet_wrap(`Week of` ~ ., ncol = 4) +
  ggtitle("Cumulative Pluviometry of 2019 before Flood")
The training period (2014-2015) and the test period (2019) have the same length (17 weeks) but do not cover the same season (Nov 2014 to Mar 2015 versus Jan 2019 to May 2019).
Pluviometry during 2019 is twice the pluviometry during 2015.
We used Google Earth Engine to view the pluviometry before the 2015 and 2019 floods around the south of Malawi.
# require(gganimate)
#
# goo <- new_train %>%
# filter(`Week of` <= "2015-03-15") %>%
# ggplot() +
# aes(x = Longitude, y = Latitude, colour = cum_Pluvio) +
# geom_point(size = 4) +
# #scale_colour_gradient(low = "blue", high = "red") +
# scale_colour_gradientn(colours = c("blue","lightblue","white"),
# values = c(1.0,0.5,0)) +
# #facet_wrap(`Week of` ~., ncol = 3 ) +
# transition_time(`Week of`) +
# labs(title = "Day: {frame_time}")
#
# # foo <- new_train %>%
# # filter(`Week of` > "2015-03-15") %>%
# # ggplot() +
# # aes(x = Longitude, y = Latitude, colour = Pluviometry) +
# # geom_point(size = 4) +
# # #facet_wrap(`Week of` ~., ncol = 3 ) +
# # transition_time(`Week of`) +
# # labs(title = "Day: {frame_time}")
#
#
# ## get and save animation
# nframes <- length(unique(goo$data$`Week of`)) * 7
# ani_goo <- animate(goo, nframes = nframes, fps = 10)
# png_files <- list.files(".", pattern = ".*png$", full.names = TRUE)
# require(gifski)
# #gifski(png_files, gif_file = "ani_goo.gif", width = 800, height = 600, delay = 1)
# #invisible(file.remove(png_files, ))
#
# #animate(goo, renderer = ffmpeg_renderer(format = "webm"))
# #animate(foo, renderer = ffmpeg_renderer(format = "webm"))
# require(plotly)
#
# new_train %>%
# filter(`Week of` < "2014-12-14") %>%
# plot_ly(
# x = ~Longitude,
# y = ~Latitude,
# #size = ~pop,
# color = ~Pluviometry,
# frame = ~ as.Date(`Week of`),
# text = ~`Week of`,
# hoverinfo = "text",
# type = 'scatter',
# mode = 'markers'
# ) %>%
# layout(
# xaxis = list(
# type = "log"
# )
# )
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter
## Longitude Latitude weeks ... meanPluv medianPluv maxPluv
## 0 34.26 -15.91 2014-11-16 ... 12.788037 4.689434 58.362456
## 1 34.26 -15.90 2014-11-16 ... 12.788037 4.689434 58.362456
## 2 34.26 -15.89 2014-11-16 ... 12.788037 4.689434 58.362456
## 3 34.26 -15.88 2014-11-16 ... 12.788037 4.689434 58.362456
## 4 34.26 -15.87 2014-11-16 ... 12.788037 4.689434 58.362456
##
## [5 rows x 17 columns]
## Counter({0: 232917, 1: 19363, 4: 14178, 2: 8024, 3: 5440})
# Human-readable label for each Target_Range class code.
dic_range = {0: "0", 1: "Low", 2: "medium", 3: "High", 4: "Sure"}

# Bar chart of the number of rows per Target_Range class.
# Counter.values() comes back in first-seen order, which need not match the
# class codes, so each count is looked up by its code instead; a class that
# is absent from the data gets a zero-height bar rather than breaking the plot.
range_counts = Counter(train['Target_Range'])
class_codes = sorted(dic_range)
bar_positions = range(len(class_codes))
plt.bar(bar_positions, [range_counts.get(code, 0) for code in class_codes])
plt.xticks(bar_positions, [dic_range[code] for code in class_codes])
# show() takes no artist argument; draw first, then display.
plt.show()
# Rebalance the Target_Range classes so that every class ends up with as many
# rows as class 1, then stack them into `train_balanced`.

# One frame per Target_Range class (codes 0..4).
train0 = train[train.Target_Range == 0]
train1 = train[train.Target_Range == 1]
train2 = train[train.Target_Range == 2]
train3 = train[train.Target_Range == 3]
train4 = train[train.Target_Range == 4]
from sklearn.utils import resample

# Class 1 is kept as-is and sets the common per-class size
# (19363 rows in the run recorded in this document).
n_per_class = len(train1)

# Class 0 is larger than class 1 in the recorded counts: downsample it,
# drawing without replacement.
train_0_downsampled = resample(train0,
                               replace=False,
                               n_samples=n_per_class,
                               random_state=123)  # reproducible results
#
# Alternative (disabled): downsample class 1 to the size of class 2
#train_1_downsampled = resample(train1,
# replace=False, # sample without replacement
# n_samples=8024, # size of class 2
# random_state=123) # reproducible results

# Classes 2, 3 and 4 are smaller than class 1 in the recorded counts:
# upsample them, drawing with replacement.
train_2_upsampled = resample(train2,
                             replace=True,
                             n_samples=n_per_class,
                             random_state=123)  # reproducible results
train_3_upsampled = resample(train3,
                             replace=True,
                             n_samples=n_per_class,
                             random_state=123)  # reproducible results
train_4_upsampled = resample(train4,
                             replace=True,
                             n_samples=n_per_class,
                             random_state=123)  # reproducible results

# Stack the untouched class 1 with the resampled classes.
train_balanced = pd.concat([train_0_downsampled, train1, train_2_upsampled, train_3_upsampled, train_4_upsampled])
# Per-class row counts after balancing.
Counter(train_balanced['Target_Range'])
## Counter({0: 19363, 1: 19363, 2: 19363, 3: 19363, 4: 19363})
## Index(['Longitude', 'Latitude', 'weeks', 'Week', 'Altitude', 'Soil_type',
## 'Square_ID', 'Pluviometry', 'cum_Pluvio', 'Target', 'binTarget',
## 'Target_Range', 'Height', 'SoilHeight_Weight', 'meanPluv', 'medianPluv',
## 'maxPluv'],
## dtype='object')
# Label vector: the binTarget column of `train`.
ytrain = train['binTarget']

# Feature matrices: `train` / `test` minus the label columns, the date,
# the coordinates, the raw weekly rainfall and its summary statistics.
# NOTE(review): 'Target_Range' is left in the features while 'Target' and
# 'binTarget' are removed; if it is derived from the target this may leak the
# label (the validation RMSE logged further down falls to ~0.00075) - confirm.
# NOTE(review): 'Square_ID' is removed from xtrain only, not from xtest;
# verify that the two frames end up with the same columns.
xtrain = train.drop(
    columns=['Target', 'binTarget', 'Square_ID', 'weeks', 'Latitude',
             'Longitude', 'meanPluv', 'medianPluv', 'maxPluv', 'Pluviometry'])
xtest = test.drop(
    columns=['Target', 'binTarget', 'weeks', 'Latitude',
             'Longitude', 'meanPluv', 'medianPluv', 'maxPluv', 'Pluviometry'])
## Dummy variable levels is not better
# def prepare_data_for_model(raw_dataframe, target_columns, drop_first = True, make_na_col = False):
# # dummy all categorical fields
# dataframe_dummy = pd.get_dummies(raw_dataframe, columns=target_columns,
# drop_first=drop_first,
# dummy_na=make_na_col)
# return (dataframe_dummy)
#
#
# # create dummy features
# xtrain_dum = prepare_data_for_model(xtrain, target_columns=['Soil_type', 'Height']) #, 'Nstage' : 3 classes
# xtrain_dum = xtrain_dum.dropna()
#
# # create dummy features for test
# xtest_dum = prepare_data_for_model(xtest, target_columns=['Soil_type', 'Height']) #, 'Nstage' : 4 classes
# xtest_dum = xtest_dum.dropna()
#
# xtrain_dum.head
from sklearn.model_selection import train_test_split
import xgboost as xgb
import time
## Split: hold out 20% of the rows for validation, stratified on the label
trn_x, val_x, trn_y, val_y = train_test_split(
    xtrain,
    ytrain,
    test_size=0.20,
    stratify=ytrain,
    random_state=42,
)

# CPU-time stamp taken just before the model is built.
start_time = time.process_time()

# Gradient-boosted trees with a logistic regression objective.
# Other objectives that were considered: multi:softmax, multi:softprob,
# reg:squarederror, reg:linear, binary:logistic (num_class = 2 for multi:*).
clf = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:logistic',
    n_estimators=10000,
    learning_rate=0.01,
    max_depth=10,
    min_child_weight=9,
    subsample=0.80,
    colsample_bytree=0.80,
    nthread=8,
    seed=4242,
)

# Fit while scoring RMSE on the validation split each round; stop once it has
# not improved for 10 consecutive rounds.
# Other eval metrics that were considered: logloss, mae, map, cox-nloglik.
clf.fit(
    trn_x,
    trn_y,
    eval_set=[(val_x, val_y)],
    eval_metric='rmse',
    early_stopping_rounds=10,
    verbose=True,  # log the metric at every boosting round
)
## [0] validation_0-rmse:0.495411
## Will train until validation_0-rmse hasn't improved in 10 rounds.
## [1] validation_0-rmse:0.490456
## [2] validation_0-rmse:0.485552
## [3] validation_0-rmse:0.480697
## [4] validation_0-rmse:0.475898
## [5] validation_0-rmse:0.471438
## [6] validation_0-rmse:0.466731
## [7] validation_0-rmse:0.462064
## [8] validation_0-rmse:0.457445
## [9] validation_0-rmse:0.452875
## [10] validation_0-rmse:0.448646
## [11] validation_0-rmse:0.444613
## [12] validation_0-rmse:0.440168
## [13] validation_0-rmse:0.435771
## [14] validation_0-rmse:0.431419
## [15] validation_0-rmse:0.427409
## [16] validation_0-rmse:0.423144
## [17] validation_0-rmse:0.418916
## [18] validation_0-rmse:0.414731
## [19] validation_0-rmse:0.410867
## [20] validation_0-rmse:0.407207
## [21] validation_0-rmse:0.403134
## [22] validation_0-rmse:0.399486
## [23] validation_0-rmse:0.395487
## [24] validation_0-rmse:0.391545
## [25] validation_0-rmse:0.387644
## [26] validation_0-rmse:0.383777
## [27] validation_0-rmse:0.379935
## [28] validation_0-rmse:0.376431
## [29] validation_0-rmse:0.372664
## [30] validation_0-rmse:0.368958
## [31] validation_0-rmse:0.36527
## [32] validation_0-rmse:0.361621
## [33] validation_0-rmse:0.358279
## [34] validation_0-rmse:0.354709
## [35] validation_0-rmse:0.351181
## [36] validation_0-rmse:0.347669
## [37] validation_0-rmse:0.344599
## [38] validation_0-rmse:0.341162
## [39] validation_0-rmse:0.338015
## [40] validation_0-rmse:0.334644
## [41] validation_0-rmse:0.331641
## [42] validation_0-rmse:0.328325
## [43] validation_0-rmse:0.325307
## [44] validation_0-rmse:0.322071
## [45] validation_0-rmse:0.319188
## [46] validation_0-rmse:0.316008
## [47] validation_0-rmse:0.313231
## [48] validation_0-rmse:0.310111
## [49] validation_0-rmse:0.307018
## [50] validation_0-rmse:0.303959
## [51] validation_0-rmse:0.300929
## [52] validation_0-rmse:0.298188
## [53] validation_0-rmse:0.295218
## [54] validation_0-rmse:0.292277
## [55] validation_0-rmse:0.289362
## [56] validation_0-rmse:0.28648
## [57] validation_0-rmse:0.28387
## [58] validation_0-rmse:0.281039
## [59] validation_0-rmse:0.278241
## [60] validation_0-rmse:0.275478
## [61] validation_0-rmse:0.272731
## [62] validation_0-rmse:0.270015
## [63] validation_0-rmse:0.267325
## [64] validation_0-rmse:0.264663
## [65] validation_0-rmse:0.26203
## [66] validation_0-rmse:0.259647
## [67] validation_0-rmse:0.257297
## [68] validation_0-rmse:0.254734
## [69] validation_0-rmse:0.252203
## [70] validation_0-rmse:0.249691
## [71] validation_0-rmse:0.24748
## [72] validation_0-rmse:0.245012
## [73] validation_0-rmse:0.242575
## [74] validation_0-rmse:0.240159
## [75] validation_0-rmse:0.23777
## [76] validation_0-rmse:0.235399
## [77] validation_0-rmse:0.233056
## [78] validation_0-rmse:0.230737
## [79] validation_0-rmse:0.228712
## [80] validation_0-rmse:0.226696
## [81] validation_0-rmse:0.224442
## [82] validation_0-rmse:0.222511
## [83] validation_0-rmse:0.220295
## [84] validation_0-rmse:0.218406
## [85] validation_0-rmse:0.216453
## [86] validation_0-rmse:0.214603
## [87] validation_0-rmse:0.212471
## [88] validation_0-rmse:0.210353
## [89] validation_0-rmse:0.208262
## [90] validation_0-rmse:0.206188
## [91] validation_0-rmse:0.20414
## [92] validation_0-rmse:0.202103
## [93] validation_0-rmse:0.200094
## [94] validation_0-rmse:0.198103
## [95] validation_0-rmse:0.19634
## [96] validation_0-rmse:0.194642
## [97] validation_0-rmse:0.192709
## [98] validation_0-rmse:0.190789
## [99] validation_0-rmse:0.189109
## [100] validation_0-rmse:0.187226
## [101] validation_0-rmse:0.185365
## [102] validation_0-rmse:0.183518
## [103] validation_0-rmse:0.181696
## [104] validation_0-rmse:0.179884
## [105] validation_0-rmse:0.178099
## [106] validation_0-rmse:0.176528
## [107] validation_0-rmse:0.174773
## [108] validation_0-rmse:0.173033
## [109] validation_0-rmse:0.171314
## [110] validation_0-rmse:0.169611
## [111] validation_0-rmse:0.167922
## [112] validation_0-rmse:0.166251
## [113] validation_0-rmse:0.164599
## [114] validation_0-rmse:0.162962
## [115] validation_0-rmse:0.161339
## [116] validation_0-rmse:0.159735
## [117] validation_0-rmse:0.158369
## [118] validation_0-rmse:0.156982
## [119] validation_0-rmse:0.155419
## [120] validation_0-rmse:0.153876
## [121] validation_0-rmse:0.152346
## [122] validation_0-rmse:0.150833
## [123] validation_0-rmse:0.14933
## [124] validation_0-rmse:0.147845
## [125] validation_0-rmse:0.146378
## [126] validation_0-rmse:0.144921
## [127] validation_0-rmse:0.143478
## [128] validation_0-rmse:0.142261
## [129] validation_0-rmse:0.140846
## [130] validation_0-rmse:0.139446
## [131] validation_0-rmse:0.13806
## [132] validation_0-rmse:0.136687
## [133] validation_0-rmse:0.135504
## [134] validation_0-rmse:0.134156
## [135] validation_0-rmse:0.132821
## [136] validation_0-rmse:0.131705
## [137] validation_0-rmse:0.130395
## [138] validation_0-rmse:0.129271
## [139] validation_0-rmse:0.128158
## [140] validation_0-rmse:0.127062
## [141] validation_0-rmse:0.125798
## [142] validation_0-rmse:0.124549
## [143] validation_0-rmse:0.123484
## [144] validation_0-rmse:0.122256
## [145] validation_0-rmse:0.121215
## [146] validation_0-rmse:0.120011
## [147] validation_0-rmse:0.11882
## [148] validation_0-rmse:0.117636
## [149] validation_0-rmse:0.116468
## [150] validation_0-rmse:0.115309
## [151] validation_0-rmse:0.114167
## [152] validation_0-rmse:0.113033
## [153] validation_0-rmse:0.111911
## [154] validation_0-rmse:0.11096
## [155] validation_0-rmse:0.10986
## [156] validation_0-rmse:0.108766
## [157] validation_0-rmse:0.107686
## [158] validation_0-rmse:0.106779
## [159] validation_0-rmse:0.105915
## [160] validation_0-rmse:0.104862
## [161] validation_0-rmse:0.103823
## [162] validation_0-rmse:0.102791
## [163] validation_0-rmse:0.101771
## [164] validation_0-rmse:0.10076
## [165] validation_0-rmse:0.099917
## [166] validation_0-rmse:0.099109
## [167] validation_0-rmse:0.098284
## [168] validation_0-rmse:0.097308
## [169] validation_0-rmse:0.096342
## [170] validation_0-rmse:0.095387
## [171] validation_0-rmse:0.094439
## [172] validation_0-rmse:0.0935
## [173] validation_0-rmse:0.092574
## [174] validation_0-rmse:0.091657
## [175] validation_0-rmse:0.090747
## [176] validation_0-rmse:0.089847
## [177] validation_0-rmse:0.088954
## [178] validation_0-rmse:0.088071
## [179] validation_0-rmse:0.087198
## [180] validation_0-rmse:0.086332
## [181] validation_0-rmse:0.085477
## [182] validation_0-rmse:0.084628
## [183] validation_0-rmse:0.083789
## [184] validation_0-rmse:0.082956
## [185] validation_0-rmse:0.082298
## [186] validation_0-rmse:0.081482
## [187] validation_0-rmse:0.080673
## [188] validation_0-rmse:0.079871
## [189] validation_0-rmse:0.079081
## [190] validation_0-rmse:0.078297
## [191] validation_0-rmse:0.07768
## [192] validation_0-rmse:0.076908
## [193] validation_0-rmse:0.076146
## [194] validation_0-rmse:0.07539
## [195] validation_0-rmse:0.074641
## [196] validation_0-rmse:0.074031
## [197] validation_0-rmse:0.07343
## [198] validation_0-rmse:0.072702
## [199] validation_0-rmse:0.071981
## [200] validation_0-rmse:0.071399
## [201] validation_0-rmse:0.070844
## [202] validation_0-rmse:0.070142
## [203] validation_0-rmse:0.069446
## [204] validation_0-rmse:0.068758
## [205] validation_0-rmse:0.068076
## [206] validation_0-rmse:0.067401
## [207] validation_0-rmse:0.066732
## [208] validation_0-rmse:0.066069
## [209] validation_0-rmse:0.065415
## [210] validation_0-rmse:0.064765
## [211] validation_0-rmse:0.064122
## [212] validation_0-rmse:0.063627
## [213] validation_0-rmse:0.062995
## [214] validation_0-rmse:0.062371
## [215] validation_0-rmse:0.061753
## [216] validation_0-rmse:0.061139
## [217] validation_0-rmse:0.060534
## [218] validation_0-rmse:0.059934
## [219] validation_0-rmse:0.059474
## [220] validation_0-rmse:0.058885
## [221] validation_0-rmse:0.058301
## [222] validation_0-rmse:0.057722
## [223] validation_0-rmse:0.057283
## [224] validation_0-rmse:0.056833
## [225] validation_0-rmse:0.056269
## [226] validation_0-rmse:0.05571
## [227] validation_0-rmse:0.055159
## [228] validation_0-rmse:0.054612
## [229] validation_0-rmse:0.05407
## [230] validation_0-rmse:0.053534
## [231] validation_0-rmse:0.053003
## [232] validation_0-rmse:0.052477
## [233] validation_0-rmse:0.051956
## [234] validation_0-rmse:0.051442
## [235] validation_0-rmse:0.050931
## [236] validation_0-rmse:0.050426
## [237] validation_0-rmse:0.050051
## [238] validation_0-rmse:0.049554
## [239] validation_0-rmse:0.049063
## [240] validation_0-rmse:0.048681
## [241] validation_0-rmse:0.048324
## [242] validation_0-rmse:0.047844
## [243] validation_0-rmse:0.04737
## [244] validation_0-rmse:0.0469
## [245] validation_0-rmse:0.046435
## [246] validation_0-rmse:0.045975
## [247] validation_0-rmse:0.045519
## [248] validation_0-rmse:0.045067
## [249] validation_0-rmse:0.044721
## [250] validation_0-rmse:0.044278
## [251] validation_0-rmse:0.043838
## [252] validation_0-rmse:0.043405
## [253] validation_0-rmse:0.042974
## [254] validation_0-rmse:0.042548
## [255] validation_0-rmse:0.042126
## [256] validation_0-rmse:0.041709
## [257] validation_0-rmse:0.041295
## [258] validation_0-rmse:0.040885
## [259] validation_0-rmse:0.04048
## [260] validation_0-rmse:0.040078
## [261] validation_0-rmse:0.03968
## [262] validation_0-rmse:0.039287
## [263] validation_0-rmse:0.038988
## [264] validation_0-rmse:0.038601
## [265] validation_0-rmse:0.038219
## [266] validation_0-rmse:0.03784
## [267] validation_0-rmse:0.037465
## [268] validation_0-rmse:0.037093
## [269] validation_0-rmse:0.036813
## [270] validation_0-rmse:0.03655
## [271] validation_0-rmse:0.036189
## [272] validation_0-rmse:0.03583
## [273] validation_0-rmse:0.035561
## [274] validation_0-rmse:0.035306
## [275] validation_0-rmse:0.034957
## [276] validation_0-rmse:0.03461
## [277] validation_0-rmse:0.034267
## [278] validation_0-rmse:0.033927
## [279] validation_0-rmse:0.033677
## [280] validation_0-rmse:0.033445
## [281] validation_0-rmse:0.033113
## [282] validation_0-rmse:0.032785
## [283] validation_0-rmse:0.03246
## [284] validation_0-rmse:0.032138
## [285] validation_0-rmse:0.031819
## [286] validation_0-rmse:0.031504
## [287] validation_0-rmse:0.031191
## [288] validation_0-rmse:0.030883
## [289] validation_0-rmse:0.030577
## [290] validation_0-rmse:0.030274
## [291] validation_0-rmse:0.03006
## [292] validation_0-rmse:0.029843
## [293] validation_0-rmse:0.029629
## [294] validation_0-rmse:0.029424
## [295] validation_0-rmse:0.029133
## [296] validation_0-rmse:0.028844
## [297] validation_0-rmse:0.028558
## [298] validation_0-rmse:0.028355
## [299] validation_0-rmse:0.028156
## [300] validation_0-rmse:0.027877
## [301] validation_0-rmse:0.027682
## [302] validation_0-rmse:0.027408
## [303] validation_0-rmse:0.027136
## [304] validation_0-rmse:0.026867
## [305] validation_0-rmse:0.026601
## [306] validation_0-rmse:0.026417
## [307] validation_0-rmse:0.026155
## [308] validation_0-rmse:0.025981
## [309] validation_0-rmse:0.025804
## [310] validation_0-rmse:0.025548
## [311] validation_0-rmse:0.025295
## [312] validation_0-rmse:0.025044
## [313] validation_0-rmse:0.024875
## [314] validation_0-rmse:0.024719
## [315] validation_0-rmse:0.024475
## [316] validation_0-rmse:0.024323
## [317] validation_0-rmse:0.024083
## [318] validation_0-rmse:0.023844
## [319] validation_0-rmse:0.023686
## [320] validation_0-rmse:0.023452
## [321] validation_0-rmse:0.023303
## [322] validation_0-rmse:0.023072
## [323] validation_0-rmse:0.022843
## [324] validation_0-rmse:0.022618
## [325] validation_0-rmse:0.022394
## [326] validation_0-rmse:0.022252
## [327] validation_0-rmse:0.022032
## [328] validation_0-rmse:0.021814
## [329] validation_0-rmse:0.021684
## [330] validation_0-rmse:0.021469
## [331] validation_0-rmse:0.021331
## [332] validation_0-rmse:0.02112
## [333] validation_0-rmse:0.020911
## [334] validation_0-rmse:0.02079
## [335] validation_0-rmse:0.020584
## [336] validation_0-rmse:0.020454
## [337] validation_0-rmse:0.020331
## [338] validation_0-rmse:0.020129
## [339] validation_0-rmse:0.020009
## [340] validation_0-rmse:0.01989
## [341] validation_0-rmse:0.019693
## [342] validation_0-rmse:0.019573
## [343] validation_0-rmse:0.01938
## [344] validation_0-rmse:0.019188
## [345] validation_0-rmse:0.018998
## [346] validation_0-rmse:0.01881
## [347] validation_0-rmse:0.018624
## [348] validation_0-rmse:0.01844
## [349] validation_0-rmse:0.018257
## [350] validation_0-rmse:0.018077
## [351] validation_0-rmse:0.017898
## [352] validation_0-rmse:0.017721
## [353] validation_0-rmse:0.017546
## [354] validation_0-rmse:0.01744
## [355] validation_0-rmse:0.017268
## [356] validation_0-rmse:0.017097
## [357] validation_0-rmse:0.016995
## [358] validation_0-rmse:0.016895
## [359] validation_0-rmse:0.016729
## [360] validation_0-rmse:0.016631
## [361] validation_0-rmse:0.016467
## [362] validation_0-rmse:0.016304
## [363] validation_0-rmse:0.016212
## [364] validation_0-rmse:0.016052
## [365] validation_0-rmse:0.015893
## [366] validation_0-rmse:0.015736
## [367] validation_0-rmse:0.015649
## [368] validation_0-rmse:0.015494
## [369] validation_0-rmse:0.015341
## [370] validation_0-rmse:0.015257
## [371] validation_0-rmse:0.015181
## [372] validation_0-rmse:0.015031
## [373] validation_0-rmse:0.014882
## [374] validation_0-rmse:0.014735
## [375] validation_0-rmse:0.01459
## [376] validation_0-rmse:0.014445
## [377] validation_0-rmse:0.014302
## [378] validation_0-rmse:0.014233
## [379] validation_0-rmse:0.014092
## [380] validation_0-rmse:0.013953
## [381] validation_0-rmse:0.013877
## [382] validation_0-rmse:0.01374
## [383] validation_0-rmse:0.013604
## [384] validation_0-rmse:0.013469
## [385] validation_0-rmse:0.013336
## [386] validation_0-rmse:0.013264
## [387] validation_0-rmse:0.013133
## [388] validation_0-rmse:0.013003
## [389] validation_0-rmse:0.012874
## [390] validation_0-rmse:0.012807
## [391] validation_0-rmse:0.01274
## [392] validation_0-rmse:0.012673
## [393] validation_0-rmse:0.012548
## [394] validation_0-rmse:0.012424
## [395] validation_0-rmse:0.012359
## [396] validation_0-rmse:0.012237
## [397] validation_0-rmse:0.012175
## [398] validation_0-rmse:0.012055
## [399] validation_0-rmse:0.011936
## [400] validation_0-rmse:0.011818
## [401] validation_0-rmse:0.011701
## [402] validation_0-rmse:0.011585
## [403] validation_0-rmse:0.011471
## [404] validation_0-rmse:0.011357
## [405] validation_0-rmse:0.011301
## [406] validation_0-rmse:0.011246
## [407] validation_0-rmse:0.011135
## [408] validation_0-rmse:0.01108
## [409] validation_0-rmse:0.010971
## [410] validation_0-rmse:0.010862
## [411] validation_0-rmse:0.010755
## [412] validation_0-rmse:0.010703
## [413] validation_0-rmse:0.010597
## [414] validation_0-rmse:0.010493
## [415] validation_0-rmse:0.010389
## [416] validation_0-rmse:0.010286
## [417] validation_0-rmse:0.010185
## [418] validation_0-rmse:0.010084
## [419] validation_0-rmse:0.009984
## [420] validation_0-rmse:0.009886
## [421] validation_0-rmse:0.00984
## [422] validation_0-rmse:0.009742
## [423] validation_0-rmse:0.009697
## [424] validation_0-rmse:0.009601
## [425] validation_0-rmse:0.009506
## [426] validation_0-rmse:0.009412
## [427] validation_0-rmse:0.009319
## [428] validation_0-rmse:0.009276
## [429] validation_0-rmse:0.009185
## [430] validation_0-rmse:0.009094
## [431] validation_0-rmse:0.009004
## [432] validation_0-rmse:0.008915
## [433] validation_0-rmse:0.008827
## [434] validation_0-rmse:0.008787
## [435] validation_0-rmse:0.0087
## [436] validation_0-rmse:0.008614
## [437] validation_0-rmse:0.008529
## [438] validation_0-rmse:0.008444
## [439] validation_0-rmse:0.008361
## [440] validation_0-rmse:0.008278
## [441] validation_0-rmse:0.008197
## [442] validation_0-rmse:0.008116
## [443] validation_0-rmse:0.008036
## [444] validation_0-rmse:0.007956
## [445] validation_0-rmse:0.007877
## [446] validation_0-rmse:0.007843
## [447] validation_0-rmse:0.007765
## [448] validation_0-rmse:0.007731
## [449] validation_0-rmse:0.007655
## [450] validation_0-rmse:0.007579
## [451] validation_0-rmse:0.007504
## [452] validation_0-rmse:0.00743
## [453] validation_0-rmse:0.007357
## [454] validation_0-rmse:0.007284
## [455] validation_0-rmse:0.007253
## [456] validation_0-rmse:0.007222
## [457] validation_0-rmse:0.00715
## [458] validation_0-rmse:0.00712
## [459] validation_0-rmse:0.007096
## [460] validation_0-rmse:0.007026
## [461] validation_0-rmse:0.006957
## [462] validation_0-rmse:0.006888
## [463] validation_0-rmse:0.00682
## [464] validation_0-rmse:0.006792
## [465] validation_0-rmse:0.006725
## [466] validation_0-rmse:0.006659
## [467] validation_0-rmse:0.006593
## [468] validation_0-rmse:0.006567
## [469] validation_0-rmse:0.006502
## [470] validation_0-rmse:0.006438
## [471] validation_0-rmse:0.006374
## [472] validation_0-rmse:0.006311
## [473] validation_0-rmse:0.006249
## [474] validation_0-rmse:0.006187
## [475] validation_0-rmse:0.006162
## [476] validation_0-rmse:0.006102
## [477] validation_0-rmse:0.006078
## [478] validation_0-rmse:0.00606
## [479] validation_0-rmse:0.006042
## [480] validation_0-rmse:0.005983
## [481] validation_0-rmse:0.005961
## [482] validation_0-rmse:0.005902
## [483] validation_0-rmse:0.005843
## [484] validation_0-rmse:0.005786
## [485] validation_0-rmse:0.005729
## [486] validation_0-rmse:0.005672
## [487] validation_0-rmse:0.005616
## [488] validation_0-rmse:0.005561
## [489] validation_0-rmse:0.00554
## [490] validation_0-rmse:0.005486
## [491] validation_0-rmse:0.005431
## [492] validation_0-rmse:0.005417
## [493] validation_0-rmse:0.005363
## [494] validation_0-rmse:0.00531
## [495] validation_0-rmse:0.005258
## [496] validation_0-rmse:0.005239
## [497] validation_0-rmse:0.005187
## [498] validation_0-rmse:0.005136
## [499] validation_0-rmse:0.005085
## [500] validation_0-rmse:0.005068
## [501] validation_0-rmse:0.005018
## [502] validation_0-rmse:0.005001
## [503] validation_0-rmse:0.004952
## [504] validation_0-rmse:0.004903
## [505] validation_0-rmse:0.004886
## [506] validation_0-rmse:0.004838
## [507] validation_0-rmse:0.00479
## [508] validation_0-rmse:0.004743
## [509] validation_0-rmse:0.004696
## [510] validation_0-rmse:0.004681
## [511] validation_0-rmse:0.004635
## [512] validation_0-rmse:0.004589
## [513] validation_0-rmse:0.004574
## [514] validation_0-rmse:0.004529
## [515] validation_0-rmse:0.004515
## [516] validation_0-rmse:0.00447
## [517] validation_0-rmse:0.004426
## [518] validation_0-rmse:0.004383
## [519] validation_0-rmse:0.00434
## [520] validation_0-rmse:0.004297
## [521] validation_0-rmse:0.004254
## [522] validation_0-rmse:0.004241
## [523] validation_0-rmse:0.0042
## [524] validation_0-rmse:0.004158
## [525] validation_0-rmse:0.004117
## [526] validation_0-rmse:0.004076
## [527] validation_0-rmse:0.004064
## [528] validation_0-rmse:0.004024
## [529] validation_0-rmse:0.003985
## [530] validation_0-rmse:0.003976
## [531] validation_0-rmse:0.003937
## [532] validation_0-rmse:0.003898
## [533] validation_0-rmse:0.00386
## [534] validation_0-rmse:0.003822
## [535] validation_0-rmse:0.003784
## [536] validation_0-rmse:0.003747
## [537] validation_0-rmse:0.003736
## [538] validation_0-rmse:0.003699
## [539] validation_0-rmse:0.003663
## [540] validation_0-rmse:0.003627
## [541] validation_0-rmse:0.003591
## [542] validation_0-rmse:0.003556
## [543] validation_0-rmse:0.003521
## [544] validation_0-rmse:0.003486
## [545] validation_0-rmse:0.003476
## [546] validation_0-rmse:0.003442
## [547] validation_0-rmse:0.003432
## [548] validation_0-rmse:0.003423
## [549] validation_0-rmse:0.003414
## [550] validation_0-rmse:0.00338
## [551] validation_0-rmse:0.003347
## [552] validation_0-rmse:0.003338
## [553] validation_0-rmse:0.003305
## [554] validation_0-rmse:0.003296
## [555] validation_0-rmse:0.003288
## [556] validation_0-rmse:0.003255
## [557] validation_0-rmse:0.003223
## [558] validation_0-rmse:0.003192
## [559] validation_0-rmse:0.003184
## [560] validation_0-rmse:0.003176
## [561] validation_0-rmse:0.003145
## [562] validation_0-rmse:0.003113
## [563] validation_0-rmse:0.003083
## [564] validation_0-rmse:0.003052
## [565] validation_0-rmse:0.003045
## [566] validation_0-rmse:0.003015
## [567] validation_0-rmse:0.002986
## [568] validation_0-rmse:0.002979
## [569] validation_0-rmse:0.002949
## [570] validation_0-rmse:0.002943
## [571] validation_0-rmse:0.002936
## [572] validation_0-rmse:0.002907
## [573] validation_0-rmse:0.002879
## [574] validation_0-rmse:0.00285
## [575] validation_0-rmse:0.002844
## [576] validation_0-rmse:0.002838
## [577] validation_0-rmse:0.00281
## [578] validation_0-rmse:0.002782
## [579] validation_0-rmse:0.002755
## [580] validation_0-rmse:0.002728
## [581] validation_0-rmse:0.002701
## [582] validation_0-rmse:0.002675
## [583] validation_0-rmse:0.002672
## [584] validation_0-rmse:0.002646
## [585] validation_0-rmse:0.00262
## [586] validation_0-rmse:0.002594
## [587] validation_0-rmse:0.002569
## [588] validation_0-rmse:0.002543
## [589] validation_0-rmse:0.002538
## [590] validation_0-rmse:0.002513
## [591] validation_0-rmse:0.002488
## [592] validation_0-rmse:0.002483
## [593] validation_0-rmse:0.002459
## [594] validation_0-rmse:0.002435
## [595] validation_0-rmse:0.002411
## [596] validation_0-rmse:0.002387
## [597] validation_0-rmse:0.002364
## [598] validation_0-rmse:0.002362
## [599] validation_0-rmse:0.002339
## [600] validation_0-rmse:0.002316
## [601] validation_0-rmse:0.002293
## [602] validation_0-rmse:0.00227
## [603] validation_0-rmse:0.002266
## [604] validation_0-rmse:0.002244
## [605] validation_0-rmse:0.002222
## [606] validation_0-rmse:0.0022
## [607] validation_0-rmse:0.002178
## [608] validation_0-rmse:0.002174
## [609] validation_0-rmse:0.002153
## [610] validation_0-rmse:0.002132
## [611] validation_0-rmse:0.002128
## [612] validation_0-rmse:0.002107
## [613] validation_0-rmse:0.002086
## [614] validation_0-rmse:0.002083
## [615] validation_0-rmse:0.002062
## [616] validation_0-rmse:0.002042
## [617] validation_0-rmse:0.002022
## [618] validation_0-rmse:0.002018
## [619] validation_0-rmse:0.001999
## [620] validation_0-rmse:0.001995
## [621] validation_0-rmse:0.001976
## [622] validation_0-rmse:0.001956
## [623] validation_0-rmse:0.001953
## [624] validation_0-rmse:0.001934
## [625] validation_0-rmse:0.001915
## [626] validation_0-rmse:0.001896
## [627] validation_0-rmse:0.001877
## [628] validation_0-rmse:0.001859
## [629] validation_0-rmse:0.001841
## [630] validation_0-rmse:0.001823
## [631] validation_0-rmse:0.001805
## [632] validation_0-rmse:0.001787
## [633] validation_0-rmse:0.00177
## [634] validation_0-rmse:0.001752
## [635] validation_0-rmse:0.001735
## [636] validation_0-rmse:0.001718
## [637] validation_0-rmse:0.001701
## [638] validation_0-rmse:0.001685
## [639] validation_0-rmse:0.001684
## [640] validation_0-rmse:0.001668
## [641] validation_0-rmse:0.001651
## [642] validation_0-rmse:0.001635
## [643] validation_0-rmse:0.001619
## [644] validation_0-rmse:0.001603
## [645] validation_0-rmse:0.001601
## [646] validation_0-rmse:0.001598
## [647] validation_0-rmse:0.001583
## [648] validation_0-rmse:0.001567
## [649] validation_0-rmse:0.001552
## [650] validation_0-rmse:0.001537
## [651] validation_0-rmse:0.001522
## [652] validation_0-rmse:0.001507
## [653] validation_0-rmse:0.001492
## [654] validation_0-rmse:0.001478
## [655] validation_0-rmse:0.001463
## [656] validation_0-rmse:0.001449
## [657] validation_0-rmse:0.001435
## [658] validation_0-rmse:0.001421
## [659] validation_0-rmse:0.001418
## [660] validation_0-rmse:0.001405
## [661] validation_0-rmse:0.001391
## [662] validation_0-rmse:0.001377
## [663] validation_0-rmse:0.001364
## [664] validation_0-rmse:0.001351
## [665] validation_0-rmse:0.001337
## [666] validation_0-rmse:0.001324
## [667] validation_0-rmse:0.001312
## [668] validation_0-rmse:0.001299
## [669] validation_0-rmse:0.001286
## [670] validation_0-rmse:0.001274
## [671] validation_0-rmse:0.001261
## [672] validation_0-rmse:0.001249
## [673] validation_0-rmse:0.001237
## [674] validation_0-rmse:0.001225
## [675] validation_0-rmse:0.001213
## [676] validation_0-rmse:0.001211
## [677] validation_0-rmse:0.001209
## [678] validation_0-rmse:0.001207
## [679] validation_0-rmse:0.001206
## [680] validation_0-rmse:0.001194
## [681] validation_0-rmse:0.001192
## [682] validation_0-rmse:0.001191
## [683] validation_0-rmse:0.001179
## [684] validation_0-rmse:0.001177
## [685] validation_0-rmse:0.001178
## [686] validation_0-rmse:0.001166
## [687] validation_0-rmse:0.001155
## [688] validation_0-rmse:0.001153
## [689] validation_0-rmse:0.001152
## [690] validation_0-rmse:0.001151
## [691] validation_0-rmse:0.001139
## [692] validation_0-rmse:0.001128
## [693] validation_0-rmse:0.001117
## [694] validation_0-rmse:0.001106
## [695] validation_0-rmse:0.001096
## [696] validation_0-rmse:0.001085
## [697] validation_0-rmse:0.001075
## [698] validation_0-rmse:0.001064
## [699] validation_0-rmse:0.001063
## [700] validation_0-rmse:0.001053
## [701] validation_0-rmse:0.001042
## [702] validation_0-rmse:0.001032
## [703] validation_0-rmse:0.001031
## [704] validation_0-rmse:0.001032
## [705] validation_0-rmse:0.001022
## [706] validation_0-rmse:0.001012
## [707] validation_0-rmse:0.001011
## [708] validation_0-rmse:0.001001
## [709] validation_0-rmse:0.001
## [710] validation_0-rmse:0.00099
## [711] validation_0-rmse:0.00098
## [712] validation_0-rmse:0.000971
## [713] validation_0-rmse:0.00097
## [714] validation_0-rmse:0.000969
## [715] validation_0-rmse:0.000968
## [716] validation_0-rmse:0.000959
## [717] validation_0-rmse:0.000949
## [718] validation_0-rmse:0.000948
## [719] validation_0-rmse:0.000949
## [720] validation_0-rmse:0.00094
## [721] validation_0-rmse:0.000931
## [722] validation_0-rmse:0.000922
## [723] validation_0-rmse:0.000913
## [724] validation_0-rmse:0.000912
## [725] validation_0-rmse:0.000903
## [726] validation_0-rmse:0.000895
## [727] validation_0-rmse:0.000886
## [728] validation_0-rmse:0.000877
## [729] validation_0-rmse:0.000869
## [730] validation_0-rmse:0.00086
## [731] validation_0-rmse:0.00086
## [732] validation_0-rmse:0.000851
## [733] validation_0-rmse:0.000843
## [734] validation_0-rmse:0.000835
## [735] validation_0-rmse:0.000827
## [736] validation_0-rmse:0.000828
## [737] validation_0-rmse:0.00082
## [738] validation_0-rmse:0.000819
## [739] validation_0-rmse:0.000811
## [740] validation_0-rmse:0.000803
## [741] validation_0-rmse:0.000796
## [742] validation_0-rmse:0.000788
## [743] validation_0-rmse:0.00078
## [744] validation_0-rmse:0.000773
## [745] validation_0-rmse:0.000774
## [746] validation_0-rmse:0.000773
## [747] validation_0-rmse:0.000766
## [748] validation_0-rmse:0.000758
## [749] validation_0-rmse:0.000751
## [750] validation_0-rmse:0.000744
## [751] validation_0-rmse:0.000737
## [752] validation_0-rmse:0.00073
## [753] validation_0-rmse:0.000723
## [754] validation_0-rmse:0.000723
## [755] validation_0-rmse:0.000717
## [756] validation_0-rmse:0.000716
## [757] validation_0-rmse:0.000709
## [758] validation_0-rmse:0.000702
## [759] validation_0-rmse:0.000696
## [760] validation_0-rmse:0.000689
## [761] validation_0-rmse:0.000682
## [762] validation_0-rmse:0.000683
## [763] validation_0-rmse:0.000683
## [764] validation_0-rmse:0.000683
## [765] validation_0-rmse:0.000677
## [766] validation_0-rmse:0.00067
## [767] validation_0-rmse:0.000664
## [768] validation_0-rmse:0.000658
## [769] validation_0-rmse:0.000659
## [770] validation_0-rmse:0.000652
## [771] validation_0-rmse:0.000646
## [772] validation_0-rmse:0.00064
## [773] validation_0-rmse:0.000634
## [774] validation_0-rmse:0.000633
## [775] validation_0-rmse:0.000627
## [776] validation_0-rmse:0.000627
## [777] validation_0-rmse:0.000621
## [778] validation_0-rmse:0.00062
## [779] validation_0-rmse:0.000614
## [780] validation_0-rmse:0.000609
## [781] validation_0-rmse:0.000603
## [782] validation_0-rmse:0.000604
## [783] validation_0-rmse:0.000603
## [784] validation_0-rmse:0.000603
## [785] validation_0-rmse:0.000597
## [786] validation_0-rmse:0.000591
## [787] validation_0-rmse:0.000591
## [788] validation_0-rmse:0.000585
## [789] validation_0-rmse:0.00058
## [790] validation_0-rmse:0.000574
## [791] validation_0-rmse:0.000574
## [792] validation_0-rmse:0.000568
## [793] validation_0-rmse:0.000563
## [794] validation_0-rmse:0.000558
## [795] validation_0-rmse:0.000552
## [796] validation_0-rmse:0.000547
## [797] validation_0-rmse:0.000542
## [798] validation_0-rmse:0.000537
## [799] validation_0-rmse:0.000532
## [800] validation_0-rmse:0.000527
## [801] validation_0-rmse:0.000526
## [802] validation_0-rmse:0.000526
## [803] validation_0-rmse:0.000521
## [804] validation_0-rmse:0.000521
## [805] validation_0-rmse:0.000516
## [806] validation_0-rmse:0.000515
## [807] validation_0-rmse:0.000511
## [808] validation_0-rmse:0.000506
## [809] validation_0-rmse:0.000501
## [810] validation_0-rmse:0.000496
## [811] validation_0-rmse:0.000492
## [812] validation_0-rmse:0.000492
## [813] validation_0-rmse:0.000488
## [814] validation_0-rmse:0.000483
## [815] validation_0-rmse:0.000483
## [816] validation_0-rmse:0.000484
## [817] validation_0-rmse:0.000479
## [818] validation_0-rmse:0.000474
## [819] validation_0-rmse:0.00047
## [820] validation_0-rmse:0.000471
## [821] validation_0-rmse:0.000471
## [822] validation_0-rmse:0.000467
## [823] validation_0-rmse:0.000463
## [824] validation_0-rmse:0.000458
## [825] validation_0-rmse:0.000454
## [826] validation_0-rmse:0.00045
## [827] validation_0-rmse:0.000446
## [828] validation_0-rmse:0.000441
## [829] validation_0-rmse:0.000441
## [830] validation_0-rmse:0.000437
## [831] validation_0-rmse:0.000433
## [832] validation_0-rmse:0.000429
## [833] validation_0-rmse:0.000425
## [834] validation_0-rmse:0.000424
## [835] validation_0-rmse:0.00042
## [836] validation_0-rmse:0.000417
## [837] validation_0-rmse:0.000413
## [838] validation_0-rmse:0.000409
## [839] validation_0-rmse:0.000408
## [840] validation_0-rmse:0.000405
## [841] validation_0-rmse:0.000404
## [842] validation_0-rmse:0.000404
## [843] validation_0-rmse:0.0004
## [844] validation_0-rmse:0.0004
## [845] validation_0-rmse:0.000396
## [846] validation_0-rmse:0.000393
## [847] validation_0-rmse:0.000389
## [848] validation_0-rmse:0.00039
## [849] validation_0-rmse:0.000386
## [850] validation_0-rmse:0.000386
## [851] validation_0-rmse:0.000382
## [852] validation_0-rmse:0.000379
## [853] validation_0-rmse:0.000379
## [854] validation_0-rmse:0.000375
## [855] validation_0-rmse:0.000375
## [856] validation_0-rmse:0.000371
## [857] validation_0-rmse:0.000368
## [858] validation_0-rmse:0.000368
## [859] validation_0-rmse:0.000364
## [860] validation_0-rmse:0.000365
## [861] validation_0-rmse:0.000365
## [862] validation_0-rmse:0.000361
## [863] validation_0-rmse:0.000358
## [864] validation_0-rmse:0.000355
## [865] validation_0-rmse:0.000351
## [866] validation_0-rmse:0.000351
## [867] validation_0-rmse:0.000348
## [868] validation_0-rmse:0.000345
## [869] validation_0-rmse:0.000345
## [870] validation_0-rmse:0.000342
## [871] validation_0-rmse:0.000339
## [872] validation_0-rmse:0.000336
## [873] validation_0-rmse:0.000334
## [874] validation_0-rmse:0.000333
## [875] validation_0-rmse:0.000331
## [876] validation_0-rmse:0.00033
## [877] validation_0-rmse:0.000329
## [878] validation_0-rmse:0.000327
## [879] validation_0-rmse:0.000326
## [880] validation_0-rmse:0.000324
## [881] validation_0-rmse:0.000323
## [882] validation_0-rmse:0.000322
## [883] validation_0-rmse:0.000321
## [884] validation_0-rmse:0.000319
## [885] validation_0-rmse:0.000318
## [886] validation_0-rmse:0.000317
## [887] validation_0-rmse:0.000316
## [888] validation_0-rmse:0.000315
## [889] validation_0-rmse:0.000313
## [890] validation_0-rmse:0.000312
## [891] validation_0-rmse:0.000311
## [892] validation_0-rmse:0.00031
## [893] validation_0-rmse:0.000309
## [894] validation_0-rmse:0.000308
## [895] validation_0-rmse:0.000307
## [896] validation_0-rmse:0.000306
## [897] validation_0-rmse:0.000305
## [898] validation_0-rmse:0.000304
## [899] validation_0-rmse:0.000303
## [900] validation_0-rmse:0.000303
## [901] validation_0-rmse:0.000302
## [902] validation_0-rmse:0.000302
## [903] validation_0-rmse:0.000301
## [904] validation_0-rmse:0.0003
## [905] validation_0-rmse:0.000299
## [906] validation_0-rmse:0.000299
## [907] validation_0-rmse:0.000298
## [908] validation_0-rmse:0.000298
## [909] validation_0-rmse:0.000298
## [910] validation_0-rmse:0.000297
## [911] validation_0-rmse:0.000297
## [912] validation_0-rmse:0.000296
## [913] validation_0-rmse:0.000295
## [914] validation_0-rmse:0.000294
## [915] validation_0-rmse:0.000295
## [916] validation_0-rmse:0.000294
## [917] validation_0-rmse:0.000293
## [918] validation_0-rmse:0.000293
## [919] validation_0-rmse:0.000293
## [920] validation_0-rmse:0.000293
## [921] validation_0-rmse:0.000293
## [922] validation_0-rmse:0.000292
## [923] validation_0-rmse:0.000292
## [924] validation_0-rmse:0.000292
## [925] validation_0-rmse:0.000292
## [926] validation_0-rmse:0.000291
## [927] validation_0-rmse:0.000291
## [928] validation_0-rmse:0.00029
## [929] validation_0-rmse:0.000289
## [930] validation_0-rmse:0.000289
## [931] validation_0-rmse:0.000288
## [932] validation_0-rmse:0.000287
## [933] validation_0-rmse:0.000287
## [934] validation_0-rmse:0.000287
## [935] validation_0-rmse:0.000286
## [936] validation_0-rmse:0.000286
## [937] validation_0-rmse:0.000285
## [938] validation_0-rmse:0.000286
## [939] validation_0-rmse:0.000285
## [940] validation_0-rmse:0.000285
## [941] validation_0-rmse:0.000286
## [942] validation_0-rmse:0.000286
## [943] validation_0-rmse:0.000285
## [944] validation_0-rmse:0.000285
## [945] validation_0-rmse:0.000285
## [946] validation_0-rmse:0.000284
## [947] validation_0-rmse:0.000284
## [948] validation_0-rmse:0.000283
## [949] validation_0-rmse:0.000282
## [950] validation_0-rmse:0.000282
## [951] validation_0-rmse:0.000282
## [952] validation_0-rmse:0.000281
## [953] validation_0-rmse:0.000281
## [954] validation_0-rmse:0.00028
## [955] validation_0-rmse:0.00028
## [956] validation_0-rmse:0.000279
## [957] validation_0-rmse:0.000279
## [958] validation_0-rmse:0.000279
## [959] validation_0-rmse:0.000278
## [960] validation_0-rmse:0.000278
## [961] validation_0-rmse:0.000277
## [962] validation_0-rmse:0.000277
## [963] validation_0-rmse:0.000277
## [964] validation_0-rmse:0.000277
## [965] validation_0-rmse:0.000277
## [966] validation_0-rmse:0.000278
## [967] validation_0-rmse:0.000277
## [968] validation_0-rmse:0.000277
## [969] validation_0-rmse:0.000277
## [970] validation_0-rmse:0.000277
## [971] validation_0-rmse:0.000277
## Stopping. Best iteration:
## [961] validation_0-rmse:0.000277
##
## XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
## colsample_bynode=1, colsample_bytree=0.8, gamma=0,
## importance_type='gain', learning_rate=0.01, max_delta_step=0,
## max_depth=10, min_child_weight=9, missing=None, n_estimators=10000,
## n_jobs=1, nthread=8, objective='reg:logistic', random_state=0,
## reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=4242,
## silent=None, subsample=0.8, verbosity=1)
## 4.134720766666666 Minutes
# Predict on the validation features with the fitted model.
preds = clf.predict(val_x)
# Convert the raw model output into hard class labels.
# A regressor's predict() returns a 1-D array of scores, and np.argmax of a
# single scalar is always 0, so an element-wise argmax labels every sample
# as class 0 (macro precision then collapses and sklearn warns about labels
# with no predicted samples). Threshold 1-D scores instead; keep the
# row-wise argmax only for genuinely 2-D (per-class probability) output.
# NOTE(review): assumes val_y is binary 0/1 and 0.5 is the intended cut-off — confirm.
import numpy as np
preds_arr = np.asarray(preds)
if preds_arr.ndim > 1:
    best_preds = preds_arr.argmax(axis=1)
else:
    best_preds = (preds_arr > 0.5).astype(int)
# Determine the precision of this prediction:
from sklearn.metrics import precision_score
precision_score(val_y, best_preds, average='macro')
## 0.46495489863356254
##
## /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
## _warn_prf(average, modifier, msg_start, len(result))
#xtest = test.drop(['Target', 'weeks', 'Latitude', 'Longitude'], axis=1)
# Set the identifiers aside, then drop them so only feature columns reach the model.
test_ID = xtest.Square_ID
xtest = xtest.drop('Square_ID', axis = 1)
pred_test = clf.predict(xtest)
# Pair IDs and predictions by position. pd.concat(axis=1) aligns on index
# labels, so a non-default index on xtest would yield NaN-padded, misaligned
# rows; building the frame from the raw values avoids that.
submission = pd.DataFrame({'Square_ID': test_ID.values, 'pred': pred_test})
# The row index is still written (pandas default), so the CSV layout is unchanged.
submission.to_csv("pred_test.csv")
# head must be called: the bare attribute only echoes the bound-method repr.
submission.head()
## <bound method NDFrame.head of Square_ID pred
## 0 4e3c3896 0.000041
## 1 4e3c3897 0.000041
## 2 4e3c3898 0.000041
## 3 4e3c3899 0.000041
## 4 4e3c389a 0.000041
## ... ... ...
## 279917 4e6f5dfd 0.000042
## 279918 4e6f5dfe 0.000042
## 279919 4e6f5dff 0.000042
## 279920 4e6f5e00 0.000042
## 279921 4e6f5e01 0.000047
##
## [279922 rows x 2 columns]>
# Read back the predictions written to pred_test.csv.
pred_test <- fread("pred_test.csv")
# Fail fast if the number of predictions does not match the number of 2019
# rows; otherwise a mismatch could be recycled silently into Target.
# NOTE(review): the assignment is positional — it presumes the 2019 rows of
# new_train01 are in the same order as the rows of pred_test.csv; confirm.
stopifnot(sum(new_train01$Year == 2019, na.rm = TRUE) == nrow(pred_test))
new_train01[new_train01$Year == 2019,]$Target <- pred_test$pred
# Density curve of Target, drawn in one panel per Year (two panels per row),
# each panel with its own free x/y scales.
# To look only at high values, subset with filter(Target > 0.5) beforehand.
ggplot(data = new_train01, mapping = aes(x = Target)) +
  geom_density() +
  facet_wrap(Year ~ ., ncol = 2, scales = "free")
# Scatter map (Longitude vs Latitude) coloured by Target, one panel per Year.
# Within each Year only the rows whose `Week of` equals that Year's maximum
# are kept, i.e. the latest week present for that Year.
new_train01 %>%
  #filter(Year == 2019) %>%
  group_by(Year) %>%
  filter(`Week of` == max(`Week of`)) %>%
  ungroup() %>%
  # Height as a factor is only needed by the commented-out layer below.
  mutate(Height = as.factor(Height)) %>%
  ggplot() +
  #geom_point(aes(x = Longitude, y = Latitude, colour = Height)) +
  # shape is fixed at 16 for every point; a shape = Target mapping inside
  # aes() would be overridden by that constant, so it is not declared.
  geom_point(#data = subset(new_train, Target == 1),
    aes(x = Longitude, y = Latitude, color = Target),
    size = 0.8, stroke = 0, shape = 16) +
  facet_wrap(Year ~ ., ncol = 2)