In this report, I will go directly to the solution. A HTML version in available fowing this link.
A second document link is available in this link. It will focus on:
Exploration of Radiomics, Clinical and Images data sets.
Event prediction using features and images.
# Set python environment and version in RStudio ;-)
reticulate::use_python("/Users/Mezhoud/anaconda3/bin/python3", required = TRUE)
reticulate::py_config()
## python: /Users/Mezhoud/anaconda3/bin/python3
## libpython: /Users/Mezhoud/anaconda3/lib/libpython3.7m.dylib
## pythonhome: /Users/Mezhoud/anaconda3:/Users/Mezhoud/anaconda3
## version: 3.7.5 (default, Oct 25 2019, 10:52:18) [Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy: /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/numpy
## numpy_version: 1.18.1
##
## NOTE: Python version was forced by use_python function
# Function to impute age
age_imputation <- function(df){
init = mice(df, maxit=0)
meth = init$method
predM = init$predictorMatrix
predM[, c("PatientID")] <- 0
meth[c("PatientID")]=""
meth[c("age")]="cart" # pmm (Predictive Mean Matching suitable for numeric variables )
set.seed(103)
imputed = mice(df, method=meth, predictorMatrix=predM, m=5)
imputed <- complete(imputed)
return(imputed)
}
# Function to reshape and merge Radiomics and clinical datasets
Features_prepocessing <- function(path_radiomics, path_clinical, path_submission){
#Load dataset
radiomics <- fread(path_radiomics, quote = "")
clinical <- fread(path_clinical)
submission <- fread(path_submission, quote = "")
# Attributes groups to (shape, firstorder, textural) and features to (original_shaep_Compacteness1 .....)
groups <- radiomics[1:2,-1] %>%
t() %>%
as.data.frame() %>%
rename("Groups" = V1, "Features" = V2)
# Omit "original_" from features
new_colnames_radiomics <- groups %>%
mutate(Features = stringr::str_remove(Features,"original_")) %>%
pull(Features)
# Renames columns of radiomics
old_names <- colnames(radiomics)
new_names <- c("PatientID", new_colnames_radiomics)
# Get new radiomics dataset
new_radiomics <- radiomics[-1:-3,] %>%
rename_at(vars(old_names), ~ new_names) %>%
mutate_if(is.character, as.numeric)
# Get new Clinical dataset
# age imputation
clinical <- age_imputation(clinical)
# Convert character variables to numeric
new_clinical <- clinical %>%
mutate(Histology = stringi::stri_trans_totitle(Histology)) %>%
mutate_if(is.character, as.factor) %>%
mutate_if(is.factor, as.numeric)
# Join radimics and Clinical dataset by PatientID
features <- new_clinical %>%
mutate_if(is.character, as.factor) %>%
left_join(y = submission, by = "PatientID") %>%
left_join(y = new_radiomics, by = "PatientID") %>%
dplyr::select(PatientID, SurvivalTime, everything()) %>%
setDT()
return(features)
}
train <- Features_prepocessing(path_radiomics = "train/features/radiomics.csv",
path_clinical = "train/features/clinical_data.csv",
path_submission = "output_train.csv")
##
## iter imp variable
## 1 1 age
## 1 2 age
## 1 3 age
## 1 4 age
## 1 5 age
## 2 1 age
## 2 2 age
## 2 3 age
## 2 4 age
## 2 5 age
## 3 1 age
## 3 2 age
## 3 3 age
## 3 4 age
## 3 5 age
## 4 1 age
## 4 2 age
## 4 3 age
## 4 4 age
## 4 5 age
## 5 1 age
## 5 2 age
## 5 3 age
## 5 4 age
## 5 5 age
test <- Features_prepocessing(path_radiomics = "test/features/radiomics.csv",
path_clinical = "test/features/clinical_data.csv",
path_submission = "output_test.csv")
##
## iter imp variable
## 1 1 age
## 1 2 age
## 1 3 age
## 1 4 age
## 1 5 age
## 2 1 age
## 2 2 age
## 2 3 age
## 2 4 age
## 2 5 age
## 3 1 age
## 3 2 age
## 3 3 age
## 3 4 age
## 3 5 age
## 4 1 age
## 4 2 age
## 4 3 age
## 4 4 age
## 4 5 age
## 5 1 age
## 5 2 age
## 5 3 age
## 5 4 age
## 5 5 age
## PatientID SurvivalTime Histology Mstage Nstage SourceDataset Tstage age
## 1: 202 1378 1 0 0 2 2 66.0000
## 2: 371 379 2 0 2 1 4 64.5722
## 3: 246 573 6 0 3 1 2 66.0452
## 4: 240 959 4 0 2 1 3 59.3566
## 5: 284 2119 6 0 3 1 4 71.0554
## 6: 348 706 6 0 2 1 2 65.0212
## 7: 384 78 6 0 0 1 3 78.7105
## 8: 244 1369 1 0 0 2 1 70.0000
## 9: 100 197 1 0 0 1 4 74.4504
## 10: 173 196 1 0 2 1 4 53.0842
## Event shape_Compactness1
## 1: 0 0.02781503
## 2: 1 0.02301549
## 3: 1 0.02734811
## 4: 0 0.02681111
## 5: 0 0.02369124
## 6: 1 0.03098136
## 7: 1 0.02828906
## 8: 1 0.02048073
## 9: 1 0.02802794
## 10: 1 0.01844193
## Load usefull Python modules
import os
import numpy as np
import pandas as pd
import mxnet as mx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from lifelines.utils import concordance_index
from lifelines import CoxPHFitter, WeibullAFTFitter , LogNormalAFTFitter, LogLogisticAFTFitter , PiecewiseExponentialRegressionFitter
from lifelines.utils import k_fold_cross_validation
import seaborn as sns
import xgboost as xgb
import time
import re
import glob
import random
import pickle
# Function to read and select pertinente features
def select_features(path):
df = pd.read_csv(path) #,header=0, delim_whitespace=True
df = df.set_index('PatientID')
# set target ytrain
ydf = df[["SurvivalTime","Event"]]
# set variable matrix xtrain
xdf = df.drop(['SurvivalTime', 'Event'], axis=1)
xdf = xdf[["shape_Sphericity", "shape_SurfaceVolumeRatio", "shape_Maximum3DDiameter",
"glcm_Id","glcm_Idm" ,"SourceDataset","Nstage","Tstage","age" ,"firstorder_Entropy" ]] # , "Histology"
df = pd.concat([xdf, ydf], axis = 1)
return(df)
# Function to evaluate 4 models and return datafarame with scores, cph, Wei, log and logistic models
def evalModels(dt, dtype):
## CoxPHFitter model
cph = CoxPHFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
## WeibullAFTFitter
Wei = WeibullAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
## LogNormalAFTFitter
log = LogNormalAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
## LogLogisticAFTFitter
logistic = LogLogisticAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
## k_fold Cross validation
cph_cv_result = k_fold_cross_validation(cph, dt, duration_col='SurvivalTime', event_col='Event', k=5)
wei_cv_result = k_fold_cross_validation(Wei, dt, duration_col='SurvivalTime', event_col='Event', k=5)
log_cv_result = k_fold_cross_validation(log, dt, duration_col='SurvivalTime', event_col='Event', k=5)
logistic_cv_result = k_fold_cross_validation(logistic, dt, duration_col='SurvivalTime', event_col='Event', k=5)
# built empty dataframe
results = pd.DataFrame(np.nan,
index=[ 'CoxPHFitter',
'WeibullAFTFitter',
'LogNormalAFTFitter',
'Logistic'],
columns=[dtype],
dtype='float')
# Fill dataframe with the median c-index scores
results[dtype] = [np.median(cph_cv_result),
np.median(wei_cv_result),
np.median(log_cv_result),
np.median(logistic_cv_result)]
return [results,cph, Wei, log, logistic]
train = select_features("train.csv")
random.seed(1234)
models = evalModels(dt = train, dtype = "Features")
# save the model to disk
filename = 'models.sav'
pickle.dump(models, open(filename, 'wb'))
# load the model from disk
#loaded_models = pickle.load(open(filename, 'rb'))
#result = loaded_models.score(X_test, Y_test)
#print(result)
models[0]
## Features
## CoxPHFitter 0.663320
## WeibullAFTFitter 0.692367
## LogNormalAFTFitter 0.692047
## Logistic 0.674817
# Function to run the 4 predictions
def getMultiplePred(path_test):
# prepare xtest
test_fea = pd.read_csv(path_test)
test_fea = test_fea.set_index('PatientID')
# select the same variables
xtest = test_fea[["shape_Sphericity", "shape_SurfaceVolumeRatio", "shape_Maximum3DDiameter","firstorder_Entropy",
"glcm_Id","glcm_Idm","SourceDataset","Nstage","Tstage","age", "Histology"]]
# get models
#models = evalModels(dt = train, dtype = "Features")
# load the model from disk
models = pickle.load(open("models.sav", 'rb'))
# predict
cph_pred = models[1].predict_expectation(xtest)
Wei_pred = models[2].predict_expectation(xtest)
log_pred = models[3].predict_expectation(xtest)
logistic_pred = models[4].predict_expectation(xtest)
# Load output test file
output_test = pd.read_csv("output_test.csv")
# index PatientID
output_test = output_test.set_index('PatientID')
#pd.merge(df1, df2, left_index=True, right_index=True)
# add predcition to SurvivalTime
output_test['cph'] = cph_pred
output_test['Wei'] = Wei_pred
output_test['log'] = log_pred
output_test['logistic'] = logistic_pred
return output_test
multiple_pred = getMultiplePred("test.csv")
multiple_pred.head
## <bound method NDFrame.head of SurvivalTime Event ... log logistic
## PatientID ...
## 13 788.417673 NaN ... 2863.813411 3103.516701
## 155 427.650092 NaN ... 2483.604469 2173.214436
## 404 173.587222 NaN ... 1191.491172 1017.250112
## 407 389.877973 NaN ... 2424.686018 2518.838431
## 9 1580.767244 NaN ... 18493.224723 24536.947878
## ... ... ... ... ... ...
## 66 45.092593 NaN ... 736.921194 851.199962
## 132 319.380592 NaN ... 1246.060135 1096.934419
## 169 28.733930 NaN ... 16203.232367 21310.485877
## 199 2087.912140 NaN ... 631.785965 401.094096
## 274 28.352463 NaN ... 924.206163 1046.999636
##
## [125 rows x 6 columns]>
# This function reshape masks image (300, 92, 92, 92) to 300, 778688)
# accepts as input a path to folder npz files, and the number of the file in the folder.
def masks2dtrain(path, size):
#path = 'train/images/'
npz = [np.load(path + '/' + s) for s in os.listdir(path)]
masks = [item['mask'] for item in npz]
x = np.array(masks, dtype = int)
x = x.reshape(size,-1) # convert (300, 92, 92, 92) to (300, 778688)
return(x)
xtrain = masks2dtrain(path ='train/images/', size = 300)
features_tr = pd.read_csv("train.csv")
ytrain = features_tr[['SurvivalTime']]#.as_matrix()
trn_x, val_x, trn_y, val_y = train_test_split(xtrain, ytrain, random_state = 42, # stratify = ytrain,(if classes)
test_size = 0.20)
start_time = time.process_time()
xgb_model = xgb.XGBRegressor(booster = 'gbtree',
objective = 'reg:squarederror', #'survival:cox', #'reg:squarederror', # reg:linear reg:logistic
max_depth = 10,
n_estimators = 100,
min_child_weight = 9,
learning_rate = 0.1,
nthread = 8,
subsample = 0.80,
colsample_bytree = 0.80,
seed = 4242)
xgb_model.fit(trn_x,
trn_y,
eval_set = [(val_x, val_y)],
verbose = True,
#verbose_eval= 10, # print every 10 boost
eval_metric = 'rmse', # rmse, logloss, mae, map, cox-nloglik
early_stopping_rounds = 10)
## [0] validation_0-rmse:957.482
## Will train until validation_0-rmse hasn't improved in 10 rounds.
## [1] validation_0-rmse:902.339
## [2] validation_0-rmse:853.041
## [3] validation_0-rmse:804.14
## [4] validation_0-rmse:771.716
## [5] validation_0-rmse:753.874
## [6] validation_0-rmse:730.009
## [7] validation_0-rmse:716.429
## [8] validation_0-rmse:701.159
## [9] validation_0-rmse:682.286
## [10] validation_0-rmse:673.197
## [11] validation_0-rmse:667.523
## [12] validation_0-rmse:665.009
## [13] validation_0-rmse:660.947
## [14] validation_0-rmse:663.217
## [15] validation_0-rmse:661.952
## [16] validation_0-rmse:667.392
## [17] validation_0-rmse:666.433
## [18] validation_0-rmse:664.523
## [19] validation_0-rmse:661.457
## [20] validation_0-rmse:666.046
## [21] validation_0-rmse:667.587
## [22] validation_0-rmse:669.826
## [23] validation_0-rmse:674.962
## Stopping. Best iteration:
## [13] validation_0-rmse:660.947
##
## XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
## colsample_bynode=1, colsample_bytree=0.8, gamma=0,
## importance_type='gain', learning_rate=0.1, max_delta_step=0,
## max_depth=10, min_child_weight=9, missing=None, n_estimators=100,
## n_jobs=1, nthread=8, objective='reg:squarederror', random_state=0,
## reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=4242,
## silent=None, subsample=0.8, verbosity=1)
## ###### Elapsed time: 14.45222275 Minutes ######
##
## /Users/Mezhoud/anaconda3/bin/python3:1: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead
#def getname(path):
# name = os.path.basename(path)
# return(re.sub("\D", "", name))
# get PatientID
#paths = glob.glob('test/images/*.npz')
#labels = [getname(item) for item in paths]
xgb_model = pickle.load(open(filename_xgb, 'rb'))
result = xgb_model.score(val_x, val_y)
#print(result)
xtest = masks2dtrain(path = "test/images/", size = 125)
pred = xgb_model.predict(xtest)
multiple_pred['xgb_masks'] = pred
multiple_pred.to_csv("multiple_pred.csv")
multiple_pred.head
## <bound method NDFrame.head of SurvivalTime Event ... logistic xgb_masks
## PatientID ...
## 13 788.417673 NaN ... 3103.516701 931.311829
## 155 427.650092 NaN ... 2173.214436 623.559265
## 404 173.587222 NaN ... 1017.250112 814.161377
## 407 389.877973 NaN ... 2518.838431 843.975647
## 9 1580.767244 NaN ... 24536.947878 923.655762
## ... ... ... ... ... ...
## 66 45.092593 NaN ... 851.199962 627.391357
## 132 319.380592 NaN ... 1096.934419 704.939392
## 169 28.733930 NaN ... 21310.485877 626.529541
## 199 2087.912140 NaN ... 401.094096 330.398956
## 274 28.352463 NaN ... 1046.999636 696.583984
##
## [125 rows x 7 columns]>
multiple_pred <- fread("multiple_pred.csv")
ensemble <- multiple_pred %>%
rowwise() %>%
mutate(mean = mean(c(cph,Wei,log,logistic,xgb_masks)),
median = median(c(cph,Wei,log,logistic,xgb_masks)),
max = max(c(cph,Wei,log,logistic,xgb_masks)),
min = min(c(cph,Wei,log,logistic,xgb_masks)),
quantile1 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[1]],
quantile2 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[2]],
quantile3 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[3]],
quantile4 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[4]],
quantile5 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[5]])
require(corrplot)
M <- cor(ensemble %>% select(-PatientID, - Event))
#corrplot(M, method = "circle")
corrplot.mixed(M, tl.col="black", tl.pos = "lt")
The idea is to merge UNCORRELATED model prediction. xgb_model seems to be not correlated with all the other.
The key idea is the average xgb_model with the best score of cph, Wei, log , and logistic.
In our laste run cph seems to have the best score (the model c-index scoring is ramdonly changing)