1 Preprocessing of Radiomics and Clinical datasets with R
2 Evaluate CoxPH, WeiBull, Log and Logistic Models with Python
3 Multiple Predictions (cph, Wei, log, logistic)
4 Masks dataset processing
5 xgboost learning model using Masks
6 Prediction by xgboost model
7 Averaging Ensemble with R

In this report, I will go directly to the solution. A HTML version in available fowing this link.
A second document link is available in this link. It will focus on:
- Exploration of Radiomics, Clinical and Images data sets.
- Event prediction using features and images.

# Set python environment and version in RStudio ;-)
reticulate::use_python("/Users/Mezhoud/anaconda3/bin/python3", required = TRUE)
reticulate::py_config()

## python:         /Users/Mezhoud/anaconda3/bin/python3
## libpython:      /Users/Mezhoud/anaconda3/lib/libpython3.7m.dylib
## pythonhome:     /Users/Mezhoud/anaconda3:/Users/Mezhoud/anaconda3
## version:        3.7.5 (default, Oct 25 2019, 10:52:18)  [Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy:          /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/numpy
## numpy_version:  1.18.1
## 
## NOTE: Python version was forced by use_python function

1 Preprocessing of Radiomics and Clinical datasets with R

# Function to impute age  
age_imputation <- function(df){
            init = mice(df, maxit=0)
            meth = init$method
            predM = init$predictorMatrix
            predM[, c("PatientID")] <- 0
            meth[c("PatientID")]=""
            meth[c("age")]="cart"  # pmm (Predictive Mean Matching suitable for numeric variables )
            set.seed(103)
            imputed = mice(df, method=meth, predictorMatrix=predM, m=5)
            imputed <- complete(imputed)
            return(imputed)
}

# Function to reshape and merge Radiomics and clinical datasets
Features_prepocessing <- function(path_radiomics, path_clinical, path_submission){

  #Load dataset
  radiomics <- fread(path_radiomics, quote = "")
  clinical <- fread(path_clinical)
  submission <- fread(path_submission, quote = "")

# Attributes groups to (shape, firstorder, textural) and features to (original_shaep_Compacteness1 .....)
  groups <- radiomics[1:2,-1] %>%
    t() %>%
    as.data.frame() %>%
    rename("Groups" = V1, "Features" = V2)

# Omit "original_" from features
  new_colnames_radiomics <- groups %>%
    mutate(Features = stringr::str_remove(Features,"original_")) %>%
    pull(Features)

# Renames columns of radiomics
  old_names <- colnames(radiomics)
  new_names <- c("PatientID", new_colnames_radiomics)

# Get new radiomics dataset
  new_radiomics <- radiomics[-1:-3,] %>%
      rename_at(vars(old_names), ~ new_names) %>%
      mutate_if(is.character, as.numeric)
  
# Get new Clinical dataset
# age imputation
  clinical <- age_imputation(clinical)
# Convert character variables to numeric
  new_clinical <- clinical %>%
                mutate(Histology = stringi::stri_trans_totitle(Histology)) %>% 
                mutate_if(is.character, as.factor) %>%
                mutate_if(is.factor, as.numeric) 

# Join radimics and Clinical dataset by PatientID
  features <- new_clinical %>%
    mutate_if(is.character, as.factor) %>%
    left_join(y = submission, by = "PatientID") %>%
    left_join(y = new_radiomics, by = "PatientID") %>%
    dplyr::select(PatientID, SurvivalTime, everything()) %>%
    setDT()

  return(features)
}


train <- Features_prepocessing(path_radiomics = "train/features/radiomics.csv",
                      path_clinical = "train/features/clinical_data.csv",
                      path_submission = "output_train.csv")

## 
##  iter imp variable
##   1   1  age
##   1   2  age
##   1   3  age
##   1   4  age
##   1   5  age
##   2   1  age
##   2   2  age
##   2   3  age
##   2   4  age
##   2   5  age
##   3   1  age
##   3   2  age
##   3   3  age
##   3   4  age
##   3   5  age
##   4   1  age
##   4   2  age
##   4   3  age
##   4   4  age
##   4   5  age
##   5   1  age
##   5   2  age
##   5   3  age
##   5   4  age
##   5   5  age

test <- Features_prepocessing(path_radiomics = "test/features/radiomics.csv",
                      path_clinical = "test/features/clinical_data.csv",
                      path_submission = "output_test.csv")

## 
##  iter imp variable
##   1   1  age
##   1   2  age
##   1   3  age
##   1   4  age
##   1   5  age
##   2   1  age
##   2   2  age
##   2   3  age
##   2   4  age
##   2   5  age
##   3   1  age
##   3   2  age
##   3   3  age
##   3   4  age
##   3   5  age
##   4   1  age
##   4   2  age
##   4   3  age
##   4   4  age
##   4   5  age
##   5   1  age
##   5   2  age
##   5   3  age
##   5   4  age
##   5   5  age

fwrite(train, "train.csv")
fwrite(test, "test.csv")

train[1:10, 1:10]

##     PatientID SurvivalTime Histology Mstage Nstage SourceDataset Tstage     age
##  1:       202         1378         1      0      0             2      2 66.0000
##  2:       371          379         2      0      2             1      4 64.5722
##  3:       246          573         6      0      3             1      2 66.0452
##  4:       240          959         4      0      2             1      3 59.3566
##  5:       284         2119         6      0      3             1      4 71.0554
##  6:       348          706         6      0      2             1      2 65.0212
##  7:       384           78         6      0      0             1      3 78.7105
##  8:       244         1369         1      0      0             2      1 70.0000
##  9:       100          197         1      0      0             1      4 74.4504
## 10:       173          196         1      0      2             1      4 53.0842
##     Event shape_Compactness1
##  1:     0         0.02781503
##  2:     1         0.02301549
##  3:     1         0.02734811
##  4:     0         0.02681111
##  5:     0         0.02369124
##  6:     1         0.03098136
##  7:     1         0.02828906
##  8:     1         0.02048073
##  9:     1         0.02802794
## 10:     1         0.01844193

2 Evaluate CoxPH, WeiBull, Log and Logistic Models with Python

## Load usefull Python modules 
import os
import numpy as np
import pandas as pd
import mxnet as mx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from lifelines.utils import concordance_index
from lifelines import CoxPHFitter, WeibullAFTFitter , LogNormalAFTFitter, LogLogisticAFTFitter , PiecewiseExponentialRegressionFitter
from lifelines.utils import k_fold_cross_validation
import seaborn as sns
import xgboost as xgb
import time
import re
import glob
import random
import pickle


# Function to read and select pertinente features
def select_features(path):
  df = pd.read_csv(path) #,header=0, delim_whitespace=True
  df = df.set_index('PatientID')
  # set target ytrain
  ydf = df[["SurvivalTime","Event"]]
  # set variable matrix xtrain
  xdf = df.drop(['SurvivalTime', 'Event'], axis=1)
  xdf = xdf[["shape_Sphericity", "shape_SurfaceVolumeRatio", "shape_Maximum3DDiameter",
    "glcm_Id","glcm_Idm" ,"SourceDataset","Nstage","Tstage","age"  ,"firstorder_Entropy" ]] # , "Histology"
  df = pd.concat([xdf, ydf], axis = 1)
  return(df)
    

# Function to evaluate 4 models and return datafarame with scores, cph, Wei, log and logistic models
def evalModels(dt, dtype):
  
  ##  CoxPHFitter model
  cph = CoxPHFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
  ## WeibullAFTFitter
  Wei = WeibullAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
  ## LogNormalAFTFitter
  log = LogNormalAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
  ## LogLogisticAFTFitter
  logistic = LogLogisticAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
  
  ## k_fold Cross validation 
  cph_cv_result = k_fold_cross_validation(cph, dt, duration_col='SurvivalTime', event_col='Event', k=5)
  wei_cv_result = k_fold_cross_validation(Wei, dt, duration_col='SurvivalTime', event_col='Event', k=5)
  log_cv_result = k_fold_cross_validation(log, dt, duration_col='SurvivalTime', event_col='Event', k=5)
  logistic_cv_result = k_fold_cross_validation(logistic, dt, duration_col='SurvivalTime', event_col='Event', k=5)
  
  # built empty dataframe
  results = pd.DataFrame(np.nan, 
                    index=[ 'CoxPHFitter',
                   'WeibullAFTFitter',
                   'LogNormalAFTFitter',
                   'Logistic'],
                    columns=[dtype],
                    dtype='float')
                    
  # Fill dataframe with the median c-index scores                  
  results[dtype] = [np.median(cph_cv_result),
                   np.median(wei_cv_result),
                   np.median(log_cv_result),
                   np.median(logistic_cv_result)]
  
  return [results,cph, Wei, log, logistic]


train = select_features("train.csv")
random.seed(1234)
models = evalModels(dt = train, dtype = "Features")

# save the model to disk
filename = 'models.sav'
pickle.dump(models, open(filename, 'wb'))

# load the model from disk
#loaded_models = pickle.load(open(filename, 'rb'))
#result = loaded_models.score(X_test, Y_test)
#print(result)

models[0]

##                     Features
## CoxPHFitter         0.663320
## WeibullAFTFitter    0.692367
## LogNormalAFTFitter  0.692047
## Logistic            0.674817

3 Multiple Predictions (cph, Wei, log, logistic)

# Function to run the 4 predictions
def getMultiplePred(path_test):
  # prepare xtest
  test_fea = pd.read_csv(path_test)
  test_fea = test_fea.set_index('PatientID')
  # select the same variables
  xtest = test_fea[["shape_Sphericity", "shape_SurfaceVolumeRatio", "shape_Maximum3DDiameter","firstorder_Entropy",
        "glcm_Id","glcm_Idm","SourceDataset","Nstage","Tstage","age", "Histology"]]
  
  
  # get models
  #models = evalModels(dt = train, dtype = "Features")
  # load the model from disk
  models = pickle.load(open("models.sav", 'rb'))
  
  # predict
  cph_pred = models[1].predict_expectation(xtest)
  Wei_pred = models[2].predict_expectation(xtest)
  log_pred = models[3].predict_expectation(xtest)
  logistic_pred = models[4].predict_expectation(xtest)
  
  # Load output test file
  output_test = pd.read_csv("output_test.csv")
  # index PatientID
  output_test = output_test.set_index('PatientID')
  #pd.merge(df1, df2, left_index=True, right_index=True)
  
  # add predcition to SurvivalTime
  output_test['cph'] = cph_pred
  output_test['Wei'] = Wei_pred
  output_test['log'] = log_pred
  output_test['logistic'] = logistic_pred

  return output_test
  

multiple_pred = getMultiplePred("test.csv")  
multiple_pred.head

## <bound method NDFrame.head of            SurvivalTime  Event  ...           log      logistic
## PatientID                       ...                            
## 13           788.417673    NaN  ...   2863.813411   3103.516701
## 155          427.650092    NaN  ...   2483.604469   2173.214436
## 404          173.587222    NaN  ...   1191.491172   1017.250112
## 407          389.877973    NaN  ...   2424.686018   2518.838431
## 9           1580.767244    NaN  ...  18493.224723  24536.947878
## ...                 ...    ...  ...           ...           ...
## 66            45.092593    NaN  ...    736.921194    851.199962
## 132          319.380592    NaN  ...   1246.060135   1096.934419
## 169           28.733930    NaN  ...  16203.232367  21310.485877
## 199         2087.912140    NaN  ...    631.785965    401.094096
## 274           28.352463    NaN  ...    924.206163   1046.999636
## 
## [125 rows x 6 columns]>

4 Masks dataset processing

In this section we will use masks to train xgboost model.


# This function reshape masks image (300, 92, 92, 92) to 300, 778688)
# accepts as input a path to folder npz files, and the number of the file in the folder.
def masks2dtrain(path, size):
  #path = 'train/images/'
  npz = [np.load(path + '/' + s) for s in os.listdir(path)]
  masks = [item['mask'] for item in npz]
  x = np.array(masks, dtype = int)
  x = x.reshape(size,-1) # convert (300, 92, 92, 92) to (300, 778688)
  return(x)


xtrain = masks2dtrain(path ='train/images/', size = 300)

features_tr = pd.read_csv("train.csv")
ytrain = features_tr[['SurvivalTime']]#.as_matrix()  

trn_x, val_x, trn_y, val_y = train_test_split(xtrain, ytrain, random_state = 42,  # stratify = ytrain,(if classes)
                                                                   test_size = 0.20)

5 xgboost learning model using Masks

start_time = time.process_time()

xgb_model = xgb.XGBRegressor(booster = 'gbtree',
                      objective = 'reg:squarederror', #'survival:cox',  #'reg:squarederror',  # reg:linear reg:logistic
                      max_depth = 10,
                      n_estimators = 100,
                      min_child_weight = 9,
                      learning_rate = 0.1,
                      nthread = 8,
                      subsample = 0.80,
                      colsample_bytree = 0.80,
                      seed = 4242)



xgb_model.fit(trn_x,
          trn_y,
          eval_set = [(val_x, val_y)],
          verbose = True,
          #verbose_eval= 10,  # print every 10 boost
          eval_metric = 'rmse', # rmse, logloss, mae, map, cox-nloglik
          early_stopping_rounds = 10)

## [0]  validation_0-rmse:957.482
## Will train until validation_0-rmse hasn't improved in 10 rounds.
## [1]  validation_0-rmse:902.339
## [2]  validation_0-rmse:853.041
## [3]  validation_0-rmse:804.14
## [4]  validation_0-rmse:771.716
## [5]  validation_0-rmse:753.874
## [6]  validation_0-rmse:730.009
## [7]  validation_0-rmse:716.429
## [8]  validation_0-rmse:701.159
## [9]  validation_0-rmse:682.286
## [10] validation_0-rmse:673.197
## [11] validation_0-rmse:667.523
## [12] validation_0-rmse:665.009
## [13] validation_0-rmse:660.947
## [14] validation_0-rmse:663.217
## [15] validation_0-rmse:661.952
## [16] validation_0-rmse:667.392
## [17] validation_0-rmse:666.433
## [18] validation_0-rmse:664.523
## [19] validation_0-rmse:661.457
## [20] validation_0-rmse:666.046
## [21] validation_0-rmse:667.587
## [22] validation_0-rmse:669.826
## [23] validation_0-rmse:674.962
## Stopping. Best iteration:
## [13] validation_0-rmse:660.947
## 
## XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
##              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
##              importance_type='gain', learning_rate=0.1, max_delta_step=0,
##              max_depth=10, min_child_weight=9, missing=None, n_estimators=100,
##              n_jobs=1, nthread=8, objective='reg:squarederror', random_state=0,
##              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=4242,
##              silent=None, subsample=0.8, verbosity=1)

print("######    Elapsed time: ",str((time.clock() - start_time)/60), "Minutes    ######")

## ######    Elapsed time:  14.45222275 Minutes    ######
## 
## /Users/Mezhoud/anaconda3/bin/python3:1: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead

#save the model to disk
filename_xgb = 'xgb_model.sav'
pickle.dump(xgb_model, open(filename_xgb, 'wb'))

6 Prediction by xgboost model


#def getname(path):
#  name = os.path.basename(path)
#  return(re.sub("\D", "", name))

# get PatientID
#paths = glob.glob('test/images/*.npz')
#labels = [getname(item) for item in paths]


xgb_model = pickle.load(open(filename_xgb, 'rb'))
result = xgb_model.score(val_x, val_y)
#print(result)

xtest = masks2dtrain(path = "test/images/", size = 125)

pred = xgb_model.predict(xtest)

            
multiple_pred['xgb_masks'] = pred

multiple_pred.to_csv("multiple_pred.csv")

multiple_pred.head

## <bound method NDFrame.head of            SurvivalTime  Event  ...      logistic   xgb_masks
## PatientID                       ...                          
## 13           788.417673    NaN  ...   3103.516701  931.311829
## 155          427.650092    NaN  ...   2173.214436  623.559265
## 404          173.587222    NaN  ...   1017.250112  814.161377
## 407          389.877973    NaN  ...   2518.838431  843.975647
## 9           1580.767244    NaN  ...  24536.947878  923.655762
## ...                 ...    ...  ...           ...         ...
## 66            45.092593    NaN  ...    851.199962  627.391357
## 132          319.380592    NaN  ...   1096.934419  704.939392
## 169           28.733930    NaN  ...  21310.485877  626.529541
## 199         2087.912140    NaN  ...    401.094096  330.398956
## 274           28.352463    NaN  ...   1046.999636  696.583984
## 
## [125 rows x 7 columns]>

7 Averaging Ensemble with R

multiple_pred <- fread("multiple_pred.csv")

ensemble <- multiple_pred %>%
  rowwise() %>% 
  mutate(mean = mean(c(cph,Wei,log,logistic,xgb_masks)),
         median = median(c(cph,Wei,log,logistic,xgb_masks)),
         max = max(c(cph,Wei,log,logistic,xgb_masks)),
         min = min(c(cph,Wei,log,logistic,xgb_masks)),
         quantile1 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[1]],
         quantile2 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[2]],
         quantile3 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[3]],
         quantile4 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[4]],
         quantile5 = quantile(c(cph,Wei,log,logistic,xgb_masks))[[5]])

require(corrplot)
M <- cor(ensemble %>% select(-PatientID, - Event))
#corrplot(M,  method = "circle")
corrplot.mixed(M, tl.col="black", tl.pos = "lt")

The idea is to merge UNCORRELATED model prediction. xgb_model seems to be not correlated with all the other.
The key idea is the average xgb_model with the best score of cph, Wei, log , and logistic.
In our laste run cph seems to have the best score (the model c-index scoring is ramdonly changing)

# Mean Ensembling seems to be the best
submission <- multiple_pred %>%
  rowwise() %>% 
  mutate(mean = mean(c(cph,xgb_masks))) %>% # cph,Wei,log,logistic,
  mutate(SurvivalTime = mean) %>%
  select(PatientID, SurvivalTime, Event)

fwrite(submission, "submission_ensemble_mean_cph_xgb.csv")

SurvivalTime prediction of Lung Cancer

Karim Mezhoud

2020-02-13

1 Preprocessing of Radiomics and Clinical datasets with R

2 Evaluate CoxPH, WeiBull, Log and Logistic Models with Python

3 Multiple Predictions (cph, Wei, log, logistic)

4 Masks dataset processing

5 xgboost learning model using Masks

6 Prediction by xgboost model

7 Averaging Ensemble with R