# Set python environment and version in RStudio ;-)
reticulate::use_python("/Users/Mezhoud/anaconda3/bin/python3", required = TRUE)
reticulate::py_config()
## python:         /Users/Mezhoud/anaconda3/bin/python3
## libpython:      /Users/Mezhoud/anaconda3/lib/libpython3.7m.dylib
## pythonhome:     /Users/Mezhoud/anaconda3:/Users/Mezhoud/anaconda3
## version:        3.7.5 (default, Oct 25 2019, 10:52:18)  [Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy:          /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/numpy
## numpy_version:  1.18.1
## 
## NOTE: Python version was forced by use_python function

1 Preprocessing of Radiomics and Clinical datasets with R

# Function to impute age  
age_imputation <- function(df){
            init = mice(df, maxit=0)
            meth = init$method
            predM = init$predictorMatrix
            predM[, c("PatientID")] <- 0
            meth[c("PatientID")]=""
            meth[c("age")]="cart"  # pmm (Predictive Mean Matching suitable for numeric variables )
            set.seed(103)
            imputed = mice(df, method=meth, predictorMatrix=predM, m=5)
            imputed <- complete(imputed)
            return(imputed)
}

# Function to reshape and merge Radiomics and clinical datasets
Features_prepocessing <- function(path_radiomics, path_clinical, path_submission){

  #Load dataset
  radiomics <- fread(path_radiomics, quote = "")
  clinical <- fread(path_clinical)
  submission <- fread(path_submission, quote = "")

# Attributes groups to (shape, firstorder, textural) and features to (original_shaep_Compacteness1 .....)
  groups <- radiomics[1:2,-1] %>%
    t() %>%
    as.data.frame() %>%
    rename("Groups" = V1, "Features" = V2)

# Omit "original_" from features
  new_colnames_radiomics <- groups %>%
    mutate(Features = stringr::str_remove(Features,"original_")) %>%
    pull(Features)

# Renames columns of radiomics
  old_names <- colnames(radiomics)
  new_names <- c("PatientID", new_colnames_radiomics)

# Get new radiomics dataset
  new_radiomics <- radiomics[-1:-3,] %>%
      rename_at(vars(old_names), ~ new_names) %>%
      mutate_if(is.character, as.numeric)
  
# Get new Clinical dataset
# age imputation
  clinical <- age_imputation(clinical)
# Convert character variables to numeric
  new_clinical <- clinical %>%
                mutate(Histology = stringi::stri_trans_totitle(Histology)) %>% 
                mutate_if(is.character, as.factor) %>%
                mutate_if(is.factor, as.numeric) 

# Join radimics and Clinical dataset by PatientID
  features <- new_clinical %>%
    mutate_if(is.character, as.factor) %>%
    left_join(y = submission, by = "PatientID") %>%
    left_join(y = new_radiomics, by = "PatientID") %>%
    dplyr::select(PatientID, SurvivalTime, everything()) %>%
    setDT()

  return(features)
}


train <- Features_prepocessing(path_radiomics = "train/features/radiomics.csv",
                      path_clinical = "train/features/clinical_data.csv",
                      path_submission = "output_train.csv")
## 
##  iter imp variable
##   1   1  age
##   1   2  age
##   1   3  age
##   1   4  age
##   1   5  age
##   2   1  age
##   2   2  age
##   2   3  age
##   2   4  age
##   2   5  age
##   3   1  age
##   3   2  age
##   3   3  age
##   3   4  age
##   3   5  age
##   4   1  age
##   4   2  age
##   4   3  age
##   4   4  age
##   4   5  age
##   5   1  age
##   5   2  age
##   5   3  age
##   5   4  age
##   5   5  age
## 
##  iter imp variable
##   1   1  age
##   1   2  age
##   1   3  age
##   1   4  age
##   1   5  age
##   2   1  age
##   2   2  age
##   2   3  age
##   2   4  age
##   2   5  age
##   3   1  age
##   3   2  age
##   3   3  age
##   3   4  age
##   3   5  age
##   4   1  age
##   4   2  age
##   4   3  age
##   4   4  age
##   4   5  age
##   5   1  age
##   5   2  age
##   5   3  age
##   5   4  age
##   5   5  age
##     PatientID SurvivalTime Histology Mstage Nstage SourceDataset Tstage     age
##  1:       202         1378         1      0      0             2      2 66.0000
##  2:       371          379         2      0      2             1      4 64.5722
##  3:       246          573         6      0      3             1      2 66.0452
##  4:       240          959         4      0      2             1      3 59.3566
##  5:       284         2119         6      0      3             1      4 71.0554
##  6:       348          706         6      0      2             1      2 65.0212
##  7:       384           78         6      0      0             1      3 78.7105
##  8:       244         1369         1      0      0             2      1 70.0000
##  9:       100          197         1      0      0             1      4 74.4504
## 10:       173          196         1      0      2             1      4 53.0842
##     Event shape_Compactness1
##  1:     0         0.02781503
##  2:     1         0.02301549
##  3:     1         0.02734811
##  4:     0         0.02681111
##  5:     0         0.02369124
##  6:     1         0.03098136
##  7:     1         0.02828906
##  8:     1         0.02048073
##  9:     1         0.02802794
## 10:     1         0.01844193

2 Evaluate CoxPH, WeiBull, Log and Logistic Models with Python


# Function to read and select pertinente features
def select_features(path):
  df = pd.read_csv(path) #,header=0, delim_whitespace=True
  df = df.set_index('PatientID')
  # set target ytrain
  ydf = df[["SurvivalTime","Event"]]
  # set variable matrix xtrain
  xdf = df.drop(['SurvivalTime', 'Event'], axis=1)
  xdf = xdf[["shape_Sphericity", "shape_SurfaceVolumeRatio", "shape_Maximum3DDiameter",
    "glcm_Id","glcm_Idm" ,"SourceDataset","Nstage","Tstage","age"  ,"firstorder_Entropy" ]] # , "Histology"
  df = pd.concat([xdf, ydf], axis = 1)
  return(df)
    

# Function to evaluate 4 models and return datafarame with scores, cph, Wei, log and logistic models
def evalModels(dt, dtype):
  
  ##  CoxPHFitter model
  cph = CoxPHFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
  ## WeibullAFTFitter
  Wei = WeibullAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
  ## LogNormalAFTFitter
  log = LogNormalAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
  ## LogLogisticAFTFitter
  logistic = LogLogisticAFTFitter().fit(dt,duration_col = 'SurvivalTime', event_col='Event')
  
  ## k_fold Cross validation 
  cph_cv_result = k_fold_cross_validation(cph, dt, duration_col='SurvivalTime', event_col='Event', k=5)
  wei_cv_result = k_fold_cross_validation(Wei, dt, duration_col='SurvivalTime', event_col='Event', k=5)
  log_cv_result = k_fold_cross_validation(log, dt, duration_col='SurvivalTime', event_col='Event', k=5)
  logistic_cv_result = k_fold_cross_validation(logistic, dt, duration_col='SurvivalTime', event_col='Event', k=5)
  
  # built empty dataframe
  results = pd.DataFrame(np.nan, 
                    index=[ 'CoxPHFitter',
                   'WeibullAFTFitter',
                   'LogNormalAFTFitter',
                   'Logistic'],
                    columns=[dtype],
                    dtype='float')
                    
  # Fill dataframe with the median c-index scores                  
  results[dtype] = [np.median(cph_cv_result),
                   np.median(wei_cv_result),
                   np.median(log_cv_result),
                   np.median(logistic_cv_result)]
  
  return [results,cph, Wei, log, logistic]


train = select_features("train.csv")
random.seed(1234)
models = evalModels(dt = train, dtype = "Features")

# save the model to disk
filename = 'models.sav'
pickle.dump(models, open(filename, 'wb'))

# load the model from disk
#loaded_models = pickle.load(open(filename, 'rb'))
#result = loaded_models.score(X_test, Y_test)
#print(result)

models[0]
##                     Features
## CoxPHFitter         0.663320
## WeibullAFTFitter    0.692367
## LogNormalAFTFitter  0.692047
## Logistic            0.674817

3 Multiple Predictions (cph, Wei, log, logistic)

## <bound method NDFrame.head of            SurvivalTime  Event  ...           log      logistic
## PatientID                       ...                            
## 13           788.417673    NaN  ...   2863.813411   3103.516701
## 155          427.650092    NaN  ...   2483.604469   2173.214436
## 404          173.587222    NaN  ...   1191.491172   1017.250112
## 407          389.877973    NaN  ...   2424.686018   2518.838431
## 9           1580.767244    NaN  ...  18493.224723  24536.947878
## ...                 ...    ...  ...           ...           ...
## 66            45.092593    NaN  ...    736.921194    851.199962
## 132          319.380592    NaN  ...   1246.060135   1096.934419
## 169           28.733930    NaN  ...  16203.232367  21310.485877
## 199         2087.912140    NaN  ...    631.785965    401.094096
## 274           28.352463    NaN  ...    924.206163   1046.999636
## 
## [125 rows x 6 columns]>

4 Masks dataset processing

5 xgboost learning model using Masks

## [0]  validation_0-rmse:957.482
## Will train until validation_0-rmse hasn't improved in 10 rounds.
## [1]  validation_0-rmse:902.339
## [2]  validation_0-rmse:853.041
## [3]  validation_0-rmse:804.14
## [4]  validation_0-rmse:771.716
## [5]  validation_0-rmse:753.874
## [6]  validation_0-rmse:730.009
## [7]  validation_0-rmse:716.429
## [8]  validation_0-rmse:701.159
## [9]  validation_0-rmse:682.286
## [10] validation_0-rmse:673.197
## [11] validation_0-rmse:667.523
## [12] validation_0-rmse:665.009
## [13] validation_0-rmse:660.947
## [14] validation_0-rmse:663.217
## [15] validation_0-rmse:661.952
## [16] validation_0-rmse:667.392
## [17] validation_0-rmse:666.433
## [18] validation_0-rmse:664.523
## [19] validation_0-rmse:661.457
## [20] validation_0-rmse:666.046
## [21] validation_0-rmse:667.587
## [22] validation_0-rmse:669.826
## [23] validation_0-rmse:674.962
## Stopping. Best iteration:
## [13] validation_0-rmse:660.947
## 
## XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
##              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
##              importance_type='gain', learning_rate=0.1, max_delta_step=0,
##              max_depth=10, min_child_weight=9, missing=None, n_estimators=100,
##              n_jobs=1, nthread=8, objective='reg:squarederror', random_state=0,
##              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=4242,
##              silent=None, subsample=0.8, verbosity=1)
## ######    Elapsed time:  14.45222275 Minutes    ######
## 
## /Users/Mezhoud/anaconda3/bin/python3:1: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead

6 Prediction by xgboost model

## <bound method NDFrame.head of            SurvivalTime  Event  ...      logistic   xgb_masks
## PatientID                       ...                          
## 13           788.417673    NaN  ...   3103.516701  931.311829
## 155          427.650092    NaN  ...   2173.214436  623.559265
## 404          173.587222    NaN  ...   1017.250112  814.161377
## 407          389.877973    NaN  ...   2518.838431  843.975647
## 9           1580.767244    NaN  ...  24536.947878  923.655762
## ...                 ...    ...  ...           ...         ...
## 66            45.092593    NaN  ...    851.199962  627.391357
## 132          319.380592    NaN  ...   1096.934419  704.939392
## 169           28.733930    NaN  ...  21310.485877  626.529541
## 199         2087.912140    NaN  ...    401.094096  330.398956
## 274           28.352463    NaN  ...   1046.999636  696.583984
## 
## [125 rows x 7 columns]>

7 Averaging Ensemble with R