# Set python environment and version in RStudio ;-)
reticulate::use_python("/Users/Mezhoud/anaconda3/bin/python3", required = TRUE)
reticulate::py_config()
## python:         /Users/Mezhoud/anaconda3/bin/python3
## libpython:      /Users/Mezhoud/anaconda3/lib/libpython3.7m.dylib
## pythonhome:     /Users/Mezhoud/anaconda3:/Users/Mezhoud/anaconda3
## version:        3.7.5 (default, Oct 25 2019, 10:52:18)  [Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy:          /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/numpy
## numpy_version:  1.18.1
## 
## NOTE: Python version was forced by use_python function

We need the rdkit package from Anaconda.

reticulate::py_module_available("rdkit")
## [1] TRUE

Install rdkit if it is not available:

#conda install -c rdkit rdkit

Import needed modules

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
import joblib

Merge the molecules from the SDF files into one file per class

cat antiFungals/*.sdf > all_antiFungals.sdf
cat antiBacterial_viral_biotic_sulfonamides/*.sdf > antiBacterial_viral_biotic_sulfonamides.sdf
# How many molecules are in each class?
grep -c 'END' antiBacterial_viral_biotic_sulfonamides.sdf
grep -c 'END' all_antiFungals.sdf
## 77
## 74

Merge the two datasets for training

cat all_antiFungals.sdf  antiBacterial_viral_biotic_sulfonamides.sdf > alldata.sdf
grep -c 'END' alldata.sdf
## 151
# concatenate the molecules we want to classify as antifungal or not
cat mols2predict/*.sdf > mols2predict.sdf
grep -c "END" mols2predict.sdf
## 5

1 Reading molecules and activity from SDF

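The chunk that produced this output is hidden in the knit; below is a minimal sketch consistent with it. The names fungal, others, mols and activity are assumptions, as is loading the antifungals first so that they carry label 1.

# Read the merged SDF files, skipping molecules RDKit fails to parse
fungal = [m for m in Chem.SDMolSupplier('all_antiFungals.sdf') if m is not None]
others = [m for m in Chem.SDMolSupplier('antiBacterial_viral_biotic_sulfonamides.sdf') if m is not None]
mols = fungal + others
# Activity: 1 = antifungal, 0 = not antifungal
activity = [1] * len(fungal) + [0] * len(others)
print(type(mols))
print(type(activity))
print(activity)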
## <class 'list'>
## <class 'list'>
## [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2 Calculate descriptors: generate binary Morgan fingerprints with radius 2
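
The fingerprint chunk is hidden; a minimal sketch using the modules imported above (fps and x are assumed names):

# One 2048-bit binary Morgan fingerprint (radius 2) per molecule
fps = []
for mol in mols:
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    arr = np.zeros((2048,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    fps.append(arr)
x = np.vstack(fps)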

3 Add some other descriptors to the Morgan fingerprints and look at the model performance
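
A sketch of this step: the nine descriptors and their order follow the feature numbering discussed in section 7.2, while the helper name add_descriptors is hypothetical.

def add_descriptors(mol):
    # Nine RDKit descriptors appended after the 2048 fingerprint bits
    return [Descriptors.MolLogP(mol), Descriptors.TPSA(mol),
            Descriptors.NHOHCount(mol), Descriptors.NOCount(mol),
            Descriptors.NumHAcceptors(mol), Descriptors.NumHDonors(mol),
            Descriptors.NumRotatableBonds(mol), Descriptors.NumHeteroatoms(mol),
            Descriptors.FractionCSP3(mol)]

x = np.hstack([x, np.array([add_descriptors(m) for m in mols])])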

4 Split the whole set into training and test sets
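
A sketch: test_size=0.2 is consistent with the 120 training and 31 test compounds seen below; the seed is an assumption (42 is the value reported for cross-validation).

y = np.array(activity)
# Hold out ~20% of the 151 compounds as an external test set
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.2, random_state=42)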

5 Create folds for cross-validation

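The fold-generation chunk is hidden; this sketch reproduces the structure of the printed folds. StratifiedKFold(n_splits=5, random_state=42) matches the repr shown in section 7.1, and leaving shuffle at its default (False) explains the FutureWarning:

cv = StratifiedKFold(n_splits=5, random_state=42)
for i, (train_index, test_index) in enumerate(cv.split(x_tr, y_tr), start=1):
    print("\nFold_%d" % i)
    print("TRAIN:", train_index)
    print("TEST:", test_index)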
## /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
##   FutureWarning
## 
## Fold_1
## TRAIN: [ 24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41
##   42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59
##   60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
##   78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
##   96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]
## TEST: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
## 
## Fold_2
## TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##   18  19  20  21  22  23  43  46  49  50  52  53  54  55  56  57  58  59
##   60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
##   78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
##   96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]
## TEST: [24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 44 45 47 48 51]
## 
## Fold_3
## TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
##   36  37  38  39  40  41  42  44  45  47  48  51  72  73  74  75  76  77
##   78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
##   96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]
## TEST: [43 46 49 50 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71]
## 
## Fold_4
## TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
##   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
##   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
##   93  94  96  98  99 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]
## TEST: [ 72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
##   90  91  92  95  97 100]
## 
## Fold_5
## TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
##   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
##   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
##   72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
##   90  91  92  95  97 100]
## TEST: [ 93  94  96  98  99 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]

6 Scale X

This step may be crucial for certain modeling approaches such as SVM. In the case of binary fingerprints it may be less useful.

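A sketch, assuming only the appended descriptor columns (index 2048 onward) are scaled; the file name is taken from the output below:

# Fit the scaler on the training descriptors only, then reuse it everywhere
scale = StandardScaler().fit(x_tr[:, 2048:])
x_tr[:, 2048:] = scale.transform(x_tr[:, 2048:])
x_ts[:, 2048:] = scale.transform(x_ts[:, 2048:])
joblib.dump(scale, 'MolLogP_scale.pkl')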
## ['MolLogP_scale.pkl']

7 Random Forest model

7.1 Search for optimal tuning parameters and build the model

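A sketch of the grid search; the grid values match the output below, though deriving max_features as fractions of the 2057 features is an assumption:

n_feat = x_tr.shape[1]   # 2048 bits + 9 descriptors = 2057
param_grid = {'max_features': [n_feat // 10, n_feat // 7, n_feat // 5, n_feat // 3],
              'n_estimators': [100, 250, 500]}
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)
m.fit(x_tr, y_tr)
m.best_params_
m.best_score_
m.cv_results_
m.cv_results_['mean_test_score']
m.cv_results_['params']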
## Fitting 5 folds for each of 12 candidates, totalling 60 fits
## GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
##              error_score=nan,
##              estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
##                                               class_weight=None,
##                                               criterion='gini', max_depth=None,
##                                               max_features='auto',
##                                               max_leaf_nodes=None,
##                                               max_samples=None,
##                                               min_impurity_decrease=0.0,
##                                               min_impurity_split=None,
##                                               min_samples_leaf=1,
##                                               min_samples_split=2,
##                                               min_weight_fraction_leaf=0.0,
##                                               n_estimators=100, n_jobs=None,
##                                               oob_score=False,
##                                               random_state=None, verbose=0,
##                                               warm_start=False),
##              iid='deprecated', n_jobs=2,
##              param_grid={'max_features': [205, 293, 411, 685],
##                          'n_estimators': [100, 250, 500]},
##              pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
##              scoring=None, verbose=1)
## 
## [Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
## [Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   34.4s
## [Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   46.7s finished
## {'max_features': 205, 'n_estimators': 100}
## 0.9
## {'mean_fit_time': array([0.60722475, 1.75145659, 2.51453605, 0.33783221, 0.80729842,
##        1.78976154, 0.48680797, 0.78129458, 2.09814954, 0.54545541,
##        2.22143626, 1.88560381]), 'std_fit_time': array([0.09260572, 0.27016048, 0.86679645, 0.00646859, 0.10116017,
##        0.53013058, 0.23748165, 0.03057221, 0.4368675 , 0.20756133,
##        0.62861573, 0.29820018]), 'mean_score_time': array([0.07566233, 0.12821608, 0.09338694, 0.01213961, 0.02948012,
##        0.07251606, 0.01536078, 0.03292408, 0.08377967, 0.02686019,
##        0.06860328, 0.06122513]), 'std_score_time': array([0.08724742, 0.05741384, 0.04072479, 0.0022696 , 0.00465124,
##        0.0257454 , 0.00288178, 0.00383592, 0.04685363, 0.02011621,
##        0.07345874, 0.01344932]), 'param_max_features': masked_array(data=[205, 205, 205, 293, 293, 293, 411, 411, 411, 685, 685,
##                    685],
##              mask=[False, False, False, False, False, False, False, False,
##                    False, False, False, False],
##        fill_value='?',
##             dtype=object), 'param_n_estimators': masked_array(data=[100, 250, 500, 100, 250, 500, 100, 250, 500, 100, 250,
##                    500],
##              mask=[False, False, False, False, False, False, False, False,
##                    False, False, False, False],
##        fill_value='?',
##             dtype=object), 'params': [{'max_features': 205, 'n_estimators': 100}, {'max_features': 205, 'n_estimators': 250}, {'max_features': 205, 'n_estimators': 500}, {'max_features': 293, 'n_estimators': 100}, {'max_features': 293, 'n_estimators': 250}, {'max_features': 293, 'n_estimators': 500}, {'max_features': 411, 'n_estimators': 100}, {'max_features': 411, 'n_estimators': 250}, {'max_features': 411, 'n_estimators': 500}, {'max_features': 685, 'n_estimators': 100}, {'max_features': 685, 'n_estimators': 250}, {'max_features': 685, 'n_estimators': 500}], 'split0_test_score': array([0.875     , 0.83333333, 0.875     , 0.875     , 0.83333333,
##        0.875     , 0.875     , 0.83333333, 0.875     , 0.83333333,
##        0.83333333, 0.83333333]), 'split1_test_score': array([0.95833333, 0.95833333, 0.95833333, 0.91666667, 0.95833333,
##        0.95833333, 0.91666667, 0.91666667, 0.95833333, 0.95833333,
##        0.95833333, 0.95833333]), 'split2_test_score': array([0.875     , 0.83333333, 0.83333333, 0.875     , 0.83333333,
##        0.875     , 0.83333333, 0.83333333, 0.83333333, 0.875     ,
##        0.83333333, 0.83333333]), 'split3_test_score': array([0.875     , 0.83333333, 0.91666667, 0.83333333, 0.91666667,
##        0.875     , 0.875     , 0.875     , 0.91666667, 0.83333333,
##        0.83333333, 0.875     ]), 'split4_test_score': array([0.91666667, 0.875     , 0.875     , 0.875     , 0.875     ,
##        0.875     , 0.875     , 0.875     , 0.875     , 0.875     ,
##        0.875     , 0.875     ]), 'mean_test_score': array([0.9       , 0.86666667, 0.89166667, 0.875     , 0.88333333,
##        0.89166667, 0.875     , 0.86666667, 0.89166667, 0.875     ,
##        0.86666667, 0.875     ]), 'std_test_score': array([0.03333333, 0.04859127, 0.04249183, 0.02635231, 0.04859127,
##        0.03333333, 0.02635231, 0.03118048, 0.04249183, 0.04564355,
##        0.04859127, 0.04564355]), 'rank_test_score': array([ 1, 10,  2,  6,  5,  2,  6, 10,  2,  6, 10,  6], dtype=int32)}
## array([0.9       , 0.86666667, 0.89166667, 0.875     , 0.88333333,
##        0.89166667, 0.875     , 0.86666667, 0.89166667, 0.875     ,
##        0.86666667, 0.875     ])
## [{'max_features': 205, 'n_estimators': 100}, {'max_features': 205, 'n_estimators': 250}, {'max_features': 205, 'n_estimators': 500}, {'max_features': 293, 'n_estimators': 100}, {'max_features': 293, 'n_estimators': 250}, {'max_features': 293, 'n_estimators': 500}, {'max_features': 411, 'n_estimators': 100}, {'max_features': 411, 'n_estimators': 250}, {'max_features': 411, 'n_estimators': 500}, {'max_features': 685, 'n_estimators': 100}, {'max_features': 685, 'n_estimators': 250}, {'max_features': 685, 'n_estimators': 500}]

7.2 Let’s try to analyse which variables are the most important in the model

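A sketch: refit the best configuration with a fixed seed (the repr below shows random_state=42) and rank the features by impurity-based importance:

rf = RandomForestClassifier(n_estimators=m.best_params_['n_estimators'],
                            max_features=m.best_params_['max_features'],
                            random_state=42)
rf.fit(x_tr, y_tr)
rf.feature_importances_
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))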
## RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
##                        criterion='gini', max_depth=None, max_features=205,
##                        max_leaf_nodes=None, max_samples=None,
##                        min_impurity_decrease=0.0, min_impurity_split=None,
##                        min_samples_leaf=1, min_samples_split=2,
##                        min_weight_fraction_leaf=0.0, n_estimators=100,
##                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
##                        warm_start=False)
## array([0.        , 0.00702565, 0.        , ..., 0.01457611, 0.02615353,
##        0.01482525])
## Feature ranking:
## 1. feature 2048 (0.065727)
## 2. feature 2049 (0.061273)
## 3. feature 2050 (0.039736)
## 4. feature 2051 (0.035545)
## 5. feature 525 (0.030458)
## 6. feature 2052 (0.029534)
## 7. feature 2055 (0.026154)
## 8. feature 1603 (0.025509)
## 9. feature 1274 (0.023881)
## 10. feature 2053 (0.019019)
  • Features 0-2047 are the Morgan fingerprint bits; the nine appended descriptors follow them, numbered to match the 0-based ranking printed above:

2048 - MolLogP, 2049 - TPSA, 2050 - NHOHCount, 2051 - NOCount, 2052 - NumHAcceptors, 2053 - NumHDonors, 2054 - NumRotatableBonds, 2055 - NumHeteroatoms, 2056 - FractionCSP3

7.3 Predict test set compounds

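A one-line sketch of this step (pred_rf is an assumed name):

pred_rf = rf.predict(x_ts)
pred_rf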
## array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
##        1, 1, 0, 0, 0, 1, 0, 1, 1])

7.4 Calculate statistics for test set predictions

7.4.1 For Classification

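These three metrics were imported at the top of the post; a sketch:

print("Accuracy = ", accuracy_score(y_ts, pred_rf))
print("MCC = ", matthews_corrcoef(y_ts, pred_rf))
print("Kappa = ", cohen_kappa_score(y_ts, pred_rf))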
## Accuracy =  0.967741935483871
## MCC =  0.9375
## Kappa =  0.9355509355509355

7.4.2 For Regression

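The regression helpers from sklearn.metrics reproduce this output; the sm alias is an assumption:

import sklearn.metrics as sm
print("Mean absolute error =", round(sm.mean_absolute_error(y_ts, pred_rf), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_ts, pred_rf), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_ts, pred_rf), 2))
print("Explain variance score =", round(sm.explained_variance_score(y_ts, pred_rf), 2))
print("R2 score =", round(sm.r2_score(y_ts, pred_rf), 2))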
## Mean absolute error = 0.03
## Mean squared error = 0.03
## Median absolute error = 0.0
## Explain variance score = 0.88
## R2 score = 0.87
  • Mean absolute error: This is the average of the absolute errors of all the data points in the given dataset.

  • Mean squared error: This is the average of the squared errors of all the data points in the given dataset. It is one of the most widely used regression metrics.

  • Median absolute error: This is the median of all the errors in the given dataset. The main advantage of this metric is that it’s robust to outliers. A single bad point in the test dataset wouldn’t skew the entire error metric, as opposed to a mean error metric.

  • Explained variance score: This score measures how well our model can account for the variation in our dataset. A score of 1.0 indicates that our model is perfect.

  • R2 score: Pronounced "R squared", this is the coefficient of determination. It tells us how well the model will predict unknown samples. The best possible score is 1.0, but the score can also be negative.

7.5 Applicability domain estimates

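The exact rule is not shown; flagging test compounds whose predicted class probability reaches 0.95 reproduces the boolean mask below (so the cutoff is an inference):

prob_rf = rf.predict_proba(x_ts)
prob_rf
# Inside the applicability domain = the forest is nearly unanimous
np.amax(prob_rf, axis=1) >= 0.95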
## array([[0.83      , 0.17      ],
##        [0.12      , 0.88      ],
##        [0.79      , 0.21      ],
##        [0.77      , 0.23      ],
##        [0.96      , 0.04      ],
##        [0.01      , 0.99      ],
##        [0.85      , 0.15      ],
##        [0.07      , 0.93      ],
##        [0.19333333, 0.80666667],
##        [1.        , 0.        ],
##        [0.24      , 0.76      ],
##        [0.03      , 0.97      ],
##        [0.08      , 0.92      ],
##        [0.        , 1.        ],
##        [0.02      , 0.98      ],
##        [0.65      , 0.35      ],
##        [0.92166667, 0.07833333],
##        [0.92      , 0.08      ],
##        [0.13      , 0.87      ],
##        [0.94      , 0.06      ],
##        [0.06      , 0.94      ],
##        [0.86666667, 0.13333333],
##        [0.21333333, 0.78666667],
##        [0.44      , 0.56      ],
##        [1.        , 0.        ],
##        [0.91      , 0.09      ],
##        [0.955     , 0.045     ],
##        [0.09      , 0.91      ],
##        [0.73      , 0.27      ],
##        [0.1       , 0.9       ],
##        [0.02      , 0.98      ]])
## array([False, False, False, False,  True,  True, False, False, False,
##         True, False,  True, False,  True,  True, False, False, False,
##        False, False, False, False, False, False,  True, False,  True,
##        False, False, False,  True])

8 GBM Model

8.1 For Classification

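A sketch of the GBM grid search; subsample=0.5 and max_features=0.5 are taken from the estimator repr below:

gbm = GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5),
                   {'n_estimators': [100, 200, 300, 400, 500]},
                   n_jobs=2, cv=cv, verbose=1)
gbm.fit(x_tr, y_tr)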
## Fitting 5 folds for each of 5 candidates, totalling 25 fits
## GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
##              error_score=nan,
##              estimator=GradientBoostingClassifier(ccp_alpha=0.0,
##                                                   criterion='friedman_mse',
##                                                   init=None, learning_rate=0.1,
##                                                   loss='deviance', max_depth=3,
##                                                   max_features=0.5,
##                                                   max_leaf_nodes=None,
##                                                   min_impurity_decrease=0.0,
##                                                   min_impurity_split=None,
##                                                   min_samples_leaf=1,
##                                                   min_samples_split=2,
##                                                   min_weight_fraction_leaf=0.0,
##                                                   n_estimators=100,
##                                                   n_iter_no_change=None,
##                                                   presort='deprecated',
##                                                   random_state=None,
##                                                   subsample=0.5, tol=0.0001,
##                                                   validation_fraction=0.1,
##                                                   verbose=0, warm_start=False),
##              iid='deprecated', n_jobs=2,
##              param_grid={'n_estimators': [100, 200, 300, 400, 500]},
##              pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
##              scoring=None, verbose=1)
## 
## [Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
## [Parallel(n_jobs=2)]: Done  25 out of  25 | elapsed:    6.5s finished

8.1.1 Best score and parameters

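A sketch (attribute access on the fitted GridSearchCV):

gbm.best_score_
gbm.best_params_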
## 0.9083333333333334
## {'n_estimators': 500}

8.2 For Regression

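A sketch following the same pattern as section 7 (pred_gbm is an assumed name; sm is the alias introduced in 7.4.2):

pred_gbm = gbm.predict(x_ts)
print("Mean absolute error =", round(sm.mean_absolute_error(y_ts, pred_gbm), 2))
print("R2 score =", round(sm.r2_score(y_ts, pred_gbm), 2))
gbm.best_estimator_.feature_importances_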
## Mean absolute error = 0.03
## Mean squared error = 0.03
## Median absolute error = 0.0
## Explain variance score = 0.88
## R2 score = 0.87
## array([0.        , 0.00020011, 0.        , ..., 0.00252872, 0.01372849,
##        0.00562592])
## Feature ranking:
## 1. feature 2048 (0.065727)
## 2. feature 2049 (0.061273)
## 3. feature 1829 (0.000414)
## 4. feature 105 (0.000408)
## 5. feature 839 (0.000506)
## 6. feature 571 (0.000000)
## 7. feature 678 (0.000167)
## 8. feature 1859 (0.011451)
## 9. feature 1468 (0.000963)
## 10. feature 2053 (0.019019)

9 Consensus model (ensemble)

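How the two models are combined is not shown; averaging their class probabilities and taking the more likely class is one common choice, sketched here:

prob_consensus = (rf.predict_proba(x_ts) + gbm.predict_proba(x_ts)) / 2
pred_consensus = np.argmax(prob_consensus, axis=1)
pred_consensus
print("Accuracy = ", accuracy_score(y_ts, pred_consensus))
print("MCC = ", matthews_corrcoef(y_ts, pred_consensus))
print("Kappa = ", cohen_kappa_score(y_ts, pred_consensus))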
## array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
##        1, 1, 0, 0, 0, 1, 0, 1, 1])
## Accuracy =  0.967741935483871
## MCC =  0.9375
## Kappa =  0.9355509355509355

10 Classify unknown molecules as antifungal or not

10.1 Predict unknown molecules

10.1.1 With Random Forest

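A sketch that featurizes the unknowns exactly like the training set, reusing the hypothetical add_descriptors helper from section 3 and the saved scaler:

unknown = [m for m in Chem.SDMolSupplier('mols2predict.sdf') if m is not None]
rows = []
for mol in unknown:
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    arr = np.zeros((2048,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    rows.append(np.concatenate([arr, add_descriptors(mol)]))
x_new = np.vstack(rows)
x_new[:, 2048:] = scale.transform(x_new[:, 2048:])   # same scaler as for training
rf.predict(x_new)
rf.predict_proba(x_new)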
## array([1, 1, 1, 1, 1])
## array([[0.06833333, 0.93166667],
##        [0.03      , 0.97      ],
##        [0.07      , 0.93      ],
##        [0.06      , 0.94      ],
##        [0.25      , 0.75      ]])
## > <PUBCHEM_COMPOUND_CID>
## 5280965
## --
## > <PUBCHEM_COMPOUND_CID>
## 7311
## --
## > <PUBCHEM_COMPOUND_CID>
## 3083779
## --
## > <PUBCHEM_COMPOUND_CID>
## 638072
## --
## > <PUBCHEM_COMPOUND_CID>
## 335377
## array([False,  True, False, False, False])
## {'Amphotericin b': False, '2,4-Di-tert-butylphenol': True, '17-Methyloctadecanoic acid': False, 'squalene': False, '4-tert-Butylcalix[4]arene': False}
## With Probabilities:
## [[0.06833333333333333, 0.9316666666666665], [0.03, 0.97], [0.07, 0.93], [0.06, 0.94], [0.25, 0.75]]
  • which are: Amphotericin b; 2,4-Di-tert-butylphenol; 17-Methyloctadecanoic acid; squalene; 4-tert-Butylcalix[4]arene

10.1.2 With GBM

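The same feature matrix goes to the boosted model (a sketch):

gbm.predict(x_new)
gbm.predict_proba(x_new)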
## array([1, 1, 1, 1, 1])
## array([[3.11358544e-04, 9.99688641e-01],
##        [1.56321917e-05, 9.99984368e-01],
##        [5.68785359e-06, 9.99994312e-01],
##        [6.33273246e-06, 9.99993667e-01],
##        [4.62616133e-03, 9.95373839e-01]])
## > <PUBCHEM_COMPOUND_CID>
## 5280965
## --
## > <PUBCHEM_COMPOUND_CID>
## 7311
## --
## > <PUBCHEM_COMPOUND_CID>
## 3083779
## --
## > <PUBCHEM_COMPOUND_CID>
## 638072
## --
## > <PUBCHEM_COMPOUND_CID>
## 335377
## array([ True,  True,  True,  True,  True])
## {'Amphotericin b': True, '2,4-Di-tert-butylphenol': True, '17-Methyloctadecanoic acid': True, 'squalene': True, '4-tert-Butylcalix[4]arene': True}
## With Probabilities:
## [[0.0003113585439208366, 0.9996886414560792], [1.5632191663561557e-05, 0.9999843678083364], [5.687853591229697e-06, 0.9999943121464088], [6.3327324594242285e-06, 0.9999936672675406], [0.004626161325013256, 0.9953738386749867]]
  • which are: Amphotericin b; 2,4-Di-tert-butylphenol; 17-Methyloctadecanoic acid; squalene; 4-tert-Butylcalix[4]arene