# Set python environment and version in RStudio ;-)
reticulate::use_python("/Users/Mezhoud/anaconda3/bin/python3", required = TRUE)
reticulate::py_config()
## python:         /Users/Mezhoud/anaconda3/bin/python3
## libpython:      /Users/Mezhoud/anaconda3/lib/libpython3.7m.dylib
## pythonhome:     /Users/Mezhoud/anaconda3:/Users/Mezhoud/anaconda3
## version:        3.7.5 (default, Oct 25 2019, 10:52:18)  [Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy:          /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/numpy
## numpy_version:  1.18.1
## 
## NOTE: Python version was forced by use_python function

We need the rdkit package from Anaconda.

reticulate::py_module_available("rdkit")
## [1] TRUE

Install rdkit if it is not available:

#conda install -c rdkit rdkit

Import needed modules

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
import joblib

Merge the molecules from the SDF files into one file per class

cat antiFungals/*.sdf > all_antiFungals.sdf
cat antiBacterial_viral_biotic_sulfonamides/*.sdf > antiBacterial_viral_biotic_sulfonamides.sdf
# How many molecules are in each class?
grep -c 'END' antiBacterial_viral_biotic_sulfonamides.sdf
grep -c 'END' all_antiFungals.sdf
## 77
## 74

Merge the two datasets for training

cat all_antiFungals.sdf  antiBacterial_viral_biotic_sulfonamides.sdf > alldata.sdf
grep -c 'END' alldata.sdf
## 151
# concatenate the molecules we want to classify as antifungal or not
cat mols2predict/*.sdf > mols2predict.sdf
grep -c "END" mols2predict.sdf
## 5

1 Reading molecules and activity from SDF

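The chunk that produced this output is hidden in the knit; below is a minimal sketch consistent with it. The names fungal, others, mols and activity are assumptions, as is loading the antifungals first so that they carry label 1.

# Read the merged SDF files, skipping molecules RDKit fails to parse
fungal = [m for m in Chem.SDMolSupplier('all_antiFungals.sdf') if m is not None]
others = [m for m in Chem.SDMolSupplier('antiBacterial_viral_biotic_sulfonamides.sdf') if m is not None]
mols = fungal + others
# Activity: 1 = antifungal, 0 = not antifungal
activity = [1] * len(fungal) + [0] * len(others)
print(type(mols))
print(type(activity))
print(activity)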
## <class 'list'>
## <class 'list'>
## [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2 Calculate descriptors: generate binary Morgan fingerprints with radius 2
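
The fingerprint chunk is hidden; a minimal sketch using the modules imported above (fps and x are assumed names):

# One 2048-bit binary Morgan fingerprint (radius 2) per molecule
fps = []
for mol in mols:
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    arr = np.zeros((2048,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    fps.append(arr)
x = np.vstack(fps)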

3 Add some other descriptors to the Morgan fingerprints and look at the model performance
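
A sketch of this step: the nine descriptors and their order follow the feature numbering discussed in section 7.2, while the helper name add_descriptors is hypothetical.

def add_descriptors(mol):
    # Nine RDKit descriptors appended after the 2048 fingerprint bits
    return [Descriptors.MolLogP(mol), Descriptors.TPSA(mol),
            Descriptors.NHOHCount(mol), Descriptors.NOCount(mol),
            Descriptors.NumHAcceptors(mol), Descriptors.NumHDonors(mol),
            Descriptors.NumRotatableBonds(mol), Descriptors.NumHeteroatoms(mol),
            Descriptors.FractionCSP3(mol)]

x = np.hstack([x, np.array([add_descriptors(m) for m in mols])])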

4 Split the whole set into training and test sets
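
A sketch: test_size=0.2 is consistent with the 120 training and 31 test compounds seen below; the seed is an assumption (42 is the value reported for cross-validation).

y = np.array(activity)
# Hold out ~20% of the 151 compounds as an external test set
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.2, random_state=42)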

5 Create folds for cross-validation

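The fold-generation chunk is hidden; this sketch reproduces the structure of the printed folds. StratifiedKFold(n_splits=5, random_state=42) matches the repr shown in section 7.1, and leaving shuffle at its default (False) explains the FutureWarning:

cv = StratifiedKFold(n_splits=5, random_state=42)
for i, (train_index, test_index) in enumerate(cv.split(x_tr, y_tr), start=1):
    print("\nFold_%d" % i)
    print("TRAIN:", train_index)
    print("TEST:", test_index)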
## /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
##   FutureWarning
## 
## Fold_1
## TRAIN: [ 24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41
##   42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59
##   60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
##   78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
##   96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]
## TEST: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
## 
## Fold_2
## TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##   18  19  20  21  22  23  43  46  49  50  52  53  54  55  56  57  58  59
##   60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
##   78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
##   96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]
## TEST: [24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 44 45 47 48 51]
## 
## Fold_3
## TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
##   36  37  38  39  40  41  42  44  45  47  48  51  72  73  74  75  76  77
##   78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
##   96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]
## TEST: [43 46 49 50 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71]
## 
## Fold_4
## TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
##   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
##   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
##   93  94  96  98  99 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]
## TEST: [ 72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
##   90  91  92  95  97 100]
## 
## Fold_5
## TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
##   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
##   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
##   72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
##   90  91  92  95  97 100]
## TEST: [ 93  94  96  98  99 101 102 103 104 105 106 107 108 109 110 111 112 113
##  114 115 116 117 118 119]

6 Scale X

This step may be crucial for certain modeling approaches such as SVM. In the case of binary fingerprints it may be less useful.

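A sketch, assuming only the appended descriptor columns (index 2048 onward) are scaled; the file name is taken from the output below:

# Fit the scaler on the training descriptors only, then reuse it everywhere
scale = StandardScaler().fit(x_tr[:, 2048:])
x_tr[:, 2048:] = scale.transform(x_tr[:, 2048:])
x_ts[:, 2048:] = scale.transform(x_ts[:, 2048:])
joblib.dump(scale, 'MolLogP_scale.pkl')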
## ['MolLogP_scale.pkl']

7 Random Forest model

7.1 Search for optimal tuning parameters and build the model

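A sketch of the grid search; the grid values match the output below, though deriving max_features as fractions of the 2057 features is an assumption:

n_feat = x_tr.shape[1]   # 2048 bits + 9 descriptors = 2057
param_grid = {'max_features': [n_feat // 10, n_feat // 7, n_feat // 5, n_feat // 3],
              'n_estimators': [100, 250, 500]}
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)
m.fit(x_tr, y_tr)
m.best_params_
m.best_score_
m.cv_results_
m.cv_results_['mean_test_score']
m.cv_results_['params']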
## Fitting 5 folds for each of 12 candidates, totalling 60 fits
## GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
##              error_score=nan,
##              estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
##                                               class_weight=None,
##                                               criterion='gini', max_depth=None,
##                                               max_features='auto',
##                                               max_leaf_nodes=None,
##                                               max_samples=None,
##                                               min_impurity_decrease=0.0,
##                                               min_impurity_split=None,
##                                               min_samples_leaf=1,
##                                               min_samples_split=2,
##                                               min_weight_fraction_leaf=0.0,
##                                               n_estimators=100, n_jobs=None,
##                                               oob_score=False,
##                                               random_state=None, verbose=0,
##                                               warm_start=False),
##              iid='deprecated', n_jobs=2,
##              param_grid={'max_features': [205, 293, 411, 685],
##                          'n_estimators': [100, 250, 500]},
##              pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
##              scoring=None, verbose=1)
## 
## [Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
## [Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   34.4s
## [Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   46.7s finished
## {'max_features': 205, 'n_estimators': 100}
## 0.9
## {'mean_fit_time': array([0.60722475, 1.75145659, 2.51453605, 0.33783221, 0.80729842,
##        1.78976154, 0.48680797, 0.78129458, 2.09814954, 0.54545541,
##        2.22143626, 1.88560381]), 'std_fit_time': array([0.09260572, 0.27016048, 0.86679645, 0.00646859, 0.10116017,
##        0.53013058, 0.23748165, 0.03057221, 0.4368675 , 0.20756133,
##        0.62861573, 0.29820018]), 'mean_score_time': array([0.07566233, 0.12821608, 0.09338694, 0.01213961, 0.02948012,
##        0.07251606, 0.01536078, 0.03292408, 0.08377967, 0.02686019,
##        0.06860328, 0.06122513]), 'std_score_time': array([0.08724742, 0.05741384, 0.04072479, 0.0022696 , 0.00465124,
##        0.0257454 , 0.00288178, 0.00383592, 0.04685363, 0.02011621,
##        0.07345874, 0.01344932]), 'param_max_features': masked_array(data=[205, 205, 205, 293, 293, 293, 411, 411, 411, 685, 685,
##                    685],
##              mask=[False, False, False, False, False, False, False, False,
##                    False, False, False, False],
##        fill_value='?',
##             dtype=object), 'param_n_estimators': masked_array(data=[100, 250, 500, 100, 250, 500, 100, 250, 500, 100, 250,
##                    500],
##              mask=[False, False, False, False, False, False, False, False,
##                    False, False, False, False],
##        fill_value='?',
##             dtype=object), 'params': [{'max_features': 205, 'n_estimators': 100}, {'max_features': 205, 'n_estimators': 250}, {'max_features': 205, 'n_estimators': 500}, {'max_features': 293, 'n_estimators': 100}, {'max_features': 293, 'n_estimators': 250}, {'max_features': 293, 'n_estimators': 500}, {'max_features': 411, 'n_estimators': 100}, {'max_features': 411, 'n_estimators': 250}, {'max_features': 411, 'n_estimators': 500}, {'max_features': 685, 'n_estimators': 100}, {'max_features': 685, 'n_estimators': 250}, {'max_features': 685, 'n_estimators': 500}], 'split0_test_score': array([0.875     , 0.83333333, 0.875     , 0.875     , 0.83333333,
##        0.875     , 0.875     , 0.83333333, 0.875     , 0.83333333,
##        0.83333333, 0.83333333]), 'split1_test_score': array([0.95833333, 0.95833333, 0.95833333, 0.91666667, 0.95833333,
##        0.95833333, 0.91666667, 0.91666667, 0.95833333, 0.95833333,
##        0.95833333, 0.95833333]), 'split2_test_score': array([0.875     , 0.83333333, 0.83333333, 0.875     , 0.83333333,
##        0.875     , 0.83333333, 0.83333333, 0.83333333, 0.875     ,
##        0.83333333, 0.83333333]), 'split3_test_score': array([0.875     , 0.83333333, 0.91666667, 0.83333333, 0.91666667,
##        0.875     , 0.875     , 0.875     , 0.91666667, 0.83333333,
##        0.83333333, 0.875     ]), 'split4_test_score': array([0.91666667, 0.875     , 0.875     , 0.875     , 0.875     ,
##        0.875     , 0.875     , 0.875     , 0.875     , 0.875     ,
##        0.875     , 0.875     ]), 'mean_test_score': array([0.9       , 0.86666667, 0.89166667, 0.875     , 0.88333333,
##        0.89166667, 0.875     , 0.86666667, 0.89166667, 0.875     ,
##        0.86666667, 0.875     ]), 'std_test_score': array([0.03333333, 0.04859127, 0.04249183, 0.02635231, 0.04859127,
##        0.03333333, 0.02635231, 0.03118048, 0.04249183, 0.04564355,
##        0.04859127, 0.04564355]), 'rank_test_score': array([ 1, 10,  2,  6,  5,  2,  6, 10,  2,  6, 10,  6], dtype=int32)}
## array([0.9       , 0.86666667, 0.89166667, 0.875     , 0.88333333,
##        0.89166667, 0.875     , 0.86666667, 0.89166667, 0.875     ,
##        0.86666667, 0.875     ])
## [{'max_features': 205, 'n_estimators': 100}, {'max_features': 205, 'n_estimators': 250}, {'max_features': 205, 'n_estimators': 500}, {'max_features': 293, 'n_estimators': 100}, {'max_features': 293, 'n_estimators': 250}, {'max_features': 293, 'n_estimators': 500}, {'max_features': 411, 'n_estimators': 100}, {'max_features': 411, 'n_estimators': 250}, {'max_features': 411, 'n_estimators': 500}, {'max_features': 685, 'n_estimators': 100}, {'max_features': 685, 'n_estimators': 250}, {'max_features': 685, 'n_estimators': 500}]

7.2 Let’s try to analyse which variables are the most important in the model

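A sketch: refit the best configuration with a fixed seed (the repr below shows random_state=42) and rank the features by impurity-based importance:

rf = RandomForestClassifier(n_estimators=m.best_params_['n_estimators'],
                            max_features=m.best_params_['max_features'],
                            random_state=42)
rf.fit(x_tr, y_tr)
rf.feature_importances_
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))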
## RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
##                        criterion='gini', max_depth=None, max_features=205,
##                        max_leaf_nodes=None, max_samples=None,
##                        min_impurity_decrease=0.0, min_impurity_split=None,
##                        min_samples_leaf=1, min_samples_split=2,
##                        min_weight_fraction_leaf=0.0, n_estimators=100,
##                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
##                        warm_start=False)
## array([0.        , 0.00702565, 0.        , ..., 0.01457611, 0.02615353,
##        0.01482525])
## Feature ranking:
## 1. feature 2048 (0.065727)
## 2. feature 2049 (0.061273)
## 3. feature 2050 (0.039736)
## 4. feature 2051 (0.035545)
## 5. feature 525 (0.030458)
## 6. feature 2052 (0.029534)
## 7. feature 2055 (0.026154)
## 8. feature 1603 (0.025509)
## 9. feature 1274 (0.023881)
## 10. feature 2053 (0.019019)
  • Features 0-2047 are the Morgan fingerprint bits; the nine appended descriptors follow them, numbered to match the 0-based ranking printed above:

2048 - MolLogP, 2049 - TPSA, 2050 - NHOHCount, 2051 - NOCount, 2052 - NumHAcceptors, 2053 - NumHDonors, 2054 - NumRotatableBonds, 2055 - NumHeteroatoms, 2056 - FractionCSP3

7.3 Predict test set compounds

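A one-line sketch of this step (pred_rf is an assumed name):

pred_rf = rf.predict(x_ts)
pred_rf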
## array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
##        1, 1, 0, 0, 0, 1, 0, 1, 1])

7.4 Calculate statistics for test set predictions

7.4.1 For Classification

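These three metrics were imported at the top of the post; a sketch:

print("Accuracy = ", accuracy_score(y_ts, pred_rf))
print("MCC = ", matthews_corrcoef(y_ts, pred_rf))
print("Kappa = ", cohen_kappa_score(y_ts, pred_rf))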
## Accuracy =  0.967741935483871
## MCC =  0.9375
## Kappa =  0.9355509355509355

7.4.2 For Regression

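The regression helpers from sklearn.metrics reproduce this output; the sm alias is an assumption:

import sklearn.metrics as sm
print("Mean absolute error =", round(sm.mean_absolute_error(y_ts, pred_rf), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_ts, pred_rf), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_ts, pred_rf), 2))
print("Explain variance score =", round(sm.explained_variance_score(y_ts, pred_rf), 2))
print("R2 score =", round(sm.r2_score(y_ts, pred_rf), 2))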
## Mean absolute error = 0.03
## Mean squared error = 0.03
## Median absolute error = 0.0
## Explain variance score = 0.88
## R2 score = 0.87
  • Mean absolute error: This is the average of the absolute errors of all the data points in the given dataset.

  • Mean squared error: This is the average of the squared errors of all the data points in the given dataset. It is one of the most widely used regression metrics.

  • Median absolute error: This is the median of all the errors in the given dataset. The main advantage of this metric is that it’s robust to outliers. A single bad point in the test dataset wouldn’t skew the entire error metric, as opposed to a mean error metric.

  • Explained variance score: This score measures how well our model can account for the variation in our dataset. A score of 1.0 indicates that our model is perfect.

  • R2 score: Pronounced "R squared", this is the coefficient of determination. It tells us how well the model will predict unknown samples. The best possible score is 1.0, but the score can also be negative.

7.5 Applicability domain estimates

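The exact rule is not shown; flagging test compounds whose predicted class probability reaches 0.95 reproduces the boolean mask below (so the cutoff is an inference):

prob_rf = rf.predict_proba(x_ts)
prob_rf
# Inside the applicability domain = the forest is nearly unanimous
np.amax(prob_rf, axis=1) >= 0.95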
## array([[0.83      , 0.17      ],
##        [0.12      , 0.88      ],
##        [0.79      , 0.21      ],
##        [0.77      , 0.23      ],
##        [0.96      , 0.04      ],
##        [0.01      , 0.99      ],
##        [0.85      , 0.15      ],
##        [0.07      , 0.93      ],
##        [0.19333333, 0.80666667],
##        [1.        , 0.        ],
##        [0.24      , 0.76      ],
##        [0.03      , 0.97      ],
##        [0.08      , 0.92      ],
##        [0.        , 1.        ],
##        [0.02      , 0.98      ],
##        [0.65      , 0.35      ],
##        [0.92166667, 0.07833333],
##        [0.92      , 0.08      ],
##        [0.13      , 0.87      ],
##        [0.94      , 0.06      ],
##        [0.06      , 0.94      ],
##        [0.86666667, 0.13333333],
##        [0.21333333, 0.78666667],
##        [0.44      , 0.56      ],
##        [1.        , 0.        ],
##        [0.91      , 0.09      ],
##        [0.955     , 0.045     ],
##        [0.09      , 0.91      ],
##        [0.73      , 0.27      ],
##        [0.1       , 0.9       ],
##        [0.02      , 0.98      ]])
## array([False, False, False, False,  True,  True, False, False, False,
##         True, False,  True, False,  True,  True, False, False, False,
##        False, False, False, False, False, False,  True, False,  True,
##        False, False, False,  True])

8 GBM Model

8.1 For Classification

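A sketch of the GBM grid search; subsample=0.5 and max_features=0.5 are taken from the estimator repr below:

gbm = GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5),
                   {'n_estimators': [100, 200, 300, 400, 500]},
                   n_jobs=2, cv=cv, verbose=1)
gbm.fit(x_tr, y_tr)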
## Fitting 5 folds for each of 5 candidates, totalling 25 fits
## GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
##              error_score=nan,
##              estimator=GradientBoostingClassifier(ccp_alpha=0.0,
##                                                   criterion='friedman_mse',
##                                                   init=None, learning_rate=0.1,
##                                                   loss='deviance', max_depth=3,
##                                                   max_features=0.5,
##                                                   max_leaf_nodes=None,
##                                                   min_impurity_decrease=0.0,
##                                                   min_impurity_split=None,
##                                                   min_samples_leaf=1,
##                                                   min_samples_split=2,
##                                                   min_weight_fraction_leaf=0.0,
##                                                   n_estimators=100,
##                                                   n_iter_no_change=None,
##                                                   presort='deprecated',
##                                                   random_state=None,
##                                                   subsample=0.5, tol=0.0001,
##                                                   validation_fraction=0.1,
##                                                   verbose=0, warm_start=False),
##              iid='deprecated', n_jobs=2,
##              param_grid={'n_estimators': [100, 200, 300, 400, 500]},
##              pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
##              scoring=None, verbose=1)
## 
## [Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
## [Parallel(n_jobs=2)]: Done  25 out of  25 | elapsed:    6.5s finished

8.1.1 Best score and parameters

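A sketch (attribute access on the fitted GridSearchCV):

gbm.best_score_
gbm.best_params_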
## 0.9083333333333334
## {'n_estimators': 500}

8.2 For Regression

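A sketch following the same pattern as section 7 (pred_gbm is an assumed name; sm is the alias introduced in 7.4.2):

pred_gbm = gbm.predict(x_ts)
print("Mean absolute error =", round(sm.mean_absolute_error(y_ts, pred_gbm), 2))
print("R2 score =", round(sm.r2_score(y_ts, pred_gbm), 2))
gbm.best_estimator_.feature_importances_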
## Mean absolute error = 0.03
## Mean squared error = 0.03
## Median absolute error = 0.0
## Explain variance score = 0.88
## R2 score = 0.87
## array([0.        , 0.00020011, 0.        , ..., 0.00252872, 0.01372849,
##        0.00562592])
## Feature ranking:
## 1. feature 2048 (0.065727)
## 2. feature 2049 (0.061273)
## 3. feature 1829 (0.000414)
## 4. feature 105 (0.000408)
## 5. feature 839 (0.000506)
## 6. feature 571 (0.000000)
## 7. feature 678 (0.000167)
## 8. feature 1859 (0.011451)
## 9. feature 1468 (0.000963)
## 10. feature 2053 (0.019019)

9 Consensus model (ensemble)

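How the two models are combined is not shown; averaging their class probabilities and taking the more likely class is one common choice, sketched here:

prob_consensus = (rf.predict_proba(x_ts) + gbm.predict_proba(x_ts)) / 2
pred_consensus = np.argmax(prob_consensus, axis=1)
pred_consensus
print("Accuracy = ", accuracy_score(y_ts, pred_consensus))
print("MCC = ", matthews_corrcoef(y_ts, pred_consensus))
print("Kappa = ", cohen_kappa_score(y_ts, pred_consensus))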
## array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
##        1, 1, 0, 0, 0, 1, 0, 1, 1])
## Accuracy =  0.967741935483871
## MCC =  0.9375
## Kappa =  0.9355509355509355

10 Classify unknown molecules as antifungal or not

10.1 Predict unknown molecules

10.1.1 With Random Forest

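A sketch that featurizes the unknowns exactly like the training set, reusing the hypothetical add_descriptors helper from section 3 and the saved scaler:

unknown = [m for m in Chem.SDMolSupplier('mols2predict.sdf') if m is not None]
rows = []
for mol in unknown:
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    arr = np.zeros((2048,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    rows.append(np.concatenate([arr, add_descriptors(mol)]))
x_new = np.vstack(rows)
x_new[:, 2048:] = scale.transform(x_new[:, 2048:])   # same scaler as for training
rf.predict(x_new)
rf.predict_proba(x_new)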
## array([1, 1, 1, 1, 1])
## array([[0.06833333, 0.93166667],
##        [0.03      , 0.97      ],
##        [0.07      , 0.93      ],
##        [0.06      , 0.94      ],
##        [0.25      , 0.75      ]])
## > <PUBCHEM_COMPOUND_CID>
## 5280965
## --
## > <PUBCHEM_COMPOUND_CID>
## 7311
## --
## > <PUBCHEM_COMPOUND_CID>
## 3083779
## --
## > <PUBCHEM_COMPOUND_CID>
## 638072
## --
## > <PUBCHEM_COMPOUND_CID>
## 335377
## array([False,  True, False, False, False])
## {'Amphotericin b': False, '2,4-Di-tert-butylphenol': True, '17-Methyloctadecanoic acid': False, 'squalene': False, '4-tert-Butylcalix[4]arene': False}
## With Probabilities:
## [[0.06833333333333333, 0.9316666666666665], [0.03, 0.97], [0.07, 0.93], [0.06, 0.94], [0.25, 0.75]]
  • which are: Amphotericin b; 2,4-Di-tert-butylphenol; 17-Methyloctadecanoic acid; squalene; 4-tert-Butylcalix[4]arene

10.1.2 With GBM

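The same feature matrix goes to the boosted model (a sketch):

gbm.predict(x_new)
gbm.predict_proba(x_new)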
## array([1, 1, 1, 1, 1])
## array([[3.11358544e-04, 9.99688641e-01],
##        [1.56321917e-05, 9.99984368e-01],
##        [5.68785359e-06, 9.99994312e-01],
##        [6.33273246e-06, 9.99993667e-01],
##        [4.62616133e-03, 9.95373839e-01]])
## > <PUBCHEM_COMPOUND_CID>
## 5280965
## --
## > <PUBCHEM_COMPOUND_CID>
## 7311
## --
## > <PUBCHEM_COMPOUND_CID>
## 3083779
## --
## > <PUBCHEM_COMPOUND_CID>
## 638072
## --
## > <PUBCHEM_COMPOUND_CID>
## 335377
## array([ True,  True,  True,  True,  True])
## {'Amphotericin b': True, '2,4-Di-tert-butylphenol': True, '17-Methyloctadecanoic acid': True, 'squalene': True, '4-tert-Butylcalix[4]arene': True}
## With Probabilities:
## [[0.0003113585439208366, 0.9996886414560792], [1.5632191663561557e-05, 0.9999843678083364], [5.687853591229697e-06, 0.9999943121464088], [6.3327324594242285e-06, 0.9999936672675406], [0.004626161325013256, 0.9953738386749867]]
  • which are: Amphotericin b; 2,4-Di-tert-butylphenol; 17-Methyloctadecanoic acid; squalene; 4-tert-Butylcalix[4]arene