# Set python environment and version in RStudio ;-)
reticulate::use_python("/Users/Mezhoud/anaconda3/bin/python3", required = TRUE)
reticulate::py_config()
## python: /Users/Mezhoud/anaconda3/bin/python3
## libpython: /Users/Mezhoud/anaconda3/lib/libpython3.7m.dylib
## pythonhome: /Users/Mezhoud/anaconda3:/Users/Mezhoud/anaconda3
## version: 3.7.5 (default, Oct 25 2019, 10:52:18) [Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy: /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/numpy
## numpy_version: 1.18.1
##
## NOTE: Python version was forced by use_python function
We need the rdkit package from Anaconda.
## [1] TRUE
Install rdkit if it is not available.
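The install command itself is not shown; a minimal sketch of the check (the conda-forge channel suggested in the comment is an assumption):

try:
    import rdkit          # check that rdkit is importable from the selected interpreter
    print(True)           # matches the TRUE output above
except ImportError:
    # if this prints False, install it into the conda environment,
    # e.g. conda install -c conda-forge rdkit
    print(False)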
Import needed modules
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
import joblib
Merge molecules from SDF files into one file
# How many molecules in each class?
grep -c 'END' antiBacterial_viral_biotic_sulfonamides.sdf
grep -c 'END' all_antiFungals.sdf
## 77
## 74
Merge the two datasets for training
cat all_antiFungals.sdf antiBacterial_viral_biotic_sulfonamides.sdf > alldata.sdf
grep -c 'END' alldata.sdf
## 151
# concatenate our list of molecules to classify as antifungal or not
cat mols2predict/*.sdf > mols2predict.sdf
grep -c "END" mols2predict.sdf
## 5
We use ALOGPS_LOGP as the activity, but we could select another feature.
fname = "alldata.sdf"
mols = []
y = []
for mol in Chem.SDMolSupplier(fname):
    if mol is not None:
        mols.append(mol)
        #y.append(Descriptors.MolLogP(mol)) # for a regression model
# y for the classification model: 74 antifungals (1) followed by 77 non-antifungals (0)
y = [1] * 74 + [0] * 77
## <class 'list'>
## <class 'list'>
## [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
## Reformat unknown molecules
fname = "mols2predict.sdf"
mols2pred = []
for mol in Chem.SDMolSupplier(fname):
if mol is not None:
mols2pred.append(mol)
#mols[1].GetDoubleProp('PUBCHEM_CONFORMER_RMSD') # works when comment y.append(Descriptors.MolLogP(mol))
#mols[1].GetIntProp("JCHEM_NUMBER_OF_RINGS")
Descriptors.MolLogP(mols[1])
## 6.117800000000004
## 0.7117000000000069
# generate binary Morgan fingerprint with radius 2
fp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]
fp2pred = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols2pred]
def rdkit_numpy_convert(fp):
    output = []
    for f in fp:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)
fpx = rdkit_numpy_convert(fp)
fpx2pred = rdkit_numpy_convert(fp2pred)
fpx.shape
## (151, 2048)
## (5, 2048)
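The value below is presumably the fraction of active molecules, 74/151 ≈ 0.49 (the exact call is not shown; a guess):

np.mean(y)  # fraction of antifungal (active) molecules: 74/151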
## 0.4900662251655629
A list of descriptors is available here.
# calc some descriptors
descr = []
for m in mols:
    descr.append([Descriptors.MolLogP(m),
                  Descriptors.TPSA(m),
                  Descriptors.NHOHCount(m),
                  Descriptors.NOCount(m),
                  Descriptors.NumHAcceptors(m),
                  Descriptors.NumHDonors(m),
                  Descriptors.NumRotatableBonds(m),
                  #Descriptors.VSA_EState1(m),
                  Descriptors.NumHeteroatoms(m),
                  Descriptors.FractionCSP3(m)])
descr = np.asarray(descr)
# calc some descriptors for unknown molecules
descr2pred = []
for m in mols2pred:
    descr2pred.append([Descriptors.MolLogP(m),
                       Descriptors.TPSA(m),
                       Descriptors.NHOHCount(m),
                       Descriptors.NOCount(m),
                       Descriptors.NumHAcceptors(m),
                       Descriptors.NumHDonors(m),
                       Descriptors.NumRotatableBonds(m),
                       #Descriptors.VSA_EState1(m),
                       Descriptors.NumHeteroatoms(m),
                       Descriptors.FractionCSP3(m)])
descr2pred = np.asarray(descr2pred)
descr.shape
## (151, 9)
## (5, 9)
# add them to morgan fingerprints
x = np.concatenate((fpx, descr), axis=1)
x2pred = np.concatenate((fpx2pred, descr2pred), axis=1)
x.shape
## (151, 2057)
## (5, 2057)
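The train/test split itself does not appear above; a minimal sketch consistent with the fold sizes printed below (the 80/20 ratio and the seed value of 42 are assumptions inferred from the output):

# assumed, not shown in the original: hold out ~20% of the 151 molecules,
# leaving 120 for training and 31 for testing (matching the output below)
seed = 42
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.2, random_state=seed)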
cv = StratifiedKFold(n_splits=5, random_state=seed) # binary y
#cv = KFold(n_splits=5, random_state=seed) # continuous y
# print out ids of folds
## /Users/Mezhoud/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
## FutureWarning
for i, (train_index, test_index) in enumerate(cv.split(x_tr, y_tr)):
    print("\nFold_" + str(i+1))
    print("TRAIN:", train_index)
    print("TEST:", test_index)
##
## Fold_1
## TRAIN: [ 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
## 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
## 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
## 114 115 116 117 118 119]
## TEST: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
##
## Fold_2
## TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 18 19 20 21 22 23 43 46 49 50 52 53 54 55 56 57 58 59
## 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
## 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
## 114 115 116 117 118 119]
## TEST: [24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 44 45 47 48 51]
##
## Fold_3
## TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## 36 37 38 39 40 41 42 44 45 47 48 51 72 73 74 75 76 77
## 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
## 114 115 116 117 118 119]
## TEST: [43 46 49 50 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71]
##
## Fold_4
## TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
## 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
## 93 94 96 98 99 101 102 103 104 105 106 107 108 109 110 111 112 113
## 114 115 116 117 118 119]
## TEST: [ 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
## 90 91 92 95 97 100]
##
## Fold_5
## TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
## 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
## 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
## 90 91 92 95 97 100]
## TEST: [ 93 94 96 98 99 101 102 103 104 105 106 107 108 109 110 111 112 113
## 114 115 116 117 118 119]
This step may be crucial for certain modeling approaches like SVM. In the case of binary fingerprints it may be less useful.
# obtain a scale object which can later be applied to scale any data consistently with the training set
scale = StandardScaler().fit(x_tr)
x_tr = scale.transform(x_tr)
# it is a good idea to save it for future use
joblib.dump(scale, "MolLogP_scale.pkl", compress=3)
## ['MolLogP_scale.pkl']
# create grid search dictionary
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3],
"n_estimators": [100, 250, 500]}
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1) # classification
#m = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=2, cv=cv, verbose=1) # Regression
# run model building
m.fit(x_tr, y_tr)
## Fitting 5 folds for each of 12 candidates, totalling 60 fits
## GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
## error_score=nan,
## estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
## class_weight=None,
## criterion='gini', max_depth=None,
## max_features='auto',
## max_leaf_nodes=None,
## max_samples=None,
## min_impurity_decrease=0.0,
## min_impurity_split=None,
## min_samples_leaf=1,
## min_samples_split=2,
## min_weight_fraction_leaf=0.0,
## n_estimators=100, n_jobs=None,
## oob_score=False,
## random_state=None, verbose=0,
## warm_start=False),
## iid='deprecated', n_jobs=2,
## param_grid={'max_features': [205, 293, 411, 685],
## 'n_estimators': [100, 250, 500]},
## pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
## scoring=None, verbose=1)
##
## [Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
## [Parallel(n_jobs=2)]: Done 46 tasks | elapsed: 34.4s
## [Parallel(n_jobs=2)]: Done 60 out of 60 | elapsed: 46.7s finished
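The inspection calls producing the output below are not shown; presumably something like:

# presumably: inspect the grid-search results
print(m.best_params_)
print(m.best_score_)
print(m.cv_results_)
print(m.cv_results_['mean_test_score'])
print(m.cv_results_['params'])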
## {'max_features': 205, 'n_estimators': 100}
## 0.9
## {'mean_fit_time': array([0.60722475, 1.75145659, 2.51453605, 0.33783221, 0.80729842,
## 1.78976154, 0.48680797, 0.78129458, 2.09814954, 0.54545541,
## 2.22143626, 1.88560381]), 'std_fit_time': array([0.09260572, 0.27016048, 0.86679645, 0.00646859, 0.10116017,
## 0.53013058, 0.23748165, 0.03057221, 0.4368675 , 0.20756133,
## 0.62861573, 0.29820018]), 'mean_score_time': array([0.07566233, 0.12821608, 0.09338694, 0.01213961, 0.02948012,
## 0.07251606, 0.01536078, 0.03292408, 0.08377967, 0.02686019,
## 0.06860328, 0.06122513]), 'std_score_time': array([0.08724742, 0.05741384, 0.04072479, 0.0022696 , 0.00465124,
## 0.0257454 , 0.00288178, 0.00383592, 0.04685363, 0.02011621,
## 0.07345874, 0.01344932]), 'param_max_features': masked_array(data=[205, 205, 205, 293, 293, 293, 411, 411, 411, 685, 685,
## 685],
## mask=[False, False, False, False, False, False, False, False,
## False, False, False, False],
## fill_value='?',
## dtype=object), 'param_n_estimators': masked_array(data=[100, 250, 500, 100, 250, 500, 100, 250, 500, 100, 250,
## 500],
## mask=[False, False, False, False, False, False, False, False,
## False, False, False, False],
## fill_value='?',
## dtype=object), 'params': [{'max_features': 205, 'n_estimators': 100}, {'max_features': 205, 'n_estimators': 250}, {'max_features': 205, 'n_estimators': 500}, {'max_features': 293, 'n_estimators': 100}, {'max_features': 293, 'n_estimators': 250}, {'max_features': 293, 'n_estimators': 500}, {'max_features': 411, 'n_estimators': 100}, {'max_features': 411, 'n_estimators': 250}, {'max_features': 411, 'n_estimators': 500}, {'max_features': 685, 'n_estimators': 100}, {'max_features': 685, 'n_estimators': 250}, {'max_features': 685, 'n_estimators': 500}], 'split0_test_score': array([0.875 , 0.83333333, 0.875 , 0.875 , 0.83333333,
## 0.875 , 0.875 , 0.83333333, 0.875 , 0.83333333,
## 0.83333333, 0.83333333]), 'split1_test_score': array([0.95833333, 0.95833333, 0.95833333, 0.91666667, 0.95833333,
## 0.95833333, 0.91666667, 0.91666667, 0.95833333, 0.95833333,
## 0.95833333, 0.95833333]), 'split2_test_score': array([0.875 , 0.83333333, 0.83333333, 0.875 , 0.83333333,
## 0.875 , 0.83333333, 0.83333333, 0.83333333, 0.875 ,
## 0.83333333, 0.83333333]), 'split3_test_score': array([0.875 , 0.83333333, 0.91666667, 0.83333333, 0.91666667,
## 0.875 , 0.875 , 0.875 , 0.91666667, 0.83333333,
## 0.83333333, 0.875 ]), 'split4_test_score': array([0.91666667, 0.875 , 0.875 , 0.875 , 0.875 ,
## 0.875 , 0.875 , 0.875 , 0.875 , 0.875 ,
## 0.875 , 0.875 ]), 'mean_test_score': array([0.9 , 0.86666667, 0.89166667, 0.875 , 0.88333333,
## 0.89166667, 0.875 , 0.86666667, 0.89166667, 0.875 ,
## 0.86666667, 0.875 ]), 'std_test_score': array([0.03333333, 0.04859127, 0.04249183, 0.02635231, 0.04859127,
## 0.03333333, 0.02635231, 0.03118048, 0.04249183, 0.04564355,
## 0.04859127, 0.04564355]), 'rank_test_score': array([ 1, 10, 2, 6, 5, 2, 6, 10, 2, 6, 10, 6], dtype=int32)}
## array([0.9 , 0.86666667, 0.89166667, 0.875 , 0.88333333,
## 0.89166667, 0.875 , 0.86666667, 0.89166667, 0.875 ,
## 0.86666667, 0.875 ])
## [{'max_features': 205, 'n_estimators': 100}, {'max_features': 205, 'n_estimators': 250}, {'max_features': 205, 'n_estimators': 500}, {'max_features': 293, 'n_estimators': 100}, {'max_features': 293, 'n_estimators': 250}, {'max_features': 293, 'n_estimators': 500}, {'max_features': 411, 'n_estimators': 100}, {'max_features': 411, 'n_estimators': 250}, {'max_features': 411, 'n_estimators': 500}, {'max_features': 685, 'n_estimators': 100}, {'max_features': 685, 'n_estimators': 250}, {'max_features': 685, 'n_estimators': 500}]
# CLASSIFICATION
# rebuild RF model manually using best parameters to be able to extract additional information from the model
rf = RandomForestClassifier(n_estimators=m.best_params_["n_estimators"],
                            max_features=m.best_params_["max_features"],
                            random_state=seed)
rf.fit(x_tr, y_tr)
# REGRESSION
# rebuild RF model manually using best parameters to be able to extract additional information from the model
# rf = RandomForestRegressor(n_estimators=m.best_params_["n_estimators"],
#                            max_features=m.best_params_["max_features"],
#                            random_state=seed)
# rf.fit(x_tr, y_tr)
## RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
## criterion='gini', max_depth=None, max_features=205,
## max_leaf_nodes=None, max_samples=None,
## min_impurity_decrease=0.0, min_impurity_split=None,
## min_samples_leaf=1, min_samples_split=2,
## min_weight_fraction_leaf=0.0, n_estimators=100,
## n_jobs=None, oob_score=False, random_state=42, verbose=0,
## warm_start=False)
## array([0. , 0.00702565, 0. , ..., 0.01457611, 0.02615353,
## 0.01482525])
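The ranking loop is not shown; a sketch following the standard scikit-learn recipe (printing the top ten is an assumption):

# a sketch, assumed: rank features by RF importance and print the top 10
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for i in range(10):
    print("%d. feature %d (%f)" % (i + 1, indices[i], importances[indices[i]]))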
## Feature ranking:
## 1. feature 2048 (0.065727)
## 2. feature 2049 (0.061273)
## 3. feature 2050 (0.039736)
## 4. feature 2051 (0.035545)
## 5. feature 525 (0.030458)
## 6. feature 2052 (0.029534)
## 7. feature 2055 (0.026154)
## 8. feature 1603 (0.025509)
## 9. feature 1274 (0.023881)
## 10. feature 2053 (0.019019)
Descriptor column indices (0-based, after the 2048 Morgan bits): 2048 - MolLogP, 2049 - TPSA, 2050 - NHOHCount, 2051 - NOCount, 2052 - NumHAcceptors, 2053 - NumHDonors, 2054 - NumRotatableBonds, 2055 - NumHeteroatoms, 2056 - FractionCSP3
# load scale if necessary
scale = joblib.load("MolLogP_scale.pkl")
# scale descriptors of the test set compounds
x_ts = scale.transform(x_ts)
# predict the antifungal class
pred_rf = rf.predict(x_ts)
pred_rf
## array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
## 1, 1, 0, 0, 0, 1, 0, 1, 1])
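The metric calls are not shown; presumably, using the functions imported earlier:

print("Accuracy =", accuracy_score(y_ts, pred_rf))
print("MCC =", matthews_corrcoef(y_ts, pred_rf))
print("Kappa =", cohen_kappa_score(y_ts, pred_rf))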
## Accuracy = 0.967741935483871
## MCC = 0.9375
## Kappa = 0.9355509355509355
import sklearn.metrics as sm
print("Mean absolute error =", round(sm.mean_absolute_error(y_ts, pred_rf), 2))
## Mean absolute error = 0.03
## Mean squared error = 0.03
## Median absolute error = 0.0
## Explain variance score = 0.88
## R2 score = 0.87
Mean absolute error: This is the average of absolute errors of all the data points in the given dataset.
Mean squared error: This is the average of the squares of the errors of all the data points in the given dataset. It is one of the most popular metrics out there!
Median absolute error: This is the median of all the errors in the given dataset. The main advantage of this metric is that it’s robust to outliers. A single bad point in the test dataset wouldn’t skew the entire error metric, as opposed to a mean error metric.
Explained variance score: This score measures how well our model can account for the variation in our dataset. A score of 1.0 indicates that our model is perfect.
R2 score: This is pronounced as R-squared, and this score refers to the coefficient of determination. This tells us how well the unknown samples will be predicted by our model. The best possible score is 1.0, but the score can be negative as well.
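To make these definitions concrete, a small toy example (illustrative values only, not part of the analysis):

# toy example: the same metrics on a four-point vector with one wrong prediction
import numpy as np
import sklearn.metrics as sm
y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 0])
print(sm.mean_absolute_error(y_true, y_pred))       # 0.25 (one error of 1 over 4 points)
print(sm.mean_squared_error(y_true, y_pred))        # 0.25
print(sm.median_absolute_error(y_true, y_pred))     # 0.0 (the median ignores the single bad point)
print(sm.explained_variance_score(y_true, y_pred))  # 0.25
print(sm.r2_score(y_true, y_pred))                  # 0.0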
# if the model is an ensemble of several models, like RF or consensus models (or a probabilistic model),
# we can calculate the consistency of predictions among those models and use it to estimate the applicability domain
pred_prob = rf.predict_proba(x_ts)
pred_prob
## array([[0.83 , 0.17 ],
## [0.12 , 0.88 ],
## [0.79 , 0.21 ],
## [0.77 , 0.23 ],
## [0.96 , 0.04 ],
## [0.01 , 0.99 ],
## [0.85 , 0.15 ],
## [0.07 , 0.93 ],
## [0.19333333, 0.80666667],
## [1. , 0. ],
## [0.24 , 0.76 ],
## [0.03 , 0.97 ],
## [0.08 , 0.92 ],
## [0. , 1. ],
## [0.02 , 0.98 ],
## [0.65 , 0.35 ],
## [0.92166667, 0.07833333],
## [0.92 , 0.08 ],
## [0.13 , 0.87 ],
## [0.94 , 0.06 ],
## [0.06 , 0.94 ],
## [0.86666667, 0.13333333],
## [0.21333333, 0.78666667],
## [0.44 , 0.56 ],
## [1. , 0. ],
## [0.91 , 0.09 ],
## [0.955 , 0.045 ],
## [0.09 , 0.91 ],
## [0.73 , 0.27 ],
## [0.1 , 0.9 ],
## [0.02 , 0.98 ]])
# setup threshold
threshold = 0.95
# calc maximum predicted probability for each row (compound) and compare to the threshold
da = np.amax(pred_prob, axis=1) > threshold
da
## array([False, False, False, False, True, True, False, False, False,
## True, False, True, False, True, True, False, False, False,
## False, False, False, False, False, False, True, False, True,
## False, False, False, True])
# setup model building
param_grid = {"n_estimators": [100, 200, 300, 400, 500]}
gbm = GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5),
param_grid, n_jobs=2, cv=cv, verbose=1)
# run model building
gbm.fit(x_tr, y_tr)
## Fitting 5 folds for each of 5 candidates, totalling 25 fits
## GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
## error_score=nan,
## estimator=GradientBoostingClassifier(ccp_alpha=0.0,
## criterion='friedman_mse',
## init=None, learning_rate=0.1,
## loss='deviance', max_depth=3,
## max_features=0.5,
## max_leaf_nodes=None,
## min_impurity_decrease=0.0,
## min_impurity_split=None,
## min_samples_leaf=1,
## min_samples_split=2,
## min_weight_fraction_leaf=0.0,
## n_estimators=100,
## n_iter_no_change=None,
## presort='deprecated',
## random_state=None,
## subsample=0.5, tol=0.0001,
## validation_fraction=0.1,
## verbose=0, warm_start=False),
## iid='deprecated', n_jobs=2,
## param_grid={'n_estimators': [100, 200, 300, 400, 500]},
## pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
## scoring=None, verbose=1)
##
## [Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
## [Parallel(n_jobs=2)]: Done 25 out of 25 | elapsed: 6.5s finished
# # setup model building
# param_grid = {"n_estimators": [100, 200, 300, 400, 500]}
# gbm = GridSearchCV(GradientBoostingRegressor(subsample=0.5, max_features=0.5),
# param_grid, n_jobs=2, cv=cv, verbose=1)
#
# # run model building
# gbm.fit(x_tr, y_tr)
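Presumably the best cross-validated score and parameter set are printed next (calls not shown):

print(gbm.best_score_)
print(gbm.best_params_)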
## 0.9083333333333334
## {'n_estimators': 500}
## Accuracy = 0.967741935483871
## MCC = 0.9375
## Kappa = 0.9355509355509355
pred_gbm = gbm.predict(x_ts)
print("Mean absolute error =", round(sm.mean_absolute_error(y_ts, pred_gbm), 2))
## Mean absolute error = 0.03
## Mean squared error = 0.03
## Median absolute error = 0.0
## Explain variance score = 0.88
## R2 score = 0.87
from sklearn.inspection import permutation_importance
imp_gbm = gbm.best_estimator_.feature_importances_
imp_gbm
## array([0. , 0.00020011, 0. , ..., 0.00252872, 0.01372849,
## 0.00562592])
## Feature ranking:
## 1. feature 2048 (0.065727)
## 2. feature 2049 (0.061273)
## 3. feature 1829 (0.000414)
## 4. feature 105 (0.000408)
## 5. feature 839 (0.000506)
## 6. feature 571 (0.000000)
## 7. feature 678 (0.000167)
## 8. feature 1859 (0.011451)
## 9. feature 1468 (0.000963)
## 10. feature 2053 (0.019019)
## array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
## 1, 1, 0, 0, 0, 1, 0, 1, 1])
## Accuracy = 0.967741935483871
## MCC = 0.9375
## Kappa = 0.9355509355509355
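The code predicting the five unknown molecules is not shown; a sketch consistent with the variable name pred_unknown_rf_prob used below (scaling x2pred with the saved scaler is an assumption):

# assumed, not shown: scale the unknown molecules and predict with the RF model
x2pred_scaled = scale.transform(x2pred)
pred_unknown_rf = rf.predict(x2pred_scaled)
pred_unknown_rf_prob = rf.predict_proba(x2pred_scaled)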
## array([1, 1, 1, 1, 1])
## array([[0.06833333, 0.93166667],
## [0.03 , 0.97 ],
## [0.07 , 0.93 ],
## [0.06 , 0.94 ],
## [0.25 , 0.75 ]])
## > <PUBCHEM_COMPOUND_CID>
## 5280965
## --
## > <PUBCHEM_COMPOUND_CID>
## 7311
## --
## > <PUBCHEM_COMPOUND_CID>
## 3083779
## --
## > <PUBCHEM_COMPOUND_CID>
## 638072
## --
## > <PUBCHEM_COMPOUND_CID>
## 335377
## Applicability domain
da2pred_rf = np.amax(pred_unknown_rf_prob, axis = 1) > threshold
da2pred_rf
## array([False, True, False, False, False])
molname = ["Amphotericin b", "2,4-Di-tert-butylphenol", "17-Methyloctadecanoic acid", "squalene", "4-tert-Butylcalix[4]arene"]
dict(zip(molname,da2pred_rf.tolist()))
## {'Amphotericin b': False, '2,4-Di-tert-butylphenol': True, '17-Methyloctadecanoic acid': False, 'squalene': False, '4-tert-Butylcalix[4]arene': False}
## With Probabilities:
## [[0.06833333333333333, 0.9316666666666665], [0.03, 0.97], [0.07, 0.93], [0.06, 0.94], [0.25, 0.75]]
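The same prediction with the GBM model (a sketch; the original code is not shown, and da2pred_gbm used below is assumed to be computed as for RF):

# assumed, not shown: predict the unknown molecules with the GBM model
pred_unknown_gbm = gbm.predict(x2pred_scaled)
pred_unknown_gbm_prob = gbm.predict_proba(x2pred_scaled)
da2pred_gbm = np.amax(pred_unknown_gbm_prob, axis=1) > threshold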
## array([1, 1, 1, 1, 1])
## array([[3.11358544e-04, 9.99688641e-01],
## [1.56321917e-05, 9.99984368e-01],
## [5.68785359e-06, 9.99994312e-01],
## [6.33273246e-06, 9.99993667e-01],
## [4.62616133e-03, 9.95373839e-01]])
## > <PUBCHEM_COMPOUND_CID>
## 5280965
## --
## > <PUBCHEM_COMPOUND_CID>
## 7311
## --
## > <PUBCHEM_COMPOUND_CID>
## 3083779
## --
## > <PUBCHEM_COMPOUND_CID>
## 638072
## --
## > <PUBCHEM_COMPOUND_CID>
## 335377
## array([ True, True, True, True, True])
molname = ["Amphotericin b", "2,4-Di-tert-butylphenol", "17-Methyloctadecanoic acid", "squalene", "4-tert-Butylcalix[4]arene"]
dict(zip(molname,da2pred_gbm.tolist()))
## {'Amphotericin b': True, '2,4-Di-tert-butylphenol': True, '17-Methyloctadecanoic acid': True, 'squalene': True, '4-tert-Butylcalix[4]arene': True}
## With Probabilities:
## [[0.0003113585439208366, 0.9996886414560792], [1.5632191663561557e-05, 0.9999843678083364], [5.687853591229697e-06, 0.9999943121464088], [6.3327324594242285e-06, 0.9999936672675406], [0.004626161325013256, 0.9953738386749867]]