import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("../DATA/data_banknote_authentication.csv")
df.head()
| | Variance_Wavelet | Skewness_Wavelet | Curtosis_Wavelet | Image_Entropy | Class |
|---|---|---|---|---|---|
| 0 | 3.62160 | 8.6661 | -2.8073 | -0.44699 | 0 |
| 1 | 4.54590 | 8.1674 | -2.4586 | -1.46210 | 0 |
| 2 | 3.86600 | -2.6383 | 1.9242 | 0.10645 | 0 |
| 3 | 3.45660 | 9.5228 | -4.0112 | -3.59440 | 0 |
| 4 | 0.32924 | -4.4552 | 4.5718 | -0.98880 | 0 |
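A quick sanity check on the loaded frame (an optional step, assuming the standard UCI banknote data): four numeric wavelet/entropy features, a binary Class label, and no missing values.
# Optional: confirm dtypes, row count, class balance, and absence of missing values
df.info()
df['Class'].value_counts()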
sns.pairplot(df,hue='Class')
<seaborn.axisgrid.PairGrid at 0x244b31c8c48>
X = df.drop("Class",axis=1)
y = df["Class"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=101)
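The split above is not stratified; if you want train and test to keep the same class ratio, train_test_split accepts a stratify argument (a minimal variation, not the split used for the outputs below):
# Optional variation: preserve the 0/1 class proportions in both splits
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
    X, y, test_size=0.15, random_state=101, stratify=y)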
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Keep in mind oob_score won't affect model performance whether it's True or False;
# it only adds an out-of-bag estimate. Errors may occur because combinations of
# bootstrap=False & oob_score=True are generated: OOB estimation works only with bootstrap=True
n_estimators=[64,100,128,200]
max_features= [2,3,4]
bootstrap = [True,False]
oob_score = [True,False]
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'bootstrap': bootstrap,
              'oob_score': oob_score}
model=RandomForestClassifier()
grid_model=GridSearchCV(model,param_grid)
grid_model.fit(X_train,y_train)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\ensemble\_forest.py", line 411, in fit
raise ValueError("Out of bag estimation only available if bootstrap=True")
ValueError: Out of bag estimation only available if bootstrap=True
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [0.99571549 0.99314038 0.99399875 0.99314038 0.99399875 0.99571549
0.99399875 0.99314038 0.99314038 0.99228201 0.99314038 0.99314038
0.99228201 0.99314038 0.99228201 0.99399875 0.98799017 0.98799017
0.98799017 0.98799017 0.98884854 0.98884854 0.98799017 0.98884854
nan 0.99313305 nan 0.99313305 nan 0.99313672
nan 0.99399508 nan 0.99141998 nan 0.99056161
nan 0.99228201 nan 0.99227835 nan 0.97770441
nan 0.98027218 nan 0.97941381 nan 0.97941015]
category=UserWarning,
GridSearchCV(estimator=RandomForestClassifier(),
param_grid={'bootstrap': [True, False], 'max_features': [2, 3, 4],
'n_estimators': [64, 100, 128, 200],
'oob_score': [True, False]})
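The 60 failed fits are exactly the incompatible combinations flagged in the comments above: bootstrap=False with oob_score=True covers 4 n_estimators × 3 max_features = 12 candidates, times the default 5 CV folds = 60 fits out of the 240 total. A minimal standalone reproduction of the error:
# OOB scoring needs bootstrap samples, so this combination fails immediately on fit
rfc_bad = RandomForestClassifier(bootstrap=False, oob_score=True)
try:
    rfc_bad.fit(X_train, y_train)
except ValueError as err:
    print(err)  # Out of bag estimation only available if bootstrap=True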
grid_model.best_params_
{'bootstrap': True, 'max_features': 2, 'n_estimators': 64, 'oob_score': True}
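Besides best_params_, the fitted search also exposes the refitted estimator and its mean cross-validated accuracy (standard GridSearchCV attributes):
print(grid_model.best_score_)   # mean CV accuracy of the best parameter combination
grid_model.best_estimator_      # the RandomForestClassifier refitted on all of X_train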
predict=grid_model.predict(X_test)
from sklearn.metrics import accuracy_score, plot_confusion_matrix
# to determine how many trees we really need and to observe where the error levels off
errors = []
misclassifications = []
for n in range(1,128):
    rfc = RandomForestClassifier(n_estimators=n, bootstrap=True, max_features=2)
    rfc.fit(X_train, y_train)
    preds = rfc.predict(X_test)
    err = 1 - accuracy_score(y_test, preds)
    n_missed = np.sum(preds != y_test)  # how many predictions mismatch the actual label
    errors.append(err)
    misclassifications.append(n_missed)
plt.plot(range(1,128),errors)
[<matplotlib.lines.Line2D at 0x244b81dce88>]
plt.plot(range(1,128),misclassifications)
[<matplotlib.lines.Line2D at 0x244b81cc6c8>]
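Both curves carry the same information, since the error rate is just the misclassification count divided by the test-set size, so one labelled plot is enough to see where adding trees stops paying off; a small sketch:
# Error rate vs. number of trees, with labels to make the plateau easier to read
plt.plot(range(1, 128), errors)
plt.xlabel('n_estimators')
plt.ylabel('test error rate')
plt.show()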
plot_confusion_matrix(grid_model,X_test,y_test)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x244b824ebc8>
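As the FutureWarning says, plot_confusion_matrix is deprecated; the same plot comes from the recommended class method (a sketch for scikit-learn >= 1.0):
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(grid_model, X_test, y_test)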