import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("../DATA/data_banknote_authentication.csv")
df.head()
| | Variance_Wavelet | Skewness_Wavelet | Curtosis_Wavelet | Image_Entropy | Class |
|---|---|---|---|---|---|
| 0 | 3.62160 | 8.6661 | -2.8073 | -0.44699 | 0 |
| 1 | 4.54590 | 8.1674 | -2.4586 | -1.46210 | 0 |
| 2 | 3.86600 | -2.6383 | 1.9242 | 0.10645 | 0 |
| 3 | 3.45660 | 9.5228 | -4.0112 | -3.59440 | 0 |
| 4 | 0.32924 | -4.4552 | 4.5718 | -0.98880 | 0 |
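Before splitting the data, a quick sanity check on the class balance and on missing values is cheap and worth doing; a minimal sketch using the Class column shown above:

# Quick sanity checks before modeling
print(df['Class'].value_counts())   # how many samples per class (genuine vs. forged notes)
print(df.isnull().sum())            # confirm there are no missing values per column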
sns.pairplot(df,hue='Class')
[Pairplot: pairwise scatter plots and distributions of the four features, colored by Class]
X = df.drop("Class",axis=1)
y = df["Class"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=101)
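If the class counts turn out to be noticeably unbalanced, passing stratify=y keeps the 0/1 ratio the same in both splits; an optional variant of the same call (not used further below):

# Optional: stratified variant of the split above
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=101, stratify=y)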
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Note: oob_score does not affect the fitted model's predictions; it only adds an out-of-bag accuracy estimate.
# Some grid fits will fail because the grid also generates bootstrap=False with oob_score=True combinations,
# and OOB estimation only works when bootstrap sampling is used.
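A minimal sketch of what the OOB score gives when bootstrap is enabled (the parameters here are illustrative, not the tuned ones):

# Illustration only: oob_score_ is available only when bootstrap=True
rfc_oob = RandomForestClassifier(n_estimators=100, bootstrap=True, oob_score=True, random_state=101)
rfc_oob.fit(X_train, y_train)
print(rfc_oob.oob_score_)   # out-of-bag accuracy, estimated from the samples left out of each bootstrap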
n_estimators = [64, 100, 128, 200]
max_features = [2, 3, 4]
bootstrap = [True, False]
oob_score = [True, False]

param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'bootstrap': bootstrap,
              'oob_score': oob_score}
model=RandomForestClassifier()
grid_model=GridSearchCV(model,param_grid)
grid_model.fit(X_train,y_train)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
60 fits failed out of a total of 240. The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\ensemble\_forest.py", line 411, in fit
    raise ValueError("Out of bag estimation only available if bootstrap=True")
ValueError: Out of bag estimation only available if bootstrap=True
  warnings.warn(some_fits_failed_message, FitFailedWarning)

C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite:
[0.99571549 0.99314038 0.99399875 0.99314038 0.99399875 0.99571549
 0.99399875 0.99314038 0.99314038 0.99228201 0.99314038 0.99314038
 0.99228201 0.99314038 0.99228201 0.99399875 0.98799017 0.98799017
 0.98799017 0.98799017 0.98884854 0.98884854 0.98799017 0.98884854
 nan 0.99313305 nan 0.99313305 nan 0.99313672 nan 0.99399508
 nan 0.99141998 nan 0.99056161 nan 0.99228201 nan 0.99227835
 nan 0.97770441 nan 0.98027218 nan 0.97941381 nan 0.97941015]
  category=UserWarning,
GridSearchCV(estimator=RandomForestClassifier(), param_grid={'bootstrap': [True, False], 'max_features': [2, 3, 4], 'n_estimators': [64, 100, 128, 200], 'oob_score': [True, False]})
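The 60 failed fits are exactly the bootstrap=False / oob_score=True combinations (12 combinations × 5 CV folds). GridSearchCV also accepts a list of grids, so those invalid pairs could be excluded up front; a possible sketch reusing the lists defined above:

# Split the grid in two so oob_score=True is only tried together with bootstrap=True
param_grid_valid = [
    {'n_estimators': n_estimators, 'max_features': max_features,
     'bootstrap': [True], 'oob_score': [True, False]},
    {'n_estimators': n_estimators, 'max_features': max_features,
     'bootstrap': [False], 'oob_score': [False]},
]
grid_model_valid = GridSearchCV(RandomForestClassifier(), param_grid_valid)
# grid_model_valid.fit(X_train, y_train)   # same search space, no FitFailedWarning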
grid_model.best_params_
{'bootstrap': True, 'max_features': 2, 'n_estimators': 64, 'oob_score': True}
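Since the best parameters use bootstrapping with OOB scoring, the refit best estimator also carries an out-of-bag estimate that can be compared to the cross-validated score:

print(grid_model.best_score_)                    # mean cross-validated accuracy of the best parameters
print(grid_model.best_estimator_.oob_score_)     # out-of-bag accuracy of the refit forest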
predict=grid_model.predict(X_test)
from sklearn.metrics import accuracy_score, plot_confusion_matrix
# To decide how many trees we really need, track the test error as n_estimators grows
# and look for the point where it levels off.
errors = []
misclassifications = []
for n in range(1, 128):
    rfc = RandomForestClassifier(n_estimators=n, bootstrap=True, max_features=2)
    rfc.fit(X_train, y_train)
    preds = rfc.predict(X_test)
    err = 1 - accuracy_score(y_test, preds)   # test error rate for this forest size
    n_missed = np.sum(preds != y_test)        # how many predictions mismatch the actual labels
    errors.append(err)
    misclassifications.append(n_missed)
plt.plot(range(1,128),errors)
[Line plot: test error rate vs. number of trees]
plt.plot(range(1,128),misclassifications)
[Line plot: misclassified test samples vs. number of trees]
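The same curve reads more easily with axis labels; a minimal plotting sketch:

plt.plot(range(1, 128), misclassifications)
plt.xlabel('n_estimators')
plt.ylabel('misclassified test samples')
plt.title('Test misclassifications vs. number of trees')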
plot_confusion_matrix(grid_model,X_test,y_test)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning:
Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2.
Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
[Confusion matrix for grid_model on the test set]
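As the warning says, plot_confusion_matrix is removed in scikit-learn 1.2; the equivalent with the newer API (plus a classification report on the same test predictions) would look roughly like this:

from sklearn.metrics import ConfusionMatrixDisplay, classification_report

ConfusionMatrixDisplay.from_estimator(grid_model, X_test, y_test)   # same plot via the non-deprecated API
print(classification_report(y_test, predict))                       # per-class precision, recall, F1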