In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df = pd.read_csv("../DATA/data_banknote_authentication.csv")
In [3]:
df.head()
Out[3]:
Variance_Wavelet Skewness_Wavelet Curtosis_Wavelet Image_Entropy Class
0 3.62160 8.6661 -2.8073 -0.44699 0
1 4.54590 8.1674 -2.4586 -1.46210 0
2 3.86600 -2.6383 1.9242 0.10645 0
3 3.45660 9.5228 -4.0112 -3.59440 0
4 0.32924 -4.4552 4.5718 -0.98880 0
In [4]:
sns.pairplot(df,hue='Class')
Out[4]:
<seaborn.axisgrid.PairGrid at 0x244b31c8c48>
In [5]:
X = df.drop("Class",axis=1)
y = df["Class"]
In [6]:
from sklearn.model_selection import train_test_split
In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=101)
In [8]:
from sklearn.model_selection import GridSearchCV
In [9]:
from sklearn.ensemble import RandomForestClassifier
In [10]:
# Keep in mind that oob_score won't affect model performance whether it's True or False

# Some fits will fail because the grid generates combinations with bootstrap=False & oob_score=True

# OOB estimation only works when bootstrap=True (a quick check follows this cell)

n_estimators=[64,100,128,200]
max_features= [2,3,4]
bootstrap = [True,False]
oob_score = [True,False]
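As a quick check (not part of the original grid search, and assuming X_train / y_train from the split above), fitting a single forest with the incompatible combination reproduces the error that GridSearchCV reports below as failed fits:

try:
    RandomForestClassifier(bootstrap=False, oob_score=True).fit(X_train, y_train)
except ValueError as err:
    print(err)  # "Out of bag estimation only available if bootstrap=True"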
In [11]:
param_grid = {'n_estimators':n_estimators,
             'max_features':max_features,
             'bootstrap':bootstrap,
             'oob_score':oob_score}
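If you want to avoid those expected failures altogether, one alternative (a sketch, not what this notebook runs) is to pass GridSearchCV a list of grids so that oob_score=True is only ever paired with bootstrap=True; param_grid_clean below is an illustrative name:

param_grid_clean = [
    {'n_estimators': n_estimators, 'max_features': max_features,
     'bootstrap': [True], 'oob_score': [True, False]},
    {'n_estimators': n_estimators, 'max_features': max_features,
     'bootstrap': [False], 'oob_score': [False]},
]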
In [12]:
model = RandomForestClassifier()
grid_model = GridSearchCV(model, param_grid)
In [13]:
grid_model.fit(X_train,y_train)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning: 
60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\ensemble\_forest.py", line 411, in fit
    raise ValueError("Out of bag estimation only available if bootstrap=True")
ValueError: Out of bag estimation only available if bootstrap=True

  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [0.99571549 0.99314038 0.99399875 0.99314038 0.99399875 0.99571549
 0.99399875 0.99314038 0.99314038 0.99228201 0.99314038 0.99314038
 0.99228201 0.99314038 0.99228201 0.99399875 0.98799017 0.98799017
 0.98799017 0.98799017 0.98884854 0.98884854 0.98799017 0.98884854
        nan 0.99313305        nan 0.99313305        nan 0.99313672
        nan 0.99399508        nan 0.99141998        nan 0.99056161
        nan 0.99228201        nan 0.99227835        nan 0.97770441
        nan 0.98027218        nan 0.97941381        nan 0.97941015]
  category=UserWarning,
Out[13]:
GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False], 'max_features': [2, 3, 4],
                         'n_estimators': [64, 100, 128, 200],
                         'oob_score': [True, False]})
In [14]:
grid_model.best_params_
Out[14]:
{'bootstrap': True, 'max_features': 2, 'n_estimators': 64, 'oob_score': True}
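Because the winning combination uses bootstrap=True with oob_score=True, the refit best estimator also carries an out-of-bag accuracy estimate (a minimal sketch, assuming GridSearchCV's default refit=True):

grid_model.best_estimator_.oob_score_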
In [15]:
predict = grid_model.predict(X_test)
In [16]:
from sklearn.metrics import accuracy_score, plot_confusion_matrix
In [17]:
# to determine how many trees we really need and to observe where the error rate levels off
# (a sketch for picking that point follows this cell)

errors = []
misclassifications = []

for n in range(1,128):
    rfc = RandomForestClassifier(n_estimators=n, bootstrap=True, max_features=2)
    rfc.fit(X_train, y_train)
    preds = rfc.predict(X_test)
    err = 1 - accuracy_score(y_test, preds)  # accuracy_score expects (y_true, y_pred)
    n_missed = np.sum(preds != y_test)  # how many predictions mismatch the actual labels
    errors.append(err)
    misclassifications.append(n_missed)
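To read off where the curve flattens, one quick sketch (n_values and best_n are illustrative names) is to take the smallest forest size that reaches the minimum misclassification count:

n_values = list(range(1, 128))
best_n = n_values[int(np.argmin(misclassifications))]
print(best_n, min(misclassifications))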
In [18]:
plt.plot(range(1,128),errors)
Out[18]:
[<matplotlib.lines.Line2D at 0x244b81dce88>]
In [19]:
plt.plot(range(1,128),misclassifications)
Out[19]:
[<matplotlib.lines.Line2D at 0x244b81cc6c8>]
In [20]:
plot_confusion_matrix(grid_model,X_test,y_test)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
Out[20]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x244b824ebc8>
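As the FutureWarning above notes, plot_confusion_matrix is removed in scikit-learn 1.2; an equivalent call using the class-method API would be:

from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(grid_model, X_test, y_test)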