In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df = pd.read_csv("../DATA/data_banknote_authentication.csv")
In [3]:
df.head()
Out[3]:
Variance_Wavelet Skewness_Wavelet Curtosis_Wavelet Image_Entropy Class
0 3.62160 8.6661 -2.8073 -0.44699 0
1 4.54590 8.1674 -2.4586 -1.46210 0
2 3.86600 -2.6383 1.9242 0.10645 0
3 3.45660 9.5228 -4.0112 -3.59440 0
4 0.32924 -4.4552 4.5718 -0.98880 0
In [4]:
sns.pairplot(df,hue='Class')
Out[4]:
<seaborn.axisgrid.PairGrid at 0x244b31c8c48>
In [5]:
X = df.drop("Class",axis=1)
y = df["Class"]
In [6]:
from sklearn.model_selection import train_test_split
In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=101)
In [8]:
from sklearn.model_selection import GridSearchCV
In [9]:
from sklearn.ensemble import RandomForestClassifier
In [10]:
# Keep in mind that oob_score won't affect model performance whether it's True or False

# Some fits will fail because the grid generates combinations with bootstrap=False & oob_score=True

# OOB estimation only works when bootstrap=True (a quick check follows this cell)

n_estimators=[64,100,128,200]
max_features= [2,3,4]
bootstrap = [True,False]
oob_score = [True,False]
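As a quick check (not part of the original grid search, and assuming X_train / y_train from the split above), fitting a single forest with the incompatible combination reproduces the error that GridSearchCV reports below as failed fits:

try:
    RandomForestClassifier(bootstrap=False, oob_score=True).fit(X_train, y_train)
except ValueError as err:
    print(err)  # "Out of bag estimation only available if bootstrap=True"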
In [11]:
param_grid = {'n_estimators':n_estimators,
             'max_features':max_features,
             'bootstrap':bootstrap,
             'oob_score':oob_score}
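If you want to avoid those expected failures altogether, one alternative (a sketch, not what this notebook runs) is to pass GridSearchCV a list of grids so that oob_score=True is only ever paired with bootstrap=True; param_grid_clean below is an illustrative name:

param_grid_clean = [
    {'n_estimators': n_estimators, 'max_features': max_features,
     'bootstrap': [True], 'oob_score': [True, False]},
    {'n_estimators': n_estimators, 'max_features': max_features,
     'bootstrap': [False], 'oob_score': [False]},
]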
In [12]:
model = RandomForestClassifier()
grid_model = GridSearchCV(model, param_grid)
In [13]:
grid_model.fit(X_train,y_train)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning: 
60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\ensemble\_forest.py", line 411, in fit
    raise ValueError("Out of bag estimation only available if bootstrap=True")
ValueError: Out of bag estimation only available if bootstrap=True

  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\model_selection\_search.py:972: UserWarning: One or more of the test scores are non-finite: [0.99571549 0.99314038 0.99399875 0.99314038 0.99399875 0.99571549
 0.99399875 0.99314038 0.99314038 0.99228201 0.99314038 0.99314038
 0.99228201 0.99314038 0.99228201 0.99399875 0.98799017 0.98799017
 0.98799017 0.98799017 0.98884854 0.98884854 0.98799017 0.98884854
        nan 0.99313305        nan 0.99313305        nan 0.99313672
        nan 0.99399508        nan 0.99141998        nan 0.99056161
        nan 0.99228201        nan 0.99227835        nan 0.97770441
        nan 0.98027218        nan 0.97941381        nan 0.97941015]
  category=UserWarning,
Out[13]:
GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False], 'max_features': [2, 3, 4],
                         'n_estimators': [64, 100, 128, 200],
                         'oob_score': [True, False]})
In [14]:
grid_model.best_params_
Out[14]:
{'bootstrap': True, 'max_features': 2, 'n_estimators': 64, 'oob_score': True}
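Because the winning combination uses bootstrap=True with oob_score=True, the refit best estimator also carries an out-of-bag accuracy estimate (a minimal sketch, assuming GridSearchCV's default refit=True):

grid_model.best_estimator_.oob_score_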
In [15]:
predict = grid_model.predict(X_test)
In [16]:
from sklearn.metrics import accuracy_score, plot_confusion_matrix
In [17]:
# to determine how many trees we really need and to observe where the error rate levels off
# (a sketch for picking that point follows this cell)

errors = []
misclassifications = []

for n in range(1,128):
    rfc = RandomForestClassifier(n_estimators=n, bootstrap=True, max_features=2)
    rfc.fit(X_train, y_train)
    preds = rfc.predict(X_test)
    err = 1 - accuracy_score(y_test, preds)  # accuracy_score expects (y_true, y_pred)
    n_missed = np.sum(preds != y_test)  # how many predictions mismatch the actual labels
    errors.append(err)
    misclassifications.append(n_missed)
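To read off where the curve flattens, one quick sketch (n_values and best_n are illustrative names) is to take the smallest forest size that reaches the minimum misclassification count:

n_values = list(range(1, 128))
best_n = n_values[int(np.argmin(misclassifications))]
print(best_n, min(misclassifications))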
In [18]:
plt.plot(range(1,128),errors)
Out[18]:
[<matplotlib.lines.Line2D at 0x244b81dce88>]
In [19]:
plt.plot(range(1,128),misclassifications)
Out[19]:
[<matplotlib.lines.Line2D at 0x244b81cc6c8>]
In [20]:
plot_confusion_matrix(grid_model,X_test,y_test)
C:\Users\Aas03\anaconda3\envs\hadeel_en\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)
Out[20]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x244b824ebc8>
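As the FutureWarning above notes, plot_confusion_matrix is removed in scikit-learn 1.2; an equivalent call using the class-method API would be:

from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(grid_model, X_test, y_test)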