In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df=pd.read_csv('../DATA/gene_expression.csv')
In [3]:
df.head()
Out[3]:
Gene One Gene Two Cancer Present
0 4.3 3.9 1
1 2.5 6.3 0
2 5.7 3.9 1
3 6.1 6.2 0
4 7.4 3.4 1
In [4]:
sns.pairplot(df,hue='Cancer Present')
Out[4]:
<seaborn.axisgrid.PairGrid at 0x1f5f0317948>

Building our KNN model¶

In [5]:
X=df.drop('Cancer Present',axis=1)
In [6]:
y=df['Cancer Present']
In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
In [8]:
# We always scale our data for KNN because it is a distance-based algorithm:
# otherwise, features with larger scales would dominate the distance metric.

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)
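To see concretely why scaling matters, here is a minimal sketch with made-up numbers (not from this dataset): without standardization, the feature with the larger range dominates the Euclidean distance.

In [ ]:
# illustrative only: two points differing by 1 unit in a small-range feature
# and by 10 units in a large-range feature
a = np.array([1.0, 500.0])
b = np.array([2.0, 510.0])
print(np.linalg.norm(a - b))        # ~10.05, driven almost entirely by the second feature

# standardizing by hand (assumed means/stds of 1.5/0.5 and 505/5) puts both
# features on the same footing
a_s = np.array([(1.0 - 1.5) / 0.5, (500.0 - 505.0) / 5.0])
b_s = np.array([(2.0 - 1.5) / 0.5, (510.0 - 505.0) / 5.0])
print(np.linalg.norm(a_s - b_s))    # ~2.83, with both features contributing equally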

Simple Case: choose the closest neighbor (K=1)¶

In [9]:
from sklearn.neighbors import KNeighborsClassifier
In [10]:
KNN=KNeighborsClassifier(n_neighbors=1)
In [11]:
KNN.fit(x_train,y_train)
Out[11]:
KNeighborsClassifier(n_neighbors=1)
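With K=1 the model effectively memorizes the training set: each prediction is the label of the single nearest (scaled) training point. A quick illustrative check using the fitted model's kneighbors method:

In [ ]:
# look up the single nearest training neighbor of the first test row
dist, idx = KNN.kneighbors(x_test[:1], n_neighbors=1)
print(dist[0][0])               # distance to the closest training point
print(y_train.iloc[idx[0][0]])  # its label, which is exactly what KNN predicts here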

Model Evaluation¶

In [36]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
In [13]:
pred=KNN.predict(x_test)
In [14]:
confusion_matrix(y_test,pred)
Out[14]:
array([[426,  44],
       [ 45, 385]], dtype=int64)
In [15]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       470
           1       0.90      0.90      0.90       430

    accuracy                           0.90       900
   macro avg       0.90      0.90      0.90       900
weighted avg       0.90      0.90      0.90       900
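As a sanity check, the class-1 precision and recall can be recomputed by hand from the confusion matrix above (rows are actual labels, columns are predictions):

In [ ]:
cm = confusion_matrix(y_test, pred)
tp, fp, fn = cm[1, 1], cm[0, 1], cm[1, 0]
print('precision:', tp / (tp + fp))  # 385 / (385 + 44) ≈ 0.90
print('recall:   ', tp / (tp + fn))  # 385 / (385 + 45) ≈ 0.90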

Choosing the Optimal K Value¶

1 - Elbow Method: plot the test error for each K (note this relies on a single train/test split, so the chosen K may not be optimal for other splits)¶

In [16]:
test_error_rates = []


for k in range(1,30):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x_train,y_train) 
    y_pred_test = knn_model.predict(x_test)
    
    # accuracy gives the fraction of correct predictions, so error = 1 - accuracy
    
    test_error = 1 - accuracy_score(y_test,y_pred_test)
    test_error_rates.append(test_error)
In [17]:
test_error_rates
Out[17]:
[0.09888888888888892,
 0.09777777777777774,
 0.07333333333333336,
 0.0755555555555556,
 0.07222222222222219,
 0.06444444444444442,
 0.06444444444444442,
 0.061111111111111116,
 0.05777777777777782,
 0.06222222222222218,
 0.061111111111111116,
 0.06000000000000005,
 0.061111111111111116,
 0.06222222222222218,
 0.05888888888888888,
 0.05777777777777782,
 0.05666666666666664,
 0.05555555555555558,
 0.05222222222222217,
 0.053333333333333344,
 0.054444444444444406,
 0.05111111111111111,
 0.054444444444444406,
 0.054444444444444406,
 0.05666666666666664,
 0.05555555555555558,
 0.05555555555555558,
 0.05777777777777782,
 0.05666666666666664]
In [18]:
plt.plot(range(1,30),test_error_rates)
plt.ylabel('Error')
plt.xlabel('K-value')
Out[18]:
Text(0.5, 0, 'K-value')
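The minimum of this curve can be located programmatically; for the list above it falls at K=22, although the curve is quite flat past K≈6:

In [ ]:
# np.argmin returns a 0-based index, so add 1 to recover the K value
best_k = int(np.argmin(test_error_rates)) + 1
best_k, test_error_rates[best_k - 1]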

2 - GridSearchCV: using a Pipeline¶

In [19]:
scaler=StandardScaler()
In [20]:
knn = KNeighborsClassifier()
In [21]:
# these step names become the keys used in the parameter-grid dictionary
operation=[('scaler',scaler),('knn',knn)]
In [22]:
from sklearn.pipeline import Pipeline 
In [23]:
pipe = Pipeline(steps=operation)
In [24]:
from sklearn.model_selection import GridSearchCV
In [25]:
k_values = list(range(1,20))

**Note: if your parameter grid is going inside a Pipeline, the parameter names need to be specified in the following manner:**

  • chosen step name + two underscores + parameter key name
  • step_name + __ + parameter_name
  • knn + __ + n_neighbors
  • knn__n_neighbors
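If in doubt, the exact keys that GridSearchCV will accept can be listed from the pipeline itself:

In [ ]:
# every tunable parameter is exposed as <step name>__<parameter name>
sorted(pipe.get_params().keys())  # includes 'knn__n_neighbors'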
In [26]:
param_grid={'knn__n_neighbors':k_values}
In [27]:
# grid search over the whole pipeline: in each CV fold the data is scaled,
# then KNN is fit with the candidate n_neighbors
# (note: x_train was already standardized above, so this scaler is refit on
# already-scaled data; fitting the pipeline on the raw features instead would
# also let raw inputs, like the new patient below, be scaled correctly)
classifier = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
In [28]:
classifier.fit(x_train,y_train)
Out[28]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                              12, 13, 14, 15, 16, 17, 18, 19]},
             scoring='accuracy')
In [29]:
classifier.best_estimator_
Out[29]:
Pipeline(steps=[('scaler', StandardScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=16))])
In [30]:
classifier.cv_results_['mean_test_score']
Out[30]:
array([0.90380952, 0.90761905, 0.92047619, 0.91285714, 0.92428571,
       0.92142857, 0.92761905, 0.9247619 , 0.9247619 , 0.92238095,
       0.92428571, 0.92571429, 0.92809524, 0.92857143, 0.92857143,
       0.93047619, 0.92857143, 0.92904762, 0.92809524])
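These mean cross-validated accuracies can be plotted against K, mirroring the elbow plot but without touching the test set:

In [ ]:
plt.plot(k_values, classifier.cv_results_['mean_test_score'])
plt.ylabel('Mean CV Accuracy')
plt.xlabel('K-value')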
In [31]:
pred=classifier.predict(x_test)
In [32]:
confusion_matrix(y_test,pred)
Out[32]:
array([[449,  21],
       [ 31, 399]], dtype=int64)
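Compared with the K=1 model, the tuned pipeline makes noticeably fewer mistakes: accuracy rises from (426 + 385) / 900 ≈ 0.90 to (449 + 399) / 900 ≈ 0.94.

In [ ]:
accuracy_score(y_test, pred)  # ≈ 0.94 with the tuned K=16, vs ≈ 0.90 at K=1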
In [33]:
# two gene measurements, in the order [Gene One, Gene Two], for an unseen patient
new_patient = [[3.8, 6.4]]
In [34]:
classifier.predict(new_patient)
Out[34]:
array([0], dtype=int64)
In [35]:
classifier.predict_proba(new_patient)
Out[35]:
array([[0.6875, 0.3125]])
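With the tuned K=16 and the default uniform weights, these probabilities are simply neighbor vote fractions: 11 of the new patient's 16 nearest training points belong to class 0.

In [ ]:
11 / 16, 5 / 16  # = (0.6875, 0.3125), matching predict_proba above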
In [38]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0; the class method
# ConfusionMatrixDisplay.from_estimator replaces it
ConfusionMatrixDisplay.from_estimator(classifier, x_test, y_test)
Out[38]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f5f02ca608>
In [ ]: