In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df=pd.read_csv('../DATA/gene_expression.csv')
In [3]:
df.head()
Out[3]:
Gene One Gene Two Cancer Present
0 4.3 3.9 1
1 2.5 6.3 0
2 5.7 3.9 1
3 6.1 6.2 0
4 7.4 3.4 1
In [4]:
sns.pairplot(df,hue='Cancer Present')
Out[4]:
<seaborn.axisgrid.PairGrid at 0x1f5f0317948>

Building our KNN model¶

In [5]:
X=df.drop('Cancer Present',axis=1)
In [6]:
y=df['Cancer Present']
In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
In [8]:
# We always scale our data for KNN because it is a distance-based algorithm:
# otherwise, features with larger scales would dominate the distance metric.

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)
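To see concretely why scaling matters, here is a minimal sketch with made-up numbers (not from this dataset): without standardization, the feature with the larger range dominates the Euclidean distance.

In [ ]:
# illustrative only: two points differing by 1 unit in a small-range feature
# and by 10 units in a large-range feature
a = np.array([1.0, 500.0])
b = np.array([2.0, 510.0])
print(np.linalg.norm(a - b))        # ~10.05, driven almost entirely by the second feature

# standardizing by hand (assumed means/stds of 1.5/0.5 and 505/5) puts both
# features on the same footing
a_s = np.array([(1.0 - 1.5) / 0.5, (500.0 - 505.0) / 5.0])
b_s = np.array([(2.0 - 1.5) / 0.5, (510.0 - 505.0) / 5.0])
print(np.linalg.norm(a_s - b_s))    # ~2.83, with both features contributing equally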

Simple Case: choose the closest neighbor (K=1)¶

In [9]:
from sklearn.neighbors import KNeighborsClassifier
In [10]:
KNN=KNeighborsClassifier(n_neighbors=1)
In [11]:
KNN.fit(x_train,y_train)
Out[11]:
KNeighborsClassifier(n_neighbors=1)
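With K=1 the model effectively memorizes the training set: each prediction is the label of the single nearest (scaled) training point. A quick illustrative check using the fitted model's kneighbors method:

In [ ]:
# look up the single nearest training neighbor of the first test row
dist, idx = KNN.kneighbors(x_test[:1], n_neighbors=1)
print(dist[0][0])               # distance to the closest training point
print(y_train.iloc[idx[0][0]])  # its label, which is exactly what KNN predicts here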

Model Evaluation¶

In [36]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
In [13]:
pred=KNN.predict(x_test)
In [14]:
confusion_matrix(y_test,pred)
Out[14]:
array([[426,  44],
       [ 45, 385]], dtype=int64)
In [15]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       470
           1       0.90      0.90      0.90       430

    accuracy                           0.90       900
   macro avg       0.90      0.90      0.90       900
weighted avg       0.90      0.90      0.90       900
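As a sanity check, the class-1 precision and recall can be recomputed by hand from the confusion matrix above (rows are actual labels, columns are predictions):

In [ ]:
cm = confusion_matrix(y_test, pred)
tp, fp, fn = cm[1, 1], cm[0, 1], cm[1, 0]
print('precision:', tp / (tp + fp))  # 385 / (385 + 44) ≈ 0.90
print('recall:   ', tp / (tp + fn))  # 385 / (385 + 45) ≈ 0.90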

Choosing the Optimal K Value¶

1 - Elbow Method: plot the test error for each K (note this relies on a single train/test split, so the chosen K may not be optimal for other splits)¶

In [16]:
test_error_rates = []


for k in range(1,30):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x_train,y_train) 
    y_pred_test = knn_model.predict(x_test)
    
    # accuracy gives the fraction of correct predictions, so error = 1 - accuracy
    
    test_error = 1 - accuracy_score(y_test,y_pred_test)
    test_error_rates.append(test_error)
In [17]:
test_error_rates
Out[17]:
[0.09888888888888892,
 0.09777777777777774,
 0.07333333333333336,
 0.0755555555555556,
 0.07222222222222219,
 0.06444444444444442,
 0.06444444444444442,
 0.061111111111111116,
 0.05777777777777782,
 0.06222222222222218,
 0.061111111111111116,
 0.06000000000000005,
 0.061111111111111116,
 0.06222222222222218,
 0.05888888888888888,
 0.05777777777777782,
 0.05666666666666664,
 0.05555555555555558,
 0.05222222222222217,
 0.053333333333333344,
 0.054444444444444406,
 0.05111111111111111,
 0.054444444444444406,
 0.054444444444444406,
 0.05666666666666664,
 0.05555555555555558,
 0.05555555555555558,
 0.05777777777777782,
 0.05666666666666664]
In [18]:
plt.plot(range(1,30),test_error_rates)
plt.ylabel('Error')
plt.xlabel('K-value')
Out[18]:
Text(0.5, 0, 'K-value')
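The minimum of this curve can be located programmatically; for the list above it falls at K=22, although the curve is quite flat past K≈6:

In [ ]:
# np.argmin returns a 0-based index, so add 1 to recover the K value
best_k = int(np.argmin(test_error_rates)) + 1
best_k, test_error_rates[best_k - 1]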

2 - GridSearchCV: using a Pipeline¶

In [19]:
scaler=StandardScaler()
In [20]:
knn = KNeighborsClassifier()
In [21]:
# these step names become the keys used in the parameter-grid dictionary
operation=[('scaler',scaler),('knn',knn)]
In [22]:
from sklearn.pipeline import Pipeline 
In [23]:
pipe = Pipeline(steps=operation)
In [24]:
from sklearn.model_selection import GridSearchCV
In [25]:
k_values = list(range(1,20))

**Note: if your parameter grid is going inside a Pipeline, the parameter names need to be specified in the following manner:**

  • chosen step name + two underscores + parameter key name
  • step_name + __ + parameter_name
  • knn + __ + n_neighbors
  • knn__n_neighbors
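If in doubt, the exact keys that GridSearchCV will accept can be listed from the pipeline itself:

In [ ]:
# every tunable parameter is exposed as <step name>__<parameter name>
sorted(pipe.get_params().keys())  # includes 'knn__n_neighbors'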
In [26]:
param_grid={'knn__n_neighbors':k_values}
In [27]:
# grid search over the whole pipeline: in each CV fold the data is scaled,
# then KNN is fit with the candidate n_neighbors
# (note: x_train was already standardized above, so this scaler is refit on
# already-scaled data; fitting the pipeline on the raw features instead would
# also let raw inputs, like the new patient below, be scaled correctly)
classifier = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
In [28]:
classifier.fit(x_train,y_train)
Out[28]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                              12, 13, 14, 15, 16, 17, 18, 19]},
             scoring='accuracy')
In [29]:
classifier.best_estimator_
Out[29]:
Pipeline(steps=[('scaler', StandardScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=16))])
In [30]:
classifier.cv_results_['mean_test_score']
Out[30]:
array([0.90380952, 0.90761905, 0.92047619, 0.91285714, 0.92428571,
       0.92142857, 0.92761905, 0.9247619 , 0.9247619 , 0.92238095,
       0.92428571, 0.92571429, 0.92809524, 0.92857143, 0.92857143,
       0.93047619, 0.92857143, 0.92904762, 0.92809524])
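These mean cross-validated accuracies can be plotted against K, mirroring the elbow plot but without touching the test set:

In [ ]:
plt.plot(k_values, classifier.cv_results_['mean_test_score'])
plt.ylabel('Mean CV Accuracy')
plt.xlabel('K-value')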
In [31]:
pred=classifier.predict(x_test)
In [32]:
confusion_matrix(y_test,pred)
Out[32]:
array([[449,  21],
       [ 31, 399]], dtype=int64)
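Compared with the K=1 model, the tuned pipeline makes noticeably fewer mistakes: accuracy rises from (426 + 385) / 900 ≈ 0.90 to (449 + 399) / 900 ≈ 0.94.

In [ ]:
accuracy_score(y_test, pred)  # ≈ 0.94 with the tuned K=16, vs ≈ 0.90 at K=1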
In [33]:
# two gene measurements, in the order [Gene One, Gene Two], for an unseen patient
new_patient = [[3.8, 6.4]]
In [34]:
classifier.predict(new_patient)
Out[34]:
array([0], dtype=int64)
In [35]:
classifier.predict_proba(new_patient)
Out[35]:
array([[0.6875, 0.3125]])
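With the tuned K=16 and the default uniform weights, these probabilities are simply neighbor vote fractions: 11 of the new patient's 16 nearest training points belong to class 0.

In [ ]:
11 / 16, 5 / 16  # = (0.6875, 0.3125), matching predict_proba above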
In [38]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0; the class method
# ConfusionMatrixDisplay.from_estimator replaces it
ConfusionMatrixDisplay.from_estimator(classifier, x_test, y_test)
Out[38]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f5f02ca608>
In [ ]: