import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df=pd.read_csv('../DATA/gene_expression.csv')
df.head()
| | Gene One | Gene Two | Cancer Present |
|---|---|---|---|
| 0 | 4.3 | 3.9 | 1 |
| 1 | 2.5 | 6.3 | 0 |
| 2 | 5.7 | 3.9 | 1 |
| 3 | 6.1 | 6.2 | 0 |
| 4 | 7.4 | 3.4 | 1 |
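As a quick sanity check before modeling (a minimal sketch, assuming the `df` loaded above), it can be worth confirming the two classes are reasonably balanced, since a strong imbalance would make plain accuracy misleading:
# count how many samples fall in each class of the target column
df['Cancer Present'].value_counts()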
sns.pairplot(df,hue='Cancer Present')
<seaborn.axisgrid.PairGrid at 0x1f5f0317948>
X=df.drop('Cancer Present',axis=1)
y=df['Cancer Present']
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# we always scale our data for KNN because it is a distance-based algorithm:
# features on a larger scale would otherwise dominate the distance calculation
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)
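To make the comment about scaling concrete, here is a minimal sketch (with made-up numbers, not from this dataset) of how a feature on a much larger scale dominates the Euclidean distance that KNN relies on:
# hypothetical points: first feature ranges in the thousands, second in single digits
a = np.array([1000.0, 2.0])
b = np.array([1200.0, 9.0])
np.linalg.norm(a - b)   # ~200.1 -- the distance is driven almost entirely by the first feature
# after StandardScaler both features contribute on a comparable scale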
from sklearn.neighbors import KNeighborsClassifier
KNN=KNeighborsClassifier(n_neighbors=1)
KNN.fit(x_train,y_train)
KNeighborsClassifier(n_neighbors=1)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,ConfusionMatrixDisplay
pred=KNN.predict(x_test)
confusion_matrix(y_test,pred)
array([[426,  44],
       [ 45, 385]], dtype=int64)
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       470
           1       0.90      0.90      0.90       430

    accuracy                           0.90       900
   macro avg       0.90      0.90      0.90       900
weighted avg       0.90      0.90      0.90       900
test_error_rates = []
for k in range(1,30):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x_train,y_train)
    y_pred_test = knn_model.predict(x_test)
    # accuracy_score gives the fraction of correct predictions, so 1 - accuracy is the test error rate
    test_error = 1 - accuracy_score(y_test,y_pred_test)
    test_error_rates.append(test_error)
test_error_rates
[0.09888888888888892, 0.09777777777777774, 0.07333333333333336, 0.0755555555555556, 0.07222222222222219, 0.06444444444444442, 0.06444444444444442, 0.061111111111111116, 0.05777777777777782, 0.06222222222222218, 0.061111111111111116, 0.06000000000000005, 0.061111111111111116, 0.06222222222222218, 0.05888888888888888, 0.05777777777777782, 0.05666666666666664, 0.05555555555555558, 0.05222222222222217, 0.053333333333333344, 0.054444444444444406, 0.05111111111111111, 0.054444444444444406, 0.054444444444444406, 0.05666666666666664, 0.05555555555555558, 0.05555555555555558, 0.05777777777777782, 0.05666666666666664]
plt.plot(range(1,30),test_error_rates)
plt.ylabel('Error')
plt.xlabel('K-value')
Text(0.5, 0, 'K-value')
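Rather than reading the elbow off the plot by eye, a minimal sketch for pulling the k with the lowest test error out of the list above (np.argmin returns a 0-based index, and k started at 1):
best_k = np.argmin(test_error_rates) + 1
best_k, test_error_rates[best_k - 1]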
scaler=StandardScaler()
knn = KNeighborsClassifier()
# these step names become the prefixes used as keys in the parameter grid dictionary
operation=[('scaler',scaler),('knn',knn)]
from sklearn.pipeline import Pipeline
pip=Pipeline(steps=operation)
from sklearn.model_selection import GridSearchCV
k_values = list(range(1,20))
**Note:** if the parameter grid is going inside a Pipeline, each parameter name must be prefixed with its step name followed by two underscores, i.e. `'<step name>__<parameter name>'`:
param_grid={'knn__n_neighbors':k_values}
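If you are unsure which `step__parameter` names the pipeline exposes, they can be listed directly; a minimal sketch using the `pip` pipeline defined above:
# valid grid keys include entries such as 'knn__n_neighbors' and 'scaler__with_mean'
list(pip.get_params().keys())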
# GridSearchCV wraps the pipeline (scaler + KNN) and searches over the n_neighbors values with 5-fold cross-validation
classifier=GridSearchCV(pip,param_grid,cv=5,scoring='accuracy')
classifier.fit(x_train,y_train)
GridSearchCV(cv=5, estimator=Pipeline(steps=[('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]), param_grid={'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]}, scoring='accuracy')
classifier.best_estimator_
Pipeline(steps=[('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_neighbors=16))])
classifier.cv_results_['mean_test_score']
array([0.90380952, 0.90761905, 0.92047619, 0.91285714, 0.92428571, 0.92142857, 0.92761905, 0.9247619 , 0.9247619 , 0.92238095, 0.92428571, 0.92571429, 0.92809524, 0.92857143, 0.92857143, 0.93047619, 0.92857143, 0.92904762, 0.92809524])
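Each entry in the array above is the mean cross-validated accuracy for one candidate k, in the same order as `k_values`; a minimal sketch to plot them and confirm the chosen value, assuming the objects defined above:
plt.plot(k_values, classifier.cv_results_['mean_test_score'])
plt.xlabel('K-value')
plt.ylabel('Mean CV accuracy')
classifier.best_params_   # e.g. {'knn__n_neighbors': 16}, matching best_estimator_ above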
pred=classifier.predict(x_test)
confusion_matrix(y_test,pred)
array([[449,  21],
       [ 31, 399]], dtype=int64)
new_patient=[[3.8,6.4]]
classifier.predict(new_patient)
array([0], dtype=int64)
classifier.predict_proba(new_patient)
array([[0.6875, 0.3125]])
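One caveat: `x_train` was already standardized before being passed to `classifier.fit`, even though the pipeline contains its own StandardScaler, so the scaler inside the pipeline was fitted on already-scaled data and `new_patient` (given in raw units) is not transformed onto the same scale as the training points. A minimal sketch of the more usual pattern, re-splitting the raw features and letting the pipeline do all the scaling (variable names here are illustrative, not from the original notebook):
# split the raw, unscaled features; the pipeline's StandardScaler handles scaling internally
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.3, random_state=42)

grid_raw = GridSearchCV(Pipeline(steps=operation), param_grid, cv=5, scoring='accuracy')
grid_raw.fit(X_train_raw, y_train_raw)

# raw-unit inputs are now standardized by the fitted pipeline before prediction
grid_raw.predict(pd.DataFrame(new_patient, columns=['Gene One', 'Gene Two']))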
ConfusionMatrixDisplay.from_estimator(classifier,x_test,y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f5f02ca608>