
Module 2: Supervised Learning

Part 2.1: KNN - K-Nearest Neighbors

KNN (k-nearest neighbors) is a supervised learning algorithm: to predict the target of a new point, it looks at the k closest points in the training set and averages their values (regression) or takes a majority vote on their labels (classification).
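Before using scikit-learn, here is a minimal sketch of that idea in plain NumPy (the function name knn_predict and the toy data are only for illustration): compute the distance from the query point to every training point, keep the k closest, and take a majority vote on their labels.

import numpy as np

def knn_predict(X_train, y_train, x_query, k=3):
    """Predict the label of x_query by majority vote among its k nearest neighbors."""
    # Euclidean distance from the query to every training point
    distances = np.linalg.norm(X_train - x_query, axis=1)
    # Indices of the k closest training points
    nearest = np.argsort(distances)[:k]
    # Majority vote among their labels
    labels, counts = np.unique(y_train[nearest], return_counts=True)
    return labels[np.argmax(counts)]

# Toy example: two features, two classes
X_train = np.array([[1.0, 1.0], [1.2, 0.8], [5.0, 5.0], [5.2, 4.8]])
y_train = np.array(["A", "A", "B", "B"])
print(knn_predict(X_train, y_train, np.array([1.1, 0.9])))  # -> "A"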

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

# Load the iris dataset (150 flowers, 4 measurements, 3 species)
iris = pd.read_csv("data/iris.csv")
iris.head()
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
# Visualize the two petal features, colored by species
plt.figure(figsize=(10, 8))
sns.scatterplot(x="petal_width", y="petal_length", hue="species", data=iris)
plt.show()
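It can also help to check how the flowers are distributed across species; if this is the classic iris data, each species should appear 50 times.

# Number of flowers per species
iris["species"].value_counts()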

Regression

For regression, we predict the petal length of a flower from its petal width.

# Single feature (petal width) as a 2D array; target is petal length
X = iris.petal_width.values.reshape(-1, 1)
y = iris.petal_length.values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=.3)
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')
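To see what the regressor actually does, we can look up the 5 nearest training points of a query ourselves with kneighbors and check that the prediction is just the average of their petal lengths. This is only a quick sanity check; the query value 1.0 is arbitrary.

query = [[1.0]]  # an arbitrary petal width
distances, indices = knn.kneighbors(query)   # 5 nearest training points
print(y_train[indices[0]].mean())            # average of their petal lengths
print(knn.predict(query))                    # same value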

Evaluation

from sklearn.metrics import r2_score
y_pred = knn.predict(X_test)
score = r2_score(y_test, y_pred)
score
0.9377572372679783
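The R² score compares the model's squared error to that of a naive model that always predicts the mean of y_test: 1 is perfect, 0 is no better than predicting the mean. It can be recomputed by hand to make that concrete.

import numpy as np

# R² = 1 - (residual sum of squares) / (total sum of squares)
ss_res = np.sum((y_test - y_pred) ** 2)
ss_tot = np.sum((y_test - y_test.mean()) ** 2)
print(1 - ss_res / ss_tot)  # matches r2_score(y_test, y_pred)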
# Predicted vs. true values: points close to the red identity line are well predicted
plt.scatter(y_test, y_pred)
plt.plot(y_test, y_test, c="red")
[<matplotlib.lines.Line2D at 0x7ff07e363090>]

# Compare train and test R² scores for several values of k
k_values = [2, 3, 4, 5, 6, 7, 8]

train_scores = []
test_scores = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

plt.plot(k_values, train_scores)
plt.plot(k_values, test_scores)
[<matplotlib.lines.Line2D at 0x7ff07e337e50>]
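A single train/test split makes these curves noisy, so one more robust way to pick k is cross-validation, for example with GridSearchCV over the same range of k as above (a sketch, reusing X_train and y_train):

from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation over the same values of k
grid = GridSearchCV(KNeighborsRegressor(),
                    param_grid={"n_neighbors": [2, 3, 4, 5, 6, 7, 8]},
                    cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)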


Classification

For classification, we now predict the species of each flower from its petal width and petal length. Since the species are strings, we first encode them as integers.

X = iris[["petal_width", "petal_length"]]
species = iris["species"]
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(species)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=.3)
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
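Because the classifier was trained on the encoded labels, it predicts integers; the encoder keeps the mapping back to species names (classes are sorted alphabetically, so setosa=0, versicolor=1, virginica=2).

# Mapping between encoded integers and species names
print(encoder.classes_)                      # ['setosa' 'versicolor' 'virginica']
print(encoder.inverse_transform([0, 1, 2]))  # back to the names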

Evaluation

from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)
score
0.9333333333333333
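Accuracy alone does not show which species get confused with each other; a confusion matrix and a per-class report give that detail. The rows and columns follow the encoder's order: setosa, versicolor, virginica.

from sklearn.metrics import confusion_matrix, classification_report

# Rows = true class, columns = predicted class
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=[0, 1, 2],
                            target_names=encoder.classes_))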
# Same sweep over k, this time for the classifier (accuracy score)
k_values = [2, 3, 4, 5, 6, 7, 8]

train_scores = []
test_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

plt.plot(k_values, train_scores)
plt.plot(k_values, test_scores)
[<matplotlib.lines.Line2D at 0x7ff07e2beb90>]

Prediction

For a new flower (here petal_width=1.5 and petal_length=5.0), the classifier returns a predicted species together with the class probabilities. With k=5 neighbors, each probability is the fraction of the 5 nearest neighbors belonging to that class (here 4 of the 5 are versicolor).

# Decode the integer prediction back into a species name
encoder.inverse_transform(clf.predict([[1.5, 5.0]])), clf.predict_proba([[1.5, 5.0]])
(array(['versicolor'], dtype=object), array([[0. , 0.8, 0.2]]))
sns.pairplot(iris, hue='species')
<seaborn.axisgrid.PairGrid at 0x7ff07d83c610>