Exercice 0¶

In [1]:
import sklearn.datasets as skd
import sklearn.neighbors as nn
from sklearn import model_selection as modsel
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Load the iris dataset through scikit-learn's dataset helpers; the cells below
# read its "data" (features) and "target" (labels) entries.
iris = skd.load_iris()
In [3]:
def get_scores(xtrain, xtest, ytrain, ytest, krange):
    """Return the k-NN test accuracy for every neighbour count in ``krange``.

    For each ``k`` a fresh ``KNeighborsClassifier(n_neighbors=k)`` is fitted on
    ``(xtrain, ytrain)`` and used to predict ``xtest``; the score is the
    fraction of predictions equal to ``ytest``.

    Returns:
        A list of scores, in the same order as ``krange``.
    """
    def accuracy_for(k):
        # fit() hands back the fitted estimator, so the calls can be chained.
        classifier = nn.KNeighborsClassifier(n_neighbors=k).fit(xtrain, ytrain)
        return np.mean(classifier.predict(xtest) == ytest)

    return [accuracy_for(k) for k in krange]
In [4]:
# Baseline: the classifier is evaluated on exactly the samples it was fitted
# on, so these are training accuracies, not generalisation estimates.
krange = range(1, 150)
scores = get_scores(
    xtrain=iris["data"],
    xtest=iris["data"],
    ytrain=iris["target"],
    ytest=iris["target"],
    krange=krange,
)
plt.plot(krange, scores)
Out[4]:
[<matplotlib.lines.Line2D at 0x7f33cab0b810>]
No description has been provided for this image

La chute observée à $k=100$ correspond au passage de 100% de bien classés à 0% dans le cluster isolé des deux autres.

In [5]:
# With test_size=0.3 only 70% of the samples are available for fitting, and a
# k-NN query cannot ask for more neighbours than there are training points,
# hence the smaller upper bound on k than in the previous cell.
krange = range(1, 100)
# Fixed random_state: the train/test split, and therefore the plotted curve,
# is identical on every "Restart & Run All".
X_train, X_test, y_train, y_test = modsel.train_test_split(
    iris["data"], iris["target"], test_size=0.3, random_state=0
)
scores = get_scores(X_train, X_test, y_train, y_test, krange)
plt.plot(krange, scores)
Out[5]:
[<matplotlib.lines.Line2D at 0x7f33c60bab10>]
No description has been provided for this image
In [6]:
# Grid-search n_neighbors with 10-fold stratified cross-validation, scoring
# each candidate by accuracy.
krange = range(1, 130)
# shuffle=True assigns samples to folds at random; pinning random_state keeps
# the folds (and the resulting scores) the same across notebook runs.
splits = modsel.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
param_grid = {'n_neighbors': krange}
scv = modsel.GridSearchCV(nn.KNeighborsClassifier(), param_grid, cv=splits, scoring="accuracy")
c = scv.fit(iris["data"], iris["target"])
In [7]:
# Mean cross-validated test score of the grid search, one point per
# n_neighbors candidate in krange.
plt.plot(krange, c.cv_results_['mean_test_score'])
Out[7]:
[<matplotlib.lines.Line2D at 0x7f33c60ac150>]
No description has been provided for this image

Exercice 1¶

In [8]:
# Experiment: for each number of informative features, generate a synthetic
# regression problem (n samples, d features) and grid-search the number of
# neighbours of a k-NN regressor by 10-fold CV on the mean absolute error.
n, d = 500, 100
krange = [3, 7, 11, 15, 19, 23, 27]
irange = range(1, 100)
splits = modsel.KFold(n_splits=10)
param_grid = {'n_neighbors': krange}
scv = modsel.GridSearchCV(nn.KNeighborsRegressor(), param_grid, cv=splits, scoring="neg_mean_absolute_error")
res = []
for infor in irange:
    # Seed the generator (one distinct seed per setting) so that every run of
    # the notebook draws the same datasets and reproduces the same curves.
    X, y = skd.make_regression(n_samples=n, n_features=d, n_informative=infor, random_state=infor)
    c = scv.fit(X, y)
    # Keep the full results dict of this fit; the scores are extracted later.
    res.append(c.cv_results_)
In [9]:
# One row per entry of irange, one column per candidate n_neighbors. The shape
# follows from res itself instead of a hard-coded (99, 7), so the cell keeps
# working if irange or krange are changed. The scorer is the *negated* MAE,
# hence the sign flip to recover a positive error.
scores = -np.array([cv_result['mean_test_score'] for cv_result in res])
plt.xlabel("Number of informative features")
plt.ylabel("Cross-validated mean absolute error")
# Use irange as x so the axis shows the actual n_informative values rather
# than the row index (which is off by one). One line is drawn per column.
plt.plot(irange, scores)
Out[9]:
[<matplotlib.lines.Line2D at 0x7f33bff0ab50>,
 <matplotlib.lines.Line2D at 0x7f33bff38c50>,
 <matplotlib.lines.Line2D at 0x7f33bff38ed0>,
 <matplotlib.lines.Line2D at 0x7f33c6049250>,
 <matplotlib.lines.Line2D at 0x7f33bff394d0>,
 <matplotlib.lines.Line2D at 0x7f33bff39890>,
 <matplotlib.lines.Line2D at 0x7f33bff39c10>]
No description has been provided for this image

Clairement, l'erreur croît avec le nombre de variables informatives. Ce n'est pas vraiment intuitif ; je n'ai pas d'explication pour l'instant mais compte bien en trouver une :)