1.1 Example: Polynomial Curve Fitting¶
import os
import functools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
if "OUTPUT_RETINA_IMAGES" in os.environ:
%config InlineBackend.figure_format = "retina"
plt.rcParams["axes.spines.right"] = False
plt.rcParams["axes.spines.top"] = False
We generate a dataset using the \(\sin(2\pi x)\) function, but also add some observation noise to reflect the real world!
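In other words, each training target below is the true function value plus Gaussian noise whose standard deviation is set by SCALE:

\[ y_n = \sin(2\pi x_n) + \epsilon_n, \qquad \epsilon_n \sim \mathcal{N}(0,\, \mathrm{SCALE}^2) \]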
N = 10
SEED = 1234
SCALE = 0.25
np.random.seed(SEED)
def true_fun(x):
    return np.sin(2 * np.pi * x)

def generate_noisy_data(x, scale=SCALE):
    y = true_fun(x) + np.random.normal(scale=scale, size=x.shape)
    return y
x_plot = np.arange(0, 1.01, 0.01)
y_plot = true_fun(x_plot)
# points with noise, will act as train data
x_train = np.linspace(0, 1, N)
y_train = generate_noisy_data(x_train)
plt.plot(x_plot, y_plot, color='lime', label=r"$\sin(2\pi x)$")
plt.scatter(x_train, y_train, marker='o', color='blue', label="train")
plt.hlines(y=y_train[3], xmin=-2, xmax=x_train[3], linewidth=1, linestyle='--', color = 'k')
plt.vlines(x=x_train[3], ymin=-2, ymax=y_train[3], linewidth=1, linestyle='--', color = 'k')
plt.text(-0.1, y_train[3], "$y_{train}$", fontsize=15)
plt.text(x_train[3] - 0.01, -1.7, "$x_{train}$", fontsize=15)
plt.yticks( [-1.5, 0.0, 1.5] )
plt.xticks( [0.0, 0.5, 1.0] )
plt.xlim(-0.05, 1.05)
plt.ylim(-1.5, 1.5)
plt.legend();
We will now use different regression models to determine the weights. These models differ only in the degree of the polynomial, so as a first step let's transform our features (\(x_{train}\)) into polynomial features.
def transform_features(X, m):
    """Create polynomial features of the specified degree."""
    return PolynomialFeatures(degree=m).fit_transform(X.reshape(-1, 1))
# examples of creating the polynomial features
features_m_0 = transform_features(x_train, m=0)
features_m_1 = transform_features(x_train, m=1)
features_m_3 = transform_features(x_train, m=3)
features_m_9 = transform_features(x_train, m=9)
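# Illustrative check (not from the original text): for degree m, PolynomialFeatures
# builds the design matrix with columns [1, x, x^2, ..., x^m], so with N=10 training
# points and m=3 we expect a (10, 4) array whose first row (x_0 = 0) is [1, 0, 0, 0].
print(features_m_3.shape)   # (10, 4)
print(features_m_3[0])      # [1. 0. 0. 0.]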
def multiple_regression_fit(features, y_train):
    A_t = features.T @ features
    weight_vector = np.linalg.solve(A_t, features.T @ y_train)
    return weight_vector

def multiple_regression_predict(features, weight_vector):
    return np.dot(features, weight_vector)
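# The fit above solves the normal equations (Phi^T Phi) w = Phi^T y,
# i.e. w* = (Phi^T Phi)^{-1} Phi^T y, where Phi is the polynomial design matrix.
# Illustrative cross-check (an assumption, not part of the original text):
# np.linalg.lstsq solves the same least-squares problem and should agree
# with the normal-equations solution for a well-conditioned case such as m=3.
w_lstsq = np.linalg.lstsq(features_m_3, y_train, rcond=None)[0]
print(np.allclose(w_lstsq, multiple_regression_fit(features_m_3, y_train)))  # expect True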
# we will invoke the above fit & predict methods in a loop for all
# the polynomial degrees
# test data
# the x values we used to draw our sin function will act as the test data
def regress(m, x, y):
    poly_train_features = transform_features(x, m=m)
    return multiple_regression_fit(poly_train_features, y)

def predict(m, x, weight_vector):
    poly_test_features = transform_features(x, m=m)
    return multiple_regression_predict(poly_test_features, weight_vector)

def regress_and_predict(m, x_train, y_train, x_test):
    weight_vector = regress(m, x_train, y_train)
    return predict(m, x_test, weight_vector)
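# Quick usage example (illustrative): fit a cubic on the training data and
# predict on the dense grid used to draw the true function.
y_hat_m3 = regress_and_predict(m=3, x_train=x_train, y_train=y_train, x_test=x_plot)
print(y_hat_m3.shape)   # (101,), one prediction per point of x_plot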
fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
plt.xlim(-0.05, 1.05)
plt.ylim(-1.5, 1.5)
def plot(m, x, y_predict, ax):
    # true function
    ax.plot(x_plot, y_plot, color='lime', label=r"$\sin(2\pi x)$")
    # training data
    ax.scatter(x_train, y_train, marker='o', color='blue', label="Train")
    # show the polynomial degree
    ax.text(0.65, 0.8, f"M={m}")
    # finally the predictions
    ax.plot(x, y_predict, color='red', label="Test")
x_test = np.linspace(0, 1, 100)
y_test = generate_noisy_data(x_test)
partial_rp = functools.partial(regress_and_predict,
                               x_train=x_train,
                               y_train=y_train,
                               x_test=x_test)
results = list(map(partial_rp, [0,1,3,9]))
plot(m=0, x=x_test, y_predict=results[0], ax=ax[0][0])
plot(m=1, x=x_test, y_predict=results[1], ax=ax[0][1])
plot(m=3, x=x_test, y_predict=results[2], ax=ax[1][0])
plot(m=9, x=x_test, y_predict=results[3], ax=ax[1][1])
plt.legend(bbox_to_anchor=(1.05, 1.05));
Notes -

- The red curve shows the fitted polynomial.
- M=0 fails to capture the sin curve.
- M=1 also fails.
- M=3 re-creates a curve close to the original function.
- M=9 produces a curve that passes through every training point but fails to approximate \(\sin(2\pi x)\) and is very wiggly. This is overfitting.
Evaluating residual value of \(E(w^*)\)¶
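The rmse function below computes the root-mean-square error, which is related to the residual sum-of-squares error \(E(\mathbf{w}^*) = \frac{1}{2}\sum_{n}\bigl(y(x_n, \mathbf{w}^*) - y_n\bigr)^2\) by \(E_{RMS} = \sqrt{2E(\mathbf{w}^*)/N}\), i.e.

\[ E_{RMS} = \sqrt{\frac{1}{N}\sum_{n=1}^{N}\bigl(y(x_n, \mathbf{w}^*) - y_n\bigr)^2} \]

This lets us compare errors measured on datasets of different sizes on the same scale.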
def rmse(a, b):
    return np.sqrt(np.mean(np.square(a - b)))
# try polynomials of degree 0 to 9
M = range(0, 10)
weight_vectors = []
for m in M:
    w = regress(m=m, x=x_train, y=y_train)
    weight_vectors.append(w)
def compute_errors(x, y, m, weight_vector):
    y_predict = predict(m=m, x=x, weight_vector=weight_vector)
    return rmse(y_predict, y)

def accumulate_errors(x, y):
    m_w = zip(M, weight_vectors)
    errors = []
    for (m, w) in m_w:
        errors.append(compute_errors(x, y, m, w))
    return errors
x_test = np.arange(0, 1.01, 0.01)
y_test = generate_noisy_data(x_test)
training_errors = accumulate_errors(x_train, y_train)
test_errors = accumulate_errors(x_test, y_test)
plt.plot(training_errors, 'o-', mfc="none", mec="b", ms=10, c="b", label="Training")
plt.plot(test_errors, 'o-', mfc="none", mec="r", ms=10, c="r", label="Test")
plt.legend()
plt.yticks( [0,0.5,1] )
plt.xticks( [0,3,6,9] )
plt.xlabel("degree")
plt.ylabel("RMSE");
Notes -

- For M=9, the training-set error goes to zero while the test-set error shoots up.
- A polynomial of higher degree contains all lower-degree polynomials as special cases, so why this behaviour?
Inspecting the weight vectors of various polynomials¶
weight_vector_m_0 = regress(m=0, x=x_train, y=y_train)
weight_vector_m_1 = regress(m=1, x=x_train, y=y_train)
weight_vector_m_3 = regress(m=3, x=x_train, y=y_train)
weight_vector_m_9 = regress(m=9, x=x_train, y=y_train)
# making them the same dimension so that
# we can display them in a pandas DataFrame
def pad_weight_vectors(weight_vector):
    while len(weight_vector) <= 9:
        weight_vector = np.append(weight_vector, 0.)
    return weight_vector
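# Illustrative alternative (not from the original text): numpy can do the same
# padding in one call, appending trailing zeros up to a fixed length of 10.
padded_m_3 = np.pad(weight_vector_m_3, (0, 10 - len(weight_vector_m_3)), mode='constant')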
weight_vector_m_0 = pad_weight_vectors(weight_vector_m_0)
weight_vector_m_1 = pad_weight_vectors(weight_vector_m_1)
weight_vector_m_3 = pad_weight_vectors(weight_vector_m_3)
weight_vector_m_9 = pad_weight_vectors(weight_vector_m_9)
df = pd.DataFrame([weight_vector_m_0,weight_vector_m_1,weight_vector_m_3,weight_vector_m_9]).transpose()
df.columns = ['M=0', 'M=1', 'M=3', 'M=9']
df
|   | M=0 | M=1 | M=3 | M=9 |
|---|---|---|---|---|
| 0 | -0.035921 | 0.783019 | 0.011334 | 0.117859 |
| 1 | 0.000000 | -1.637881 | 9.292162 | -32.239388 |
| 2 | 0.000000 | 0.000000 | -26.789442 | 552.759807 |
| 3 | 0.000000 | 0.000000 | 17.037287 | -2730.224980 |
| 4 | 0.000000 | 0.000000 | 0.000000 | 4770.532869 |
| 5 | 0.000000 | 0.000000 | 0.000000 | 2010.207069 |
| 6 | 0.000000 | 0.000000 | 0.000000 | -19325.059542 |
| 7 | 0.000000 | 0.000000 | 0.000000 | 28349.068583 |
| 8 | 0.000000 | 0.000000 | 0.000000 | -17838.530474 |
| 9 | 0.000000 | 0.000000 | 0.000000 | 4242.807525 |
Notes -

- As M increases, the magnitude of the weights becomes larger.
- In particular, for M=9 the weights take on very large positive and negative values so that the corresponding polynomial matches each data point exactly.
- Intuitively, the more flexible polynomials with larger values of M are becoming increasingly tuned to the random noise on the target values.
Increasing the dataset size reduces overfitting¶
Here we keep M=9, which previously resulted in overfitting, but increase the training dataset size. We will try N=15 and N=100.
def analyze_overfit(N, m=9):
    x_train = np.linspace(0, 1, N)
    y_train = generate_noisy_data(x_train)
    poly_train_features = transform_features(x_train, m=m)
    weight_vector = multiple_regression_fit(poly_train_features, y_train)
    return x_train, y_train, weight_vector
def regress_and_plot(x):
    fig, ax = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
    plt.xlim(-0.05, 1.05)
    plt.ylim(-2.0, 2.0)
    for i, n in enumerate([15, 100]):
        # get weight vector
        x_train, y_train, weight_vector = analyze_overfit(N=n)
        # make predictions
        poly_test_features = transform_features(x, m=9)
        y_predict = multiple_regression_predict(poly_test_features, weight_vector)
        # true function
        ax[i].plot(x_plot, y_plot, color='lime', label=r"$\sin(2\pi x)$")
        # training data
        ax[i].scatter(x_train, y_train, marker='o', color='blue', label="Train")
        # show the training set size
        ax[i].text(0.65, 0.8, f"N={n}")
        # finally the predictions
        ax[i].plot(x, y_predict, color='red', label="Test")
regress_and_plot(x=x_test)
From the above plots, it is evident that increasing the dataset size to 100 gets rid of the overfitting.

One rough heuristic that is sometimes advocated is that the number of data points should be no less than some multiple (say 5 or 10) of the number of adaptive parameters in the model. For M=9 there are 10 parameters, so this suggests roughly 50-100 points, consistent with what we see above.

It is indeed a nuisance to have this delicate dance between the number of parameters and the dataset size. Could Bayesian inference help? The author says yes.

For the moment we restrict ourselves to the current situation. Because we had a synthetic dataset, we could increase the number of training samples at will, but the real world is different. So the next question to ask is: could we reduce overfitting when we have fewer training examples?
Regularization¶
We observed that the weights started to reach large values for M=9; one technique to circumvent this is to add a penalty term to the error function in order to discourage the coefficients from reaching large values.

The new error function then takes the form

\[ \tilde{E}(\mathbf{w}) = \frac{1}{2}\sum_{n=1}^{N}\bigl(y(x_n, \mathbf{w}) - y_n\bigr)^2 + \frac{\lambda}{2}\lVert\mathbf{w}\rVert^2 \]

where \(y(x_n, \mathbf{w})\) is the polynomial's prediction for input \(x_n\), \(y_n\) is the corresponding target, and \(\lVert\mathbf{w}\rVert^2 = \mathbf{w}^T\mathbf{w}\).
The \(\lambda\) in the above equation controls the contribution of the regularization to the overall error. This is a hyper-parameter.
The minimization of this error function can still be done in closed form.

This method is known by various other names as well:

- In statistics, it is known as shrinkage since we are reducing the values of \(\mathbf{w}\).
- In machine learning, we call it weight decay.
- In this particular case of a quadratic penalty, it is also called Ridge Regression.
After some mathematical rigor, the closed-form solution is

\[ \mathbf{w}^* = \bigl(\mathbf{\Phi}^T\mathbf{\Phi} + \lambda\mathbf{I}\bigr)^{-1}\mathbf{\Phi}^T\mathbf{y} \]

where \(\mathbf{\Phi}\) is the matrix of polynomial features and \(\mathbf{y}\) is the vector of training targets.
# we will modify our multiple regression fit function
# to take this into account. Rest of the functions defined above
# should still be usable
def multiple_regression_ridge_fit(features, y_train, lamb):
    # Note - addition of the regularization term lamb * I to A_t
    A_t = features.T @ features + lamb * np.eye(features.shape[1], features.shape[1])
    weight_vector = np.linalg.solve(A_t, features.T @ y_train)
    return weight_vector
def regress_ridge(m, x, y, lamb):
    poly_train_features = transform_features(x, m=m)
    # get weights
    weight_vector = multiple_regression_ridge_fit(poly_train_features, y, lamb)
    return weight_vector
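# Sanity check (an assumption, not part of the original text): scikit-learn's
# Ridge solves the same penalized least-squares problem; with fit_intercept=False
# (the bias is already the constant polynomial feature) and alpha equal to our
# lamb, its coefficients should closely match multiple_regression_ridge_fit.
from sklearn.linear_model import Ridge
check_features_m_9 = transform_features(x_train, m=9)
w_ours = multiple_regression_ridge_fit(check_features_m_9, y_train, lamb=np.exp(-18))
w_sklearn = Ridge(alpha=np.exp(-18), fit_intercept=False).fit(check_features_m_9, y_train).coef_
print(np.max(np.abs(w_ours - w_sklearn)))   # should be small relative to the weight magnitudes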
def regress_ridge_and_plot(x):
    fig, ax = plt.subplots(1, 3, sharey=True, figsize=(10, 4))
    plt.xlim(-0.05, 1.05)
    plt.ylim(-2.0, 2.0)
    for i, lamb in enumerate([-1000, -18, 0]):
        weight_vector_m_9_ridge = regress_ridge(m=9, x=x_train, y=y_train, lamb=np.exp(lamb))
        y_predict = predict(m=9, x=x, weight_vector=weight_vector_m_9_ridge)
        # true function
        ax[i].plot(x_plot, y_plot, color='lime', label=r"$\sin(2\pi x)$")
        # training data
        ax[i].scatter(x_train, y_train, marker='o', color='blue', label="Train")
        # show the regularization strength
        ax[i].text(0.65, 0.8, rf"$\ln \lambda$={lamb}")
        # finally the predictions
        ax[i].plot(x, y_predict, color='red', label="Test")
regress_ridge_and_plot(x=x_test)
\(\ln \lambda = -1000\) effectively corresponds to having no regularizer. The plot above shows the effect of three different regularizers; clearly, out of these three, \(\ln \lambda = -18\) seems to do the trick.
weight_vector_m_9_0 = regress_ridge(m=9, x=x_train, y=y_train, lamb=np.exp(-1000))
weight_vector_m_9_1 = regress_ridge(m=9, x=x_train, y=y_train, lamb=np.exp(-18))
weight_vector_m_9_2 = regress_ridge(m=9, x=x_train, y=y_train, lamb=np.exp(0))
weight_vector_m_9_0 = pad_weight_vectors(weight_vector_m_9_0)
weight_vector_m_9_1 = pad_weight_vectors(weight_vector_m_9_1)
weight_vector_m_9_2 = pad_weight_vectors(weight_vector_m_9_2)
df = pd.DataFrame([weight_vector_m_9_0,weight_vector_m_9_1,weight_vector_m_9_2]).transpose()
df.columns = ['ln lambda=-1000', 'ln lambda=-18', 'ln lambda=0', ]
df
|   | ln lambda=-1000 | ln lambda=-18 | ln lambda=0 |
|---|---|---|---|
| 0 | 0.117859 | 0.084455 | 0.357091 |
| 1 | -32.239388 | -8.198305 | -0.332004 |
| 2 | 552.759807 | 185.185113 | -0.403486 |
| 3 | -2730.224980 | -860.896425 | -0.304300 |
| 4 | 4770.532869 | 1438.245455 | -0.191590 |
| 5 | 2010.207069 | -422.121845 | -0.097300 |
| 6 | -19325.059542 | -1042.178585 | -0.023986 |
| 7 | 28349.068583 | 231.682032 | 0.031876 |
| 8 | -17838.530474 | 1113.582844 | 0.074296 |
| 9 | 4242.807525 | -635.940163 | 0.106591 |
We see that, as the value of \(\lambda\) increases, the typical magnitude of the coefficients gets smaller.
lamb = range(-40, -10, 1)
train_errors = []
test_errors = []
for l in lamb:
    weight_vector_m_9_ridge = regress_ridge(m=9, x=x_train, y=y_train, lamb=np.exp(l))
    y_train_predict = predict(m=9, x=x_train, weight_vector=weight_vector_m_9_ridge)
    y_test_predict = predict(m=9, x=x_test, weight_vector=weight_vector_m_9_ridge)
    train_errors.append(rmse(y_train, y_train_predict))
    test_errors.append(rmse(y_test, y_test_predict))
plt.plot(lamb, train_errors, 'o-', mfc="none", mec="b", ms=10, c="b", label="Training")
plt.plot(lamb, test_errors, 'o-', mfc="none", mec="r", ms=10, c="r", label="Test")
plt.legend()
plt.yticks( [0,0.5,1] )
plt.xlim((-40, -10))
plt.xlabel("$ln \lambda$")
plt.ylabel("RMSE");