In [1]:
import plotly.io as pio; pio.renderers.default = "notebook_connected"

Lecture 13: Gradient Descent and SGD – CS 189, Fall 2025

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
from ipywidgets import HBox
import numpy as np
In [3]:
# make the images folder if it doesn't exist
import os
if not os.path.exists("images"):
    os.makedirs("images")

Plotting Code You Can Safely Ignore (but Need to Run)

This lecture has many complex visualizations, so to keep the rest of the code short I have put the visualization code here. Much of this code is out of scope for the course, but feel free to look through it if you are interested.

In [4]:
def make_plot_grid(figs, rows, cols):
    """Create a grid of figures with Plotly figures."""
    from plotly.subplots import make_subplots
    def get_trace_type(fig):
        for trace in fig.data:
            if trace.type == 'surface':
                return 'surface'  # required for go.Surface
            elif trace.type.startswith('scatter3d') or trace.type.startswith('mesh3d'):
                return 'scene'  # 3D scene
        return 'xy'  # default 2D
    specs = [[{'type': get_trace_type(fig)} for fig in figs[i:i+cols]] for i in range(0, len(figs), cols)]
    fig_grid = make_subplots(rows=rows, cols=cols, specs=specs,
                             subplot_titles=[fig.layout.title.text for fig in figs])
    for i, fig in enumerate(figs):
        fig_grid.add_traces(fig.data, rows=(i//cols) + 1, cols=(i%cols) + 1 )
    return fig_grid
In [5]:
def plot_lr_predictions(w, cancer):
    """Plot predictions of the logistic model."""
    cancer = cancer.copy()
    cancer['logistic_pred'] = np.where(
        logistic_model(w, cancer[['mean radius', 'mean texture']].values) > 0.5,
        'Pred Malignant', 'Pred Benign'
    )
    # Create a scatter plot with logistic predictions
    fig = px.scatter(cancer, x='mean radius', y='mean texture', 
                     symbol='logistic_pred', color='target',
                     symbol_sequence=[ "circle-open", "cross"])
    for (i,t) in enumerate(fig.data): t.legendgroup = str(i)
    # decision boundary
    xs = np.linspace(cancer['mean radius'].min(), cancer['mean radius'].max(), 100)
    decision_boundary = -(w[0] * xs ) / w[1]
    fig.add_scatter(x=xs, y=decision_boundary, mode='lines', 
                    name='Decision Boundary', legendgroup='Decision Boundary',
                    line=dict(color='black', width=2, dash='dash', ))
    # probability surface
    ys = np.linspace(cancer['mean texture'].min(), cancer['mean texture'].max(), 100)
    X, Y = np.meshgrid(xs, ys)
    Z = logistic_model(w, np.c_[X.ravel(), Y.ravel()]).reshape(X.shape)
    fig.add_contour(x=xs, y=ys, z=Z, 
                    colorscale='Matter_r', opacity=0.5,
                    name='Probability Surface', 
                    colorbar=dict(x=1.05, y=0.3, len=0.75))

    fig.update_layout(title=f'w=({w[0]:.2f}, {w[1]:.2f})',
                      xaxis_range=[xs.min(), xs.max()], yaxis_range=[ys.min(), ys.max()],
                      xaxis_title='Mean Radius (scaled)', yaxis_title='Mean Texture (scaled)',
                      width=800, height=600)
    return fig
In [6]:
def plot_loss(w1, w2, error, ncontours=50):
    surf_fig = go.Figure()
    surf_fig.add_surface(z=error, x=w1, y=w2, 
                         colorscale='Viridis_r', opacity=0.7, showscale=False,
                         contours=dict(z=dict(show=True, highlightcolor="white", 
                                          start=error.min(), end=error.max(), 
                                          size=(error.max()-error.min())/ncontours)))
    surf_fig.update_layout(title="Loss Surface")
    contour_fig = go.Figure()
    contour_fig.add_contour(x=w1.flatten(), y=w2.flatten(), z=error.flatten(),
                            colorscale='Viridis_r', opacity=0.7,
                            contours=dict(start=error.min(), end=error.max(), 
                                          size=(error.max()-error.min())/ncontours),
                            colorbar=dict(x=1.05, y=0.35, len=0.75))
    contour_fig.update_layout(title="Loss Contours")
    fig = make_plot_grid([surf_fig, contour_fig], 1, 2).update_layout(height=800)
    fig.update_layout(scene=dict(xaxis_title='w1', yaxis_title='w2', zaxis_title='Loss', aspectmode='cube'))
    fig.update_layout(xaxis_range=[w1.min(), w1.max()], yaxis_range=[w2.min(), w2.max()],
                      xaxis_title='w1', yaxis_title='w2')
    return fig
In [7]:
def plot_gradient(w1, w2, error, dw1, dw2, scale=1.0):
    fig = plot_loss(w1, w2, error)
    fig.add_trace(
        go.Cone(
            x=w1.flatten(), y=w2.flatten(), z=np.zeros_like(error).flatten(),  # Ground plane
            u=dw1.flatten(), v=dw2.flatten(), w=np.zeros_like(error).flatten(),  # No vertical component
            sizeref=2, anchor="tail", showscale=False
        ), 1,1)
    contour_fig = ff.create_quiver(
        x=w1.flatten(), y=w2.flatten(), u=dw1.flatten(), v=dw2.flatten(), 
        line_width=2, line_color="white",
        scale = scale, arrow_scale=.2, showlegend=False)
    fig.add_traces(contour_fig.data, rows=1, cols=2)
    return fig
In [8]:
def add_solution_path(fig, errors, ws):
    s = np.linspace(0, 1, len(ws))
    fig.add_scatter3d(x=ws[:, 0], y=ws[:, 1], z=errors, marker_color=s, marker_size=5,
                    mode='lines+markers', line=dict(color='black', width=2), opacity=0.5,
                    name='Gradient Descent Path', legendgroup='Gradient Descent',
                    row=1, col=1)
    fig.add_scatter(x=ws[:, 0], y=ws[:, 1], marker_color=s,
                    mode='lines+markers', line=dict(color='black', width=2), opacity=0.5,
                    name='Gradient Descent Path', legendgroup='Gradient Descent',
                    showlegend=False,
                    row=1, col=2)
    fig.add_scatter3d(x=[ws[-1, 0]], y=[ws[-1, 1]], z=[errors[-1]],
                       mode='markers', marker=dict(color='red', size=10),
                       name='Final Solution', legendgroup='Final Solution',
                       row=1, col=1)
    fig.add_scatter(x=[ws[-1, 0]], y=[ws[-1, 1]],
                       mode='markers', marker=dict(color='red', size=20),
                       name='Final Solution', legendgroup='Final Solution',
                       showlegend=False,
                       row=1, col=2)
    return fig

Lecture 12 Setup

In the previous lecture we visualized the loss surfaces and derived gradient descent. In this part of the notebook we set up the data, loss, and gradient descent code that we will use to visualize gradient descent and stochastic gradient descent.

The Logistic Regression Problem

In [9]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
cancer_dict = datasets.load_breast_cancer(as_frame=True)
cancer_df = pd.DataFrame(cancer_dict.data, columns=cancer_dict.feature_names)
cancer_df['target'] = cancer_dict.target.astype(str)
cancer_df = cancer_df[['mean radius', 'mean texture', 'target']].dropna()
scaler = StandardScaler()
cancer_df[['mean radius', 'mean texture']] = scaler.fit_transform(cancer_df[['mean radius', 'mean texture']])
print("The dataset:", cancer_df.shape)
display(cancer_df.head())

cancer = dict()
cancer['npts'] = 30
cancer['w1'], cancer['w2'] = np.meshgrid(
    np.linspace(-10, 1, cancer['npts']), 
    np.linspace(-5, 1.3, cancer['npts'])
)
cancer['ws'] = np.stack([cancer['w1'].flatten(), cancer['w2'].flatten()]).T
cancer['x'] = cancer_df[['mean radius', 'mean texture']].values
cancer['t'] = cancer_df['target'].values.astype(float)
The dataset: (569, 3)
   mean radius  mean texture target
0     1.097064     -2.073335      0
1     1.829821     -0.353632      0
2     1.579888      0.456187      0
3    -0.768909      0.253732      0
4     1.750297     -1.151816      0
In [10]:
def sigmoid(z):
    """Sigmoid function."""
    return 1 / (1 + np.exp(-z))

def logistic_model(w, x):
    """Logistic model for binary classification."""
    z = w @ x.T
    return sigmoid(z)

def NLL(w):
    """Negative log-likelihood for logistic regression."""
    x = cancer['x']
    t = cancer['t']
    z = w @ x.T
    # return -np.mean(t * np.log(logistic_model(w, x)) + (1 - t) * np.log(1 - logistic_model(w, x)))
    # more numerically stable version
    # np.mean(np.log(1+ np.exp(z)) - t * z) 
    # stable softplus: log(1+exp(z))
    softplus_z = np.logaddexp(0, z)
    return np.mean(softplus_z - t * z)


def grad_NLL(w, x=cancer['x'], t=cancer['t']):
    """Compute the gradient of the negative log-likelihood."""
    p = logistic_model(w, x)
    grad = np.mean((p - t).reshape(-1, 1) * x, 0)
    return grad
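
To see why the softplus form matters, here is a small numerical check (an illustrative sketch, not part of the lecture code): for large $z$ the naive expression $\log(1 + e^z)$ overflows, while np.logaddexp(0, z) computes the same quantity stably.

# Quick numerical-stability check (illustrative sketch).
z_big = np.array([800.0])
naive = np.log(1 + np.exp(z_big))   # np.exp(800) overflows to inf (with a warning), so this is inf
stable = np.logaddexp(0, z_big)     # log(1 + e^z) ~ z for large z, so this returns 800.0
print("naive:", naive, "stable:", stable)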
In [11]:
cancer['error'] = np.array([NLL(w) 
                for w in cancer['ws']])
cancer['error'] = cancer['error'].reshape(cancer['w1'].shape)
# guesses = np.array([[-1., -1.],
#                     [-2., -4.]])
best_ind = np.argmin(cancer['error'])
cancer['grid_best'] = np.array([cancer['w1'].flatten()[best_ind], 
                                cancer['w2'].flatten()[best_ind]])

Computing the gradient fields and visualizing gradient descent.

In [12]:
(cancer['dw1'], cancer['dw2']) = np.array(
    [grad_NLL(w) for w in cancer['ws']]).T
cancer['dw1'] = cancer['dw1'].reshape(cancer['w1'].shape)
cancer['dw2'] = cancer['dw2'].reshape(cancer['w1'].shape)

The Squared Error Loss on a Non-Linear Model

In [13]:
sine = dict()
np.random.seed(42)
sine['n'] = 200
# Generate some random data
sine['x'] = np.random.rand(sine['n']) * 2.5 * np.pi
sine['x'] = np.sort(sine['x']) # sort for easier plotting
sine['y'] = np.sin(1.1 + 2.5 * sine['x']) + 0.5 * np.random.randn(sine['n'])
sine_df = pd.DataFrame({'x': sine['x'], 'y': sine['y']})
In [14]:
def sine_model(w, x):
    return np.sin(w[0] + x * w[1])

def sine_MSE(w):
    x = sine['x']
    y = sine['y']
    y_hat = sine_model(w, x)
    return np.mean((y - y_hat) ** 2)

def grad_sine_MSE(w, x=sine['x'], y=sine['y']):
    """Compute the gradient of the negative log-likelihood."""
    y_hat = sine_model(w, x)
    grad_w0 = -2 * np.mean((y - y_hat) * np.cos(w[0] + w[1] * x))
    grad_w1 = -2 * np.mean((y - y_hat) * x * np.cos(w[0] + w[1] * x))
    return np.array([grad_w0, grad_w1])
In [15]:
sine['guesses'] = np.array([[0, 2], [2, 3], [0, 3.5]])
sine['xhat'] = np.linspace(sine_df['x'].min(), sine_df['x'].max(), 100)
sine['pred_df'] = pd.DataFrame({'x': sine['xhat']})
for w in sine['guesses']:
    sine['pred_df'][f'yhat(w={w})'] = sine_model(w, sine['xhat'])
sine['pred_df'].head()
Out[15]:
          x  yhat(w=[0. 2.])  yhat(w=[2. 3.])  yhat(w=[0. 3.5])
0  0.043371         0.086632         0.847619          0.151215
1  0.121225         0.240082         0.701797          0.411673
2  0.199080         0.387723         0.517864          0.641752
3  0.276935         0.525982         0.305809          0.824474
4  0.354790         0.651515         0.077147          0.946355
In [16]:
sine['npts'] = 30
sine['w0'], sine['w1'] = np.meshgrid(
    np.linspace(-1.5, 3, sine['npts']), np.linspace(1, 4, sine['npts']))
# combine w1 and w2 into a single tensor
sine['ws'] = np.stack([sine['w0'].flatten(), sine['w1'].flatten()]).T

sine['error'] = np.array([sine_MSE(w) for w in sine['ws']]).reshape(sine['w0'].shape)
In [17]:
ind = np.argmin(sine['error'])
sine['grid_best'] = sine['ws'][ind,:]
sine['grid_best_error'] = sine['error'].flatten()[ind]
print(f"Best weights: {sine['grid_best']}, with error: {sine['grid_best_error']}")
Best weights: [1.29310345 2.44827586], with error: 0.23832770254453578
In [18]:
(sine['dw0'], sine['dw1']) = np.array(
    [grad_sine_MSE(w) for w in sine['ws']]).T
sine['dw0'] = sine['dw0'].reshape(sine['w1'].shape)
sine['dw1'] = sine['dw1'].reshape(sine['w1'].shape)

The Gradient Descent Algorithm

Here we implement the most basic version of gradient descent. This version uses a fixed step size and does not use any fancy tricks like momentum or adaptive step sizes (which we will see soon).
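
For reference, the update rule implemented below, with a fixed learning rate $\eta$, is

$$ w^{(t+1)} = w^{(t)} - \eta \, \nabla E\left(w^{(t)}\right) $$

and we stop after a maximum number of iterations or when the change in the weights falls below a small threshold $\epsilon$.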

In [19]:
def gradient_descent(w_0, gradient, 
    learning_rate=1, nepochs=10, epsilon=1e-6):
    """Basic gradient descent algorithm.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        learning_rate: Step size for each iteration.
        nepochs: Maximum number of iterations.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w_old = w_0
    path = [w_old]
    for e in range(nepochs):
        w = w_old - learning_rate * gradient(w_old)
        path.append(w)
        if np.linalg.norm(w - w_old) < epsilon: break
        w_old = w
    return np.array(path)

Gradient Descent Applied to Logistic Regression

In [20]:
# w0 = np.array([-10., -5.])
# w0 = np.array([-1., 2.])
w0 = np.array([0., 0])
path = gradient_descent(w0, 
                     grad_NLL,
                     learning_rate=1, 
                     nepochs=100)
errors = [NLL(w) for w in path]
fig = plot_gradient(
    cancer['w1'], cancer['w2'], cancer['error'], 
    cancer['dw1'], cancer['dw2'], scale=2)
add_solution_path(fig, errors, path)

Gradient Descent Applied to Sine Model

In [21]:
# w0 = np.array([1.2, 2.])
# w0 = np.array([.5, 2.5])
w0 = np.array([2, 2])
path = gradient_descent(w0, 
                        grad_sine_MSE,
                        learning_rate=.01, 
                        nepochs=20)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)

Loss Curves

In practice, we are typically unable to visualize the loss surface. Instead, we can plot the loss value as a function of iteration number. This is called a loss curve. Here we plot the loss curve for gradient descent on our sine model.

In [22]:
def make_loss_curve(path, error_func):
    """Make a loss curve from a path of weights."""
    errors = [error_func(w) for w in path]
    fig = px.line(x=np.arange(len(errors)), y=errors, 
                  labels={'x': 'Iteration (Gradient Steps)', 'y': 'Loss (Error)'})
    fig.update_traces(line_width=4)
    fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
    return fig
In [23]:
w0 = np.array([2.5, 1.9])
path = gradient_descent(w0, 
                        grad_sine_MSE,
                        learning_rate=.05, 
                        nepochs=50)

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)
fig.show()
In [24]:
w0 = np.array([-10., -5.])
# w0 = np.array([-1., 2.])
# w0 = np.array([0., 0])
path = gradient_descent(w0, 
                     grad_NLL,
                     learning_rate=10, 
                     nepochs=100)

fig_loss = make_loss_curve(path, NLL)
fig_loss.show()

errors = [NLL(w) for w in path]
fig = plot_gradient(
    cancer['w1'], cancer['w2'], cancer['error'], 
    cancer['dw1'], cancer['dw2'], scale=2)
add_solution_path(fig, errors, path)

Convergence on the Parabola

In [25]:
x = np.linspace(-10, 10, 100)
y = x**2 + 1
fig = px.line(x=x, y=y, labels={'x': 'x', 'y': 'f(x)'})
fig.update_traces(line_width=5)

def grad_f(x):
    return 2 * x
w0 = 2
path = gradient_descent(w0, grad_f, learning_rate=1.1, nepochs=10)
fig.add_scatter(x=path, y=path**2 + 1, mode='markers+lines',
                marker=dict(size=10, color='red'),
                line=dict(color='black', width=2, dash="dash"),
                name='Gradient Descent Path')

The Second-Order Structure

Here we examine the second-order structure of the loss function through a Taylor expansion.

The Hessian of the Logistic Regression Model

\begin{align} \text{H} &= \begin{bmatrix} \frac{\partial^2 E}{\partial w_0^2} & \frac{\partial^2 E}{\partial w_0 \partial w_1} \\ \frac{\partial^2 E}{\partial w_1 \partial w_0} & \frac{\partial^2 E}{\partial w_1^2} \end{bmatrix}\\ \end{align}

We can start with the gradient:

\begin{align*} \frac{\partial L}{\partial w_j} &= \frac{1}{N} \sum_{i=1}^N \left( p(t_i|x_i) - t_i \right) x_{ij} = \frac{1}{N} \sum_{i=1}^N \left( \sigma(w^T x_i) - t_i \right) x_{ij} \end{align*}

Using the chain rule, we can compute the second derivatives of the loss function with respect to the parameters $w_k$: \begin{align*} \frac{\partial}{\partial w_k} \frac{\partial L}{\partial w_j} &= \frac{1}{N} \sum_{i=1}^N \frac{\partial}{\partial w_k}\left( \sigma(w^T x_i) - t_i \right) x_{ij}\\ &= \frac{1}{N} \sum_{i=1}^N x_{ij} \frac{\partial}{\partial w_k}\sigma(w^T x_i) \\ &= \frac{1}{N} \sum_{i=1}^N x_{ij} \sigma(w^T x_i)(1 - \sigma(w^T x_i)) x_{ik} \end{align*}

In [26]:
def hessian_NLL(w):
    """Compute the Hessian of the negative log-likelihood."""
    x = cancer['x']
    p = logistic_model(w, x)
    hessian = (x.T @ np.diag(p * (1 - p)) @ x ) / len(p)
    return hessian
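
As a sanity check on the derivation (an illustrative sketch, not part of the lecture; the test point w_check is arbitrary), we can compare the analytic Hessian against a central finite-difference approximation of grad_NLL.

# Finite-difference check of hessian_NLL (illustrative sketch; w_check is arbitrary).
w_check = np.array([-2.0, -1.0])
eps = 1e-5
H_fd = np.zeros((2, 2))
for k in range(2):
    dw = np.zeros(2)
    dw[k] = eps
    # column k of the Hessian is the derivative of the gradient with respect to w_k
    H_fd[:, k] = (grad_NLL(w_check + dw) - grad_NLL(w_check - dw)) / (2 * eps)
print(np.allclose(H_fd, hessian_NLL(w_check), atol=1e-6))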

The Hessian of the Sine Regression Model

\begin{align} \text{H} &= \begin{bmatrix} \frac{\partial^2 E}{\partial w_0^2} & \frac{\partial^2 E}{\partial w_0 \partial w_1} \\ \frac{\partial^2 E}{\partial w_1 \partial w_0} & \frac{\partial^2 E}{\partial w_1^2} \end{bmatrix}\\ \end{align}

We start with the gradient. Calculating each term we get:

\begin{align*} \frac{\partial E}{\partial w_0} &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) \end{align*}

\begin{align*} \frac{\partial E}{\partial w_1} &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) x_n \end{align*}

Using the product rule and chain rule, we can compute the second derivatives of the loss function with respect to the parameters $w_0$ and $w_1$. \begin{align} \frac{\partial^2 E}{\partial w_0^2} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) \right]\\ \frac{\partial^2 E}{\partial w_1^2} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) x_n^2 + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) x_n^2 \right]\\ \frac{\partial^2 E}{\partial w_0 \partial w_1} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) x_n + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) x_n \right] \end{align}

In [27]:
def hessian_sine_MSE(w):
    """Compute the Hessian of the negative log-likelihood."""
    x = sine['x']
    y = sine['y']
    dw0dw0 = 2 * np.mean(np.cos(w[0] + w[1] * x)**2 + 
                (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    dw0dw1 = 2 * np.mean(x * np.cos(w[0] + w[1] * x)**2 +
                x * (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    dw1dw1 = 2 * np.mean(x**2 * np.cos(w[0] + w[1] * x)**2 + 
                (x**2) * (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
    hessian = np.array([[dw0dw0, dw0dw1], [dw0dw1, dw1dw1]])
    return hessian

We can also derive the gradient and Hessian using a symbolic algebra library. This is a powerful technique that allows us to compute the derivatives of a function without having to work them out by hand. We will use the sympy library to do this.

In [28]:
# unfortunately on colab we need to upgrade sympy to get the needed functionality
# !pip install --upgrade sympy
In [29]:
# import sympy as sp
# # define our symbols
# w0, w1, x, y = sp.symbols('w0 w1 x y')
# # Define a symbolic expression for the error
# E = (y - sp.sin(w0 + w1 * x))**2
# # Compute the gradient of E with respect to w0 and w1
# gE = [sp.diff(E, var) for var in (w0, w1)]
# gE
In [30]:
# hessian_E = sp.hessian(E, (w0, w1))
# hessian_E

Using sympy to do everything algebraically, we get the same result.

In [31]:
# n = sp.symbols('n', integer=True)    # number of data points (symbolic length)
# i = sp.Idx('i', n)                   # index variable for summation
# w0, w1 = sp.symbols('w0 w1')         # weights
# x = sp.IndexedBase('x', shape=(n,))  # indexed variable for x data
# y = sp.IndexedBase('y', shape=(n,))  # indexed variable for y data

# E_i = (y[i] - sp.sin(w0 + w1 * x[i]))**2
# E = sp.summation(E_i, (i, 0, n - 1)) / n
# H = sp.hessian(E, (w0, w1))

# Hfun = sp.lambdify((w0, w1, x, y, n), H)
# def hessian_sine_MSE2(w):
#     return np.array(Hfun(w[0], w[1], sine['x'], sine['y'], len(sine['x'])))
In [32]:
# hessian_sine_MSE(np.array([1.1, 2.5]))
In [33]:
# hessian_sine_MSE2(np.array([1.1, 2.5]))

Taylor Expansion of the Loss

The Taylor expansion of a function $f(x)$ around a point $x=a$ is given by:

$$ f(x) = f(a) + \nabla f(a)^T (x - a) + \frac{1}{2} (x - a)^T H(a) (x - a) + \ldots $$
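
In particular, near a local minimum $w^\star$ the gradient is approximately zero, so the quadratic term dominates:

$$ f(w) \approx f(w^\star) + \frac{1}{2} (w - w^\star)^T H(w^\star) (w - w^\star) $$

which is why the second-order approximation around the optimum looks like an elliptical bowl whose shape is determined by the Hessian.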

In [34]:
def taylor_loss(w, w_star, L_star, g_star, H_star):
    delta = w - w_star
    return (L_star + delta @ g_star + 1/2 * delta.T @ H_star @ delta)

Applied to Logistic Regression

In [35]:
# w_star = np.array([-2., -2.])
w_star = cancer['grid_best']
L_star = NLL(w_star)
g_star = grad_NLL(w_star)
H_star = hessian_NLL(w_star)
s,u = np.linalg.eigh(H_star)
print("w_star:", w_star)
print("L_star:", L_star)
print("g_star:", g_star)
print("H_star:\n", H_star)
print("Eigenvalues of H_star:", s)
print("Eigegenvectors of H_star:\n", u)
w_star: [-3.93103448 -0.87241379]
L_star: 0.2757763227300472
g_star: [0.00030902 0.0013768 ]
H_star:
 [[ 0.01454776 -0.00817985]
 [-0.00817985  0.08057159]]
Eigenvalues of H_star: [0.01354943 0.08156991]
Eigenvectors of H_star:
 [[-0.99263445 -0.12114806]
 [-0.12114806  0.99263445]]

Visualizing the Hessian at the optimal point, we see that it is positive definite (both eigenvalues are positive). This is consistent with the fact that the logistic regression loss is convex.

In [36]:
fig = plot_gradient(cancer['w1'], cancer['w2'], 
                    cancer['error'], cancer['dw1'], cancer['dw2'])

cancer['taylor_loss'] = np.array([
    taylor_loss(w, w_star, L_star, g_star, H_star)
for w in cancer['ws']]).reshape(cancer['w1'].shape)

fig.add_surface(z=cancer['taylor_loss'], x=cancer['w1'], y=cancer['w2'], 
                 colorscale='plasma_r', opacity=0.5, showscale=False,
                 contours=dict(z=dict(show=True, highlightcolor="white", 
                                      start=cancer['taylor_loss'].min(), end=cancer['taylor_loss'].max(), 
                                      size=(cancer['taylor_loss'].max()-cancer['taylor_loss'].min())/50)), 
                                      row=1, col=1)
fig.update_layout(scene=dict(zaxis=dict(range=[0, 3])))
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
display(fig)

We can visualize the eigenvectors of the Hessian at the optimal point on the quadratic approximation. The eigenvectors give us the directions of curvature of the loss function. The eigenvalues give us the amount of curvature in each direction.

The contours of the quadratic approximation are ellipses given by the equation: $$ \sum_i \lambda_i \alpha_i^2 = \text{const}, $$ where $\alpha_i$ is the displacement from the optimum along the $i^{th}$ eigenvector. Solving for the extent of the ellipse along the $i^{th}$ eigenvector we get: $$ \alpha_i = \pm \sqrt{\frac{\text{const}}{\lambda_i}} $$ so directions with larger eigenvalues (more curvature) have shorter contour axes, which is how the eigenvectors are scaled in the plot below.

In [37]:
fig = go.Figure()
fig.add_contour(z=cancer['taylor_loss'].flatten(), x=cancer['w1'].flatten(), 
                y=cancer['w2'].flatten(),
                colorscale='viridis_r', opacity=0.5,
                contours=dict(start=cancer['taylor_loss'].min(), end=cancer['taylor_loss'].max(), 
                            size=(cancer['taylor_loss'].max()-cancer['taylor_loss'].min())/50),
                colorbar=dict(x=1.05, y=0.3, len=0.75))
scaling = 0.5
lam, U = np.linalg.eigh(H_star)
scale = scaling / np.sqrt(np.abs(lam))

cx, cy = cancer['grid_best']
for i, (lami, ui, si) in enumerate(zip(lam, U.T, scale), start=1):
    fig.add_scatter(
        x=[cx, cx + si*ui[0]],
        y=[cy, cy + si*ui[1]],
        mode='lines+markers',
        line=dict(width=2),
        name=f'u{i} (λ={lami:.3g})'
    )
    fig.add_scatter(
        x=[cx, cx - si*ui[0]],
        y=[cy, cy - si*ui[1]],
        mode='lines',
        line=dict(width=2, dash='dot'),
        showlegend=False
    )

fig.update_layout(title="Taylor Approximation of Loss",
                  xaxis_title='w1', yaxis_title='w2',
                  height=600, width=1200)
# fig.write_image("images/taylor_approx_loss_contours.pdf", width=1200, height=600)
fig

Applied to Sine Regression

In [38]:
# w_star = np.array([-2., -2.])
sine['w_star'] = sine['grid_best']
sine['L_star'] = sine_MSE(sine['w_star'])
sine['g_star'] = grad_sine_MSE(sine['w_star'])
sine['H_star'] = hessian_sine_MSE(sine['w_star'])
sine['lam'], sine['U'] = np.linalg.eigh(sine['H_star'])
print("w_star:", sine['w_star'])
print("L_star:", sine['L_star'])
print("g_star:", sine['g_star'])
print("H_star:\n", sine['H_star'])
print("Eigenvalues of H_star:", sine['lam'])
print("Eigenvectors of H_star:\n", sine['U'])
w_star: [1.29310345 2.44827586]
L_star: 0.23832770254453578
g_star: [ 0.02639336 -0.10396998]
H_star:
 [[ 0.94921519  3.62350418]
 [ 3.62350418 19.09410494]]
Eigenvalues of H_star: [ 0.25236948 19.79095065]
Eigenvectors of H_star:
 [[-0.98200554  0.18885209]
 [ 0.18885209  0.98200554]]

Visualizing the Hessian at the grid-optimal point, we see that it is positive definite (both eigenvalues are positive), so this point is a local minimum. Unlike the logistic regression loss, however, this loss surface is not convex, and the quadratic approximation is only accurate locally.

In [39]:
fig = plot_gradient(sine['w0'], sine['w1'], 
                    sine['error'], sine['dw0'], sine['dw1'], scale=0.1)

sine['taylor_loss'] = np.array([
    taylor_loss(w, sine['w_star'], sine['L_star'], sine['g_star'], sine['H_star'])
for w in sine['ws']]).reshape(sine['w1'].shape)

fig.add_surface(z=sine['taylor_loss'], x=sine['w0'], y=sine['w1'], 
                 colorscale='plasma_r', opacity=0.5, showscale=False,
                 contours=dict(z=dict(show=True, highlightcolor="white", 
                                      start=sine['taylor_loss'].min(), end=sine['taylor_loss'].max(), 
                                      size=(sine['taylor_loss'].max()-sine['taylor_loss'].min())/50)), 
                                      row=1, col=1)
fig.update_layout(scene=dict(zaxis=dict(range=[0, 3])))
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
display(fig)
In [40]:
fig = go.Figure()
fig.add_contour(z=sine['taylor_loss'].flatten(), x=sine['w0'].flatten(), 
                y=sine['w1'].flatten(),
                colorscale='viridis_r', opacity=0.5,
                contours=dict(start=sine['taylor_loss'].min(), end=sine['taylor_loss'].max(), 
                            size=(sine['taylor_loss'].max()-sine['taylor_loss'].min())/50),
                colorbar=dict(x=1.05, y=0.3, len=0.75))
scaling = 0.5
scale = scaling / np.sqrt(np.abs(sine['lam']))

cx, cy = sine['grid_best']
for i, (lami, ui, si) in enumerate(zip(sine['lam'], sine['U'].T, scale), start=1):
    fig.add_scatter(
        x=[cx, cx + si*ui[0]],
        y=[cy, cy + si*ui[1]],
        mode='lines+markers',
        line=dict(width=2),
        name=f'u{i} (λ={lami:.3g})'
    )
    fig.add_scatter(
        x=[cx, cx - si*ui[0]],
        y=[cy, cy - si*ui[1]],
        mode='lines',
        line=dict(width=2, dash='dot'),
        showlegend=False
    )

fig.update_layout(title="Taylor Approximation of Loss",
                  xaxis_title='w0', yaxis_title='w1',
                  height=600, width=1200)
# fig.write_image("images/taylor_approx_loss_contours.pdf", width=1200, height=600)
fig

Oscillating Loss Functions

Here we visualize what happens when we have a poorly conditioned quadratic loss function. This can happen when the Hessian has very different eigenvalues. In this case, gradient descent can oscillate and take a long time to converge.
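
To make this concrete, for the quadratic $E(w) = w^\top H w$ with diagonal Hessian $H = \mathrm{diag}(h_1, h_2)$ (as used below), the gradient descent update decouples into independent per-coordinate multipliers:

$$ w_i^{(t+1)} = w_i^{(t)} - \eta \cdot 2 h_i w_i^{(t)} = (1 - 2 \eta h_i)\, w_i^{(t)} $$

so coordinate $i$ converges only if $|1 - 2\eta h_i| < 1$, i.e. $\eta < 1/h_i$. With $h_1 = 0.1$, $h_2 = 1$, and $\eta = 1$, the first coordinate contracts by a factor of $0.8$ per step while the second is multiplied by exactly $-1$ and oscillates without ever converging.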

In [41]:
H_bad = np.array([[.1, 0], [0, 1]])
def quad(w):
    """A poorly conditioned quadratic function."""
    return np.sum((w @ H_bad) * w,1)
    
def grad_quad(w):
    """Gradient of a poorly conditioned quadratic function."""
    return np.array(w @ H_bad * 2)
In [42]:
w1,w2 = np.meshgrid(np.linspace(-5, 5, 30), np.linspace(-5, 5, 30))
ws = np.hstack([w1.reshape(-1, 1), w2.reshape(-1, 1)]) 
error =  quad(ws).reshape(w1.shape)
contour = go.Contour(x=w1.flatten(), y=w2.flatten(), z=error.flatten(), colorscale='Viridis_r',
                contours=dict(start=0, end=20, size=.5))
go.Figure(data=contour)

Suppose we start at the point (-4, 0) and use a learning rate of 1. This is an ideal starting point, since the second coordinate is already at its minimum, and we can visualize the path taken by gradient descent on the loss surface.

In [43]:
w0 = np.array([-4., 0])
path = gradient_descent(w0, grad_quad, learning_rate=1, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

What happens if we start at a different point with a learning rate of 1?

In [44]:
w0 = np.array([-4.,-2.])
path = gradient_descent(w0, grad_quad, learning_rate=1, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

Or what if we decrease the learning rate slightly to 0.9?

In [45]:
w0 = np.array([-4.,-2.])
path = gradient_descent(w0, grad_quad, learning_rate=.9, nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

Gradient Descent with Momentum

Here we implement gradient descent with momentum. This is a simple modification to the basic gradient descent algorithm that can help with oscillations and speed up convergence.
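
Concretely, the update implemented below keeps a velocity $v$ and takes the step

\begin{align} v^{(t+1)} &= \mu\, v^{(t)} - \eta\, \nabla E\left(w^{(t)}\right) \\ w^{(t+1)} &= w^{(t)} + v^{(t+1)} \end{align}

where $\mu$ is the momentum factor. Setting $\mu = 0$ recovers plain gradient descent.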

In [46]:
def gd_momentum(w_0, gradient, 
    learning_rate=1, nepochs=10, epsilon=1e-6, momentum=0.9):
    """Gradient descent with momentum.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        learning_rate: Step size for each iteration.
        nepochs: Maximum number of iterations.
        epsilon: Convergence threshold.
        momentum: Momentum factor.
    Returns:
        path: Array of weights at each iteration."""
    w_old = w_0
    path = [w_old]
    v = np.zeros_like(w_old)
    for e in range(nepochs):
        g = gradient(w_old)
        v = momentum * v - learning_rate * g
        w = w_old + v
        path.append(w)
        if np.linalg.norm(w - w_old) < epsilon: break
        w_old = w
    return np.array(path)

Let's apply gradient descent with momentum to the poorly conditioned quadratic loss function from before. We will start at the point (-4, -2) and use a learning rate of 0.9 and a momentum factor of 0.3. We can visualize the path taken by gradient descent with momentum on the loss surface.

In [47]:
w0 = np.array([-4.,-2.])
path = gd_momentum(w0, grad_quad, 
                   learning_rate=.9, 
                   momentum=0.3, 
                   nepochs=50)

fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
                mode='lines+markers', line=dict(color='black', width=2), 
                marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
                name='Gradient Descent Path', legendgroup='Gradient Descent',
                showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)

We can also try momentum on our non-convex least squares problem with the non-linear sine model. Try setting momentum to 0 and see what happens.

In [48]:
w0 = np.array([2.2, 2.2])
#w0 = np.array([.5, 2.5])
w0 = np.array([1.5, 2])
path = gd_momentum(w0, 
                   grad_sine_MSE,
                   learning_rate=.1, 
                   momentum=0.9, 
                   nepochs=50)

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)

Learning Rate Schedules

Linear Schedule

$$ \eta^{(t)} = \eta_0 \left(1 - \frac{t}{T}\right) + \eta_{T} \frac{t}{T} $$

The constant $T$ is the total number of epochs.

Hyperparameters:

  • $\eta_0$: initial learning rate
  • $\eta_T$: final learning rate
In [49]:
def linear_learning_rate(initial_lr, T, end_lr=0):
    """Linearly decaying learning rate.
    Args:
        initial_lr: Initial learning rate.
        T: Total number of iterations.
        end_lr: Final learning rate.
    Returns:
        A function that takes the current iteration and returns the learning rate."""
    return lambda t: initial_lr * (1 - min(1, t / T)) + end_lr * min(1, t / T)
In [50]:
lr = linear_learning_rate(initial_lr=0.1, T=100, end_lr=0.01)
fig = px.line(x=np.arange(100), 
              markers=True,
              y=[lr(epoch) for epoch in range(100)],
              labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig

Power-law Schedule

$$ \eta^{(t)} = \eta_0 \left(1 + \frac{t}{s}\right)^{-c} $$

Hyperparameters:

  • $s$: step size (in epochs) of the decay
  • $c$: power of the decay
In [51]:
def power_law_learning_rate(initial_lr, T, s=1, c=1):
    """Power-law decaying learning rate.
    Args:
        initial_lr: Initial learning rate.
        T: Total number of epochs (not used by this schedule).
        s: Scaling factor.
        c: Exponent.
    Returns:
        A function that takes the current epoch and returns the learning rate."""
    return lambda t: initial_lr * (1 + t/s) ** (-c)
In [52]:
lr = power_law_learning_rate(initial_lr=0.1, T=100, s=10, c=1)
fig = px.line(x=np.arange(100), 
              markers=True,
              y=[lr(epoch) for epoch in range(100)],
              labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig

Exponential Schedule

$$ \eta^{(t)} = \eta_0 c^{\lfloor t/s \rfloor} $$

Hyperparameters:

  • $s$: step size (in epochs) of the decay
  • $c$: base of the exponential decay
In [53]:
def exponential_learning_rate(initial_lr, decay_rate=0.95, decay_steps=10):
    """Exponential decaying learning rate.
    Args:
        initial_lr: Initial learning rate.
        decay_rate: Decay rate (0 < decay_rate < 1).
        decay_steps: Number of steps before applying decay.
    Returns:
        A function that takes the current step and returns the learning rate."""
    return lambda t: initial_lr * decay_rate**(np.floor(t / decay_steps))
In [54]:
lr = exponential_learning_rate(initial_lr=0.1, 
                               decay_rate=0.90, 
                               decay_steps=10)
fig = px.line(x=np.arange(100), 
              markers=True,
              y=[lr(epoch) for epoch in range(100)],
              labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig

Cosine Schedule

$$ \eta^{(t)} = \eta_{\text{min}} + \frac{1}{2}(\eta_{\text{max}} - \eta_{\text{min}}) \left(1 + \cos\left(\frac{t}{T} \pi\right)\right) $$

Hyperparameters:

  • $\eta_{\text{min}}$: minimum learning rate
  • $\eta_{\text{max}}$: maximum learning rate
In [55]:
def cosine_learning_rate(T, max_lr, min_lr=0):
    """Cosine annealing learning rate.
    Args:
        T: Total number of epochs.
        max_lr: Maximum learning rate.
        min_lr: Minimum learning rate.
    Returns:
        A function that takes the current epoch and returns the learning rate."""
    return lambda t: min_lr + 0.5 * (max_lr - min_lr) * (1 + np.cos(np.pi * t / T))
In [56]:
lr = cosine_learning_rate(T=100, max_lr=1.0)
fig = px.line(x=np.arange(100), 
              markers=True,
              y=[lr(epoch) for epoch in range(100)],
              labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig

Comparing Schedules

In [57]:
linear_lr = linear_learning_rate(initial_lr=0.1, T=100, end_lr=0.01)
power_lr = power_law_learning_rate(initial_lr=0.1, T=100, s=10, c=1)
exp_lr = exponential_learning_rate(initial_lr=0.1, decay_rate=0.90, decay_steps=5)
cosine_lr = cosine_learning_rate(T=100, max_lr=0.1, min_lr=0.01)
t = np.arange(100)
fig = go.Figure()
fig.add_trace(go.Scatter(x=t, y=[linear_lr(epoch) for epoch in t], 
                         mode='lines+markers', name='Linear', line=dict(width=4)))
fig.add_trace(go.Scatter(x=t, y=[power_lr(epoch) for epoch in t], 
                         mode='lines+markers', name='Power Law', line=dict(width=4)))
fig.add_trace(go.Scatter(x=t, y=[exp_lr(epoch) for epoch in t], 
                         mode='lines+markers', name='Exponential', line=dict(width=4))) 
fig.add_trace(go.Scatter(x=t, y=[cosine_lr(epoch) for epoch in t], 
                         mode='lines+markers', name='Cosine', line=dict(width=4))) 
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),
                  xaxis_title='Epoch', yaxis_title='Learning Rate', 
                  height=800)

Adam Optimizer
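
Adam combines momentum (an exponential moving average of the gradient) with RMSProp-style scaling (an exponential moving average of the squared gradient). Writing $g^{(t)}$ for the gradient at step $t$, the update implemented below is

\begin{align} s^{(t)} &= \beta_1 s^{(t-1)} + (1 - \beta_1)\, g^{(t)} \\ r^{(t)} &= \beta_2 r^{(t-1)} + (1 - \beta_2)\, g^{(t)} \odot g^{(t)} \\ \hat{s} &= \frac{s^{(t)}}{1 - \beta_1^{t}}, \qquad \hat{r} = \frac{r^{(t)}}{1 - \beta_2^{t}} \\ w^{(t)} &= w^{(t-1)} - \eta^{(t)} \frac{\hat{s}}{\sqrt{\hat{r}} + \epsilon} \end{align}

where all operations are elementwise, the hatted terms correct for the zero initialization of $s$ and $r$, and $\epsilon$ is a small constant that keeps the denominator away from zero.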

In [58]:
def gd_adam(w_0, gradient, 
    learning_rate=lambda t: 0.1, 
    nepochs=100, 
    epsilon=1e-6, 
    beta1=0.9, 
    beta2=0.999):
    """Adam optimizer that combines momentum and RMSProp.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        learning_rate: Function mapping the step number to a learning rate.
        nepochs: Maximum number of iterations.
        epsilon: Small constant added to the denominator for numerical stability.
        beta1: Exponential decay rate for the first moment estimate.
        beta2: Exponential decay rate for the second moment estimate.
    Returns:
        path: Array of weights at each iteration."""
    w_old = w_0
    path = [w_old]
    s = np.zeros_like(w_old) # (Momentum)
    r = np.zeros_like(w_old) # (RMSProp)
    for t in range(nepochs):
        g = gradient(w_old)
        # Update biased first moment estimate and second moment estimate
        s = beta1 * s + (1 - beta1) * g
        r = beta2 * r + (1 - beta2) * g**2
        # Bias correction
        s_hat = s / (1 - beta1 ** (t + 1)) 
        r_hat = r / (1 - beta2 ** (t + 1))
        # Weight Update
        w_old = w_old - learning_rate(t) * s_hat / (np.sqrt(r_hat) + epsilon)
        path.append(w_old)
    return np.array(path)
In [59]:
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
path = gd_adam(
    w0, 
    grad_sine_MSE,
    learning_rate=lambda t: 0.1, 
    nepochs=100, 
    epsilon=1e-6, 
    beta1=0.9, 
    beta2=0.999)

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)

Factorized Adam Optimizer

Here we factor the optimizer update out of the gradient descent loop. This allows us to easily switch between different optimization algorithms (below we also define vanilla and momentum optimizers with the same interface) and to reuse the optimizer code for stochastic gradient descent.

In [60]:
class AdamOptimizer:
    def __init__(self, 
                 learning_rate=lambda t: 0.1, 
                 epsilon=1e-6, 
                 beta1=0.9, 
                 beta2=0.999):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.beta1 = beta1
        self.beta2 = beta2
        self.t = 0

    def step(self, w, g):
        # initialize state variables on first call
        if not hasattr(self, 's'):
            self.s = np.zeros_like(w) # (Momentum)
            self.r = np.zeros_like(w) # (RMSProp)
        # Update biased first moment estimate and second moment estimate
        self.s = self.beta1 * self.s + (1 - self.beta1) * g
        self.r = self.beta2 * self.r + (1 - self.beta2) * g ** 2
        # Bias correction
        s_hat = self.s / (1 - self.beta1 ** (self.t + 1))
        r_hat = self.r / (1 - self.beta2 ** (self.t + 1))
        # Weight Update
        w = w - self.learning_rate(self.t) * s_hat / (np.sqrt(r_hat) + self.epsilon)
        self.t += 1
        return w
In [61]:
class VanillaOptimizer:
    def __init__(self, learning_rate=lambda t: 0.1):
        self.learning_rate = learning_rate
        self.t = 0

    def step(self, w, g):
        w = w - self.learning_rate(self.t) * g
        self.t += 1
        return w
In [62]:
class MomentumUpdate:
    def __init__(self, learning_rate=lambda t: 0.1, momentum=0.9):
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.t = 0

    def step(self, w, g):
        if not hasattr(self, 'v'):
            self.v = np.zeros_like(w)
        self.v = self.momentum * self.v - self.learning_rate(self.t) * g
        self.t += 1
        return w + self.v
In [63]:
def gd(w_0, gradient, optimizer, nepochs = 100, epsilon=1e-8):
    """Generic gradient descent optimization.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        optimizer: An optimizer object with a step method.
        nepochs: Maximum number of iterations.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w = w_0
    path = [w]
    for t in range(nepochs):
        g = gradient(w)
        w = optimizer.step(w, g)
        path.append(w)
        if np.linalg.norm(w - path[-2]) < epsilon: break
    return np.array(path)
In [64]:
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
adam = AdamOptimizer(
    learning_rate=lambda t: 0.1,  
    beta1=0.9, 
    beta2=0.999)
momentum = MomentumUpdate(
    learning_rate=lambda t: 0.1, 
    momentum=0.9)
basic = VanillaOptimizer(learning_rate=lambda t: 0.1)

path = gd(w0, grad_sine_MSE, adam, nepochs=100)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()

Stochastic Gradient Descent
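
In stochastic gradient descent (SGD) each update uses the gradient of the loss on a single randomly chosen data point:

$$ w^{(t+1)} = w^{(t)} - \eta^{(t)} \nabla E_{i_t}\left(w^{(t)}\right) $$

where $E_{i_t}$ is the loss on example $i_t$. In the implementation below we shuffle the data once per epoch and then sweep through it, so each epoch performs one update per data point.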

In [65]:
def sgd(w_0, x, t, gradient, optimizer, nepochs = 2, epsilon=1e-8):
    """Generic stochastic gradient descent optimization.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        optimizer: An optimizer object with a step method.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w = w_0
    path = [w]
    for e in range(nepochs):
        random_indices = np.random.permutation(len(t))
        for i in random_indices:
            g = gradient(w, x[i], t[i])
            w = optimizer.step(w, g)
            path.append(w)
        if np.linalg.norm(w - path[-2]) < epsilon: break
    return np.array(path)
In [66]:
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
nepochs = 2
adam = AdamOptimizer(
    learning_rate=lambda t: 0.1, 
    beta1=0.9, 
    beta2=0.999)
momentum = MomentumUpdate(
    learning_rate=lambda t: 0.01, 
    momentum=0.5)
powerlaw = power_law_learning_rate(initial_lr=0.1, T=nepochs*len(sine['y']), s=10, c=1)
vanilla = VanillaOptimizer(learning_rate=powerlaw)

path = sgd(w0, sine['x'], sine['y'], grad_sine_MSE, 
           adam, 
           nepochs=nepochs)


errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()
In [67]:
#w0 = np.array([-10., -5.])
w0 = np.array([-1., 2.])
#w0 = np.array([0., -4])
nepochs = 1
coslr = cosine_learning_rate(T=nepochs*len(cancer['t']), max_lr=0.4, min_lr=0.01)
adam = AdamOptimizer(
    learning_rate=coslr,
    beta1=0.9, 
    beta2=0.999)
momentum = MomentumUpdate(
    learning_rate=coslr, 
    momentum=0.5)
vanilla = VanillaOptimizer(learning_rate=coslr)

path = sgd(w0, cancer['x'], cancer['t'], grad_NLL, 
           adam, 
           nepochs=nepochs)


errors = [NLL(w) for w in path]
fig = plot_gradient(
    cancer['w1'], cancer['w2'], cancer['error'], 
    cancer['dw1'], cancer['dw2'], scale=1)
fig = add_solution_path(fig, errors, path)
fig.show()

fig_loss = make_loss_curve(path, NLL)
fig_loss.show()

Minibatch Stochastic Gradient Descent
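
Mini-batch SGD averages the gradient over a small batch $\mathcal{B}$ of examples for each update:

$$ w^{(t+1)} = w^{(t)} - \eta^{(t)} \frac{1}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \nabla E_i\left(w^{(t)}\right) $$

This reduces the variance of the gradient estimate relative to single-example SGD while remaining much cheaper than a full-batch gradient step. In the implementation below, the gradient functions already average over whatever batch of data they are given.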

In [68]:
def mb_sgd(w_0, x, t, gradient, optimizer, 
           batch_size=32, nepochs = 2, epsilon=1e-8):
    """Generic stochastic gradient descent optimization.
    Args:
        w_0: Initial weights (numpy array).
        gradient: Function to compute the gradient.
        optimizer: An optimizer object with a step method.
        batch_size: Size of each mini-batch.
        epsilon: Convergence threshold.
    Returns:
        path: Array of weights at each iteration."""
    w = w_0
    path = [w]
    for e in range(nepochs):
        random_indices = np.random.permutation(len(t))
        for start in range(0, len(t), batch_size):
            end = start + batch_size
            batch_indices = random_indices[start:end]
            g = gradient(w, x[batch_indices], t[batch_indices])
            w = optimizer.step(w, g)
            path.append(w)
        if np.linalg.norm(w - path[-2]) < epsilon: break
    return np.array(path)
In [69]:
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
nepochs = 10
adam = AdamOptimizer(
    learning_rate=lambda t: 0.1, 
    beta1=0.9, 
    beta2=0.999)
momentum = MomentumUpdate(
    learning_rate=lambda t: 0.01, 
    momentum=0.5)
powerlaw = power_law_learning_rate(initial_lr=0.1, T=nepochs*len(sine['y']), s=10, c=1)
vanilla = VanillaOptimizer(learning_rate=powerlaw)

path = mb_sgd(
    w0, sine['x'], sine['y'], grad_sine_MSE, 
    adam, 
    batch_size=32,
    nepochs=nepochs)

errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
    sine['w0'], sine['w1'], sine['error'], 
    sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()

fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()