import plotly.io as pio; pio.renderers.default = "notebook_connected"
Lecture 13: Gradient Descent and SGD – CS 189, Fall 2025
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
from ipywidgets import HBox
import numpy as np
# make the images folder if it doesn't exist
import os
if not os.path.exists("images"):
os.makedirs("images")
Plotting Code You Can Safely Ignore (but Need to Run)
This lecture has many complex visualizations, and to keep the rest of the code short I have put the visualization code here. Much of this code is out of scope for the course, but feel free to look through it if you are interested.
def make_plot_grid(figs, rows, cols):
"""Create a grid of figures with Plotly figures."""
from plotly.subplots import make_subplots
def get_trace_type(fig):
for trace in fig.data:
if trace.type == 'surface':
return 'surface' # required for go.Surface
elif trace.type.startswith('scatter3d') or trace.type.startswith('mesh3d'):
return 'scene' # 3D scene
return 'xy' # default 2D
specs = [[{'type': get_trace_type(fig)} for fig in figs[i:i+cols]] for i in range(0, len(figs), cols)]
fig_grid = make_subplots(rows=rows, cols=cols, specs=specs,
subplot_titles=[fig.layout.title.text for fig in figs])
for i, fig in enumerate(figs):
fig_grid.add_traces(fig.data, rows=(i//cols) + 1, cols=(i%cols) + 1 )
return fig_grid
def plot_lr_predictions(w, cancer):
"""Plot predictions of the logistic model."""
cancer = cancer.copy()
cancer['logistic_pred'] = np.where(
logistic_model(w, cancer[['mean radius', 'mean texture']].values) > 0.5,
'Pred Malignant', 'Pred Benign'
)
# Create a scatter plot with logistic predictions
fig = px.scatter(cancer, x='mean radius', y='mean texture',
symbol='logistic_pred', color='target',
symbol_sequence=[ "circle-open", "cross"])
for (i,t) in enumerate(fig.data): t.legendgroup = str(i)
# decision boundary
xs = np.linspace(cancer['mean radius'].min(), cancer['mean radius'].max(), 100)
decision_boundary = -(w[0] * xs ) / w[1]
fig.add_scatter(x=xs, y=decision_boundary, mode='lines',
name='Decision Boundary', legendgroup='Decision Boundary',
line=dict(color='black', width=2, dash='dash', ))
# probability surface
ys = np.linspace(cancer['mean texture'].min(), cancer['mean texture'].max(), 100)
X, Y = np.meshgrid(xs, ys)
Z = logistic_model(w, np.c_[X.ravel(), Y.ravel()]).reshape(X.shape)
fig.add_contour(x=xs, y=ys, z=Z,
colorscale='Matter_r', opacity=0.5,
name='Probability Surface',
colorbar=dict(x=1.05, y=0.3, len=0.75))
fig.update_layout(title=f'w=({w[0]:.2f}, {w[1]:.2f})',
xaxis_range=[xs.min(), xs.max()], yaxis_range=[ys.min(), ys.max()],
xaxis_title='Mean Radius (scaled)', yaxis_title='Mean Texture (scaled)',
width=800, height=600)
return fig
def plot_loss(w1, w2, error, ncontours=50):
surf_fig = go.Figure()
surf_fig.add_surface(z=error, x=w1, y=w2,
colorscale='Viridis_r', opacity=0.7, showscale=False,
contours=dict(z=dict(show=True, highlightcolor="white",
start=error.min(), end=error.max(),
size=(error.max()-error.min())/ncontours)))
surf_fig.update_layout(title="Loss Surface")
contour_fig = go.Figure()
contour_fig.add_contour(x=w1.flatten(), y=w2.flatten(), z=error.flatten(),
colorscale='Viridis_r', opacity=0.7,
contours=dict(start=error.min(), end=error.max(),
size=(error.max()-error.min())/ncontours),
colorbar=dict(x=1.05, y=0.35, len=0.75))
contour_fig.update_layout(title="Loss Contours")
fig = make_plot_grid([surf_fig, contour_fig], 1, 2).update_layout(height=800)
fig.update_layout(scene=dict(xaxis_title='w1', yaxis_title='w2', zaxis_title='Loss', aspectmode='cube'))
fig.update_layout(xaxis_range=[w1.min(), w1.max()], yaxis_range=[w2.min(), w2.max()],
xaxis_title='w1', yaxis_title='w2')
return fig
def plot_gradient(w1, w2, error, dw1, dw2, scale=1.0):
fig = plot_loss(w1, w2, error)
fig.add_trace(
go.Cone(
x=w1.flatten(), y=w2.flatten(), z=np.zeros_like(error).flatten(), # Ground plane
u=dw1.flatten(), v=dw2.flatten(), w=np.zeros_like(error).flatten(), # No vertical component
sizeref=2, anchor="tail", showscale=False
), 1,1)
contour_fig = ff.create_quiver(
x=w1.flatten(), y=w2.flatten(), u=dw1.flatten(), v=dw2.flatten(),
line_width=2, line_color="white",
scale = scale, arrow_scale=.2, showlegend=False)
fig.add_traces(contour_fig.data, rows=1, cols=2)
return fig
def add_solution_path(fig, errors, ws):
s = np.linspace(0, 1, len(ws))
fig.add_scatter3d(x=ws[:, 0], y=ws[:, 1], z=errors, marker_color=s, marker_size=5,
mode='lines+markers', line=dict(color='black', width=2), opacity=0.5,
name='Gradient Descent Path', legendgroup='Gradient Descent',
row=1, col=1)
fig.add_scatter(x=ws[:, 0], y=ws[:, 1], marker_color=s,
mode='lines+markers', line=dict(color='black', width=2), opacity=0.5,
name='Gradient Descent Path', legendgroup='Gradient Descent',
showlegend=False,
row=1, col=2)
fig.add_scatter3d(x=[ws[-1, 0]], y=[ws[-1, 1]], z=[errors[-1]],
mode='markers', marker=dict(color='red', size=10),
name='Final Solution', legendgroup='Final Solution',
row=1, col=1)
fig.add_scatter(x=[ws[-1, 0]], y=[ws[-1, 1]],
mode='markers', marker=dict(color='red', size=20),
name='Final Solution', legendgroup='Final Solution',
showlegend=False,
row=1, col=2)
return fig
Lecture 12 Setup
In the previous lecture we visualized the loss surfaces and derived gradient descent. In this part of the notebook we set up the data, loss, and gradient descent code that we will use to visualize gradient descent and stochastic gradient descent.
The Logistic Regression Problem
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
cancer_dict = datasets.load_breast_cancer(as_frame=True)
cancer_df = pd.DataFrame(cancer_dict.data, columns=cancer_dict.feature_names)
cancer_df['target'] = cancer_dict.target.astype(str)
cancer_df = cancer_df[['mean radius', 'mean texture', 'target']].dropna()
scaler = StandardScaler()
cancer_df[['mean radius', 'mean texture']] = scaler.fit_transform(cancer_df[['mean radius', 'mean texture']])
print("The dataset:", cancer_df.shape)
display(cancer_df.head())
cancer = dict()
cancer['npts'] = 30
cancer['w1'], cancer['w2'] = np.meshgrid(
np.linspace(-10, 1, cancer['npts']),
np.linspace(-5, 1.3, cancer['npts'])
)
cancer['ws'] = np.stack([cancer['w1'].flatten(), cancer['w2'].flatten()]).T
cancer['x'] = cancer_df[['mean radius', 'mean texture']].values
cancer['t'] = cancer_df['target'].values.astype(float)
The dataset: (569, 3)
|   | mean radius | mean texture | target |
|---|---|---|---|
| 0 | 1.097064 | -2.073335 | 0 |
| 1 | 1.829821 | -0.353632 | 0 |
| 2 | 1.579888 | 0.456187 | 0 |
| 3 | -0.768909 | 0.253732 | 0 |
| 4 | 1.750297 | -1.151816 | 0 |
def sigmoid(z):
"""Sigmoid function."""
return 1 / (1 + np.exp(-z))
def logistic_model(w, x):
"""Logistic model for binary classification."""
z = w @ x.T
return sigmoid(z)
def NLL(w):
"""Negative log-likelihood for logistic regression."""
x = cancer['x']
t = cancer['t']
z = w @ x.T
# return -np.mean(t * np.log(logistic_model(w, x)) + (1 - t) * np.log(1 - logistic_model(w, x)))
# more numerically stable version
# np.mean(np.log(1+ np.exp(z)) - t * z)
# stable softplus: log(1+exp(z))
softplus_z = np.logaddexp(0, z)
return np.mean(softplus_z - t * z)
def grad_NLL(w, x=cancer['x'], t=cancer['t']):
"""Compute the gradient of the negative log-likelihood."""
p = logistic_model(w, x)
grad = np.mean((p - t).reshape(-1, 1) * x, 0)
return grad
cancer['error'] = np.array([NLL(w)
for w in cancer['ws']])
cancer['error'] = cancer['error'].reshape(cancer['w1'].shape)
# guesses = np.array([[-1., -1.],
# [-2., -4.]])
best_ind = np.argmin(cancer['error'])
cancer['grid_best'] = np.array([cancer['w1'].flatten()[best_ind],
cancer['w2'].flatten()[best_ind]])
Computing the gradient fields and visualizing gradient descent.
(cancer['dw1'], cancer['dw2']) = np.array(
[grad_NLL(w) for w in cancer['ws']]).T
cancer['dw1'] = cancer['dw1'].reshape(cancer['w1'].shape)
cancer['dw2'] = cancer['dw2'].reshape(cancer['w1'].shape)
The Squared Error Loss on a Non-Linear Model
sine = dict()
np.random.seed(42)
sine['n'] = 200
# Generate some random data
sine['x'] = np.random.rand(sine['n']) * 2.5 * np.pi
sine['x'] = np.sort(sine['x']) # sort for easier plotting
sine['y'] = np.sin(1.1 + 2.5 * sine['x']) + 0.5 * np.random.randn(sine['n'])
sine_df = pd.DataFrame({'x': sine['x'], 'y': sine['y']})
def sine_model(w, x):
return np.sin(w[0] + x * w[1])
def sine_MSE(w):
x = sine['x']
y = sine['y']
y_hat = sine_model(w, x)
return np.mean((y - y_hat) ** 2)
def grad_sine_MSE(w, x=sine['x'], y=sine['y']):
"""Compute the gradient of the negative log-likelihood."""
y_hat = sine_model(w, x)
grad_w0 = -2 * np.mean((y - y_hat) * np.cos(w[0] + w[1] * x))
grad_w1 = -2 * np.mean((y - y_hat) * x * np.cos(w[0] + w[1] * x))
return np.array([grad_w0, grad_w1])
sine['guesses'] = np.array([[0, 2], [2, 3], [0, 3.5]])
sine['xhat'] = np.linspace(sine_df['x'].min(), sine_df['x'].max(), 100)
sine['pred_df'] = pd.DataFrame({'x': sine['xhat']})
for w in sine['guesses']:
sine['pred_df'][f'yhat(w={w})'] = sine_model(w, sine['xhat'])
sine['pred_df'].head()
|   | x | yhat(w=[0. 2.]) | yhat(w=[2. 3.]) | yhat(w=[0. 3.5]) |
|---|---|---|---|---|
| 0 | 0.043371 | 0.086632 | 0.847619 | 0.151215 |
| 1 | 0.121225 | 0.240082 | 0.701797 | 0.411673 |
| 2 | 0.199080 | 0.387723 | 0.517864 | 0.641752 |
| 3 | 0.276935 | 0.525982 | 0.305809 | 0.824474 |
| 4 | 0.354790 | 0.651515 | 0.077147 | 0.946355 |
sine['npts'] = 30
sine['w0'], sine['w1'] = np.meshgrid(
np.linspace(-1.5, 3, sine['npts']), np.linspace(1, 4, sine['npts']))
# combine w1 and w2 into a single tensor
sine['ws'] = np.stack([sine['w0'].flatten(), sine['w1'].flatten()]).T
sine['error'] = np.array([sine_MSE(w) for w in sine['ws']]).reshape(sine['w0'].shape)
ind = np.argmin(sine['error'])
sine['grid_best'] = sine['ws'][ind,:]
sine['grid_best_error'] = sine['error'].flatten()[ind]
print(f"Best weights: {sine['grid_best']}, with error: {sine['grid_best_error']}")
Best weights: [1.29310345 2.44827586], with error: 0.23832770254453578
(sine['dw0'], sine['dw1']) = np.array(
[grad_sine_MSE(w) for w in sine['ws']]).T
sine['dw0'] = sine['dw0'].reshape(sine['w1'].shape)
sine['dw1'] = sine['dw1'].reshape(sine['w1'].shape)
The Gradient Descent Algorithm
Here we implement the most basic version of gradient descent. This version uses a fixed step size and does not use any fancy tricks like momentum or adaptive step sizes (which we will see soon).
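The update rule implemented below is the standard fixed-step gradient descent iteration
$$ w^{(t+1)} = w^{(t)} - \eta\, \nabla E\!\left(w^{(t)}\right), $$
where $\eta$ is the learning rate; we stop when the step $\left\| w^{(t+1)} - w^{(t)} \right\|$ falls below a small threshold $\epsilon$ or after a maximum number of epochs.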
def gradient_descent(w_0, gradient,
learning_rate=1, nepochs=10, epsilon=1e-6):
"""Basic gradient descent algorithm.
Args:
w_0: Initial weights (numpy array).
gradient: Function to compute the gradient.
learning_rate: Step size for each iteration.
nepochs: Maximum number of iterations.
epsilon: Convergence threshold.
Returns:
path: Array of weights at each iteration."""
w_old = w_0
path = [w_old]
for e in range(nepochs):
w = w_old - learning_rate * gradient(w_old)
path.append(w)
if np.linalg.norm(w - w_old) < epsilon: break
w_old = w
return np.array(path)
Gradient Descent Applied to Logistic Regression
# w0 = np.array([-10., -5.])
# w0 = np.array([-1., 2.])
w0 = np.array([0., 0])
path = gradient_descent(w0,
grad_NLL,
learning_rate=1,
nepochs=100)
errors = [NLL(w) for w in path]
fig = plot_gradient(
cancer['w1'], cancer['w2'], cancer['error'],
cancer['dw1'], cancer['dw2'], scale=2)
add_solution_path(fig, errors, path)
Gradient Descent Applied to Sine Model
# w0 = np.array([1.2, 2.])
# w0 = np.array([.5, 2.5])
w0 = np.array([2, 2])
path = gradient_descent(w0,
grad_sine_MSE,
learning_rate=.01,
nepochs=20)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
sine['w0'], sine['w1'], sine['error'],
sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)
Loss Curves
In practice, we are typically unable to visualize the loss surface. Instead, we can plot the loss value as a function of iteration number. This is called a loss curve. Here we plot the loss curve for gradient descent on our sine model.
def make_loss_curve(path, error_func):
"""Make a loss curve from a path of weights."""
errors = [error_func(w) for w in path]
fig = px.line(x=np.arange(len(errors)), y=errors,
labels={'x': 'Iteration (Gradient Steps)', 'y': 'Loss (Error)'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
return fig
w0 = np.array([2.5, 1.9])
path = gradient_descent(w0,
grad_sine_MSE,
learning_rate=.05,
nepochs=50)
fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(sine['w0'], sine['w1'], sine['error'],
sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)
fig.show()
w0 = np.array([-10., -5.])
# w0 = np.array([-1., 2.])
# w0 = np.array([0., 0])
path = gradient_descent(w0,
grad_NLL,
learning_rate=10,
nepochs=100)
fig_loss = make_loss_curve(path, NLL)
fig_loss.show()
errors = [NLL(w) for w in path]
fig = plot_gradient(
cancer['w1'], cancer['w2'], cancer['error'],
cancer['dw1'], cancer['dw2'], scale=2)
add_solution_path(fig, errors, path)
Convergence on the Parabola
x = np.linspace(-10, 10, 100)
y = x**2 + 1
fig = px.line(x=x, y=y, labels={'x': 'x', 'y': 'f(x)'})
fig.update_traces(line_width=5)
def grad_f(x):
return 2 * x
w0 = 2
path = gradient_descent(w0, grad_f, learning_rate=1.1, nepochs=10)
fig.add_scatter(x=path, y=path**2 + 1, mode='markers+lines',
marker=dict(size=10, color='red'),
line=dict(color='black', width=2, dash="dash"),
name='Gradient Descent Path')
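For $f(x) = x^2 + 1$ each gradient step multiplies the current point by $(1 - 2\eta)$, so the iterates converge only when $|1 - 2\eta| < 1$, i.e. $0 < \eta < 1$; the run above with $\eta = 1.1$ therefore diverges. A minimal sketch comparing a few learning rates (reusing gradient_descent and grad_f from above):
# Compare the contraction factor |1 - 2*eta| with the observed behavior.
# Factors below 1 converge; factors above 1 diverge.
for eta in [0.1, 0.5, 0.9, 1.1]:
    p = gradient_descent(2.0, grad_f, learning_rate=eta, nepochs=10)
    print(f"eta={eta:4}: |1 - 2*eta| = {abs(1 - 2*eta):.2f}, "
          f"|x| after {len(p)-1} steps = {abs(p[-1]):.4f}")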
The Second Order Structure
Here we examine the second-order structure of the loss function through a Taylor expansion.
The Hessian of the Logistic Regression Model
\begin{align} \text{H} &= \begin{bmatrix} \frac{\partial^2 E}{\partial w_0^2} & \frac{\partial^2 E}{\partial w_0 \partial w_1} \\ \frac{\partial^2 E}{\partial w_1 \partial w_0} & \frac{\partial^2 E}{\partial w_1^2} \end{bmatrix}\\ \end{align}
We can start with the gradient:
\begin{align*} \frac{\partial E}{\partial w_j} &= \frac{1}{N} \sum_{i=1}^N \left( p(t_i = 1 \mid x_i) - t_i \right) x_{ij} = \frac{1}{N} \sum_{i=1}^N \left( \sigma(w^T x_i) - t_i \right) x_{ij} \end{align*}
Differentiating again with respect to $w_k$, using the chain rule and the identity $\sigma'(z) = \sigma(z)\left(1 - \sigma(z)\right)$: \begin{align*} \frac{\partial^2 E}{\partial w_k \partial w_j} &= \frac{1}{N} \sum_{i=1}^N \frac{\partial}{\partial w_k}\left( \sigma(w^T x_i) - t_i \right) x_{ij}\\ &= \frac{1}{N} \sum_{i=1}^N x_{ij} \frac{\partial}{\partial w_k}\sigma(w^T x_i) \\ &= \frac{1}{N} \sum_{i=1}^N x_{ij}\, \sigma(w^T x_i)\left(1 - \sigma(w^T x_i)\right) x_{ik} \end{align*}
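Stacking the inputs into a design matrix $X$ with rows $x_i^T$ and letting $S = \operatorname{diag}\!\big(\sigma(w^T x_i)\,(1 - \sigma(w^T x_i))\big)$, the Hessian can be written compactly as
$$ H = \frac{1}{N} X^T S X, $$
which is exactly what the code below computes.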
def hessian_NLL(w):
"""Compute the Hessian of the negative log-likelihood."""
x = cancer['x']
p = logistic_model(w, x)
hessian = (x.T @ np.diag(p * (1 - p)) @ x ) / len(p)
return hessian
The Hessian of the Sine Regression Model
\begin{align} \text{H} &= \begin{bmatrix} \frac{\partial^2 E}{\partial w_0^2} & \frac{\partial^2 E}{\partial w_0 \partial w_1} \\ \frac{\partial^2 E}{\partial w_1 \partial w_0} & \frac{\partial^2 E}{\partial w_1^2} \end{bmatrix}\\ \end{align}
We start with the gradient; calculating each term we get:
\begin{align*} \frac{\partial E}{\partial w_0} &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) \end{align*}
\begin{align*} \frac{\partial E}{\partial w_1} &= -\frac{2}{N} \sum_{n=1}^N \left(y_n - \sin\left(w_0 + w_1 x_n \right)\right) \cos\left(w_0 + w_1 x_n \right) x_n \end{align*}
Using the product rule and chain rule, we can compute the second derivatives of the loss function with respect to the parameters $w_0$ and $w_1$. \begin{align} \frac{\partial^2 E}{\partial w_0^2} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) \right]\\ \frac{\partial^2 E}{\partial w_1^2} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) x_n^2 + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) x_n^2 \right]\\ \frac{\partial^2 E}{\partial w_0 \partial w_1} &= \frac{2}{N} \sum_{n=1}^N \left[ \cos^2(w_0 + w_1 x_n) x_n + (y_n - \sin(w_0 + w_1 x_n)) \sin(w_0 + w_1 x_n) x_n \right] \end{align}
def hessian_sine_MSE(w):
"""Compute the Hessian of the negative log-likelihood."""
x = sine['x']
y = sine['y']
dw0dw0 = 2 * np.mean(np.cos(w[0] + w[1] * x)**2 +
(y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
dw0dw1 = 2 * np.mean(x * np.cos(w[0] + w[1] * x)**2 +
x * (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
dw1dw1 = 2 * np.mean(x**2 * np.cos(w[0] + w[1] * x)**2 +
(x**2) * (y - np.sin(w[0] + w[1] * x)) * np.sin(w[0] + w[1] * x))
hessian = np.array([[dw0dw0, dw0dw1], [dw0dw1, dw1dw1]])
return hessian
We can also derive the gradient using a symbolic algebra library. This is a powerful technique that allows us to compute the gradient of a function without having to derive it by hand. We will use the sympy library to do this.
# unfortunately on colab we need to upgrade sympy to get the needed functionality
# !pip install --upgrade sympy
# import sympy as sp
# # define our symbols
# w0, w1, x, y = sp.symbols('w0 w1 x y')
# # Define a symbolic expression for the error
# E = (y - sp.sin(w0 + w1 * x))**2
# # Compute the gradient of E with respect to w0 and w1
# gE = [sp.diff(E, var) for var in (w0, w1)]
# gE
# hessian_E = sp.hessian(E, (w0, w1))
# hessian_E
Using sympy to do everything algebraically we get the same result.
# n = sp.symbols('n', integer=True) # number of data points (symbolic length)
# i = sp.Idx('i', n) # index variable for summation
# w0, w1 = sp.symbols('w0 w1') # weights
# x = sp.IndexedBase('x', shape=(n,)) # indexed variable for x data
# y = sp.IndexedBase('y', shape=(n,)) # indexed variable for y data
# E_i = (y[i] - sp.sin(w0 + w1 * x[i]))**2
# E = sp.summation(E_i, (i, 0, n - 1)) / n
# H = sp.hessian(E, (w0, w1))
# Hfun = sp.lambdify((w0, w1, x, y, n), H)
# def hessian_sine_MSE2(w):
# return np.array(Hfun(w[0], w[1], sine['x'], sine['y'], len(sine['x'])))
# hessian_sine_MSE(np.array([1.1, 2.5]))
# hessian_sine_MSE2(np.array([1.1, 2.5]))
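As a runnable alternative, here is a minimal sketch that uses only the scalar per-example error (so it should not need the sympy upgrade mentioned above): take the Hessian of a single example's squared error symbolically, average it over the data, and compare against the hand-derived hessian_sine_MSE. Since differentiation is linear, the mean of the per-example Hessians equals the Hessian of the mean error.
import sympy as sp

# Symbolic per-example squared error E_i = (y - sin(w0 + w1*x))^2.
# Use *_s names to avoid clobbering the numpy variables used elsewhere.
w0_s, w1_s, x_s, y_s = sp.symbols('w0 w1 x y')
E_i = (y_s - sp.sin(w0_s + w1_s * x_s))**2

# Symbolic 2x2 Hessian of the per-example error, turned into a numeric function.
H_sym = sp.hessian(E_i, (w0_s, w1_s))
H_fn = sp.lambdify((w0_s, w1_s, x_s, y_s), H_sym, modules='numpy')

# Average the per-example Hessians over the dataset and compare.
w_test = np.array([1.1, 2.5])
H_avg = np.mean([np.array(H_fn(w_test[0], w_test[1], xi, yi), dtype=float)
                 for xi, yi in zip(sine['x'], sine['y'])], axis=0)
print("Symbolic (averaged over data):\n", H_avg)
print("Hand-derived hessian_sine_MSE:\n", hessian_sine_MSE(w_test))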
Taylor Expansion of the Loss
The Taylor expansion of a function $f(x)$ around a point $x=a$ is given by:
$$ f(x) = f(a) + \nabla f(a)^T (x - a) + \frac{1}{2} (x - a)^T H(a) (x - a) + \ldots $$
def taylor_loss(w, w_star, L_star, g_star, H_star):
delta = w - w_star
return (L_star + delta @ g_star + 1/2 * delta.T @ H_star @ delta)
Applied to Logistic Regression
# w_star = np.array([-2., -2.])
w_star = cancer['grid_best']
L_star = NLL(w_star)
g_star = grad_NLL(w_star)
H_star = hessian_NLL(w_star)
s,u = np.linalg.eigh(H_star)
print("w_star:", w_star)
print("L_star:", L_star)
print("g_star:", g_star)
print("H_star:\n", H_star)
print("Eigenvalues of H_star:", s)
print("Eigegenvectors of H_star:\n", u)
w_star: [-3.93103448 -0.87241379]
L_star: 0.2757763227300472
g_star: [0.00030902 0.0013768 ]
H_star:
 [[ 0.01454776 -0.00817985]
 [-0.00817985  0.08057159]]
Eigenvalues of H_star: [0.01354943 0.08156991]
Eigenvectors of H_star:
 [[-0.99263445 -0.12114806]
 [-0.12114806  0.99263445]]
Examining the Hessian at the optimum, we see that it is positive definite (both eigenvalues are positive). This is consistent with the fact that the logistic regression negative log-likelihood is convex.
fig = plot_gradient(cancer['w1'], cancer['w2'],
cancer['error'], cancer['dw1'], cancer['dw2'])
cancer['taylor_loss'] = np.array([
taylor_loss(w, w_star, L_star, g_star, H_star)
for w in cancer['ws']]).reshape(cancer['w1'].shape)
fig.add_surface(z=cancer['taylor_loss'], x=cancer['w1'], y=cancer['w2'],
colorscale='plasma_r', opacity=0.5, showscale=False,
contours=dict(z=dict(show=True, highlightcolor="white",
start=cancer['taylor_loss'].min(), end=cancer['taylor_loss'].max(),
size=(cancer['taylor_loss'].max()-cancer['taylor_loss'].min())/50)),
row=1, col=1)
fig.update_layout(scene=dict(zaxis=dict(range=[0, 3])))
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
display(fig)
We can visualize the eigenvectors of the Hessian at the optimal point on the quadratic approximation. The eigenvectors give us the directions of curvature of the loss function. The eigenvalues give us the amount of curvature in each direction.
Writing $w - w^\star = \sum_i \alpha_i u_i$ in the eigenbasis of the Hessian, the contours of the quadratic approximation are the ellipses $$ \sum_i \lambda_i \alpha_i^2 = \text{const}. $$ Solving for the extent along the $i^{th}$ eigenvector gives $$ \alpha_i = \pm \sqrt{\frac{\text{const}}{\lambda_i}}, $$ so the contours stretch furthest along the directions of smallest curvature (smallest eigenvalues).
fig = go.Figure()
fig.add_contour(z=cancer['taylor_loss'].flatten(), x=cancer['w1'].flatten(),
y=cancer['w2'].flatten(),
colorscale='viridis_r', opacity=0.5,
contours=dict(start=cancer['taylor_loss'].min(), end=cancer['taylor_loss'].max(),
size=(cancer['taylor_loss'].max()-cancer['taylor_loss'].min())/50),
colorbar=dict(x=1.05, y=0.3, len=0.75))
scaling = 0.5
lam, U = np.linalg.eigh(H_star)
scale = scaling / np.sqrt(np.abs(lam))
cx, cy = cancer['grid_best']
for i, (lami, ui, si) in enumerate(zip(lam, U.T, scale), start=1):
fig.add_scatter(
x=[cx, cx + si*ui[0]],
y=[cy, cy + si*ui[1]],
mode='lines+markers',
line=dict(width=2),
name=f'u{i} (λ={lami:.3g})'
)
fig.add_scatter(
x=[cx, cx - si*ui[0]],
y=[cy, cy - si*ui[1]],
mode='lines',
line=dict(width=2, dash='dot'),
showlegend=False
)
fig.update_layout(title="Taylor Approximation of Loss",
xaxis_title='w1', yaxis_title='w2',
height=600, width=1200)
# fig.write_image("images/taylor_approx_loss_contours.pdf", width=1200, height=600)
fig
Applied to Sine Regression
# w_star = np.array([-2., -2.])
sine['w_star'] = sine['grid_best']
sine['L_star'] = sine_MSE(sine['w_star'])
sine['g_star'] = grad_sine_MSE(sine['w_star'])
sine['H_star'] = hessian_sine_MSE(sine['w_star'])
sine['lam'], sine['U'] = np.linalg.eigh(sine['H_star'])
print("w_star:", sine['w_star'])
print("L_star:", sine['L_star'])
print("g_star:", sine['g_star'])
print("H_star:\n", sine['H_star'])
print("Eigenvalues of H_star:", sine['lam'])
print("Eigenvectors of H_star:\n", sine['U'])
w_star: [1.29310345 2.44827586]
L_star: 0.23832770254453578
g_star: [ 0.02639336 -0.10396998]
H_star:
 [[ 0.94921519  3.62350418]
 [ 3.62350418 19.09410494]]
Eigenvalues of H_star: [ 0.25236948 19.79095065]
Eigenvectors of H_star:
 [[-0.98200554  0.18885209]
 [ 0.18885209  0.98200554]]
Examining the Hessian at the grid optimum, we see that it is positive definite (both eigenvalues are positive), so this point is a local minimum. Unlike the logistic regression loss, however, this sine-model loss surface is not convex, so the positive curvature holds only locally.
fig = plot_gradient(sine['w0'], sine['w1'],
sine['error'], sine['dw0'], sine['dw1'], scale=0.1)
sine['taylor_loss'] = np.array([
taylor_loss(w, sine['w_star'], sine['L_star'], sine['g_star'], sine['H_star'])
for w in sine['ws']]).reshape(sine['w1'].shape)
fig.add_surface(z=sine['taylor_loss'], x=sine['w0'], y=sine['w1'],
colorscale='plasma_r', opacity=0.5, showscale=False,
contours=dict(z=dict(show=True, highlightcolor="white",
start=sine['taylor_loss'].min(), end=sine['taylor_loss'].max(),
size=(sine['taylor_loss'].max()-sine['taylor_loss'].min())/50)),
row=1, col=1)
fig.update_layout(scene=dict(zaxis=dict(range=[0, 3])))
# fig.write_image("images/loss_surface_3d_with_gradients.pdf", width=800, height=800)
display(fig)
fig = go.Figure()
fig.add_contour(z=sine['taylor_loss'].flatten(), x=sine['w0'].flatten(),
y=sine['w1'].flatten(),
colorscale='viridis_r', opacity=0.5,
contours=dict(start=sine['taylor_loss'].min(), end=sine['taylor_loss'].max(),
size=(sine['taylor_loss'].max()-sine['taylor_loss'].min())/50),
colorbar=dict(x=1.05, y=0.3, len=0.75))
scaling = 0.5
scale = scaling / np.sqrt(np.abs(sine['lam']))
cx, cy = sine['grid_best']
for i, (lami, ui, si) in enumerate(zip(sine['lam'], sine['U'].T, scale), start=1):
fig.add_scatter(
x=[cx, cx + si*ui[0]],
y=[cy, cy + si*ui[1]],
mode='lines+markers',
line=dict(width=2),
name=f'u{i} (λ={lami:.3g})'
)
fig.add_scatter(
x=[cx, cx - si*ui[0]],
y=[cy, cy - si*ui[1]],
mode='lines',
line=dict(width=2, dash='dot'),
showlegend=False
)
fig.update_layout(title="Taylor Approximation of Loss",
xaxis_title='w1', yaxis_title='w2',
height=600, width=1200)
# fig.write_image("images/taylor_approx_loss_contours.pdf", width=1200, height=600)
fig
Oscillating Loss Functions
Here we visualize what happens when we have a poorly conditioned quadratic loss function. This can happen when the Hessian has very different eigenvalues. In this case, gradient descent can oscillate and take a long time to converge.
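To make this concrete, consider the quadratic used below, $E(w) = w^T H w$ with $H = \mathrm{diag}(0.1, 1)$, so $\nabla E(w) = 2 H w$ and each gradient step acts on the two coordinates independently:
$$ w^{(t+1)} = \left(I - 2\eta H\right) w^{(t)} \quad\Longrightarrow\quad w_1^{(t+1)} = (1 - 0.2\,\eta)\, w_1^{(t)}, \qquad w_2^{(t+1)} = (1 - 2\eta)\, w_2^{(t)}. $$
With $\eta = 1$ the steep direction has multiplier $1 - 2 = -1$, so $w_2$ bounces between $\pm w_2^{(0)}$ forever, while the flat direction shrinks slowly by a factor of $0.8$ per step. With $\eta = 0.9$ both multipliers have magnitude less than one ($0.82$ and $-0.8$), so the iterates converge, but still oscillate along the steep direction.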
H_bad = np.array([[.1, 0], [0, 1]])
def quad(w):
"""A poorly conditioned quadratic function."""
return np.sum((w @ H_bad) * w,1)
def grad_quad(w):
"""Gradient of a poorly conditioned quadratic function."""
return np.array(w @ H_bad * 2)
w1,w2 = np.meshgrid(np.linspace(-5, 5, 30), np.linspace(-5, 5, 30))
ws = np.hstack([w1.reshape(-1, 1), w2.reshape(-1, 1)])
error = quad(ws).reshape(w1.shape)
contour = go.Contour(x=w1.flatten(), y=w2.flatten(), z=error.flatten(), colorscale='Viridis_r',
contours=dict(start=0, end=20, size=.5))
go.Figure(data=contour)
Suppose we start at the point (-4, 0) and use a learning rate of 1. This is an ideal starting point because the gradient points directly toward the minimum, and we can visualize the path taken by gradient descent on the loss surface.
w0 = np.array([-4., 0])
path = gradient_descent(w0, grad_quad, learning_rate=1, nepochs=50)
fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
mode='lines+markers', line=dict(color='black', width=2),
marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
name='Gradient Descent Path', legendgroup='Gradient Descent',
showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)
What happens if we start at a different point with a learning rate of 1?
w0 = np.array([-4.,-2.])
path = gradient_descent(w0, grad_quad, learning_rate=1, nepochs=50)
fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
mode='lines+markers', line=dict(color='black', width=2),
marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
name='Gradient Descent Path', legendgroup='Gradient Descent',
showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)
Or what if we decrease the learning rate slightly to 0.9?
w0 = np.array([-4.,-2.])
path = gradient_descent(w0, grad_quad, learning_rate=.9, nepochs=50)
fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
mode='lines+markers', line=dict(color='black', width=2),
marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
name='Gradient Descent Path', legendgroup='Gradient Descent',
showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)
Gradient Descent with Momentum
Here we implement gradient descent with momentum. This is a simple modification to the basic gradient descent algorithm that can help with oscillations and speed up convergence.
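The update implemented below maintains a velocity vector $v$ that accumulates an exponentially decaying sum of past gradients:
$$ v^{(t+1)} = \mu\, v^{(t)} - \eta\, \nabla E\!\left(w^{(t)}\right), \qquad w^{(t+1)} = w^{(t)} + v^{(t+1)}, $$
where $\mu \in [0, 1)$ is the momentum factor. Setting $\mu = 0$ recovers basic gradient descent.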
def gd_momentum(w_0, gradient,
learning_rate=1, nepochs=10, epsilon=1e-6, momentum=0.9):
"""Gradient descent with momentum.
Args:
w_0: Initial weights (numpy array).
gradient: Function to compute the gradient.
learning_rate: Step size for each iteration.
nepochs: Maximum number of iterations.
epsilon: Convergence threshold.
momentum: Momentum factor.
Returns:
path: Array of weights at each iteration."""
w_old = w_0
path = [w_old]
v = np.zeros_like(w_old)
for e in range(nepochs):
g = gradient(w_old)
v = momentum * v - learning_rate * g
w = w_old + v
path.append(w)
if np.linalg.norm(w - w_old) < epsilon: break
w_old = w
return np.array(path)
Let's apply gradient descent with momentum to the poorly conditioned quadratic loss function from before. We will start at the point (-4,-2) and use a learning rate of .9 and a momentum factor of .3. We can visualize the path taken by gradient descent with momentum on the loss surface.
w0 = np.array([-4.,-2.])
path = gd_momentum(w0, grad_quad,
learning_rate=.9,
momentum=0.3,
nepochs=50)
fig = go.Figure()
fig.add_trace(contour)
#add arrows to lines
fig.add_scatter(x=path[:, 0], y=path[:, 1],
mode='lines+markers', line=dict(color='black', width=2),
marker=dict(size=10, color='black', symbol= "arrow-bar-up", angleref="previous"),
name='Gradient Descent Path', legendgroup='Gradient Descent',
showlegend=False)
fig.update_layout(margin=dict(l=5, r=5, t=5, b=5))
# fig.write_image("images/flat_quadratic.pdf", width=1000, height=500)
display(fig)
We can also try momentum on our messy, non-convex least squares regression problem (the non-linear sine model). Try setting momentum to 0 and see what happens.
w0 = np.array([2.2, 2.2])
#w0 = np.array([.5, 2.5])
w0 = np.array([1.5, 2])
path = gd_momentum(w0,
grad_sine_MSE,
learning_rate=.1,
momentum=0.9,
nepochs=50)
fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
sine['w0'], sine['w1'], sine['error'],
sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)
Learning Rate Schedules
Linear Schedule
$$ \eta^{(t)} = \eta_0 \left(1 - \frac{t}{T}\right) + \eta_{T} \frac{t}{T} $$
The constant $T$ is the total number of epochs.
Hyperparameters:
- $\eta_0$: initial learning rate
- $\eta_T$: final learning rate
def linear_learning_rate(initial_lr, T, end_lr=0):
"""Linearly decaying learning rate.
Args:
initial_lr: Initial learning rate.
T: Total number of iterations.
end_lr: Final learning rate.
Returns:
A function that takes the current iteration and returns the learning rate."""
return lambda t: initial_lr * (1 - min(1, t / T)) + end_lr * min(1, t / T)
lr = linear_learning_rate(initial_lr=0.1, T=100, end_lr=0.01)
fig = px.line(x=np.arange(100),
markers=True,
y=[lr(epoch) for epoch in range(100)],
labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig
Power-law Schedule
$$ \eta^{(t)} = \eta_0 \left(1 + \frac{t}{s}\right)^{-c} $$
Hyperparameters:
- $s$: step size (in epochs) of the decay
- $c$: power of the decay
def power_law_learning_rate(initial_lr, T, s=1, c=1):
"""Power-law decaying learning rate.
Args:
initial_lr: Initial learning rate.
T: Total number of epochs.
s: Scaling factor.
c: Exponent.
Returns:
A function that takes the current epoch and returns the learning rate."""
return lambda t: initial_lr * (1 + t/s) ** (-c)
lr = power_law_learning_rate(initial_lr=0.1, T=100, s=10, c=1)
fig = px.line(x=np.arange(100),
markers=True,
y=[lr(epoch) for epoch in range(100)],
labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig
Exponential Schedule
$$ \eta^{(t)} = \eta_0 c^{\lfloor t/s \rfloor} $$
Hyperparameters:
- $s$: step size (in epochs) of the decay
- $c$: base of the exponential decay
def exponential_learning_rate(initial_lr, decay_rate=0.95, decay_steps=10):
"""Exponential decaying learning rate.
Args:
initial_lr: Initial learning rate.
decay_rate: Decay rate (0 < decay_rate < 1).
decay_steps: Number of steps before applying decay.
Returns:
A function that takes the current step and returns the learning rate."""
return lambda t: initial_lr * decay_rate**(np.floor(t / decay_steps))
lr = exponential_learning_rate(initial_lr=0.1,
decay_rate=0.90,
decay_steps=10)
fig = px.line(x=np.arange(100),
markers=True,
y=[lr(epoch) for epoch in range(100)],
labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig
Cosine Schedule
$$ \eta^{(t)} = \eta_{\text{min}} + \frac{1}{2}(\eta_{\text{max}} - \eta_{\text{min}}) \left(1 + \cos\left(\frac{t}{T} \pi\right)\right) $$
Hyperparameters:
- $\eta_{\text{min}}$: minimum learning rate
- $\eta_{\text{max}}$: maximum learning rate
def cosine_learning_rate(T, max_lr, min_lr=0):
"""Cosine annealing learning rate.
Args:
T: Total number of epochs.
max_lr: Maximum learning rate.
min_lr: Minimum learning rate.
Returns:
A function that takes the current epoch and returns the learning rate."""
return lambda t: min_lr + 0.5 * (max_lr - min_lr) * (1 + np.cos(np.pi * t / T))
lr = cosine_learning_rate(T=100, max_lr=1.0)
fig = px.line(x=np.arange(100),
markers=True,
y=[lr(epoch) for epoch in range(100)],
labels={'x': 'Epoch', 'y': 'Learning Rate'})
fig.update_traces(line_width=4)
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))
fig
Comparing Schedules
linear_lr = linear_learning_rate(initial_lr=0.1, T=100, end_lr=0.01)
power_lr = power_law_learning_rate(initial_lr=0.1, T=100, s=10, c=1)
exp_lr = exponential_learning_rate(initial_lr=0.1, decay_rate=0.90, decay_steps=5)
cosine_lr = cosine_learning_rate(T=100, max_lr=0.1, min_lr=0.01)
t = np.arange(100)
fig = go.Figure()
fig.add_trace(go.Scatter(x=t, y=[linear_lr(epoch) for epoch in t],
mode='lines+markers', name='Linear', line=dict(width=4)))
fig.add_trace(go.Scatter(x=t, y=[power_lr(epoch) for epoch in t],
mode='lines+markers', name='Power Law', line=dict(width=4)))
fig.add_trace(go.Scatter(x=t, y=[exp_lr(epoch) for epoch in t],
mode='lines+markers', name='Exponential', line=dict(width=4)))
fig.add_trace(go.Scatter(x=t, y=[cosine_lr(epoch) for epoch in t],
mode='lines+markers', name='Cosine', line=dict(width=4)))
fig.update_layout(margin=dict(l=20, r=20, t=20, b=20),
xaxis_title='Epoch', yaxis_title='Learning Rate',
height=800)
Adam Optimizer
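Adam combines the momentum idea (an exponentially weighted average of past gradients) with RMSProp-style scaling (an exponentially weighted average of past squared gradients), together with bias correction for the early iterations. The update implemented below is, elementwise,
\begin{align*}
s^{(t)} &= \beta_1 s^{(t-1)} + (1 - \beta_1)\, g^{(t)}, &
r^{(t)} &= \beta_2 r^{(t-1)} + (1 - \beta_2)\, \left(g^{(t)}\right)^2, \\
\hat{s}^{(t)} &= \frac{s^{(t)}}{1 - \beta_1^t}, &
\hat{r}^{(t)} &= \frac{r^{(t)}}{1 - \beta_2^t}, \\
w^{(t)} &= w^{(t-1)} - \eta^{(t)}\, \frac{\hat{s}^{(t)}}{\sqrt{\hat{r}^{(t)}} + \epsilon},
\end{align*}
where $g^{(t)}$ is the gradient at $w^{(t-1)}$ and $\epsilon$ is a small constant for numerical stability.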
def gd_adam(w_0, gradient,
learning_rate=lambda t: 0.1,
nepochs=100,
epsilon=1e-6,
beta1=0.9,
beta2=0.999):
"""Adam optimizer that combines momentum and RMSProp.
Args:
w_0: Initial weights (numpy array).
gradient: Function to compute the gradient.
learning_rate: Step size for each iteration.
nepochs: Maximum number of iterations.
        epsilon: Small constant added to the denominator for numerical stability.
beta1: Exponential decay rate for the first moment estimate.
beta2: Exponential decay rate for the second moment estimate.
Returns:
path: Array of weights at each iteration."""
w_old = w_0
path = [w_old]
s = np.zeros_like(w_old) # (Momentum)
r = np.zeros_like(w_old) # (RMSProp)
for t in range(nepochs):
g = gradient(w_old)
# Update biased first moment estimate and second moment estimate
s = beta1 * s + (1 - beta1) * g
r = beta2 * r + (1 - beta2) * g**2
# Bias correction
s_hat = s / (1 - beta1 ** (t + 1))
r_hat = r / (1 - beta2 ** (t + 1))
# Weight Update
w_old = w_old - learning_rate(t) * s_hat / (np.sqrt(r_hat) + epsilon)
path.append(w_old)
return np.array(path)
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
path = gd_adam(
w0,
grad_sine_MSE,
learning_rate=lambda t: 0.1,
nepochs=100,
epsilon=1e-6,
beta1=0.9,
beta2=0.999)
fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
sine['w0'], sine['w1'], sine['error'],
sine['dw0'], sine['dw1'], scale=.1)
add_solution_path(fig, errors, path)
Factorized Adam Optimizer
Here we factor the optimizer update out of the gradient descent loop. This makes it easy to switch between different update rules (vanilla gradient descent, momentum, and Adam) and to reuse the same optimizers for stochastic gradient descent.
class AdamOptimizer:
def __init__(self,
learning_rate=lambda t: 0.1,
epsilon=1e-6,
beta1=0.9,
beta2=0.999):
self.learning_rate = learning_rate
self.epsilon = epsilon
self.beta1 = beta1
self.beta2 = beta2
self.t = 0
def step(self, w, g):
# initialize state variables on first call
if not hasattr(self, 's'):
self.s = np.zeros_like(w) # (Momentum)
self.r = np.zeros_like(w) # (RMSProp)
# Update biased first moment estimate and second moment estimate
self.s = self.beta1 * self.s + (1 - self.beta1) * g
self.r = self.beta2 * self.r + (1 - self.beta2) * g ** 2
# Bias correction
s_hat = self.s / (1 - self.beta1 ** (self.t + 1))
r_hat = self.r / (1 - self.beta2 ** (self.t + 1))
# Weight Update
w = w - self.learning_rate(self.t) * s_hat / (np.sqrt(r_hat) + self.epsilon)
self.t += 1
return w
class VanillaOptimizer:
def __init__(self, learning_rate=lambda t: 0.1):
self.learning_rate = learning_rate
self.t = 0
def step(self, w, g):
w = w - self.learning_rate(self.t) * g
self.t += 1
return w
class MomentumUpdate:
def __init__(self, learning_rate=lambda t: 0.1, momentum=0.9):
self.learning_rate = learning_rate
self.momentum = momentum
self.t = 0
def step(self, w, g):
if not hasattr(self, 'v'):
self.v = np.zeros_like(w)
self.v = self.momentum * self.v - self.learning_rate(self.t) * g
self.t += 1
return w + self.v
def gd(w_0, gradient, optimizer, nepochs = 100, epsilon=1e-8):
"""Generic gradient descent optimization.
Args:
w_0: Initial weights (numpy array).
gradient: Function to compute the gradient.
optimizer: An optimizer object with a step method.
nepochs: Maximum number of iterations.
epsilon: Convergence threshold.
Returns:
path: Array of weights at each iteration."""
w = w_0
path = [w]
for t in range(nepochs):
g = gradient(w)
w = optimizer.step(w, g)
path.append(w)
if np.linalg.norm(w - path[-2]) < epsilon: break
return np.array(path)
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
adam = AdamOptimizer(
learning_rate=lambda t: 0.1,
beta1=0.9,
beta2=0.999)
momentum = MomentumUpdate(
learning_rate=lambda t: 0.1,
momentum=0.9)
basic = VanillaOptimizer(learning_rate=lambda t: 0.1)
path = gd(w0, grad_sine_MSE, adam, nepochs=100)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
sine['w0'], sine['w1'], sine['error'],
sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()
fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()
Stochastic Gradient Descent
def sgd(w_0, x, t, gradient, optimizer, nepochs = 2, epsilon=1e-8):
"""Generic stochastic gradient descent optimization.
Args:
w_0: Initial weights (numpy array).
gradient: Function to compute the gradient.
optimizer: An optimizer object with a step method.
epsilon: Convergence threshold.
Returns:
path: Array of weights at each iteration."""
w = w_0
path = [w]
for e in range(nepochs):
random_indices = np.random.permutation(len(t))
for i in random_indices:
g = gradient(w, x[i], t[i])
w = optimizer.step(w, g)
path.append(w)
if np.linalg.norm(w - path[-2]) < epsilon: break
return np.array(path)
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
nepochs = 2
adam = AdamOptimizer(
learning_rate=lambda t: 0.1,
beta1=0.9,
beta2=0.999)
momentum = MomentumUpdate(
learning_rate=lambda t: 0.01,
momentum=0.5)
powerlaw = power_law_learning_rate(initial_lr=0.1, T=nepochs*len(sine['y']), s=10, c=1)
vanilla = VanillaOptimizer(learning_rate=powerlaw)
path = sgd(w0, sine['x'], sine['y'], grad_sine_MSE,
adam,
nepochs=nepochs)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
sine['w0'], sine['w1'], sine['error'],
sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()
fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()
#w0 = np.array([-10., -5.])
w0 = np.array([-1., 2.])
#w0 = np.array([0., -4])
nepochs = 1
coslr = cosine_learning_rate(T=nepochs*len(cancer['t']), max_lr=0.4, min_lr=0.01)
adam = AdamOptimizer(
learning_rate=coslr,
beta1=0.9,
beta2=0.999)
momentum = MomentumUpdate(
learning_rate=coslr,
momentum=0.5)
vanilla = VanillaOptimizer(learning_rate=coslr)
path = sgd(w0, cancer['x'], cancer['t'], grad_NLL,
adam,
nepochs=nepochs)
errors = [NLL(w) for w in path]
fig = plot_gradient(
cancer['w1'], cancer['w2'], cancer['error'],
cancer['dw1'], cancer['dw2'], scale=1)
fig = add_solution_path(fig, errors, path)
fig.show()
fig_loss = make_loss_curve(path, NLL)
fig_loss.show()
Minibatch Stochastic Gradient Descent
def mb_sgd(w_0, x, t, gradient, optimizer,
batch_size=32, nepochs = 2, epsilon=1e-8):
"""Generic stochastic gradient descent optimization.
Args:
w_0: Initial weights (numpy array).
gradient: Function to compute the gradient.
optimizer: An optimizer object with a step method.
batch_size: Size of each mini-batch.
epsilon: Convergence threshold.
Returns:
path: Array of weights at each iteration."""
w = w_0
path = [w]
for e in range(nepochs):
random_indices = np.random.permutation(len(t))
for start in range(0, len(t), batch_size):
end = start + batch_size
batch_indices = random_indices[start:end]
g = gradient(w, x[batch_indices], t[batch_indices])
w = optimizer.step(w, g)
path.append(w)
if np.linalg.norm(w - path[-2]) < epsilon: break
return np.array(path)
w0 = np.array([-1, 2.5])
#w0 = np.array([.5, 2.5])
#w0 = np.array([1.5, 2])
nepochs = 10
adam = AdamOptimizer(
learning_rate=lambda t: 0.1,
beta1=0.9,
beta2=0.999)
momentum = MomentumUpdate(
learning_rate=lambda t: 0.01,
momentum=0.5)
powerlaw = power_law_learning_rate(initial_lr=0.1, T=nepochs*len(sine['y']), s=10, c=1)
vanilla = VanillaOptimizer(learning_rate=powerlaw)
path = mb_sgd(
w0, sine['x'], sine['y'], grad_sine_MSE,
adam,
batch_size=32,
nepochs=nepochs)
errors = [sine_MSE(w) for w in path]
fig = plot_gradient(
sine['w0'], sine['w1'], sine['error'],
sine['dw0'], sine['dw1'], scale=.1)
fig = add_solution_path(fig, errors, path)
fig.show()
fig_loss = make_loss_curve(path, sine_MSE)
fig_loss.show()