Lecture 24 – CS 189, Fall 2025
In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchvision as tv
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import random as rn
from torchinfo import summary
style.use('fivethirtyeight')
sns.set(style='whitegrid', color_codes=True)
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
Using device: cpu
In [2]:
mnist_tf = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
In [3]:
# Load MNIST dataset
x_train_mnist = tv.datasets.MNIST(root='./data', train=True,
download=True, transform=mnist_tf)
x_test_mnist = tv.datasets.MNIST(root='./data', train=False,
download=True, transform=mnist_tf)
image_size = x_train_mnist[0][0].shape[1]
print(f"MNIST training data size: {len(x_train_mnist)} of size {image_size}x{image_size}")
print(f"MNIST test data size: {len(x_test_mnist)} of size {image_size}x{image_size}")
MNIST training data size: 60000 of size 28x28 MNIST test data size: 10000 of size 28x28
In [4]:
imgs = [x_train_mnist[i] for i in range(16)]
labels = [i for i in range(16)]
fig, axes = plt.subplots(4, 4, figsize=(4, 4))
for ax, img, lab in zip(axes.flatten(), imgs, labels):
ax.imshow(img[0].squeeze().numpy(), cmap='gray')
ax.set_title(str(lab))
ax.axis('off')
plt.tight_layout()
plt.show()
Denoising Autoencoder¶
In [5]:
# Denormalize the MNIST images back to [0, 1]
x_train_mnist_np = np.array([x[0].numpy() for x in x_train_mnist]) * 0.3081 + 0.1307
x_test_mnist_np = np.array([x[0].numpy() for x in x_test_mnist]) * 0.3081 + 0.1307
# Generate corrupted MNIST images by adding noise with normal distribution
noise = np.random.normal(loc=0.0, scale=0.5, size=x_train_mnist_np.shape)
x_train_noisy = x_train_mnist_np + noise
noise = np.random.normal(loc=0.0, scale=0.5, size=x_test_mnist_np.shape)
x_test_noisy = x_test_mnist_np + noise
# Clip values to [0, 1]
x_train_noisy = np.clip(x_train_noisy, 0., 1.)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)
# Display the first 25 corrupted and original images
rows, cols = 5, 5
num = rows * cols
imgs = np.concatenate([x_train_mnist_np[:num], x_train_noisy[:num]])
imgs = imgs.reshape((rows * 2, cols, image_size, image_size))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 2, -1, image_size, image_size))
imgs = np.vstack([np.hstack(i) for i in imgs])
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.title('Original images: top rows, Corrupted Input: bottom rows')
plt.imshow(imgs, interpolation='none', cmap='gray')
# plt.savefig('original_vs_noisy.png', bbox_inches='tight', dpi=150)
plt.show()
In [6]:
# Convert to PyTorch tensors with shape (N, C, H, W)
x_train_tensor = torch.from_numpy(x_train_mnist_np).float()
x_train_noisy_tensor = torch.from_numpy(x_train_noisy).float()
x_test_tensor = torch.from_numpy(x_test_mnist_np).float()
x_test_noisy_tensor = torch.from_numpy(x_test_noisy).float()
In [7]:
# Create datasets and dataloaders
train_dataset = TensorDataset(x_train_noisy_tensor, x_train_tensor)
test_dataset = TensorDataset(x_test_noisy_tensor, x_test_tensor)
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
In [8]:
# Network parameters
input_shape = (1, image_size, image_size) # (C, H, W) format for PyTorch
kernel_size = 3
latent_dim = 16
# Encoder/Decoder number of CNN layers and filters per layer
layer_filters = [32, 64]
In [9]:
class Encoder(nn.Module):
def __init__(self, latent_dim=16, layer_filters=[32, 64], kernel_size=3):
super(Encoder, self).__init__()
self.latent_dim = latent_dim
# Stack of Conv2D blocks
layers = []
in_channels = 1
for filters in layer_filters:
layers.append(nn.Conv2d(in_channels, filters, kernel_size,
stride=2, padding=1))
layers.append(nn.ReLU(inplace=True))
in_channels = filters
self.conv_layers = nn.Sequential(*layers)
# Calculate the flattened size after convolutions
# After each stride=2 conv with padding=1: size -> (size + 1) // 2
# 28 -> 14 -> 7
self.flatten_size = layer_filters[-1] * 7 * 7
# Latent vector layer
self.fc = nn.Linear(self.flatten_size, latent_dim)
def forward(self, x):
x = self.conv_layers(x)
# Get shape before flattening (needed for decoder)
self.shape_before_flatten = x.shape
x = x.view(x.size(0), -1) # Flatten
latent = self.fc(x)
return latent
# Instantiate Encoder
encoder = Encoder(latent_dim=latent_dim, layer_filters=layer_filters,
kernel_size=kernel_size).to(device)
print("Encoder Architecture:")
# Ensure input_shape matches the expected dimensions (batch_size, channels, height, width)
input_shape = (1, 1, 28, 28) # Batch size of 1, 1 channel, 28x28 image
summary(encoder, input_size=(1, 1, 28, 28), col_names=("input_size", "output_size", "num_params"))
Encoder Architecture:
Out[9]:
=================================================================================================================== Layer (type:depth-idx) Input Shape Output Shape Param # =================================================================================================================== Encoder [1, 1, 28, 28] [1, 16] -- ├─Sequential: 1-1 [1, 1, 28, 28] [1, 64, 7, 7] -- │ └─Conv2d: 2-1 [1, 1, 28, 28] [1, 32, 14, 14] 320 │ └─ReLU: 2-2 [1, 32, 14, 14] [1, 32, 14, 14] -- │ └─Conv2d: 2-3 [1, 32, 14, 14] [1, 64, 7, 7] 18,496 │ └─ReLU: 2-4 [1, 64, 7, 7] [1, 64, 7, 7] -- ├─Linear: 1-2 [1, 3136] [1, 16] 50,192 =================================================================================================================== Total params: 69,008 Trainable params: 69,008 Non-trainable params: 0 Total mult-adds (M): 1.02 =================================================================================================================== Input size (MB): 0.00 Forward/backward pass size (MB): 0.08 Params size (MB): 0.28 Estimated Total Size (MB): 0.35 ===================================================================================================================
In [10]:
class Decoder(nn.Module):
def __init__(self, latent_dim=16, layer_filters=[32, 64], kernel_size=3):
super(Decoder, self).__init__()
# Calculate the size after encoder
# 28 -> 14 -> 7 (shape: [batch, 64, 7, 7])
self.shape_h = 7
self.shape_w = 7
self.shape_c = layer_filters[-1]
# Dense layer to reshape from latent vector
self.fc = nn.Linear(latent_dim, self.shape_c * self.shape_h * self.shape_w)
# Stack of Transposed Conv2D blocks (reverse order)
layers = []
reversed_filters = layer_filters[::-1]
for i in range(len(reversed_filters)):
in_channels = reversed_filters[i]
out_channels = reversed_filters[i+1] if i+1 < len(reversed_filters) else 1
if i < len(reversed_filters) - 1:
# Intermediate layers with ReLU
layers.append(nn.ConvTranspose2d(in_channels, out_channels,
kernel_size, stride=2,
padding=1, output_padding=1))
layers.append(nn.ReLU(inplace=True))
else:
# Last transposed conv to get back to original size
layers.append(nn.ConvTranspose2d(in_channels, out_channels,
kernel_size, stride=2,
padding=1, output_padding=1))
# Final conv to ensure exact output size and add sigmoid
layers.append(nn.Conv2d(1, 1, kernel_size, padding=1))
layers.append(nn.Sigmoid())
self.deconv_layers = nn.Sequential(*layers)
def forward(self, latent):
x = self.fc(latent)
x = x.view(-1, self.shape_c, self.shape_h, self.shape_w)
x = self.deconv_layers(x)
return x
# Instantiate Decoder
decoder = Decoder(latent_dim=latent_dim, layer_filters=layer_filters,
kernel_size=kernel_size).to(device)
print("\nDecoder Architecture:")
summary(decoder, input_size=(latent_dim,), col_names=("input_size", "output_size", "num_params"))
Decoder Architecture:
Out[10]:
=================================================================================================================== Layer (type:depth-idx) Input Shape Output Shape Param # =================================================================================================================== Decoder [16] [1, 1, 28, 28] -- ├─Linear: 1-1 [16] [3136] 53,312 ├─Sequential: 1-2 [1, 64, 7, 7] [1, 1, 28, 28] -- │ └─ConvTranspose2d: 2-1 [1, 64, 7, 7] [1, 32, 14, 14] 18,464 │ └─ReLU: 2-2 [1, 32, 14, 14] [1, 32, 14, 14] -- │ └─ConvTranspose2d: 2-3 [1, 32, 14, 14] [1, 1, 28, 28] 289 │ └─Conv2d: 2-4 [1, 1, 28, 28] [1, 1, 28, 28] 10 │ └─Sigmoid: 2-5 [1, 1, 28, 28] [1, 1, 28, 28] -- =================================================================================================================== Total params: 72,075 Trainable params: 72,075 Non-trainable params: 0 Total mult-adds (M): 171.04 =================================================================================================================== Input size (MB): 0.00 Forward/backward pass size (MB): 0.09 Params size (MB): 0.29 Estimated Total Size (MB): 0.38 ===================================================================================================================
In [11]:
class Autoencoder(nn.Module):
def __init__(self, encoder, decoder):
super(Autoencoder, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, x):
latent = self.encoder(x)
reconstructed = self.decoder(latent)
return reconstructed
# Instantiate Autoencoder
autoencoder = Autoencoder(encoder, decoder).to(device)
print("\nComplete Autoencoder:")
summary(autoencoder, input_size=(1, 1, 28, 28), col_names=("input_size", "output_size", "num_params"))
Complete Autoencoder:
Out[11]:
=================================================================================================================== Layer (type:depth-idx) Input Shape Output Shape Param # =================================================================================================================== Autoencoder [1, 1, 28, 28] [1, 1, 28, 28] -- ├─Encoder: 1-1 [1, 1, 28, 28] [1, 16] -- │ └─Sequential: 2-1 [1, 1, 28, 28] [1, 64, 7, 7] -- │ │ └─Conv2d: 3-1 [1, 1, 28, 28] [1, 32, 14, 14] 320 │ │ └─ReLU: 3-2 [1, 32, 14, 14] [1, 32, 14, 14] -- │ │ └─Conv2d: 3-3 [1, 32, 14, 14] [1, 64, 7, 7] 18,496 │ │ └─ReLU: 3-4 [1, 64, 7, 7] [1, 64, 7, 7] -- │ └─Linear: 2-2 [1, 3136] [1, 16] 50,192 ├─Decoder: 1-2 [1, 16] [1, 1, 28, 28] -- │ └─Linear: 2-3 [1, 16] [1, 3136] 53,312 │ └─Sequential: 2-4 [1, 64, 7, 7] [1, 1, 28, 28] -- │ │ └─ConvTranspose2d: 3-5 [1, 64, 7, 7] [1, 32, 14, 14] 18,464 │ │ └─ReLU: 3-6 [1, 32, 14, 14] [1, 32, 14, 14] -- │ │ └─ConvTranspose2d: 3-7 [1, 32, 14, 14] [1, 1, 28, 28] 289 │ │ └─Conv2d: 3-8 [1, 1, 28, 28] [1, 1, 28, 28] 10 │ │ └─Sigmoid: 3-9 [1, 1, 28, 28] [1, 1, 28, 28] -- =================================================================================================================== Total params: 141,083 Trainable params: 141,083 Non-trainable params: 0 Total mult-adds (M): 4.93 =================================================================================================================== Input size (MB): 0.00 Forward/backward pass size (MB): 0.16 Params size (MB): 0.56 Estimated Total Size (MB): 0.73 ===================================================================================================================
In [12]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters())
# Training function
def train_autoencoder(model, train_loader, test_loader, criterion, optimizer, epochs=2):
"""Train the autoencoder"""
history = {'train_loss': [], 'val_loss': []}
for epoch in range(epochs):
# Training phase
model.train()
train_loss = 0.0
for noisy_imgs, clean_imgs in train_loader:
noisy_imgs = noisy_imgs.to(device)
clean_imgs = clean_imgs.to(device)
# Zero gradients
optimizer.zero_grad()
# Forward pass
outputs = model(noisy_imgs)
loss = criterion(outputs, clean_imgs)
# Backward pass
loss.backward()
optimizer.step()
train_loss += loss.item() * noisy_imgs.size(0)
train_loss = train_loss / len(train_loader.dataset)
# Validation phase
model.eval()
val_loss = 0.0
with torch.no_grad():
for noisy_imgs, clean_imgs in test_loader:
noisy_imgs = noisy_imgs.to(device)
clean_imgs = clean_imgs.to(device)
outputs = model(noisy_imgs)
loss = criterion(outputs, clean_imgs)
val_loss += loss.item() * noisy_imgs.size(0)
val_loss = val_loss / len(test_loader.dataset)
# Store history
history['train_loss'].append(train_loss)
history['val_loss'].append(val_loss)
print(f'Epoch [{epoch+1}/{epochs}] - '
f'Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}')
return history
# Train the autoencoder
print("\nTraining autoencoder...")
history = train_autoencoder(autoencoder, train_loader, test_loader,
criterion, optimizer, epochs=2)
# Plot training history
plt.figure(figsize=(10, 4))
plt.plot(history['train_loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.title('Training History')
plt.legend()
plt.grid(True)
# plt.savefig('training_history.png', bbox_inches='tight', dpi=150)
plt.show()
Training autoencoder...
Epoch [1/2] - Train Loss: 0.045142, Val Loss: 0.025222
Epoch [2/2] - Train Loss: 0.023146, Val Loss: 0.020659
In [13]:
# Predict the Autoencoder output from corrupted test images
autoencoder.eval()
with torch.no_grad():
# Get predictions for first batch
x_test_noisy_batch = x_test_noisy_tensor.to(device)
x_decoded = autoencoder(x_test_noisy_batch).cpu()
# Convert back to numpy for visualization
x_decoded = x_decoded.numpy() # Already has the correct dimensions
# Display the first 25 images: original, corrupted, and denoised
rows, cols = 5, 5
num = rows * cols
imgs = np.concatenate([x_test_mnist_np[:num], x_test_noisy[:num], x_decoded[:num]])
imgs = imgs.reshape((rows * 3, cols, image_size, image_size))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 3, -1, image_size, image_size))
imgs = np.vstack([np.hstack(i) for i in imgs])
imgs = (imgs * 255).astype(np.uint8)
plt.figure(figsize=(10, 15))
plt.axis('off')
plt.title('Original images: top rows, '
'Corrupted Input: middle rows, '
'Denoised Output: bottom rows')
plt.imshow(imgs, interpolation='none', cmap='gray')
# plt.savefig('denoising_results.png', bbox_inches='tight', dpi=150)
plt.show()
In [ ]: