import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchvision as tv
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import random as rn
from torchinfo import summary


# Plot appearance: apply the matplotlib theme first, then seaborn's grid style
style.use('fivethirtyeight')
sns.set(style='whitegrid', color_codes=True)

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu

# Preprocessing pipeline: convert each image to a tensor, then normalise it
# with the mean/std pair below (undone again before noise is added).
mnist_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Fetch the MNIST train and test splits (downloaded into ./data if missing)
x_train_mnist = tv.datasets.MNIST(
    root='./data', train=True, download=True, transform=mnist_tf,
)
x_test_mnist = tv.datasets.MNIST(
    root='./data', train=False, download=True, transform=mnist_tf,
)

# Read the image side length from dimension 1 of the first training image
first_image = x_train_mnist[0][0]
image_size = first_image.shape[1]


print(f"MNIST training data size: {len(x_train_mnist)} of size {image_size}x{image_size}")
print(f"MNIST test data size: {len(x_test_mnist)} of size {image_size}x{image_size}")

MNIST training data size: 60000 of size 28x28
MNIST test data size: 10000 of size 28x28

# Preview the first 16 training digits in a 4x4 grid, each titled with its
# class label. Every dataset item is an (image, label) pair, so the title is
# read from the sample itself rather than from its position in the dataset.
imgs = [x_train_mnist[i] for i in range(16)]
labels = [label for _, label in imgs]
fig, axes = plt.subplots(4, 4, figsize=(4, 4))
for ax, (img, _), lab in zip(axes.flatten(), imgs, labels):
    # Drop the singleton channel dimension so imshow receives a 2-D array
    ax.imshow(img.squeeze().numpy(), cmap='gray')
    ax.set_title(str(lab))
    ax.axis('off')
plt.tight_layout()
plt.show()

# Undo the Normalize transform so pixel values are back in [0, 1]
x_train_mnist_np = np.array([sample[0].numpy() for sample in x_train_mnist]) * 0.3081 + 0.1307
x_test_mnist_np = np.array([sample[0].numpy() for sample in x_test_mnist]) * 0.3081 + 0.1307

# Corrupt each split with zero-mean Gaussian noise (std 0.5) and clamp the
# result to [0, 1]. The training noise is drawn before the test noise.
noise = np.random.normal(0.0, 0.5, x_train_mnist_np.shape)
x_train_noisy = np.clip(x_train_mnist_np + noise, 0., 1.)
noise = np.random.normal(0.0, 0.5, x_test_mnist_np.shape)
x_test_noisy = np.clip(x_test_mnist_np + noise, 0., 1.)

# Display the first 25 corrupted and original images
rows, cols = 5, 5
num = rows * cols
# Stack the first `num` clean images followed by their noisy counterparts
imgs = np.concatenate([x_train_mnist_np[:num], x_train_noisy[:num]])
# View the stack as a (2 * rows, cols) grid of image_size x image_size tiles
# (assumes every image carries a single channel, which the reshape absorbs)
imgs = imgs.reshape((rows * 2, cols, image_size, image_size))
# Cut the grid into `rows` equal pieces along axis 1, stack the pieces on top
# of each other, then fold the result back into a (2 * rows, cols) grid
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 2, -1, image_size, image_size))
# Join the tiles of each grid row side by side, then stack the rows into one
# 2-D mosaic
# NOTE(review): after the regrouping above, clean and noisy rows appear to
# alternate in the mosaic rather than forming separate top/bottom halves --
# confirm against the rendered figure and the title below.
imgs = np.vstack([np.hstack(i) for i in imgs])

plt.figure(figsize=(10, 10))
plt.axis('off')
plt.title('Original images: top rows, Corrupted Input: bottom rows')
plt.imshow(imgs, interpolation='none', cmap='gray')
# plt.savefig('original_vs_noisy.png', bbox_inches='tight', dpi=150)
plt.show()

# Wrap the NumPy arrays as float32 tensors; the (N, C, H, W) layout is kept
x_train_tensor = torch.from_numpy(x_train_mnist_np).to(torch.float32)
x_train_noisy_tensor = torch.from_numpy(x_train_noisy).to(torch.float32)
x_test_tensor = torch.from_numpy(x_test_mnist_np).to(torch.float32)
x_test_noisy_tensor = torch.from_numpy(x_test_noisy).to(torch.float32)

# Each dataset item pairs a noisy input with its clean target
train_dataset = TensorDataset(x_train_noisy_tensor, x_train_tensor)
test_dataset = TensorDataset(x_test_noisy_tensor, x_test_tensor)

# Mini-batch loaders: only the training split is shuffled
batch_size = 128
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True,
)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False,
)

# Network parameters
input_shape = (1, image_size, image_size)  # (C, H, W) format for PyTorch
# Side length of the convolution kernels handed to Encoder and Decoder
kernel_size = 3
# Length of the latent vector the encoder emits and the decoder consumes
latent_dim = 16
# Encoder/Decoder number of CNN layers and filters per layer
layer_filters = [32, 64]

class Encoder(nn.Module):
    """Convolutional encoder that maps single-channel images to latent vectors.

    One stride-2 ``Conv2d`` + ``ReLU`` block is built per entry of
    ``layer_filters``; the flattened feature map is then projected to
    ``latent_dim`` values by a single ``Linear`` layer.

    Args:
        latent_dim: Length of the latent vector produced for each sample.
        layer_filters: Output channel count of each convolution, in order.
        kernel_size: Side length of the square convolution kernels.
        image_size: Height/width of the square input images. It is only used
            to size the linear layer.
    """

    def __init__(self, latent_dim=16, layer_filters=(32, 64), kernel_size=3,
                 image_size=28):
        super().__init__()

        self.latent_dim = latent_dim

        # kernel_size // 2 is 1 for the default 3x3 kernel. For any odd kernel
        # it makes each stride-2 convolution produce ceil(size / 2) pixels.
        padding = kernel_size // 2

        # Stack of Conv2D blocks; the spatial size is tracked alongside so the
        # linear layer can be sized without hard-coding the feature-map shape.
        layers = []
        in_channels = 1
        spatial = image_size
        for filters in layer_filters:
            layers.append(nn.Conv2d(in_channels, filters, kernel_size,
                                    stride=2, padding=padding))
            layers.append(nn.ReLU(inplace=True))
            in_channels = filters
            # Conv2d output size for stride 2 and dilation 1
            spatial = (spatial + 2 * padding - kernel_size) // 2 + 1

        self.conv_layers = nn.Sequential(*layers)

        # With the defaults this is 64 * 7 * 7 (28 -> 14 -> 7)
        self.flatten_size = in_channels * spatial * spatial

        # Latent vector layer
        self.fc = nn.Linear(self.flatten_size, latent_dim)

    def forward(self, x):
        """Encode a batch of images.

        Args:
            x: Tensor whose leading dimension is the batch and whose remaining
                dimensions are accepted by the convolution stack.

        Returns:
            Tensor with one ``latent_dim``-long row per input sample.
        """
        x = self.conv_layers(x)
        # Remember the feature-map shape of the most recent call
        self.shape_before_flatten = x.shape
        x = x.flatten(1)  # keep the batch dimension, merge the rest
        return self.fc(x)

# Build the encoder from the shared hyper-parameters and move it to the device
encoder = Encoder(
    latent_dim=latent_dim,
    layer_filters=layer_filters,
    kernel_size=kernel_size,
)
encoder = encoder.to(device)

print("Encoder Architecture:")
# The summary needs a full (batch, channels, height, width) shape:
# one sample, one channel, 28x28 pixels
input_shape = (1, 1, 28, 28)
summary(
    encoder,
    input_size=input_shape,
    col_names=("input_size", "output_size", "num_params"),
)

Encoder Architecture:

===================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
===================================================================================================================
Encoder                                  [1, 1, 28, 28]            [1, 16]                   --
├─Sequential: 1-1                        [1, 1, 28, 28]            [1, 64, 7, 7]             --
│    └─Conv2d: 2-1                       [1, 1, 28, 28]            [1, 32, 14, 14]           320
│    └─ReLU: 2-2                         [1, 32, 14, 14]           [1, 32, 14, 14]           --
│    └─Conv2d: 2-3                       [1, 32, 14, 14]           [1, 64, 7, 7]             18,496
│    └─ReLU: 2-4                         [1, 64, 7, 7]             [1, 64, 7, 7]             --
├─Linear: 1-2                            [1, 3136]                 [1, 16]                   50,192
===================================================================================================================
Total params: 69,008
Trainable params: 69,008
Non-trainable params: 0
Total mult-adds (M): 1.02
===================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.08
Params size (MB): 0.28
Estimated Total Size (MB): 0.35
===================================================================================================================

class Decoder(nn.Module):
    """Transposed-convolution decoder that maps latent vectors to images.

    A ``Linear`` layer expands each latent vector into a
    ``layer_filters[-1] x feature_size x feature_size`` feature map. One
    stride-2 ``ConvTranspose2d`` per entry of ``layer_filters`` (walked in
    reverse) then upsamples it, with ``ReLU`` after every transposed
    convolution except the last. A final ``Conv2d`` + ``Sigmoid`` produces
    the single-channel output with values in [0, 1].

    Args:
        latent_dim: Length of each incoming latent vector.
        layer_filters: Channel counts of the matching encoder, in encoder
            order; they are reversed internally.
        kernel_size: Side length of the square kernels. With an odd value
            every transposed convolution exactly doubles the spatial size.
        feature_size: Height/width of the square feature map the latent
            vector is reshaped to (7 for 28x28 images and two layers).
    """

    def __init__(self, latent_dim=16, layer_filters=(32, 64), kernel_size=3,
                 feature_size=7):
        super().__init__()

        # Shape of the feature map rebuilt from the latent vector
        self.shape_h = feature_size
        self.shape_w = feature_size
        self.shape_c = layer_filters[-1]

        # Dense layer to reshape from latent vector
        self.fc = nn.Linear(latent_dim, self.shape_c * self.shape_h * self.shape_w)

        # kernel_size // 2 is 1 for the default 3x3 kernel; combined with
        # output_padding=1 it doubles the size for any odd kernel.
        padding = kernel_size // 2

        # Channel plan in decoding order, ending on the single output channel
        channels = list(layer_filters[::-1]) + [1]
        last_index = len(channels) - 2

        # Stack of Transposed Conv2D blocks (reverse order)
        layers = []
        for index, (in_channels, out_channels) in enumerate(zip(channels, channels[1:])):
            layers.append(nn.ConvTranspose2d(in_channels, out_channels,
                                             kernel_size, stride=2,
                                             padding=padding, output_padding=1))
            # Every transposed convolution except the last is followed by ReLU
            if index < last_index:
                layers.append(nn.ReLU(inplace=True))

        # Final size-preserving conv followed by sigmoid
        layers.append(nn.Conv2d(1, 1, kernel_size, padding=padding))
        layers.append(nn.Sigmoid())

        self.deconv_layers = nn.Sequential(*layers)

    def forward(self, latent):
        """Decode latent vectors into images.

        Args:
            latent: Tensor whose last dimension has ``latent_dim`` entries.

        Returns:
            Tensor of shape (batch, 1, H, W) with values in [0, 1].
        """
        x = self.fc(latent)
        # -1 infers the batch size, so an unbatched vector becomes a batch of 1
        x = x.view(-1, self.shape_c, self.shape_h, self.shape_w)
        return self.deconv_layers(x)

# Instantiate Decoder
decoder = Decoder(latent_dim=latent_dim, layer_filters=layer_filters,
                  kernel_size=kernel_size).to(device)

print("\nDecoder Architecture:")
# Give the summary an explicit batch dimension of 1. Without it the latent
# length is taken as the leading dimension, so the reported shapes and
# mult-add totals no longer describe a single sample.
summary(decoder, input_size=(1, latent_dim), col_names=("input_size", "output_size", "num_params"))

Decoder Architecture:

===================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
===================================================================================================================
Decoder                                  [16]                      [1, 1, 28, 28]            --
├─Linear: 1-1                            [16]                      [3136]                    53,312
├─Sequential: 1-2                        [1, 64, 7, 7]             [1, 1, 28, 28]            --
│    └─ConvTranspose2d: 2-1              [1, 64, 7, 7]             [1, 32, 14, 14]           18,464
│    └─ReLU: 2-2                         [1, 32, 14, 14]           [1, 32, 14, 14]           --
│    └─ConvTranspose2d: 2-3              [1, 32, 14, 14]           [1, 1, 28, 28]            289
│    └─Conv2d: 2-4                       [1, 1, 28, 28]            [1, 1, 28, 28]            10
│    └─Sigmoid: 2-5                      [1, 1, 28, 28]            [1, 1, 28, 28]            --
===================================================================================================================
Total params: 72,075
Trainable params: 72,075
Non-trainable params: 0
Total mult-adds (M): 171.04
===================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.09
Params size (MB): 0.29
Estimated Total Size (MB): 0.38
===================================================================================================================

class Autoencoder(nn.Module):
    """Compose an encoder and a decoder into one end-to-end module."""

    def __init__(self, encoder, decoder):
        super().__init__()
        # Registering both halves as sub-modules exposes their parameters
        # through this module.
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        """Return the decoder's output for the encoder's output on ``x``."""
        return self.decoder(self.encoder(x))

# Chain the encoder and decoder built above and move the result to the device
autoencoder = Autoencoder(encoder, decoder)
autoencoder = autoencoder.to(device)

print("\nComplete Autoencoder:")
summary(
    autoencoder,
    input_size=(1, 1, 28, 28),
    col_names=("input_size", "output_size", "num_params"),
)

Complete Autoencoder:

===================================================================================================================
Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
===================================================================================================================
Autoencoder                              [1, 1, 28, 28]            [1, 1, 28, 28]            --
├─Encoder: 1-1                           [1, 1, 28, 28]            [1, 16]                   --
│    └─Sequential: 2-1                   [1, 1, 28, 28]            [1, 64, 7, 7]             --
│    │    └─Conv2d: 3-1                  [1, 1, 28, 28]            [1, 32, 14, 14]           320
│    │    └─ReLU: 3-2                    [1, 32, 14, 14]           [1, 32, 14, 14]           --
│    │    └─Conv2d: 3-3                  [1, 32, 14, 14]           [1, 64, 7, 7]             18,496
│    │    └─ReLU: 3-4                    [1, 64, 7, 7]             [1, 64, 7, 7]             --
│    └─Linear: 2-2                       [1, 3136]                 [1, 16]                   50,192
├─Decoder: 1-2                           [1, 16]                   [1, 1, 28, 28]            --
│    └─Linear: 2-3                       [1, 16]                   [1, 3136]                 53,312
│    └─Sequential: 2-4                   [1, 64, 7, 7]             [1, 1, 28, 28]            --
│    │    └─ConvTranspose2d: 3-5         [1, 64, 7, 7]             [1, 32, 14, 14]           18,464
│    │    └─ReLU: 3-6                    [1, 32, 14, 14]           [1, 32, 14, 14]           --
│    │    └─ConvTranspose2d: 3-7         [1, 32, 14, 14]           [1, 1, 28, 28]            289
│    │    └─Conv2d: 3-8                  [1, 1, 28, 28]            [1, 1, 28, 28]            10
│    │    └─Sigmoid: 3-9                 [1, 1, 28, 28]            [1, 1, 28, 28]            --
===================================================================================================================
Total params: 141,083
Trainable params: 141,083
Non-trainable params: 0
Total mult-adds (M): 4.93
===================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.16
Params size (MB): 0.56
Estimated Total Size (MB): 0.73
===================================================================================================================

# Define loss function and optimizer
# Mean-squared error between the reconstruction and the clean image
criterion = nn.MSELoss()
# Adam over every autoencoder parameter; no hyper-parameters are passed, so
# the library defaults apply
optimizer = optim.Adam(autoencoder.parameters())

# Training function
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return the average per-sample loss.

    Weights are updated only when ``optimizer`` is given; otherwise the pass
    runs in eval mode with gradient tracking disabled.
    """
    training = optimizer is not None
    model.train(training)
    total_loss = 0.0

    with torch.set_grad_enabled(training):
        for noisy_imgs, clean_imgs in loader:
            noisy_imgs = noisy_imgs.to(device)
            clean_imgs = clean_imgs.to(device)

            if training:
                optimizer.zero_grad()

            loss = criterion(model(noisy_imgs), clean_imgs)

            if training:
                loss.backward()
                optimizer.step()

            # Weight the batch loss by the batch size so that a smaller final
            # batch does not skew the epoch average
            total_loss += loss.item() * noisy_imgs.size(0)

    return total_loss / len(loader.dataset)


def train_autoencoder(model, train_loader, test_loader, criterion, optimizer,
                      epochs=2, device=None):
    """Train the autoencoder and validate it after every epoch.

    Args:
        model: Module mapping a batch of noisy images to reconstructions.
        train_loader: Iterable of (noisy, clean) batches used for training.
        test_loader: Iterable of (noisy, clean) batches used for validation.
        criterion: Loss called as ``criterion(outputs, clean)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochs: Number of full passes over ``train_loader``.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none), so the
            function does not rely on a module-level ``device`` variable.

    Returns:
        Dict with keys ``'train_loss'`` and ``'val_loss'``, each holding one
        average per-sample loss per epoch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(epochs):
        train_loss = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss = _run_epoch(model, test_loader, criterion, device)

        # Store history
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)

        print(f'Epoch [{epoch+1}/{epochs}] - '
              f'Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}')

    return history

# Run the training loop for two epochs and keep the per-epoch losses
print("\nTraining autoencoder...")
history = train_autoencoder(
    autoencoder,
    train_loader,
    test_loader,
    criterion,
    optimizer,
    epochs=2,
)

# Draw the training and validation loss curves on one set of axes
plt.figure(figsize=(10, 4))
for history_key, curve_label in (('train_loss', 'Train Loss'),
                                 ('val_loss', 'Val Loss')):
    plt.plot(history[history_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.title('Training History')
plt.legend()
plt.grid(True)
# plt.savefig('training_history.png', bbox_inches='tight', dpi=150)
plt.show()

Training autoencoder...

Epoch [1/2] - Train Loss: 0.045142, Val Loss: 0.025222

Epoch [2/2] - Train Loss: 0.023146, Val Loss: 0.020659

# Predict the Autoencoder output from corrupted test images
autoencoder.eval()
with torch.no_grad():
    # Denoise the whole test set in chunks of `batch_size`, so the complete
    # set never has to pass through the model (or sit on the device) at once
    x_decoded = torch.cat([
        autoencoder(chunk.to(device)).cpu()
        for chunk in x_test_noisy_tensor.split(batch_size)
    ])

# Convert back to numpy for visualization
x_decoded = x_decoded.numpy()  # Already has the correct dimensions

# Display the first 25 images: original, corrupted, and denoised
rows, cols = 5, 5
num = rows * cols
# Stack the clean, noisy and denoised versions of the first `num` test images
imgs = np.concatenate([x_test_mnist_np[:num], x_test_noisy[:num], x_decoded[:num]])
# Arrange the stack as a (3 * rows, cols) grid of tiles, regroup it column by
# column, then merge the tiles into a single 2-D mosaic
imgs = imgs.reshape((rows * 3, cols, image_size, image_size))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 3, -1, image_size, image_size))
imgs = np.vstack([np.hstack(i) for i in imgs])
# Scale the [0, 1] pixel values to 8-bit grey levels for display
imgs = (imgs * 255).astype(np.uint8)

plt.figure(figsize=(10, 15))
plt.axis('off')
plt.title('Original images: top rows, '
          'Corrupted Input: middle rows, '
          'Denoised Output: bottom rows')
plt.imshow(imgs, interpolation='none', cmap='gray')
# plt.savefig('denoising_results.png', bbox_inches='tight', dpi=150)
plt.show()

Lecture 24 – CS 189, Fall 2025

Denoising Autoencoder