Build a CNN

Build a CNN#

We have discussed the different components of a convolutional neural network; now, we can bring them all together. For this, we will return to the FashionMNIST dataset we saw previously.

from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
from IPython.utils import io

# Number of samples per mini-batch for both loaders.
batch_size = 64

# Capture the download/progress output so it does not clutter the page.
with io.capture_output() as captured:
    # Build the training split first, then the test split; each gets its own
    # ToTensor() transform, which converts the images to tensors.
    training_data, test_data = (
        datasets.FashionMNIST(
            root="../data",
            train=is_train,
            download=True,
            transform=ToTensor(),
        )
        for is_train in (True, False)
    )

# Wrap each split in a DataLoader that yields (images, labels) batches.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

With the data loaded, the next step is to create the model. This model consists of three convolutional layers, each using a ReLU activation function; the first two are each followed by a maximum pooling layer. The output of the third convolutional layer is flattened to be passed to the linear, fully connected layers (of which there are four). The final fully connected layer produces 10 outputs (one per class), matching the 10 types of clothing in the dataset.

import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNModel(nn.Module):
    """
    A convolutional neural network model for image classification.

    The network expects single-channel 28 × 28 images. Three 3 × 3
    convolutions (each followed by ReLU, the first two also by 2 × 2 max
    pooling) extract features, which are flattened and passed through four
    fully connected layers. The last layer emits 10 raw class logits.
    """
    def __init__(self):
        super().__init__()

        # Convolutional layers (no padding, so each one trims 2 pixels per axis)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=0)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=0)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=0)

        # Pooling layer (shared by the first two convolutional stages)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected layers
        # 64 * 3 * 3 is the flattened size of the conv3 output for a 28x28 input
        self.fc1 = nn.Linear(64 * 3 * 3, 250)
        self.fc2 = nn.Linear(250, 125)
        self.fc3 = nn.Linear(125, 60)
        self.fc4 = nn.Linear(60, 10)  # 10 classes for classification

    def forward(self, x):
        """
        Forward pass of the neural network.

        :param x: The input tensor of shape (batch_size, 1, 28, 28)
        :return: The raw class logits of shape (batch_size, 10)
        """
        # Output: 32 × 26 × 26 → (pool) 32 × 13 × 13
        x = self.pool(F.relu(self.conv1(x)))
        # Output: 64 × 11 × 11 → (pool) 64 × 5 × 5
        x = self.pool(F.relu(self.conv2(x)))
        # Output: 64 × 3 × 3 (no pooling after the last convolution)
        x = F.relu(self.conv3(x))

        # Flattens for fully connected layers to (batch_size, 64 * 3 * 3)
        x = torch.flatten(x, start_dim=1)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # No activation on the final layer, as CrossEntropyLoss in PyTorch
        # expects raw logits. This layer must be applied: without it the
        # model would return the 60 ReLU features of fc3 instead of 10 logits.
        x = self.fc4(x)

        return x

With the simple model built, we can create the object and print what the model looks like.

# Instantiate the network with freshly initialised weights.
model = CNNModel()

# Printing an nn.Module lists its registered sub-modules and their settings.
print(model)
CNNModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=576, out_features=250, bias=True)
  (fc2): Linear(in_features=250, out_features=125, bias=True)
  (fc3): Linear(in_features=125, out_features=60, bias=True)
  (fc4): Linear(in_features=60, out_features=10, bias=True)
)

Next, we define our loss function (here, we use cross-entropy loss as we are working with a classification problem) and the optimiser.

# Cross-entropy loss for multi-class classification; it is applied to the
# model's raw outputs together with the integer class labels.
loss_fn = nn.CrossEntropyLoss()

# Adam optimiser over all of the model's parameters with a learning rate of 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Everything is now ready to train the network for 10 epochs.

def train_model(model, train_loader, criterion, optimizer, epochs=10):
    """
    Train ``model`` and print the mean batch loss after every epoch.

    :param model: The network to train; it is switched to training mode
    :param train_loader: An iterable yielding ``(images, labels)`` batches
    :param criterion: The loss function, called as ``criterion(outputs, labels)``
    :param optimizer: The optimiser that updates the model's parameters
    :param epochs: The number of full passes over ``train_loader``
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        # Count batches while iterating rather than calling len(), so that
        # loaders without a length work and an empty loader cannot cause a
        # division by zero.
        num_batches = 0

        for images, labels in train_loader:
            optimizer.zero_grad()  # Clear gradients left over from the previous step
            outputs = model(images)
            # Use the criterion passed in, not a module-level loss function
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")

# Run 10 training epochs over the training loader with the loss function and
# optimiser created earlier; the per-epoch mean loss is printed as it goes.
train_model(model, train_dataloader, loss_fn, optimizer, epochs=10)
Epoch 1/10, Loss: 0.8036
Epoch 2/10, Loss: 0.3918
Epoch 3/10, Loss: 0.3203
Epoch 4/10, Loss: 0.2852
Epoch 5/10, Loss: 0.2603
Epoch 6/10, Loss: 0.2384
Epoch 7/10, Loss: 0.2194
Epoch 8/10, Loss: 0.2015
Epoch 9/10, Loss: 0.1842
Epoch 10/10, Loss: 0.1710

Then, with the model trained, we can look at how well it handles the test data.

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

model.eval()  # Set model to evaluation mode

# Collect the labels and predictions of every batch so the metrics can be
# computed once over the whole test set. Averaging per-batch scores would
# over-weight a shorter final batch and distort the macro F1 score, because
# a small batch rarely contains every class.
all_labels = []
all_predictions = []

with torch.no_grad():  # No need to compute gradients during evaluation
    for images, labels in test_dataloader:
        outputs = model(images)
        predicted = outputs.argmax(dim=1)  # Index of the highest-scoring class

        all_labels.append(labels.cpu().numpy())
        all_predictions.append(predicted.cpu().numpy())

y_true = np.concatenate(all_labels)
y_pred = np.concatenate(all_predictions)

test_accuracy = accuracy_score(y_true, y_pred)
test_f1 = f1_score(y_true, y_pred, average='macro')

# accuracy_score returns a fraction in [0, 1]; the % format spec scales it
# by 100 so the printed percentage is correct (e.g. 89.00%, not 0.89%).
print(f"Average Test Accuracy: {test_accuracy:.2%}")
print(f"Average Test F1 Score: {test_f1:.2f}")
Average Test Accuracy: 0.89%
Average Test F1 Score: 0.89

The average accuracy and F1 score are high, indicating that even for this simple network, the model can classify the images better than our simple linear network.