Build a CNN

Build a CNN

We have discussed the different components of a convolutional neural network; now, we can bring them all together. For this, we will return to the FashionMNIST dataset we saw previously.

from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
from IPython.utils import io

# Build the training and test splits of FashionMNIST (download=True, stored
# under ../data) while capturing anything printed during the process.
with io.capture_output() as captured:
    training_data, test_data = [
        datasets.FashionMNIST(
            root="../data",
            train=is_train_split,
            download=True,
            transform=ToTensor(),
        )
        for is_train_split in (True, False)
    ]

# Number of samples per mini-batch, shared by both loaders.
batch_size = 64

train_dataloader = DataLoader(dataset=training_data, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)

With the data loaded, the next step is to create the model. This model consists of three convolutional layers, each using a ReLU activation function; the first two are each followed by a max-pooling layer. The output of the third convolutional layer is flattened and passed to the linear, fully connected layers (of which there are four). The final fully connected layer produces 10 outputs, matching the 10 types of clothing in the dataset.

import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNModel(nn.Module):
    """
    A convolutional neural network model for image classification.

    Three 3x3 convolutions (no padding) with ReLU activations, the first two
    each followed by 2x2 max pooling, feed four fully connected layers. The
    last linear layer emits 10 raw class scores (logits).

    The first linear layer is sized for single-channel 28x28 inputs.
    """
    def __init__(self):
        super().__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=0)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=0)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=0)

        # Pooling layer (shared by the first two convolutional stages)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected layers
        self.fc1 = nn.Linear(64 * 3 * 3, 250)  # 64 channels of 3x3 features for a 28x28 input
        self.fc2 = nn.Linear(250, 125)
        self.fc3 = nn.Linear(125, 60)
        self.fc4 = nn.Linear(60, 10)  # 10 classes for classification

    def forward(self, x):
        """
        Forward pass of the neural network.

        :param x: The input tensor of shape (batch_size, 1, 28, 28)
        :return: The output tensor of raw logits, shape (batch_size, 10)
        """
        # Output: 32 × 26 × 26 → 32 × 13 × 13
        x = self.pool(F.relu(self.conv1(x)))
        # Output: 64 × 11 × 11 → 64 × 5 × 5
        x = self.pool(F.relu(self.conv2(x)))
        # Output: 64 × 3 × 3 (no pooling after the last convolution)
        x = F.relu(self.conv3(x))

        # Flattens for fully connected layers to (batch_size, 64 * 3 * 3)
        x = torch.flatten(x, start_dim=1)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))

        # No activation, as CrossEntropyLoss in PyTorch expects raw logits
        return self.fc4(x)

With the simple model built, we can create the object and print what the model looks like.

# Instantiate the network defined above.
model = CNNModel()

# Printing the module lists its layers and their configuration.
print(model)

CNNModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=576, out_features=250, bias=True)
  (fc2): Linear(in_features=250, out_features=125, bias=True)
  (fc3): Linear(in_features=125, out_features=60, bias=True)
  (fc4): Linear(in_features=60, out_features=10, bias=True)
)

Next, we define our loss function (here, we use cross-entropy loss as we are working with a classification problem) and the optimiser.

# Cross-entropy loss for the multi-class classification task.
loss_fn = nn.CrossEntropyLoss()

# Adam optimiser over all of the model's parameters, learning rate 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

Everything is now ready to train the network for 10 epochs.

def train_model(model, train_loader, criterion, optimizer, epochs=10):
    """
    Train ``model`` and print the mean batch loss after every epoch.

    :param model: The network to train; it is switched to training mode
    :param train_loader: Sized iterable yielding ``(images, labels)`` batches
    :param criterion: Callable returning the loss for ``(outputs, labels)``
    :param optimizer: Optimiser used to update the model parameters
    :param epochs: Number of full passes over ``train_loader``
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0

        for images, labels in train_loader:
            # Clear gradients left over from the previous step
            optimizer.zero_grad()
            outputs = model(images)
            # Use the criterion that was passed in rather than a global loss
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader):.4f}")

# Train the model for 10 epochs with the loss function and optimiser defined above.
train_model(model, train_dataloader, loss_fn, optimizer, epochs=10)

Epoch 1/10, Loss: 0.8036

Epoch 2/10, Loss: 0.3918

Epoch 3/10, Loss: 0.3203

Epoch 4/10, Loss: 0.2852

Epoch 5/10, Loss: 0.2603

Epoch 6/10, Loss: 0.2384

Epoch 7/10, Loss: 0.2194

Epoch 8/10, Loss: 0.2015

Epoch 9/10, Loss: 0.1842

Epoch 10/10, Loss: 0.1710

Then, with the model trained, we can look at how well it handles the test data.

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

model.eval()  # Set model to evaluation mode
f1_scores = []
accuracy_scores = []

with torch.no_grad():  # No need to compute gradients during evaluation
    for images, labels in test_dataloader:

        outputs = model(images)
        _, predicted = torch.max(outputs, 1)  # Index of the largest output per sample

        f1_scores.append(f1_score(labels, predicted, average='macro'))
        accuracy_scores.append(accuracy_score(labels, predicted))

# NOTE: both figures are means of per-batch scores, so they may differ slightly
# from metrics computed once over the whole test set.
# accuracy_score returns a fraction in [0, 1]; the "%" format spec multiplies
# by 100 and appends the percent sign, so 0.89 is shown as 89.00%.
print(f"Average Test Accuracy: {np.mean(accuracy_scores):.2%}")
print(f"Average Test F1 Score: {np.mean(f1_scores):.2f}")

Average Test Accuracy: 0.89%
Average Test F1 Score: 0.89

The average accuracy and F1 score are high, indicating that even for this simple network, the model can classify the images better than our simple linear network.