Multiclass image classification

On a simple image dataset
Published: November 10, 2022

Here we’ll turn the binary image classification setup from the previous example into multiclass image classification, by detecting the left and right diagonals as separate classes.

Code
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from shared.step_by_step import StepByStep
import platform
from PIL import Image
import datetime
import matplotlib.pyplot as plt
from matplotlib import cm
from torch.utils.data import DataLoader, Dataset, random_split, WeightedRandomSampler, SubsetRandomSampler
from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomHorizontalFlip, Resize

plt.style.use('fivethirtyeight')
Code
def show_image(im, cmap=None):
    fig = plt.figure(figsize=(3,3))
    plt.imshow(im, cmap=cmap)
    plt.grid(False)
    plt.show()

Data

Code
def gen_img(start, target, fill=1, img_size=10):
    # Generates an empty image
    img = np.zeros((img_size, img_size), dtype=float)

    start_row, start_col = None, None

    # Sign convention: a positive start picks a starting row,
    # zero or a negative start picks column abs(start)
    if start > 0:
        start_row = start
    else:
        start_col = np.abs(start)

    # target 0: straight (horizontal or vertical) line
    if target == 0:
        if start_row is None:
            img[:, start_col] = fill
        else:
            img[start_row, :] = fill
    else:
        # diagonals never start at column 0
        if start_col == 0:
            start_col = 1
        
        # target 1: "up" (counter-) diagonal; target 2: "down" diagonal
        if target == 1:
            if start_row is not None:
                up = (range(start_row, -1, -1), 
                      range(0, start_row + 1))
            else:
                up = (range(img_size - 1, start_col - 1, -1), 
                      range(start_col, img_size))
            img[up] = fill
        else:
            if start_row is not None:
                down = (range(start_row, img_size, 1), 
                        range(0, img_size - start_row))
            else:
                down = (range(0, img_size - 1 - start_col + 1), 
                        range(start_col, img_size))
            img[down] = fill
    
    return 255 * img.reshape(1, img_size, img_size)


def generate_dataset(img_size=10, n_images=100, binary=True, seed=17):
    np.random.seed(seed)

    starts = np.random.randint(-(img_size - 1), img_size, size=(n_images,))
    targets = np.random.randint(0, 3, size=(n_images,))
    
    images = np.array([gen_img(s, t, img_size=img_size) 
                       for s, t in zip(starts, targets)], dtype=np.uint8)
    
    if binary:
        targets = (targets > 0).astype(int)
    
    return images, targets

def plot_images(images, targets, n_plot=30, per_row=10):
    n_rows = n_plot // per_row + ((n_plot % per_row) > 0)
    fig, axes = plt.subplots(n_rows, per_row, figsize=(9, 1.5 * n_rows))
    axes = np.atleast_2d(axes)
    
    for i, (image, target) in enumerate(zip(images[:n_plot], targets[:n_plot])):
        row, col = i // per_row, i % per_row    
        ax = axes[row, col]
        ax.set_title('#{} - Label:{}'.format(i, target), {'size': 8})
        # plot the image in grayscale; pixel values are 0 or 255,
        # so they render black/white regardless of vmax
        ax.imshow(image.squeeze(), cmap='gray', vmin=0, vmax=1)

    for ax in axes.flat:
        ax.set_xticks([])
        ax.set_yticks([])
        ax.label_outer()

    plt.tight_layout()
    return fig
images, labels = generate_dataset(img_size=10, n_images=1000, binary=False, seed=13)
fig = plot_images(images, labels, n_plot=30)

Data preparation

We prepare the data similarly to the previous exercise; the notable difference is that y_tensor has shape (N,), not (N, 1) as before. This is a requirement of the loss function (CrossEntropyLoss), which takes class indices as targets.

x_tensor = torch.as_tensor(images / 255.).float()
y_tensor = torch.as_tensor(labels).long()
x_tensor.shape
torch.Size([1000, 1, 10, 10])
y_tensor.shape
torch.Size([1000])
class TransformedTensorDataset(Dataset):
    def __init__(self, x, y, transform=None):
        self.x = x
        self.y = y
        self.transform = transform
        
    def __getitem__(self, index):
        x = self.x[index]
        
        if self.transform:
            x = self.transform(x)
        
        return x, self.y[index]
        
    def __len__(self):
        return len(self.x)
torch.manual_seed(42)
N = len(x_tensor)
n_train = int(.8*N)
# random_split is used here only to generate shuffled index lists;
# the actual datasets are built from these indices below
train_subset, val_subset = random_split(x_tensor, [n_train, N - n_train])
train_idx = train_subset.indices
val_idx = val_subset.indices

We do not apply augmentation since it would mess up the labels: flipping an image swaps the diagonal classes, as the sketch below shows.
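
A quick illustration (a sketch, deliberately forcing RandomHorizontalFlip with p=1.0): mirroring a down diagonal turns it into an up diagonal, so the stored label 2 no longer matches the image.

flip = Compose([RandomHorizontalFlip(p=1.0)])
diag_down = x_tensor[y_tensor == 2][0]             # a label-2 (down diagonal) image
show_image(flip(diag_down).squeeze(), cmap='gray') # now looks like a label-1 image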

train_composer = Compose([Normalize(mean=(.5,), std=(.5,))])
val_composer = Compose([Normalize(mean=(.5,), std=(.5,))])

Now we can build train/val tensors, Datasets and DataLoaders:

x_train_tensor = x_tensor[train_idx]
y_train_tensor = y_tensor[train_idx]

x_val_tensor = x_tensor[val_idx]
y_val_tensor = y_tensor[val_idx]

train_dataset = TransformedTensorDataset(x_train_tensor, y_train_tensor, transform=train_composer)
val_dataset = TransformedTensorDataset(x_val_tensor, y_val_tensor, transform=val_composer)

# Builds a loader for each set
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)
y_val_tensor
tensor([2, 0, 2, 1, 0, 0, 0, 1, 1, 2, 1, 2, 0, 2, 2, 2, 1, 1, 1, 2, 2, 2, 0, 1,
        2, 1, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 2, 1, 1, 1, 2, 1, 0, 0, 0, 2,
        0, 2, 0, 2, 2, 2, 0, 2, 1, 2, 0, 0, 2, 0, 2, 0, 0, 0, 1, 2, 1, 0, 1, 0,
        2, 0, 0, 0, 1, 0, 2, 2, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2,
        2, 2, 0, 2, 2, 1, 2, 1, 1, 0, 2, 2, 2, 1, 1, 0, 0, 2, 0, 2, 2, 2, 1, 1,
        1, 1, 2, 2, 2, 0, 2, 0, 2, 1, 1, 2, 0, 0, 2, 2, 1, 0, 2, 2, 2, 1, 1, 1,
        0, 1, 0, 2, 0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 1, 0, 2,
        2, 1, 1, 0, 2, 2, 1, 1, 0, 2, 2, 1, 1, 2, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0,
        0, 0, 0, 2, 2, 2, 2, 2])

Deep model

A typical architecture uses a sequence of one or more convolutional blocks, each consisting of three operations (a quick shape check follows the list):

  • Convolution

  • Activation function

  • Pooling
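
Here is a minimal sanity check of the block’s shape arithmetic, assuming the same 10x10 single-channel inputs used below: a 3x3 convolution without padding shrinks 10x10 to 8x8, and 2x2 max pooling halves that to 4x4.

block = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3),  # 10x10 -> 8x8
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),                              # 8x8 -> 4x4
)
print(block(torch.zeros(1, 1, 10, 10)).shape)  # torch.Size([1, 1, 4, 4])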

And for multiclass problems the loss function has to match the last layer: just as Sigmoid pairs with BCELoss in the binary case, raw logits (no output activation) pair with CrossEntropyLoss, while a LogSoftmax last layer pairs with NLLLoss. The two pairings are equivalent:
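
A minimal sketch of that equivalence on random logits (the dummy names are just for illustration):

torch.manual_seed(11)
dummy_logits = torch.randn(4, 3)           # batch of 4 samples, 3 classes
dummy_labels = torch.tensor([0, 2, 1, 2])  # class indices, shape (N,)

ce = nn.CrossEntropyLoss()(dummy_logits, dummy_labels)
nll = nn.NLLLoss()(nn.LogSoftmax(dim=-1)(dummy_logits), dummy_labels)
print(ce, nll)  # identical values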

Let’s build a model:

torch.manual_seed(13)
model_cnn1 = nn.Sequential()

# Featurizer
# Block 1: 1@10x10 -> n_channels@8x8 -> n_channels@4x4
n_channels = 1
model_cnn1.add_module('conv1', nn.Conv2d(in_channels=1, out_channels=n_channels, kernel_size=3))
model_cnn1.add_module('relu1', nn.ReLU())
model_cnn1.add_module('maxp1', nn.MaxPool2d(kernel_size=2))
# Flattening: n_channels * 4 * 4
model_cnn1.add_module('flatten', nn.Flatten())

# Classification
# Hidden Layer
model_cnn1.add_module('fc1', nn.Linear(in_features=n_channels*4*4, out_features=10))
model_cnn1.add_module('relu2', nn.ReLU())
# Output Layer
model_cnn1.add_module('fc2', nn.Linear(in_features=10, out_features=3))

lr = 0.1
multi_loss_fn = nn.CrossEntropyLoss(reduction='mean')
optimizer_cnn1 = optim.SGD(model_cnn1.parameters(), lr=lr)

Let’s just run one batch to get a sense of the loss:

x, y = next(iter(train_loader))
y_pred = model_cnn1(x)
print(y.shape)
print(y_pred.shape)
nn.CrossEntropyLoss()(y_pred,y)
torch.Size([16])
torch.Size([16, 3])
tensor(0.2768, grad_fn=<NllLossBackward0>)

One important observation: y has shape (16,), while y_pred has shape (16, 3); this is exactly what CrossEntropyLoss expects (see the docs).
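
To make the connection explicit, here is a sketch that reproduces the loss by hand: log-softmax the logits, gather each sample’s true-class log-probability, then negate and average.

log_probs = nn.functional.log_softmax(y_pred, dim=-1)
manual_loss = -log_probs.gather(1, y.unsqueeze(1)).mean()
print(manual_loss)  # same value as above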

sbs = StepByStep(model_cnn1, optimizer_cnn1, multi_loss_fn)
# sbs.set_seed()
sbs.set_loaders(train_loader, val_loader)
sbs.train(20)
100%|██████████| 20/20 [00:02<00:00,  7.16it/s]
fig = sbs.plot_losses()

Code
print('Correct categories:')
print(sbs.loader_apply(sbs.val_loader, sbs.correct))
Correct categories:
tensor([[60, 71],
        [46, 56],
        [73, 73]])
Code
print(f'Accuracy: {sbs.accuracy}%')
Accuracy: 89.5%

This is not the greatest accuracy for label 0 (parallel lines) and label 1 (counter-diagonal). One can probably find a better model (TODO), but for now let’s inspect what failed.
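
Since the counts above are correct vs. total per class, a one-liner (a sketch) turns them into per-class recall:

counts = sbs.loader_apply(sbs.val_loader, sbs.correct)
recall = counts[:, 0] / counts[:, 1]
print(recall)  # class 2 is perfect, classes 0 and 1 lag behind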

Visualize error outputs

# Predict all points at once here, no batches. Note that indexing
# dataset.x directly bypasses __getitem__, so the Normalize transform
# is not applied to these inputs.
logits = sbs.predict(val_loader.dataset.x)
predicted = np.argmax(logits, 1)
logits.shape
(200, 3)
val_loader.dataset.x.shape
torch.Size([200, 1, 10, 10])
predicted
array([2, 0, 2, 1, 0, 0, 0, 1, 1, 2, 1, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       0, 1, 2, 1, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1, 2, 0, 2, 1, 1, 1, 2, 1,
       0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 0, 0, 2, 0, 2, 0, 0, 0,
       1, 2, 1, 2, 1, 0, 2, 0, 0, 0, 2, 0, 2, 2, 1, 0, 0, 1, 2, 0, 2, 1,
       2, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1,
       1, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 0, 2, 0, 2, 1, 1, 2,
       0, 0, 2, 2, 1, 0, 2, 2, 2, 1, 1, 1, 0, 1, 2, 2, 0, 2, 1, 2, 2, 0,
       2, 0, 0, 0, 0, 1, 1, 2, 0, 1, 2, 1, 2, 2, 2, 1, 2, 0, 2, 2, 1, 1,
       0, 2, 2, 1, 1, 2, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 0, 2, 0, 2, 2, 2,
       2, 2])
not_equal = torch.ne(val_loader.dataset.y, torch.as_tensor(predicted))
images_tensor = val_loader.dataset.x[not_equal]
actual_labels_tensor = val_loader.dataset.y[not_equal]
# predicted is a NumPy array, so index it with a NumPy mask
pred_labels_tensor = predicted[not_equal.numpy()]
featurizer_layers = ['conv1', 'relu1', 'maxp1', 'flatten']
classifier_layers = ['fc1', 'relu2', 'fc2']

sbs.attach_hooks(layers_to_hook=featurizer_layers + classifier_layers)

start_idx = 0
batch_size = 10
images_batch = images_tensor[start_idx:start_idx+batch_size]
labels_batch = actual_labels_tensor[start_idx:start_idx+batch_size]

logits = sbs.predict(images_batch)
predicted = np.argmax(logits, 1)
sbs.remove_hooks()
with plt.style.context('seaborn-white'):
    fig_maps1 = sbs.visualize_outputs(featurizer_layers)
    fig_maps2 = sbs.visualize_outputs(classifier_layers, y=labels_batch, yhat=predicted)

And we see that many of the predictions default to class label 2. For images 2, 3, 7, and 8, the filter failed to register anything.
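
A sketch for double-checking that bias: converting this error batch’s logits into softmax probabilities shows how confidently the model leans toward class 2.

probs = torch.softmax(torch.as_tensor(logits), dim=-1).numpy()
print(probs.round(2))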

Ordinary batch visualization:

fig_filters = sbs.visualize_filters('conv1', cmap='gray')

featurizer_layers = ['conv1', 'relu1', 'maxp1', 'flatten']
classifier_layers = ['fc1', 'relu2', 'fc2']

sbs.attach_hooks(layers_to_hook=featurizer_layers + classifier_layers)

images_batch, labels_batch = next(iter(val_loader))
logits = sbs.predict(images_batch)
predicted = np.argmax(logits, 1)
sbs.remove_hooks()
with plt.style.context('seaborn-white'):
    fig_maps1 = sbs.visualize_outputs(featurizer_layers)
    fig_maps2 = sbs.visualize_outputs(classifier_layers, y=labels_batch, yhat=predicted)