We'll cover how to build a multi-output (three-headed) vision model for the Bengali.AI competition
from fastai.vision.all import *
Below you will find the exact imports for everything we use today from the fastai library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import partial
from torch import nn
import torch.nn.functional as F
from fastai.callback.progress import ProgressCallback
from fastai.callback.schedule import fit_one_cycle
from fastai.data.block import CategoryBlock
from fastai.data.transforms import ColReader, Normalize, RandomSplitter
from fastai.learner import Metric, Learner
from fastai.torch_core import Module
from fastai.vision.augment import aug_transforms
from fastai.vision.core import PILImageBW
from fastai.vision.data import ImageBlock
from fastai.vision.learner import create_body, create_head
from zipfile import ZipFile
# Unpack the competition archive into the local `bengaliai` folder.
with ZipFile('bengaliai.zip', 'r') as archive:
    archive.extractall('bengaliai')
# Dimensions used to reshape each flattened parquet row into an image.
height, width = 137, 236
# Default side length of the square images produced by `crop_resize`.
size = 128
The data is stored in four parquet files. We can read these into Pandas
# File-name pieces of the four training shards: <prefix><index><suffix>.
tn, suf = 'train_image_data_', '.parquet'
train = [f'{tn}{i}{suf}' for i in range(4)]
train
# Archive that will receive the pre-processed PNG images.
out_train = 'train.zip'
import cv2
import pandas as pd
def bbox(img):
    """Return `(rmin, rmax, cmin, cmax)`: the first and last row and column of `img` holding a truthy value.

    Both ends are inclusive indices. Raises `IndexError` when `img` has no
    truthy entry at all.
    """
    # Indices of the rows / columns that contain at least one truthy element.
    row_idx = np.nonzero(np.any(img, axis=1))[0]
    col_idx = np.nonzero(np.any(img, axis=0))[0]
    rmin, rmax = row_idx[0], row_idx[-1]
    cmin, cmax = col_idx[0], col_idx[-1]
    return rmin, rmax, cmin, cmax
def crop_resize(img0, size=size, pad=16):
    """Crop `img0` around its bright pixels, pad the crop to a square and resize it.

    Args:
        img0: 2-D array indexed `[row, column]`; pixels above 80 count as content.
            The array is not modified.
        size: side length of the square output handed to `cv2.resize`.
        pad: extra pixels added to the longer crop side before the crop is
            centred in the square.

    Returns:
        The result of `cv2.resize(padded_crop, (size, size))`.

    Raises:
        IndexError: propagated from `bbox` when no pixel inside the 5-pixel
            border exceeds 80.
    """
    # Clamp against the image's own shape rather than module-level globals, so
    # inputs of any size are handled correctly.
    img_h, img_w = img0.shape[:2]
    # Crop a box around pixels larger than the threshold; the 5-pixel border is
    # ignored because some images contain lines at the sides.
    # NOTE(review): the bounds returned here are relative to the trimmed view
    # (shifted by 5) but are applied to `img0` unshifted; kept as-is so existing
    # outputs do not change.
    ymin, ymax, xmin, xmax = bbox(img0[5:-5, 5:-5] > 80)
    # Cropping may cut too much, so add a margin back, clamped to the image.
    xmin = xmin - 13 if xmin > 13 else 0
    ymin = ymin - 10 if ymin > 10 else 0
    xmax = xmax + 13 if xmax < img_w - 13 else img_w
    ymax = ymax + 10 if ymax < img_h - 10 else img_h
    # Copy the crop: a plain slice is a view, and zeroing it in place would
    # silently modify the caller's array.
    img = img0[ymin:ymax, xmin:xmax].copy()
    # Remove low-intensity pixels as noise.
    img[img < 28] = 0
    lx, ly = xmax - xmin, ymax - ymin
    side = max(lx, ly) + pad
    # Pad both axes towards a square so the aspect ratio is kept when rescaling.
    img = np.pad(img, [((side - ly) // 2,), ((side - lx) // 2,)], mode='constant')
    return cv2.resize(img, (size, size))
# Load the first training shard and look at its first rows.
df = pd.read_parquet(f'bengaliai/{train[0]}')
df.head()
import matplotlib.pyplot as plt
import numpy as np
# Show the first `n_imgs` samples: original on the left, cropped/resized on the right.
n_imgs = 8
fig, axs = plt.subplots(nrows=n_imgs, ncols=2, figsize=(10, 5 * n_imgs))
for idx, (ax_raw, ax_crop) in enumerate(axs):
    # The stored pixel values are inverted, so flip them back.
    img0 = 255 - df.iloc[idx, 1:].values.reshape(height, width).astype(np.uint8)
    # Normalize each image by its max value before cropping.
    scale = 255.0 / img0.max()
    img = (img0 * scale).astype(np.uint8)
    img = crop_resize(img)
    panels = ((ax_raw, img0, 'Original image'), (ax_crop, img, 'Crop & resize'))
    for ax, picture, title in panels:
        ax.imshow(picture)
        ax.set_title(title)
        ax.axis('off')
plt.show()
And now we generate the dataset
from tqdm import tqdm
# Per-image mean and mean-of-squares of the [0, 1]-scaled pixels, collected so
# that dataset-wide statistics can be derived afterwards.
x_tot, x2_tot = [], []
with ZipFile(out_train, 'w') as img_out:
    for fname in train:
        df = pd.read_parquet('bengaliai/' + fname)
        # The stored pixel values are inverted; every row becomes one height x width image.
        data = 255 - df.iloc[:, 1:].values.reshape(-1, height, width).astype(np.uint8)
        for idx in tqdm(range(len(df))):
            name = df.iloc[idx, 0]
            raw = data[idx]
            # Normalize each image by its max value, then crop and resize it.
            img = (raw * (255.0 / raw.max())).astype(np.uint8)
            img = crop_resize(img)
            unit = img / 255.0
            x_tot.append(unit.mean())
            x2_tot.append((unit ** 2).mean())
            # Store the PNG-encoded bytes under "<image id>.png".
            img = cv2.imencode('.png', img)[1]
            img_out.writestr(name + '.png', img)
We're finally going to use our own mean and standard deviation for normalizing:
# Dataset statistics from the per-image moments: std = sqrt(E[x^2] - E[x]^2).
mean = np.mean(x_tot)
mean_sq = np.mean(x2_tot)
std = np.sqrt(mean_sq - mean**2)
print(f'mean: {mean}, std: {std}')
From here we can begin using fastai
Building the DataBlock
# Unpack the generated PNG archive into the `images` folder read by the getters.
with ZipFile('train.zip', 'r') as archive:
    archive.extractall('images')
from fastai.vision.all import *
Out of the three/four csv files that came with it, we care about train.csv
# Load the label tables that ship with the competition data.
train, test, class_map = (
    pd.read_csv(f'bengaliai/{stem}.csv') for stem in ('train', 'test', 'class_map')
)
train.head()
First we need to know how many possible outcomes there can be. We can do this by calling the `.unique()` method on each target column of our dataframe
# Distinct label values of each target column, used as the category vocabularies.
target_columns = ('grapheme_root', 'vowel_diacritic', 'consonant_diacritic')
graph_vocab, vowel_vocab, const_vocab = (train[col].unique() for col in target_columns)
And now we generate the blocks for our `DataBlock`. We want one for our input (a B/W 2D image), and three `CategoryBlock`s for our outcomes, each with our pre-determined vocabularies
# One black-and-white image input followed by one category block per target.
blocks = (
    ImageBlock(cls=PILImageBW),
    *(CategoryBlock(vocab=vocab) for vocab in (graph_vocab, vowel_vocab, const_vocab)),
)
Our four getters:
# The image getter turns `image_id` into 'images/<image_id>.png'; the other
# three read the label columns directly.
getters = [ColReader('image_id', pref='images/', suff='.png')] + [
    ColReader(col) for col in ('grapheme_root', 'vowel_diacritic', 'consonant_diacritic')
]
And some transforms. We're going to normalize with the `mean` and `std` we computed earlier (hard-coded below as 0.0692 and 0.2051)
# Augmentations without flipping, followed by normalization.
# The normalization statistics are hard-coded literals here.
batch_tfms = list(aug_transforms(do_flip=False, size=128)) + [
    Normalize.from_stats(mean=0.0692, std=0.2051)
]
Finally, we make our `DataBlock`. We should also declare that only the first of our blocks is our input
# Batch size used when building the DataLoaders.
bs = 128
# `n_inp=1`: only the first block (the image) is an input; the other blocks are targets.
bengel = DataBlock(
    blocks=blocks,
    n_inp=1,
    getters=getters,
    splitter=RandomSplitter(),
    batch_tfms=batch_tfms,
)
For the dataset itself, since training on the full set takes a long time, we'll use a random sample of 1,000 rows
# Build the DataLoaders from a random 1000-row subset of the label table.
subset = train.sample(1000)
dls = bengel.dataloaders(subset, bs=bs)
dls.show_batch(max_n=1, figsize=(3,3))
# Number of distinct labels per target over the full label table.
label_columns = ['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']
n = train[label_columns].nunique()
print(n)
dls.c
For the body of our model, we'll use a standard resnet34. We need to modify the first ConvLayer
so that it accepts a single-channel (grayscale) image instead of a three-channel one. There are a few ways to go about this. I chose this particular method:
body = create_body(resnet34, pretrained=True)
# Replacement stem: same geometry as the layer it replaces, but one input channel.
l = nn.Conv2d(1, 64, kernel_size=(7,7), stride=(2,2),
              padding=(3,3), bias=False)
# Collapse the *pretrained* filters over their input-channel axis so the new
# single-channel stem starts from the pretrained weights. Summing `l.weight`
# itself (freshly initialised, input-channel axis already of size 1) was a
# no-op that discarded the pretrained first layer.
l.weight = nn.Parameter(body[0].weight.detach().sum(dim=1, keepdim=True))
body[0] = l
from fastdot import *
def _fillcolor(o:str):
    "Return the fill colour for node label `o`: the first matching keyword wins, 'gold' otherwise"
    # Checked in order, so a label containing several keywords takes the earliest entry.
    palette = (
        ('Lin', 'lightblue'),
        ('ReLU', 'gray'),
        ('Flatten', 'white'),
        ('Pooling', 'pink'),
        ('Conv', 'white'),
    )
    return next((colour for keyword, colour in palette if keyword in o), 'gold')
# Register `_fillcolor` as the default fill colour for fastdot nodes.
node_defaults['fillcolor'] = _fillcolor
# Node labels for the three heads; they differ only in the width of the last 'Lin' label.
graph = ['2DPooling', 'Flatten', 'Lin (1024, 512)', 'ReLU', 'Lin (512, 168)']
vow = ['2DPooling', 'Flatten', 'Lin (1024, 512)', 'ReLU', 'Lin (512, 11)']
const = ['2DPooling', 'Flatten', 'Lin (1024, 512)', 'ReLU', 'Lin (512, 7)']
# Single-node lists for the shared body and for the combined output.
body1 = ['Conv2d (3, 512, 512)']
out = ['G (bs, 168)\n V (bs, 11)\n C (bs, 7)']
# Cluster titles.
block1, block2, block3, block4, block5 = ['ResNet Body', 'Grapheme Head', 'Vowel Head', 'Consonant Head', 'Model Output']
# Extra edges: body -> each head, each head -> output, and inside every head an
# edge from index -3 ('Lin (1024, 512)') to index 4 (the final 'Lin').
conns = ((block1, block2),
         (block1, block3),
         (block1, block4),
         (block4, block5),
         (block3, block5),
         (block2, block5),
         (graph[-3], graph[4]),
         (vow[-3], vow[4]),
         (const[-3], const[4]))
# One cluster per label list, titled with the matching block name.
visual = graph_items(seq_cluster(body1, block1),
                     seq_cluster(graph, block2),
                     seq_cluster(vow, block3),
                     seq_cluster(const, block4),
                     seq_cluster(out, block5))
visual.add_items(*object_connections(conns))
And now let's take a look at the head of the model we plan on making, with some assistance from the fastdot
library (if you want to see how I do this, look at the Helper Functions section)
# Bare expression on the fastdot graph object; presumably rendered as the cell output in a notebook.
visual
from fastai.vision.all import *
class MultiModel(Module):
    "A shared `body` followed by three heads sized by the first three entries of `n`"
    def __init__(self, body:nn.Sequential, n:L):
        # The body's feature count is doubled before being handed to `create_head`.
        # NOTE(review): presumably because the head concatenates two pooled
        # feature maps -- confirm against the installed fastai version.
        nf = 2 * num_features_model(nn.Sequential(*body.children()))
        n_grapheme, n_vowel, n_consonant = n[0], n[1], n[2]
        self.body = body
        self.grapheme = create_head(nf, n_grapheme)
        self.vowel = create_head(nf, n_vowel)
        self.consonant = create_head(nf, n_consonant)

    def forward(self, x):
        "Run `x` through the body once and return `[grapheme, vowel, consonant]` head outputs"
        features = self.body(x)
        heads = (self.grapheme, self.vowel, self.consonant)
        return [head(features) for head in heads]
# Build the three-headed model; `dls.c` supplies the output size of each head.
net = MultiModel(body, dls.c)
We're going to want a custom loss function here. We'll base it on Miguel Pinto's notebook here.
from sklearn.metrics import recall_score
class CombinationLoss(Module):
    "Weighted sum of a per-target loss (cross entropy by default) over several outputs"
    def __init__(self, func=F.cross_entropy, weights=(2, 1, 1)):
        # The default is an immutable tuple so instances never share one mutable list.
        self.func, self.w = func, weights

    def forward(self, xs, *ys, reduction='mean'):
        """Return `sum(w_i * func(xs[i], ys[i], reduction=reduction))`.

        Args:
            xs: sequence of model outputs, one per target.
            *ys: the matching targets, in the same order as `xs`.
            reduction: forwarded unchanged to `func`.

        Weights, outputs and targets are paired with `zip`, so any surplus
        entries of the longer sequences are ignored.

        Raises:
            ValueError: if there is nothing to combine (no weights, outputs or targets).
        """
        terms = [w * self.func(x, y, reduction=reduction)
                 for w, x, y in zip(self.w, xs, ys)]
        if not terms:
            raise ValueError("CombinationLoss needs at least one (weight, output, target) triple")
        loss = terms[0]
        for term in terms[1:]:
            loss = loss + term
        return loss
class RecallPartial(Metric):
    "Macro-averaged recall for target number `a`, computed over all accumulated batches"
    def __init__(self, a=0, **kwargs):
        # `a` selects which prediction/target pair this metric follows; extra kwargs are ignored.
        self.a = a
        self.func = partial(recall_score, average='macro', zero_division=0)

    def reset(self):
        "Forget everything accumulated so far"
        self.targs, self.preds = [], []

    def accumulate(self, learn):
        "Store the detached class predictions and targets of the current batch"
        which = self.a
        batch_pred = to_detach(learn.pred[which].argmax(dim=-1))
        batch_targ = to_detach(learn.y[which])
        batch_pred, batch_targ = flatten_check(batch_pred, batch_targ)
        self.preds.append(batch_pred)
        self.targs.append(batch_targ)

    @property
    def value(self):
        "Recall over everything accumulated, or `None` when nothing has been accumulated"
        if not self.preds:
            return None
        all_targs = torch.cat(self.targs)
        all_preds = torch.cat(self.preds)
        return self.func(all_targs, all_preds)

    @property
    def name(self):
        # Named after the column of the module-level `train` table at position `a + 1`.
        return train.columns[self.a+1]
class RecallCombine(Metric):
    "Weighted average (weights 2, 1, 1) of the values of the learner's first three metrics"
    def accumulate(self, learn):
        # Only remember which metrics to combine. fastai calls `accumulate` once
        # per batch, and reading `.value` of the component metrics here would
        # recompute each recall over everything accumulated so far on every batch.
        self._parts = [learn.metrics[i] for i in range(3)]

    @property
    def value(self):
        # Evaluated when the metric is read, so each component recall is
        # computed once over all the batches it has accumulated.
        scores = [part.value for part in self._parts]
        return np.average(scores, weights=[2,1,1])
Finally we'll declare our model:
# One partial-recall metric per target, followed by their weighted combination.
recall_metrics = [RecallPartial(a=i) for i in range(len(dls.c))]
recall_metrics.append(RecallCombine())
learn = Learner(dls, net, loss_func=CombinationLoss(), metrics=recall_metrics)
And train for a few epochs!
# Train with the one-cycle schedule, passing 10 and 1e-3 as its first two arguments.
learn.fit_one_cycle(10, 1e-3)