EfficientNet (Paper Review)
EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks
Residual Networks (ResNets) allowed researchers to build deep networks with many layers without suffering from vanishing gradients or losing information from earlier layers.
However, simply stacking more layers to build a deeper, larger model creates several problems.
First, such deep networks are difficult and time-consuming to train. Second, people expect that the deeper the model, the better the performance; in practice, however, accuracy improves only marginally compared to models with far fewer layers.
In other words, training deeper networks is expensive, yet the gains do not justify the cost.
Researchers therefore had to find ways to build effective models without simply adding more layers.
One approach is Wide Residual Networks (WideResNets), which decrease the depth and increase the width of residual networks.
Increasing the width of a layer means increasing its number of channels (feature maps), which improves the overall accuracy of the model.
A second way to improve accuracy is to use input images with higher resolution.
Intuitively, the higher the resolution, the better the chance for the layers to extract important information and features from the images.
Lastly, as one might expect, deeper networks also tend to be more accurate.
In addition, EfficientNet's building block combines the inverted residual block (with depthwise convolutions) introduced in MobileNetV2 with a squeeze-and-excitation block.
I will cover the details of SqueezeExcitation, InvertedResidualBlock, and depthwise convolution in a future post; a small preview of depthwise convolution is sketched below.
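As a minimal, illustrative sketch (not from the paper): in PyTorch, a depthwise convolution is an nn.Conv2d whose groups argument equals the number of input channels, so each channel gets its own spatial filter. This is exactly how the CNNBlock below behaves when it is called with groups=hidden_dim. The channel count and input size here are arbitrary.

import torch
import torch.nn as nn

x = torch.randn(1, 8, 32, 32)  # a feature map with 8 channels
depthwise = nn.Conv2d(8, 8, kernel_size=3, padding=1, groups=8)  # one 3x3 filter per channel
print(depthwise(x).shape)  # torch.Size([1, 8, 32, 32])
print(sum(p.numel() for p in depthwise.parameters()))  # 8 * 3 * 3 + 8 = 80 parameters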
The EfficientNet paper proposes a compound scaling method.
Instead of scaling one dimension at a time, this method scales network depth, width, and resolution together using a single compound coefficient φ: depth is scaled by α^φ, width by β^φ, and resolution by γ^φ, where α, β, and γ are constants found by a small grid search.
Users can choose φ based on the resources available.
The number of floating-point operations (FLOPs) of a regular convolution is proportional to d, w², and r² (d = depth factor, w = width factor, r = resolution factor).
In the paper, α, β, and γ are chosen so that α · β² · γ² ≈ 2, so the total FLOPs increase by approximately 2^φ, since the FLOPs of the scaled network grow by a factor of (α · β² · γ²)^φ.
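Here is a minimal sketch of how the scaling factors follow from φ, assuming α = 1.2, β = 1.1, γ = 1.15 (the grid-search values reported in the paper for the B0 baseline); the helper compound_scale is just an illustrative name, not from the paper's code.

alpha, beta, gamma = 1.2, 1.1, 1.15  # grid-search values reported in the paper

def compound_scale(phi):
    depth_factor = alpha ** phi       # multiplies the number of layers
    width_factor = beta ** phi        # multiplies the number of channels
    resolution_factor = gamma ** phi  # multiplies the input resolution
    flops_factor = (alpha * beta ** 2 * gamma ** 2) ** phi  # ≈ 2 ** phi
    return depth_factor, width_factor, resolution_factor, flops_factor

print(compound_scale(1))  # roughly (1.2, 1.1, 1.15, 1.92), i.e. FLOPs approximately double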
import torch
import torch.nn as nn
from math import ceil

base_model = [
    # expand_ratio, channels, repeats, stride, kernel_size
    [1, 16, 1, 1, 3],
    [6, 24, 2, 2, 3],
    [6, 40, 2, 2, 5],
    [6, 80, 3, 2, 3],
    [6, 112, 3, 1, 5],
    [6, 192, 4, 2, 5],
    [6, 320, 1, 1, 3],
]
phi_values = {
    # tuple of: (phi_value, resolution, drop_rate)
    # depth is scaled by alpha ** phi and width by beta ** phi (see calculate_factors below)
    "b0": (0, 224, 0.2),
    "b1": (0.5, 240, 0.2),
    "b2": (1, 260, 0.3),
    "b3": (2, 300, 0.3),
    "b4": (3, 380, 0.4),
    "b5": (4, 456, 0.4),
    "b6": (5, 528, 0.5),
    "b7": (6, 600, 0.5),
}
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups=1):
        super(CNNBlock, self).__init__()
        self.cnn = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            groups=groups,  # groups=in_channels gives a depthwise convolution
            bias=False,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.silu = nn.SiLU()  # SiLU == Swish

    def forward(self, x):
        return self.silu(self.bn(self.cnn(x)))
class SqueezeExcitation(nn.Module):  # computes an attention score for each channel
    def __init__(self, in_channels, reduced_dim):
        super(SqueezeExcitation, self).__init__()
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),  # C x H x W -> C x 1 x 1
            nn.Conv2d(in_channels, reduced_dim, 1),
            nn.SiLU(),
            nn.Conv2d(reduced_dim, in_channels, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return x * self.se(x)
class InvertedResidualBlock(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        expand_ratio,
        reduction=4,  # reduced dimensionality for squeeze excitation
        survival_prob=0.8,  # for stochastic depth
    ):
        super(InvertedResidualBlock, self).__init__()
        self.survival_prob = survival_prob
        self.use_residual = in_channels == out_channels and stride == 1
        hidden_dim = in_channels * expand_ratio
        self.expand = in_channels != hidden_dim
        reduced_dim = int(in_channels / reduction)

        if self.expand:
            self.expand_conv = CNNBlock(
                in_channels, hidden_dim, kernel_size=3, stride=1, padding=1,
            )

        self.conv = nn.Sequential(
            CNNBlock(
                hidden_dim, hidden_dim, kernel_size, stride=stride, padding=padding, groups=hidden_dim,
            ),
            SqueezeExcitation(hidden_dim, reduced_dim),
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
        )

    def stochastic_depth(self, x):
        # randomly drop the whole residual branch per example during training
        if not self.training:
            return x
        binary_tensor = torch.rand(x.shape[0], 1, 1, 1, device=x.device) < self.survival_prob
        return torch.div(x, self.survival_prob) * binary_tensor

    def forward(self, inputs):
        x = self.expand_conv(inputs) if self.expand else inputs
        if self.use_residual:
            return self.stochastic_depth(self.conv(x)) + inputs
        else:
            return self.conv(x)
class EfficientNet(nn.Module):
    def __init__(self, version, num_classes):
        super(EfficientNet, self).__init__()
        width_factor, depth_factor, dropout_rate = self.calculate_factors(version)
        last_channels = ceil(1280 * width_factor)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.features = self.create_features(width_factor, depth_factor, last_channels)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(last_channels, num_classes),
        )

    def calculate_factors(self, version, alpha=1.2, beta=1.1):
        phi, res, drop_rate = phi_values[version]
        depth_factor = alpha ** phi
        width_factor = beta ** phi
        return width_factor, depth_factor, drop_rate

    def create_features(self, width_factor, depth_factor, last_channels):
        channels = int(32 * width_factor)
        features = [CNNBlock(3, channels, 3, stride=2, padding=1)]
        in_channels = channels

        for expand_ratio, channels, repeats, stride, kernel_size in base_model:
            out_channels = 4 * ceil(int(channels * width_factor) / 4)  # keep channels divisible by 4
            layers_repeats = ceil(repeats * depth_factor)

            for layer in range(layers_repeats):
                features.append(
                    InvertedResidualBlock(
                        in_channels,
                        out_channels,
                        expand_ratio=expand_ratio,
                        stride=stride if layer == 0 else 1,
                        kernel_size=kernel_size,
                        padding=kernel_size // 2,  # kernel 1 -> pad 0, kernel 3 -> pad 1, kernel 5 -> pad 2
                    )
                )
                in_channels = out_channels

        features.append(
            CNNBlock(in_channels, last_channels, kernel_size=1, stride=1, padding=0)
        )
        return nn.Sequential(*features)

    def forward(self, x):
        x = self.pool(self.features(x))
        return self.classifier(x.view(x.shape[0], -1))
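As a quick sanity check of the implementation above, the model can be instantiated and run on random input; the choice of version "b0", batch size 4, and 10 classes is arbitrary for this example.

model = EfficientNet(version="b0", num_classes=10)
x = torch.randn(4, 3, 224, 224)  # batch of 4 RGB images at the B0 resolution
print(model(x).shape)  # expected: torch.Size([4, 10])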