Commit 720a88ff by Ting PAN

Init repository

[flake8]
max-line-length = 120
ignore = E741, # ambiguous variable name
F403, # ‘from module import *’ used; unable to detect undefined names
F405, # name may be undefined, or defined from star imports: module
F811, # redefinition of unused name from line N
F821, # undefined name
W503, # line break before binary operator
W504 # line break after binary operator
# module imported but unused
per-file-ignores = __init__.py: F401
# Compiled Object files
*.slo
*.lo
*.o
*.cuo
# Compiled Dynamic libraries
*.so
*.dll
*.dylib
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Compiled python
*.pyc
__pycache__
# Compiled MATLAB
*.mex*
# IPython notebook checkpoints
.ipynb_checkpoints
# Editor temporaries
*.swp
*~
# Sublime Text settings
*.sublime-workspace
*.sublime-project
# Eclipse Project settings
*.*project
.settings
# QtCreator files
*.user
# VSCode files
.vscode
# IDEA files
.idea
# OSX dir files
.DS_Store
# Android files
.gradle
*.iml
local.properties
Copyright (c) 2017, SeetaTech, Co.,Ltd. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# SeetaBench
SeetaBench is a collection of benchmarks to evaluate device performance.
## Quick Start
See [Tutorial: How To Use](docs/TUTORIAL.md).
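
For example, to run all benchmarks with the PyTorch backend and save the results:

```
python run.py --backend torch -f ./results.json
```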
## License
[BSD 2-Clause license](LICENSE)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench GEMM performance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import torch
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--m', default=8192, type=int, help='row of matrix A')
parser.add_argument('--k', default=8192, type=int, help='col of matrix A')
parser.add_argument('--n', default=8192, type=int, help='col of matrix B')
parser.add_argument('--gemm-efficiency', default=0.9, type=float, help='gemm impl efficiency')
return parser.parse_args()
def bench_gemm_flops(m=8192, k=8192, n=8192, precision=torch.float32,
device=0, allow_tf32=False, efficiency=0.9):
torch.backends.cuda.matmul.allow_tf32 = allow_tf32
a = torch.zeros(m, k, dtype=precision).cuda(device)
b = torch.zeros(k, n, dtype=precision).cuda(device)
for _ in range(5): # Warmup.
_ = torch.matmul(a, b)
torch.cuda.synchronize(device)
tic = time.time()
for _ in range(30):
_ = torch.matmul(a, b)
torch.cuda.synchronize(device)
average_time = (time.time() - tic) / 30.0
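# An (M, K) x (K, N) matmul performs 2 * M * K * N FLOPs
# (one multiply and one add per output element).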
tflops = (a.numel() * b.size(-1)) * 2 / 1e12
if precision == torch.float16:
title = 'Float16'
elif precision == torch.float32:
title = 'TensorFloat32' if allow_tf32 else 'Float32'
else:
raise ValueError('Unknown precision: ' + str(precision))
title += '-GEMM (M={}, K={}, N={}):'.format(m, k, n)
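# Report achieved TFLOPS and, in parentheses, the projected peak
# assuming the given implementation efficiency.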
print(title, '{:.2f} ({:.2f}) TFLOPS'
.format(tflops / average_time, tflops / efficiency / average_time))
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float16,
device=args.device, allow_tf32=False, efficiency=args.gemm_efficiency)
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float32,
device=args.device, allow_tf32=False, efficiency=args.gemm_efficiency)
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float32,
device=args.device, allow_tf32=True, efficiency=args.gemm_efficiency)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench GEMM performance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
from dragon.vm import torch
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--m', default=8192, type=int, help='row of matrix A')
parser.add_argument('--k', default=8192, type=int, help='col of matrix A')
parser.add_argument('--n', default=8192, type=int, help='col of matrix B')
parser.add_argument('--gemm-efficiency', default=0.9, type=float, help='gemm impl efficiency')
return parser.parse_args()
def bench_gemm_flops(m=8192, k=8192, n=8192, precision=torch.float32,
device=0, allow_tf32=False, efficiency=0.9):
torch.backends.cuda.allow_tf32 = allow_tf32
a = torch.zeros(m, k, dtype=precision).cuda(device)
b = torch.zeros(k, n, dtype=precision).cuda(device)
for _ in range(5): # Warmup.
_ = torch.matmul(a, b)
torch.cuda.synchronize(device)
tic = time.time()
for _ in range(30):
_ = torch.matmul(a, b)
torch.cuda.synchronize(device)
average_time = (time.time() - tic) / 30.0
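# An (M, K) x (K, N) matmul performs 2 * M * K * N FLOPs
# (one multiply and one add per output element).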
tflops = (a.numel() * b.size(-1)) * 2 / 1e12
if precision == torch.float16:
title = 'Float16'
elif precision == torch.float32:
title = 'TensorFloat32' if allow_tf32 else 'Float32'
else:
raise ValueError('Unknown precision: ' + str(precision))
title += '-GEMM (M={}, K={}, N={}):'.format(m, k, n)
print(title, '{:.2f} ({:.2f}) TFLOPS'
.format(tflops / average_time, tflops / efficiency / average_time))
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float16,
device=args.device, allow_tf32=False, efficiency=args.gemm_efficiency)
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float32,
device=args.device, allow_tf32=False, efficiency=args.gemm_efficiency)
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float32,
device=args.device, allow_tf32=True, efficiency=args.gemm_efficiency)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench MobileNetV3."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import functools
import time
import torch
import torch.nn as nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='mobilenet_v3_large', help='compute model')
parser.add_argument('--batch_size', default=128, type=int, help='mini-batch size')
return parser.parse_args()
def make_divisible(v, divisor=8):
"""Return the divisible value."""
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class ConvNorm2d(nn.Sequential):
"""2d convolution followed by norm."""
def __init__(
self,
dim_in,
dim_out,
kernel_size,
stride=1,
padding=None,
dilation=1,
groups=1,
bias=True,
norm_type='BatchNorm2d',
activation_type='',
inplace=True,
):
super(ConvNorm2d, self).__init__()
if padding is None:
padding = kernel_size // 2
layers = [nn.Conv2d(dim_in, dim_out,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias and (not norm_type))]
if norm_type:
layers += [getattr(nn, norm_type)(dim_out)]
if activation_type:
layers += [getattr(nn, activation_type)()]
layers[-1].inplace = inplace
for i, layer in enumerate(layers):
self.add_module(str(i), layer)
class SqueezeExcite(nn.Module):
"""Squeeze-and-Excitation block."""
def __init__(self, dim_in, dim):
super(SqueezeExcite, self).__init__()
self.conv1 = nn.Conv2d(dim_in, dim, 1)
self.conv2 = nn.Conv2d(dim, dim_in, 1)
self.activation1 = nn.ReLU(True)
self.activation2 = nn.Hardsigmoid(True)
def forward(self, x):
scale = x.mean((2, 3), keepdim=True)
scale = self.activation1(self.conv1(scale))
scale = self.activation2(self.conv2(scale))
return x * scale
class InvertedResidual(nn.Module):
"""Invert residual block."""
def __init__(
self,
dim_in,
dim_out,
kernel_size=3,
stride=1,
expand_ratio=3,
squeeze_ratio=1,
activation_type='ReLU',
):
super(InvertedResidual, self).__init__()
conv_module = functools.partial(
ConvNorm2d, activation_type=activation_type)
self.apply_shortcut = stride == 1 and dim_in == dim_out
self.dim = dim = int(round(dim_in * expand_ratio))
self.conv1 = (conv_module(dim_in, dim, 1)
if expand_ratio > 1 else nn.Identity())
self.conv2 = conv_module(dim, dim, kernel_size, stride, groups=dim)
self.se = (SqueezeExcite(dim, make_divisible(dim * squeeze_ratio))
if squeeze_ratio < 1 else nn.Identity())
self.conv3 = conv_module(dim, dim_out, 1, activation_type='')
def forward(self, x):
shortcut = x
x = self.conv1(x)
x = self.conv2(x)
x = self.se(x)
x = self.conv3(x)
if self.apply_shortcut:
return x.add_(shortcut)
return x
class MobileNetV3(nn.Module):
"""MobileNetV3 class."""
def __init__(self, depths, dims, kernel_sizes, strides,
expand_ratios, squeeze_ratios, width_mult=1.0,
dropout=0.2, num_classes=1000):
super(MobileNetV3, self).__init__()
conv_module = functools.partial(
ConvNorm2d, activation_type='Hardswish')
dims = list(map(lambda x: make_divisible(x * width_mult), dims))
self.conv1 = conv_module(3, dims[0], 3, 2)
dim_in, blocks, coarsest_stride = dims[0], [], 2
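# coarsest_stride tracks the cumulative downsampling; deeper blocks
# (overall stride >= 16) use Hardswish instead of ReLU.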
for i, (depth, dim) in enumerate(zip(depths, dims[1:])):
coarsest_stride *= strides[i]
layer_expand_ratios = expand_ratios[i]
if not isinstance(layer_expand_ratios, (tuple, list)):
layer_expand_ratios = [layer_expand_ratios]
layer_expand_ratios = list(layer_expand_ratios)
layer_expand_ratios += ([layer_expand_ratios[-1]] *
(depth - len(layer_expand_ratios)))
for j in range(depth):
blocks.append(InvertedResidual(
dim_in, dim,
kernel_size=kernel_sizes[i],
stride=strides[i] if j == 0 else 1,
expand_ratio=layer_expand_ratios[j],
squeeze_ratio=squeeze_ratios[i],
activation_type='Hardswish'
if coarsest_stride >= 16 else 'ReLU'))
dim_in = dim
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*blocks[-depth:]))
self.conv2 = conv_module(dim_in, blocks[-1].dim, 1)
self.blocks = blocks + [self.conv2]
# Head.
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(blocks[-1].dim, dims[-1]),
nn.Hardswish(),
nn.Dropout(p=dropout, inplace=True),
nn.Linear(dims[-1], num_classes),
) if num_classes > 0 else nn.Identity()
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
def forward(self, x):
x = self.conv1(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.avgpool(x).flatten(1))
def mobilenet_v3_large(num_classes=1000):
return MobileNetV3(
dims=(16,) + (16, 24, 40, 80, 112, 160) + (1280,),
depths=(1, 2, 3, 4, 2, 3),
kernel_sizes=(3, 3, 5, 3, 3, 5),
strides=(1, 2, 2, 2, 1, 2),
expand_ratios=(1, (4, 3), 3, (6, 2.5, 2.3, 2.3), 6, 6),
squeeze_ratios=(1, 1, 0.25, 1, 0.25, 0.25),
num_classes=num_classes)
def mobilenet_v3_small(num_classes=1000):
return MobileNetV3(
dims=(16,) + (16, 24, 40, 48, 96) + (1024,),
depths=(1, 2, 3, 2, 3),
kernel_sizes=(3, 3, 5, 5, 5),
strides=(2, 2, 2, 1, 2),
expand_ratios=(1, (4.5, 88. / 24), (4, 6, 6), 3, 6),
squeeze_ratios=(0.25, 1, 0.25, 0.25, 0.25),
num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
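# Run 5 timing windows of 30 iterations each; run.py later drops
# the first window as warmup when aggregating results.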
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
print({'iter': iter,
'throughput': round(30.0 / diff_time * input.size(0), 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench MobileNetV3."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import functools
import time
from dragon.vm import torch
from dragon.vm.torch import nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='mobilenet_v3_large', help='compute model')
parser.add_argument('--batch_size', default=128, type=int, help='mini-batch size')
return parser.parse_args()
def make_divisible(v, divisor=8):
"""Return the divisible value."""
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class ConvNorm2d(nn.Sequential):
"""2d convolution followed by norm."""
def __init__(
self,
dim_in,
dim_out,
kernel_size,
stride=1,
padding=None,
dilation=1,
groups=1,
bias=True,
norm_type='BatchNorm2d',
activation_type='',
inplace=True,
):
super(ConvNorm2d, self).__init__()
if padding is None:
padding = kernel_size // 2
layers = [nn.Conv2d(dim_in, dim_out,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias and (not norm_type))]
if norm_type:
layers += [getattr(nn, norm_type)(dim_out)]
if activation_type:
layers += [getattr(nn, activation_type)()]
layers[-1].inplace = inplace
for i, layer in enumerate(layers):
self.add_module(str(i), layer)
class SqueezeExcite(nn.Module):
"""Squeeze-and-Excitation block."""
def __init__(self, dim_in, dim):
super(SqueezeExcite, self).__init__()
self.conv1 = nn.Conv2d(dim_in, dim, 1)
self.conv2 = nn.Conv2d(dim, dim_in, 1)
self.activation1 = nn.ReLU(True)
self.activation2 = nn.Hardsigmoid(True)
def forward(self, x):
scale = x.mean((2, 3), keepdim=True)
scale = self.activation1(self.conv1(scale))
scale = self.activation2(self.conv2(scale))
return x * scale
class InvertedResidual(nn.Module):
"""Invert residual block."""
def __init__(
self,
dim_in,
dim_out,
kernel_size=3,
stride=1,
expand_ratio=3,
squeeze_ratio=1,
activation_type='ReLU',
):
super(InvertedResidual, self).__init__()
conv_module = functools.partial(
ConvNorm2d, activation_type=activation_type)
self.apply_shortcut = stride == 1 and dim_in == dim_out
self.dim = dim = int(round(dim_in * expand_ratio))
self.conv1 = (conv_module(dim_in, dim, 1)
if expand_ratio > 1 else nn.Identity())
self.conv2 = conv_module(dim, dim, kernel_size, stride, groups=dim)
self.se = (SqueezeExcite(dim, make_divisible(dim * squeeze_ratio))
if squeeze_ratio < 1 else nn.Identity())
self.conv3 = conv_module(dim, dim_out, 1, activation_type='')
def forward(self, x):
shortcut = x
x = self.conv1(x)
x = self.conv2(x)
x = self.se(x)
x = self.conv3(x)
if self.apply_shortcut:
return x.add_(shortcut)
return x
class MobileNetV3(nn.Module):
"""MobileNetV3 class."""
def __init__(self, depths, dims, kernel_sizes, strides,
expand_ratios, squeeze_ratios, width_mult=1.0,
dropout=0.2, num_classes=1000):
super(MobileNetV3, self).__init__()
conv_module = functools.partial(
ConvNorm2d, activation_type='Hardswish')
dims = list(map(lambda x: make_divisible(x * width_mult), dims))
self.conv1 = conv_module(3, dims[0], 3, 2)
dim_in, blocks, coarsest_stride = dims[0], [], 2
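# coarsest_stride tracks the cumulative downsampling; deeper blocks
# (overall stride >= 16) use Hardswish instead of ReLU.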
for i, (depth, dim) in enumerate(zip(depths, dims[1:])):
coarsest_stride *= strides[i]
layer_expand_ratios = expand_ratios[i]
if not isinstance(layer_expand_ratios, (tuple, list)):
layer_expand_ratios = [layer_expand_ratios]
layer_expand_ratios = list(layer_expand_ratios)
layer_expand_ratios += ([layer_expand_ratios[-1]] *
(depth - len(layer_expand_ratios)))
for j in range(depth):
blocks.append(InvertedResidual(
dim_in, dim,
kernel_size=kernel_sizes[i],
stride=strides[i] if j == 0 else 1,
expand_ratio=layer_expand_ratios[j],
squeeze_ratio=squeeze_ratios[i],
activation_type='Hardswish'
if coarsest_stride >= 16 else 'ReLU'))
dim_in = dim
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*blocks[-depth:]))
self.conv2 = conv_module(dim_in, blocks[-1].dim, 1)
self.blocks = blocks + [self.conv2]
# Head.
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(blocks[-1].dim, dims[-1]),
nn.Hardswish(),
nn.Dropout(p=dropout, inplace=True),
nn.Linear(dims[-1], num_classes),
) if num_classes > 0 else nn.Identity()
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
def forward(self, x):
x = self.conv1(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.avgpool(x).flatten_(1))
def mobilenet_v3_large(num_classes=1000):
return MobileNetV3(
dims=(16,) + (16, 24, 40, 80, 112, 160) + (1280,),
depths=(1, 2, 3, 4, 2, 3),
kernel_sizes=(3, 3, 5, 3, 3, 5),
strides=(1, 2, 2, 2, 1, 2),
expand_ratios=(1, (4, 3), 3, (6, 2.5, 2.3, 2.3), 6, 6),
squeeze_ratios=(1, 1, 0.25, 1, 0.25, 0.25),
num_classes=num_classes)
def mobilenet_v3_small(num_classes=1000):
return MobileNetV3(
dims=(16,) + (16, 24, 40, 48, 96) + (1024,),
depths=(1, 2, 3, 2, 3),
kernel_sizes=(3, 3, 5, 5, 5),
strides=(2, 2, 2, 1, 2),
expand_ratios=(1, (4.5, 88. / 24), (4, 6, 6), 3, 6),
squeeze_ratios=(0.25, 1, 0.25, 0.25, 0.25),
num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
print({'iter': iter,
'throughput': round(30.0 / diff_time * input.size(0), 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench ResNet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import torch
import torch.nn as nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='resnet50', help='compute model')
parser.add_argument('--batch_size', default=64, type=int, help='mini-batch size')
return parser.parse_args()
class BasicBlock(nn.Module):
"""Basic resnet block."""
expansion = 1
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(dim_in, dim, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(dim)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(dim)
self.downsample = downsample
def forward(self, x):
shortcut = x
x = self.relu(self.bn1(self.conv1(x)))
x = self.bn2(self.conv2(x))
if self.downsample is not None:
shortcut = self.downsample(shortcut)
return self.relu(x.add_(shortcut))
class Bottleneck(nn.Module):
"""Bottleneck resnet block."""
expansion = 4
groups, width_per_group = 1, 64
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(Bottleneck, self).__init__()
width = int(dim * (self.width_per_group / 64.)) * self.groups
self.conv1 = nn.Conv2d(dim_in, width, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(width)
self.conv2 = nn.Conv2d(width, width, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width)
self.conv3 = nn.Conv2d(width, dim * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(dim * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
def forward(self, x):
shortcut = x
x = self.relu(self.bn1(self.conv1(x)))
x = self.relu(self.bn2(self.conv2(x)))
x = self.bn3(self.conv3(x))
if self.downsample is not None:
shortcut = self.downsample(shortcut)
return self.relu(x.add_(shortcut))
class ResNet(nn.Module):
"""ResNet."""
def __init__(self, block, depths, num_classes=1000):
super(ResNet, self).__init__()
dim_in, stage_dims, blocks = 64, [64, 128, 256, 512], []
self.num_features = stage_dims[-1] * block.expansion
self.conv1 = nn.Conv2d(3, stage_dims[0], kernel_size=7,
stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(stage_dims[0])
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.avgpool = nn.AdaptiveAvgPool2d(1)
# Blocks.
for i, depth, dim in zip(range(4), depths, stage_dims):
stride = 1 if i == 0 else 2
downsample = None
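# Project the shortcut with a strided 1x1 conv whenever the spatial
# size or channel width changes.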
if stride != 1 or dim_in != dim * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(dim_in, dim * block.expansion, kernel_size=1,
stride=stride, bias=False),
nn.BatchNorm2d(dim * block.expansion))
blocks.append(block(dim_in, dim, stride, downsample))
dim_in = dim * block.expansion
for _ in range(depth - 1):
blocks.append(block(dim_in, dim))
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*blocks[-depth:]))
self.blocks = blocks
# Head.
classifier = nn.Linear if num_classes > 0 else nn.Identity
self.fc = classifier(self.num_features, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
def forward(self, x):
x = self.relu(self.bn1(self.conv1(x)))
x = self.maxpool(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.avgpool(x).flatten(1))
def resnet18(num_classes=1000):
return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
def resnet34(num_classes=1000):
return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes)
def resnet50(num_classes=1000):
return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes)
def resnet101(num_classes=1000):
return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes)
def resnet152(num_classes=1000):
return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
print({'iter': iter,
'throughput': round(30.0 / diff_time * input.size(0), 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench ResNet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
from dragon.vm import torch
from dragon.vm.torch import nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='resnet50', help='compute model')
parser.add_argument('--batch_size', default=64, type=int, help='mini-batch size')
return parser.parse_args()
class BasicBlock(nn.Module):
"""Basic resnet block."""
expansion = 1
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(dim_in, dim, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(dim)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(dim)
self.downsample = downsample
def forward(self, x):
shortcut = x
x = self.relu(self.bn1(self.conv1(x)))
x = self.bn2(self.conv2(x))
if self.downsample is not None:
shortcut = self.downsample(shortcut)
return self.relu(x.add_(shortcut))
class Bottleneck(nn.Module):
"""Bottleneck resnet block."""
expansion = 4
groups, width_per_group = 1, 64
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(Bottleneck, self).__init__()
width = int(dim * (self.width_per_group / 64.)) * self.groups
self.conv1 = nn.Conv2d(dim_in, width, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(width)
self.conv2 = nn.Conv2d(width, width, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width)
self.conv3 = nn.Conv2d(width, dim * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(dim * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
def forward(self, x):
shortcut = x
x = self.relu(self.bn1(self.conv1(x)))
x = self.relu(self.bn2(self.conv2(x)))
x = self.bn3(self.conv3(x))
if self.downsample is not None:
shortcut = self.downsample(shortcut)
return self.relu(x.add_(shortcut))
class ResNet(nn.Module):
"""ResNet."""
def __init__(self, block, depths, num_classes=1000):
super(ResNet, self).__init__()
dim_in, stage_dims, blocks = 64, [64, 128, 256, 512], []
self.num_features = stage_dims[-1] * block.expansion
self.conv1 = nn.Conv2d(3, stage_dims[0], kernel_size=7,
stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(stage_dims[0])
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
# Blocks.
for i, depth, dim in zip(range(4), depths, stage_dims):
stride = 1 if i == 0 else 2
downsample = None
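# Project the shortcut with a strided 1x1 conv whenever the spatial
# size or channel width changes.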
if stride != 1 or dim_in != dim * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(dim_in, dim * block.expansion, kernel_size=1,
stride=stride, bias=False),
nn.BatchNorm2d(dim * block.expansion))
blocks.append(block(dim_in, dim, stride, downsample))
dim_in = dim * block.expansion
for _ in range(depth - 1):
blocks.append(block(dim_in, dim))
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*blocks[-depth:]))
self.blocks = blocks
# Head.
self.avgpool = nn.AdaptiveAvgPool2d(1)
classifier = nn.Linear if num_classes > 0 else nn.Identity
self.fc = classifier(self.num_features, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
def forward(self, x):
x = self.relu(self.bn1(self.conv1(x)))
x = self.maxpool(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.avgpool(x).flatten_(1))
def resnet18(num_classes=1000):
return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
def resnet34(num_classes=1000):
return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes)
def resnet50(num_classes=1000):
return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes)
def resnet101(num_classes=1000):
return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes)
def resnet152(num_classes=1000):
return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
print({'iter': iter,
'throughput': round(30.0 / diff_time * input.size(0), 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench Vision Transformer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import torch
import torch.nn as nn
try:
from timm.models.layers import DropPath
except ImportError:
DropPath = nn.Identity
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='vit_base_patch16_224', help='compute model')
parser.add_argument('--batch_size', default=64, type=int, help='mini-batch size')
return parser.parse_args()
class MLP(nn.Module):
"""Two layers MLP."""
def __init__(self, dim, mlp_ratio=4):
super(MLP, self).__init__()
self.fc1 = nn.Linear(dim, int(dim * mlp_ratio))
self.fc2 = nn.Linear(int(dim * mlp_ratio), dim)
self.activation = nn.GELU()
def forward(self, x):
return self.fc2(self.activation(self.fc1(x)))
class Attention(nn.Module):
"""Multihead attention."""
def __init__(self, dim, num_heads, qkv_bias=True):
super(Attention, self).__init__()
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = self.head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
def forward(self, x):
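# Fold the QKV projection into shape (3, batch, heads, tokens, head_dim)
# so the three projections can be split with a single unbind.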
qkv_shape = (-1, x.size(1), 3, self.num_heads, self.head_dim)
qkv = self.qkv(x).reshape(qkv_shape).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(dim=0)
attn = q @ k.transpose(-2, -1).mul(self.scale)
attn = nn.functional.softmax(attn, dim=-1)
return self.proj((attn @ v).transpose(1, 2).flatten(2))
class Block(nn.Module):
"""Transformer block."""
def __init__(self, dim, num_heads, mlp_ratio=4, qkv_bias=True, drop_path=0):
super(Block, self).__init__()
self.norm1 = nn.LayerNorm(dim)
self.attn = Attention(dim, num_heads, qkv_bias=qkv_bias)
self.norm2 = nn.LayerNorm(dim)
self.mlp = MLP(dim, mlp_ratio=mlp_ratio)
self.drop_path = DropPath(drop_path)
def forward(self, x):
x = self.drop_path(self.attn(self.norm1(x))).add_(x)
return self.drop_path(self.mlp(self.norm2(x))).add_(x)
class PatchEmbed(nn.Module):
"""Patch embedding layer."""
def __init__(self, dim=768, patch_size=16):
super(PatchEmbed, self).__init__()
self.proj = nn.Conv2d(3, dim, patch_size, patch_size)
def forward(self, x):
return self.proj(x)
class PosEmbed(nn.Module):
"""Position embedding layer."""
def __init__(self, dim, num_patches):
super(PosEmbed, self).__init__()
self.dim = dim
self.num_patches = num_patches
self.weight = nn.Parameter(torch.zeros(num_patches, dim))
nn.init.normal_(self.weight, std=0.02)
def forward(self, x):
return x.add_(self.weight)
class VisionTransformer(nn.Module):
"""Vision Transformer."""
def __init__(self, depths, dims, num_heads, mlp_ratios,
img_size=224, patch_size=16, drop_path=0, num_classes=1000):
super(VisionTransformer, self).__init__()
drop_path = (torch.linspace(
0, drop_path, sum(depths), dtype=torch.float32).tolist()
if drop_path > 0 else [drop_path] * sum(depths))
self.num_patches = (img_size // patch_size) ** 2
self.num_features = dims[0]
self.patch_embed = PatchEmbed(dims[0], patch_size)
self.pos_embed = PosEmbed(dims[0], self.num_patches + 1)
self.cls_token = nn.Parameter(torch.zeros(1, 1, dims[0]))
self.blocks = nn.ModuleList([Block(
dim=dims[0], num_heads=num_heads[0],
mlp_ratio=mlp_ratios[0], qkv_bias=True,
drop_path=drop_path[i]) for i in range(depths[0])])
self.norm = nn.LayerNorm(self.num_features)
classifier = nn.Linear if num_classes > 0 else nn.Identity
self.fc = classifier(self.num_features, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.normal_(m.weight, std=.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
nn.init.normal_(self.cls_token, std=.02)
def forward(self, x):
x = self.patch_embed(x)
x = x.flatten(2).transpose(1, 2)
cls_tokens = self.cls_token.expand(x.size(0), 1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x = self.pos_embed(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.norm(x[:, 1:].mean(1)))
def vit_small_patch16_224(num_classes=1000):
return VisionTransformer(depths=(12,), dims=(384,), num_heads=(6,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
def vit_base_patch16_224(num_classes=1000):
return VisionTransformer(depths=(12,), dims=(768,), num_heads=(12,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
def vit_large_patch16_224(num_classes=1000):
return VisionTransformer(depths=(24,), dims=(1024,), num_heads=(16,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
throughput = 30.0 / diff_time * input.size(0)
print({'iter': iter,
'throughput': round(throughput, 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench Vision Transformer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
from dragon.vm import torch
from dragon.vm.torch import nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='vit_base_patch16_224', help='compute model')
parser.add_argument('--batch_size', default=64, type=int, help='mini-batch size')
return parser.parse_args()
class MLP(nn.Module):
"""Two layers MLP."""
def __init__(self, dim, mlp_ratio=4):
super(MLP, self).__init__()
self.fc1 = nn.Linear(dim, int(dim * mlp_ratio))
self.fc2 = nn.Linear(int(dim * mlp_ratio), dim)
self.activation = nn.GELU()
def forward(self, x):
return self.fc2(self.activation(self.fc1(x)))
class Attention(nn.Module):
"""Multihead attention."""
def __init__(self, dim, num_heads, qkv_bias=True):
super(Attention, self).__init__()
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = self.head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
def forward(self, x):
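# Fold the QKV projection into shape (3, batch, heads, tokens, head_dim)
# so the three projections can be split with a single unbind.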
qkv_shape = (-1, x.size(1), 3, self.num_heads, self.head_dim)
qkv = self.qkv(x).reshape_(qkv_shape).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(dim=0, copy=False)
attn = q @ k.transpose(-2, -1).mul_(self.scale)
attn = nn.functional.softmax(attn, dim=-1, inplace=True)
return self.proj((attn @ v).transpose(1, 2).flatten_(2))
class Block(nn.Module):
"""Transformer block."""
def __init__(self, dim, num_heads, mlp_ratio=4, qkv_bias=True, drop_path=0):
super(Block, self).__init__()
self.norm1 = nn.LayerNorm(dim)
self.attn = Attention(dim, num_heads, qkv_bias=qkv_bias)
self.norm2 = nn.LayerNorm(dim)
self.mlp = MLP(dim, mlp_ratio=mlp_ratio)
self.drop_path = nn.DropPath(drop_path, inplace=True)
def forward(self, x):
x = self.drop_path(self.attn(self.norm1(x))).add_(x)
return self.drop_path(self.mlp(self.norm2(x))).add_(x)
class PatchEmbed(nn.Module):
"""Patch embedding layer."""
def __init__(self, dim=768, patch_size=16):
super(PatchEmbed, self).__init__()
self.proj = nn.Conv2d(3, dim, patch_size, patch_size)
def forward(self, x):
return self.proj(x)
class PosEmbed(nn.Module):
"""Position embedding layer."""
def __init__(self, dim, num_patches):
super(PosEmbed, self).__init__()
self.dim = dim
self.num_patches = num_patches
self.weight = nn.Parameter(torch.zeros(num_patches, dim))
nn.init.normal_(self.weight, std=0.02)
def forward(self, x):
return x.add_(self.weight)
class VisionTransformer(nn.Module):
"""Vision Transformer."""
def __init__(self, depths, dims, num_heads, mlp_ratios,
img_size=224, patch_size=16, drop_path=0, num_classes=1000):
super(VisionTransformer, self).__init__()
drop_path = (torch.linspace(
0, drop_path, sum(depths), dtype=torch.float32).tolist()
if drop_path > 0 else [drop_path] * sum(depths))
self.num_patches = (img_size // patch_size) ** 2
self.num_features = dims[0]
self.patch_embed = PatchEmbed(dims[0], patch_size)
self.pos_embed = PosEmbed(dims[0], self.num_patches + 1)
self.cls_token = nn.Parameter(torch.zeros(1, 1, dims[0]))
self.blocks = nn.ModuleList([Block(
dim=dims[0], num_heads=num_heads[0],
mlp_ratio=mlp_ratios[0], qkv_bias=True,
drop_path=drop_path[i]) for i in range(depths[0])])
self.norm = nn.LayerNorm(self.num_features)
classifier = nn.Linear if num_classes > 0 else nn.Identity
self.fc = classifier(self.num_features, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.normal_(m.weight, std=.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
nn.init.normal_(self.cls_token, std=.02)
def forward(self, x):
x = self.patch_embed(x)
x = x.flatten_(2).transpose(1, 2)
cls_tokens = self.cls_token.expand(x.size(0), 1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x = self.pos_embed(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.norm(x[:, 1:].mean(1)))
def vit_small_patch16_224(num_classes=1000):
return VisionTransformer(depths=(12,), dims=(384,), num_heads=(6,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
def vit_base_patch16_224(num_classes=1000):
return VisionTransformer(depths=(12,), dims=(768,), num_heads=(12,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
def vit_large_patch16_224(num_classes=1000):
return VisionTransformer(depths=(24,), dims=(1024,), num_heads=(16,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
throughput = 30.0 / diff_time * input.size(0)
print({'iter': iter,
'throughput': round(throughput, 2),
'time': round(diff_time, 3)})
# Tutorial: How To Use
## Run benchmarks
First, run the default benchmarks and save the results to "results.json":
```
python run.py -f ./results.json
```
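
Each entry in the saved JSON records the device, backend, model, and measured metrics, for example:

```
{
  "device": "A40",
  "backend": "torch-1.10.1+cu113",
  "model": "resnet50.train",
  "throughput": 608.885
}
```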
To select the compute backend, add the "--backend" argument:
```
python run.py --backend torch -f ./results.json
```
For more options, see the "--help" output:
```
python run.py --help
```
## Visualize results
We visualize the measured results in `references/torch_fp16_a40.json` and `references/torch_fp16_titanv.json`:
```
python visualize.py --device \
--input references/torch_fp16_a40.json \
references/torch_fp16_titanv.json \
--output torch_fp16_a40_titanv.png
```
For more options, see the "--help" output:
```
python visualize.py --help
```
[
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "resnet50.train",
"throughout": 608.885
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "vit_base_patch16_224.train",
"throughout": 417.72749999999996
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "swin_tiny_patch4_window7_224.train",
"throughout": 461.685
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "mobilenet_v3_large.train",
"throughout": 1281.5575
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "resnet50.eval",
"throughout": 2049.485
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "vit_base_patch16_224.eval",
"throughout": 1000.2925
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "swin_tiny_patch4_window7_224.eval",
"throughout": 1378.7150000000001
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "mobilenet_v3_large.eval",
"throughout": 6638.6775
}
]
[
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "resnet50.train",
"throughout": 592.5925
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "vit_base_patch16_224.train",
"throughout": 260.4525
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "swin_tiny_patch4_window7_224.train",
"throughout": 385.3825
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "mobilenet_v3_large.train",
"throughout": 1014.5600000000001
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "resnet50.eval",
"throughout": 2113.0075
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "vit_base_patch16_224.eval",
"throughout": 711.6025
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "swin_tiny_patch4_window7_224.eval",
"throughout": 1323.185
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "mobilenet_v3_large.eval",
"throughout": 4600.1775
}
]
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Run benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import collections
import copy
import json
import logging
import subprocess
import sys
import time
BENCHMARKS = [
# Model Training.
('benchmarks/models/resnet/bench_*.py', 'resnet50.train'),
('benchmarks/models/vit/bench_*.py', 'vit_base_patch16_224.train'),
('benchmarks/models/swin/bench_*.py', 'swin_tiny_patch4_window7_224.train'),
('benchmarks/models/mobilenetv3/bench_*.py', 'mobilenet_v3_large.train'),
# Model Inference.
('benchmarks/models/resnet/bench_*.py', 'resnet50.eval'),
('benchmarks/models/vit/bench_*.py', 'vit_base_patch16_224.eval'),
('benchmarks/models/swin/bench_*.py', 'swin_tiny_patch4_window7_224.eval'),
('benchmarks/models/mobilenetv3/bench_*.py', 'mobilenet_v3_large.eval'),
]
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser(
description='Run the benchmarks.')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--backend', default='torch', help='compute backend')
parser.add_argument('--metric', nargs='+', default=['throughput'],
help='performance metrics')
parser.add_argument('-q', '--quiet', action='store_true',
help='print error information only')
parser.add_argument('-f', '--filename', default='',
help='Save results to the specified file')
return parser.parse_args()
def get_base_command(args):
"""Return the base command."""
cmd = [sys.executable, '{}', '--model', '{}']
cmd += ['--train'] if args.train else []
cmd += ['--precision', args.precision]
cmd += ['--device', str(args.device)]
return cmd
def get_device_name(device_index):
"""Return the device name."""
device_name = subprocess.check_output(
'nvidia-smi --query-gpu=name --format=csv,noheader -i %d'
% device_index, shell=True).decode('ascii').strip()
device_name = device_name.replace('NVIDIA ', '')
return device_name
def get_backend_name(backend):
"""Return the backend name."""
if backend == 'torch':
version = subprocess.check_output(
'%s -c "import torch;print(torch.__version__)"'
% sys.executable, shell=True).decode('ascii').strip()
return 'torch-%s' % version
elif backend == 'tf':
version = subprocess.check_output(
'%s -c "import tensorflow;print(tensorflow.__version__)"'
% sys.executable, shell=True).decode('ascii').strip()
return 'tensorflow-%s' % version
elif 'vm' in backend:
version = subprocess.check_output(
'%s -c "import dragon;print(dragon.__version__)"'
% sys.executable, shell=True).decode('ascii').strip()
return 'seeta-dragon-%s' % version
return backend
def get_model_args(args, model):
"""Return the model-specific args."""
args = copy.deepcopy(args)
model = model.split('.')
args.model = model.pop(0)
presets = {'train': ('train', True),
'eval': ('train', False),
'float16': ('precision', 'float16'),
'float32': ('precision', 'float32')}
for k, v in presets.items():
if k in model:
setattr(args, v[0], v[1])
return args
def get_results(output, keys):
"""Extract results from the output string."""
results = collections.defaultdict(list)
for line in output.splitlines():
if not line.startswith('{'):
continue
if not line.endswith('}'):
continue
metrics = ast.literal_eval(line)  # Safer than eval() for printed dicts.
for k in keys:
if k in metrics:
results[k].append(metrics[k])
for k in results.keys():
results[k].pop(0) # Warmup.
results[k] = sum(results[k]) / len(results[k])
return results
def main():
"""Main procedure."""
args = parse_args()
logging.getLogger().setLevel('ERROR' if args.quiet else 'INFO')
log_handler = logging.StreamHandler(sys.stderr)
log_handler.terminator = ''
log_handler.setFormatter(logging.Formatter('%(message)s'))
logging.getLogger().addHandler(log_handler)
all_results = []
for count, (script, model) in enumerate(BENCHMARKS):
model_args = get_model_args(args, model)
base_command = get_base_command(model_args)
logging.info('[%d/%d] bench %s ... '
% (count + 1, len(BENCHMARKS), model))
script = script.replace('*', args.backend)
command = (' '.join(base_command)).format(script, model_args.model)
output = subprocess.check_output(command, shell=True)
output = output.decode('ascii').strip()
results = collections.OrderedDict()
results['device'] = get_device_name(args.device)
results['backend'] = get_backend_name(args.backend)
results['model'] = model
results.update(get_results(output, args.metric))
all_results.append(results)
logging.info('ok\n')
if not args.filename:
args.filename = '../{}.json'.format(time.strftime(
'%Y%m%d_%H%M%S', time.localtime(time.time())))
with open(args.filename, 'w') as f:
json.dump(all_results, f)
if __name__ == '__main__':
main()
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Visualize results."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import json
from matplotlib import pyplot as plt
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser(
description='Visualize JSON results.')
parser.add_argument('--input', nargs='+', help='input json file')
parser.add_argument('--output', default=None, help='output png file')
parser.add_argument('--metric', default='throughput', help='performance metric')
parser.add_argument('--model', default=None, help='select model')
parser.add_argument('--train', action='store_true', help='select training model')
parser.add_argument('--eval', action='store_true', help='select inference model')
parser.add_argument('--device', action='store_true', help='add device name')
parser.add_argument('--title', default='', help='figure title')
parser.add_argument('--dpi', default=200, type=int, help='figure dpi')
return parser.parse_args()
def get_plot_xy(args, results):
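"""Return one (metric, label) pair, or (None, None) if filtered out."""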
x = results[args.metric]
y = results['model']
if args.train:
if '.eval' in y:
return None, None
y = y.replace('.train', '')
if args.eval:
if '.train' in y:
return None, None
y = y.replace('.eval', '')
if args.model is not None:
if not y.startswith(args.model):
return None, None
if args.device:
y += ' [' + results['device'] + ']'
return x, y
if __name__ == '__main__':
args = parse_args()
# Collect data.
all_results = []
for input_file in args.input:
with open(input_file, 'r') as f:
all_results += json.load(f)
xs, ys = [], []
for results in all_results:
x, y = get_plot_xy(args, results)
if x is not None:
ys.append(y)
xs.append(x)
ys, xs = zip(*sorted(zip(ys, xs), reverse=True))
if len(set(ys)) != len(ys):
print('Warning: duplicate Y-axis labels. Try adding "--device".')
# Set matplotlib.
plt.switch_backend('agg')
# https://matplotlib.org/matplotblog/posts/matplotlib-cyberpunk-style
plt.style.use("seaborn-dark")
for param in ['figure.facecolor', 'axes.facecolor', 'savefig.facecolor']:
plt.rcParams[param] = '#212946'
for param in ['text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color']:
plt.rcParams[param] = '0.9'
# Draw plots.
fig, ax = plt.subplots(1, 1, dpi=args.dpi)
ax.set_title(args.title if args.title else 'GPU Performance', pad=10)
ax.grid(color='#2A3459', axis='x')
ax.set_xlabel(args.metric)
ax.bar_label(ax.barh(ys, xs, color='#FE53BB'), fmt='%.0f', padding=2)
# Save figures.
if args.output:
plt.savefig(args.output, bbox_inches='tight')
else:
plt.show()