Commit 720a88ff by Ting PAN

Init repository

[flake8]
max-line-length = 120
ignore = E741, # ambiguous variable name
F403, # ‘from module import *’ used; unable to detect undefined names
F405, # name may be undefined, or defined from star imports: module
F811, # redefinition of unused name from line N
F821, # undefined name
W503, # line break before binary operator
W504 # line break after binary operator
# module imported but unused
per-file-ignores = __init__.py: F401
# Compiled Object files
*.slo
*.lo
*.o
*.cuo
# Compiled Dynamic libraries
*.so
*.dll
*.dylib
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Compiled python
*.pyc
__pycache__
# Compiled MATLAB
*.mex*
# IPython notebook checkpoints
.ipynb_checkpoints
# Editor temporaries
*.swp
*~
# Sublime Text settings
*.sublime-workspace
*.sublime-project
# Eclipse Project settings
*.*project
.settings
# QtCreator files
*.user
# VSCode files
.vscode
# IDEA files
.idea
# OSX dir files
.DS_Store
# Android files
.gradle
*.iml
local.properties
Copyright (c) 2017, SeetaTech, Co.,Ltd. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# SeetaBench
SeetaBench is a collection of benchmarks to evaluate device performance.
## Quick Start
See [Tutorial: How To Use](docs/TUTORIAL.md).
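
For example, to run all benchmarks with the PyTorch backend and save the results:

```
python run.py --backend torch -f ./results.json
```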
## License
[BSD 2-Clause license](LICENSE)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench GEMM performance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import torch
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--m', default=8192, type=int, help='row of matrix A')
parser.add_argument('--k', default=8192, type=int, help='col of matrix A')
parser.add_argument('--n', default=8192, type=int, help='col of matrix B')
parser.add_argument('--gemm-efficiency', default=0.9, type=float, help='gemm impl efficiency')
return parser.parse_args()
def bench_gemm_flops(m=8192, k=8192, n=8192, precision=torch.float32,
device=0, allow_tf32=False, efficiency=0.9):
torch.backends.cuda.matmul.allow_tf32 = allow_tf32
a = torch.zeros(m, k, dtype=precision).cuda(device)
b = torch.zeros(k, n, dtype=precision).cuda(device)
for _ in range(5): # Warmup.
_ = torch.matmul(a, b)
torch.cuda.synchronize(device)
tic = time.time()
for _ in range(30):
_ = torch.matmul(a, b)
torch.cuda.synchronize(device)
average_time = (time.time() - tic) / 30.0
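# An (M, K) x (K, N) matmul performs 2 * M * K * N FLOPs
# (one multiply and one add per output element).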
tflops = (a.numel() * b.size(-1)) * 2 / 1e12
if precision == torch.float16:
title = 'Float16'
elif precision == torch.float32:
title = 'TensorFloat32' if allow_tf32 else 'Float32'
else:
raise ValueError('Unknown precision: ' + str(precision))
title += '-GEMM (M={}, K={}, N={}):'.format(m, k, n)
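# Report achieved TFLOPS and, in parentheses, the projected peak
# assuming the given implementation efficiency.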
print(title, '{:.2f} ({:.2f}) TFLOPS'
.format(tflops / average_time, tflops / efficiency / average_time))
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float16,
device=args.device, allow_tf32=False, efficiency=args.gemm_efficiency)
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float32,
device=args.device, allow_tf32=False, efficiency=args.gemm_efficiency)
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float32,
device=args.device, allow_tf32=True, efficiency=args.gemm_efficiency)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench GEMM performance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
from dragon.vm import torch
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--m', default=8192, type=int, help='row of matrix A')
parser.add_argument('--k', default=8192, type=int, help='col of matrix A')
parser.add_argument('--n', default=8192, type=int, help='col of matrix B')
parser.add_argument('--gemm-efficiency', default=0.9, type=float, help='gemm impl efficiency')
return parser.parse_args()
def bench_gemm_flops(m=8192, k=8192, n=8192, precision=torch.float32,
device=0, allow_tf32=False, efficiency=0.9):
torch.backends.cuda.allow_tf32 = allow_tf32
a = torch.zeros(m, k, dtype=precision).cuda(device)
b = torch.zeros(k, n, dtype=precision).cuda(device)
for _ in range(5): # Warmup.
_ = torch.matmul(a, b)
torch.cuda.synchronize(device)
tic = time.time()
for _ in range(30):
_ = torch.matmul(a, b)
torch.cuda.synchronize(device)
average_time = (time.time() - tic) / 30.0
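# An (M, K) x (K, N) matmul performs 2 * M * K * N FLOPs
# (one multiply and one add per output element).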
tflops = (a.numel() * b.size(-1)) * 2 / 1e12
if precision == torch.float16:
title = 'Float16'
elif precision == torch.float32:
title = 'TensorFloat32' if allow_tf32 else 'Float32'
else:
raise ValueError('Unknown precision: ' + str(precision))
title += '-GEMM (M={}, K={}, N={}):'.format(m, k, n)
print(title, '{:.2f} ({:.2f}) TFLOPS'
.format(tflops / average_time, tflops / efficiency / average_time))
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float16,
device=args.device, allow_tf32=False, efficiency=args.gemm_efficiency)
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float32,
device=args.device, allow_tf32=False, efficiency=args.gemm_efficiency)
bench_gemm_flops(m=args.m, k=args.k, n=args.n, precision=torch.float32,
device=args.device, allow_tf32=True, efficiency=args.gemm_efficiency)
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench MobileNetV3."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import functools
import time
import torch
import torch.nn as nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='mobilenet_v3_large', help='compute model')
parser.add_argument('--batch_size', default=128, type=int, help='mini-batch size')
return parser.parse_args()
def make_divisible(v, divisor=8):
"""Return the divisible value."""
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class ConvNorm2d(nn.Sequential):
"""2d convolution followed by norm."""
def __init__(
self,
dim_in,
dim_out,
kernel_size,
stride=1,
padding=None,
dilation=1,
groups=1,
bias=True,
norm_type='BatchNorm2d',
activation_type='',
inplace=True,
):
super(ConvNorm2d, self).__init__()
if padding is None:
padding = kernel_size // 2
layers = [nn.Conv2d(dim_in, dim_out,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias and (not norm_type))]
if norm_type:
layers += [getattr(nn, norm_type)(dim_out)]
if activation_type:
layers += [getattr(nn, activation_type)()]
layers[-1].inplace = inplace
for i, layer in enumerate(layers):
self.add_module(str(i), layer)
class SqueezeExcite(nn.Module):
"""Squeeze-and-Excitation block."""
def __init__(self, dim_in, dim):
super(SqueezeExcite, self).__init__()
self.conv1 = nn.Conv2d(dim_in, dim, 1)
self.conv2 = nn.Conv2d(dim, dim_in, 1)
self.activation1 = nn.ReLU(True)
self.activation2 = nn.Hardsigmoid(True)
def forward(self, x):
scale = x.mean((2, 3), keepdim=True)
scale = self.activation1(self.conv1(scale))
scale = self.activation2(self.conv2(scale))
return x * scale
class InvertedResidual(nn.Module):
"""Invert residual block."""
def __init__(
self,
dim_in,
dim_out,
kernel_size=3,
stride=1,
expand_ratio=3,
squeeze_ratio=1,
activation_type='ReLU',
):
super(InvertedResidual, self).__init__()
conv_module = functools.partial(
ConvNorm2d, activation_type=activation_type)
self.apply_shortcut = stride == 1 and dim_in == dim_out
self.dim = dim = int(round(dim_in * expand_ratio))
self.conv1 = (conv_module(dim_in, dim, 1)
if expand_ratio > 1 else nn.Identity())
self.conv2 = conv_module(dim, dim, kernel_size, stride, groups=dim)
self.se = (SqueezeExcite(dim, make_divisible(dim * squeeze_ratio))
if squeeze_ratio < 1 else nn.Identity())
self.conv3 = conv_module(dim, dim_out, 1, activation_type='')
def forward(self, x):
shortcut = x
x = self.conv1(x)
x = self.conv2(x)
x = self.se(x)
x = self.conv3(x)
if self.apply_shortcut:
return x.add_(shortcut)
return x
class MobileNetV3(nn.Module):
"""MobileNetV3 class."""
def __init__(self, depths, dims, kernel_sizes, strides,
expand_ratios, squeeze_ratios, width_mult=1.0,
dropout=0.2, num_classes=1000):
super(MobileNetV3, self).__init__()
conv_module = functools.partial(
ConvNorm2d, activation_type='Hardswish')
dims = list(map(lambda x: make_divisible(x * width_mult), dims))
self.conv1 = conv_module(3, dims[0], 3, 2)
dim_in, blocks, coarsest_stride = dims[0], [], 2
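# coarsest_stride tracks the cumulative downsampling; deeper blocks
# (overall stride >= 16) use Hardswish instead of ReLU.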
for i, (depth, dim) in enumerate(zip(depths, dims[1:])):
coarsest_stride *= strides[i]
layer_expand_ratios = expand_ratios[i]
if not isinstance(layer_expand_ratios, (tuple, list)):
layer_expand_ratios = [layer_expand_ratios]
layer_expand_ratios = list(layer_expand_ratios)
layer_expand_ratios += ([layer_expand_ratios[-1]] *
(depth - len(layer_expand_ratios)))
for j in range(depth):
blocks.append(InvertedResidual(
dim_in, dim,
kernel_size=kernel_sizes[i],
stride=strides[i] if j == 0 else 1,
expand_ratio=layer_expand_ratios[j],
squeeze_ratio=squeeze_ratios[i],
activation_type='Hardswish'
if coarsest_stride >= 16 else 'ReLU'))
dim_in = dim
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*blocks[-depth:]))
self.conv2 = conv_module(dim_in, blocks[-1].dim, 1)
self.blocks = blocks + [self.conv2]
# Head.
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(blocks[-1].dim, dims[-1]),
nn.Hardswish(),
nn.Dropout(p=dropout, inplace=True),
nn.Linear(dims[-1], num_classes),
) if num_classes > 0 else nn.Identity()
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
def forward(self, x):
x = self.conv1(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.avgpool(x).flatten(1))
def mobilenet_v3_large(num_classes=1000):
return MobileNetV3(
dims=(16,) + (16, 24, 40, 80, 112, 160) + (1280,),
depths=(1, 2, 3, 4, 2, 3),
kernel_sizes=(3, 3, 5, 3, 3, 5),
strides=(1, 2, 2, 2, 1, 2),
expand_ratios=(1, (4, 3), 3, (6, 2.5, 2.3, 2.3), 6, 6),
squeeze_ratios=(1, 1, 0.25, 1, 0.25, 0.25),
num_classes=num_classes)
def mobilenet_v3_small(num_classes=1000):
return MobileNetV3(
dims=(16,) + (16, 24, 40, 48, 96) + (1024,),
depths=(1, 2, 3, 2, 3),
kernel_sizes=(3, 3, 5, 5, 5),
strides=(2, 2, 2, 1, 2),
expand_ratios=(1, (4.5, 88. / 24), (4, 6, 6), 3, 6),
squeeze_ratios=(0.25, 1, 0.25, 0.25, 0.25),
num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
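# Run 5 timing windows of 30 iterations each; run.py later drops
# the first window as warmup when aggregating results.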
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
print({'iter': iter,
'throughput': round(30.0 / diff_time * input.size(0), 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench MobileNetV3."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import functools
import time
from dragon.vm import torch
from dragon.vm.torch import nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='mobilenet_v3_large', help='compute model')
parser.add_argument('--batch_size', default=128, type=int, help='mini-batch size')
return parser.parse_args()
def make_divisible(v, divisor=8):
"""Return the divisible value."""
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class ConvNorm2d(nn.Sequential):
"""2d convolution followed by norm."""
def __init__(
self,
dim_in,
dim_out,
kernel_size,
stride=1,
padding=None,
dilation=1,
groups=1,
bias=True,
norm_type='BatchNorm2d',
activation_type='',
inplace=True,
):
super(ConvNorm2d, self).__init__()
if padding is None:
padding = kernel_size // 2
layers = [nn.Conv2d(dim_in, dim_out,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias and (not norm_type))]
if norm_type:
layers += [getattr(nn, norm_type)(dim_out)]
if activation_type:
layers += [getattr(nn, activation_type)()]
layers[-1].inplace = inplace
for i, layer in enumerate(layers):
self.add_module(str(i), layer)
class SqueezeExcite(nn.Module):
"""Squeeze-and-Excitation block."""
def __init__(self, dim_in, dim):
super(SqueezeExcite, self).__init__()
self.conv1 = nn.Conv2d(dim_in, dim, 1)
self.conv2 = nn.Conv2d(dim, dim_in, 1)
self.activation1 = nn.ReLU(True)
self.activation2 = nn.Hardsigmoid(True)
def forward(self, x):
scale = x.mean((2, 3), keepdim=True)
scale = self.activation1(self.conv1(scale))
scale = self.activation2(self.conv2(scale))
return x * scale
class InvertedResidual(nn.Module):
"""Invert residual block."""
def __init__(
self,
dim_in,
dim_out,
kernel_size=3,
stride=1,
expand_ratio=3,
squeeze_ratio=1,
activation_type='ReLU',
):
super(InvertedResidual, self).__init__()
conv_module = functools.partial(
ConvNorm2d, activation_type=activation_type)
self.apply_shortcut = stride == 1 and dim_in == dim_out
self.dim = dim = int(round(dim_in * expand_ratio))
self.conv1 = (conv_module(dim_in, dim, 1)
if expand_ratio > 1 else nn.Identity())
self.conv2 = conv_module(dim, dim, kernel_size, stride, groups=dim)
self.se = (SqueezeExcite(dim, make_divisible(dim * squeeze_ratio))
if squeeze_ratio < 1 else nn.Identity())
self.conv3 = conv_module(dim, dim_out, 1, activation_type='')
def forward(self, x):
shortcut = x
x = self.conv1(x)
x = self.conv2(x)
x = self.se(x)
x = self.conv3(x)
if self.apply_shortcut:
return x.add_(shortcut)
return x
class MobileNetV3(nn.Module):
"""MobileNetV3 class."""
def __init__(self, depths, dims, kernel_sizes, strides,
expand_ratios, squeeze_ratios, width_mult=1.0,
dropout=0.2, num_classes=1000):
super(MobileNetV3, self).__init__()
conv_module = functools.partial(
ConvNorm2d, activation_type='Hardswish')
dims = list(map(lambda x: make_divisible(x * width_mult), dims))
self.conv1 = conv_module(3, dims[0], 3, 2)
dim_in, blocks, coarsest_stride = dims[0], [], 2
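# coarsest_stride tracks the cumulative downsampling; deeper blocks
# (overall stride >= 16) use Hardswish instead of ReLU.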
for i, (depth, dim) in enumerate(zip(depths, dims[1:])):
coarsest_stride *= strides[i]
layer_expand_ratios = expand_ratios[i]
if not isinstance(layer_expand_ratios, (tuple, list)):
layer_expand_ratios = [layer_expand_ratios]
layer_expand_ratios = list(layer_expand_ratios)
layer_expand_ratios += ([layer_expand_ratios[-1]] *
(depth - len(layer_expand_ratios)))
for j in range(depth):
blocks.append(InvertedResidual(
dim_in, dim,
kernel_size=kernel_sizes[i],
stride=strides[i] if j == 0 else 1,
expand_ratio=layer_expand_ratios[j],
squeeze_ratio=squeeze_ratios[i],
activation_type='Hardswish'
if coarsest_stride >= 16 else 'ReLU'))
dim_in = dim
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*blocks[-depth:]))
self.conv2 = conv_module(dim_in, blocks[-1].dim, 1)
self.blocks = blocks + [self.conv2]
# Head.
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(blocks[-1].dim, dims[-1]),
nn.Hardswish(),
nn.Dropout(p=dropout, inplace=True),
nn.Linear(dims[-1], num_classes),
) if num_classes > 0 else nn.Identity()
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
def forward(self, x):
x = self.conv1(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.avgpool(x).flatten_(1))
def mobilenet_v3_large(num_classes=1000):
return MobileNetV3(
dims=(16,) + (16, 24, 40, 80, 112, 160) + (1280,),
depths=(1, 2, 3, 4, 2, 3),
kernel_sizes=(3, 3, 5, 3, 3, 5),
strides=(1, 2, 2, 2, 1, 2),
expand_ratios=(1, (4, 3), 3, (6, 2.5, 2.3, 2.3), 6, 6),
squeeze_ratios=(1, 1, 0.25, 1, 0.25, 0.25),
num_classes=num_classes)
def mobilenet_v3_small(num_classes=1000):
return MobileNetV3(
dims=(16,) + (16, 24, 40, 48, 96) + (1024,),
depths=(1, 2, 3, 2, 3),
kernel_sizes=(3, 3, 5, 5, 5),
strides=(2, 2, 2, 1, 2),
expand_ratios=(1, (4.5, 88. / 24), (4, 6, 6), 3, 6),
squeeze_ratios=(0.25, 1, 0.25, 0.25, 0.25),
num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
print({'iter': iter,
'throughput': round(30.0 / diff_time * input.size(0), 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench ResNet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import torch
import torch.nn as nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='resnet50', help='compute model')
parser.add_argument('--batch_size', default=64, type=int, help='mini-batch size')
return parser.parse_args()
class BasicBlock(nn.Module):
"""Basic resnet block."""
expansion = 1
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(dim_in, dim, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(dim)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(dim)
self.downsample = downsample
def forward(self, x):
shortcut = x
x = self.relu(self.bn1(self.conv1(x)))
x = self.bn2(self.conv2(x))
if self.downsample is not None:
shortcut = self.downsample(shortcut)
return self.relu(x.add_(shortcut))
class Bottleneck(nn.Module):
"""Bottleneck resnet block."""
expansion = 4
groups, width_per_group = 1, 64
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(Bottleneck, self).__init__()
width = int(dim * (self.width_per_group / 64.)) * self.groups
self.conv1 = nn.Conv2d(dim_in, width, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(width)
self.conv2 = nn.Conv2d(width, width, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width)
self.conv3 = nn.Conv2d(width, dim * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(dim * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
def forward(self, x):
shortcut = x
x = self.relu(self.bn1(self.conv1(x)))
x = self.relu(self.bn2(self.conv2(x)))
x = self.bn3(self.conv3(x))
if self.downsample is not None:
shortcut = self.downsample(shortcut)
return self.relu(x.add_(shortcut))
class ResNet(nn.Module):
"""ResNet."""
def __init__(self, block, depths, num_classes=1000):
super(ResNet, self).__init__()
dim_in, stage_dims, blocks = 64, [64, 128, 256, 512], []
self.num_features = stage_dims[-1] * block.expansion
self.conv1 = nn.Conv2d(3, stage_dims[0], kernel_size=7,
stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(stage_dims[0])
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.avgpool = nn.AdaptiveAvgPool2d(1)
# Blocks.
for i, depth, dim in zip(range(4), depths, stage_dims):
stride = 1 if i == 0 else 2
downsample = None
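# Project the shortcut with a strided 1x1 conv whenever the spatial
# size or channel width changes.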
if stride != 1 or dim_in != dim * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(dim_in, dim * block.expansion, kernel_size=1,
stride=stride, bias=False),
nn.BatchNorm2d(dim * block.expansion))
blocks.append(block(dim_in, dim, stride, downsample))
dim_in = dim * block.expansion
for _ in range(depth - 1):
blocks.append(block(dim_in, dim))
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*blocks[-depth:]))
self.blocks = blocks
# Head.
classifier = nn.Linear if num_classes > 0 else nn.Identity
self.fc = classifier(self.num_features, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
def forward(self, x):
x = self.relu(self.bn1(self.conv1(x)))
x = self.maxpool(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.avgpool(x).flatten(1))
def resnet18(num_classes=1000):
return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
def resnet34(num_classes=1000):
return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes)
def resnet50(num_classes=1000):
return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes)
def resnet101(num_classes=1000):
return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes)
def resnet152(num_classes=1000):
return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
print({'iter': iter,
'throughput': round(30.0 / diff_time * input.size(0), 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench ResNet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
from dragon.vm import torch
from dragon.vm.torch import nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='resnet50', help='compute model')
parser.add_argument('--batch_size', default=64, type=int, help='mini-batch size')
return parser.parse_args()
class BasicBlock(nn.Module):
"""Basic resnet block."""
expansion = 1
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(dim_in, dim, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(dim)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(dim)
self.downsample = downsample
def forward(self, x):
shortcut = x
x = self.relu(self.bn1(self.conv1(x)))
x = self.bn2(self.conv2(x))
if self.downsample is not None:
shortcut = self.downsample(shortcut)
return self.relu(x.add_(shortcut))
class Bottleneck(nn.Module):
"""Bottleneck resnet block."""
expansion = 4
groups, width_per_group = 1, 64
def __init__(self, dim_in, dim, stride=1, downsample=None):
super(Bottleneck, self).__init__()
width = int(dim * (self.width_per_group / 64.)) * self.groups
self.conv1 = nn.Conv2d(dim_in, width, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(width)
self.conv2 = nn.Conv2d(width, width, kernel_size=3,
stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width)
self.conv3 = nn.Conv2d(width, dim * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(dim * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
def forward(self, x):
shortcut = x
x = self.relu(self.bn1(self.conv1(x)))
x = self.relu(self.bn2(self.conv2(x)))
x = self.bn3(self.conv3(x))
if self.downsample is not None:
shortcut = self.downsample(shortcut)
return self.relu(x.add_(shortcut))
class ResNet(nn.Module):
"""ResNet."""
def __init__(self, block, depths, num_classes=1000):
super(ResNet, self).__init__()
dim_in, stage_dims, blocks = 64, [64, 128, 256, 512], []
self.num_features = stage_dims[-1] * block.expansion
self.conv1 = nn.Conv2d(3, stage_dims[0], kernel_size=7,
stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(stage_dims[0])
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
# Blocks.
for i, depth, dim in zip(range(4), depths, stage_dims):
stride = 1 if i == 0 else 2
downsample = None
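# Project the shortcut with a strided 1x1 conv whenever the spatial
# size or channel width changes.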
if stride != 1 or dim_in != dim * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(dim_in, dim * block.expansion, kernel_size=1,
stride=stride, bias=False),
nn.BatchNorm2d(dim * block.expansion))
blocks.append(block(dim_in, dim, stride, downsample))
dim_in = dim * block.expansion
for _ in range(depth - 1):
blocks.append(block(dim_in, dim))
setattr(self, 'layer%d' % (i + 1), nn.Sequential(*blocks[-depth:]))
self.blocks = blocks
# Head.
self.avgpool = nn.AdaptiveAvgPool2d(1)
classifier = nn.Linear if num_classes > 0 else nn.Identity
self.fc = classifier(self.num_features, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
def forward(self, x):
x = self.relu(self.bn1(self.conv1(x)))
x = self.maxpool(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.avgpool(x).flatten_(1))
def resnet18(num_classes=1000):
return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
def resnet34(num_classes=1000):
return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes)
def resnet50(num_classes=1000):
return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes)
def resnet101(num_classes=1000):
return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes)
def resnet152(num_classes=1000):
return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
print({'iter': iter,
'throughput': round(30.0 / diff_time * input.size(0), 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench Vision Transformer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import torch
import torch.nn as nn
try:
from timm.models.layers import DropPath
except ImportError:
DropPath = nn.Identity
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='vit_base_patch16_224', help='compute model')
parser.add_argument('--batch_size', default=64, type=int, help='mini-batch size')
return parser.parse_args()
class MLP(nn.Module):
"""Two layers MLP."""
def __init__(self, dim, mlp_ratio=4):
super(MLP, self).__init__()
self.fc1 = nn.Linear(dim, int(dim * mlp_ratio))
self.fc2 = nn.Linear(int(dim * mlp_ratio), dim)
self.activation = nn.GELU()
def forward(self, x):
return self.fc2(self.activation(self.fc1(x)))
class Attention(nn.Module):
"""Multihead attention."""
def __init__(self, dim, num_heads, qkv_bias=True):
super(Attention, self).__init__()
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = self.head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
def forward(self, x):
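# Fold the QKV projection into shape (3, batch, heads, tokens, head_dim)
# so the three projections can be split with a single unbind.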
qkv_shape = (-1, x.size(1), 3, self.num_heads, self.head_dim)
qkv = self.qkv(x).reshape(qkv_shape).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(dim=0)
attn = q @ k.transpose(-2, -1).mul(self.scale)
attn = nn.functional.softmax(attn, dim=-1)
return self.proj((attn @ v).transpose(1, 2).flatten(2))
class Block(nn.Module):
"""Transformer block."""
def __init__(self, dim, num_heads, mlp_ratio=4, qkv_bias=True, drop_path=0):
super(Block, self).__init__()
self.norm1 = nn.LayerNorm(dim)
self.attn = Attention(dim, num_heads, qkv_bias=qkv_bias)
self.norm2 = nn.LayerNorm(dim)
self.mlp = MLP(dim, mlp_ratio=mlp_ratio)
self.drop_path = DropPath(drop_path)
def forward(self, x):
x = self.drop_path(self.attn(self.norm1(x))).add_(x)
return self.drop_path(self.mlp(self.norm2(x))).add_(x)
class PatchEmbed(nn.Module):
"""Patch embedding layer."""
def __init__(self, dim=768, patch_size=16):
super(PatchEmbed, self).__init__()
self.proj = nn.Conv2d(3, dim, patch_size, patch_size)
def forward(self, x):
return self.proj(x)
class PosEmbed(nn.Module):
"""Position embedding layer."""
def __init__(self, dim, num_patches):
super(PosEmbed, self).__init__()
self.dim = dim
self.num_patches = num_patches
self.weight = nn.Parameter(torch.zeros(num_patches, dim))
nn.init.normal_(self.weight, std=0.02)
def forward(self, x):
return x.add_(self.weight)
class VisionTransformer(nn.Module):
"""Vision Transformer."""
def __init__(self, depths, dims, num_heads, mlp_ratios,
img_size=224, patch_size=16, drop_path=0, num_classes=1000):
super(VisionTransformer, self).__init__()
drop_path = (torch.linspace(
0, drop_path, sum(depths), dtype=torch.float32).tolist()
if drop_path > 0 else [drop_path] * sum(depths))
self.num_patches = (img_size // patch_size) ** 2
self.num_features = dims[0]
self.patch_embed = PatchEmbed(dims[0], patch_size)
self.pos_embed = PosEmbed(dims[0], self.num_patches + 1)
self.cls_token = nn.Parameter(torch.zeros(1, 1, dims[0]))
self.blocks = nn.ModuleList([Block(
dim=dims[0], num_heads=num_heads[0],
mlp_ratio=mlp_ratios[0], qkv_bias=True,
drop_path=drop_path[i]) for i in range(depths[0])])
self.norm = nn.LayerNorm(self.num_features)
classifier = nn.Linear if num_classes > 0 else nn.Identity
self.fc = classifier(self.num_features, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.normal_(m.weight, std=.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
nn.init.normal_(self.cls_token, std=.02)
def forward(self, x):
x = self.patch_embed(x)
x = x.flatten(2).transpose(1, 2)
cls_tokens = self.cls_token.expand(x.size(0), 1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x = self.pos_embed(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.norm(x[:, 1:].mean(1)))
def vit_small_patch16_224(num_classes=1000):
return VisionTransformer(depths=(12,), dims=(384,), num_heads=(6,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
def vit_base_patch16_224(num_classes=1000):
return VisionTransformer(depths=(12,), dims=(768,), num_heads=(12,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
def vit_large_patch16_224(num_classes=1000):
return VisionTransformer(depths=(24,), dims=(1024,), num_heads=(16,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
throughput = 30.0 / diff_time * input.size(0)
print({'iter': iter,
'throughput': round(throughput, 2),
'time': round(diff_time, 3)})
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Bench Vision Transformer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
from dragon.vm import torch
from dragon.vm.torch import nn
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true', help='run training or inference')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--model', default='vit_base_patch16_224', help='compute model')
parser.add_argument('--batch_size', default=64, type=int, help='mini-batch size')
return parser.parse_args()
class MLP(nn.Module):
"""Two layers MLP."""
def __init__(self, dim, mlp_ratio=4):
super(MLP, self).__init__()
self.fc1 = nn.Linear(dim, int(dim * mlp_ratio))
self.fc2 = nn.Linear(int(dim * mlp_ratio), dim)
self.activation = nn.GELU()
def forward(self, x):
return self.fc2(self.activation(self.fc1(x)))
class Attention(nn.Module):
"""Multihead attention."""
def __init__(self, dim, num_heads, qkv_bias=True):
super(Attention, self).__init__()
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = self.head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
def forward(self, x):
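# Fold the QKV projection into shape (3, batch, heads, tokens, head_dim)
# so the three projections can be split with a single unbind.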
qkv_shape = (-1, x.size(1), 3, self.num_heads, self.head_dim)
qkv = self.qkv(x).reshape_(qkv_shape).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(dim=0, copy=False)
attn = q @ k.transpose(-2, -1).mul_(self.scale)
attn = nn.functional.softmax(attn, dim=-1, inplace=True)
return self.proj((attn @ v).transpose(1, 2).flatten_(2))
class Block(nn.Module):
"""Transformer block."""
def __init__(self, dim, num_heads, mlp_ratio=4, qkv_bias=True, drop_path=0):
super(Block, self).__init__()
self.norm1 = nn.LayerNorm(dim)
self.attn = Attention(dim, num_heads, qkv_bias=qkv_bias)
self.norm2 = nn.LayerNorm(dim)
self.mlp = MLP(dim, mlp_ratio=mlp_ratio)
self.drop_path = nn.DropPath(drop_path, inplace=True)
def forward(self, x):
x = self.drop_path(self.attn(self.norm1(x))).add_(x)
return self.drop_path(self.mlp(self.norm2(x))).add_(x)
class PatchEmbed(nn.Module):
"""Patch embedding layer."""
def __init__(self, dim=768, patch_size=16):
super(PatchEmbed, self).__init__()
self.proj = nn.Conv2d(3, dim, patch_size, patch_size)
def forward(self, x):
return self.proj(x)
class PosEmbed(nn.Module):
"""Position embedding layer."""
def __init__(self, dim, num_patches):
super(PosEmbed, self).__init__()
self.dim = dim
self.num_patches = num_patches
self.weight = nn.Parameter(torch.zeros(num_patches, dim))
nn.init.normal_(self.weight, std=0.02)
def forward(self, x):
return x.add_(self.weight)
class VisionTransformer(nn.Module):
"""Vision Transformer."""
def __init__(self, depths, dims, num_heads, mlp_ratios,
img_size=224, patch_size=16, drop_path=0, num_classes=1000):
super(VisionTransformer, self).__init__()
drop_path = (torch.linspace(
0, drop_path, sum(depths), dtype=torch.float32).tolist()
if drop_path > 0 else [drop_path] * sum(depths))
self.num_patches = (img_size // patch_size) ** 2
self.num_features = dims[0]
self.patch_embed = PatchEmbed(dims[0], patch_size)
self.pos_embed = PosEmbed(dims[0], self.num_patches + 1)
self.cls_token = nn.Parameter(torch.zeros(1, 1, dims[0]))
self.blocks = nn.ModuleList([Block(
dim=dims[0], num_heads=num_heads[0],
mlp_ratio=mlp_ratios[0], qkv_bias=True,
drop_path=drop_path[i]) for i in range(depths[0])])
self.norm = nn.LayerNorm(self.num_features)
classifier = nn.Linear if num_classes > 0 else nn.Identity
self.fc = classifier(self.num_features, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.normal_(m.weight, std=.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
nn.init.normal_(self.cls_token, std=.02)
def forward(self, x):
x = self.patch_embed(x)
x = x.flatten_(2).transpose(1, 2)
cls_tokens = self.cls_token.expand(x.size(0), 1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x = self.pos_embed(x)
for blk in self.blocks:
x = blk(x)
return self.fc(self.norm(x[:, 1:].mean(1)))
def vit_small_patch16_224(num_classes=1000):
return VisionTransformer(depths=(12,), dims=(384,), num_heads=(6,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
def vit_base_patch16_224(num_classes=1000):
return VisionTransformer(depths=(12,), dims=(768,), num_heads=(12,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
def vit_large_patch16_224(num_classes=1000):
return VisionTransformer(depths=(24,), dims=(1024,), num_heads=(16,),
mlp_ratios=(4,), img_size=224, patch_size=16,
drop_path=0.1, num_classes=num_classes)
if __name__ == '__main__':
args = parse_args()
print('Called with args:\n' + str(args))
use_fp16 = args.precision.lower() == 'float16'
m = globals()[args.model]().cuda(args.device)
m = m if args.train else m.eval()
m = m.half() if use_fp16 else m
criterion = nn.CrossEntropyLoss()
input = torch.zeros(args.batch_size, 3, 224, 224,
dtype=torch.float16 if use_fp16 else torch.float32)
input = input.cuda(args.device)
target = torch.zeros(input.size(0), dtype=torch.int64).cuda(args.device)
for iter in range(5):
tic = time.time()
with torch.enable_grad() if args.train else torch.no_grad():
for i in range(30):
x = m(input)
if args.train:
loss = criterion(x.float(), target)
loss.backward()
torch.cuda.synchronize(args.device)
diff_time = time.time() - tic
throughput = 30.0 / diff_time * input.size(0)
print({'iter': iter,
'throughput': round(throughput, 2),
'time': round(diff_time, 3)})
# Tutorial: How To Use
## Run benchmarks
First, run the default benchmarks and save the results to "results.json":
```
python run.py -f ./results.json
```
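
Each entry in the saved JSON records the device, backend, model, and measured metrics, for example:

```
{
  "device": "A40",
  "backend": "torch-1.10.1+cu113",
  "model": "resnet50.train",
  "throughput": 608.885
}
```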
To select the compute backend, add the "--backend" argument:
```
python run.py --backend torch -f ./results.json
```
For more options, see the "--help" output:
```
python run.py --help
```
## Visualize results
We visualize the measured results in `references/torch_fp16_a40.json` and `references/torch_fp16_titanv.json`:
```
python visualize.py --device \
--input references/torch_fp16_a40.json \
references/torch_fp16_titanv.json \
--output torch_fp16_a40_titanv.png
```
For more options, see the "--help" output:
```
python visualize.py --help
```
[
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "resnet50.train",
"throughout": 608.885
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "vit_base_patch16_224.train",
"throughout": 417.72749999999996
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "swin_tiny_patch4_window7_224.train",
"throughout": 461.685
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "mobilenet_v3_large.train",
"throughout": 1281.5575
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "resnet50.eval",
"throughout": 2049.485
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "vit_base_patch16_224.eval",
"throughout": 1000.2925
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "swin_tiny_patch4_window7_224.eval",
"throughout": 1378.7150000000001
},
{
"device": "A40",
"backend": "torch-1.10.1+cu113",
"model": "mobilenet_v3_large.eval",
"throughout": 6638.6775
}
]
[
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "resnet50.train",
"throughout": 592.5925
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "vit_base_patch16_224.train",
"throughout": 260.4525
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "swin_tiny_patch4_window7_224.train",
"throughout": 385.3825
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "mobilenet_v3_large.train",
"throughout": 1014.5600000000001
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "resnet50.eval",
"throughout": 2113.0075
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "vit_base_patch16_224.eval",
"throughout": 711.6025
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "swin_tiny_patch4_window7_224.eval",
"throughout": 1323.185
},
{
"device": "TITAN V",
"backend": "torch-1.8.1+cu111",
"model": "mobilenet_v3_large.eval",
"throughout": 4600.1775
}
]
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Run benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import collections
import copy
import json
import logging
import subprocess
import sys
import time
BENCHMARKS = [
# Model Training.
('benchmarks/models/resnet/bench_*.py', 'resnet50.train'),
('benchmarks/models/vit/bench_*.py', 'vit_base_patch16_224.train'),
('benchmarks/models/swin/bench_*.py', 'swin_tiny_patch4_window7_224.train'),
('benchmarks/models/mobilenetv3/bench_*.py', 'mobilenet_v3_large.train'),
# Model Inference.
('benchmarks/models/resnet/bench_*.py', 'resnet50.eval'),
('benchmarks/models/vit/bench_*.py', 'vit_base_patch16_224.eval'),
('benchmarks/models/swin/bench_*.py', 'swin_tiny_patch4_window7_224.eval'),
('benchmarks/models/mobilenetv3/bench_*.py', 'mobilenet_v3_large.eval'),
]
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser(
description='Run the benchmarks.')
parser.add_argument('--precision', default='float16', help='compute precision')
parser.add_argument('--device', default=0, type=int, help='compute device')
parser.add_argument('--backend', default='torch', help='compute backend')
parser.add_argument('--metric', nargs='+', default=['throughput'],
help='performance metrics')
parser.add_argument('-q', '--quiet', action='store_true',
help='print error information only')
parser.add_argument('-f', '--filename', default='',
help='Save results to the specified file')
return parser.parse_args()
def get_base_command(args):
"""Return the base command."""
cmd = [sys.executable, '{}', '--model', '{}']
cmd += ['--train'] if args.train else []
cmd += ['--precision', args.precision]
cmd += ['--device', str(args.device)]
return cmd
def get_device_name(device_index):
"""Return the device name."""
device_name = subprocess.check_output(
'nvidia-smi --query-gpu=name --format=csv,noheader -i %d'
% device_index, shell=True).decode('ascii').strip()
device_name = device_name.replace('NVIDIA ', '')
return device_name
def get_backend_name(backend):
"""Return the backend name."""
if backend == 'torch':
version = subprocess.check_output(
'%s -c "import torch;print(torch.__version__)"'
% sys.executable, shell=True).decode('ascii').strip()
return 'torch-%s' % version
elif backend == 'tf':
version = subprocess.check_output(
'%s -c "import tensorflow;print(tensorflow.__version__)"'
% sys.executable, shell=True).decode('ascii').strip()
return 'tensorflow-%s' % version
elif 'vm' in backend:
version = subprocess.check_output(
'%s -c "import dragon;print(dragon.__version__)"'
% sys.executable, shell=True).decode('ascii').strip()
return 'seeta-dragon-%s' % version
return backend
def get_model_args(args, model):
"""Return the model-specific args."""
args = copy.deepcopy(args)
model = model.split('.')
args.model = model.pop(0)
presets = {'train': ('train', True),
'eval': ('train', False),
'float16': ('precision', 'float16'),
'float32': ('precision', 'float32')}
for k, v in presets.items():
if k in model:
setattr(args, v[0], v[1])
return args
def get_results(output, keys):
"""Extract results from the output string."""
results = collections.defaultdict(list)
for line in output.splitlines():
if not line.startswith('{'):
continue
if not line.endswith('}'):
continue
metrics = ast.literal_eval(line)  # Safer than eval() for printed dicts.
for k in keys:
if k in metrics:
results[k].append(metrics[k])
for k in results.keys():
results[k].pop(0) # Warmup.
results[k] = sum(results[k]) / len(results[k])
return results
def main():
"""Main procedure."""
args = parse_args()
logging.getLogger().setLevel('ERROR' if args.quiet else 'INFO')
log_handler = logging.StreamHandler(sys.stderr)
log_handler.terminator = ''
log_handler.setFormatter(logging.Formatter('%(message)s'))
logging.getLogger().addHandler(log_handler)
all_results = []
for count, (script, model) in enumerate(BENCHMARKS):
model_args = get_model_args(args, model)
base_command = get_base_command(model_args)
logging.info('[%d/%d] bench %s ... '
% (count + 1, len(BENCHMARKS), model))
script = script.replace('*', args.backend)
command = (' '.join(base_command)).format(script, model_args.model)
output = subprocess.check_output(command, shell=True)
output = output.decode('ascii').strip()
results = collections.OrderedDict()
results['device'] = get_device_name(args.device)
results['backend'] = get_backend_name(args.backend)
results['model'] = model
results.update(get_results(output, args.metric))
all_results.append(results)
logging.info('ok\n')
if not args.filename:
args.filename = '../{}.json'.format(time.strftime(
'%Y%m%d_%H%M%S', time.localtime(time.time())))
with open(args.filename, 'w') as f:
json.dump(all_results, f)
if __name__ == '__main__':
main()
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Visualize results."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import json
from matplotlib import pyplot as plt
def parse_args():
"""Parse arguments."""
parser = argparse.ArgumentParser(
description='Visualize JSON results.')
parser.add_argument('--input', nargs='+', help='input json file')
parser.add_argument('--output', default=None, help='output png file')
parser.add_argument('--metric', default='throughput', help='performance metric')
parser.add_argument('--model', default=None, help='select model')
parser.add_argument('--train', action='store_true', help='select training model')
parser.add_argument('--eval', action='store_true', help='select inference model')
parser.add_argument('--device', action='store_true', help='add device name')
parser.add_argument('--title', default='', help='figure title')
parser.add_argument('--dpi', default=200, type=int, help='figure dpi')
return parser.parse_args()
def get_plot_xy(args, results):
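"""Return one (metric, label) pair, or (None, None) if filtered out."""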
x = results[args.metric]
y = results['model']
if args.train:
if '.eval' in y:
return None, None
y = y.replace('.train', '')
if args.eval:
if '.train' in y:
return None, None
y = y.replace('.eval', '')
if args.model is not None:
if not y.startswith(args.model):
return None, None
if args.device:
y += ' [' + results['device'] + ']'
return x, y
if __name__ == '__main__':
args = parse_args()
# Collect data.
all_results = []
for input_file in args.input:
with open(input_file, 'r') as f:
all_results += json.load(f)
xs, ys = [], []
for results in all_results:
x, y = get_plot_xy(args, results)
if x is not None:
ys.append(y)
xs.append(x)
ys, xs = zip(*sorted(zip(ys, xs), reverse=True))
if len(set(ys)) != len(ys):
print('Warning: duplicate Y-axis labels. Try adding "--device".')
# Set matplotlib.
plt.switch_backend('agg')
# https://matplotlib.org/matplotblog/posts/matplotlib-cyberpunk-style
plt.style.use("seaborn-dark")
for param in ['figure.facecolor', 'axes.facecolor', 'savefig.facecolor']:
plt.rcParams[param] = '#212946'
for param in ['text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color']:
plt.rcParams[param] = '0.9'
# Draw plots.
fig, ax = plt.subplots(1, 1, dpi=args.dpi)
ax.set_title(args.title if args.title else 'GPU Performance', pad=10)
ax.grid(color='#2A3459', axis='x')
ax.set_xlabel(args.metric)
ax.bar_label(ax.barh(ys, xs, color='#FE53BB'), fmt='%.0f', padding=2)
# Save figures.
if args.output:
plt.savefig(args.output, bbox_inches='tight')
else:
plt.show()