Commit cca00c0d by Ting PAN

Normalize the getters of operator arguments

Summary:
This commit renames the operator argument getters to ``GetArgument``,
whether the argument is single or repeated.
1 parent 58708021
Showing with 2231 additions and 1491 deletions
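For context, a minimal before/after sketch of a call site inside an operator; the argument names "axis" and "dims" are illustrative, not taken from this diff:
// Before this commit: separate getters for single and repeated arguments.
auto axis = OperatorBase::Arg<int64_t>("axis", 0);
auto dims = OperatorBase::Args<int64_t>("dims");
// After this commit: one overloaded getter; requesting vector<T> selects
// the repeated form, and the single form takes an optional default value.
auto axis = OperatorBase::GetArgument<int64_t>("axis", 0);
auto dims = OperatorBase::GetArgument<vector<int64_t>>("dims");
The OP_SINGLE_ARG and OP_REPEATED_ARG macros introduced later in this diff wrap these calls.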
......@@ -11,14 +11,6 @@ Constructors
Public Functions
----------------
Arg
###
.. doxygenfunction:: dragon::Operator::Arg
Args
####
.. doxygenfunction:: dragon::Operator::Args
Buffer
######
.. doxygenfunction:: dragon::Operator::Buffer
......@@ -27,6 +19,14 @@ Fuse
####
.. doxygenfunction:: dragon::Operator::Fuse
GetArgument
###########
.. doxygenfunction:: dragon::Operator::GetArgument(const string &name)
GetArgument
###########
.. doxygenfunction:: dragon::Operator::GetArgument(const string &name, const T &default_value)
Input
#####
.. doxygenfunction:: dragon::Operator::Input
......
......@@ -21,9 +21,6 @@ dragon
Functions
---------
`arange(...) <dragon/arange.html>`_
: Return a tensor of evenly spaced values within an interval.
`assign(...) <dragon/assign.html>`_
: Assign the value to input.
......@@ -120,6 +117,9 @@ dragon
`python_plugin(...) <dragon/python_plugin.html>`_
: Create a plugin operator from the python class.
`range(...) <dragon/range.html>`_
: Return a tensor of evenly spaced values within an interval.
`repeat(...) <dragon/repeat.html>`_
: Repeat the elements along the given axis.
......@@ -165,7 +165,6 @@ dragon
.. toctree::
:hidden:
dragon/arange
dragon/assign
dragon/broadcast_to
dragon/cast
......@@ -200,6 +199,7 @@ dragon
dragon/one_hot
dragon/pad
dragon/python_plugin
dragon/range
dragon/repeat
dragon/reset_workspace
dragon/reshape
......
......@@ -21,6 +21,9 @@ dragon.random
`normal_like(...) <random/normal_like.html>`_
: Return a tensor initialized from the normal distribution with shape as the other.
`permutation(...) <random/permutation.html>`_
: Return a tensor with values in the permuted range.
`set_seed(...) <random/set_seed.html>`_
: Set the global random seed.
......@@ -41,6 +44,7 @@ dragon.random
random/multinomial
random/normal
random/normal_like
random/permutation
random/set_seed
random/truncated_normal
random/uniform
......
permutation
===========
.. autofunction:: dragon.random.permutation
.. raw:: html
<style>
h1:before {
content: "dragon.random.";
color: #103d3e;
}
</style>
arange
======
range
=====
.. autofunction:: dragon.arange
.. autofunction:: dragon.range
.. raw:: html
......
......@@ -94,6 +94,9 @@ vm.torch
`eye(...) <torch/eye.html>`_
: Return a tensor constructed as the identity matrix.
`flatten(...) <torch/flatten.html>`_
: Return a tensor with dimensions flattened.
`floor(...) <torch/floor.html>`_
: Compute the largest integer not greater than input.
......@@ -184,6 +187,9 @@ vm.torch
`randn(...) <torch/randn.html>`_
: Return a tensor from the normal distribution of N(0, 1).
`randperm(...) <torch/randperm.html>`_
: Return a tensor with values in the permuted range.
`reciprocal(...) <torch/reciprocal.html>`_
: Compute the reciprocal of input.
......@@ -268,6 +274,7 @@ vm.torch
torch/eq
torch/exp
torch/eye
torch/flatten
torch/floor
torch/from_numpy
torch/ge
......@@ -299,6 +306,7 @@ vm.torch
torch/pow
torch/rand
torch/randn
torch/randperm
torch/reciprocal
torch/repeat
torch/reshape
......
......@@ -189,6 +189,14 @@ fill\_
#######
.. automethod:: dragon.vm.torch.Tensor.fill_
flatten
#######
.. automethod:: dragon.vm.torch.Tensor.flatten
flatten\_
#########
.. automethod:: dragon.vm.torch.Tensor.flatten_
float
#####
.. automethod:: dragon.vm.torch.Tensor.float
......@@ -470,6 +478,7 @@ zero\_
.. _torch.div(...): div.html
.. _torch.eq(...): eq.html
.. _torch.exp(...): exp.html
.. _torch.flatten(...): flatten.html
.. _torch.floor(...): floor.html
.. _torch.ge(...): ge.html
.. _torch.gt(...): gt.html
......
flatten
=======
.. autofunction:: dragon.vm.torch.flatten
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
......@@ -3,251 +3,255 @@ vm.torch.nn
.. only:: html
Classes
-------
Classes
-------
`class Affine <nn/Affine.html>`_
: Apply the affine transformation over input.
`class Affine <nn/Affine.html>`_
: Apply the affine transformation over input.
`class AvgPool2d <nn/AvgPool2d.html>`_
: Apply the 2d average pooling.
`class AvgPool2d <nn/AvgPool2d.html>`_
: Apply the 2d average pooling.
`class BatchNorm1d <nn/BatchNorm1d.html>`_
: Apply the batch normalization over 2d input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class BatchNorm1d <nn/BatchNorm1d.html>`_
: Apply the batch normalization over 2d input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class BatchNorm2d <nn/BatchNorm2d.html>`_
: Apply the batch normalization over 3d input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class BatchNorm2d <nn/BatchNorm2d.html>`_
: Apply the batch normalization over 3d input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class BatchNorm3d <nn/BatchNorm3d.html>`_
: Apply the batch normalization over 4d input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class BatchNorm3d <nn/BatchNorm3d.html>`_
: Apply the batch normalization over 4d input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class BCEWithLogitsLoss <nn/BCEWithLogitsLoss.html>`_
: Compute the sigmoid cross entropy with contiguous targets.
`class BCEWithLogitsLoss <nn/BCEWithLogitsLoss.html>`_
: Compute the sigmoid cross entropy with contiguous targets.
`class ConstantPad1d <nn/ConstantPad1d.html>`_
: Pad input according to the last dimension with a constant.
`class ConstantPad1d <nn/ConstantPad1d.html>`_
: Pad input according to the last dimension with a constant.
`class ConstantPad2d <nn/ConstantPad2d.html>`_
: Pad input according to the last 2-dimensions with a constant.
`class ConstantPad2d <nn/ConstantPad2d.html>`_
: Pad input according to the last 2-dimensions with a constant.
`class ConstantPad3d <nn/ConstantPad3d.html>`_
: Pad input according to the last 3-dimensions with a constant.
`class ConstantPad3d <nn/ConstantPad3d.html>`_
: Pad input according to the last 3-dimensions with a constant.
`class Conv2d <nn/Conv2d.html>`_
: Apply the 2d convolution.
`class Conv2d <nn/Conv2d.html>`_
: Apply the 2d convolution.
`class ConvTranspose2d <nn/ConvTranspose2d.html>`_
: Apply the 2d deconvolution.
`class ConvTranspose2d <nn/ConvTranspose2d.html>`_
: Apply the 2d deconvolution.
`class CrossEntropyLoss <nn/CrossEntropyLoss.html>`_
: Compute the softmax cross entropy with sparse labels.
`class CrossEntropyLoss <nn/CrossEntropyLoss.html>`_
: Compute the softmax cross entropy with sparse labels.
`class CTCLoss <nn/CTCLoss.html>`_
: Compute the ctc loss with batched labels.
`[Graves & Gomez, 2006] <http://www.cs.utoronto.ca/~graves/icml_2006.pdf>`_.
`class CTCLoss <nn/CTCLoss.html>`_
: Compute the ctc loss with batched labels.
`[Graves & Gomez, 2006] <http://www.cs.utoronto.ca/~graves/icml_2006.pdf>`_.
`class DepthwiseConv2d <nn/DepthwiseConv2d.html>`_
: Apply the 2d depthwise convolution.
`[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_.
`class DepthwiseConv2d <nn/DepthwiseConv2d.html>`_
: Apply the 2d depthwise convolution.
`[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_.
`class DropBlock2d <nn/DropBlock2d.html>`_
: Set the spatial blocks to zero randomly.
`[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_.
`class DropBlock2d <nn/DropBlock2d.html>`_
: Set the spatial blocks to zero randomly.
`[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_.
`class Dropout <nn/Dropout.html>`_
: Set the elements to zero randomly.
`[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
`class Dropout <nn/Dropout.html>`_
: Set the elements to zero randomly.
`[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
`class DropPath <nn/DropPath.html>`_
: Set the examples over input to zero randomly.
`[Larsson et.al, 2016] <https://arxiv.org/abs/1605.07648>`_.
`class DropPath <nn/DropPath.html>`_
: Set the examples over input to zero randomly.
`[Larsson et.al, 2016] <https://arxiv.org/abs/1605.07648>`_.
`class ELU <nn/ELU.html>`_
: Apply the exponential linear unit.
`[Clevert et.al, 2015] <https://arxiv.org/abs/1511.07289>`_.
`class ELU <nn/ELU.html>`_
: Apply the exponential linear unit.
`[Clevert et.al, 2015] <https://arxiv.org/abs/1511.07289>`_.
`class GroupNorm <nn/GroupNorm.html>`_
: Apply the group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
`class Flatten <nn/Flatten.html>`_
: Flatten the dimensions of input.
`class GRU <nn/GRU.html>`_
: Apply a multi-layer gated recurrent unit (GRU) RNN.
`[Cho et.al, 2014] <https://arxiv.org/abs/1406.1078>`_.
`class GroupNorm <nn/GroupNorm.html>`_
: Apply the group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
`class GumbelSoftmax <nn/GumbelSoftmax.html>`_
: Apply the gumbel softmax with a temperature.
`[Jang et.al, 2016] <https://arxiv.org/abs/1611.01144>`_.
`class GRU <nn/GRU.html>`_
: Apply a multi-layer gated recurrent unit (GRU) RNN.
`[Cho et.al, 2014] <https://arxiv.org/abs/1406.1078>`_.
`class L1Loss <nn/L1Loss.html>`_
: Compute the element-wise absolute value difference.
`class GumbelSoftmax <nn/GumbelSoftmax.html>`_
: Apply the gumbel softmax with a temperature.
`[Jang et.al, 2016] <https://arxiv.org/abs/1611.01144>`_.
`class LeakyReLU <nn/LeakyReLU.html>`_
: Apply the leaky rectified linear unit.
`class L1Loss <nn/L1Loss.html>`_
: Compute the element-wise absolute value difference.
`class Linear <nn/Linear.html>`_
: Apply the linear transformation.
`class LeakyReLU <nn/LeakyReLU.html>`_
: Apply the leaky rectified linear unit.
`class LocalResponseNorm <nn/LocalResponseNorm.html>`_
: Apply the local response normalization.
`[Krizhevsky et.al, 2012] <http://www.cs.toronto.edu/~hinton/absps/imagenet.pdf>`_.
`class Linear <nn/Linear.html>`_
: Apply the linear transformation.
`class LogSoftmax <nn/LogSoftmax.html>`_
: Apply the composite of logarithm and softmax.
`class LocalResponseNorm <nn/LocalResponseNorm.html>`_
: Apply the local response normalization.
`[Krizhevsky et.al, 2012] <http://www.cs.toronto.edu/~hinton/absps/imagenet.pdf>`_.
`class LSTM <nn/LSTM.html>`_
: Apply a multi-layer long short-term memory (LSTM) RNN.
`[Hochreiter & Schmidhuber, 1997] <https://doi.org/10.1162>`_.
`class LogSoftmax <nn/LogSoftmax.html>`_
: Apply the composite of logarithm and softmax.
`class LSTMCell <nn/LSTMCell.html>`_
: Apply a long short-term memory (LSTM) cell.
`[Hochreiter & Schmidhuber, 1997] <https://doi.org/10.1162>`_.
`class LSTM <nn/LSTM.html>`_
: Apply a multi-layer long short-term memory (LSTM) RNN.
`[Hochreiter & Schmidhuber, 1997] <https://doi.org/10.1162>`_.
`class MaxPool2d <nn/MaxPool2d.html>`_
: Apply the 2d max pooling.
`class LSTMCell <nn/LSTMCell.html>`_
: Apply a long short-term memory (LSTM) cell.
`[Hochreiter & Schmidhuber, 1997] <https://doi.org/10.1162>`_.
`class Module <nn/Module.html>`_
: The base class of modules.
`class MaxPool2d <nn/MaxPool2d.html>`_
: Apply the 2d max pooling.
`class MSELoss <nn/MSELoss.html>`_
: Compute the element-wise squared error.
`class Module <nn/Module.html>`_
: The base class of modules.
`class NLLLoss <nn/NLLLoss.html>`_
: Compute the negative log-likelihood loss with sparse labels.
`class MSELoss <nn/MSELoss.html>`_
: Compute the element-wise squared error.
`class Parameter <nn/Parameter.html>`_
: A wrapped tensor considered to be a module parameter.
`class NLLLoss <nn/NLLLoss.html>`_
: Compute the negative log-likelihood loss with sparse labels.
`class PReLU <nn/PReLU.html>`_
: Apply the parametric rectified linear unit.
`[He et.al, 2015] <https://arxiv.org/abs/1502.01852>`_.
`class Parameter <nn/Parameter.html>`_
: A wrapped tensor considered to be a module parameter.
`class ReflectionPad1d <nn/ReflectionPad1d.html>`_
: Pad input according to the last dimension by reflecting boundary.
`class PReLU <nn/PReLU.html>`_
: Apply the parametric rectified linear unit.
`[He et.al, 2015] <https://arxiv.org/abs/1502.01852>`_.
`class ReflectionPad2d <nn/ReflectionPad2d.html>`_
: Pad input according to the last 2-dimensions by reflecting boundary.
`class ReflectionPad1d <nn/ReflectionPad1d.html>`_
: Pad input according to the last dimension by reflecting boundary.
`class ReflectionPad3d <nn/ReflectionPad3d.html>`_
: Pad input according to the last 3-dimensions by reflecting boundary.
`class ReflectionPad2d <nn/ReflectionPad2d.html>`_
: Pad input according to the last 2-dimensions by reflecting boundary.
`class ReLU <nn/ReLU.html>`_
: Apply the rectified linear unit.
`[Nair & Hinton, 2010] <http://www.csri.utoronto.ca/~hinton/absps/reluICML.pdf>`_.
`class ReflectionPad3d <nn/ReflectionPad3d.html>`_
: Pad input according to the last 3-dimensions by reflecting boundary.
`class ReLU6 <nn/ReLU6.html>`_
: Apply the clipped-6 rectified linear unit.
`[Krizhevsky, 2010] <http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf>`_.
`class ReLU <nn/ReLU.html>`_
: Apply the rectified linear unit.
`[Nair & Hinton, 2010] <http://www.csri.utoronto.ca/~hinton/absps/reluICML.pdf>`_.
`class ReplicationPad1d <nn/ReplicationPad1d.html>`_
: Pad input according to the last dimension by replicating boundary.
`class ReLU6 <nn/ReLU6.html>`_
: Apply the clipped-6 rectified linear unit.
`[Krizhevsky, 2010] <http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf>`_.
`class ReplicationPad2d <nn/ReplicationPad2d.html>`_
: Pad input according to the last 2-dimensions by replicating boundary.
`class ReplicationPad1d <nn/ReplicationPad1d.html>`_
: Pad input according to the last dimension by replicating boundary.
`class ReplicationPad3d <nn/ReplicationPad3d.html>`_
: Pad input according to the last 3-dimensions by replicating boundary.
`class ReplicationPad2d <nn/ReplicationPad2d.html>`_
: Pad input according to the last 2-dimensions by replicating boundary.
`class RNN <nn/RNN.html>`_
: Apply a multi-layer Elman RNN.
`[Elman, 1990] <https://doi.org/10.1016>`_.
`class ReplicationPad3d <nn/ReplicationPad3d.html>`_
: Pad input according to the last 3-dimensions by replicating boundary.
`class SELU <nn/SELU.html>`_
: Apply the scaled exponential linear unit.
`[Klambauer et.al, 2017] <https://arxiv.org/abs/1706.02515>`_.
`class RNN <nn/RNN.html>`_
: Apply a multi-layer Elman RNN.
`[Elman, 1990] <https://doi.org/10.1016>`_.
`class Sigmoid <nn/Sigmoid.html>`_
: Apply the sigmoid function.
`class SELU <nn/SELU.html>`_
: Apply the scaled exponential linear unit.
`[Klambauer et.al, 2017] <https://arxiv.org/abs/1706.02515>`_.
`class SigmoidFocalLoss <nn/SigmoidFocalLoss.html>`_
: Compute the sigmoid focal loss with sparse labels.
`[Lin et.al, 2017] <https://arxiv.org/abs/1708.02002>`__.
`class Sigmoid <nn/Sigmoid.html>`_
: Apply the sigmoid function.
`class SmoothL1Loss <nn/SmoothL1Loss.html>`_
: Compute the element-wise error that transitions from L1 to L2.
`[Girshick, 2015] <https://arxiv.org/abs/1504.08083>`_.
`class SigmoidFocalLoss <nn/SigmoidFocalLoss.html>`_
: Compute the sigmoid focal loss with sparse labels.
`[Lin et.al, 2017] <https://arxiv.org/abs/1708.02002>`__.
`class Softmax <nn/Softmax.html>`_
: Apply the softmax function.
`class SmoothL1Loss <nn/SmoothL1Loss.html>`_
: Compute the element-wise error that transitions from L1 to L2.
`[Girshick, 2015] <https://arxiv.org/abs/1504.08083>`_.
`class Tanh <nn/Tanh.html>`_
: Apply the tanh function.
`class Softmax <nn/Softmax.html>`_
: Apply the softmax function.
`class SyncBatchNorm <nn/SyncBatchNorm.html>`_
: Apply the sync batch normalization over input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class Tanh <nn/Tanh.html>`_
: Apply the tanh function.
`class Upsample <nn/Upsample.html>`_
: Upsample input via interpolating neighborhoods.
`class SyncBatchNorm <nn/SyncBatchNorm.html>`_
: Apply the sync batch normalization over input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
`class UpsamplingBilinear2d <nn/UpsamplingBilinear2d.html>`_
: Upsample input via bilinear interpolation.
`class Upsample <nn/Upsample.html>`_
: Upsample input via interpolating neighborhoods.
`class UpsamplingNearest2d <nn/UpsamplingNearest2d.html>`_
: Upsample input via nearest interpolation.
`class UpsamplingBilinear2d <nn/UpsamplingBilinear2d.html>`_
: Upsample input via bilinear interpolation.
`class ZeroPad2d <nn/ZeroPad2d.html>`_
: Pad input according to the last 2-dimensions with zeros.
`class UpsamplingNearest2d <nn/UpsamplingNearest2d.html>`_
: Upsample input via nearest interpolation.
`class ZeroPad2d <nn/ZeroPad2d.html>`_
: Pad input according to the last 2-dimensions with zeros.
.. toctree::
:hidden:
nn/Affine
nn/AvgPool2d
nn/BatchNorm1d
nn/BatchNorm2d
nn/BatchNorm3d
nn/BCEWithLogitsLoss
nn/ConstantPad1d
nn/ConstantPad2d
nn/ConstantPad3d
nn/Conv2d
nn/ConvTranspose2d
nn/CrossEntropyLoss
nn/CTCLoss
nn/DepthwiseConv2d
nn/DropBlock2d
nn/Dropout
nn/DropPath
nn/ELU
nn/GroupNorm
nn/GRU
nn/GumbelSoftmax
nn/L1Loss
nn/LeakyReLU
nn/Linear
nn/LocalResponseNorm
nn/LogSoftmax
nn/LSTM
nn/LSTMCell
nn/MaxPool2d
nn/Module
nn/MSELoss
nn/NLLLoss
nn/Parameter
nn/PReLU
nn/ReflectionPad1d
nn/ReflectionPad2d
nn/ReflectionPad3d
nn/ReLU
nn/ReLU6
nn/ReplicationPad1d
nn/ReplicationPad2d
nn/ReplicationPad3d
nn/RNN
nn/SELU
nn/Sigmoid
nn/SigmoidFocalLoss
nn/SmoothL1Loss
nn/Softmax
nn/Tanh
nn/SyncBatchNorm
nn/Upsample
nn/UpsamplingBilinear2d
nn/UpsamplingNearest2d
nn/ZeroPad2d
:hidden:
nn/Affine
nn/AvgPool2d
nn/BatchNorm1d
nn/BatchNorm2d
nn/BatchNorm3d
nn/BCEWithLogitsLoss
nn/ConstantPad1d
nn/ConstantPad2d
nn/ConstantPad3d
nn/Conv2d
nn/ConvTranspose2d
nn/CrossEntropyLoss
nn/CTCLoss
nn/DepthwiseConv2d
nn/DropBlock2d
nn/Dropout
nn/DropPath
nn/ELU
nn/Flatten
nn/GroupNorm
nn/GRU
nn/GumbelSoftmax
nn/L1Loss
nn/LeakyReLU
nn/Linear
nn/LocalResponseNorm
nn/LogSoftmax
nn/LSTM
nn/LSTMCell
nn/MaxPool2d
nn/Module
nn/MSELoss
nn/NLLLoss
nn/Parameter
nn/PReLU
nn/ReflectionPad1d
nn/ReflectionPad2d
nn/ReflectionPad3d
nn/ReLU
nn/ReLU6
nn/ReplicationPad1d
nn/ReplicationPad2d
nn/ReplicationPad3d
nn/RNN
nn/SELU
nn/Sigmoid
nn/SigmoidFocalLoss
nn/SmoothL1Loss
nn/Softmax
nn/Tanh
nn/SyncBatchNorm
nn/Upsample
nn/UpsamplingBilinear2d
nn/UpsamplingNearest2d
nn/ZeroPad2d
.. raw:: html
......
Flatten
=======
.. autoclass:: dragon.vm.torch.nn.Flatten
__init__
--------
.. automethod:: dragon.vm.torch.nn.Flatten.__init__
.. _torch.flatten(...): ../flatten.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
randperm
========
.. autofunction:: dragon.vm.torch.randperm
.. raw:: html
<style>
h1:before {
content: "torch.";
color: #103d3e;
}
</style>
......@@ -255,40 +255,50 @@ DEFINE_REGISTRY(
/* Macros */
#define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname) \
template <> \
DRAGON_API T OperatorBase::Arg(const string& name, const T& default_value) { \
if (args_.count(name) == 0) { \
return default_value; \
} \
CHECK(args_[name]->has_##fieldname()); \
return static_cast<T>(args_[name]->fieldname()); \
#define INSTANTIATE_GET_SINGLE_ARGUMENT(T, fieldname, default) \
template <> \
DRAGON_API T OperatorBase::GetArgument( \
const string& name, const T& default_value) { \
if (args_.count(name) == 0) return default_value; \
CHECK(args_[name]->has_##fieldname()); \
return static_cast<T>(args_[name]->fieldname()); \
} \
template <> \
DRAGON_API T OperatorBase::GetArgument(const string& name) { \
return OperatorBase::GetArgument<T>(name, default); \
}
INSTANTIATE_GET_SINGLE_ARGUMENT(float, f)
INSTANTIATE_GET_SINGLE_ARGUMENT(double, f)
INSTANTIATE_GET_SINGLE_ARGUMENT(int, i)
INSTANTIATE_GET_SINGLE_ARGUMENT(bool, i)
INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i)
INSTANTIATE_GET_SINGLE_ARGUMENT(string, s)
INSTANTIATE_GET_SINGLE_ARGUMENT(float, f, 0.f)
INSTANTIATE_GET_SINGLE_ARGUMENT(double, f, 0.);
INSTANTIATE_GET_SINGLE_ARGUMENT(int, i, 0);
INSTANTIATE_GET_SINGLE_ARGUMENT(bool, i, false);
INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i, int64_t(0));
INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, "");
#undef INSTANTIATE_GET_SINGLE_ARGUMENT
#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname) \
template <> \
vector<T> DRAGON_API OperatorBase::Args<T>(const string& name) { \
if (args_.count(name) == 0) return vector<T>(); \
vector<T> values; \
for (const auto& v : args_[name]->fieldname()) \
values.push_back(static_cast<T>(v)); \
return values; \
#define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname) \
template <> \
vector<T> DRAGON_API OperatorBase::GetArgument<vector<T>>( \
const string& name, const vector<T>& default_value) { \
if (args_.count(name) == 0) return default_value; \
vector<T> values; \
for (const auto& v : args_[name]->fieldname()) { \
values.push_back(static_cast<T>(v)); \
} \
return values; \
} \
template <> \
vector<T> DRAGON_API OperatorBase::GetArgument<vector<T>>( \
const string& name) { \
return OperatorBase::GetArgument<vector<T>>(name, vector<T>()); \
}
INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats)
INSTANTIATE_GET_REPEATED_ARGUMENT(double, floats)
INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints)
INSTANTIATE_GET_REPEATED_ARGUMENT(bool, ints)
INSTANTIATE_GET_REPEATED_ARGUMENT(int64_t, ints)
INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings)
INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats);
INSTANTIATE_GET_REPEATED_ARGUMENT(double, floats);
INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints);
INSTANTIATE_GET_REPEATED_ARGUMENT(bool, ints);
INSTANTIATE_GET_REPEATED_ARGUMENT(int64_t, ints);
INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings);
#undef INSTANTIATE_GET_REPEATED_ARGUMENT
template class Operator<CPUContext>;
......
......@@ -74,13 +74,13 @@ class DRAGON_API OperatorBase {
return (int)outputs_.size();
}
/*! \brief Return the value of single argument */
/*! \brief Return the value of the argument */
template <typename T>
T Arg(const string& name, const T& default_value);
T GetArgument(const string& name);
/*! \brief Return the value of repeated argument */
/*! \brief Return the value of the argument with a default value */
template <typename T>
vector<T> Args(const string& name);
T GetArgument(const string& name, const T& default_value);
/*! \brief Return the message for supported value */
string MessageForUnsupported(
......@@ -199,7 +199,7 @@ class DRAGON_API Operator : public OperatorBase {
Operator(const OperatorDef& def, Workspace* ws)
: OperatorBase(def, ws),
ctx_(def.device_option()),
do_sync_(OperatorBase::Arg<bool>("do_sync", false)) {}
do_sync_(OperatorBase::GetArgument<bool>("do_sync", false)) {}
/*! \brief Prepare the content of inputs */
virtual void Prepare();
......@@ -279,19 +279,20 @@ OperatorBase* NewOperator(const OperatorDef&, Workspace*);
/* Dispatchers */
#define XIsType(X, type) X.template IsType<type>()
template <typename... Types>
struct TensorTypes {};
using IntegralTensorTypes = TensorTypes<bool, int8_t, uint8_t, int, int64_t>;
using IntegralTensorTypes = TensorTypes<int8_t, uint8_t, int, int64_t>;
using FloatingTensorTypes = TensorTypes<float16, float, double>;
using MathTensorTypes =
using NumericalTensorTypes =
TensorTypes<int8_t, uint8_t, int, int64_t, float16, float, double>;
using AllTensorTypes =
using BooleanIntegralTensorTypes =
TensorTypes<bool, int8_t, uint8_t, int, int64_t>;
using FullTensorTypes =
TensorTypes<bool, int8_t, uint8_t, int, int64_t, float16, float, double>;
template <typename Sizes, typename... Args>
......@@ -382,30 +383,33 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType);
/* Arguments */
#define OpArg OperatorBase::Arg
#define OpArgs OperatorBase::Args
#define OP_SINGLE_ARG(type, name, default) \
OperatorBase::GetArgument<type>(name, (default))
#define OP_REPEATED_ARG(type, name) \
OperatorBase::GetArgument<vector<type>>(name)
#define DECLARE_ARG_WITH_DESC(type, arg) \
type arg##_; \
string arg##_desc_; \
#define DECLARE_OP_SINGLE_ARG_WITH_DESC(type, arg) \
type arg##_; \
string arg##_desc_; \
type arg()
#define DECLARE_ARGS_WITH_DESC(type, arg) \
string arg##_desc_; \
vector<type> arg##_; \
vector<string> arg##_descs_; \
#define DECLARE_OP_REPEATED_ARG_WITH_DESC(type, arg) \
string arg##_desc_; \
vector<type> arg##_; \
vector<string> arg##_descs_; \
type arg(int i, int* num = nullptr)
#define GET_ARG_WITH_DESC(type, arg, default_value) \
arg##_ = OpArg<type>(#arg, default_value); \
arg##_desc_ = OpArg<string>(string(#arg) + "_desc", "")
#define INIT_OP_SINGLE_ARG_WITH_DESC(type, arg, default_value) \
arg##_ = OP_SINGLE_ARG(type, #arg, default_value); \
arg##_desc_ = OP_SINGLE_ARG(string, string(#arg) + "_desc", "")
#define GET_ARGS_WITH_DESC(type, arg) \
arg##_ = OpArgs<type>(#arg); \
arg##_desc_ = OpArg<string>(string(#arg) + "_desc", ""); \
arg##_descs_ = OpArgs<string>(string(#arg) + "_descs")
#define INIT_OP_REPEATED_ARG_WITH_DESC(type, arg) \
arg##_ = OP_REPEATED_ARG(type, #arg); \
arg##_desc_ = OP_SINGLE_ARG(string, string(#arg) + "_desc", ""); \
arg##_descs_ = OP_REPEATED_ARG(string, string(#arg) + "_descs")
#define DEFINE_ARG_WITH_DESC(type, classname, arg) \
#define DEFINE_OP_SINGLE_ARG_WITH_DESC(type, classname, arg) \
template <class Context> \
type classname<Context>::arg() { \
if (arg##_desc_.empty()) return arg##_; \
......@@ -419,7 +423,7 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType);
return arg##_tensor->template data<type, CPUContext>()[0]; \
}
#define DEFINE_ARGS_WITH_DESC(type, classname, arg) \
#define DEFINE_OP_REPEATED_ARG_WITH_DESC(type, classname, arg) \
template <class Context> \
type classname<Context>::arg(int i, int* num) { \
const type* data; \
......@@ -451,13 +455,13 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType);
}
#define CANONICALIZE_AXIS_WITH_TENSOR_AND_OFFSET(tensor, offset) \
auto axis = OpArg<int64_t>("axis", INT_MAX); \
auto axis = OP_SINGLE_ARG(int64_t, "axis", INT_MAX); \
if (axis != INT_MAX) { \
axis = axis < 0 ? axis + tensor.ndim() + offset : axis; \
CHECK(axis >= 0 && axis < tensor.ndim() + offset) \
<< "\nExcepted the axis in [-" << tensor.ndim() + offset << ", " \
<< tensor.ndim() + offset << "), got " \
<< OpArg<int64_t>("axis", INT_MAX) << "."; \
<< OP_SINGLE_ARG(int64_t, "axis", INT_MAX) << "."; \
}
#define CANONICALIZE_AXIS_WITH_TENSOR(tensor) \
......@@ -509,24 +513,24 @@ DECLARE_REGISTRY(
#define REGISTER_CNML_OPERATOR(name, ...) \
REGISTER_CLASS(CNMLOperatorRegistry, name, __VA_ARGS__)
#define DEPLOY_CPU(name) \
#define DEPLOY_CPU_OPERATOR(name) \
REGISTER_CPU_OPERATOR(name, name##Op<CPUContext>); \
INSTANTIATE_OPERATOR(name, CPUContext);
#define DEPLOY_CUDA(name) \
#define DEPLOY_CUDA_OPERATOR(name) \
REGISTER_CUDA_OPERATOR(name, name##Op<CUDAContext>); \
INSTANTIATE_OPERATOR(name, CUDAContext);
#define DEPLOY_CPU_CUDA(name) \
#define DEPLOY_CPU_CUDA_OPERATOR(name) \
REGISTER_CPU_OPERATOR(name, name##Op<CPUContext>); \
REGISTER_CUDA_OPERATOR(name, name##Op<CPUContext>); \
INSTANTIATE_OPERATOR(name, CPUContext);
#define DEPLOY_CUDNN(name) \
#define DEPLOY_CUDNN_OPERATOR(name) \
REGISTER_CUDNN_OPERATOR(name, CuDNN##name##Op<CUDAContext>); \
INSTANTIATE_CUDNN_OPERATOR(name);
#define DEPLOY_CNML(name) \
#define DEPLOY_CNML_OPERATOR(name) \
REGISTER_CNML_OPERATOR(name, CnML##name##Op<CNMLContext>); \
INSTANTIATE_CNML_OPERATOR(name);
......
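To show how the renamed macros fit together, here is a hedged sketch of a hypothetical ExampleOp (not part of this commit) written against the macros above:
template <class Context>
class ExampleOp final : public Operator<Context> {
 public:
  ExampleOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws),
        alpha_(OP_SINGLE_ARG(float, "alpha", 1.f)),  // single argument with a default
        axes_(OP_REPEATED_ARG(int64_t, "axes")) {    // repeated argument
    // Argument that may alternatively be fed through a tensor descriptor.
    INIT_OP_SINGLE_ARG_WITH_DESC(float, ratio, 0.5f);
  }
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override;

 protected:
  float alpha_;
  vector<int64_t> axes_;
  DECLARE_OP_SINGLE_ARG_WITH_DESC(float, ratio);
};

template <class Context>
void ExampleOp<Context>::RunOnDevice() {
  // A real operator would dispatch on Input(0) here; left empty in this sketch.
}

DEFINE_OP_SINGLE_ARG_WITH_DESC(float, ExampleOp, ratio);

DEPLOY_CPU_OPERATOR(Example);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Example);
#endif
The structure mirrors the operator changes later in this diff (e.g. DropBlock2dOp), only with illustrative names.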
......@@ -15,7 +15,7 @@ void _DropBlock2dNCHW(
const int seed_h,
const int seed_w,
const int block_size,
const uint32_t* seed,
const uint32_t* r,
int* mask) {
const int HW = H * W;
const int CHW = C * HW;
......@@ -24,7 +24,7 @@ void _DropBlock2dNCHW(
std::array<int, 3> dims = {N, seed_h, seed_w};
int offset;
for (int i = 0; i < count; ++i) {
if (seed[i] > 0) {
if (r[i] > 0) {
offset = idx[0] * CHW + idx[1] * W + idx[2];
for (int c = 0; c < C; ++c) {
for (int bh = 0; bh < block_size; ++bh) {
......@@ -84,15 +84,15 @@ void DropBlock2d<CPUContext>(
const int block_size,
const float gamma,
const string& data_format,
uint32_t* seed,
uint32_t* r,
int* mask,
CPUContext* ctx) {
const int count = N * seed_h * seed_w;
math::RandomBernoulli(count, gamma, seed, ctx);
math::RandomBernoulli(count, gamma, r, ctx);
if (data_format == "NCHW") {
_DropBlock2dNCHW(N, C, H, W, seed_h, seed_w, block_size, seed, mask);
_DropBlock2dNCHW(N, C, H, W, seed_h, seed_w, block_size, r, mask);
} else if (data_format == "NHWC") {
_DropBlock2dNHWC(N, C, H, W, seed_h, seed_w, block_size, seed, mask);
_DropBlock2dNHWC(N, C, H, W, seed_h, seed_w, block_size, r, mask);
} else {
LOG(FATAL) << "Unknown DataFormat: " << data_format;
}
......
......@@ -19,10 +19,10 @@ __global__ void _DropBlock2dNCHW(
const int seed_w,
const int block_size,
const uint32_t thresh,
const uint32_t* seed,
const uint32_t* r,
int* mask) {
CUDA_1D_KERNEL_LOOP(idx, nthreads) {
if (seed[idx] < thresh) {
if (r[idx] < thresh) {
const int wstart = idx % seed_w;
const int hstart = (idx / seed_w) % seed_h;
const int n = idx / seed_w / seed_h;
......@@ -47,10 +47,10 @@ __global__ void _DropBlock2dNHWC(
const int seed_w,
const int block_size,
const uint32_t thresh,
const uint32_t* seed,
const uint32_t* r,
int* mask) {
CUDA_1D_KERNEL_LOOP(idx, nthreads) {
if (seed[idx] < thresh) {
if (r[idx] < thresh) {
const int wstart = idx % seed_w;
const int hstart = (idx / seed_w) % seed_h;
const int n = idx / seed_w / seed_h;
......@@ -81,11 +81,11 @@ void DropBlock2d<CUDAContext>(
const int block_size,
const float gamma,
const string& data_format,
uint32_t* seed,
uint32_t* r,
int* mask,
CUDAContext* ctx) {
auto nthreads = N * seed_h * seed_w;
math::RandomUniform(nthreads, 0.f, 1.f, seed, ctx);
math::Random(nthreads, r, ctx);
auto mask_thresh = (uint32_t)(UINT_MAX * gamma);
if (data_format == "NCHW") {
_DropBlock2dNCHW<<<
......@@ -93,14 +93,14 @@ void DropBlock2d<CUDAContext>(
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads, C, H, W, seed_h, seed_w, block_size, mask_thresh, seed, mask);
nthreads, C, H, W, seed_h, seed_w, block_size, mask_thresh, r, mask);
} else if (data_format == "NHWC") {
_DropBlock2dNHWC<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads, C, H, W, seed_h, seed_w, block_size, mask_thresh, seed, mask);
nthreads, C, H, W, seed_h, seed_w, block_size, mask_thresh, r, mask);
} else {
LOG(FATAL) << "Unknown DataFormat: " << data_format;
}
......
......@@ -82,7 +82,7 @@ void _Dropout<float16>(
const T* x, \
uint8_t* mask, \
T* y, \
uint32_t* scratch, \
uint32_t* r, \
CPUContext* ctx) { \
_Dropout(count, cast::to<T>(prob), cast::to<T>(scale), x, mask, y, ctx); \
}
......
......@@ -97,15 +97,15 @@ void Dropout<float16, CUDAContext>(
const float16* x,
uint8_t* mask,
float16* y,
uint32_t* scratch,
uint32_t* r,
CUDAContext* ctx) {
math::RandomUniform(count, 0.f, 1.f, scratch, ctx);
math::Random(count, r, ctx);
_Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
static_cast<uint32_t>(UINT_MAX * prob),
cast::to<half>(scale),
reinterpret_cast<const half*>(x),
scratch,
r,
mask,
reinterpret_cast<half*>(y));
}
......@@ -130,12 +130,12 @@ void Dropout<float16, CUDAContext>(
const T* x, \
uint8_t* mask, \
T* y, \
uint32_t* scratch, \
uint32_t* r, \
CUDAContext* ctx) { \
math::RandomUniform(count, 0.f, 1.f, scratch, ctx); \
math::Random(count, r, ctx); \
auto threshold = static_cast<uint32_t>(UINT_MAX * prob); \
_Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, threshold, cast::to<T>(scale), x, scratch, mask, y); \
count, threshold, cast::to<T>(scale), x, r, mask, y); \
}
DEFINE_KERNEL_LAUNCHER(float);
......
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
template <typename T>
void _SwapByKey(const int count, const uint32_t* r, T* y) {
for (int i = 0; i < count; ++i) {
std::swap(y[i], y[i + (r[i] % (count - i))]);
}
}
} // namespace
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Permutation<T, CPUContext>( \
const int count, T* y, uint32_t* r, CPUContext* ctx) { \
math::Random(count, r, ctx); \
kernel::Range(count, 0.f, 1.f, y, ctx); \
_SwapByKey(count, r, y); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/device/common_thrust.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
__global__ void _Sequence(const int nthreads, half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __float2half(float(i));
}
}
} // namespace
template <>
void Permutation<float16, CUDAContext>(
const int count,
float16* y,
uint32_t* r,
CUDAContext* ctx) {
math::Random(count, r, ctx);
auto values = thrust::device_ptr<half>(reinterpret_cast<half*>(y));
auto keys = thrust::device_ptr<uint32_t>(r);
auto policy = thrust::cuda::par.on(ctx->cuda_stream());
_Sequence<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, reinterpret_cast<half*>(y));
thrust::sort_by_key(policy, keys, keys + count, values);
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Permutation<T, CUDAContext>( \
const int count, T* y, uint32_t* r, CUDAContext* ctx) { \
math::Random(count, r, ctx); \
auto values = thrust::device_ptr<T>(y); \
auto keys = thrust::device_ptr<uint32_t>(r); \
auto policy = thrust::cuda::par.on(ctx->cuda_stream()); \
thrust::sequence(policy, values, values + count); \
thrust::sort_by_key(policy, keys, keys + count, values); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // USE_CUDA
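A minimal usage sketch for the Permutation kernel added above, assuming the caller pre-allocates the output and a scratch buffer of count random keys; the header paths and the default-constructed CPUContext are assumptions, not taken from this diff:
#include <vector>
#include "dragon/core/context.h"      // assumed header for CPUContext
#include "dragon/utils/op_kernels.h"

int main() {
  dragon::CPUContext ctx;             // assumed to be default-constructible
  const int count = 8;
  std::vector<int64_t> y(count);      // receives a random permutation of [0, count)
  std::vector<uint32_t> r(count);     // scratch buffer for the random keys
  dragon::kernel::Permutation<int64_t, dragon::CPUContext>(
      count, y.data(), r.data(), &ctx);
  return 0;
}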
......@@ -9,12 +9,12 @@ namespace kernel {
namespace {
template <typename T>
void _Arange(const int count, const float start, const float step, T* y) {
void _Range(const int count, const float start, const float delta, T* y) {
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = static_cast<T>(start + i * step);
y[i] = static_cast<T>(start + i * delta);
}
}
......@@ -23,29 +23,29 @@ void _Arange(const int count, const float start, const float step, T* y) {
/* ------------------- Launcher Separator ------------------- */
template <>
void Arange<float16, CPUContext>(
void Range<float16, CPUContext>(
const int count,
const float start,
const float step,
const float delta,
float16* y,
CPUContext* ctx) {
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = cast::to<float16>(start + (float)i * step);
y[i] = cast::to<float16>(start + (float)i * delta);
}
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Arange<T, CPUContext>( \
void Range<T, CPUContext>( \
const int count, \
const float start, \
const float step, \
const float delta, \
T* y, \
CPUContext* ctx) { \
_Arange(count, start, step, y); \
_Range(count, start, delta, y); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
......@@ -54,7 +54,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -11,20 +11,20 @@ namespace {
template <typename T>
__global__ void
_Arange(const int nthreads, const float start, const float step, T* y) {
_Range(const int nthreads, const float start, const float delta, T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = start + (float)i * step;
y[i] = T(start + (float)i * delta);
}
}
template <>
__global__ void _Arange<half>(
__global__ void _Range<half>(
const int nthreads,
const float start,
const float step,
const float delta,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __float2half(start + (float)i * step);
y[i] = __float2half(start + (float)i * delta);
}
}
......@@ -33,26 +33,26 @@ __global__ void _Arange<half>(
/* ------------------- Launcher Separator ------------------- */
template <>
void Arange<float16, CUDAContext>(
void Range<float16, CUDAContext>(
const int count,
const float start,
const float step,
const float delta,
float16* y,
CUDAContext* ctx) {
_Arange<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, start, step, reinterpret_cast<half*>(y));
_Range<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, start, delta, reinterpret_cast<half*>(y));
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Arange<T, CUDAContext>( \
const int count, \
const float start, \
const float step, \
T* y, \
CUDAContext* ctx) { \
_Arange<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, start, step, y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Range<T, CUDAContext>( \
const int count, \
const float start, \
const float delta, \
T* y, \
CUDAContext* ctx) { \
_Range<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, start, delta, y); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
......@@ -61,7 +61,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -104,8 +104,9 @@ class NumpyFeeder : public TensorFeederBase {
int ndim = PyArray_NDIM(array);
vec64_t dims(ndim);
auto* npy_dims = PyArray_DIMS(array);
for (int i = 0; i < ndim; i++)
for (int i = 0; i < ndim; i++) {
dims[i] = npy_dims[i];
}
tensor->Reshape(dims);
if (option.device_type() == PROTO_CUDA) {
#ifdef USE_CUDA
......
......@@ -16,9 +16,9 @@ PythonPluginInferOp<Context>::PythonPluginInferOp(
const OperatorDef& def,
Workspace* ws)
: Operator<Context>(def, ws),
module_name_(OpArg<string>("module_name", "")),
class_name_(OpArg<string>("class_name", "")),
kwargs_str_((OpArg<string>("kwargs_str", ""))) {
module_name_(OP_SINGLE_ARG(string, "module_name", "")),
class_name_(OP_SINGLE_ARG(string, "class_name", "")),
kwargs_str_(OP_SINGLE_ARG(string, "kwargs_str", "")) {
// Optimization for all python ops
this->do_sync_ = false;
......@@ -118,21 +118,21 @@ void PythonPluginGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(PythonPluginInfer);
DEPLOY_CPU_OPERATOR(PythonPluginInfer);
#ifdef USE_CUDA
DEPLOY_CUDA(PythonPluginInfer);
DEPLOY_CUDA_OPERATOR(PythonPluginInfer);
#endif
OPERATOR_SCHEMA(PythonPluginInfer);
DEPLOY_CPU(PythonPlugin);
DEPLOY_CPU_OPERATOR(PythonPlugin);
#ifdef USE_CUDA
DEPLOY_CUDA(PythonPlugin);
DEPLOY_CUDA_OPERATOR(PythonPlugin);
#endif
OPERATOR_SCHEMA(PythonPlugin);
DEPLOY_CPU(PythonPluginGradient);
DEPLOY_CPU_OPERATOR(PythonPluginGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(PythonPluginGradient);
DEPLOY_CUDA_OPERATOR(PythonPluginGradient);
#endif
OPERATOR_SCHEMA(PythonPluginGradient);
......
......@@ -108,9 +108,9 @@ void DropBlock2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(DropBlock2d);
DEPLOY_CPU_OPERATOR(DropBlock2d);
#ifdef USE_CUDA
DEPLOY_CUDA(DropBlock2d);
DEPLOY_CUDA_OPERATOR(DropBlock2d);
#endif
OPERATOR_SCHEMA(DropBlock2d)
......@@ -121,9 +121,9 @@ OPERATOR_SCHEMA(DropBlock2d)
/* X => Y */
.AllowInplace({{0, 0}});
DEPLOY_CPU(DropBlock2dGradient);
DEPLOY_CPU_OPERATOR(DropBlock2dGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(DropBlock2dGradient);
DEPLOY_CUDA_OPERATOR(DropBlock2dGradient);
#endif
OPERATOR_SCHEMA(DropBlock2dGradient)
......
......@@ -22,10 +22,10 @@ class DropBlock2dOp final : public Operator<Context> {
public:
DropBlock2dOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
block_size_(OpArg<int64_t>("block_size", 7)),
alpha_(OpArg<float>("alpha", 1.f)),
decrement_(OpArg<float>("decrement", 0.f)) {
GET_ARG_WITH_DESC(float, keep_prob, 0.9f);
block_size_(OP_SINGLE_ARG(int64_t, "block_size", 7)),
alpha_(OP_SINGLE_ARG(float, "alpha", 1.f)),
decrement_(OP_SINGLE_ARG(float, "decrement", 0.f)) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, keep_prob, 0.9f);
}
USE_OPERATOR_FUNCTIONS;
......@@ -37,7 +37,7 @@ class DropBlock2dOp final : public Operator<Context> {
protected:
int64_t block_size_;
float alpha_, decrement_, prob_ = 1.;
DECLARE_ARG_WITH_DESC(float, keep_prob);
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, keep_prob);
};
template <class Context>
......@@ -52,7 +52,7 @@ class DropBlock2dGradientOp final : public Operator<Context> {
void DoRunWithType();
};
DEFINE_ARG_WITH_DESC(float, DropBlock2dOp, keep_prob);
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropBlock2dOp, keep_prob);
} // namespace dragon
......
......@@ -72,14 +72,14 @@ void DropPathGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(DropPath);
DEPLOY_CPU_OPERATOR(DropPath);
#ifdef USE_CUDA
DEPLOY_CUDA(DropPath);
DEPLOY_CUDA_OPERATOR(DropPath);
#endif
DEPLOY_CPU(DropPathGradient);
DEPLOY_CPU_OPERATOR(DropPathGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(DropPathGradient);
DEPLOY_CUDA_OPERATOR(DropPathGradient);
#endif
OPERATOR_SCHEMA(DropPath)
......
......@@ -21,8 +21,9 @@ template <class Context>
class DropPathOp final : public Operator<Context> {
public:
DropPathOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), inc_(OpArg<float>("increment", 0.f)) {
GET_ARG_WITH_DESC(float, prob, 0.2f);
: Operator<Context>(def, ws),
inc_(OP_SINGLE_ARG(float, "increment", 0.f)) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, prob, 0.2f);
}
USE_OPERATOR_FUNCTIONS;
......@@ -33,7 +34,7 @@ class DropPathOp final : public Operator<Context> {
protected:
float inc_, drop_prob_ = 0.f;
DECLARE_ARG_WITH_DESC(float, prob);
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, prob);
};
template <class Context>
......@@ -48,7 +49,7 @@ class DropPathGradientOp final : public Operator<Context> {
void DoRunWithType();
};
DEFINE_ARG_WITH_DESC(float, DropPathOp, prob);
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropPathOp, prob);
} // namespace dragon
......
......@@ -56,14 +56,14 @@ void DropoutGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Dropout);
DEPLOY_CPU_OPERATOR(Dropout);
#ifdef USE_CUDA
DEPLOY_CUDA(Dropout);
DEPLOY_CUDA_OPERATOR(Dropout);
#endif
DEPLOY_CPU(DropoutGradient);
DEPLOY_CPU_OPERATOR(DropoutGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(DropoutGradient);
DEPLOY_CUDA_OPERATOR(DropoutGradient);
#endif
OPERATOR_SCHEMA(Dropout)
......
......@@ -22,7 +22,7 @@ class DropoutOp : public Operator<Context> {
public:
DropoutOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
GET_ARG_WITH_DESC(float, prob, 0.5f);
INIT_OP_SINGLE_ARG_WITH_DESC(float, prob, 0.5f);
}
USE_OPERATOR_FUNCTIONS;
......@@ -32,7 +32,7 @@ class DropoutOp : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARG_WITH_DESC(float, prob);
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, prob);
};
template <class Context>
......@@ -40,7 +40,7 @@ class DropoutGradientOp : public Operator<Context> {
public:
DropoutGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
GET_ARG_WITH_DESC(float, prob, 0.5f);
INIT_OP_SINGLE_ARG_WITH_DESC(float, prob, 0.5f);
}
USE_OPERATOR_FUNCTIONS;
......@@ -50,11 +50,11 @@ class DropoutGradientOp : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARG_WITH_DESC(float, prob);
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, prob);
};
DEFINE_ARG_WITH_DESC(float, DropoutOp, prob);
DEFINE_ARG_WITH_DESC(float, DropoutGradientOp, prob);
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropoutOp, prob);
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropoutGradientOp, prob);
#ifdef USE_CUDNN
......
......@@ -119,8 +119,8 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(Dropout);
DEPLOY_CUDNN(DropoutGradient);
DEPLOY_CUDNN_OPERATOR(Dropout);
DEPLOY_CUDNN_OPERATOR(DropoutGradient);
} // namespace dragon
......
......@@ -38,14 +38,14 @@ void EluGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Elu);
DEPLOY_CPU_OPERATOR(Elu);
#ifdef USE_CUDA
DEPLOY_CUDA(Elu);
DEPLOY_CUDA_OPERATOR(Elu);
#endif
DEPLOY_CPU(EluGradient);
DEPLOY_CPU_OPERATOR(EluGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(EluGradient);
DEPLOY_CUDA_OPERATOR(EluGradient);
#endif
OPERATOR_SCHEMA(Elu)
......
......@@ -21,7 +21,8 @@ template <class Context>
class EluOp : public Operator<Context> {
public:
EluOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), alpha_(OpArg<float>("alpha", 1.f)) {}
: Operator<Context>(def, ws),
alpha_(OP_SINGLE_ARG(float, "alpha", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
template <typename T>
......@@ -37,7 +38,8 @@ template <class Context>
class EluGradientOp : public Operator<Context> {
public:
EluGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), alpha_(OpArg<float>("alpha", 1.f)) {}
: Operator<Context>(def, ws),
alpha_(OP_SINGLE_ARG(float, "alpha", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -52,8 +52,8 @@ void CuDNNEluGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(Elu);
DEPLOY_CUDNN(EluGradient);
DEPLOY_CUDNN_OPERATOR(Elu);
DEPLOY_CUDNN_OPERATOR(EluGradient);
} // namespace dragon
......
......@@ -113,14 +113,14 @@ void PReluGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(-1));
}
DEPLOY_CPU(PRelu);
DEPLOY_CPU_OPERATOR(PRelu);
#ifdef USE_CUDA
DEPLOY_CUDA(PRelu);
DEPLOY_CUDA_OPERATOR(PRelu);
#endif
DEPLOY_CPU(PReluGradient);
DEPLOY_CPU_OPERATOR(PReluGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(PReluGradient);
DEPLOY_CUDA_OPERATOR(PReluGradient);
#endif
OPERATOR_SCHEMA(PRelu)
......
......@@ -22,7 +22,7 @@ class PReluOp final : public Operator<Context> {
public:
PReluOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
channel_shared_(OpArg<int64_t>("channel_shared", 0)) {}
channel_shared_(OP_SINGLE_ARG(int64_t, "channel_shared", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -57,14 +57,14 @@ void ReluGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Relu);
DEPLOY_CPU_OPERATOR(Relu);
#ifdef USE_CUDA
DEPLOY_CUDA(Relu);
DEPLOY_CUDA_OPERATOR(Relu);
#endif
DEPLOY_CPU(ReluGradient);
DEPLOY_CPU_OPERATOR(ReluGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ReluGradient);
DEPLOY_CUDA_OPERATOR(ReluGradient);
#endif
OPERATOR_SCHEMA(Relu)
......
......@@ -22,8 +22,8 @@ class ReluOp : public Operator<Context> {
public:
ReluOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
alpha_(OpArg<float>("alpha", 0.f)),
max_value_(OpArg<float>("max_value", 0.f)) {}
alpha_(OP_SINGLE_ARG(float, "alpha", 0.f)),
max_value_(OP_SINGLE_ARG(float, "max_value", 0.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -40,8 +40,8 @@ class ReluGradientOp : public Operator<Context> {
public:
ReluGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
alpha_(OpArg<float>("alpha", 0.f)),
max_value_(OpArg<float>("max_value", 0.f)) {}
alpha_(OP_SINGLE_ARG(float, "alpha", 0.f)),
max_value_(OP_SINGLE_ARG(float, "max_value", 0.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -58,8 +58,8 @@ void CuDNNReluGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(Relu);
DEPLOY_CUDNN(ReluGradient);
DEPLOY_CUDNN_OPERATOR(Relu);
DEPLOY_CUDNN_OPERATOR(ReluGradient);
} // namespace dragon
......
......@@ -40,14 +40,14 @@ void SeluGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Selu);
DEPLOY_CPU_OPERATOR(Selu);
#ifdef USE_CUDA
DEPLOY_CUDA(Selu);
DEPLOY_CUDA_OPERATOR(Selu);
#endif
DEPLOY_CPU(SeluGradient);
DEPLOY_CPU_OPERATOR(SeluGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SeluGradient);
DEPLOY_CUDA_OPERATOR(SeluGradient);
#endif
OPERATOR_SCHEMA(Selu)
......
......@@ -22,8 +22,8 @@ class SeluOp final : public Operator<Context> {
public:
SeluOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
alpha_(OpArg<float>("alpha", 1.67326f)),
gamma_(OpArg<float>("gamma", 1.0507f)) {}
alpha_(OP_SINGLE_ARG(float, "alpha", 1.67326f)),
gamma_(OP_SINGLE_ARG(float, "gamma", 1.0507f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -40,8 +40,8 @@ class SeluGradientOp final : public Operator<Context> {
public:
SeluGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
alpha_(OpArg<float>("alpha", 1.67326f)),
gamma_(OpArg<float>("gamma", 1.0507f)) {}
alpha_(OP_SINGLE_ARG(float, "alpha", 1.67326f)),
gamma_(OP_SINGLE_ARG(float, "gamma", 1.0507f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -36,14 +36,14 @@ void SigmoidGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Sigmoid);
DEPLOY_CPU_OPERATOR(Sigmoid);
#ifdef USE_CUDA
DEPLOY_CUDA(Sigmoid);
DEPLOY_CUDA_OPERATOR(Sigmoid);
#endif
DEPLOY_CPU(SigmoidGradient);
DEPLOY_CPU_OPERATOR(SigmoidGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SigmoidGradient);
DEPLOY_CUDA_OPERATOR(SigmoidGradient);
#endif
OPERATOR_SCHEMA(Sigmoid)
......
......@@ -50,8 +50,8 @@ void CuDNNSigmoidGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(Sigmoid);
DEPLOY_CUDNN(SigmoidGradient);
DEPLOY_CUDNN_OPERATOR(Sigmoid);
DEPLOY_CUDNN_OPERATOR(SigmoidGradient);
} // namespace dragon
......
......@@ -44,14 +44,14 @@ void SoftmaxGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Softmax);
DEPLOY_CPU_OPERATOR(Softmax);
#ifdef USE_CUDA
DEPLOY_CUDA(Softmax);
DEPLOY_CUDA_OPERATOR(Softmax);
#endif
DEPLOY_CPU(SoftmaxGradient);
DEPLOY_CPU_OPERATOR(SoftmaxGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SoftmaxGradient);
DEPLOY_CUDA_OPERATOR(SoftmaxGradient);
#endif
OPERATOR_SCHEMA(Softmax)
......
......@@ -54,8 +54,8 @@ void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(Softmax);
DEPLOY_CUDNN(SoftmaxGradient);
DEPLOY_CUDNN_OPERATOR(Softmax);
DEPLOY_CUDNN_OPERATOR(SoftmaxGradient);
} // namespace dragon
......
......@@ -36,14 +36,14 @@ void TanhGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Tanh);
DEPLOY_CPU_OPERATOR(Tanh);
#ifdef USE_CUDA
DEPLOY_CUDA(Tanh);
DEPLOY_CUDA_OPERATOR(Tanh);
#endif
DEPLOY_CPU(TanhGradient);
DEPLOY_CPU_OPERATOR(TanhGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(TanhGradient);
DEPLOY_CUDA_OPERATOR(TanhGradient);
#endif
OPERATOR_SCHEMA(Tanh)
......
......@@ -50,8 +50,8 @@ void CuDNNTanhGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(Tanh);
DEPLOY_CUDNN(TanhGradient);
DEPLOY_CUDNN_OPERATOR(Tanh);
DEPLOY_CUDNN_OPERATOR(TanhGradient);
} // namespace dragon
......
......@@ -22,7 +22,7 @@ class ArgMaxOp final : public Operator<Context> {
public:
ArgMaxOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
keep_dims_(OpArg<int64_t>("keep_dims", 0)) {}
keep_dims_(OP_SINGLE_ARG(int64_t, "keep_dims", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -39,7 +39,7 @@ class ArgMinOp final : public Operator<Context> {
public:
ArgMinOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
keep_dims_(OpArg<int64_t>("keep_dims", 0)) {}
keep_dims_(OP_SINGLE_ARG(int64_t, "keep_dims", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -49,12 +49,12 @@ void ArgMaxOp<Context>::DoRunWithType() {
template <class Context>
void ArgMaxOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ArgMax);
DEPLOY_CPU_OPERATOR(ArgMax);
#ifdef USE_CUDA
DEPLOY_CUDA(ArgMax);
DEPLOY_CUDA_OPERATOR(ArgMax);
#endif
OPERATOR_SCHEMA(ArgMax)
......
......@@ -49,12 +49,12 @@ void ArgMinOp<Context>::DoRunWithType() {
template <class Context>
void ArgMinOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ArgMin);
DEPLOY_CPU_OPERATOR(ArgMin);
#ifdef USE_CUDA
DEPLOY_CUDA(ArgMin);
DEPLOY_CUDA_OPERATOR(ArgMin);
#endif
OPERATOR_SCHEMA(ArgMin)
......
......@@ -39,21 +39,21 @@ namespace dragon {
LOG(FATAL) << MessageForUnsupported(dtype(), ELIGIBLE_TENSOR_TYPES);
#define DISPATCH_WITH_TENSOR(X) \
if (XIsType(X, bool)) { \
if (X.template IsType<bool>()) { \
DISPATCH_TYPE_TO_ALL(bool); \
} else if (XIsType(X, int8_t)) { \
} else if (X.template IsType<int8_t>()) { \
DISPATCH_TYPE_TO_ALL(int8_t); \
} else if (XIsType(X, uint8_t)) { \
} else if (X.template IsType<uint8_t>()) { \
DISPATCH_TYPE_TO_ALL(uint8_t); \
} else if (XIsType(X, int)) { \
} else if (X.template IsType<int>()) { \
DISPATCH_TYPE_TO_ALL(int); \
} else if (XIsType(X, int64_t)) { \
} else if (X.template IsType<int64_t>()) { \
DISPATCH_TYPE_TO_ALL(int64_t); \
} else if (XIsType(X, float16)) { \
} else if (X.template IsType<float16>()) { \
DISPATCH_TYPE_TO_ALL(float16); \
} else if (XIsType(X, float)) { \
} else if (X.template IsType<float>()) { \
DISPATCH_TYPE_TO_ALL(float); \
} else if (XIsType(X, double)) { \
} else if (X.template IsType<double>()) { \
DISPATCH_TYPE_TO_ALL(double); \
} else { \
LOG(FATAL) << MessageForUnsupported( \
......@@ -78,14 +78,14 @@ void CastGradientOp<Context>::RunOnDevice() {
DISPATCH_WITH_TENSOR(Input(-1));
}
DEPLOY_CPU(Cast);
DEPLOY_CPU_OPERATOR(Cast);
#ifdef USE_CUDA
DEPLOY_CUDA(Cast);
DEPLOY_CUDA_OPERATOR(Cast);
#endif
DEPLOY_CPU(CastGradient);
DEPLOY_CPU_OPERATOR(CastGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(CastGradient);
DEPLOY_CUDA_OPERATOR(CastGradient);
#endif
OPERATOR_SCHEMA(Cast)
......
......@@ -59,12 +59,12 @@ void ChannelNormalizeOp<Context>::DoRunWithType() {
template <class Context>
void ChannelNormalizeOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ChannelNormalize);
DEPLOY_CPU_OPERATOR(ChannelNormalize);
#ifdef USE_CUDA
DEPLOY_CUDA(ChannelNormalize);
DEPLOY_CUDA_OPERATOR(ChannelNormalize);
#endif
OPERATOR_SCHEMA(ChannelNormalize)
......
......@@ -22,9 +22,9 @@ class ChannelNormalizeOp final : public Operator<Context> {
public:
ChannelNormalizeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, perm);
auto mean = OpArgs<float>("mean");
auto std = OpArgs<float>("std");
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, perm);
auto mean = OP_REPEATED_ARG(float, "mean");
auto std = OP_REPEATED_ARG(float, "std");
CHECK_EQ(mean.size(), std.size())
<< "\nSize of <mean> and <std> should be same.";
X_mean_.Reshape({(int64_t)mean.size()});
......@@ -47,10 +47,10 @@ class ChannelNormalizeOp final : public Operator<Context> {
protected:
Tensor X_mean_, X_std_;
DECLARE_ARGS_WITH_DESC(int64_t, perm);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, perm);
};
DEFINE_ARGS_WITH_DESC(int64_t, ChannelNormalizeOp, perm);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, ChannelNormalizeOp, perm);
} // namespace dragon
......
......@@ -25,7 +25,7 @@ void ChannelShuffleOp<Context>::DoRunWithType() {
template <class Context>
void ChannelShuffleOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -49,14 +49,14 @@ void ChannelShuffleGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ChannelShuffle);
DEPLOY_CPU_OPERATOR(ChannelShuffle);
#ifdef USE_CUDA
DEPLOY_CUDA(ChannelShuffle);
DEPLOY_CUDA_OPERATOR(ChannelShuffle);
#endif
DEPLOY_CPU(ChannelShuffleGradient);
DEPLOY_CPU_OPERATOR(ChannelShuffleGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ChannelShuffleGradient);
DEPLOY_CUDA_OPERATOR(ChannelShuffleGradient);
#endif
OPERATOR_SCHEMA(ChannelShuffle)
......
......@@ -21,7 +21,8 @@ template <class Context>
class ChannelShuffleOp final : public Operator<Context> {
public:
ChannelShuffleOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), group_(OpArg<int64_t>("group", 1)) {}
: Operator<Context>(def, ws),
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -37,7 +38,8 @@ template <class Context>
class ChannelShuffleGradientOp final : public Operator<Context> {
public:
ChannelShuffleGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), group_(OpArg<int64_t>("group", 1)) {}
: Operator<Context>(def, ws),
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -45,7 +45,7 @@ void ConcatOp<Context>::DoRunWithType() {
template <class Context>
void ConcatOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -77,14 +77,14 @@ void ConcatGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Concat);
DEPLOY_CPU_OPERATOR(Concat);
#ifdef USE_CUDA
DEPLOY_CUDA(Concat);
DEPLOY_CUDA_OPERATOR(Concat);
#endif
DEPLOY_CPU(ConcatGradient);
DEPLOY_CPU_OPERATOR(ConcatGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ConcatGradient);
DEPLOY_CUDA_OPERATOR(ConcatGradient);
#endif
OPERATOR_SCHEMA(Concat)
......
......@@ -17,23 +17,23 @@
namespace dragon {
#define DECLARE_CUM_OP(name) \
template <class Context> \
class name##Op final : public Operator<Context> { \
public: \
name##Op(const OperatorDef& def, Workspace* ws) \
: Operator<Context>(def, ws), \
exclusive_(OpArg<int64_t>("exclusive", 0)), \
reverse_(OpArg<int64_t>("reverse", 0)) {} \
USE_OPERATOR_FUNCTIONS; \
\
void RunOnDevice() override; \
\
template <typename T> \
void DoRunWithType(); \
\
protected: \
int64_t exclusive_, reverse_; \
#define DECLARE_CUM_OP(name) \
template <class Context> \
class name##Op final : public Operator<Context> { \
public: \
name##Op(const OperatorDef& def, Workspace* ws) \
: Operator<Context>(def, ws), \
exclusive_(OP_SINGLE_ARG(int64_t, "exclusive", 0)), \
reverse_(OP_SINGLE_ARG(int64_t, "reverse", 0)) {} \
USE_OPERATOR_FUNCTIONS; \
\
void RunOnDevice() override; \
\
template <typename T> \
void DoRunWithType(); \
\
protected: \
int64_t exclusive_, reverse_; \
};
DECLARE_CUM_OP(CumSum);
......
......@@ -23,7 +23,7 @@ void CumSumOp<Context>::DoRunWithType() {
template <class Context>
void CumSumOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -48,14 +48,14 @@ void CumSumGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(CumSum);
DEPLOY_CPU_OPERATOR(CumSum);
#ifdef USE_CUDA
DEPLOY_CUDA(CumSum);
DEPLOY_CUDA_OPERATOR(CumSum);
#endif
DEPLOY_CPU(CumSumGradient);
DEPLOY_CPU_OPERATOR(CumSumGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(CumSumGradient);
DEPLOY_CUDA_OPERATOR(CumSumGradient);
#endif
OPERATOR_SCHEMA(CumSum)
......
......@@ -29,14 +29,14 @@ void ExpandDimsOp<Context>::RunOnDevice() {
Y->Reshape(out_shape)->CopyFrom(X, ctx());
}
DEPLOY_CPU(ExpandDims);
DEPLOY_CPU_OPERATOR(ExpandDims);
#ifdef USE_CUDA
DEPLOY_CUDA(ExpandDims);
DEPLOY_CUDA_OPERATOR(ExpandDims);
#endif
DEPLOY_CPU(ExpandDimsGradient);
DEPLOY_CPU_OPERATOR(ExpandDimsGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ExpandDimsGradient);
DEPLOY_CUDA_OPERATOR(ExpandDimsGradient);
#endif
OPERATOR_SCHEMA(ExpandDims)
......
......@@ -36,7 +36,7 @@ void ExpandOp<Context>::DoRunWithType() {
template <class Context>
void ExpandOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -71,14 +71,14 @@ void ExpandGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Expand);
DEPLOY_CPU_OPERATOR(Expand);
#ifdef USE_CUDA
DEPLOY_CUDA(Expand);
DEPLOY_CUDA_OPERATOR(Expand);
#endif
DEPLOY_CPU(ExpandGradient);
DEPLOY_CPU_OPERATOR(ExpandGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ExpandGradient);
DEPLOY_CUDA_OPERATOR(ExpandGradient);
#endif
OPERATOR_SCHEMA(Expand)
......
......@@ -21,7 +21,7 @@ template <class Context>
class ExpandOp final : public Operator<Context> {
public:
ExpandOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, dims);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, dims);
}
USE_OPERATOR_FUNCTIONS;
......@@ -31,7 +31,7 @@ class ExpandOp final : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARGS_WITH_DESC(int64_t, dims);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, dims);
};
template <class Context>
......@@ -46,7 +46,7 @@ class ExpandGradientOp final : public Operator<Context> {
void DoRunWithType();
};
DEFINE_ARGS_WITH_DESC(int64_t, ExpandOp, dims);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, ExpandOp, dims);
} // namespace dragon
......
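Repeated arguments that may carry runtime descriptors follow a three-macro pattern, renamed here from GET/DECLARE/DEFINE_ARGS_WITH_DESC to INIT/DECLARE/DEFINE_OP_REPEATED_ARG_WITH_DESC. A sketch for a hypothetical ExampleOp with a repeated "sizes" argument (illustrative only; the element accessor form sizes(i) is assumed from the Range and Slice usages later in this diff):

template <class Context>
class ExampleOp final : public Operator<Context> {
 public:
  ExampleOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {
    INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, sizes);  // bind "sizes" (values or descriptors)
  }
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override;

 protected:
  DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, sizes);  // declares the sizes(...) accessor
};

DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, ExampleOp, sizes);  // out-of-class definition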
......@@ -40,14 +40,14 @@ void FlattenOp<Context>::RunOnDevice() {
Y->Reshape(out_shape)->CopyFrom(X, ctx());
}
DEPLOY_CPU(Flatten);
DEPLOY_CPU_OPERATOR(Flatten);
#ifdef USE_CUDA
DEPLOY_CUDA(Flatten);
DEPLOY_CUDA_OPERATOR(Flatten);
#endif
DEPLOY_CPU(FlattenGradient);
DEPLOY_CPU_OPERATOR(FlattenGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(FlattenGradient);
DEPLOY_CUDA_OPERATOR(FlattenGradient);
#endif
OPERATOR_SCHEMA(Flatten)
......
......@@ -7,7 +7,7 @@ namespace dragon {
#define CANONICALIZE_AXES_WITH_TENSOR(tensor) \
CANONICALIZE_AXIS_WITH_TENSOR(tensor); \
auto num_axes = OpArg<int64_t>("num_axes", 1); \
auto num_axes = OP_SINGLE_ARG(int64_t, "num_axes", 1); \
if (num_axes < 0) { \
num_axes = tensor.ndim() - axis; \
} else if (num_axes == 0) { \
......@@ -24,7 +24,8 @@ void IndexSelectOp<Context>::DoRunWithType() {
CANONICALIZE_AXES_WITH_TENSOR(X);
CHECK_GT(X_index.count(), 0) << "\nLength of indices must be > 0.";
CHECK(XIsType(X_index, int64_t)) << "\nType of index should be int64.";
CHECK(X_index.template IsType<int64_t>())
<< "\nType of index should be int64.";
vec64_t X_dims(X.dims());
vec64_t Y_dims(X_dims.begin(), X_dims.begin() + axis);
......@@ -48,7 +49,7 @@ void IndexSelectOp<Context>::DoRunWithType() {
template <class Context>
void IndexSelectOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -81,14 +82,14 @@ void IndexSelectGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(IndexSelect);
DEPLOY_CPU_OPERATOR(IndexSelect);
#ifdef USE_CUDA
DEPLOY_CUDA(IndexSelect);
DEPLOY_CUDA_OPERATOR(IndexSelect);
#endif
DEPLOY_CPU(IndexSelectGradient);
DEPLOY_CPU_OPERATOR(IndexSelectGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(IndexSelectGradient);
DEPLOY_CUDA_OPERATOR(IndexSelectGradient);
#endif
OPERATOR_SCHEMA(IndexSelect)
......
......@@ -90,50 +90,50 @@ DISPATCH_WITH_TENSOR_TYPES(RandomUniform, FloatingTensorTypes);
DISPATCH_WITH_TENSOR_TYPES(TruncatedNormal, FloatingTensorTypes);
DISPATCH_WITH_TENSOR_TYPES(GlorotNormal, FloatingTensorTypes);
DISPATCH_WITH_TENSOR_TYPES(GlorotUniform, FloatingTensorTypes);
DISPATCH_WITH_TENSOR_TYPES(Fill, AllTensorTypes);
DISPATCH_WITH_TENSOR_TYPES(Eye, AllTensorTypes);
DISPATCH_WITH_TENSOR_TYPES(Fill, FullTensorTypes);
DISPATCH_WITH_TENSOR_TYPES(Eye, FullTensorTypes);
#undef DISPATCH_WITH_TYPES
#undef DISPATCH_WITH_TENSOR_TYPES
DEPLOY_CPU(Fill);
DEPLOY_CPU_OPERATOR(Fill);
#ifdef USE_CUDA
DEPLOY_CUDA(Fill);
DEPLOY_CUDA_OPERATOR(Fill);
#endif
DEPLOY_CPU(Eye);
DEPLOY_CPU_OPERATOR(Eye);
#ifdef USE_CUDA
DEPLOY_CUDA(Eye);
DEPLOY_CUDA_OPERATOR(Eye);
#endif
DEPLOY_CPU(GivenTensorFill);
DEPLOY_CPU_OPERATOR(GivenTensorFill);
#ifdef USE_CUDA
DEPLOY_CUDA(GivenTensorFill);
DEPLOY_CUDA_OPERATOR(GivenTensorFill);
#endif
DEPLOY_CPU(RandomNormal);
DEPLOY_CPU_OPERATOR(RandomNormal);
#ifdef USE_CUDA
DEPLOY_CUDA(RandomNormal);
DEPLOY_CUDA_OPERATOR(RandomNormal);
#endif
DEPLOY_CPU(RandomUniform);
DEPLOY_CPU_OPERATOR(RandomUniform);
#ifdef USE_CUDA
DEPLOY_CUDA(RandomUniform);
DEPLOY_CUDA_OPERATOR(RandomUniform);
#endif
#ifdef USE_CUDA
DEPLOY_CPU_CUDA(TruncatedNormal);
DEPLOY_CPU_CUDA_OPERATOR(TruncatedNormal);
#else
DEPLOY_CPU(TruncatedNormal);
DEPLOY_CPU_OPERATOR(TruncatedNormal);
#endif
DEPLOY_CPU(GlorotNormal);
DEPLOY_CPU_OPERATOR(GlorotNormal);
#ifdef USE_CUDA
DEPLOY_CUDA(GlorotNormal);
DEPLOY_CUDA_OPERATOR(GlorotNormal);
#endif
DEPLOY_CPU(GlorotUniform);
DEPLOY_CPU_OPERATOR(GlorotUniform);
#ifdef USE_CUDA
DEPLOY_CUDA(GlorotUniform);
DEPLOY_CUDA_OPERATOR(GlorotUniform);
#endif
OPERATOR_SCHEMA(Fill).NumInputs(0, 1).NumOutputs(1);
......
......@@ -23,7 +23,7 @@ class InitializeOp : public Operator<Context> {
public:
InitializeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, dims);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, dims);
}
USE_OPERATOR_FUNCTIONS;
......@@ -31,14 +31,15 @@ class InitializeOp : public Operator<Context> {
protected:
FillerInfo filler_info_;
DECLARE_ARGS_WITH_DESC(int64_t, dims);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, dims);
};
template <class Context>
class FillOp final : public InitializeOp<Context> {
public:
FillOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws), value_(OpArg<float>("value", 0.f)) {}
: InitializeOp<Context>(def, ws),
value_(OP_SINGLE_ARG(float, "value", 0.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -51,10 +52,10 @@ class FillOp final : public InitializeOp<Context> {
};
template <class Context>
class ArangeOp final : public Operator<Context> {
class RangeOp final : public Operator<Context> {
public:
ArangeOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(float, slice);
RangeOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
INIT_OP_REPEATED_ARG_WITH_DESC(float, slice);
}
USE_OPERATOR_FUNCTIONS;
......@@ -64,14 +65,32 @@ class ArangeOp final : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARGS_WITH_DESC(float, slice);
DECLARE_OP_REPEATED_ARG_WITH_DESC(float, slice);
};
template <class Context>
class PermutationOp final : public Operator<Context> {
public:
PermutationOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
INIT_OP_SINGLE_ARG_WITH_DESC(int64_t, limit, 0);
}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
DECLARE_OP_SINGLE_ARG_WITH_DESC(int64_t, limit);
};
template <class Context>
class EyeOp final : public InitializeOp<Context> {
public:
EyeOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws), k_(OpArg<int64_t>("k", 0)) {}
: InitializeOp<Context>(def, ws), k_(OP_SINGLE_ARG(int64_t, "k", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -96,7 +115,7 @@ template <class Context>
class GivenTensorFillOp final : public Operator<Context> {
public:
GivenTensorFillOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), shape_(OpArgs<int64_t>("shape")) {}
: Operator<Context>(def, ws), shape_(OP_REPEATED_ARG(int64_t, "shape")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -108,7 +127,7 @@ class GivenTensorFillOp final : public Operator<Context> {
template <typename T>
void ExtractImpl(TypeIdentity<T>) {
auto raw_values = OpArgs<T>("values");
auto raw_values = OP_REPEATED_ARG(T, "values");
auto nelements = (int64_t)raw_values.size();
values_.Reshape({nelements});
memcpy(
......@@ -118,7 +137,7 @@ class GivenTensorFillOp final : public Operator<Context> {
}
void ExtractImpl(TypeIdentity<float16>) {
auto raw_values = OpArgs<float>("values");
auto raw_values = OP_REPEATED_ARG(float, "values");
auto nelements = (int64_t)raw_values.size();
values_.Reshape({nelements});
memcpy(
......@@ -140,8 +159,8 @@ class RandomNormalOp final : public InitializeOp<Context> {
public:
RandomNormalOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws) {
auto mu = OpArg<float>("mean", 0.f);
auto sigma = OpArg<float>("std", 1.f);
auto mu = OP_SINGLE_ARG(float, "mean", 0.f);
auto sigma = OP_SINGLE_ARG(float, "std", 1.f);
this->filler_info_.set_mean(mu);
this->filler_info_.set_std(sigma);
this->filler_info_.set_type("normal");
......@@ -159,8 +178,8 @@ class RandomUniformOp final : public InitializeOp<Context> {
public:
RandomUniformOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws) {
auto low = OpArg<float>("low", -1.f);
auto high = OpArg<float>("high", 1.f);
auto low = OP_SINGLE_ARG(float, "low", -1.f);
auto high = OP_SINGLE_ARG(float, "high", 1.f);
this->filler_info_.set_low(low);
this->filler_info_.set_high(high);
this->filler_info_.set_type("uniform");
......@@ -178,8 +197,8 @@ class TruncatedNormalOp final : public InitializeOp<Context> {
public:
TruncatedNormalOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws) {
auto mu = OpArg<float>("mean", 0.f);
auto sigma = OpArg<float>("std", 1.f);
auto mu = OP_SINGLE_ARG(float, "mean", 0.f);
auto sigma = OP_SINGLE_ARG(float, "std", 1.f);
this->filler_info_.set_mean(mu);
this->filler_info_.set_std(sigma);
this->filler_info_.set_low(mu - 2 * sigma);
......@@ -199,8 +218,8 @@ class GlorotNormalOp final : public InitializeOp<Context> {
public:
GlorotNormalOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws) {
auto scale = OpArg<float>("scale", 2.f);
auto mode = OpArg<string>("mode", "fan_in");
auto scale = OP_SINGLE_ARG(float, "scale", 2.f);
auto mode = OP_SINGLE_ARG(string, "mode", "fan_in");
this->filler_info_.set_type("glorot_normal");
if (mode == "fan_avg") {
this->filler_info_.set_variance_norm(FillerInfo_VarianceNorm_FAN_AVG);
......@@ -224,8 +243,8 @@ class GlorotUniformOp final : public InitializeOp<Context> {
public:
GlorotUniformOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws) {
auto scale = OpArg<float>("scale", 3.f);
auto mode = OpArg<string>("mode", "fan_in");
auto scale = OP_SINGLE_ARG(float, "scale", 3.f);
auto mode = OP_SINGLE_ARG(string, "mode", "fan_in");
this->filler_info_.set_type("glorot_uniform");
if (mode == "fan_avg") {
this->filler_info_.set_variance_norm(FillerInfo_VarianceNorm_FAN_AVG);
......@@ -244,8 +263,9 @@ class GlorotUniformOp final : public InitializeOp<Context> {
void DoRunWithType();
};
DEFINE_ARGS_WITH_DESC(int64_t, InitializeOp, dims);
DEFINE_ARGS_WITH_DESC(float, ArangeOp, slice);
DEFINE_OP_SINGLE_ARG_WITH_DESC(int64_t, PermutationOp, limit);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, InitializeOp, dims);
DEFINE_OP_REPEATED_ARG_WITH_DESC(float, RangeOp, slice);
} // namespace dragon
......
......@@ -11,7 +11,7 @@ void MaskedSelectOp<Context>::DoRunWithType() {
CHECK_EQ(X.count(), X_mask.count())
<< "\nSize of mask and input should be equal.";
CHECK(XIsType(X_mask, bool) || XIsType(X_mask, uint8_t))
CHECK(X_mask.template IsType<bool>() || X_mask.template IsType<uint8_t>())
<< "\nExcepted bool or uint8 mask.";
// Store for the gradient calculation
......@@ -52,7 +52,7 @@ void MaskedSelectOp<Context>::DoRunWithType() {
template <class Context>
void MaskedSelectOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -75,14 +75,14 @@ void MaskedSelectGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(MaskedSelect);
DEPLOY_CPU_OPERATOR(MaskedSelect);
#ifdef USE_CUDA
DEPLOY_CUDA(MaskedSelect);
DEPLOY_CUDA_OPERATOR(MaskedSelect);
#endif
DEPLOY_CPU(MaskedSelectGradient);
DEPLOY_CPU_OPERATOR(MaskedSelectGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(MaskedSelectGradient);
DEPLOY_CUDA_OPERATOR(MaskedSelectGradient);
#endif
OPERATOR_SCHEMA(MaskedSelect)
......
......@@ -64,9 +64,9 @@ void MultinomialOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU(Multinomial);
DEPLOY_CPU_OPERATOR(Multinomial);
#ifdef USE_CUDA
DEPLOY_CUDA(Multinomial);
DEPLOY_CUDA_OPERATOR(Multinomial);
#endif
OPERATOR_SCHEMA(Multinomial)
......
......@@ -22,9 +22,9 @@ class MultinomialOp final : public Operator<Context> {
public:
MultinomialOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
epsilon_(OpArg<double>("epsilon", 0.)),
normalize_(OpArg<int64_t>("normalize", 0)),
num_samples_(OpArg<int64_t>("num_samples", 1)) {}
epsilon_(OP_SINGLE_ARG(double, "epsilon", 0.)),
normalize_(OP_SINGLE_ARG(int64_t, "normalize", 0)),
num_samples_(OP_SINGLE_ARG(int64_t, "num_samples", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -55,12 +55,12 @@ void NonZeroOp<Context>::DoRunWithType() {
template <class Context>
void NonZeroOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(NonZero);
DEPLOY_CPU_OPERATOR(NonZero);
#ifdef USE_CUDA
DEPLOY_CUDA(NonZero);
DEPLOY_CUDA_OPERATOR(NonZero);
#endif
OPERATOR_SCHEMA(NonZero)
......
......@@ -34,9 +34,9 @@ void OneHotOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<int, int64_t, float>>::Call(this, Input(0));
}
DEPLOY_CPU(OneHot);
DEPLOY_CPU_OPERATOR(OneHot);
#ifdef USE_CUDA
DEPLOY_CUDA(OneHot);
DEPLOY_CUDA_OPERATOR(OneHot);
#endif
OPERATOR_SCHEMA(OneHot)
......
......@@ -22,9 +22,9 @@ class OneHotOp final : public Operator<Context> {
public:
OneHotOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
depth_(OpArg<int64_t>("depth", -1)),
on_value_(OpArg<int64_t>("on_value", 1)),
off_value_(OpArg<int64_t>("off_value", 0)) {}
depth_(OP_SINGLE_ARG(int64_t, "depth", -1)),
on_value_(OP_SINGLE_ARG(int64_t, "on_value", 1)),
off_value_(OP_SINGLE_ARG(int64_t, "off_value", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -80,7 +80,7 @@ void PadOp<Context>::DoRunWithType() {
template <class Context>
void PadOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -124,14 +124,14 @@ void PadGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Pad);
DEPLOY_CPU_OPERATOR(Pad);
#ifdef USE_CUDA
DEPLOY_CUDA(Pad);
DEPLOY_CUDA_OPERATOR(Pad);
#endif
DEPLOY_CPU(PadGradient);
DEPLOY_CPU_OPERATOR(PadGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(PadGradient);
DEPLOY_CUDA_OPERATOR(PadGradient);
#endif
OPERATOR_SCHEMA(Pad)
......
......@@ -22,9 +22,9 @@ class PadOp final : public Operator<Context> {
public:
PadOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
value_(OpArg<float>("value", 0.f)),
mode_(OpArg<string>("mode", "CONSTANT")) {
GET_ARGS_WITH_DESC(int64_t, pads);
value_(OP_SINGLE_ARG(float, "value", 0.f)),
mode_(OP_SINGLE_ARG(string, "mode", "CONSTANT")) {
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, pads);
}
USE_OPERATOR_FUNCTIONS;
......@@ -36,7 +36,7 @@ class PadOp final : public Operator<Context> {
protected:
float value_;
string mode_;
DECLARE_ARGS_WITH_DESC(int64_t, pads);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, pads);
};
template <class Context>
......@@ -44,9 +44,9 @@ class PadGradientOp final : public Operator<Context> {
public:
PadGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
pad_l_(OpArgs<int64_t>("pad_l")),
pad_r_(OpArgs<int64_t>("pad_r")),
mode_(OpArg<string>("mode", "CONSTANT")) {
pad_l_(OP_REPEATED_ARG(int64_t, "pad_l")),
pad_r_(OP_REPEATED_ARG(int64_t, "pad_r")),
mode_(OP_SINGLE_ARG(string, "mode", "CONSTANT")) {
if (pad_r_.empty()) {
pad_r_ = pad_l_;
} else {
......@@ -66,7 +66,7 @@ class PadGradientOp final : public Operator<Context> {
vec64_t pad_l_, pad_r_;
};
DEFINE_ARGS_WITH_DESC(int64_t, PadOp, pads);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, PadOp, pads);
} // namespace dragon
......
#include "dragon/core/workspace.h"
#include "dragon/operators/array/initialize_ops.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void PermutationOp<Context>::DoRunWithType() {
auto* Y = Output(0)->Reshape({limit()});
kernel::Permutation(
Y->count(),
Y->template mutable_data<T, Context>(),
ws()->template data<uint32_t, Context>({Y->count()})[0],
ctx());
}
template <class Context>
void PermutationOp<Context>::RunOnDevice() {
DispatchHelper<NumericalTensorTypes>::Call(this);
}
DEPLOY_CPU_OPERATOR(Permutation);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Permutation);
#endif
OPERATOR_SCHEMA(Permutation).NumInputs(0).NumOutputs(1);
NO_GRADIENT(Permutation);
} // namespace dragon
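kernel::Permutation itself is not part of this diff; the call above only shows that it receives the output buffer plus a workspace scratch block of Y->count() uint32 values, presumably random keys for shuffling. As a loose, standalone sketch of the semantics only (Fisher-Yates via std::shuffle, which need not match the real kernel):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// Standalone illustration of a length-`limit` permutation; not the dragon kernel.
std::vector<int64_t> MakePermutation(int64_t limit, uint32_t seed) {
  std::vector<int64_t> y(limit);
  std::iota(y.begin(), y.end(), int64_t(0));  // 0, 1, ..., limit - 1
  std::mt19937 rng(seed);
  std::shuffle(y.begin(), y.end(), rng);      // uniform random permutation
  return y;
}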
......@@ -6,46 +6,46 @@ namespace dragon {
template <class Context>
template <typename T>
void ArangeOp<Context>::DoRunWithType() {
void RangeOp<Context>::DoRunWithType() {
// Determine the slice arguments
int num_args;
float start = 0.f, stop, step;
float start = 0.f, limit, delta;
slice(0, &num_args);
if (num_args == 2) {
stop = slice(0), step = slice(1);
limit = slice(0), delta = slice(1);
} else if (num_args == 3) {
start = slice(0), stop = slice(1), step = slice(2);
start = slice(0), limit = slice(1), delta = slice(2);
} else {
LOG(FATAL) << "Unexcepted number of slice arguments: " << num_args;
}
// Determine the generating range
// Values are in a half-open interval: [start, limit)
auto count = (int64_t)std::ceil((stop - start) / step);
auto count = (int64_t)std::ceil((limit - start) / delta);
CHECK_GT(count, 0) << "\nInvalid generating range: "
<< "[" << start << ", " << stop << ") with step = " << step
<< ".";
<< "[" << start << ", " << limit
<< ") with delta = " << delta << ".";
kernel::Arange(
kernel::Range(
count,
start,
step,
delta,
Output(0)->Reshape({count})->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ArangeOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this);
void RangeOp<Context>::RunOnDevice() {
DispatchHelper<NumericalTensorTypes>::Call(this);
}
DEPLOY_CPU(Arange);
DEPLOY_CPU_OPERATOR(Range);
#ifdef USE_CUDA
DEPLOY_CUDA(Arange);
DEPLOY_CUDA_OPERATOR(Range);
#endif
OPERATOR_SCHEMA(Arange).NumInputs(0).NumOutputs(1);
OPERATOR_SCHEMA(Range).NumInputs(0).NumOutputs(1);
NO_GRADIENT(Arange);
NO_GRADIENT(Range);
} // namespace dragon
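The slice argument above packs either {limit, delta} (start defaults to 0) or {start, limit, delta}, and the element count is ceil((limit - start) / delta) over the half-open interval [start, limit). A standalone sanity check of that count formula (not part of the diff):

#include <cmath>
#include <cstdint>

int main() {
  // slice = {2, 11, 3} -> start = 2, limit = 11, delta = 3
  float start = 2.f, limit = 11.f, delta = 3.f;
  auto count = (int64_t)std::ceil((limit - start) / delta);  // ceil(9 / 3) = 3
  // The generated values would be {2, 5, 8}; 11 is excluded.
  return count == 3 ? 0 : 1;
}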
......@@ -47,12 +47,12 @@ void ReduceMaxOp<Context>::DoRunWithType() {
template <class Context>
void ReduceMaxOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ReduceMax);
DEPLOY_CPU_OPERATOR(ReduceMax);
#ifdef USE_CUDA
DEPLOY_CUDA(ReduceMax);
DEPLOY_CUDA_OPERATOR(ReduceMax);
#endif
OPERATOR_SCHEMA(ReduceMax)
......
......@@ -55,7 +55,7 @@ void ReduceMeanOp<Context>::DoRunWithType() {
template <class Context>
void ReduceMeanOp<Context>::RunOnDevice() {
STORE_INPUT_SPEC(0);
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -86,14 +86,14 @@ void ReduceMeanGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ReduceMean);
DEPLOY_CPU_OPERATOR(ReduceMean);
#ifdef USE_CUDA
DEPLOY_CUDA(ReduceMean);
DEPLOY_CUDA_OPERATOR(ReduceMean);
#endif
DEPLOY_CPU(ReduceMeanGradient);
DEPLOY_CPU_OPERATOR(ReduceMeanGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ReduceMeanGradient);
DEPLOY_CUDA_OPERATOR(ReduceMeanGradient);
#endif
OPERATOR_SCHEMA(ReduceMean)
......
......@@ -47,12 +47,12 @@ void ReduceMinOp<Context>::DoRunWithType() {
template <class Context>
void ReduceMinOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ReduceMin);
DEPLOY_CPU_OPERATOR(ReduceMin);
#ifdef USE_CUDA
DEPLOY_CUDA(ReduceMin);
DEPLOY_CUDA_OPERATOR(ReduceMin);
#endif
OPERATOR_SCHEMA(ReduceMin)
......
......@@ -17,24 +17,24 @@
namespace dragon {
#define DECLARE_REDUCE_OP(name) \
template <class Context> \
class name##Op final : public Operator<Context> { \
public: \
name##Op(const OperatorDef& def, Workspace* ws) \
: Operator<Context>(def, ws), \
axes_(OpArgs<int64_t>("axes")), \
keep_dims_(OpArg<int64_t>("keep_dims", 0)) {} \
USE_OPERATOR_FUNCTIONS; \
\
void RunOnDevice() override; \
\
template <typename T> \
void DoRunWithType(); \
\
protected: \
int64_t keep_dims_; \
vec64_t axes_; \
#define DECLARE_REDUCE_OP(name) \
template <class Context> \
class name##Op final : public Operator<Context> { \
public: \
name##Op(const OperatorDef& def, Workspace* ws) \
: Operator<Context>(def, ws), \
axes_(OP_REPEATED_ARG(int64_t, "axes")), \
keep_dims_(OP_SINGLE_ARG(int64_t, "keep_dims", 0)) {} \
USE_OPERATOR_FUNCTIONS; \
\
void RunOnDevice() override; \
\
template <typename T> \
void DoRunWithType(); \
\
protected: \
int64_t keep_dims_; \
vec64_t axes_; \
};
#define DECLARE_REDUCE_GRAD_OP(name) \
......
......@@ -54,7 +54,7 @@ void ReduceSumOp<Context>::DoRunWithType() {
template <class Context>
void ReduceSumOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -85,14 +85,14 @@ void ReduceSumGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ReduceSum);
DEPLOY_CPU_OPERATOR(ReduceSum);
#ifdef USE_CUDA
DEPLOY_CUDA(ReduceSum);
DEPLOY_CUDA_OPERATOR(ReduceSum);
#endif
DEPLOY_CPU(ReduceSumGradient);
DEPLOY_CPU_OPERATOR(ReduceSumGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ReduceSumGradient);
DEPLOY_CUDA_OPERATOR(ReduceSumGradient);
#endif
OPERATOR_SCHEMA(ReduceSum)
......
......@@ -43,7 +43,7 @@ void RepeatOp<Context>::DoRunWithType() {
template <class Context>
void RepeatOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -79,14 +79,14 @@ void RepeatGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Repeat);
DEPLOY_CPU_OPERATOR(Repeat);
#ifdef USE_CUDA
DEPLOY_CUDA(Repeat);
DEPLOY_CUDA_OPERATOR(Repeat);
#endif
DEPLOY_CPU(RepeatGradient);
DEPLOY_CPU_OPERATOR(RepeatGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(RepeatGradient);
DEPLOY_CUDA_OPERATOR(RepeatGradient);
#endif
OPERATOR_SCHEMA(Repeat)
......
......@@ -21,7 +21,7 @@ template <class Context>
class RepeatOp final : public Operator<Context> {
public:
RepeatOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
GET_ARG_WITH_DESC(int64_t, repeats, 1);
INIT_OP_SINGLE_ARG_WITH_DESC(int64_t, repeats, 1);
}
USE_OPERATOR_FUNCTIONS;
......@@ -31,7 +31,7 @@ class RepeatOp final : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARG_WITH_DESC(int64_t, repeats);
DECLARE_OP_SINGLE_ARG_WITH_DESC(int64_t, repeats);
};
template <class Context>
......@@ -39,7 +39,7 @@ class RepeatGradientOp final : public Operator<Context> {
public:
RepeatGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
GET_ARG_WITH_DESC(int64_t, repeats, 1);
INIT_OP_SINGLE_ARG_WITH_DESC(int64_t, repeats, 1);
}
USE_OPERATOR_FUNCTIONS;
......@@ -49,11 +49,11 @@ class RepeatGradientOp final : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARG_WITH_DESC(int64_t, repeats);
DECLARE_OP_SINGLE_ARG_WITH_DESC(int64_t, repeats);
};
DEFINE_ARG_WITH_DESC(int64_t, RepeatOp, repeats);
DEFINE_ARG_WITH_DESC(int64_t, RepeatGradientOp, repeats);
DEFINE_OP_SINGLE_ARG_WITH_DESC(int64_t, RepeatOp, repeats);
DEFINE_OP_SINGLE_ARG_WITH_DESC(int64_t, RepeatGradientOp, repeats);
} // namespace dragon
......
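Scalar arguments that may also be tensor-described use the single-arg counterpart of the same trio, with a default supplied at init time; the value is then read through a generated accessor (the repeats()/limit() form is assumed from the Repeat and Permutation usages in this diff). A sketch for a hypothetical DemoOp:

template <class Context>
class DemoOp final : public Operator<Context> {
 public:
  DemoOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
    INIT_OP_SINGLE_ARG_WITH_DESC(int64_t, times, 1);  // scalar value or descriptor, default 1
  }
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override;

 protected:
  DECLARE_OP_SINGLE_ARG_WITH_DESC(int64_t, times);  // declares the times() accessor
};

DEFINE_OP_SINGLE_ARG_WITH_DESC(int64_t, DemoOp, times);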
......@@ -53,14 +53,14 @@ void ReshapeOp<Context>::RunOnDevice() {
Y->Reshape(out_shape)->CopyFrom(X, ctx());
}
DEPLOY_CPU(Reshape);
DEPLOY_CPU_OPERATOR(Reshape);
#ifdef USE_CUDA
DEPLOY_CUDA(Reshape);
DEPLOY_CUDA_OPERATOR(Reshape);
#endif
DEPLOY_CPU(ReshapeGradient);
DEPLOY_CPU_OPERATOR(ReshapeGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ReshapeGradient);
DEPLOY_CUDA_OPERATOR(ReshapeGradient);
#endif
OPERATOR_SCHEMA(Reshape)
......
......@@ -35,14 +35,14 @@ class ReshapeOp final : public Operator<Context> {
public:
ReshapeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, dims);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, dims);
}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
protected:
DECLARE_ARGS_WITH_DESC(int64_t, dims);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, dims);
};
template <class Context>
......@@ -50,8 +50,8 @@ class FlattenOp final : public Operator<Context> {
public:
FlattenOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
num_axes_(OpArg<int64_t>("num_axes", -1)),
keep_axes_(OpArg<int64_t>("keep_axes", INT_MAX)) {}
num_axes_(OP_SINGLE_ARG(int64_t, "num_axes", -1)),
keep_axes_(OP_SINGLE_ARG(int64_t, "keep_axes", INT_MAX)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -64,7 +64,7 @@ template <class Context>
class ExpandDimsOp final : public Operator<Context> {
public:
ExpandDimsOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OpArgs<int64_t>("axes")) {}
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -77,7 +77,7 @@ template <class Context>
class SqueezeOp final : public Operator<Context> {
public:
SqueezeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OpArgs<int64_t>("axes")) {}
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -100,7 +100,7 @@ DEFINE_GRADIENT_OP(ExpandDims);
DEFINE_GRADIENT_OP(Squeeze);
#undef DEFINE_GRADIENT_OP
DEFINE_ARGS_WITH_DESC(int64_t, ReshapeOp, dims);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, ReshapeOp, dims);
} // namespace dragon
......
......@@ -7,9 +7,9 @@ void ShapeOp<Context>::RunOnDevice() {
Output(0)->template CopyFrom<int64_t>(Input(0).dims());
}
DEPLOY_CPU(Shape);
DEPLOY_CPU_OPERATOR(Shape);
#ifdef USE_CUDA
DEPLOY_CUDA(Shape);
DEPLOY_CUDA_OPERATOR(Shape);
#endif
OPERATOR_SCHEMA(Shape)
......
......@@ -66,7 +66,7 @@ void SliceOp<Context>::DoRunWithType() {
template <class Context>
void SliceOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -102,17 +102,17 @@ void SliceGradientOp<Context>::DoRunWithType() {
template <class Context>
void SliceGradientOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Slice);
DEPLOY_CPU_OPERATOR(Slice);
#ifdef USE_CUDA
DEPLOY_CUDA(Slice);
DEPLOY_CUDA_OPERATOR(Slice);
#endif
DEPLOY_CPU(SliceGradient);
DEPLOY_CPU_OPERATOR(SliceGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SliceGradient);
DEPLOY_CUDA_OPERATOR(SliceGradient);
#endif
OPERATOR_SCHEMA(Slice)
......
......@@ -21,8 +21,8 @@ template <class Context>
class SliceOp final : public Operator<Context> {
public:
SliceOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, starts);
GET_ARGS_WITH_DESC(int64_t, sizes);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, starts);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, sizes);
}
USE_OPERATOR_FUNCTIONS;
......@@ -32,8 +32,8 @@ class SliceOp final : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARGS_WITH_DESC(int64_t, starts);
DECLARE_ARGS_WITH_DESC(int64_t, sizes);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, starts);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, sizes);
};
template <class Context>
......@@ -48,8 +48,8 @@ class SliceGradientOp final : public Operator<Context> {
void DoRunWithType();
};
DEFINE_ARGS_WITH_DESC(int64_t, SliceOp, starts);
DEFINE_ARGS_WITH_DESC(int64_t, SliceOp, sizes);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, SliceOp, starts);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, SliceOp, sizes);
} // namespace dragon
......
......@@ -6,8 +6,8 @@
namespace dragon {
#define DETERMINE_RUNTIME_ARGS(tensor) \
auto size_splits = OpArgs<int64_t>("size_splits"); \
auto slice_points = OpArgs<int64_t>("slice_points"); \
auto size_splits = OP_REPEATED_ARG(int64_t, "size_splits"); \
auto slice_points = OP_REPEATED_ARG(int64_t, "slice_points"); \
if (!slice_points.empty()) { \
int64_t index = 0; \
size_splits = vec64_t(num_splits); \
......@@ -60,7 +60,7 @@ void SplitOp<Context>::DoRunWithType() {
template <class Context>
void SplitOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -109,14 +109,14 @@ void SplitGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, X);
}
DEPLOY_CPU(Split);
DEPLOY_CPU_OPERATOR(Split);
#ifdef USE_CUDA
DEPLOY_CUDA(Split);
DEPLOY_CUDA_OPERATOR(Split);
#endif
DEPLOY_CPU(SplitGradient);
DEPLOY_CPU_OPERATOR(SplitGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SplitGradient);
DEPLOY_CUDA_OPERATOR(SplitGradient);
#endif
OPERATOR_SCHEMA(Split)
......
......@@ -29,14 +29,14 @@ void SqueezeOp<Context>::RunOnDevice() {
Y->Reshape(out_shape)->CopyFrom(X, ctx());
}
DEPLOY_CPU(Squeeze);
DEPLOY_CPU_OPERATOR(Squeeze);
#ifdef USE_CUDA
DEPLOY_CUDA(Squeeze);
DEPLOY_CUDA_OPERATOR(Squeeze);
#endif
DEPLOY_CPU(SqueezeGradient);
DEPLOY_CPU_OPERATOR(SqueezeGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SqueezeGradient);
DEPLOY_CUDA_OPERATOR(SqueezeGradient);
#endif
OPERATOR_SCHEMA(Squeeze)
......
......@@ -43,7 +43,7 @@ void StackOp<Context>::DoRunWithType() {
template <class Context>
void StackOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -71,17 +71,17 @@ void StackGradientOp<Context>::DoRunWithType() {
template <class Context>
void StackGradientOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Stack);
DEPLOY_CPU_OPERATOR(Stack);
#ifdef USE_CUDA
DEPLOY_CUDA(Stack);
DEPLOY_CUDA_OPERATOR(Stack);
#endif
DEPLOY_CPU(StackGradient);
DEPLOY_CPU_OPERATOR(StackGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(StackGradient);
DEPLOY_CUDA_OPERATOR(StackGradient);
#endif
OPERATOR_SCHEMA(Stack)
......
......@@ -33,7 +33,7 @@ void TileOp<Context>::DoRunWithType() {
template <class Context>
void TileOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -101,14 +101,14 @@ void TileGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(Tile);
DEPLOY_CPU_OPERATOR(Tile);
#ifdef USE_CUDA
DEPLOY_CUDA(Tile);
DEPLOY_CUDA_OPERATOR(Tile);
#endif
DEPLOY_CPU(TileGradient);
DEPLOY_CPU_OPERATOR(TileGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(TileGradient);
DEPLOY_CUDA_OPERATOR(TileGradient);
#endif
OPERATOR_SCHEMA(Tile)
......
......@@ -21,7 +21,7 @@ template <class Context>
class TileOp final : public Operator<Context> {
public:
TileOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, repeats);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, repeats);
}
USE_OPERATOR_FUNCTIONS;
......@@ -31,7 +31,7 @@ class TileOp final : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARGS_WITH_DESC(int64_t, repeats);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, repeats);
};
template <class Context>
......@@ -39,7 +39,7 @@ class TileGradientOp final : public Operator<Context> {
public:
TileGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, repeats);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, repeats);
}
USE_OPERATOR_FUNCTIONS;
......@@ -51,11 +51,11 @@ class TileGradientOp final : public Operator<Context> {
protected:
Tensor *dest_, *src_, nav_;
int64_t axis_, repeat_;
DECLARE_ARGS_WITH_DESC(int64_t, repeats);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, repeats);
};
DEFINE_ARGS_WITH_DESC(int64_t, TileOp, repeats);
DEFINE_ARGS_WITH_DESC(int64_t, TileGradientOp, repeats);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, TileOp, repeats);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, TileGradientOp, repeats);
} // namespace dragon
......
......@@ -31,12 +31,12 @@ void TopKOp<Context>::DoRunWithType() {
template <class Context>
void TopKOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(TopK);
DEPLOY_CPU_OPERATOR(TopK);
#ifdef USE_CUDA
DEPLOY_CUDA(TopK);
DEPLOY_CUDA_OPERATOR(TopK);
#endif
OPERATOR_SCHEMA(TopK)
......
......@@ -22,8 +22,8 @@ class TopKOp final : public Operator<Context> {
public:
TopKOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
k_(OpArg<int64_t>("k", 1)),
largest_(OpArg<int64_t>("largest", 1)) {}
k_(OP_SINGLE_ARG(int64_t, "k", 1)),
largest_(OP_SINGLE_ARG(int64_t, "largest", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -39,7 +39,7 @@ void TransposeOp<Context>::DoRunWithType() {
template <class Context>
void TransposeOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -66,14 +66,14 @@ void TransposeGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Transpose);
DEPLOY_CPU_OPERATOR(Transpose);
#ifdef USE_CUDA
DEPLOY_CUDA(Transpose);
DEPLOY_CUDA_OPERATOR(Transpose);
#endif
DEPLOY_CPU(TransposeGradient);
DEPLOY_CPU_OPERATOR(TransposeGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(TransposeGradient);
DEPLOY_CUDA_OPERATOR(TransposeGradient);
#endif
OPERATOR_SCHEMA(Transpose)
......
......@@ -22,7 +22,7 @@ class TransposeOp final : public Operator<Context> {
public:
TransposeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, perm);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, perm);
}
USE_OPERATOR_FUNCTIONS;
......@@ -32,7 +32,7 @@ class TransposeOp final : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARGS_WITH_DESC(int64_t, perm);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, perm);
};
template <class Context>
......@@ -47,7 +47,7 @@ class TransposeGradientOp : public Operator<Context> {
void DoRunWithType();
};
DEFINE_ARGS_WITH_DESC(int64_t, TransposeOp, perm);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, TransposeOp, perm);
} // namespace dragon
......
......@@ -10,7 +10,7 @@ void WhereOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1);
auto &C = Input(2), *Y = Output(0);
CHECK(XIsType(C, bool) || XIsType(C, uint8_t))
CHECK(C.template IsType<bool>() || C.template IsType<uint8_t>())
<< "\nExcepted bool or uint8 condition tensor.";
vec64_t AB_dims, Y_dims;
......@@ -36,7 +36,7 @@ void WhereOp<Context>::DoRunWithType() {
template <class Context>
void WhereOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -45,7 +45,7 @@ void WhereGradientOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), &C = Input(2), &dY = Input(3);
auto *dA = Output(0), *dB = Output(1);
CHECK(XIsType(C, bool) || XIsType(C, uint8_t))
CHECK(C.template IsType<bool>() || C.template IsType<uint8_t>())
<< "\nExcepted bool or uint8 condition tensor.";
vec32_t A_broadcast_axes, B_broadcast_axes;
......@@ -155,14 +155,14 @@ void WhereGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Where);
DEPLOY_CPU_OPERATOR(Where);
#ifdef USE_CUDA
DEPLOY_CUDA(Where);
DEPLOY_CUDA_OPERATOR(Where);
#endif
DEPLOY_CPU(WhereGradient);
DEPLOY_CPU_OPERATOR(WhereGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(WhereGradient);
DEPLOY_CUDA_OPERATOR(WhereGradient);
#endif
OPERATOR_SCHEMA(Where)
......
......@@ -79,12 +79,12 @@ void AssignOp<Context>::DoRunWithType() {
template <class Context>
void AssignOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Assign);
DEPLOY_CPU_OPERATOR(Assign);
#ifdef USE_CUDA
DEPLOY_CUDA(Assign);
DEPLOY_CUDA_OPERATOR(Assign);
#endif
OPERATOR_SCHEMA(Assign)
......
......@@ -21,8 +21,8 @@ template <class Context>
class AssignOp final : public Operator<Context> {
public:
AssignOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) {
GET_ARGS_WITH_DESC(int64_t, starts);
GET_ARGS_WITH_DESC(int64_t, sizes);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, starts);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, sizes);
}
USE_OPERATOR_FUNCTIONS;
......@@ -32,8 +32,8 @@ class AssignOp final : public Operator<Context> {
void DoRunWithType();
protected:
DECLARE_ARGS_WITH_DESC(int64_t, starts);
DECLARE_ARGS_WITH_DESC(int64_t, sizes);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, starts);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, sizes);
};
template <class Context>
......@@ -48,8 +48,8 @@ class MaskedAssignOp final : public Operator<Context> {
void DoRunWithType();
};
DEFINE_ARGS_WITH_DESC(int64_t, AssignOp, starts);
DEFINE_ARGS_WITH_DESC(int64_t, AssignOp, sizes);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, AssignOp, starts);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, AssignOp, sizes);
} // namespace dragon
......
......@@ -16,12 +16,12 @@ void CopyOp<Context>::DoRunWithType() {
template <class Context>
void CopyOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Copy);
DEPLOY_CPU_OPERATOR(Copy);
#ifdef USE_CUDA
DEPLOY_CUDA(Copy);
DEPLOY_CUDA_OPERATOR(Copy);
#endif
OPERATOR_SCHEMA(Copy)
......
......@@ -10,7 +10,7 @@ template <typename T>
void MaskedAssignOp<Context>::DoRunWithType() {
auto &X = Input(0), &X_mask = Input(1), *Y = Output(0);
CHECK(XIsType(X_mask, bool) || XIsType(X_mask, uint8_t))
CHECK(X_mask.template IsType<bool>() || X_mask.template IsType<uint8_t>())
<< "\nExcepted bool or uint8 mask.";
vec64_t X_dims, Y_dims;
......@@ -37,12 +37,12 @@ void MaskedAssignOp<Context>::DoRunWithType() {
template <class Context>
void MaskedAssignOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(MaskedAssign);
DEPLOY_CPU_OPERATOR(MaskedAssign);
#ifdef USE_CUDA
DEPLOY_CUDA(MaskedAssign);
DEPLOY_CUDA_OPERATOR(MaskedAssign);
#endif
OPERATOR_SCHEMA(MaskedAssign)
......
......@@ -139,7 +139,7 @@ void CollectiveOp<Context>::RunOnDevice() {
#endif
for (int i = 0; i < InputSize(); i++) {
src_tensor_ = &Input(i);
DispatchHelper<MathTensorTypes>::Call(this, *src_tensor_);
DispatchHelper<NumericalTensorTypes>::Call(this, *src_tensor_);
}
#ifdef USE_NCCL
#if NCCL_VERSION_MIN(2, 2, 0)
......@@ -151,13 +151,13 @@ void CollectiveOp<Context>::RunOnDevice() {
src_tensor_ = nullptr;
for (int i = 0; i < InputSize(); i++) {
dest_tensor_ = &Input(i);
DispatchHelper<MathTensorTypes>::Call(this, *dest_tensor_);
DispatchHelper<NumericalTensorTypes>::Call(this, *dest_tensor_);
}
}
DEPLOY_CPU(Collective);
DEPLOY_CPU_OPERATOR(Collective);
#ifdef USE_CUDA
DEPLOY_CUDA(Collective);
DEPLOY_CUDA_OPERATOR(Collective);
#endif
OPERATOR_SCHEMA(Collective).AllowInplace([](int, int) -> bool { return true; });
......
......@@ -24,8 +24,8 @@ class CollectiveOp final : public CollectiveOpBase<Context> {
public:
CollectiveOp(const OperatorDef& def, Workspace* ws)
: CollectiveOpBase<Context>(def, ws),
communication_(OpArg<string>("communication", "")),
operation_(OpArg<string>("operation", "MEAN")) {}
communication_(OP_SINGLE_ARG(string, "communication", "")),
operation_(OP_SINGLE_ARG(string, "operation", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
USE_COLLECTIVE_FUNCTIONS;
......
......@@ -26,8 +26,8 @@ class CollectiveOpBase : public Operator<Context> {
public:
CollectiveOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
comm_((MPI_Comm)OpArg<int64_t>("comm", 0)),
group_((MPI_Group)OpArg<int64_t>("group", 0)) {
comm_((MPI_Comm)OP_SINGLE_ARG(int64_t, "comm", 0)),
group_((MPI_Group)OP_SINGLE_ARG(int64_t, "group", 0)) {
if ((int64_t)comm_ == 0) return;
// The given group should be created beforehand
CHECK((int64_t)group_ != 0) << "\nEncountered an invalid MPI group.";
......@@ -38,8 +38,8 @@ class CollectiveOpBase : public Operator<Context> {
// Translate the root into the group
MPI_Group world_group;
auto root = OpArg<int>("root", 0);
auto group_world_ranks = OpArgs<int64_t>("ranks");
auto root = OP_SINGLE_ARG(int, "root", 0);
auto group_world_ranks = OP_REPEATED_ARG(int64_t, "ranks");
auto group_world_root = (int)group_world_ranks[root];
group_str_ = Tensor::DimString(group_world_ranks);
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
......@@ -51,7 +51,7 @@ class CollectiveOpBase : public Operator<Context> {
// Check whether the NCCL backend should be enabled
// If not, we will fall back to the MPI backend
#ifdef USE_NCCL
enable_nccl_ = OpArg<string>("backend", "MPI") == "NCCL";
enable_nccl_ = OP_SINGLE_ARG(string, "backend", "MPI") == "NCCL";
enable_nccl_ &= (TypeMeta::Id<Context>() == TypeMeta::Id<CUDAContext>());
#else
enable_nccl_ = false;
......
......@@ -89,24 +89,24 @@ void StopGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(GradientGenerate);
DEPLOY_CPU_OPERATOR(GradientGenerate);
#ifdef USE_CUDA
DEPLOY_CUDA(GradientGenerate);
DEPLOY_CUDA_OPERATOR(GradientGenerate);
#endif
DEPLOY_CPU(GradientGather);
DEPLOY_CPU_OPERATOR(GradientGather);
#ifdef USE_CUDA
DEPLOY_CUDA(GradientGather);
DEPLOY_CUDA_OPERATOR(GradientGather);
#endif
DEPLOY_CPU(GradientAdd);
DEPLOY_CPU_OPERATOR(GradientAdd);
#ifdef USE_CUDA
DEPLOY_CUDA(GradientAdd);
DEPLOY_CUDA_OPERATOR(GradientAdd);
#endif
DEPLOY_CPU(StopGradient);
DEPLOY_CPU_OPERATOR(StopGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(StopGradient);
DEPLOY_CUDA_OPERATOR(StopGradient);
#endif
OPERATOR_SCHEMA(GradientGenerate)
......
......@@ -21,7 +21,8 @@ template <class Context>
class GradientGenerateOp final : public Operator<Context> {
public:
GradientGenerateOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), defaults_(OpArgs<float>("defaults")) {}
: Operator<Context>(def, ws),
defaults_(OP_REPEATED_ARG(float, "defaults")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -26,14 +26,14 @@ void CTCLossGradientOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
DEPLOY_CPU(CTCLoss);
DEPLOY_CPU_OPERATOR(CTCLoss);
#ifdef USE_CUDA
DEPLOY_CUDA(CTCLoss);
DEPLOY_CUDA_OPERATOR(CTCLoss);
#endif
DEPLOY_CPU(CTCLossGradient);
DEPLOY_CPU_OPERATOR(CTCLossGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(CTCLossGradient);
DEPLOY_CUDA_OPERATOR(CTCLossGradient);
#endif
OPERATOR_SCHEMA(CTCLoss)
......
......@@ -51,7 +51,7 @@ class CuDNNCTCLossOp final : public Operator<Context> {
public:
CuDNNCTCLossOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
padding_mask_(OpArg<int64_t>("padding_mask", -1)) {
padding_mask_(OP_SINGLE_ARG(int64_t, "padding_mask", -1)) {
CuDNNCreateTensorDesc(&prob_desc_);
CuDNNCreateTensorDesc(&grad_desc_);
ctc_algo_ = CUDNN_CTC_LOSS_ALGO_DETERMINISTIC;
......
......@@ -81,7 +81,7 @@ template <class Context>
void CuDNNCTCLossOp<Context>::RunOnDevice() {
Reshape();
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
CUDNN_CHECK(cudnnSetCTCLossDescriptor(ctc_desc_, CUDNN_DATA_FLOAT));
DoRunWithType<float>();
} else {
......@@ -90,7 +90,7 @@ void CuDNNCTCLossOp<Context>::RunOnDevice() {
}
}
DEPLOY_CUDNN(CTCLoss);
DEPLOY_CUDNN_OPERATOR(CTCLoss);
} // namespace dragon
......
#include "dragon/operators/loss/l1_loss_ops.h"
#include "dragon/core/workspace.h"
#include "dragon/operators/loss/l1_loss_ops.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -120,14 +120,14 @@ void L1LossGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(L1Loss);
DEPLOY_CPU_OPERATOR(L1Loss);
#ifdef USE_CUDA
DEPLOY_CUDA(L1Loss);
DEPLOY_CUDA_OPERATOR(L1Loss);
#endif
DEPLOY_CPU(L1LossGradient);
DEPLOY_CPU_OPERATOR(L1LossGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(L1LossGradient);
DEPLOY_CUDA_OPERATOR(L1LossGradient);
#endif
OPERATOR_SCHEMA(L1Loss)
......
......@@ -22,7 +22,7 @@ class L1LossOp final : public Operator<Context> {
public:
L1LossOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
reduction_(OpArg<string>("reduction", "MEAN")) {}
reduction_(OP_SINGLE_ARG(string, "reduction", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -39,8 +39,8 @@ class SmoothL1LossOp final : public Operator<Context> {
public:
SmoothL1LossOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
beta_(OpArg<float>("beta", 1.f)),
reduction_(OpArg<string>("reduction", "MEAN")) {}
beta_(OP_SINGLE_ARG(float, "beta", 1.f)),
reduction_(OP_SINGLE_ARG(string, "reduction", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -58,7 +58,7 @@ class L1LossGradientOp final : public Operator<Context> {
public:
L1LossGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
reduction_(OpArg<string>("reduction", "MEAN")) {}
reduction_(OP_SINGLE_ARG(string, "reduction", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -75,8 +75,8 @@ class SmoothL1LossGradientOp final : public Operator<Context> {
public:
SmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
beta_(OpArg<float>("beta", 1.f)),
reduction_(OpArg<string>("reduction", "MEAN")) {}
beta_(OP_SINGLE_ARG(float, "beta", 1.f)),
reduction_(OP_SINGLE_ARG(string, "reduction", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -119,14 +119,14 @@ void L2LossGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(L2Loss);
DEPLOY_CPU_OPERATOR(L2Loss);
#ifdef USE_CUDA
DEPLOY_CUDA(L2Loss);
DEPLOY_CUDA_OPERATOR(L2Loss);
#endif
DEPLOY_CPU(L2LossGradient);
DEPLOY_CPU_OPERATOR(L2LossGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(L2LossGradient);
DEPLOY_CUDA_OPERATOR(L2LossGradient);
#endif
OPERATOR_SCHEMA(L2Loss)
......
......@@ -22,7 +22,7 @@ class L2LossOp final : public Operator<Context> {
public:
L2LossOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
reduction_(OpArg<string>("reduction", "MEAN")) {}
reduction_(OP_SINGLE_ARG(string, "reduction", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -39,7 +39,7 @@ class L2LossGradientOp final : public Operator<Context> {
public:
L2LossGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
reduction_(OpArg<string>("reduction", "MEAN")) {}
reduction_(OP_SINGLE_ARG(string, "reduction", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -66,19 +66,19 @@ void NLLLossOp<Context>::DoRunWithType() {
template <class Context>
void NLLLossOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) {
if (Input(0).template IsType<float>()) {
if (Input(1).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<float, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(1).meta()), {"float32", "int64"});
}
} else if (XIsType(Input(0), double)) {
if (XIsType(Input(1), double)) {
} else if (Input(0).template IsType<double>()) {
if (Input(1).template IsType<double>()) {
DoRunWithType<double, double>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<double, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -136,19 +136,19 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
template <class Context>
void NLLLossGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) {
if (Input(0).template IsType<float>()) {
if (Input(1).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<float, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(1).meta()), {"float32", "int64"});
}
} else if (XIsType(Input(0), double)) {
if (XIsType(Input(1), double)) {
} else if (Input(0).template IsType<double>()) {
if (Input(1).template IsType<double>()) {
DoRunWithType<double, double>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<double, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -160,14 +160,14 @@ void NLLLossGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(NLLLoss);
DEPLOY_CPU_OPERATOR(NLLLoss);
#ifdef USE_CUDA
DEPLOY_CUDA(NLLLoss);
DEPLOY_CUDA_OPERATOR(NLLLoss);
#endif
DEPLOY_CPU(NLLLossGradient);
DEPLOY_CPU_OPERATOR(NLLLossGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(NLLLossGradient);
DEPLOY_CUDA_OPERATOR(NLLLossGradient);
#endif
OPERATOR_SCHEMA(NLLLoss)
......
......@@ -22,8 +22,8 @@ class NLLLossOp final : public Operator<Context> {
public:
NLLLossOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
ignore_index_(OpArg<int64_t>("ignore_index", -1)),
reduction_(OpArg<string>("reduction", "VALID")) {}
ignore_index_(OP_SINGLE_ARG(int64_t, "ignore_index", -1)),
reduction_(OP_SINGLE_ARG(string, "reduction", "VALID")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -41,8 +41,8 @@ class NLLLossGradientOp final : public Operator<Context> {
public:
NLLLossGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
ignore_index_(OpArg<int64_t>("ignore_index", -1)),
reduction_(OpArg<string>("reduction", "VALID")) {}
ignore_index_(OP_SINGLE_ARG(int64_t, "ignore_index", -1)),
reduction_(OP_SINGLE_ARG(string, "reduction", "VALID")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -98,14 +98,14 @@ void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU(SigmoidCrossEntropy);
DEPLOY_CPU_OPERATOR(SigmoidCrossEntropy);
#ifdef USE_CUDA
DEPLOY_CUDA(SigmoidCrossEntropy);
DEPLOY_CUDA_OPERATOR(SigmoidCrossEntropy);
#endif
DEPLOY_CPU(SigmoidCrossEntropyGradient);
DEPLOY_CPU_OPERATOR(SigmoidCrossEntropyGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SigmoidCrossEntropyGradient);
DEPLOY_CUDA_OPERATOR(SigmoidCrossEntropyGradient);
#endif
OPERATOR_SCHEMA(SigmoidCrossEntropy)
......
......@@ -66,19 +66,19 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
template <class Context>
void SigmoidFocalLossOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) {
if (Input(0).template IsType<float>()) {
if (Input(1).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<float, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(1).meta()), {"float32", "int64"});
}
} else if (XIsType(Input(0), double)) {
if (XIsType(Input(1), double)) {
} else if (Input(0).template IsType<double>()) {
if (Input(1).template IsType<double>()) {
DoRunWithType<double, double>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<double, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -136,19 +136,19 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
template <class Context>
void SigmoidFocalLossGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) {
if (Input(0).template IsType<float>()) {
if (Input(1).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<float, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(1).meta()), {"float32", "int64"});
}
} else if (XIsType(Input(0), double)) {
if (XIsType(Input(1), double)) {
} else if (Input(0).template IsType<double>()) {
if (Input(1).template IsType<double>()) {
DoRunWithType<double, double>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<double, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -160,14 +160,14 @@ void SigmoidFocalLossGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(SigmoidFocalLoss);
DEPLOY_CPU_OPERATOR(SigmoidFocalLoss);
#ifdef USE_CUDA
DEPLOY_CUDA(SigmoidFocalLoss);
DEPLOY_CUDA_OPERATOR(SigmoidFocalLoss);
#endif
DEPLOY_CPU(SigmoidFocalLossGradient);
DEPLOY_CPU_OPERATOR(SigmoidFocalLossGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SigmoidFocalLossGradient);
DEPLOY_CUDA_OPERATOR(SigmoidFocalLossGradient);
#endif
OPERATOR_SCHEMA(SigmoidFocalLoss)
......
......@@ -22,7 +22,7 @@ class SigmoidCrossEntropyOp final : public Operator<Context> {
public:
SigmoidCrossEntropyOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
reduction_(OpArg<string>("reduction", "VALID")) {}
reduction_(OP_SINGLE_ARG(string, "reduction", "VALID")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -39,11 +39,11 @@ class SigmoidFocalLossOp final : public Operator<Context> {
public:
SigmoidFocalLossOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
pos_alpha_(OpArg<float>("alpha", 0.25f)),
neg_alpha_(1.f - OpArg<float>("alpha", 0.25f)),
gamma_(OpArg<float>("gamma", 2.f)),
negative_index_(OpArg<int64_t>("negative_index", -1)),
reduction_(OpArg<string>("reduction", "VALID")) {}
pos_alpha_(OP_SINGLE_ARG(float, "alpha", 0.25f)),
neg_alpha_(1.f - OP_SINGLE_ARG(float, "alpha", 0.25f)),
gamma_(OP_SINGLE_ARG(float, "gamma", 2.f)),
negative_index_(OP_SINGLE_ARG(int64_t, "negative_index", -1)),
reduction_(OP_SINGLE_ARG(string, "reduction", "VALID")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -62,7 +62,7 @@ class SigmoidCrossEntropyGradientOp final : public Operator<Context> {
public:
SigmoidCrossEntropyGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
reduction_(OpArg<string>("reduction", "VALID")) {}
reduction_(OP_SINGLE_ARG(string, "reduction", "VALID")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -79,11 +79,11 @@ class SigmoidFocalLossGradientOp final : public Operator<Context> {
public:
SigmoidFocalLossGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
pos_alpha_(OpArg<float>("alpha", 0.25f)),
neg_alpha_(1.f - OpArg<float>("alpha", 0.25f)),
gamma_(OpArg<float>("gamma", 2.f)),
negative_index_(OpArg<int64_t>("negative_index", -1)),
reduction_(OpArg<string>("reduction", "VALID")) {}
pos_alpha_(OP_SINGLE_ARG(float, "alpha", 0.25f)),
neg_alpha_(1.f - OP_SINGLE_ARG(float, "alpha", 0.25f)),
gamma_(OP_SINGLE_ARG(float, "gamma", 2.f)),
negative_index_(OP_SINGLE_ARG(int64_t, "negative_index", -1)),
reduction_(OP_SINGLE_ARG(string, "reduction", "VALID")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
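These header hunks show the call-site side of the commit's argument-getter rename: single-value arguments move from OpArg<T>(name, default) to OP_SINGLE_ARG(T, name, default), which, per the commit summary, routes through the unified GetArgument getter. The macro definition is not shown in this hunk, so the following is only a hedged sketch of how OP_SINGLE_ARG could forward to a templated GetArgument with a default value; the toy argument maps and FocalLossLikeOp are stand-ins, not the real OperatorDef plumbing.

// Hypothetical sketch of OP_SINGLE_ARG forwarding to GetArgument.
#include <iostream>
#include <map>
#include <string>

class OperatorBase {
 public:
  template <typename T>
  T GetArgument(const std::string& name, const T& default_value) const;

  std::map<std::string, float> float_args;         // toy stand-ins for the
  std::map<std::string, std::string> string_args;  // arguments in OperatorDef
};

template <>
float OperatorBase::GetArgument(const std::string& name,
                                const float& default_value) const {
  auto it = float_args.find(name);
  return it == float_args.end() ? default_value : it->second;
}

template <>
std::string OperatorBase::GetArgument(const std::string& name,
                                      const std::string& default_value) const {
  auto it = string_args.find(name);
  return it == string_args.end() ? default_value : it->second;
}

// Same call shape as the constructors in the hunks above.
#define OP_SINGLE_ARG(type, name, value) GetArgument<type>(name, value)

struct FocalLossLikeOp : OperatorBase {
  float alpha_ = OP_SINGLE_ARG(float, "alpha", 0.25f);
  std::string reduction_ = OP_SINGLE_ARG(std::string, "reduction", "VALID");
};

int main() {
  FocalLossLikeOp op;  // no arguments set, so the defaults are returned
  std::cout << op.alpha_ << " " << op.reduction_ << "\n";  // 0.25 VALID
}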
......@@ -120,14 +120,14 @@ void SmoothL1LossGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(SmoothL1Loss);
DEPLOY_CPU_OPERATOR(SmoothL1Loss);
#ifdef USE_CUDA
DEPLOY_CUDA(SmoothL1Loss);
DEPLOY_CUDA_OPERATOR(SmoothL1Loss);
#endif
DEPLOY_CPU(SmoothL1LossGradient);
DEPLOY_CPU_OPERATOR(SmoothL1LossGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SmoothL1LossGradient);
DEPLOY_CUDA_OPERATOR(SmoothL1LossGradient);
#endif
OPERATOR_SCHEMA(SmoothL1Loss)
......
......@@ -108,14 +108,14 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU(SoftmaxCrossEntropy);
DEPLOY_CPU_OPERATOR(SoftmaxCrossEntropy);
#ifdef USE_CUDA
DEPLOY_CUDA(SoftmaxCrossEntropy);
DEPLOY_CUDA_OPERATOR(SoftmaxCrossEntropy);
#endif
DEPLOY_CPU(SoftmaxCrossEntropyGradient);
DEPLOY_CPU_OPERATOR(SoftmaxCrossEntropyGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SoftmaxCrossEntropyGradient);
DEPLOY_CUDA_OPERATOR(SoftmaxCrossEntropyGradient);
#endif
OPERATOR_SCHEMA(SoftmaxCrossEntropy)
......
......@@ -22,7 +22,7 @@ class SoftmaxCrossEntropyOp final : public Operator<Context> {
public:
SoftmaxCrossEntropyOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
reduction_(OpArg<string>("reduction", "MEAN")) {}
reduction_(OP_SINGLE_ARG(string, "reduction", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -39,8 +39,8 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
public:
SparseSoftmaxCrossEntropyOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
ignore_index_(OpArg<int64_t>("ignore_index", -1)),
reduction_(OpArg<string>("reduction", "VALID")) {}
ignore_index_(OP_SINGLE_ARG(int64_t, "ignore_index", -1)),
reduction_(OP_SINGLE_ARG(string, "reduction", "VALID")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -58,7 +58,7 @@ class SoftmaxCrossEntropyGradientOp final : public Operator<Context> {
public:
SoftmaxCrossEntropyGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
reduction_(OpArg<string>("reduction", "MEAN")) {}
reduction_(OP_SINGLE_ARG(string, "reduction", "MEAN")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -75,8 +75,8 @@ class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> {
public:
SparseSoftmaxCrossEntropyGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
ignore_index_(OpArg<int64_t>("ignore_index", -1)),
reduction_(OpArg<string>("reduction", "VALID")) {}
ignore_index_(OP_SINGLE_ARG(int64_t, "ignore_index", -1)),
reduction_(OP_SINGLE_ARG(string, "reduction", "VALID")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -76,19 +76,19 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) {
if (Input(0).template IsType<float>()) {
if (Input(1).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<float, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(1).meta()), {"float32", "int64"});
}
} else if (XIsType(Input(0), double)) {
if (XIsType(Input(1), double)) {
} else if (Input(0).template IsType<double>()) {
if (Input(1).template IsType<double>()) {
DoRunWithType<double, double>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<double, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -149,19 +149,19 @@ template <class Context>
void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) {
if (Input(0).template IsType<float>()) {
if (Input(1).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<float, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(1).meta()), {"float32", "int64"});
}
} else if (XIsType(Input(0), double)) {
if (XIsType(Input(1), double)) {
} else if (Input(0).template IsType<double>()) {
if (Input(1).template IsType<double>()) {
DoRunWithType<double, double>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<double, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -173,14 +173,14 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(SparseSoftmaxCrossEntropy);
DEPLOY_CPU_OPERATOR(SparseSoftmaxCrossEntropy);
#ifdef USE_CUDA
DEPLOY_CUDA(SparseSoftmaxCrossEntropy);
DEPLOY_CUDA_OPERATOR(SparseSoftmaxCrossEntropy);
#endif
DEPLOY_CPU(SparseSoftmaxCrossEntropyGradient);
DEPLOY_CPU_OPERATOR(SparseSoftmaxCrossEntropyGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SparseSoftmaxCrossEntropyGradient);
DEPLOY_CUDA_OPERATOR(SparseSoftmaxCrossEntropyGradient);
#endif
OPERATOR_SCHEMA(SparseSoftmaxCrossEntropy)
......
......@@ -29,9 +29,9 @@ void AbsGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(AbsGradient);
DEPLOY_CPU_OPERATOR(AbsGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(AbsGradient);
DEPLOY_CUDA_OPERATOR(AbsGradient);
#endif
OPERATOR_SCHEMA(AbsGradient)
......
......@@ -40,7 +40,7 @@ void AddOp<Context>::DoRunWithType() {
template <class Context>
void AddOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -92,14 +92,14 @@ void AddGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Add);
DEPLOY_CPU_OPERATOR(Add);
#ifdef USE_CUDA
DEPLOY_CUDA(Add);
DEPLOY_CUDA_OPERATOR(Add);
#endif
DEPLOY_CPU(AddGradient);
DEPLOY_CPU_OPERATOR(AddGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(AddGradient);
DEPLOY_CUDA_OPERATOR(AddGradient);
#endif
OPERATOR_SCHEMA(Add)
......
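Another recurring substitution starting with this hunk: the dispatch type collections are renamed, AllTensorTypes becoming FullTensorTypes and MathTensorTypes becoming NumericalTensorTypes. The collections and DispatchHelper themselves are assumed rather than quoted; the sketch below shows one way a DispatchHelper over a compile-time type list can route RunOnDevice to the matching DoRunWithType<T>, which is the shape of the calls in this diff.

// Minimal sketch of a DispatchHelper over a compile-time type list.
#include <iostream>
#include <stdexcept>
#include <typeindex>

struct Tensor {
  std::type_index meta{typeid(void)};
};

template <typename... Types>
struct TensorTypes {};

template <class TypeList>
struct DispatchHelper;

template <typename T, typename... Rest>
struct DispatchHelper<TensorTypes<T, Rest...>> {
  template <class Op>
  static void Call(Op* op, const Tensor& x) {
    if (x.meta == std::type_index(typeid(T))) {
      op->template DoRunWithType<T>();
    } else {
      DispatchHelper<TensorTypes<Rest...>>::Call(op, x);  // try the next type
    }
  }
};

template <>
struct DispatchHelper<TensorTypes<>> {
  template <class Op>
  static void Call(Op*, const Tensor&) {
    throw std::runtime_error("Unsupported tensor type.");
  }
};

// Stand-in for the renamed collection; the real one covers more types.
using NumericalTensorTypes = TensorTypes<int, float, double>;

struct AddOpSketch {
  Tensor input;

  template <typename T>
  void DoRunWithType() {
    std::cout << "running the typed Add implementation\n";
  }

  void RunOnDevice() {
    DispatchHelper<NumericalTensorTypes>::Call(this, input);
  }
};

int main() {
  AddOpSketch op;
  op.input.meta = std::type_index(typeid(float));
  op.RunOnDevice();  // routes to DoRunWithType<float>()
}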
......@@ -7,7 +7,7 @@ namespace dragon {
#define CANONICALIZE_AXES_WITH_TENSOR(tensor) \
CANONICALIZE_AXIS_WITH_TENSOR(tensor); \
auto num_axes = OpArg<int64_t>("num_axes", 1); \
auto num_axes = OP_SINGLE_ARG(int64_t, "num_axes", 1); \
if (num_axes < 0) { \
num_axes = tensor.ndim() - axis; \
} else if (num_axes == 0) { \
......@@ -50,7 +50,7 @@ void AffineOp<Context>::DoRunWithType() {
template <class Context>
void AffineOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -135,14 +135,14 @@ void AffineGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Affine);
DEPLOY_CPU_OPERATOR(Affine);
#ifdef USE_CUDA
DEPLOY_CUDA(Affine);
DEPLOY_CUDA_OPERATOR(Affine);
#endif
DEPLOY_CPU(AffineGradient);
DEPLOY_CPU_OPERATOR(AffineGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(AffineGradient);
DEPLOY_CUDA_OPERATOR(AffineGradient);
#endif
OPERATOR_SCHEMA(Affine)
......
......@@ -27,12 +27,12 @@ void AxpbyOp<Context>::DoRunWithType() {
template <class Context>
void AxpbyOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Axpby);
DEPLOY_CPU_OPERATOR(Axpby);
#ifdef USE_CUDA
DEPLOY_CUDA(Axpby);
DEPLOY_CUDA_OPERATOR(Axpby);
#endif
OPERATOR_SCHEMA(Axpby)
......
......@@ -41,7 +41,7 @@ void ClipOp<Context>::DoRunWithType() {
template <class Context>
void ClipOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -64,14 +64,14 @@ void ClipGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Clip);
DEPLOY_CPU_OPERATOR(Clip);
#ifdef USE_CUDA
DEPLOY_CUDA(Clip);
DEPLOY_CUDA_OPERATOR(Clip);
#endif
DEPLOY_CPU(ClipGradient);
DEPLOY_CPU_OPERATOR(ClipGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ClipGradient);
DEPLOY_CUDA_OPERATOR(ClipGradient);
#endif
OPERATOR_SCHEMA(Clip)
......
......@@ -22,8 +22,8 @@ class ClipOp : public Operator<Context> {
public:
ClipOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
low_(OpArg<float>("low", -FLT_MAX)),
high_(OpArg<float>("high", FLT_MAX)) {}
low_(OP_SINGLE_ARG(float, "low", -FLT_MAX)),
high_(OP_SINGLE_ARG(float, "high", FLT_MAX)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -21,9 +21,9 @@ void CosGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(CosGradient);
DEPLOY_CPU_OPERATOR(CosGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(CosGradient);
DEPLOY_CUDA_OPERATOR(CosGradient);
#endif
OPERATOR_SCHEMA(CosGradient)
......
......@@ -40,7 +40,7 @@ void DivOp<Context>::DoRunWithType() {
template <class Context>
void DivOp<Context>::RunOnDevice() {
DispatchHelper<MathTensorTypes>::Call(this, Input(0));
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -192,14 +192,14 @@ void DivGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(2));
}
DEPLOY_CPU(Div);
DEPLOY_CPU_OPERATOR(Div);
#ifdef USE_CUDA
DEPLOY_CUDA(Div);
DEPLOY_CUDA_OPERATOR(Div);
#endif
DEPLOY_CPU(DivGradient);
DEPLOY_CPU_OPERATOR(DivGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(DivGradient);
DEPLOY_CUDA_OPERATOR(DivGradient);
#endif
OPERATOR_SCHEMA(Div)
......
......@@ -191,14 +191,14 @@ void DotGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(2));
}
DEPLOY_CPU(Dot);
DEPLOY_CPU_OPERATOR(Dot);
#ifdef USE_CUDA
DEPLOY_CUDA(Dot);
DEPLOY_CUDA_OPERATOR(Dot);
#endif
DEPLOY_CPU(DotGradient);
DEPLOY_CPU_OPERATOR(DotGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(DotGradient);
DEPLOY_CUDA_OPERATOR(DotGradient);
#endif
OPERATOR_SCHEMA(Dot)
......
......@@ -19,21 +19,21 @@ DISPATCH_WITH_TENSOR_TYPES(Exp, FloatingTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Log, FloatingTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Sin, FloatingTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Cos, FloatingTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Invert, IntegralTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Square, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Sign, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Abs, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Invert, BooleanIntegralTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Square, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Sign, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Abs, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(IsInf, FloatingTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(IsNaN, FloatingTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Pow, FloatingTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Minimum, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Maximum, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Equal, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(NotEqual, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Less, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(LessEqual, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Greater, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(GreaterEqual, MathTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Minimum, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Maximum, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Equal, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(NotEqual, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Less, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(LessEqual, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Greater, NumericalTensorTypes, Input(0));
DISPATCH_WITH_TENSOR_TYPES(GreaterEqual, NumericalTensorTypes, Input(0));
#undef DISPATCH_WITH_TENSOR_TYPES
#define DEFINE_SIMPLE_UNARY_OP_IMPL(name, TOut) \
......@@ -120,56 +120,56 @@ DEFINE_SIMPLE_BINARY_OP_IMPL(Greater, bool);
DEFINE_SIMPLE_BINARY_OP_IMPL(GreaterEqual, bool);
#undef DEFINE_SIMPLE_BINARY_OP_IMPL
DEPLOY_CPU(Ceil);
DEPLOY_CPU(Floor);
DEPLOY_CPU(Round);
DEPLOY_CPU(Sqrt);
DEPLOY_CPU(Rsqrt);
DEPLOY_CPU(Exp);
DEPLOY_CPU(Log);
DEPLOY_CPU(Sin);
DEPLOY_CPU(Cos);
DEPLOY_CPU(Invert);
DEPLOY_CPU(Square);
DEPLOY_CPU(Sign);
DEPLOY_CPU(Abs);
DEPLOY_CPU(IsInf);
DEPLOY_CPU(IsNaN);
DEPLOY_CPU(Pow);
DEPLOY_CPU(Minimum);
DEPLOY_CPU(Maximum);
DEPLOY_CPU(Equal);
DEPLOY_CPU(NotEqual);
DEPLOY_CPU(Less);
DEPLOY_CPU(LessEqual);
DEPLOY_CPU(Greater);
DEPLOY_CPU(GreaterEqual);
DEPLOY_CPU_OPERATOR(Ceil);
DEPLOY_CPU_OPERATOR(Floor);
DEPLOY_CPU_OPERATOR(Round);
DEPLOY_CPU_OPERATOR(Sqrt);
DEPLOY_CPU_OPERATOR(Rsqrt);
DEPLOY_CPU_OPERATOR(Exp);
DEPLOY_CPU_OPERATOR(Log);
DEPLOY_CPU_OPERATOR(Sin);
DEPLOY_CPU_OPERATOR(Cos);
DEPLOY_CPU_OPERATOR(Invert);
DEPLOY_CPU_OPERATOR(Square);
DEPLOY_CPU_OPERATOR(Sign);
DEPLOY_CPU_OPERATOR(Abs);
DEPLOY_CPU_OPERATOR(IsInf);
DEPLOY_CPU_OPERATOR(IsNaN);
DEPLOY_CPU_OPERATOR(Pow);
DEPLOY_CPU_OPERATOR(Minimum);
DEPLOY_CPU_OPERATOR(Maximum);
DEPLOY_CPU_OPERATOR(Equal);
DEPLOY_CPU_OPERATOR(NotEqual);
DEPLOY_CPU_OPERATOR(Less);
DEPLOY_CPU_OPERATOR(LessEqual);
DEPLOY_CPU_OPERATOR(Greater);
DEPLOY_CPU_OPERATOR(GreaterEqual);
#ifdef USE_CUDA
DEPLOY_CUDA(Ceil);
DEPLOY_CUDA(Floor);
DEPLOY_CUDA(Round);
DEPLOY_CUDA(Sqrt);
DEPLOY_CUDA(Rsqrt);
DEPLOY_CUDA(Exp);
DEPLOY_CUDA(Log);
DEPLOY_CUDA(Sin);
DEPLOY_CUDA(Cos);
DEPLOY_CUDA(Invert);
DEPLOY_CUDA(Square);
DEPLOY_CUDA(Sign);
DEPLOY_CUDA(Abs);
DEPLOY_CUDA(IsInf);
DEPLOY_CUDA(IsNaN);
DEPLOY_CUDA(Pow);
DEPLOY_CUDA(Minimum);
DEPLOY_CUDA(Maximum);
DEPLOY_CUDA(Equal);
DEPLOY_CUDA(NotEqual);
DEPLOY_CUDA(Less);
DEPLOY_CUDA(LessEqual);
DEPLOY_CUDA(Greater);
DEPLOY_CUDA(GreaterEqual);
DEPLOY_CUDA_OPERATOR(Ceil);
DEPLOY_CUDA_OPERATOR(Floor);
DEPLOY_CUDA_OPERATOR(Round);
DEPLOY_CUDA_OPERATOR(Sqrt);
DEPLOY_CUDA_OPERATOR(Rsqrt);
DEPLOY_CUDA_OPERATOR(Exp);
DEPLOY_CUDA_OPERATOR(Log);
DEPLOY_CUDA_OPERATOR(Sin);
DEPLOY_CUDA_OPERATOR(Cos);
DEPLOY_CUDA_OPERATOR(Invert);
DEPLOY_CUDA_OPERATOR(Square);
DEPLOY_CUDA_OPERATOR(Sign);
DEPLOY_CUDA_OPERATOR(Abs);
DEPLOY_CUDA_OPERATOR(IsInf);
DEPLOY_CUDA_OPERATOR(IsNaN);
DEPLOY_CUDA_OPERATOR(Pow);
DEPLOY_CUDA_OPERATOR(Minimum);
DEPLOY_CUDA_OPERATOR(Maximum);
DEPLOY_CUDA_OPERATOR(Equal);
DEPLOY_CUDA_OPERATOR(NotEqual);
DEPLOY_CUDA_OPERATOR(Less);
DEPLOY_CUDA_OPERATOR(LessEqual);
DEPLOY_CUDA_OPERATOR(Greater);
DEPLOY_CUDA_OPERATOR(GreaterEqual);
#endif
OPERATOR_SCHEMA(Ceil).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
......
......@@ -36,8 +36,8 @@ class AxpbyOp final : public Operator<Context> {
public:
AxpbyOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
alpha_(OpArg<float>("alpha", 1.f)),
beta_(OpArg<float>("beta", 1.f)) {}
alpha_(OP_SINGLE_ARG(float, "alpha", 1.f)),
beta_(OP_SINGLE_ARG(float, "beta", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -20,9 +20,9 @@ void ExpGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(ExpGradient);
DEPLOY_CPU_OPERATOR(ExpGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ExpGradient);
DEPLOY_CUDA_OPERATOR(ExpGradient);
#endif
OPERATOR_SCHEMA(ExpGradient)
......
......@@ -171,14 +171,14 @@ void FullyConnectedGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(FullyConnected);
DEPLOY_CPU_OPERATOR(FullyConnected);
#ifdef USE_CUDA
DEPLOY_CUDA(FullyConnected);
DEPLOY_CUDA_OPERATOR(FullyConnected);
#endif
DEPLOY_CPU(FullyConnectedGradient);
DEPLOY_CPU_OPERATOR(FullyConnectedGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(FullyConnectedGradient);
DEPLOY_CUDA_OPERATOR(FullyConnectedGradient);
#endif
OPERATOR_SCHEMA(FullyConnected)
......
......@@ -22,8 +22,8 @@ class FullyConnectedOp final : public Operator<Context> {
public:
FullyConnectedOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
out_channels_(OpArg<int64_t>("out_channels", 0)),
transW_(OpArg<int64_t>("transW", 1)) {}
out_channels_(OP_SINGLE_ARG(int64_t, "out_channels", 0)),
transW_(OP_SINGLE_ARG(int64_t, "transW", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -40,8 +40,8 @@ class FullyConnectedGradientOp final : public Operator<Context> {
public:
FullyConnectedGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
out_channels_(OpArg<int64_t>("out_channels", 0)),
transW_(OpArg<int64_t>("transW", 1)) {}
out_channels_(OP_SINGLE_ARG(int64_t, "out_channels", 0)),
transW_(OP_SINGLE_ARG(int64_t, "transW", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -20,9 +20,9 @@ void LogGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(LogGradient);
DEPLOY_CPU_OPERATOR(LogGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(LogGradient);
DEPLOY_CUDA_OPERATOR(LogGradient);
#endif
OPERATOR_SCHEMA(LogGradient)
......
......@@ -158,14 +158,14 @@ void MatMulGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(MatMul);
DEPLOY_CPU_OPERATOR(MatMul);
#ifdef USE_CUDA
DEPLOY_CUDA(MatMul);
DEPLOY_CUDA_OPERATOR(MatMul);
#endif
DEPLOY_CPU(MatMulGradient);
DEPLOY_CPU_OPERATOR(MatMulGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(MatMulGradient);
DEPLOY_CUDA_OPERATOR(MatMulGradient);
#endif
OPERATOR_SCHEMA(MatMul)
......
......@@ -22,8 +22,8 @@ class MatMulOp final : public Operator<Context> {
public:
MatMulOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
transA_(OpArg<int64_t>("transA", 0)),
transB_(OpArg<int64_t>("transB", 0)) {}
transA_(OP_SINGLE_ARG(int64_t, "transA", 0)),
transB_(OP_SINGLE_ARG(int64_t, "transB", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -40,8 +40,8 @@ class MatMulGradientOp final : public Operator<Context> {
public:
MatMulGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
transA_(OpArg<int64_t>("transA", 0)),
transB_(OpArg<int64_t>("transB", 0)) {}
transA_(OP_SINGLE_ARG(int64_t, "transA", 0)),
transB_(OP_SINGLE_ARG(int64_t, "transB", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -138,9 +138,9 @@ void MaximumGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(2));
}
DEPLOY_CPU(MaximumGradient);
DEPLOY_CPU_OPERATOR(MaximumGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(MaximumGradient);
DEPLOY_CUDA_OPERATOR(MaximumGradient);
#endif
OPERATOR_SCHEMA(MaximumGradient)
......
......@@ -138,9 +138,9 @@ void MinimumGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(2));
}
DEPLOY_CPU(MinimumGradient);
DEPLOY_CPU_OPERATOR(MinimumGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(MinimumGradient);
DEPLOY_CUDA_OPERATOR(MinimumGradient);
#endif
OPERATOR_SCHEMA(MinimumGradient)
......
......@@ -59,20 +59,19 @@ void MomentsOp<Context>::DoRunWithType() {
template <class Context>
void MomentsOp<Context>::RunOnDevice() {
auto& X = Input(0);
if (XIsType(X, int8_t)) {
if (X.template IsType<int8_t>()) {
DoRunWithType<int8_t, float>();
} else if (XIsType(X, uint8_t)) {
} else if (X.template IsType<uint8_t>()) {
DoRunWithType<uint8_t, float>();
} else if (XIsType(X, int)) {
} else if (X.template IsType<int>()) {
DoRunWithType<int, float>();
} else if (XIsType(X, int64_t)) {
} else if (X.template IsType<int64_t>()) {
DoRunWithType<int64_t, float>();
} else if (XIsType(X, float16)) {
} else if (X.template IsType<float16>()) {
DoRunWithType<float16, float>();
} else if (XIsType(X, float)) {
} else if (X.template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(X, double)) {
} else if (X.template IsType<double>()) {
DoRunWithType<double, double>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -81,9 +80,9 @@ void MomentsOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(Moments);
DEPLOY_CPU_OPERATOR(Moments);
#ifdef USE_CUDA
DEPLOY_CUDA(Moments);
DEPLOY_CUDA_OPERATOR(Moments);
#endif
OPERATOR_SCHEMA(Moments)
......
......@@ -22,8 +22,8 @@ class MomentsOp final : public Operator<Context> {
public:
MomentsOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
axes_(OpArgs<int64_t>("axes")),
keep_dims_(OpArg<int64_t>("keep_dims", 0)) {}
axes_(OP_REPEATED_ARG(int64_t, "axes")),
keep_dims_(OP_SINGLE_ARG(int64_t, "keep_dims", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
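The MomentsOp constructor above shows the repeated-argument side of the rename: OpArgs<T>(name) becomes OP_REPEATED_ARG(T, name), again backed by GetArgument according to the commit summary. The real macro is not in this hunk; the sketch below is a hypothetical forwarding to a vector-returning getter, with a toy argument store in place of OperatorDef.

// Hypothetical sketch of OP_REPEATED_ARG forwarding to a vector getter.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

class OperatorBase {
 public:
  template <typename T>
  std::vector<T> GetArgument(const std::string& name) const {
    auto it = repeated_int_args.find(name);
    if (it == repeated_int_args.end()) return {};
    return std::vector<T>(it->second.begin(), it->second.end());
  }
  std::map<std::string, std::vector<int64_t>> repeated_int_args;  // toy store
};

#define OP_REPEATED_ARG(type, name) GetArgument<type>(name)

struct MomentsLikeOp : OperatorBase {
  std::vector<int64_t> axes_ = OP_REPEATED_ARG(int64_t, "axes");
};

int main() {
  MomentsLikeOp op;                      // no "axes" entries in the toy store
  std::cout << op.axes_.size() << "\n";  // 0
}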
......@@ -40,7 +40,7 @@ void MulOp<Context>::DoRunWithType() {
template <class Context>
void MulOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -174,14 +174,14 @@ void MulGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(2));
}
DEPLOY_CPU(Mul);
DEPLOY_CPU_OPERATOR(Mul);
#ifdef USE_CUDA
DEPLOY_CUDA(Mul);
DEPLOY_CUDA_OPERATOR(Mul);
#endif
DEPLOY_CPU(MulGradient);
DEPLOY_CPU_OPERATOR(MulGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(MulGradient);
DEPLOY_CUDA_OPERATOR(MulGradient);
#endif
OPERATOR_SCHEMA(Mul)
......
......@@ -38,14 +38,14 @@ void NegGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Neg);
DEPLOY_CPU_OPERATOR(Neg);
#ifdef USE_CUDA
DEPLOY_CUDA(Neg);
DEPLOY_CUDA_OPERATOR(Neg);
#endif
DEPLOY_CPU(NegGradient);
DEPLOY_CPU_OPERATOR(NegGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(NegGradient);
DEPLOY_CUDA_OPERATOR(NegGradient);
#endif
OPERATOR_SCHEMA(Neg)
......
......@@ -173,9 +173,9 @@ void PowGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(PowGradient);
DEPLOY_CPU_OPERATOR(PowGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(PowGradient);
DEPLOY_CUDA_OPERATOR(PowGradient);
#endif
OPERATOR_SCHEMA(PowGradient)
......
......@@ -37,14 +37,14 @@ void ReciprocalGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Reciprocal);
DEPLOY_CPU_OPERATOR(Reciprocal);
#ifdef USE_CUDA
DEPLOY_CUDA(Reciprocal);
DEPLOY_CUDA_OPERATOR(Reciprocal);
#endif
DEPLOY_CPU(ReciprocalGradient);
DEPLOY_CPU_OPERATOR(ReciprocalGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ReciprocalGradient);
DEPLOY_CUDA_OPERATOR(ReciprocalGradient);
#endif
OPERATOR_SCHEMA(Reciprocal)
......
......@@ -21,9 +21,9 @@ void RsqrtGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(RsqrtGradient);
DEPLOY_CPU_OPERATOR(RsqrtGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(RsqrtGradient);
DEPLOY_CUDA_OPERATOR(RsqrtGradient);
#endif
OPERATOR_SCHEMA(RsqrtGradient)
......
......@@ -19,9 +19,9 @@ void SignGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(SignGradient);
DEPLOY_CPU_OPERATOR(SignGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SignGradient);
DEPLOY_CUDA_OPERATOR(SignGradient);
#endif
OPERATOR_SCHEMA(SignGradient)
......
......@@ -21,9 +21,9 @@ void SinGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(SinGradient);
DEPLOY_CPU_OPERATOR(SinGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SinGradient);
DEPLOY_CUDA_OPERATOR(SinGradient);
#endif
OPERATOR_SCHEMA(SinGradient)
......
......@@ -26,9 +26,9 @@ void SqrtGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(SqrtGradient);
DEPLOY_CPU_OPERATOR(SqrtGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SqrtGradient);
DEPLOY_CUDA_OPERATOR(SqrtGradient);
#endif
OPERATOR_SCHEMA(SqrtGradient)
......
......@@ -26,9 +26,9 @@ void SquareGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(SquareGradient);
DEPLOY_CPU_OPERATOR(SquareGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SquareGradient);
DEPLOY_CUDA_OPERATOR(SquareGradient);
#endif
OPERATOR_SCHEMA(SquareGradient)
......
......@@ -40,7 +40,7 @@ void SubOp<Context>::DoRunWithType() {
template <class Context>
void SubOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
template <class Context>
......@@ -97,14 +97,14 @@ void SubGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Sub);
DEPLOY_CPU_OPERATOR(Sub);
#ifdef USE_CUDA
DEPLOY_CUDA(Sub);
DEPLOY_CUDA_OPERATOR(Sub);
#endif
DEPLOY_CPU(SubGradient);
DEPLOY_CPU_OPERATOR(SubGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SubGradient);
DEPLOY_CUDA_OPERATOR(SubGradient);
#endif
OPERATOR_SCHEMA(Sub)
......
......@@ -49,19 +49,19 @@ void AccuracyOp<Context>::DoRunWithType() {
template <class Context>
void AccuracyOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) {
if (Input(0).template IsType<float>()) {
if (Input(1).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<float, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(1).meta()), {"float32", "int64"});
}
} else if (XIsType(Input(0), double)) {
if (XIsType(Input(1), double)) {
} else if (Input(0).template IsType<double>()) {
if (Input(1).template IsType<double>()) {
DoRunWithType<double, double>();
} else if (XIsType(Input(1), int64_t)) {
} else if (Input(1).template IsType<int64_t>()) {
DoRunWithType<double, int64_t>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -73,9 +73,9 @@ void AccuracyOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(Accuracy);
DEPLOY_CPU_OPERATOR(Accuracy);
#ifdef USE_CUDA
DEPLOY_CUDA(Accuracy);
DEPLOY_CUDA_OPERATOR(Accuracy);
#endif
OPERATOR_SCHEMA(Accuracy)
......
......@@ -22,8 +22,8 @@ class AccuracyOp final : public Operator<Context> {
public:
AccuracyOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
top_k_(OpArg<int64_t>("top_k", 1)),
ignore_index_(OpArg<int64_t>("ignore_index", -1)) {}
top_k_(OP_SINGLE_ARG(int64_t, "top_k", 1)),
ignore_index_(OP_SINGLE_ARG(int64_t, "ignore_index", -1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -107,7 +107,7 @@ void BatchNormOp<Context>::RunOnDevice() {
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
if (is_training_) {
TrainingImpl<float, float>();
} else {
......@@ -184,7 +184,7 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
......@@ -196,14 +196,14 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(BatchNorm);
DEPLOY_CPU_OPERATOR(BatchNorm);
#ifdef USE_CUDA
DEPLOY_CUDA(BatchNorm);
DEPLOY_CUDA_OPERATOR(BatchNorm);
#endif
DEPLOY_CPU(BatchNormGradient);
DEPLOY_CPU_OPERATOR(BatchNormGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(BatchNormGradient);
DEPLOY_CUDA_OPERATOR(BatchNormGradient);
#endif
OPERATOR_SCHEMA(BatchNorm)
......
......@@ -33,9 +33,9 @@ class BatchNormOpBase : public GenericOpBase<Context> {
public:
BatchNormOpBase(const OperatorDef& def, Workspace* ws)
: GenericOpBase<Context>(def, ws),
momentum_(OpArg<float>("momentum", 0.9f)),
epsilon_(OpArg<double>("epsilon", 1e-5)),
use_stats_(OpArg<int64_t>("use_stats", -1)) {}
momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)),
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)) {}
USE_OPERATOR_FUNCTIONS;
void DetermineBaseArguments() {
......@@ -48,7 +48,7 @@ class BatchNormOpBase : public GenericOpBase<Context> {
}
// Determine the data format
this->data_format_ = "NCHW";
auto axis = OpArg<int64_t>("axis", -1);
auto axis = OP_SINGLE_ARG(int64_t, "axis", -1);
if (axis == -1) axis += X.ndim();
if (axis + 1 == X.ndim()) this->data_format_ = "NHWC";
N_ = X.dim(0), C_ = X.dim(axis);
......
......@@ -78,9 +78,9 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() {
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
DoRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
} else if (Input(0).template IsType<float16>()) {
DoRunWithType<float16>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -136,13 +136,13 @@ void CuDNNBatchNormGradientOp<Context>::RunOnDevice() {
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float>();
} else {
this->template InferenceImpl<float, float>();
}
} else if (XIsType(Input(0), float16)) {
} else if (Input(0).template IsType<float16>()) {
if (is_training_ > 0) {
TrainingImpl<float16>();
} else {
......@@ -155,8 +155,8 @@ void CuDNNBatchNormGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CUDNN(BatchNorm);
DEPLOY_CUDNN(BatchNormGradient);
DEPLOY_CUDNN_OPERATOR(BatchNorm);
DEPLOY_CUDNN_OPERATOR(BatchNormGradient);
} // namespace dragon
......
......@@ -104,7 +104,7 @@ void SyncBatchNormOp<Context>::RunOnDevice() {
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
......@@ -189,7 +189,7 @@ void SyncBatchNormGradientOp<Context>::RunOnDevice() {
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
......@@ -201,14 +201,14 @@ void SyncBatchNormGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(SyncBatchNorm);
DEPLOY_CPU_OPERATOR(SyncBatchNorm);
#ifdef USE_CUDA
DEPLOY_CUDA(SyncBatchNorm);
DEPLOY_CUDA_OPERATOR(SyncBatchNorm);
#endif
DEPLOY_CPU(SyncBatchNormGradient);
DEPLOY_CPU_OPERATOR(SyncBatchNormGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SyncBatchNormGradient);
DEPLOY_CUDA_OPERATOR(SyncBatchNormGradient);
#endif
OPERATOR_SCHEMA(SyncBatchNorm)
......
......@@ -55,9 +55,9 @@ void GroupNormOp<Context>::RunOnDevice() {
DetermineBaseArguments();
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(0), float16)) {
} else if (Input(0).template IsType<float16>()) {
DoRunWithType<float16, float>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -97,9 +97,9 @@ void GroupNormGradientOp<Context>::RunOnDevice() {
DetermineBaseArguments();
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
if (Input(0).template IsType<float>()) {
DoRunWithType<float, float>();
} else if (XIsType(Input(0), float16)) {
} else if (Input(0).template IsType<float16>()) {
DoRunWithType<float16, float>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -107,14 +107,14 @@ void GroupNormGradientOp<Context>::RunOnDevice() {
}
}
DEPLOY_CPU(GroupNorm);
DEPLOY_CPU_OPERATOR(GroupNorm);
#ifdef USE_CUDA
DEPLOY_CUDA(GroupNorm);
DEPLOY_CUDA_OPERATOR(GroupNorm);
#endif
DEPLOY_CPU(GroupNormGradient);
DEPLOY_CPU_OPERATOR(GroupNormGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(GroupNormGradient);
DEPLOY_CUDA_OPERATOR(GroupNormGradient);
#endif
OPERATOR_SCHEMA(GroupNorm)
......
......@@ -22,15 +22,15 @@ class GroupNormOpBase : public Operator<Context> {
public:
GroupNormOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
group_(OpArg<int64_t>("group", 0)),
epsilon_(OpArg<double>("epsilon", 1e-5)) {}
group_(OP_SINGLE_ARG(int64_t, "group", 0)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)) {}
USE_OPERATOR_FUNCTIONS;
void DetermineBaseArguments() {
auto& X = Input(0);
// Determine the data format
this->data_format_ = "NCHW";
auto axis = OpArg<int64_t>("axis", -1);
auto axis = OP_SINGLE_ARG(int64_t, "axis", -1);
if (axis == -1) axis += X.ndim();
if (axis + 1 == X.ndim()) this->data_format_ = "NHWC";
if (X.ndim() == 2) this->data_format_ = "NCHW";
......
......@@ -5,16 +5,16 @@
namespace dragon {
#define CANONICALIZE_AXES_WITH_TENSOR(tensor) \
CANONICALIZE_AXIS_WITH_TENSOR(tensor); \
auto num_axes = OpArg<int64_t>("num_axes", 1); \
if (num_axes < 0) { \
num_axes = tensor.ndim() - axis; \
} else if (num_axes == 0) { \
num_axes = 1; \
} \
CHECK(axis + num_axes <= tensor.ndim()) \
<< "\nInvalid number of axes. Got " << num_axes \
#define CANONICALIZE_AXES_WITH_TENSOR(tensor) \
CANONICALIZE_AXIS_WITH_TENSOR(tensor); \
auto num_axes = OP_SINGLE_ARG(int64_t, "num_axes", 1); \
if (num_axes < 0) { \
num_axes = tensor.ndim() - axis; \
} else if (num_axes == 0) { \
num_axes = 1; \
} \
CHECK(axis + num_axes <= tensor.ndim()) \
<< "\nInvalid number of axes. Got " << num_axes \
<< ", excepted in the range [1, " << tensor.ndim() - axis << "]."
template <class Context>
......@@ -94,14 +94,14 @@ void LpNormalizeGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(LpNormalize);
DEPLOY_CPU_OPERATOR(LpNormalize);
#ifdef USE_CUDA
DEPLOY_CUDA(LpNormalize);
DEPLOY_CUDA_OPERATOR(LpNormalize);
#endif
DEPLOY_CPU(LpNormalizeGradient);
DEPLOY_CPU_OPERATOR(LpNormalizeGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(LpNormalizeGradient);
DEPLOY_CUDA_OPERATOR(LpNormalizeGradient);
#endif
OPERATOR_SCHEMA(LpNormalize)
......
......@@ -22,9 +22,9 @@ class LpNormalizeOp final : public Operator<Context> {
public:
LpNormalizeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
p_(OpArg<int64_t>("p", 2)),
epsilon_(OpArg<double>("epsilon", 1e-12)),
reduction_(OpArg<string>("reduction", "SUM")) {}
p_(OP_SINGLE_ARG(int64_t, "p", 2)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)),
reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -43,9 +43,9 @@ class LpNormalizeGradientOp final : public Operator<Context> {
public:
LpNormalizeGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
p_(OpArg<int64_t>("p", 2)),
epsilon_(OpArg<double>("epsilon", 1e-12)),
reduction_(OpArg<string>("reduction", "SUM")) {}
p_(OP_SINGLE_ARG(int64_t, "p", 2)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)),
reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -25,14 +25,14 @@ void LRNGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(-1));
}
DEPLOY_CPU(LRN);
DEPLOY_CPU_OPERATOR(LRN);
#ifdef USE_CUDA
DEPLOY_CUDA(LRN);
DEPLOY_CUDA_OPERATOR(LRN);
#endif
DEPLOY_CPU(LRNGradient);
DEPLOY_CPU_OPERATOR(LRNGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(LRNGradient);
DEPLOY_CUDA_OPERATOR(LRNGradient);
#endif
OPERATOR_SCHEMA(LRN)
......
......@@ -22,10 +22,10 @@ class LRNOp : public Operator<Context> {
public:
LRNOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
size_(OpArg<int64_t>("size", 5)),
alpha_(OpArg<float>("alpha", 0.0001f)),
beta_(OpArg<float>("beta", 0.75f)),
bias_(OpArg<float>("bias", 1.f)) {}
size_(OP_SINGLE_ARG(int64_t, "size", 5)),
alpha_(OP_SINGLE_ARG(float, "alpha", 0.0001f)),
beta_(OP_SINGLE_ARG(float, "beta", 0.75f)),
bias_(OP_SINGLE_ARG(float, "bias", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -43,10 +43,10 @@ class LRNGradientOp : public Operator<Context> {
public:
LRNGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
size_(OpArg<int64_t>("size", 5)),
alpha_(OpArg<float>("alpha", 0.0001f)),
beta_(OpArg<float>("beta", 0.75f)),
bias_(OpArg<float>("bias", 1.f)) {}
size_(OP_SINGLE_ARG(int64_t, "size", 5)),
alpha_(OP_SINGLE_ARG(float, "alpha", 0.0001f)),
beta_(OP_SINGLE_ARG(float, "beta", 0.75f)),
bias_(OP_SINGLE_ARG(float, "bias", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -54,8 +54,8 @@ void CuDNNLRNGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(LRN);
DEPLOY_CUDNN(LRNGradient);
DEPLOY_CUDNN_OPERATOR(LRN);
DEPLOY_CUDNN_OPERATOR(LRNGradient);
} // namespace dragon
......
......@@ -19,13 +19,7 @@ template <class Context>
void LSTMCellOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(1));
Output(1)->ReshapeLike(Input(1));
if (XIsType(Input(0), float)) {
DoRunWithType<float>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
}
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
template <class Context>
......@@ -51,29 +45,22 @@ template <class Context>
void LSTMCellGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
Output(1)->ReshapeLike(Input(1));
if (!Input(-1).has_name()) {
// dC will be ignored if C is not solved
// We should Zero-Reset the dC
Input(-1).ReshapeLike(Input(-2));
}
if (XIsType(Input(0), float)) {
DoRunWithType<float>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
}
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
DEPLOY_CPU(LSTMCell);
DEPLOY_CPU_OPERATOR(LSTMCell);
#ifdef USE_CUDA
DEPLOY_CUDA(LSTMCell);
DEPLOY_CUDA_OPERATOR(LSTMCell);
#endif
DEPLOY_CPU(LSTMCellGradient);
DEPLOY_CPU_OPERATOR(LSTMCellGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(LSTMCellGradient);
DEPLOY_CUDA_OPERATOR(LSTMCellGradient);
#endif
OPERATOR_SCHEMA(LSTMCell)
......
......@@ -2,14 +2,14 @@
namespace dragon {
DEPLOY_CPU(Recurrent);
DEPLOY_CPU_OPERATOR(Recurrent);
#ifdef USE_CUDA
DEPLOY_CUDA(Recurrent);
DEPLOY_CUDA_OPERATOR(Recurrent);
#endif
DEPLOY_CPU(RecurrentGradient);
DEPLOY_CPU_OPERATOR(RecurrentGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(RecurrentGradient);
DEPLOY_CUDA_OPERATOR(RecurrentGradient);
#endif
OPERATOR_SCHEMA(Recurrent)
......
......@@ -316,8 +316,8 @@ void CuDNNRecurrentGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(Recurrent);
DEPLOY_CUDNN(RecurrentGradient);
DEPLOY_CUDNN_OPERATOR(Recurrent);
DEPLOY_CUDNN_OPERATOR(RecurrentGradient);
} // namespace dragon
......
......@@ -53,16 +53,16 @@ class CuDNNRecurrentOpBase : public Operator<Context> {
CuDNNRecurrentOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
states_initialized_(0),
num_layers_(OpArg<int64_t>("num_layers", 1)),
hidden_size_(OpArg<int64_t>("hidden_size", 0)),
bidirectional_(OpArg<int64_t>("bidirectional", 0)),
dropout_ratio_(OpArg<float>("dropout_ratio", 1.f)),
num_layers_(OP_SINGLE_ARG(int64_t, "num_layers", 1)),
hidden_size_(OP_SINGLE_ARG(int64_t, "hidden_size", 0)),
bidirectional_(OP_SINGLE_ARG(int64_t, "bidirectional", 0)),
dropout_ratio_(OP_SINGLE_ARG(float, "dropout_ratio", 1.f)),
rng_seed_(def.device_option().random_seed()) {
// Determine the rnn direction
rnn_direction_ =
bidirectional_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
// Determine the rnn mode
auto mode_str = OpArg<string>("rnn_mode", "");
auto mode_str = OP_SINGLE_ARG(string, "rnn_mode", "");
if (mode_str == "rnn_tanh") {
rnn_mode_ = CUDNN_RNN_TANH;
} else if (mode_str == "rnn_relu") {
......@@ -75,7 +75,7 @@ class CuDNNRecurrentOpBase : public Operator<Context> {
LOG(FATAL) << "Unknown RNN Mode: " << mode_str;
}
// Determine the rnn input mode
auto input_mode_str = OpArg<string>("rnn_input_mode", "linear");
auto input_mode_str = OP_SINGLE_ARG(string, "rnn_input_mode", "linear");
if (input_mode_str == "skip") {
rnn_input_mode_ = CUDNN_SKIP_INPUT;
} else if (input_mode_str == "linear") {
......@@ -84,7 +84,7 @@ class CuDNNRecurrentOpBase : public Operator<Context> {
LOG(FATAL) << "Unknown RNN InputMode: " << input_mode_str;
}
// Override the running phase
SwitchToPhase(OpArg<string>("phase", ""));
SwitchToPhase(OP_SINGLE_ARG(string, "phase", ""));
CuDNNCreateTensorDesc(&hx_desc_);
CuDNNCreateTensorDesc(&cx_desc_);
CuDNNCreateTensorDesc(&hy_desc_);
......
......@@ -39,9 +39,9 @@ void RNNParamSetOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(RNNParamSet);
DEPLOY_CPU_OPERATOR(RNNParamSet);
#ifdef USE_CUDA
DEPLOY_CUDA(RNNParamSet);
DEPLOY_CUDA_OPERATOR(RNNParamSet);
#endif
OPERATOR_SCHEMA(RNNParamSet)
......
......@@ -22,14 +22,14 @@ class RNNParamSetOp final : public Operator<Context> {
public:
RNNParamSetOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
param_type_(OpArg<string>("param_type", "matrix")),
nlayers_(OpArg<int64_t>("num_layers", 1)),
ndirections_(OpArg<int64_t>("num_directions", 1)),
input_size_(OpArg<int64_t>("input_size", 0)),
hidden_size_(OpArg<int64_t>("hidden_size", 0)),
layer_id_(OpArg<int64_t>("layer_id", 0)),
param_id_(OpArg<int64_t>("param_id", 0)) {
auto mode_str = OpArg<string>("rnn_mode", "rnn_tanh");
param_type_(OP_SINGLE_ARG(string, "param_type", "matrix")),
nlayers_(OP_SINGLE_ARG(int64_t, "num_layers", 1)),
ndirections_(OP_SINGLE_ARG(int64_t, "num_directions", 1)),
input_size_(OP_SINGLE_ARG(int64_t, "input_size", 0)),
hidden_size_(OP_SINGLE_ARG(int64_t, "hidden_size", 0)),
layer_id_(OP_SINGLE_ARG(int64_t, "layer_id", 0)),
param_id_(OP_SINGLE_ARG(int64_t, "param_id", 0)) {
auto mode_str = OP_SINGLE_ARG(string, "rnn_mode", "rnn_tanh");
if (mode_str == "rnn_tanh") {
nparams_ = 2;
spliter_ = 1;
......
......@@ -21,9 +21,9 @@ void AdamUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
ctx());
}
DEPLOY_CPU(AdamUpdate);
DEPLOY_CPU_OPERATOR(AdamUpdate);
#ifdef USE_CUDA
DEPLOY_CUDA(AdamUpdate);
DEPLOY_CUDA_OPERATOR(AdamUpdate);
#endif
OPERATOR_SCHEMA(AdamUpdate)
......
......@@ -15,9 +15,9 @@ void NesterovUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
ctx());
}
DEPLOY_CPU(NesterovUpdate);
DEPLOY_CPU_OPERATOR(NesterovUpdate);
#ifdef USE_CUDA
DEPLOY_CUDA(NesterovUpdate);
DEPLOY_CUDA_OPERATOR(NesterovUpdate);
#endif
OPERATOR_SCHEMA(NesterovUpdate)
......
......@@ -18,9 +18,9 @@ void RMSpropUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
ctx());
}
DEPLOY_CPU(RMSpropUpdate);
DEPLOY_CPU_OPERATOR(RMSpropUpdate);
#ifdef USE_CUDA
DEPLOY_CUDA(RMSpropUpdate);
DEPLOY_CUDA_OPERATOR(RMSpropUpdate);
#endif
OPERATOR_SCHEMA(RMSpropUpdate)
......
......@@ -19,9 +19,9 @@ void SGDUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
ctx());
}
DEPLOY_CPU(SGDUpdate);
DEPLOY_CPU_OPERATOR(SGDUpdate);
#ifdef USE_CUDA
DEPLOY_CUDA(SGDUpdate);
DEPLOY_CUDA_OPERATOR(SGDUpdate);
#endif
OPERATOR_SCHEMA(SGDUpdate)
......
......@@ -37,7 +37,7 @@ void UpdateOpBase<Context>::AdjustGradient(Tensor* dX, Tensor* X) {
// Penalty
auto weight_decay = Parameter("weight_decay");
if (weight_decay > 0.f) {
if (XIsType((*X), float16)) {
if (X->template IsType<float16>()) {
kernel::MixedPrecL2Penalty(
X->count(),
weight_decay * decay_mult_,
......@@ -58,7 +58,7 @@ void UpdateOpBase<Context>::AdjustGradient(Tensor* dX, Tensor* X) {
template <class Context>
template <typename T>
void UpdateOpBase<Context>::ApplyUpdate(Tensor* dX, Tensor* X) {
if (XIsType((*X), float16)) {
if (X->template IsType<float16>()) {
kernel::MixedPrecUpdate(
X->count(),
dX->template data<float, Context>(),
......@@ -85,11 +85,11 @@ void UpdateOpBase<Context>::RunOnDevice() {
<< "\nParam and grad should have the same dimensions."
<< "\nGot" << X->DimString() << " and " << dX.DimString();
if (XIsType(dX, float)) {
if (dX.template IsType<float>()) {
AdjustGradient<float>(&dX, X);
ComputeUpdate(&dX);
ApplyUpdate<float>(&dX, X);
} else if (XIsType(dX, float16)) {
} else if (dX.template IsType<float16>()) {
auto* dX_cast = ws()->CreateTensor(dX.name() + "[float32]");
kernel::Cast(
dX.count(),
......
......@@ -22,8 +22,8 @@ class UpdateOpBase : public Operator<Context> {
public:
UpdateOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
lr_mult_(OpArg<float>("lr_mult", 1.f)),
decay_mult_(OpArg<float>("decay_mult", 1.f)) {}
lr_mult_(OP_SINGLE_ARG(float, "lr_mult", 1.f)),
decay_mult_(OP_SINGLE_ARG(float, "decay_mult", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
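The UpdateOpBase hunks above keep their mixed-precision handling: when the gradient is float16 it is cast to a float32 tensor (dX_cast) before the update is computed, and float16 parameters go through kernel::MixedPrecUpdate. Those kernels are assumed and not reproduced; the sketch below only illustrates the widen-then-update idea with plain float buffers.

// Toy illustration of widening a half-precision gradient before the update.
#include <iostream>
#include <vector>

// Stand-in for kernel::Cast(float16 -> float32).
std::vector<float> WidenToFloat32(const std::vector<float>& half_grad) {
  return half_grad;  // a real float16 buffer would be converted element-wise
}

// Stand-in for the float32 path of the optimizer update.
void ApplyUpdate(std::vector<float>& param,
                 const std::vector<float>& update, float lr) {
  for (size_t i = 0; i < param.size(); ++i) {
    param[i] -= lr * update[i];
  }
}

int main() {
  std::vector<float> param = {1.0f, 2.0f};
  std::vector<float> half_grad = {0.5f, 0.25f};   // pretend this is float16
  auto grad32 = WidenToFloat32(half_grad);        // the "dX_cast" step above
  ApplyUpdate(param, grad32, /*lr=*/0.1f);
  std::cout << param[0] << " " << param[1] << "\n";  // 0.95 1.975
}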
......@@ -73,14 +73,14 @@ void BiasAddGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(BiasAdd);
DEPLOY_CPU_OPERATOR(BiasAdd);
#ifdef USE_CUDA
DEPLOY_CUDA(BiasAdd);
DEPLOY_CUDA_OPERATOR(BiasAdd);
#endif
DEPLOY_CPU(BiasAddGradient);
DEPLOY_CPU_OPERATOR(BiasAddGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(BiasAddGradient);
DEPLOY_CUDA_OPERATOR(BiasAddGradient);
#endif
OPERATOR_SCHEMA(BiasAdd)
......
......@@ -44,7 +44,7 @@ void CuDNNBiasAddGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(BiasAddGradient);
DEPLOY_CUDNN_OPERATOR(BiasAddGradient);
} // namespace dragon
......
......@@ -73,14 +73,14 @@ void Conv2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU(Conv2d);
DEPLOY_CPU_OPERATOR(Conv2d);
#ifdef USE_CUDA
DEPLOY_CUDA(Conv2d);
DEPLOY_CUDA_OPERATOR(Conv2d);
#endif
DEPLOY_CPU(Conv2dGradient);
DEPLOY_CPU_OPERATOR(Conv2dGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(Conv2dGradient);
DEPLOY_CUDA_OPERATOR(Conv2dGradient);
#endif
OPERATOR_SCHEMA(Conv2d)
......
......@@ -585,8 +585,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(-1));
}
DEPLOY_CUDNN(Conv2d);
DEPLOY_CUDNN(Conv2dGradient);
DEPLOY_CUDNN_OPERATOR(Conv2d);
DEPLOY_CUDNN_OPERATOR(Conv2dGradient);
} // namespace dragon
......
......@@ -73,14 +73,14 @@ void ConvTranspose2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU(ConvTranspose2d);
DEPLOY_CPU_OPERATOR(ConvTranspose2d);
#ifdef USE_CUDA
DEPLOY_CUDA(ConvTranspose2d);
DEPLOY_CUDA_OPERATOR(ConvTranspose2d);
#endif
DEPLOY_CPU(ConvTranspose2dGradient);
DEPLOY_CPU_OPERATOR(ConvTranspose2dGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ConvTranspose2dGradient);
DEPLOY_CUDA_OPERATOR(ConvTranspose2dGradient);
#endif
OPERATOR_SCHEMA(ConvTranspose2d)
......
......@@ -580,8 +580,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(-1));
}
DEPLOY_CUDNN(ConvTranspose2d);
DEPLOY_CUDNN(ConvTranspose2dGradient);
DEPLOY_CUDNN_OPERATOR(ConvTranspose2d);
DEPLOY_CUDNN_OPERATOR(ConvTranspose2dGradient);
} // namespace dragon
......
......@@ -217,10 +217,10 @@ void ConvOpBase<Context>::Db(const T* dy, T* db) {
template <class Context>
void ConvOpBase<Context>::Setup(int num_axes) {
num_axes_ = num_axes;
auto pads = OpArgs<int64_t>("pads");
auto strides = OpArgs<int64_t>("strides");
auto kshape = OpArgs<int64_t>("kernel_shape");
auto dilations = OpArgs<int64_t>("dilations");
auto pads = OP_REPEATED_ARG(int64_t, "pads");
auto strides = OP_REPEATED_ARG(int64_t, "strides");
auto kshape = OP_REPEATED_ARG(int64_t, "kernel_shape");
auto dilations = OP_REPEATED_ARG(int64_t, "dilations");
auto at = [&](const vec64_t& vec, int i) {
return i < vec.size() ? vec[i] : vec[0];
......
......@@ -24,9 +24,9 @@ class ConvOpBase : public Operator<Context> {
public:
ConvOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
padding_(OpArg<string>("padding", "VALID")),
out_channels_(OpArg<int64_t>("out_channels", 0)),
group_(OpArg<int64_t>("group", 1)) {
padding_(OP_SINGLE_ARG(string, "padding", "VALID")),
out_channels_(OP_SINGLE_ARG(int64_t, "out_channels", 0)),
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {
if (data_format() == "NCHW") {
axis_ = 2;
} else if (data_format() == "NHWC") {
......@@ -35,8 +35,8 @@ class ConvOpBase : public Operator<Context> {
LOG(FATAL) << "Unknown DataFormat: " << data_format();
}
num_axes_ = -1; // Unknown
GET_ARGS_WITH_DESC(int64_t, output_shape);
GET_ARGS_WITH_DESC(int64_t, output_padding);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, output_shape);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, output_padding);
}
USE_OPERATOR_FUNCTIONS;
......@@ -50,8 +50,8 @@ class ConvOpBase : public Operator<Context> {
int64_t in_channels_, out_channels_, out_dim_;
int64_t x_offset_, w_offset_, y_offset_;
DECLARE_ARGS_WITH_DESC(int64_t, output_shape);
DECLARE_ARGS_WITH_DESC(int64_t, output_padding);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, output_shape);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, output_padding);
void Setup(int num_axes);
void Reshape(bool backward = false);
......@@ -135,8 +135,8 @@ class ConvOpBase : public Operator<Context> {
int64_t conv_in_channels_, conv_out_channels_, conv_out_dim_;
};
DEFINE_ARGS_WITH_DESC(int64_t, ConvOpBase, output_shape);
DEFINE_ARGS_WITH_DESC(int64_t, ConvOpBase, output_padding);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, ConvOpBase, output_shape);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, ConvOpBase, output_padding);
#define USE_CONVOLUTION_FUNCTIONS \
using ConvOpBase<Context>::Setup; \
......
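The conv_op_base.h hunk also renames the argument-with-descriptor macros: GET/DECLARE/DEFINE_ARGS_WITH_DESC become INIT/DECLARE/DEFINE_OP_REPEATED_ARG_WITH_DESC for arguments such as output_shape and output_padding. The macro internals are not shown here; the sketch below is a rough, assumed illustration of the underlying idea only, namely that each element of such an argument may be either a literal value or the name of a workspace entry resolved at run time.

// Rough sketch of a "repeated argument with descriptor": literal values or
// names resolved through the workspace at run time. Names and types are
// illustrative, not Dragon's.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Workspace {
  std::map<std::string, int64_t> scalars;  // toy stand-in for named tensors
};

struct RepeatedArgWithDesc {
  std::vector<int64_t> literals;   // used when no descriptors were given
  std::vector<std::string> descs;  // otherwise resolved through the workspace

  int64_t Get(const Workspace& ws, int i) const {
    if (descs.empty()) return literals.at(i);
    return ws.scalars.at(descs.at(i));
  }
};

int main() {
  Workspace ws;
  ws.scalars["output_shape/0"] = 56;

  RepeatedArgWithDesc output_shape;
  output_shape.descs = {"output_shape/0"};
  std::cout << output_shape.Get(ws, 0) << "\n";  // 56, resolved at run time
}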
......@@ -74,17 +74,17 @@ void DepthToSpaceOp<Context>::DoRunWithType() {
template <class Context>
void DepthToSpaceOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(DepthToSpace);
DEPLOY_CPU_OPERATOR(DepthToSpace);
#ifdef USE_CUDA
DEPLOY_CUDA(DepthToSpace);
DEPLOY_CUDA_OPERATOR(DepthToSpace);
#endif
DEPLOY_CPU(DepthToSpaceGradient);
DEPLOY_CPU_OPERATOR(DepthToSpaceGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(DepthToSpaceGradient);
DEPLOY_CUDA_OPERATOR(DepthToSpaceGradient);
#endif
OPERATOR_SCHEMA(DepthToSpace)
......
......@@ -21,7 +21,8 @@ template <class Context>
class DepthToSpaceOp final : public Operator<Context> {
public:
DepthToSpaceOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), block_size_(OpArg<int>("block_size", 2)) {}
: Operator<Context>(def, ws),
block_size_(OP_SINGLE_ARG(int, "block_size", 2)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -114,14 +114,14 @@ void DepthwiseConv2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
DEPLOY_CPU(DepthwiseConv2d);
DEPLOY_CPU_OPERATOR(DepthwiseConv2d);
#ifdef USE_CUDA
DEPLOY_CUDA(DepthwiseConv2d);
DEPLOY_CUDA_OPERATOR(DepthwiseConv2d);
#endif
DEPLOY_CPU(DepthwiseConv2dGradient);
DEPLOY_CPU_OPERATOR(DepthwiseConv2dGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(DepthwiseConv2dGradient);
DEPLOY_CUDA_OPERATOR(DepthwiseConv2dGradient);
#endif
OPERATOR_SCHEMA(DepthwiseConv2d)
......
......@@ -133,8 +133,8 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float>>::Call(this, Input(0));
}
DEPLOY_CUDNN(DepthwiseConv2d);
DEPLOY_CUDNN(DepthwiseConv2dGradient);
DEPLOY_CUDNN_OPERATOR(DepthwiseConv2d);
DEPLOY_CUDNN_OPERATOR(DepthwiseConv2dGradient);
} // namespace dragon
......
......@@ -108,14 +108,14 @@ void Pool2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<TensorTypes<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU(Pool2d);
DEPLOY_CPU_OPERATOR(Pool2d);
#ifdef USE_CUDA
DEPLOY_CUDA(Pool2d);
DEPLOY_CUDA_OPERATOR(Pool2d);
#endif
DEPLOY_CPU(Pool2dGradient);
DEPLOY_CPU_OPERATOR(Pool2dGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(Pool2dGradient);
DEPLOY_CUDA_OPERATOR(Pool2dGradient);
#endif
OPERATOR_SCHEMA(Pool2d)
......
......@@ -81,8 +81,8 @@ void CuDNNPool2dGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CUDNN(Pool2d);
DEPLOY_CUDNN(Pool2dGradient);
DEPLOY_CUDNN_OPERATOR(Pool2d);
DEPLOY_CUDNN_OPERATOR(Pool2dGradient);
} // namespace dragon
......
......@@ -21,9 +21,9 @@ void PoolOpBase<Context>::Setup(int num_axes) {
auto at = [&](const vec64_t& vec, int i) {
return i < vec.size() ? vec[i] : vec[0];
};
auto pads = OpArgs<int64_t>("pads");
auto strides = OpArgs<int64_t>("strides");
auto kshape = OpArgs<int64_t>("kernel_shape");
auto pads = OP_REPEATED_ARG(int64_t, "pads");
auto strides = OP_REPEATED_ARG(int64_t, "strides");
auto kshape = OP_REPEATED_ARG(int64_t, "kernel_shape");
for (int i = 0; i < num_axes_; i++) {
if (global_pool_) {
pad_l_.push_back(0);
......
......@@ -22,10 +22,10 @@ class PoolOpBase : public Operator<Context> {
public:
PoolOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
mode_(OpArg<string>("mode", "MAX")),
padding_(OpArg<string>("padding", "VALID")),
ceil_mode_(OpArg<int64_t>("ceil_mode", 0)),
global_pool_(OpArg<int64_t>("global_pooling", 0)) {
mode_(OP_SINGLE_ARG(string, "mode", "MAX")),
padding_(OP_SINGLE_ARG(string, "padding", "VALID")),
ceil_mode_(OP_SINGLE_ARG(int64_t, "ceil_mode", 0)),
global_pool_(OP_SINGLE_ARG(int64_t, "global_pooling", 0)) {
if (data_format() == "NCHW")
axis_ = 2;
else if (data_format() == "NHWC")
......
......@@ -112,7 +112,7 @@ void ResizeOp<Context>::RunOnDevice() {
LOG(FATAL) << "Specify either <sizes> or <scales>.";
}
DispatchHelper<MathTensorTypes>::Call(this, X);
DispatchHelper<NumericalTensorTypes>::Call(this, X);
}
template <class Context>
......@@ -192,11 +192,11 @@ void ResizeGradientOp<Context>::RunOnDevice() {
Buffer("in_dims")->template CopyTo<int64_t>(in_dims_);
Buffer("out_dims")->template CopyTo<int64_t>(out_dims_);
if (XIsType(Input(0), float16)) {
if (Input(0).template IsType<float16>()) {
DoRunWithTypeAndCast<float16>();
} else if (XIsType(Input(0), float)) {
} else if (Input(0).template IsType<float>()) {
DoRunWithType<float>();
} else if (XIsType(Input(0), double)) {
} else if (Input(0).template IsType<double>()) {
DoRunWithTypeAndCast<double>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -204,14 +204,14 @@ void ResizeGradientOp<Context>::RunOnDevice() {
};
}
DEPLOY_CPU(Resize);
DEPLOY_CPU_OPERATOR(Resize);
#ifdef USE_CUDA
DEPLOY_CUDA(Resize);
DEPLOY_CUDA_OPERATOR(Resize);
#endif
DEPLOY_CPU(ResizeGradient);
DEPLOY_CPU_OPERATOR(ResizeGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(ResizeGradient);
DEPLOY_CUDA_OPERATOR(ResizeGradient);
#endif
OPERATOR_SCHEMA(Resize)
......
......@@ -22,10 +22,10 @@ class ResizeOp final : public Operator<Context> {
public:
ResizeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
mode_(str::upper(OpArg<string>("mode", "NEAREST"))),
align_corners_(OpArg<int64_t>("align_corners", 0)) {
GET_ARGS_WITH_DESC(float, scales);
GET_ARGS_WITH_DESC(int64_t, sizes);
mode_(str::upper(OP_SINGLE_ARG(string, "mode", "NEAREST"))),
align_corners_(OP_SINGLE_ARG(int64_t, "align_corners", 0)) {
INIT_OP_REPEATED_ARG_WITH_DESC(float, scales);
INIT_OP_REPEATED_ARG_WITH_DESC(int64_t, sizes);
}
USE_OPERATOR_FUNCTIONS;
......@@ -38,8 +38,8 @@ class ResizeOp final : public Operator<Context> {
string mode_;
int64_t align_corners_;
vec64_t in_dims_, out_dims_, out_shape_;
DECLARE_ARGS_WITH_DESC(float, scales);
DECLARE_ARGS_WITH_DESC(int64_t, sizes);
DECLARE_OP_REPEATED_ARG_WITH_DESC(float, scales);
DECLARE_OP_REPEATED_ARG_WITH_DESC(int64_t, sizes);
};
template <class Context>
......@@ -47,8 +47,8 @@ class ResizeGradientOp final : public Operator<Context> {
public:
ResizeGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
mode_(str::upper(OpArg<string>("mode", "NEAREST"))),
align_corners_(OpArg<int64_t>("align_corners", 0)) {}
mode_(str::upper(OP_SINGLE_ARG(string, "mode", "NEAREST"))),
align_corners_(OP_SINGLE_ARG(int64_t, "align_corners", 0)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -71,8 +71,8 @@ class ResizeGradientOp final : public Operator<Context> {
vec64_t in_dims_, out_dims_;
};
DEFINE_ARGS_WITH_DESC(float, ResizeOp, scales);
DEFINE_ARGS_WITH_DESC(int64_t, ResizeOp, sizes);
DEFINE_OP_REPEATED_ARG_WITH_DESC(float, ResizeOp, scales);
DEFINE_OP_REPEATED_ARG_WITH_DESC(int64_t, ResizeOp, sizes);
} // namespace dragon
......
......@@ -88,11 +88,11 @@ void RoiAlignGradientOp<Context>::DoRunWithTypeAndCast() {
template <class Context>
void RoiAlignGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(1), float16)) {
if (Input(1).template IsType<float16>()) {
DoRunWithTypeAndCast<float16>();
} else if (XIsType(Input(1), float)) {
} else if (Input(1).template IsType<float>()) {
DoRunWithType<float>();
} else if (XIsType(Input(1), double)) {
} else if (Input(1).template IsType<double>()) {
DoRunWithTypeAndCast<double>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -100,14 +100,14 @@ void RoiAlignGradientOp<Context>::RunOnDevice() {
};
}
DEPLOY_CPU(RoiAlign);
DEPLOY_CPU_OPERATOR(RoiAlign);
#ifdef USE_CUDA
DEPLOY_CUDA(RoiAlign);
DEPLOY_CUDA_OPERATOR(RoiAlign);
#endif
DEPLOY_CPU(RoiAlignGradient);
DEPLOY_CPU_OPERATOR(RoiAlignGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(RoiAlignGradient);
DEPLOY_CUDA_OPERATOR(RoiAlignGradient);
#endif
OPERATOR_SCHEMA(RoiAlign)
......
......@@ -22,10 +22,10 @@ class RoiAlignOp final : public Operator<Context> {
public:
RoiAlignOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
out_h_(OpArg<int64_t>("pooled_h", 0)),
out_w_(OpArg<int64_t>("pooled_w", 0)),
spatial_scale_(OpArg<float>("spatial_scale", 1.f)),
sampling_ratio_(OpArg<int64_t>("sampling_ratio", 2)) {
out_h_(OP_SINGLE_ARG(int64_t, "pooled_h", 0)),
out_w_(OP_SINGLE_ARG(int64_t, "pooled_w", 0)),
spatial_scale_(OP_SINGLE_ARG(float, "spatial_scale", 1.f)),
sampling_ratio_(OP_SINGLE_ARG(int64_t, "sampling_ratio", 2)) {
CHECK_GT(out_h_, 0) << "\npooled_h must > 0";
CHECK_GT(out_w_, 0) << "\npooled_w must > 0";
}
......@@ -47,10 +47,10 @@ class RoiAlignGradientOp final : public Operator<Context> {
public:
RoiAlignGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
out_h_(OpArg<int64_t>("pooled_h", 0)),
out_w_(OpArg<int64_t>("pooled_w", 0)),
spatial_scale_(OpArg<float>("spatial_scale", 1.f)),
sampling_ratio_(OpArg<int64_t>("sampling_ratio", 2)) {}
out_h_(OP_SINGLE_ARG(int64_t, "pooled_h", 0)),
out_w_(OP_SINGLE_ARG(int64_t, "pooled_w", 0)),
spatial_scale_(OP_SINGLE_ARG(float, "spatial_scale", 1.f)),
sampling_ratio_(OP_SINGLE_ARG(int64_t, "sampling_ratio", 2)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -91,11 +91,11 @@ void RoiPoolGradientOp<Context>::DoRunWithTypeAndCast() {
template <class Context>
void RoiPoolGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(1), float16)) {
if (Input(1).template IsType<float16>()) {
DoRunWithTypeAndCast<float16>();
} else if (XIsType(Input(1), float)) {
} else if (Input(1).template IsType<float>()) {
DoRunWithType<float>();
} else if (XIsType(Input(1), double)) {
} else if (Input(1).template IsType<double>()) {
DoRunWithTypeAndCast<double>();
} else {
LOG(FATAL) << MessageForUnsupported(
......@@ -103,14 +103,14 @@ void RoiPoolGradientOp<Context>::RunOnDevice() {
};
}
DEPLOY_CPU(RoiPool);
DEPLOY_CPU_OPERATOR(RoiPool);
#ifdef USE_CUDA
DEPLOY_CUDA(RoiPool);
DEPLOY_CUDA_OPERATOR(RoiPool);
#endif
DEPLOY_CPU(RoiPoolGradient);
DEPLOY_CPU_OPERATOR(RoiPoolGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(RoiPoolGradient);
DEPLOY_CUDA_OPERATOR(RoiPoolGradient);
#endif
OPERATOR_SCHEMA(RoiPool)
......
......@@ -22,9 +22,9 @@ class RoiPoolOp final : public Operator<Context> {
public:
RoiPoolOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
out_h_(OpArg<int64_t>("pooled_h", 0)),
out_w_(OpArg<int64_t>("pooled_w", 0)),
spatial_scale_(OpArg<float>("spatial_scale", 1.f)) {}
out_h_(OP_SINGLE_ARG(int64_t, "pooled_h", 0)),
out_w_(OP_SINGLE_ARG(int64_t, "pooled_w", 0)),
spatial_scale_(OP_SINGLE_ARG(float, "spatial_scale", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -42,9 +42,9 @@ class RoiPoolGradientOp final : public Operator<Context> {
public:
RoiPoolGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
out_h_(OpArg<int64_t>("pooled_h", 0)),
out_w_(OpArg<int64_t>("pooled_w", 0)),
spatial_scale_(OpArg<float>("spatial_scale", 1.f)) {}
out_h_(OP_SINGLE_ARG(int64_t, "pooled_h", 0)),
out_w_(OP_SINGLE_ARG(int64_t, "pooled_w", 0)),
spatial_scale_(OP_SINGLE_ARG(float, "spatial_scale", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -83,17 +83,17 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
template <class Context>
void SpaceToDepthOp<Context>::RunOnDevice() {
DispatchHelper<AllTensorTypes>::Call(this, Input(0));
DispatchHelper<FullTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(SpaceToDepth);
DEPLOY_CPU_OPERATOR(SpaceToDepth);
#ifdef USE_CUDA
DEPLOY_CUDA(SpaceToDepth);
DEPLOY_CUDA_OPERATOR(SpaceToDepth);
#endif
DEPLOY_CPU(SpaceToDepthGradient);
DEPLOY_CPU_OPERATOR(SpaceToDepthGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(SpaceToDepthGradient);
DEPLOY_CUDA_OPERATOR(SpaceToDepthGradient);
#endif
OPERATOR_SCHEMA(SpaceToDepth)
......
......@@ -21,7 +21,8 @@ template <class Context>
class SpaceToDepthOp final : public Operator<Context> {
public:
SpaceToDepthOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), block_size_(OpArg<int>("block_size", 2)) {}
: Operator<Context>(def, ws),
block_size_(OP_SINGLE_ARG(int, "block_size", 2)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......
......@@ -54,7 +54,6 @@ from dragon.core.framework.workspace import get_workspace
from dragon.core.framework.workspace import reset_workspace
from dragon.core.ops import tensorbind_eager as _
from dragon.core.ops import tensorbind_symbol as _
from dragon.core.ops.array_ops import arange
from dragon.core.ops.array_ops import broadcast_to
from dragon.core.ops.array_ops import cast
from dragon.core.ops.array_ops import channel_normalize
......@@ -67,6 +66,7 @@ from dragon.core.ops.array_ops import masked_select
from dragon.core.ops.array_ops import nonzero
from dragon.core.ops.array_ops import one_hot
from dragon.core.ops.array_ops import pad
from dragon.core.ops.array_ops import range
from dragon.core.ops.array_ops import repeat
from dragon.core.ops.array_ops import reshape
from dragon.core.ops.array_ops import shape
......
......@@ -15,6 +15,7 @@ from __future__ import print_function as _print_function
from dragon.core.framework.config import set_random_seed as set_seed
from dragon.core.ops.array_ops import multinomial
from dragon.core.ops.array_ops import permutation
from dragon.core.ops.init_ops import glorot_normal
from dragon.core.ops.init_ops import glorot_uniform
from dragon.core.ops.init_ops import random_normal as normal
......
......@@ -31,22 +31,6 @@ def accuracy_spec(args, inputs, outputs):
return outputs
@register('Arange')
def arange_spec(args, inputs, outputs):
_ = locals()
outputs[0].dtype = args['dtype']
slice_args = args['slice']
if len(slice_args) == 2:
start, (stop, step) = 0, slice_args
else:
start, stop, step = slice_args
try:
outputs[0].shape = (int(math.ceil((stop - start) / step)),)
except TypeError:
pass
return outputs
@register(['ArgMax', 'ArgMin'])
def arg_reduce_spec(args, inputs, outputs):
outputs[0].dtype = 'int64'
......@@ -187,7 +171,6 @@ def concat_spec(args, inputs, outputs):
@register(['Conv2d', 'DepthwiseConv2d'])
def conv_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
out_shape = None
try:
out_shape = list(inputs[0].shape[:])
num_axes = len(out_shape) - 2
......@@ -221,7 +204,6 @@ def conv_spec(args, inputs, outputs):
@register('ConvTranspose2d')
def conv_transpose_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
out_shape = None
try:
out_shape = list(inputs[0].shape[:])
num_axes = len(out_shape) - 2
......@@ -286,6 +268,7 @@ def depth_to_space_spec(args, inputs, outputs):
@register('Dot')
def dot_spec(args, inputs, outputs):
_ = locals()
outputs[0].dtype = inputs[0].dtype
try:
a_shape, b_shape = inputs[0].shape[:], inputs[1].shape[:]
......@@ -605,6 +588,20 @@ def pad_spec(args, inputs, outputs):
return outputs
@register('Permutation')
def permutation_spec(args, inputs, outputs):
_ = locals()
outputs[0].dtype = args['dtype']
if len(inputs) == 1:
try:
outputs[0].shape = inputs[0].shape[:]
except TypeError:
pass
else:
outputs[0].shape = (args['limit'],)
return outputs
@register('Pool2d')
def pool_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
......@@ -643,6 +640,22 @@ def python_spec(args, inputs, outputs):
return outputs
@register('Range')
def range_spec(args, inputs, outputs):
_ = locals()
outputs[0].dtype = args['dtype']
slice_args = args['slice']
if len(slice_args) == 2:
start, (limit, delta) = 0, slice_args
else:
start, limit, delta = slice_args
try:
outputs[0].shape = (int(math.ceil((limit - start) / delta)),)
except TypeError:
pass
return outputs
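The spec above reuses NumPy's ``arange`` length formula. A minimal standalone sketch of the same computation (plain Python; the helper name is illustrative, not part of the codebase):

```python
import math

def infer_range_shape(start, limit=None, delta=1):
    # With no limit, the given start is treated as the limit (interval [0, start)).
    if limit is None:
        start, limit = 0, start
    return (int(math.ceil((limit - start) / delta)),)

print(infer_range_shape(5))            # (5,)
print(infer_range_shape(0, 5, 2))      # (3,)
print(infer_range_shape(0., 1., 0.2))  # (5,)
```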
@register([
'ReduceMax',
'ReduceMean',
......
......@@ -283,7 +283,6 @@ class Workspace(backend.Workspace):
"""Merge resources from the other.
The ``other`` will not be reset until ``self`` is reset.
Carefulness should be taken to associate with the workspaces.
Parameters
----------
......
......@@ -25,64 +25,6 @@ from dragon.core.ops.utils import parse_args
from dragon.core.util import nest
def arange(start, stop=None, step=1, dtype='int64', **kwargs):
r"""Return a tensor of evenly spaced values within a interval.
Specify ``start`` and ``stop`` to determine an interval:
```python
x = dragon.arange(2, 4) # [2, 3]
```
If ``stop`` is **None**, interval :math:`[0, start)` will be taken instead:
```python
x = dragon.arange(5) # [0, 1, 2, 3, 4]
```
Set ``step`` to make the strides:
```python
x = dragon.arange(5, step=2) # [0, 2, 4]
```
Parameters
----------
start : number
The start of interval.
stop : number, optional
The end of interval.
step : number, optional, default=1
The spacing between two elements.
dtype : str, optional, default='int64'
The optional data type.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = parse_args(locals())
args['dtype'] = args['dtype'].lower()
if stop is None:
args['slice'] = (float(start), float(step))
else:
args['slice'] = (float(start), float(stop), float(step))
args.pop('start')
args.pop('stop')
args.pop('step')
op_lib = array_ops_lib.Arange
trainable = args.pop('trainable') if 'trainable' in args else False
if context.executing_eagerly():
return op_lib.instantiate(
num_args=len(args['slice']),
dtype=dtype,
).apply(args['slice'], trainable=trainable)
else:
return op_lib.blend(**args)
@OpSchema.num_inputs(1)
def argmax(inputs, axis=None, keep_dims=False, **kwargs):
"""Compute the index of maximum elements along the given axis.
......@@ -1037,6 +979,98 @@ def pad(inputs, pads, mode='constant', value=0, **kwargs):
return op_lib.blend(**args)
def permutation(limit, dtype='int64', **kwargs):
r"""Return a tensor with value in the permuted range.
Specify ``limit`` to determine an interval :math:`[0, \text{limit})`:
```python
x = dragon.random.permutation(4)
```
Parameters
----------
limit : number
The end of interval.
dtype : str, optional, default='int64'
The optional data type.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = parse_args(locals())
args['dtype'] = args['dtype'].lower()
op_lib = array_ops_lib.Permutation
trainable = args.pop('trainable') if 'trainable' in args else False
if context.executing_eagerly():
return op_lib \
.instantiate(dtype=dtype) \
.apply(limit, trainable=trainable)
else:
return op_lib.blend(**args)
def range(start, limit=None, delta=1, dtype='int64', **kwargs):
r"""Return a tensor of evenly spaced values within a interval.
Specify ``start`` and ``limit`` to determine an interval:
```python
x = dragon.range(2, 4) # [2, 3]
```
If ``limit`` is **None**, the interval :math:`[0, \text{start})` is taken instead:
```python
x = dragon.range(5) # [0, 1, 2, 3, 4]
```
Set ``delta`` to make the strides:
```python
x = dragon.range(5, delta=2) # [0, 2, 4]
```
Parameters
----------
start : number
The start of interval.
limit : number, optional
The end of interval.
delta : number, optional, default=1
The spacing between two elements.
dtype : str, optional, default='int64'
The optional data type.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = parse_args(locals())
args['dtype'] = args['dtype'].lower()
if limit is None:
args['slice'] = (float(start), float(delta))
else:
args['slice'] = (float(start), float(limit), float(delta))
args.pop('start')
args.pop('limit')
args.pop('delta')
op_lib = array_ops_lib.Range
trainable = args.pop('trainable') if 'trainable' in args else False
if context.executing_eagerly():
return op_lib.instantiate(
num_args=len(args['slice']),
dtype=dtype,
).apply(args['slice'], trainable=trainable)
else:
return op_lib.blend(**args)
@OpSchema.num_inputs(1)
@ArgHelper.desc('repeats')
def repeat(inputs, axis=None, repeats=1, **kwargs):
......
......@@ -18,40 +18,6 @@ from dragon.core.framework import device_spec
from dragon.core.framework.ops import Operator
class Arange(Operator):
def __init__(self, key, dev, **kwargs):
super(Arange, self).__init__(key, dev, **kwargs)
self.num_args = kwargs.get('num_args', 3)
self.dtype = kwargs.get('dtype', 'int64')
def attributes(self):
return {
'op_type': 'Arange',
'arguments': {
'dtype': self.dtype,
'slice_descs': [
'${{HANDLE}}/slice[{}]'
.format(n) for n in range(self.num_args)],
}
}
def feed(self, ws, handle, slice_args):
for i in range(len(slice_args)):
self.feed_arg(
ws, '{}/slice[{}]'.format(handle, i),
slice_args[i], 'float32')
def forward(self, slice_args, trainable=False):
out = self.dispatch(
[], [self.alloc()],
callback=lambda ws, handle:
self.feed(ws, handle, slice_args),
no_grad=True,
)
out._requires_grad = trainable
return out
class ArgReduce(Operator):
def __init__(self, key, dev, **kwargs):
super(ArgReduce, self).__init__(key, dev, **kwargs)
......@@ -388,6 +354,68 @@ class Pad(Operator):
)
class Permutation(Operator):
def __init__(self, key, dev, **kwargs):
super(Permutation, self).__init__(key, dev, **kwargs)
self.dtype = kwargs.get('dtype', 'int64')
def attributes(self):
return {
'op_type': 'Permutation',
'arguments': {
'dtype': self.dtype,
'limit_desc': '${HANDLE}/limit',
}
}
def feed(self, ws, handle, limit):
self.feed_arg(ws, '{}/limit'.format(handle), limit, 'int64')
def forward(self, limit, trainable=False):
out = self.dispatch(
[], [self.alloc()],
callback=lambda ws, handle:
self.feed(ws, handle, limit),
no_grad=True,
)
out._requires_grad = trainable
return out
class Range(Operator):
def __init__(self, key, dev, **kwargs):
super(Range, self).__init__(key, dev, **kwargs)
self.num_args = kwargs.get('num_args', 3)
self.dtype = kwargs.get('dtype', 'int64')
def attributes(self):
return {
'op_type': 'Range',
'arguments': {
'dtype': self.dtype,
'slice_descs': [
'${{HANDLE}}/slice[{}]'
.format(n) for n in range(self.num_args)],
}
}
def feed(self, ws, handle, slice_args):
for i in range(len(slice_args)):
self.feed_arg(
ws, '{}/slice[{}]'.format(handle, i),
slice_args[i], 'float32')
def forward(self, slice_args, trainable=False):
out = self.dispatch(
[], [self.alloc()],
callback=lambda ws, handle:
self.feed(ws, handle, slice_args),
no_grad=True,
)
out._requires_grad = trainable
return out
class Reduce(Operator):
def __init__(self, key, dev, **kwargs):
super(Reduce, self).__init__(key, dev, **kwargs)
......
......@@ -168,8 +168,31 @@ def getitem(self, item):
"""
if isinstance(item, EagerTensor):
return _masked_select(self, item)
if item.dtype == 'bool' or item.dtype == 'uint8':
return _masked_select(self, item)
elif item.dtype == 'int64':
return _index_select(self, item, 0)
else:
raise TypeError('Unsupported index type: ' + item.dtype)
else:
if isinstance(item, tuple):
axis = None
for i, ele in enumerate(item):
if isinstance(ele, EagerTensor):
if ele.dtype == 'int64' and axis is None:
axis = i
else:
axis = None
break
elif isinstance(ele, slice):
if ele != slice(None, None, None):
axis = None
break
else:
axis = None
break
if axis is not None:
return _index_select(self, item[axis], axis)
starts, sizes = _process_index(item)
return _section_select(self, starts, sizes)
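With the branch added above, an ``int64`` index tensor routes through ``_index_select`` while boolean masks keep using ``_masked_select``. A quick usage sketch (``dragon.constant`` is assumed here only to build the eager tensors):

```python
import numpy as np
import dragon

x = dragon.constant(np.arange(6, dtype='float32').reshape((2, 3)))
index = dragon.constant(np.array([1, 0], dtype='int64'))

print(x[index])     # gathers rows 1 and 0 (index_select along axis 0)
print(x[:, index])  # gathers columns 1 and 0 (index_select along axis 1)
print(x[x > 2])     # boolean mask still dispatches to masked_select
```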
......@@ -665,21 +688,28 @@ def uniform(self, low=0, high=1):
def _binary_op(a, b, op_type, outputs=(None,)):
"""Apply the general binary operation."""
"""Apply the binary operation."""
return math_ops_lib.BinaryOp \
.instantiate(op_type=op_type) \
.apply(ops.remove_binary_scalar([a, b]), outputs)
def _index_select(x, index, axis):
"""Select elements according to the index."""
return array_ops_lib.IndexSelect \
.instantiate(axis=axis, num_axes=1) \
.apply([x, index])
def _masked_assign(ref, value, mask):
"""Apply the mask-assign operation."""
"""Assign value according to the mask."""
value = ops.scalar_to_tensor(value, ref.dtype)
return control_flow_ops_lib.MaskedAssign \
.instantiate().apply([ref, value, mask])
def _masked_select(x, mask):
"""Apply the mask-select operation."""
"""Select elements according to the mask."""
return array_ops_lib.MaskedSelect \
.instantiate().apply([x, mask])
......@@ -705,18 +735,15 @@ def _process_index(item):
sizes.append(ele.stop - starts[-1])
if sizes[-1] == 0:
raise ValueError(
'The starts and ends of axis {} '
'can not be equal, got {}:{}.'
.format(i, starts[-1], ele.stop))
'The starts and ends of axis {} can not be equal'
', got {}:{}.'.format(i, starts[-1], ele.stop))
if ele.step is not None:
raise NotImplementedError
elif isinstance(ele, int):
starts.append(ele)
sizes.append(0)
else:
raise TypeError(
'Unsupported type of index: {}'
.format(type(ele)))
raise TypeError('Unsupported index type: {}'.format(type(ele)))
return starts, sizes
......
......@@ -143,8 +143,31 @@ def getitem(self, item):
"""
if isinstance(item, Tensor):
return _masked_select(self, item)
if item.dtype == 'bool' or item.dtype == 'uint8':
return _masked_select(self, item)
elif item.dtype == 'int64':
return _index_select(self, item, 0)
else:
raise TypeError('Unsupported index type: ' + item.dtype)
else:
if isinstance(item, tuple):
axis = None
for i, ele in enumerate(item):
if isinstance(ele, Tensor):
if ele.dtype == 'int64' and axis is None:
axis = i
else:
axis = None
break
elif isinstance(ele, slice):
if ele != slice(None, None, None):
axis = None
break
else:
axis = None
break
if axis is not None:
return _index_select(self, item[axis], axis)
starts, sizes = _process_index(item)
return _section_select(self, starts, sizes)
......@@ -425,19 +448,24 @@ def sub(self, other):
def _binary_op(a, b, op_type):
"""Create the general binary operator."""
"""Apply the binary operator."""
a, b = ops.remove_binary_scalar([a, b])
return OpDef.apply(op_type, [a, b])
def _index_select(x, index, axis):
"""Select elements according to the index."""
return OpDef.apply('IndexSelect', [x, index], axis=axis, num_axes=1)
def _masked_assign(ref, value, mask):
"""Create the mask-assign operator."""
"""Assign value according to the mask."""
value = ops.scalar_to_tensor(value, ref.dtype)
return OpDef.apply('MaskedAssign', [value, mask], [ref])
def _masked_select(x, mask):
"""Create the mask-select operator."""
"""Select elements according to the mask."""
return OpDef.apply('MaskedSelect', [x, mask])
......@@ -462,18 +490,15 @@ def _process_index(item):
sizes.append(ele.stop - starts[-1])
if sizes[-1] == 0:
raise ValueError(
'The starts and ends of axis {} '
'can not be equal, got {}:{}.'
.format(i, starts[-1], ele.stop))
'The starts and ends of axis {} can not be equal'
', got {}:{}.'.format(i, starts[-1], ele.stop))
if ele.step is not None:
raise NotImplementedError
elif isinstance(ele, int):
starts.append(ele)
sizes.append(0)
else:
raise TypeError(
'Unsupported type of index: {}'
.format(type(ele)))
raise TypeError('Unsupported index type: {}'.format(type(ele)))
return starts, sizes
......
......@@ -156,8 +156,7 @@ def flatten_exporter(op_def, shape_dict, ws):
if arg.i != -1:
raise ValueError(
'Expected <num_axes> is -1, '
'got {}.'.format(arg.i)
)
'got {}.'.format(arg.i))
elif arg.name == 'keep_axes':
raise ValueError('<keep_axes> should not be set.')
return node, None
......
#ifndef DRAGON_UTILS_DEVICE_COMMON_THRUST_H_
#define DRAGON_UTILS_DEVICE_COMMON_THRUST_H_
#ifdef USE_CUDA
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#endif // USE_CUDA
#endif // DRAGON_UTILS_DEVICE_COMMON_THRUST_H_
......@@ -38,19 +38,32 @@ DRAGON_API void TruncatedNormal<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_RANDOM_FUNC(T) \
template <> \
DRAGON_API void Random<T, CPUContext>(const int n, T* y, CPUContext* ctx) { \
auto* rng = ctx->rand_generator(); \
for (int i = 0; i < n; ++i) { \
y[i] = static_cast<T>((*rng)()); \
} \
}
DEFINE_RANDOM_FUNC(uint32_t);
#undef DEFINE_RANDOM_FUNC
#define DEFINE_RANDOM_UNIFORM_FUNC(T, key) \
template <> \
DRAGON_API void RandomUniform<T, CPUContext>( \
const int n, const float low, const float high, T* y, CPUContext* ctx) { \
std::uniform_##key##_distribution<T> distribution(low, high); \
auto* rng = ctx->rand_generator(); \
for (int i = 0; i < n; ++i) \
for (int i = 0; i < n; ++i) { \
y[i] = distribution(*rng); \
} \
}
DEFINE_RANDOM_UNIFORM_FUNC(uint32_t, int);
DEFINE_RANDOM_UNIFORM_FUNC(float, real);
DEFINE_RANDOM_UNIFORM_FUNC(double, real);
#undef DEFINE_RANDOM_UNIFORM_FUNC
#define DEFINE_RANDOM_NORMAL_FUNC(T) \
template <> \
......
......@@ -10,16 +10,9 @@ namespace dragon {
namespace math {
template <>
DRAGON_API void RandomUniform<uint32_t, CUDAContext>(
const int n,
const float low,
const float high,
uint32_t* y,
CUDAContext* ctx) {
// Note that we ignore the low / high
// CuRand could only generates in the range of [0, uint32]
auto* rng = ctx->curand_generator();
CURAND_CHECK(curandGenerate(rng, y, n));
DRAGON_API void
Random<uint32_t, CUDAContext>(const int n, uint32_t* y, CUDAContext* ctx) {
CURAND_CHECK(curandGenerate(ctx->curand_generator(), y, n));
}
template <>
......@@ -83,8 +76,8 @@ DRAGON_API void RandomNormal<double, CUDAContext>(
const float sigma,
double* y,
CUDAContext* ctx) {
CURAND_CHECK(
curandGenerateNormalDouble(ctx->curand_generator(), y, n, mu, sigma));
auto* rng = ctx->curand_generator();
CURAND_CHECK(curandGenerateNormalDouble(rng, y, n, mu, sigma));
}
} // namespace math
......
......@@ -20,6 +20,9 @@ namespace dragon {
namespace math {
template <typename T, class Context>
DRAGON_API void Random(const int n, T* y, Context* ctx);
template <typename T, class Context>
DRAGON_API void RandomUniform(
const int n,
const float low,
......
......@@ -38,7 +38,7 @@ void Dropout(
const T* x,
uint8_t* mask,
T* y,
uint32_t* scratch,
uint32_t* r,
Context* ctx);
/* activation.drop_block */
......@@ -54,7 +54,7 @@ void DropBlock2d(
const int block_size,
const float gamma,
const string& data_format,
uint32_t* seed,
uint32_t* r,
int* mask,
Context* ctx);
......@@ -209,16 +209,6 @@ void Tanh(const int count, const T* x, T* y, Context* ctx);
template <typename T, class Context>
void TanhGrad(const int count, const T* dy, const T* y, T* dx, Context* ctx);
/* array.arange */
template <typename T, class Context>
void Arange(
const int count,
const float start,
const float step,
T* y,
Context* ctx);
/* array.argmax */
template <typename T, class Context>
......@@ -322,7 +312,7 @@ void IndexSelectGrad(
const int inner_dim,
const int axis_dim,
const int num_indices,
const int64_t* indices,
const int64_t* index,
const T* dy,
T* dx,
Context* ctx);
......@@ -414,6 +404,21 @@ void OneHot(
T* y,
Context* ctx);
/* array.permutation */
template <typename T, class Context>
void Permutation(const int count, T* y, uint32_t* r, Context* ctx);
/* array.range */
template <typename T, class Context>
void Range(
const int count,
const float start,
const float delta,
T* y,
Context* ctx);
/* array.reduce */
template <typename T, class Context>
......
......@@ -23,6 +23,8 @@ from dragon.core.framework import workspace
from dragon.core.util import logging
from dragon.core.util import six
PICKLE_DEFAULT_PROTOCOL = 2
def load_weights_from_pickle(f, layer, verbose=False):
ws = workspace.get_workspace()
......@@ -64,4 +66,4 @@ def save_weights_to_pickle(f, layer):
if weight_impl is not None:
weight_dict[weight.name] = weight_impl.ToNumpy(True)
pickle = six.moves.pickle
pickle.dump(weight_dict, f, pickle.HIGHEST_PROTOCOL)
pickle.dump(weight_dict, f, PICKLE_DEFAULT_PROTOCOL)
......@@ -886,10 +886,10 @@ def range(start, limit=None, delta=1, dtype='int64', name=None):
The output tensor.
"""
return array_ops.arange(
return array_ops.range(
start=start,
stop=limit,
step=delta,
limit=limit,
delta=delta,
dtype=dtype,
name=name,
)
......
......@@ -27,6 +27,8 @@ from dragon.core.framework import workspace
from dragon.core.util import nest
from dragon.core.util import six
PICKLE_DEFAULT_PROTOCOL = 2
def assign_weights(value_list, module):
"""Assign the value to the module weights.
......@@ -220,10 +222,7 @@ def save_pkl_dict(save_list, name):
raise ValueError('Input[%d] does not have <name> attribute.')
save_dict[input.name] = _get_value(input)
with open(name, 'wb') as f:
six.moves.pickle.dump(
save_dict, f,
six.moves.pickle.HIGHEST_PROTOCOL,
)
six.moves.pickle.dump(save_dict, f, PICKLE_DEFAULT_PROTOCOL)
def save_weights_to_hdf5(filepath, module):
......
......@@ -398,23 +398,6 @@ class TestActivationOps(OpTestCase):
class TestArrayOps(OpTestCase):
"""Test the array ops."""
def test_arange(self):
entries = [([5], {'dtype': 'int64'}),
([0, 5], {'dtype': 'int64'}),
([0, 5, 2], {'dtype': 'int64'}),
([0., 1., 0.2], {'dtype': 'float32'})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for (args, kwargs) in entries:
data = np.arange(*args, **kwargs)
x = dragon.arange(*args, **kwargs)
self.assertEqual(x, data)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_arange_cuda(self):
with dragon.device('cuda'):
self.test_arange()
def test_broadcast_to(self):
entries = [((2, 2, 3, 1), (0, True)),
((1, 2, 3, 2), (3, True)),
......@@ -690,6 +673,37 @@ class TestArrayOps(OpTestCase):
with dragon.device('cuda'):
self.test_pad()
def test_permutation(self):
entries = [([4], {'dtype': 'int64'}),
([4], {'dtype': 'float32'})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for (args, kwargs) in entries:
x = dragon.random.permutation(*args, **kwargs)
self.assertEqual(x.shape, (4,))
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_permutation_cuda(self):
with dragon.device('cuda'):
self.test_permutation()
def test_range(self):
entries = [([5], {'dtype': 'int64'}),
([0, 5], {'dtype': 'int64'}),
([0, 5, 2], {'dtype': 'int64'}),
([0., 1., 0.2], {'dtype': 'float32'})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for (args, kwargs) in entries:
data = np.arange(*args, **kwargs)
x = dragon.range(*args, **kwargs)
self.assertEqual(x, data)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_range_cuda(self):
with dragon.device('cuda'):
self.test_range()
def test_repeat(self):
entries = [(None, 2), (1, 2)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
......@@ -2719,21 +2733,30 @@ class TestTensorOps(OpTestCase):
def test_getitem(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data = arange((2, 3))
x = new_tensor(data)
self.assertEqual(x[x > 2], data[data > 2], test_symbols=False)
data1, data2 = arange((2, 3)), arange((2,), dtype='int64')
x, index = new_tensor(data1), new_tensor(data2)
self.assertEqual(x[x > 2], data1[data1 > 2], test_symbols=False)
entries = [0,
slice(None, None, None),
slice(0, None, None),
slice(0, 0, None),
slice(0, 1, None),
slice(0, 1, 1),
data,
(data, data)]
slice(0, 1, 1)]
for item in entries:
try:
self.assertEqual(x.__getitem__(item), data.__getitem__(item))
except (NotImplementedError, ValueError, TypeError):
self.assertEqual(x.__getitem__(item), data1.__getitem__(item))
except (NotImplementedError, ValueError):
pass
self.assertEqual(x[index], data1[data2])
self.assertEqual(x[:, index], data1[:, data2])
entries = [x,
(slice(1, None, None), index),
(1, index),
(index, index)]
for item in entries:
try:
x.__getitem__(item)
except TypeError:
pass
def test_glorot_normal(self):
......
......@@ -210,6 +210,13 @@ class TestTensorOps(OpTestCase):
data.fill(value)
self.assertEqual(x, data)
def test_flatten(self):
data = arange((1, 2, 3))
x = new_tensor(data)
self.assertEqual(x.flatten(), data.flatten())
x.flatten_(-3, -2)
self.assertEqual(x, data.reshape((2, 3)))
def test_floor(self):
data = np.array([0.9, 1.4, 1.9])
x = new_tensor(data)
......@@ -218,21 +225,30 @@ class TestTensorOps(OpTestCase):
self.assertEqual(x, np.floor(data))
def test_getitem(self):
data = arange((2, 3))
x = new_tensor(data)
self.assertEqual(x[x > 2], data[data > 2])
data1, data2 = arange((2, 3)), arange((2,), dtype='int64')
x, index = new_tensor(data1), new_tensor(data2)
self.assertEqual(x[x > 2], data1[data1 > 2])
entries = [0,
slice(None, None, None),
slice(0, None, None),
slice(0, 0, None),
slice(0, 1, None),
slice(0, 1, 1),
data,
(data, data)]
slice(0, 1, 1)]
for item in entries:
try:
self.assertEqual(x.__getitem__(item), data.__getitem__(item))
except (NotImplementedError, ValueError, TypeError):
self.assertEqual(x.__getitem__(item), data1.__getitem__(item))
except (NotImplementedError, ValueError):
pass
self.assertEqual(x[index], data1[data2])
self.assertEqual(x[:, index], data1[:, data2])
entries = [x,
(slice(1, None, None), index),
(1, index),
(index, index)]
for item in entries:
try:
x.__getitem__(item)
except TypeError:
pass
def test_greater(self):
......@@ -522,6 +538,7 @@ class TestTensorOps(OpTestCase):
getattr(x, name + '_')()
self.assertEqual(x, data.astype(dtype))
x.type(dtype)
self.assertEqual(x.type(), dtype)
def test_uniform(self):
data = arange((2, 3))
......@@ -561,6 +578,9 @@ class TestTorchOps(OpTestCase):
def test_randn(self):
self.assertEqual(torch.randn(2, 3).shape, (2, 3))
def test_randperm(self):
self.assertEqual(torch.randperm(4).shape, (4,))
def test_zeros_like(self):
data = np.zeros((2, 3), dtype='float32')
x = new_tensor(data)
......
......@@ -54,6 +54,7 @@ from dragon.vm.torch.core.ops.array.functional import channel_normalize
from dragon.vm.torch.core.ops.array.functional import channel_shuffle
from dragon.vm.torch.core.ops.array.functional import chunk
from dragon.vm.torch.core.ops.array.functional import cumsum
from dragon.vm.torch.core.ops.array.functional import flatten
from dragon.vm.torch.core.ops.array.functional import index_select
from dragon.vm.torch.core.ops.array.functional import masked_select
from dragon.vm.torch.core.ops.array.functional import masked_fill
......@@ -80,6 +81,7 @@ from dragon.vm.torch.core.ops.init.functional import ones
from dragon.vm.torch.core.ops.init.functional import ones_like
from dragon.vm.torch.core.ops.init.functional import rand
from dragon.vm.torch.core.ops.init.functional import randn
from dragon.vm.torch.core.ops.init.functional import randperm
from dragon.vm.torch.core.ops.init.functional import zeros
from dragon.vm.torch.core.ops.init.functional import zeros_like
from dragon.vm.torch.core.ops.math.functional import abs
......
......@@ -44,6 +44,7 @@ from dragon.vm.torch.core.nn.modules.conv import DepthwiseConv2d
from dragon.vm.torch.core.nn.modules.dropout import DropBlock2d
from dragon.vm.torch.core.nn.modules.dropout import Dropout
from dragon.vm.torch.core.nn.modules.dropout import DropPath
from dragon.vm.torch.core.nn.modules.flatten import Flatten
from dragon.vm.torch.core.nn.modules.linear import Linear
from dragon.vm.torch.core.nn.modules.loss import CTCLoss
from dragon.vm.torch.core.nn.modules.loss import BCEWithLogitsLoss
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Flatten modules."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.core.nn.modules.module import Module
class Flatten(Module):
"""Flatten the dimensions of input.
Examples:
```python
m = torch.nn.Flatten()
x = torch.ones(1, 2, 4, 4)
y = m(x)
print(y.size()) # (1, 32)
```
See Also
--------
`torch.flatten(...)`_
"""
def __init__(self, start_dim=1, end_dim=-1):
"""Create a ``Flatten`` module.
Parameters
----------
start_dim : int, optional, default=1
The start dimension to flatten.
end_dim : int, optional, default=-1
The end dimension to flatten.
"""
super(Flatten, self).__init__()
self.start_dim = start_dim
self.end_dim = end_dim
def extra_repr(self):
return 'start_dim={}, end_dim={}' \
.format(self.start_dim, self.end_dim)
def forward(self, input):
return input.flatten(self.start_dim, self.end_dim)
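As a follow-up to the docstring example, a non-default range keeps the leading dimensions intact (a small sketch; shapes are illustrative):

```python
from dragon.vm import torch

m = torch.nn.Flatten(start_dim=2)  # keep the batch and channel dimensions
x = torch.ones(1, 2, 4, 4)
print(m(x).size())                 # (1, 2, 16)
```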
......@@ -216,6 +216,25 @@ class Expand(function.Function):
)
class Flatten(function.Function):
def __init__(self, key, dev, **kwargs):
super(Flatten, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 0)
self.num_axes = kwargs.get('num_axes', -1)
def attributes(self):
return {
'op_type': 'Flatten',
'arguments': {
'axis': self.axis,
'num_axes': self.num_axes,
}
}
def forward(self, input, out=None):
return self.dispatch([input], [self.alloc(out)])
class IndexSelect(function.Function):
def __init__(self, key, dev, **kwargs):
super(IndexSelect, self).__init__(key, dev, **kwargs)
......
......@@ -290,8 +290,8 @@ def cumsum(input, dim, out=None):
```python
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
# A negative axis is the last-k axis
print(torch.cumsum(x, 1)) # [[1, 3, 6], [4, 9, 15]]
# A negative dimension is the last-k dimension
print(torch.cumsum(x, 1)) # [[1, 3, 6], [4, 9, 15]]
print(torch.cumsum(x, -1)) # Equivalent
```
......@@ -340,6 +340,52 @@ def expand(input, sizes):
.apply(input, sizes)
def flatten(input, start_dim=0, end_dim=-1, out=None):
"""Return a tensor with dimensions flattened.
The arguments ``start_dim`` and ``end_dim`` could be negative:
```python
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
# A negative dimension is the last-k dimension
print(torch.flatten(x, start_dim=0, end_dim=-1))
print(torch.flatten(x, start_dim=0, end_dim=1)) # Equivalent
```
Parameters
----------
input : torch.Tensor
The input tensor.
start_dim : int, optional, default=0
The start dimension to flatten.
end_dim : int, optional, default=-1
The end dimension to flatten.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
if end_dim == -1:
num_axes = -1
else:
while end_dim < 0:
end_dim += input.ndimension()
while start_dim < 0:
start_dim += input.ndimension()
num_axes = end_dim - start_dim + 1
return _functions.Flatten \
.instantiate(
input.device,
axis=start_dim,
num_axes=num_axes,
).apply(input, out)
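The branch above translates the inclusive ``[start_dim, end_dim]`` range into the backend's ``axis``/``num_axes`` pair; a standalone sketch of that mapping (helper name is illustrative):

```python
def to_axis_and_num_axes(start_dim, end_dim, ndim):
    # Mirrors the normalization above: end_dim == -1 keeps num_axes open-ended.
    if end_dim == -1:
        return start_dim, -1
    while end_dim < 0:
        end_dim += ndim
    while start_dim < 0:
        start_dim += ndim
    return start_dim, end_dim - start_dim + 1

print(to_axis_and_num_axes(0, -1, 4))   # (0, -1) -> flatten everything
print(to_axis_and_num_axes(1, -2, 4))   # (1, 2)  -> flatten dims 1..2
print(to_axis_and_num_axes(-3, -2, 4))  # (1, 2)  -> same range, negative form
```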
def index_select(input, dim, index, out=None):
"""Select the elements along the given dim using index.
......
......@@ -38,38 +38,6 @@ class _Initializer(function.Function):
)
class Arange(function.Function):
def __init__(self, key, dev, **kwargs):
super(Arange, self).__init__(key, dev, **kwargs)
self.num_args = kwargs.get('num_args', 3)
self.dtype = kwargs.get('dtype', 'int64')
def attributes(self):
return {
'op_type': 'Arange',
'arguments': {
'dtype': self.dtype,
'slice_descs': [
'${{HANDLE}}/slice[{}]'
.format(n) for n in range(self.num_args)],
}
}
def feed(self, ws, handle, slice_args):
for i in range(len(slice_args)):
self.feed_arg(
ws, '{}/slice[{}]'.format(handle, i),
slice_args[i], 'float32')
def forward(self, slice_args, out=None):
return self.dispatch(
[], [self.alloc(out)],
callback=lambda ws, handle:
self.feed(ws, handle, slice_args),
no_grad=True,
)
class Eye(_Initializer):
def __init__(self, key, dev, **kwargs):
super(Eye, self).__init__(key, dev, **kwargs)
......@@ -106,6 +74,32 @@ class Fill(_Initializer):
}
class Permutation(function.Function):
def __init__(self, key, dev, **kwargs):
super(Permutation, self).__init__(key, dev, **kwargs)
self.dtype = kwargs.get('dtype', 'int64')
def attributes(self):
return {
'op_type': 'Permutation',
'arguments': {
'dtype': self.dtype,
'limit_desc': '${HANDLE}/limit',
}
}
def feed(self, ws, handle, limit):
self.feed_arg(ws, '{}/limit'.format(handle), limit, 'int64')
def forward(self, limit, out=None):
return self.dispatch(
[], [self.alloc(out)],
callback=lambda ws, handle:
self.feed(ws, handle, limit),
no_grad=True,
)
class RandomNormal(_Initializer):
def __init__(self, key, dev, **kwargs):
super(RandomNormal, self).__init__(key, dev, **kwargs)
......@@ -144,3 +138,35 @@ class RandomUniform(_Initializer):
for n in range(self.ndim)],
},
}
class Range(function.Function):
def __init__(self, key, dev, **kwargs):
super(Range, self).__init__(key, dev, **kwargs)
self.num_args = kwargs.get('num_args', 3)
self.dtype = kwargs.get('dtype', 'int64')
def attributes(self):
return {
'op_type': 'Range',
'arguments': {
'dtype': self.dtype,
'slice_descs': [
'${{HANDLE}}/slice[{}]'
.format(n) for n in range(self.num_args)],
}
}
def feed(self, ws, handle, slice_args):
for i in range(len(slice_args)):
self.feed_arg(
ws, '{}/slice[{}]'.format(handle, i),
slice_args[i], 'float32')
def forward(self, slice_args, out=None):
return self.dispatch(
[], [self.alloc(out)],
callback=lambda ws, handle:
self.feed(ws, handle, slice_args),
no_grad=True,
)
......@@ -75,7 +75,7 @@ def arange(
slice_args = start, step
else:
slice_args = start, end, step
out = _functions.Arange \
out = _functions.Range \
.instantiate(
device if device else cpp.device(),
num_args=len(slice_args),
......@@ -267,6 +267,40 @@ def randn(*size, **kwargs):
return normal_fill(out, 0, 1)
def randperm(n, out=None, dtype='int64', device=None, requires_grad=False):
"""Return a tensor with value in the permuted range.
Specify ``n`` to determine an interval :math:`[0, n)`:
```python
print(torch.randperm(4))
```
Parameters
----------
n : number
The end of interval.
out : dragon.vm.torch.Tensor, optional
The optional output tensor.
dtype : str, optional, default='int64'
The optional data type.
device : dragon.vm.torch.device, optional
The optional device of returned tensor.
requires_grad : bool, optional, default=False
**True** to record gradient for returned tensor.
Returns
-------
dragon.Tensor
The output tensor.
"""
out = out if out else utils.new_leaf([n], locals())
return _functions.Permutation \
.instantiate(out.device, dtype=out.dtype) \
.apply(n, out)
def uniform_fill(input, low=0, high=1):
"""Fill input from the uniform distribution."""
shape = input.shape
......
......@@ -590,6 +590,52 @@ def fill_(self, value):
return init_funcs.fill(self, self.shape, value)
def flatten(self, start_dim=0, end_dim=-1):
"""Return a new tensor with dimensions flattened.
Parameters
----------
start_dim : int, optional, default=0
The start dimension to flatten.
end_dim : int, optional, default=-1
The end dimension to flatten.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.flatten(...)`_
"""
return array_funcs.flatten(self, start_dim, end_dim)
def flatten_(self, start_dim=0, end_dim=-1):
"""Flatten the dimensions.
Parameters
----------
start_dim : int, optional, default=0
The start dimension to flatten.
end_dim : int, optional, default=-1
The end dimension to flatten.
Returns
-------
dragon.vm.torch.Tensor
The self.
See Also
--------
`torch.flatten(...)`_
"""
return array_funcs.flatten(self, start_dim, end_dim, self)
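A brief usage sketch of the two methods above (shapes are illustrative):

```python
from dragon.vm import torch

x = torch.ones(2, 3, 4)
print(x.flatten().shape)      # (24,)
print(x.flatten(1, 2).shape)  # (2, 12)
x.flatten_(1, 2)              # in-place variant returns self
print(x.shape)                # (2, 12)
```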
def float(self):
"""Return a float32 tensor with the same data.
......@@ -688,8 +734,31 @@ def getitem(self, item):
"""
if isinstance(item, Tensor):
return self.masked_select(item)
if item.dtype == 'bool' or item.dtype == 'uint8':
return self.masked_select(item)
elif item.dtype == 'int64':
return self.index_select(0, item)
else:
raise TypeError('Unsupported index type: ' + item.dtype)
else:
if isinstance(item, tuple):
dim = None
for i, ele in enumerate(item):
if isinstance(ele, Tensor):
if ele.dtype == 'int64' and dim is None:
dim = i
else:
dim = None
break
elif isinstance(ele, slice):
if ele != slice(None, None, None):
dim = None
break
else:
dim = None
break
if dim is not None:
return self.index_select(dim, item[dim])
starts, sizes = _process_index(item)
return array_funcs.slice(self, starts, sizes)
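Mirroring the eager and symbolic changes earlier in this commit, the same index-tensor path now applies to ``vm.torch`` tensors; a hedged sketch using ``from_numpy`` for construction:

```python
import numpy as np
from dragon.vm import torch

x = torch.from_numpy(np.arange(6, dtype='float32').reshape((2, 3)))
index = torch.from_numpy(np.array([1, 0], dtype='int64'))

print(x[:, index])  # columns 1 and 0 via index_select along dim 1
# Mixing an integer with an index tensor, e.g. x[1, index], is not
# handled by this path and raises TypeError.
```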
......@@ -1618,7 +1687,7 @@ def topk(self, k, dim=None, largest=True, sorted=True):
return array_funcs.topk(self, k, dim, largest, sorted)
def type(self, dtype=None):
def _type(self, dtype=None):
"""Return the data type.
If ``dtype`` is not **None**, cast ``self`` to the new tensor.
......@@ -1753,18 +1822,15 @@ def _process_index(item):
sizes.append(ele.stop - starts[-1])
if sizes[-1] == 0:
raise ValueError(
'The starts and ends of axis {} '
'can not be equal, got {}:{}.'
.format(i, starts[-1], ele.stop))
'The starts and ends of dim {} can not be equal'
', got {}:{}.'.format(i, starts[-1], ele.stop))
if ele.step is not None:
raise NotImplementedError
elif isinstance(ele, int):
starts.append(ele)
sizes.append(0)
else:
raise TypeError(
'Unsupported type of index: {}'
.format(type(ele)))
raise TypeError('Unsupported index type: {}'.format(type(ele)))
return starts, sizes
......@@ -1800,6 +1866,8 @@ Tensor.eq = eq
Tensor.exp = exp
Tensor.expand = expand
Tensor.fill_ = fill_
Tensor.flatten = flatten
Tensor.flatten_ = flatten_
Tensor.float = float
Tensor.float_ = float_
Tensor.floor = floor
......@@ -1852,7 +1920,7 @@ Tensor.sum = sum
Tensor.sub = sub
Tensor.sub_ = sub_
Tensor.topk = topk
Tensor.type = type
Tensor.type = _type
Tensor.uniform_ = uniform_
Tensor.unsqueeze = unsqueeze
Tensor.unsqueeze_ = unsqueeze_
......
......@@ -23,7 +23,7 @@ import sys
from dragon.core.util import six
PICKLE_MODULE = six.moves.pickle
DEFAULT_PROTOCOL = PICKLE_MODULE.HIGHEST_PROTOCOL
DEFAULT_PROTOCOL = 2
def save(obj, f, pickle_module=PICKLE_MODULE, pickle_protocol=DEFAULT_PROTOCOL):
......
......@@ -603,12 +603,12 @@ class Tensor(object):
return self
def cuda(self, device=None):
"""Switch the internal storage on cuda memory.
"""Copy memory to the specified cuda device.
Parameters
----------
device : int, optional
The device index.
device : Union[int, dragon.vm.torch.device], optional
The device to copy to.
Returns
-------
......@@ -619,6 +619,10 @@ class Tensor(object):
if device is None:
cfg = config.config()
device = cfg.device_index
if isinstance(device, cpp.device):
if device.type != 'cuda':
raise ValueError('Expected cuda device, got: ' + device.type)
device = device.index
self._impl.ToCUDA(device)
self._device.type, self._device.index = 'cuda', device
return self
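With the check above, ``cuda()`` accepts either a device index or a cuda device object. A minimal sketch (requires a CUDA build; the two-argument ``torch.device('cuda', 0)`` form is assumed here):

```python
from dragon.vm import torch

x = torch.ones(2, 3)
x.cuda(0)                        # device index, as before
x.cuda(torch.device('cuda', 0))  # a cuda device object is now accepted as well
```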
......@@ -811,6 +815,48 @@ class Tensor(object):
"""
def flatten(self, start_dim=0, end_dim=-1):
"""Return a new tensor with dimensions flattened.
Parameters
----------
start_dim : int, optional, default=0
The start dimension to flatten.
end_dim : int, optional, default=-1
The end dimension to flatten.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.flatten(...)`_
"""
def flatten_(self, start_dim=0, end_dim=-1):
"""Flatten the dimensions.
Parameters
----------
start_dim : int, optional, default=0
The start dimension to flatten.
end_dim : int, optional, default=-1
The end dimension to flatten.
Returns
-------
dragon.vm.torch.Tensor
The self.
See Also
--------
`torch.flatten(...)`_
"""
def float(self):
"""Return a float32 tensor with the same data.
......