Commit adb6fa64 by Ting PAN

Add native ops test

Summary:
This commit tests the execution of native ops and verifies the results.
Several bugs were found and fixed based on these tests.
1 parent df172cc8
Showing with 2594 additions and 3027 deletions
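
For context, a minimal, hypothetical sketch of the kind of check such a native-op test performs; the module path, the add() signature, and the NumPy conversion are assumptions, not the actual test added by this commit:

import unittest

import numpy as np
import dragon

class TestNativeOps(unittest.TestCase):
    """Hypothetical sketch: run an eager op and compare against NumPy."""

    def test_add(self):
        a = np.array([1., 2., 3.], dtype='float32')
        b = np.array([4., 5., 6.], dtype='float32')
        # Assumption: dragon.math.add takes a sequence of two inputs in
        # eager mode and the result converts to a NumPy array.
        y = dragon.math.add([a, b])
        self.assertTrue(np.allclose(np.asarray(y), a + b))

if __name__ == '__main__':
    unittest.main()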
......@@ -9,13 +9,13 @@
#
# ------------------------------------------------------------
"""Implementation for the ``Layer`` C++ class."""
"""The base layer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.autograph.tensor import RefTensor
from dragon.core.autograph.tensor import TensorRef
from dragon.core.eager import context as eager_context
from dragon.core.framework import context
......@@ -76,8 +76,8 @@ class Layer(object):
param_name = scoped_name + '/param:{}'.format(len(self._blobs))
# Set the name explicitly.
variable = RefTensor(param_name)
variable_grad = RefTensor(param_name + '_grad')
variable = TensorRef(param_name)
variable_grad = TensorRef(param_name + '_grad')
if filler is not None:
variable._register_as(**filler)
......
......@@ -455,8 +455,8 @@ class InnerProduct(Layer):
param = layer_param.inner_product_param
self.arguments = {
'axis': param.axis,
'num_output': param.num_output,
'transW': not param.transpose,
'out_channels': param.num_output,
'transpose_w': not param.transpose,
}
# Add weights and biases
self.add_blob(filler=self.get_filler(param, 'weight_filler'))
......@@ -522,7 +522,7 @@ class Normalize(Layer):
normalize_param {
across_spatial: false
channel_shared: false
eps: 1e-5
eps: 1e-12
scale_filler: {
type: "constant"
value: 1
......@@ -548,7 +548,7 @@ class Normalize(Layer):
self.add_blob(filler=self.get_filler(param, 'scale_filler'), value=1)
def __call__(self, bottom):
norm_out = [normalization_ops.l2_normalize(bottom, **self.l2norm_arguments)]
norm_out = [normalization_ops.lp_normalize(bottom, **self.l2norm_arguments)]
norm_out += [blob['data'] for blob in self._blobs]
return math_ops.affine(norm_out, **self.affine_arguments)
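
For reference, a small NumPy sketch of the L2 normalization the layer now requests via lp_normalize, assuming the op clamps the norm with eps (the new default 1e-12) rather than adding it under the square root:

import numpy as np

def l2_normalize_ref(x, axis=1, eps=1e-12):
    # x / max(||x||_2, eps) along `axis`; eps matches the new
    # NormalizeParameter default of 1e-12. Whether the op clamps with
    # eps or adds it under the sqrt is an assumption here.
    norm = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True))
    return x / np.maximum(norm, eps)

x = np.random.rand(2, 8, 4, 4).astype('float32')
y = l2_normalize_ref(x, axis=1)  # across_spatial: false -> per-position channel norm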
......
......@@ -65,7 +65,7 @@ class Convolution(Layer):
super(Convolution, self).__init__(layer_param)
param = layer_param.convolution_param
self.arguments = {
'num_output': param.num_output,
'out_channels': param.num_output,
'kernel_shape': [int(e) for e in param.kernel_size],
'strides': [int(e) for e in param.stride] if len(param.stride) > 0 else [1],
'pads': [int(e) for e in param.pad] if len(param.pad) > 0 else [0],
......@@ -187,7 +187,7 @@ class DepthwiseConv2d(Layer):
super(DepthwiseConv2d, self).__init__(layer_param)
param = layer_param.convolution_param
self.arguments = {
'num_output': param.num_output,
'out_channels': param.num_output,
'kernel_shape': [int(e) for e in param.kernel_size],
'strides': [int(e) for e in param.stride] if len(param.stride) > 0 else [1],
'pads': [int(e) for e in param.pad] if len(param.pad) > 0 else [0],
......
......@@ -9,7 +9,7 @@
#
# ------------------------------------------------------------
"""Implementation for the ``Net`` C++ class."""
"""The base net class."""
from __future__ import absolute_import
from __future__ import division
......@@ -20,8 +20,8 @@ from google.protobuf import text_format
from dragon.core.autograph import def_function
from dragon.core.autograph import grad_impl
from dragon.core.autograph.tensor import RefTensor
from dragon.core.autograph.tensor import Tensor
from dragon.core.autograph.tensor import TensorRef
from dragon.core.framework import workspace
from dragon.core.util import nest
from dragon.vm.caffe import layers as layer_factory
......@@ -84,17 +84,13 @@ class Net(object):
if len(self._net_proto.input) > 0:
shapes = self._net_proto.input_shape
for i, input in enumerate(self._net_proto.input):
for i, input_name in enumerate(self._net_proto.input):
shape = [e for e in shapes[i].dim] if i < len(shapes) else None
if input not in self._blobs:
data = Tensor(input, shape=shape, dtype='float32').placeholder()
self._blobs[input] = {
data = Tensor(input_name, shape, 'float32').placeholder()
self._blobs[input_name] = {
'data': data,
'diff': RefTensor(
data.id + '_grad',
shape=shape,
dtype=data.dtype
),
'diff': TensorRef(data.id + '_grad', shape, data.dtype),
}
for layer in self._net_proto.layer:
......@@ -145,7 +141,7 @@ class Net(object):
for i, blob in enumerate(layer._top):
self._blobs[blob] = {
'data': outputs[i],
'diff': RefTensor(outputs[i].id + '_grad'),
'diff': TensorRef(outputs[i].id + '_grad'),
}
self._net_outputs.add(blob)
......
......@@ -3,29 +3,25 @@ syntax = "proto2";
package caffe;
// Specifies the shape (dimensions) of a Blob.
message BlobShape {
repeated int64 dim = 1 [packed = true];
}
message BlobShape { repeated int64 dim = 1 [ packed = true ]; }
message BlobProto {
optional BlobShape shape = 7;
repeated float data = 5 [packed = true];
repeated float diff = 6 [packed = true];
repeated double double_data = 8 [packed = true];
repeated double double_diff = 9 [packed = true];
repeated float data = 5 [ packed = true ];
repeated float diff = 6 [ packed = true ];
repeated double double_data = 8 [ packed = true ];
repeated double double_diff = 9 [ packed = true ];
// 4D dimensions -- deprecated. Use "shape" instead.
optional int32 num = 1 [default = 0];
optional int32 channels = 2 [default = 0];
optional int32 height = 3 [default = 0];
optional int32 width = 4 [default = 0];
optional int32 num = 1 [ default = 0 ];
optional int32 channels = 2 [ default = 0 ];
optional int32 height = 3 [ default = 0 ];
optional int32 width = 4 [ default = 0 ];
}
// The BlobProtoVector is simply a way to pass multiple blobproto instances
// around.
message BlobProtoVector {
repeated BlobProto blobs = 1;
}
message BlobProtoVector { repeated BlobProto blobs = 1; }
message Datum {
optional int32 channels = 1;
......@@ -37,21 +33,21 @@ message Datum {
// Optionally, the datum could also hold float data.
repeated float float_data = 6;
// If true data contains an encoded image that need to be decoded
optional bool encoded = 7 [default = false];
optional bool encoded = 7 [ default = false ];
repeated int32 labels = 8;
}
message FillerParameter {
// The filler type.
optional string type = 1 [default = 'constant'];
optional float value = 2 [default = 0]; // the value in constant filler
optional float min = 3 [default = 0]; // the min value in uniform filler
optional float max = 4 [default = 1]; // the max value in uniform filler
optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
optional float std = 6 [default = 1]; // the std value in Gaussian filler
optional string type = 1 [ default = 'constant' ];
optional float value = 2 [ default = 0 ]; // the value in constant filler
optional float min = 3 [ default = 0 ]; // the min value in uniform filler
optional float max = 4 [ default = 1 ]; // the max value in uniform filler
optional float mean = 5 [ default = 0 ]; // the mean value in Gaussian filler
optional float std = 6 [ default = 1 ]; // the std value in Gaussian filler
// The expected number of non-zero output weights for a given input in
// Gaussian filler -- the default -1 means don't perform sparsification.
optional int32 sparse = 7 [default = -1];
optional int32 sparse = 7 [ default = -1 ];
// Normalize the filler variance by fan_in, fan_out, or their average.
// Applies to 'xavier' and 'msra' fillers.
enum VarianceNorm {
......@@ -59,7 +55,7 @@ message FillerParameter {
FAN_OUT = 1;
AVERAGE = 2;
}
optional VarianceNorm variance_norm = 8 [default = FAN_IN];
optional VarianceNorm variance_norm = 8 [ default = FAN_IN ];
}
message NetParameter {
......@@ -78,7 +74,7 @@ message NetParameter {
// Whether the network will force every layer to carry out backward operation.
// If set False, then whether to carry out backward is determined
// automatically according to the net structure and learning rates.
optional bool force_backward = 5 [default = false];
optional bool force_backward = 5 [ default = false ];
// The current "state" of the network, including the phase, level, and stage.
// Some layers may be included/excluded depending on this state and the states
// specified in the layers' include and exclude fields.
......@@ -86,11 +82,11 @@ message NetParameter {
// Print debugging information about results while running Net::Forward,
// Net::Backward, and Net::Update.
optional bool debug_info = 7 [default = false];
optional bool debug_info = 7 [ default = false ];
// The layers that make up the net. Each of their configurations, including
// connectivity and behavior, is specified as a LayerParameter.
repeated LayerParameter layer = 100; // ID 100 so layers are printed last.
repeated LayerParameter layer = 100; // ID 100 so layers are printed last.
// DEPRECATED: use 'layer' instead.
repeated V1LayerParameter layers = 2;
......@@ -122,9 +118,9 @@ message SolverParameter {
optional NetParameter net_param = 25;
optional string train_net = 1; // Proto filename for the train net.
repeated string test_net = 2; // Proto filenames for the test nets.
repeated string test_net = 2; // Proto filenames for the test nets.
optional NetParameter train_net_param = 21; // Inline train net params.
repeated NetParameter test_net_param = 22; // Inline test net params.
repeated NetParameter test_net_param = 22; // Inline test net params.
// The states for the train/test nets. Must be unspecified or
// specified once per net.
......@@ -140,11 +136,11 @@ message SolverParameter {
repeated int32 test_iter = 3;
// The number of iterations between two testing phases.
optional int32 test_interval = 4 [default = 0];
optional bool test_compute_loss = 19 [default = false];
optional int32 test_interval = 4 [ default = 0 ];
optional bool test_compute_loss = 19 [ default = false ];
// If true, run an initial test pass before the first iteration,
// ensuring memory availability and printing the starting value of the loss.
optional bool test_initialization = 32 [default = true];
optional bool test_initialization = 32 [ default = true ];
optional float base_lr = 5; // The base learning rate
repeated float stage_lr = 50;
repeated int32 stage_iter = 51;
......@@ -152,10 +148,10 @@ message SolverParameter {
// will be displayed.
optional int32 display = 6;
// Display the loss averaged over the last average_loss iterations
optional int32 average_loss = 33 [default = 1];
optional int32 average_loss = 33 [ default = 1 ];
optional int32 max_iter = 7; // the maximum number of iterations
// accumulate gradients over `iter_size` x `batch_size` instances
optional int32 iter_size = 36 [default = 1];
optional int32 iter_size = 36 [ default = 1 ];
// The learning rate decay policy. The currently implemented learning rate
// policies are as follows:
......@@ -173,13 +169,13 @@ message SolverParameter {
// where base_lr, max_iter, gamma, step, stepvalue and power are defined
// in the solver parameter protocol buffer, and iter is the current iteration.
optional string lr_policy = 8;
optional float gamma = 9; // The parameter to compute the learning rate.
optional float power = 10; // The parameter to compute the learning rate.
optional float gamma = 9; // The parameter to compute the learning rate.
optional float power = 10; // The parameter to compute the learning rate.
optional float momentum = 11; // The momentum value.
optional float weight_decay = 12; // The weight decay.
// regularization types supported: L1 and L2
// controlled by weight_decay
optional string regularization_type = 29 [default = "L2"];
optional string regularization_type = 29 [ default = "L2" ];
// the stepsize for learning rate policy "step"
optional int32 stepsize = 13;
// the stepsize for learning rate policy "multistep"
......@@ -187,49 +183,49 @@ message SolverParameter {
// Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
// whenever their actual L2 norm is larger.
optional float clip_gradients = 35 [default = -1];
optional float clip_gradients = 35 [ default = -1 ];
optional int32 snapshot = 14 [default = 0]; // The snapshot interval
optional string snapshot_prefix = 15; // The prefix for the snapshot.
optional int32 snapshot = 14 [ default = 0 ]; // The snapshot interval
optional string snapshot_prefix = 15; // The prefix for the snapshot.
// whether to snapshot diff in the results or not. Snapshotting diff will help
// debugging but the final protocol buffer size will be much larger.
optional bool snapshot_diff = 16 [default = false];
optional bool snapshot_diff = 16 [ default = false ];
enum SnapshotFormat {
HDF5 = 0;
BINARYPROTO = 1;
}
optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO];
optional SnapshotFormat snapshot_format = 37 [ default = BINARYPROTO ];
// the mode solver will use: 0 for CPU and 1 for GPU. Use GPU in default.
enum SolverMode {
CPU = 0;
GPU = 1;
}
optional SolverMode solver_mode = 17 [default = GPU];
optional SolverMode solver_mode = 17 [ default = GPU ];
// the device_id will that be used in GPU mode. Use device_id = 0 in default.
optional int32 device_id = 18 [default = 0];
optional int32 device_id = 18 [ default = 0 ];
// If non-negative, the seed with which the Solver will initialize the Caffe
// random number generator -- useful for reproducible results. Otherwise,
// (and by default) initialize using a seed derived from the system clock.
optional int64 random_seed = 20 [default = -1];
optional int64 random_seed = 20 [ default = -1 ];
// type of the solver
optional string type = 40 [default = "SGD"];
optional string type = 40 [ default = "SGD" ];
// numerical stability for RMSProp, AdaGrad and AdaDelta and Adam
optional float delta = 31 [default = 1e-8];
optional float delta = 31 [ default = 1e-8 ];
// parameters for the Adam solver
optional float momentum2 = 39 [default = 0.999];
optional float momentum2 = 39 [ default = 0.999 ];
// RMSProp decay value
// MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
optional float rms_decay = 38 [default = 0.99];
optional float rms_decay = 38 [ default = 0.99 ];
// If true, print information about the state of the net that may help with
// debugging learning problems.
optional bool debug_info = 23 [default = false];
optional bool debug_info = 23 [ default = false ];
// If false, don't save a snapshot after training finishes.
optional bool snapshot_after_train = 28 [default = true];
optional bool snapshot_after_train = 28 [ default = true ];
// DEPRECATED: old solver enum types, use string instead
enum SolverType {
......@@ -241,25 +237,26 @@ message SolverParameter {
ADAM = 5;
}
// DEPRECATED: use type instead of solver_type
optional SolverType solver_type = 30 [default = SGD];
optional SolverType solver_type = 30 [ default = SGD ];
}
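
As an aside, a small NumPy sketch of the moving-average recurrence documented for rms_decay above; the parameter step shown is the conventional RMSProp form, not code taken from this repository:

import numpy as np

def rmsprop_step(param, grad, mean_square, lr, rms_decay=0.99, delta=1e-8):
    # MeanSquare(t) = rms_decay * MeanSquare(t-1) + (1 - rms_decay) * grad^2
    mean_square = rms_decay * mean_square + (1. - rms_decay) * np.square(grad)
    # Conventional RMSProp parameter step (assumed, not taken from this file).
    param = param - lr * grad / (np.sqrt(mean_square) + delta)
    return param, mean_square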
// A message that stores the solver snapshots
message SolverState {
optional int32 iter = 1; // The current iteration
optional int32 iter = 1; // The current iteration
optional string learned_net = 2; // The file that stores the learned net.
repeated BlobProto history = 3; // The history for sgd solvers
optional int32 current_step = 4 [default = 0]; // The current step for learning rate
repeated BlobProto history = 3; // The history for sgd solvers
optional int32 current_step = 4
[ default = 0 ]; // The current step for learning rate
}
enum Phase {
TRAIN = 0;
TEST = 1;
TRAIN = 0;
TEST = 1;
}
message NetState {
optional Phase phase = 1 [default = TEST];
optional int32 level = 2 [default = 0];
optional Phase phase = 1 [ default = TEST ];
optional int32 level = 2 [ default = 0 ];
repeated string stage = 3;
}
......@@ -300,24 +297,25 @@ message ParamSpec {
}
// The multiplier on the global learning rate for this parameter.
optional float lr_mult = 3 [default = 1.0];
optional float lr_mult = 3 [ default = 1.0 ];
// The multiplier on the global weight decay for this parameter.
optional float decay_mult = 4 [default = 1.0];
optional float decay_mult = 4 [ default = 1.0 ];
}
// NOTE
// Update the next available ID when you add a new LayerParameter field.
//
// LayerParameter next available layer-specific ID: 146 (last added: parameter_param)
// LayerParameter next available layer-specific ID: 146 (last added:
// parameter_param)
message LayerParameter {
optional string name = 1; // the layer name
optional string type = 2; // the layer type
optional string name = 1; // the layer name
optional string type = 2; // the layer type
repeated string bottom = 3; // the name of each bottom blob
repeated string top = 4; // the name of each top blob
repeated string top = 4; // the name of each top blob
// The mirror stage optimization
optional bool mirror_stage = 150 [default = false];
optional bool mirror_stage = 150 [ default = false ];
// The train / test phase for computation.
optional Phase phase = 10;
......@@ -423,29 +421,29 @@ message TransformationParameter {
// For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried
// out before scaling.
optional float scale = 1 [default = 1];
optional float scale = 1 [ default = 1 ];
// Specify if we want to randomly mirror data.
optional bool mirror = 2 [default = false];
optional bool mirror = 2 [ default = false ];
// Specify if we would like to randomly crop an image.
optional uint32 crop_size = 3 [default = 0];
optional uint32 crop_size = 3 [ default = 0 ];
// mean_file and mean_value cannot be specified at the same time
optional string mean_file = 4;
// if specified can be repeated once (would substract it from all the channels)
// or can be repeated the same number of times as channels
// (would subtract them from the corresponding channel)
// if specified can be repeated once (would substract it from all the
// channels) or can be repeated the same number of times as channels (would
// subtract them from the corresponding channel)
repeated float mean_value = 5;
// Force the decoded image to have 3 color channels.
optional bool force_color = 6 [default = false];
optional bool force_color = 6 [ default = false ];
// Force the decoded image to have 1 color channels.
optional bool force_gray = 7 [default = false];
optional bool force_gray = 7 [ default = false ];
// Distort the color?
optional bool augment_color = 9 [default = false];
optional bool augment_color = 9 [ default = false ];
// Target size.
optional uint32 resize = 10 [default=0];
optional uint32 resize = 10 [ default = 0 ];
// Padding size.
optional uint32 padding = 11 [default = 0];
optional uint32 padding = 11 [ default = 0 ];
// Crop size during scale jittering
optional uint32 random_crop_size = 12 [default = 0];
optional uint32 random_crop_size = 12 [ default = 0 ];
}
// Message that stores parameters shared by loss layers
......@@ -469,7 +467,7 @@ message LossParameter {
// Do not normalize the loss.
NONE = 3;
}
optional NormalizationMode normalization = 3 [default = VALID];
optional NormalizationMode normalization = 3 [ default = VALID ];
// Deprecated. Ignored if normalization is specified. If normalization
// is not specified, then setting this to false will be equivalent to
// normalization = BATCH_SIZE to be consistent with previous behavior.
......@@ -483,14 +481,14 @@ message AccuracyParameter {
// When computing accuracy, count as correct by comparing the true label to
// the top k scoring classes. By default, only compare to the top scoring
// class (i.e. argmax).
optional uint32 top_k = 1 [default = 1];
optional uint32 top_k = 1 [ default = 1 ];
// The "label" axis of the prediction blob, whose argmax corresponds to the
// predicted label -- may be negative to index from the end (e.g., -1 for the
// last axis). For example, if axis == 1 and the predictions are
// (N x C x H x W), the label blob is expected to contain N*H*W ground truth
// labels with integer values in {0, 1, ..., C-1}.
optional int32 axis = 2 [default = 1];
optional int32 axis = 2 [ default = 1 ];
// If specified, ignore instances with the given label.
optional int32 ignore_label = 3;
......@@ -498,8 +496,8 @@ message AccuracyParameter {
message ArgMaxParameter {
// If true produce pairs (argmax, maxval)
optional bool out_max_val = 1 [default = false];
optional uint32 top_k = 2 [default = 1];
optional bool out_max_val = 1 [ default = false ];
optional uint32 top_k = 2 [ default = 1 ];
// The axis along which to maximise -- may be negative to index from the
// end (e.g., -1 for the last axis).
// By default ArgMaxLayer maximizes over the flattened trailing dimensions
......@@ -512,10 +510,10 @@ message ConcatParameter {
// end (e.g., -1 for the last axis). Other axes must have the
// same dimension for all the bottom blobs.
// By default, ConcatLayer concatenates blobs along the "channels" axis (1).
optional int32 axis = 2 [default = 1];
optional int32 axis = 2 [ default = 1 ];
// DEPRECATED: alias for "axis" -- does not support negative indexing.
optional uint32 concat_dim = 1 [default = 1];
optional uint32 concat_dim = 1 [ default = 1 ];
}
message BatchNormParameter {
......@@ -524,10 +522,10 @@ message BatchNormParameter {
// across the batch.
optional bool use_global_stats = 1;
// How much does the moving average decay each iteration?
optional float moving_average_fraction = 2 [default = 0.9];
optional float moving_average_fraction = 2 [ default = 0.9 ];
// Small value to add to the variance estimate so that we don't divide by
// zero.
optional float eps = 3 [default = 1e-5];
optional float eps = 3 [ default = 1e-5 ];
}
message BiasParameter {
......@@ -544,7 +542,7 @@ message BiasParameter {
// (axis == 3 == -1) 60
// Furthermore, bottom[1] may have the empty shape (regardless of the value of
// "axis") -- a scalar bias.
optional int32 axis = 1 [default = 1];
optional int32 axis = 1 [ default = 1 ];
// (num_axes is ignored unless just one bottom is given and the bias is
// a learned parameter of the layer. Otherwise, num_axes is determined by the
......@@ -552,7 +550,7 @@ message BiasParameter {
// The number of axes of the input (bottom[0]) covered by the bias
// parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
// Set num_axes := 0, to add a zero-axis Blob: a scalar.
optional int32 num_axes = 2 [default = 1];
optional int32 num_axes = 2 [ default = 1 ];
// (filler is ignored unless just one bottom is given and the bias is
// a learned parameter of the layer.)
......@@ -564,25 +562,25 @@ message BiasParameter {
message ContrastiveLossParameter {
// margin for dissimilar pair
optional float margin = 1 [default = 1.0];
optional float margin = 1 [ default = 1.0 ];
// The first implementation of this cost did not exactly match the cost of
// Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
// legacy_version = false (the default) uses (margin - d)^2 as proposed in the
// Hadsell paper. New models should probably use this version.
// legacy_version = true uses (margin - d^2). This is kept to support /
// reproduce existing models and results
optional bool legacy_version = 2 [default = false];
optional bool legacy_version = 2 [ default = false ];
}
message ConvolutionParameter {
optional uint32 num_output = 1; // The number of outputs for the layer
optional bool bias_term = 2 [default = true]; // whether to have bias terms
optional bool bias_term = 2 [ default = true ]; // whether to have bias terms
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in all spatial dimensions, or once per spatial dimension.
repeated uint32 pad = 3; // The padding size; defaults to 0
repeated uint32 pad = 3; // The padding size; defaults to 0
repeated uint32 kernel_size = 4; // The kernel size
repeated uint32 stride = 6; // The stride; defaults to 1
repeated uint32 stride = 6; // The stride; defaults to 1
// Factor used to dilate the kernel, (implicitly) zero-filling the resulting
// holes. (Kernel dilation is sometimes referred to by its use in the
// algorithme à trous from Holschneider et al. 1987.)
......@@ -590,23 +588,23 @@ message ConvolutionParameter {
// For 2D convolution only, the *_h and *_w versions may also be used to
// specify both spatial dimensions.
optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
optional uint32 kernel_h = 11; // The kernel height (2D only)
optional uint32 kernel_w = 12; // The kernel width (2D only)
optional uint32 stride_h = 13; // The stride height (2D only)
optional uint32 stride_w = 14; // The stride width (2D only)
optional uint32 pad_h = 9 [ default = 0 ]; // The padding height (2D only)
optional uint32 pad_w = 10 [ default = 0 ]; // The padding width (2D only)
optional uint32 kernel_h = 11; // The kernel height (2D only)
optional uint32 kernel_w = 12; // The kernel width (2D only)
optional uint32 stride_h = 13; // The stride height (2D only)
optional uint32 stride_w = 14; // The stride width (2D only)
optional uint32 group = 5 [default = 1]; // The group size for group conv
optional uint32 group = 5 [ default = 1 ]; // The group size for group conv
optional FillerParameter weight_filler = 7; // The filler for the weight
optional FillerParameter bias_filler = 8; // The filler for the bias
optional FillerParameter bias_filler = 8; // The filler for the bias
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 15 [default = DEFAULT];
optional Engine engine = 15 [ default = DEFAULT ];
// The axis to interpret as "channels" when performing convolution.
// Preceding dimensions are treated as independent inputs;
......@@ -617,14 +615,14 @@ message ConvolutionParameter {
// With (N, C, D, H, W) inputs, and axis == 1, we perform
// N independent 3D convolutions, sliding (C/g)-channels
// filters across the spatial axes (D, H, W) of the input.
optional int32 axis = 16 [default = 1];
optional int32 axis = 16 [ default = 1 ];
// Whether to force use of the general ND convolution, even if a specific
// implementation for blobs of the appropriate number of spatial dimensions
// is available. (Currently, there is only a 2D-specific convolution
// implementation; for input blobs with num_axes != 2, this option is
// ignored and the ND implementation will be used.)
optional bool force_nd_im2col = 17 [default = false];
optional bool force_nd_im2col = 17 [ default = false ];
}
message CropParameter {
......@@ -641,7 +639,7 @@ message CropParameter {
// Note: standard dimensions are N,C,H,W so the default is a spatial crop,
// and `axis` may be negative to index from the end (e.g., -1 for the last
// axis).
optional int32 axis = 1 [default = 2];
optional int32 axis = 1 [ default = 2 ];
repeated uint32 offset = 2;
}
......@@ -659,40 +657,40 @@ message DataParameter {
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not
// be larger than the number of keys in the database.
// DEPRECATED. Each solver accesses a different subset of the database.
optional uint32 rand_skip = 7 [default = 0];
optional DB backend = 8 [default = LEVELDB];
optional uint32 rand_skip = 7 [ default = 0 ];
optional DB backend = 8 [ default = LEVELDB ];
// DEPRECATED. See TransformationParameter. For data pre-processing, we can do
// simple scaling and subtracting the data mean, if provided. Note that the
// mean subtraction is always carried out before scaling.
optional float scale = 2 [default = 1];
optional float scale = 2 [ default = 1 ];
optional string mean_file = 3;
// DEPRECATED. See TransformationParameter. Specify if we would like to randomly
// crop an image.
optional uint32 crop_size = 5 [default = 0];
// DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror
// data.
optional bool mirror = 6 [default = false];
// DEPRECATED. See TransformationParameter. Specify if we would like to
// randomly crop an image.
optional uint32 crop_size = 5 [ default = 0 ];
// DEPRECATED. See TransformationParameter. Specify if we want to randomly
// mirror data.
optional bool mirror = 6 [ default = false ];
// Force the encoded image to have 3 color channels
optional bool force_encoded_color = 9 [default = false];
optional bool force_encoded_color = 9 [ default = false ];
// Prefetch queue (Number of batches to prefetch to host memory, increase if
// data access bandwidth varies).
optional uint32 prefetch = 10 [default = 5];
optional uint32 prefetch = 10 [ default = 5 ];
// Whether to shuffle the data.
optional bool shuffle = 11 [default = false];
optional bool shuffle = 11 [ default = false ];
// The number of chunks to shuffle.
optional int32 num_chunks = 12 [default = 2048];
optional int32 num_chunks = 12 [ default = 2048 ];
}
message DropoutParameter {
optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
optional bool scale_train = 2 [default = true]; // scale train or test phase
optional float dropout_ratio = 1 [ default = 0.5 ]; // dropout ratio
optional bool scale_train = 2 [ default = true ]; // scale train or test phase
}
// DummyDataLayer fills any number of arbitrarily shaped blobs with random
// (or constant) data generated by "Fillers" (see "message FillerParameter").
message DummyDataParameter {
// This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or N
// shape fields, and 0, 1 or N data_fillers.
// This layer produces N >= 1 top blobs. DummyDataParameter must specify 1 or
// N shape fields, and 0, 1 or N data_fillers.
//
// If 0 data_fillers are specified, ConstantFiller with a value of 0 is used.
// If 1 data_filler is specified, it is applied to all top blobs. If N are
......@@ -713,12 +711,12 @@ message EltwiseParameter {
SUM = 1;
MAX = 2;
}
optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
optional EltwiseOp operation = 1 [ default = SUM ]; // element-wise operation
repeated float coeff = 2; // blob-wise coefficient for SUM operation
// Whether to use an asymptotically slower (for >2 inputs) but stabler method
// of computing the gradient for the PROD operation. (No effect for SUM op.)
optional bool stable_prod_grad = 3 [default = true];
optional bool stable_prod_grad = 3 [ default = true ];
}
// Message that stores parameters used by ELULayer
......@@ -726,7 +724,7 @@ message ELUParameter {
// Described in:
// Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate
// Deep Network Learning by Exponential Linear Units (ELUs). arXiv
optional float alpha = 1 [default = 1];
optional float alpha = 1 [ default = 1 ];
}
// Message that stores parameters used by EmbedLayer
......@@ -737,10 +735,9 @@ message EmbedParameter {
// 1 greater than the maximum possible input value.
optional uint32 input_dim = 2;
optional bool bias_term = 3 [default = true]; // Whether to use a bias term
optional FillerParameter weight_filler = 4; // The filler for the weight
optional FillerParameter bias_filler = 5; // The filler for the bias
optional bool bias_term = 3 [ default = true ]; // Whether to use a bias term
optional FillerParameter weight_filler = 4; // The filler for the weight
optional FillerParameter bias_filler = 5; // The filler for the bias
}
// Message that stores parameters used by ExpLayer
......@@ -748,21 +745,21 @@ message ExpParameter {
// ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
// Or if base is set to the default (-1), base is set to e,
// so y = exp(shift + scale * x).
optional float base = 1 [default = -1.0];
optional float scale = 2 [default = 1.0];
optional float shift = 3 [default = 0.0];
optional float base = 1 [ default = -1.0 ];
optional float scale = 2 [ default = 1.0 ];
optional float shift = 3 [ default = 0.0 ];
}
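
A tiny NumPy reference for the ExpLayer formula quoted above (base == -1 selects the natural base e):

import numpy as np

def exp_layer_ref(x, base=-1.0, scale=1.0, shift=0.0):
    # y = base ** (shift + scale * x); base == -1 selects the natural base e.
    inner = shift + scale * np.asarray(x, dtype='float64')
    return np.exp(inner) if base == -1.0 else np.power(base, inner)

exp_layer_ref([0., 1.])           # -> [1., e]
exp_layer_ref([0., 1.], base=2.)  # -> [1., 2.]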
/// Message that stores parameters used by FlattenLayer
message FlattenParameter {
// The first axis to flatten: all preceding axes are retained in the output.
// May be negative to index from the end (e.g., -1 for the last axis).
optional int32 axis = 1 [default = 1];
optional int32 axis = 1 [ default = 1 ];
// The last axis to flatten: all following axes are retained in the output.
// May be negative to index from the end (e.g., the default -1 for the last
// axis).
optional int32 end_axis = 2 [default = -1];
optional int32 end_axis = 2 [ default = -1 ];
}
// Message that stores parameters used by HDF5DataLayer
......@@ -777,12 +774,10 @@ message HDF5DataParameter {
// and the ordering of data within any given HDF5 file is shuffled,
// but data between different files are not interleaved; all of a file's
// data are output (in a random order) before moving onto another file.
optional bool shuffle = 3 [default = false];
optional bool shuffle = 3 [ default = false ];
}
message HDF5OutputParameter {
optional string file_name = 1;
}
message HDF5OutputParameter { optional string file_name = 1; }
message HingeLossParameter {
enum Norm {
......@@ -790,38 +785,38 @@ message HingeLossParameter {
L2 = 2;
}
// Specify the Norm to use L1 or L2
optional Norm norm = 1 [default = L1];
optional Norm norm = 1 [ default = L1 ];
}
message ImageDataParameter {
// Specify the data source.
optional string source = 1;
// Specify the batch size.
optional uint32 batch_size = 4 [default = 1];
optional uint32 batch_size = 4 [ default = 1 ];
// The rand_skip variable is for the data layer to skip a few data points
// to avoid all asynchronous sgd clients to start at the same point. The skip
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not
// be larger than the number of keys in the database.
optional uint32 rand_skip = 7 [default = 0];
optional uint32 rand_skip = 7 [ default = 0 ];
// Whether or not ImageLayer should shuffle the list of files at every epoch.
optional bool shuffle = 8 [default = false];
optional bool shuffle = 8 [ default = false ];
// It will also resize images if new_height or new_width are not zero.
optional uint32 new_height = 9 [default = 0];
optional uint32 new_width = 10 [default = 0];
optional uint32 new_height = 9 [ default = 0 ];
optional uint32 new_width = 10 [ default = 0 ];
// Specify if the images are color or gray
optional bool is_color = 11 [default = true];
optional bool is_color = 11 [ default = true ];
// DEPRECATED. See TransformationParameter. For data pre-processing, we can do
// simple scaling and subtracting the data mean, if provided. Note that the
// mean subtraction is always carried out before scaling.
optional float scale = 2 [default = 1];
optional float scale = 2 [ default = 1 ];
optional string mean_file = 3;
// DEPRECATED. See TransformationParameter. Specify if we would like to randomly
// crop an image.
optional uint32 crop_size = 5 [default = 0];
// DEPRECATED. See TransformationParameter. Specify if we want to randomly mirror
// data.
optional bool mirror = 6 [default = false];
optional string root_folder = 12 [default = ""];
// DEPRECATED. See TransformationParameter. Specify if we would like to
// randomly crop an image.
optional uint32 crop_size = 5 [ default = 0 ];
// DEPRECATED. See TransformationParameter. Specify if we want to randomly
// mirror data.
optional bool mirror = 6 [ default = false ];
optional string root_folder = 12 [ default = "" ];
}
message InfogainLossParameter {
......@@ -831,19 +826,20 @@ message InfogainLossParameter {
message InnerProductParameter {
optional uint32 num_output = 1; // The number of outputs for the layer
optional bool bias_term = 2 [default = true]; // whether to have bias terms
optional FillerParameter weight_filler = 3; // The filler for the weight
optional FillerParameter bias_filler = 4; // The filler for the bias
optional bool bias_term = 2 [ default = true ]; // whether to have bias terms
optional FillerParameter weight_filler = 3; // The filler for the weight
optional FillerParameter bias_filler = 4; // The filler for the bias
// The first axis to be lumped into a single inner product computation;
// all preceding axes are retained in the output.
// May be negative to index from the end (e.g., -1 for the last axis).
optional int32 axis = 5 [default = 1];
optional int32 axis = 5 [ default = 1 ];
// Specify whether to transpose the weight matrix or not.
// If transpose == true, any operations will be performed on the transpose
// of the weight matrix. The weight matrix itself is not going to be transposed
// but rather the transfer flag of operations will be toggled accordingly.
optional bool transpose = 6 [default = false];
// of the weight matrix. The weight matrix itself is not going to be
// transposed but rather the transfer flag of operations will be toggled
// accordingly.
optional bool transpose = 6 [ default = false ];
}
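
A short NumPy sketch of how the transpose flag changes the matmul, assuming the upstream Caffe weight layout of (num_output, K) when transpose == false; this is also why the Python layer above passes transpose_w = not param.transpose:

import numpy as np

def inner_product_ref(x, w, transpose=False):
    # Caffe convention assumed: with transpose == false, W is stored as
    # (num_output, K) and y = x @ W^T; with transpose == true, W is
    # (K, num_output) and y = x @ W.
    return x.dot(w) if transpose else x.dot(w.T)

x = np.random.rand(4, 16).astype('float32')
w = np.random.rand(8, 16).astype('float32')  # (num_output, K)
y = inner_product_ref(x, w)                  # shape (4, 8)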
message InputParameter {
......@@ -860,28 +856,28 @@ message LogParameter {
// LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
// Or if base is set to the default (-1), base is set to e,
// so y = ln(shift + scale * x) = log_e(shift + scale * x)
optional float base = 1 [default = -1.0];
optional float scale = 2 [default = 1.0];
optional float shift = 3 [default = 0.0];
optional float base = 1 [ default = -1.0 ];
optional float scale = 2 [ default = 1.0 ];
optional float shift = 3 [ default = 0.0 ];
}
// Message that stores parameters used by LRNLayer
message LRNParameter {
optional uint32 local_size = 1 [default = 5];
optional float alpha = 2 [default = 1.];
optional float beta = 3 [default = 0.75];
optional uint32 local_size = 1 [ default = 5 ];
optional float alpha = 2 [ default = 1. ];
optional float beta = 3 [ default = 0.75 ];
enum NormRegion {
ACROSS_CHANNELS = 0;
WITHIN_CHANNEL = 1;
}
optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
optional float k = 5 [default = 1.];
optional NormRegion norm_region = 4 [ default = ACROSS_CHANNELS ];
optional float k = 5 [ default = 1. ];
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 6 [default = DEFAULT];
optional Engine engine = 6 [ default = DEFAULT ];
}
message MemoryDataParameter {
......@@ -893,18 +889,16 @@ message MemoryDataParameter {
message MVNParameter {
// This parameter can be set to false to normalize mean only
optional bool normalize_variance = 1 [default = true];
optional bool normalize_variance = 1 [ default = true ];
// This parameter can be set to true to perform DNN-like MVN
optional bool across_channels = 2 [default = false];
optional bool across_channels = 2 [ default = false ];
// Epsilon for not dividing by zero while normalizing variance
optional float eps = 3 [default = 1e-9];
optional float eps = 3 [ default = 1e-9 ];
}
message ParameterParameter {
optional BlobShape shape = 1;
}
message ParameterParameter { optional BlobShape shape = 1; }
message PoolingParameter {
enum PoolMethod {
......@@ -912,45 +906,45 @@ message PoolingParameter {
AVE = 1;
STOCHASTIC = 2;
}
optional PoolMethod pool = 1 [default = MAX]; // The pooling method
optional PoolMethod pool = 1 [ default = MAX ]; // The pooling method
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs.
optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
optional uint32 pad_h = 9 [default = 0]; // The padding height
optional uint32 pad_w = 10 [default = 0]; // The padding width
optional uint32 kernel_size = 2; // The kernel size (square)
optional uint32 kernel_h = 5; // The kernel height
optional uint32 kernel_w = 6; // The kernel width
optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
optional uint32 stride_h = 7; // The stride height
optional uint32 stride_w = 8; // The stride width
optional uint32 pad = 4 [ default = 0 ]; // The padding size (equal in Y, X)
optional uint32 pad_h = 9 [ default = 0 ]; // The padding height
optional uint32 pad_w = 10 [ default = 0 ]; // The padding width
optional uint32 kernel_size = 2; // The kernel size (square)
optional uint32 kernel_h = 5; // The kernel height
optional uint32 kernel_w = 6; // The kernel width
optional uint32 stride = 3 [ default = 1 ]; // The stride (equal in Y, X)
optional uint32 stride_h = 7; // The stride height
optional uint32 stride_w = 8; // The stride width
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 11 [default = DEFAULT];
optional Engine engine = 11 [ default = DEFAULT ];
// If global_pooling then it will pool over the size of the bottom by doing
// kernel_h = bottom->height and kernel_w = bottom->width
optional bool global_pooling = 12 [default = false];
optional bool global_pooling = 12 [ default = false ];
}
// Message that stores parameters used by ROIPoolingLayer
message ROIPoolingParameter {
// Pad, kernel size, and stride are all given as a single value for equal
// dimensions in height and width or as Y, X pairs.
optional uint32 pooled_h = 1 [default = 0]; // The pooled output height
optional uint32 pooled_w = 2 [default = 0]; // The pooled output width
optional uint32 pooled_h = 1 [ default = 0 ]; // The pooled output height
optional uint32 pooled_w = 2 [ default = 0 ]; // The pooled output width
// Multiplicative spatial scale factor to translate ROI coords from their
// input scale to the scale used when pooling
optional float spatial_scale = 3 [default = 1];
optional float spatial_scale = 3 [ default = 1 ];
}
message PowerParameter {
// PowerLayer computes outputs y = (shift + scale * x) ^ power.
optional float power = 1 [default = 1.0];
optional float scale = 2 [default = 1.0];
optional float shift = 3 [default = 0.0];
optional float power = 1 [ default = 1.0 ];
optional float scale = 2 [ default = 1.0 ];
optional float shift = 3 [ default = 0.0 ];
}
message PythonParameter {
......@@ -960,11 +954,11 @@ message PythonParameter {
// in Python before calling the `setup()` method. This could be a number,
// string, dictionary in Python dict format, JSON, etc. You may parse this
// string in `setup` method and use it in `forward` and `backward`.
optional string param_str = 3 [default = ''];
// Whether this PythonLayer is shared among worker solvers during data parallelism.
// If true, each worker solver sequentially run forward from this layer.
// This value should be set true if you are using it as a data layer.
optional bool share_in_parallel = 4 [default = false];
optional string param_str = 3 [ default = ''];
// Whether this PythonLayer is shared among worker solvers during data
// parallelism. If true, each worker solver sequentially run forward from this
// layer. This value should be set true if you are using it as a data layer.
optional bool share_in_parallel = 4 [ default = false ];
}
// Message that stores parameters used by ReductionLayer
......@@ -976,7 +970,7 @@ message ReductionParameter {
MEAN = 4;
}
optional ReductionOp operation = 1 [default = SUM]; // reduction operation
optional ReductionOp operation = 1 [ default = SUM ]; // reduction operation
// The first axis to reduce to a scalar -- may be negative to index from the
// end (e.g., -1 for the last axis).
......@@ -991,9 +985,9 @@ message ReductionParameter {
// If axis == 0 (the default), the output Blob always has the empty shape
// (count 1), performing reduction across the entire input --
// often useful for creating new loss functions.
optional int32 axis = 2 [default = 0];
optional int32 axis = 2 [ default = 0 ];
optional float coeff = 3 [default = 1.0]; // coefficient for output
optional float coeff = 3 [ default = 1.0 ]; // coefficient for output
}
// Message that stores parameters used by ReLULayer
......@@ -1003,13 +997,13 @@ message ReLUParameter {
// Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
// improve neural network acoustic models. In ICML Workshop on Deep Learning
// for Audio, Speech, and Language Processing.
optional float negative_slope = 1 [default = 0];
optional float negative_slope = 1 [ default = 0 ];
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 2 [default = DEFAULT];
optional Engine engine = 2 [ default = DEFAULT ];
}
message ReshapeParameter {
......@@ -1072,8 +1066,8 @@ message ReshapeParameter {
// reshape_param { shape { dim: 2 dim: 1 dim: 8 } }
// reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 }
//
optional int32 axis = 2 [default = 0];
optional int32 num_axes = 3 [default = -1];
optional int32 axis = 2 [ default = 0 ];
optional int32 num_axes = 3 [ default = -1 ];
}
message ScaleParameter {
......@@ -1090,7 +1084,7 @@ message ScaleParameter {
// (axis == 3 == -1) 60
// Furthermore, bottom[1] may have the empty shape (regardless of the value of
// "axis") -- a scalar multiplier.
optional int32 axis = 1 [default = 1];
optional int32 axis = 1 [ default = 1 ];
// (num_axes is ignored unless just one bottom is given and the scale is
// a learned parameter of the layer. Otherwise, num_axes is determined by the
......@@ -1098,7 +1092,7 @@ message ScaleParameter {
// The number of axes of the input (bottom[0]) covered by the scale
// parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
// Set num_axes := 0, to multiply with a zero-axis Blob: a scalar.
optional int32 num_axes = 2 [default = 1];
optional int32 num_axes = 2 [ default = 1 ];
// (filler is ignored unless just one bottom is given and the scale is
// a learned parameter of the layer.)
......@@ -1109,7 +1103,7 @@ message ScaleParameter {
// Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
// may be more efficient). Initialized with bias_filler (defaults to 0).
optional bool bias_term = 4 [default = false];
optional bool bias_term = 4 [ default = false ];
optional FillerParameter bias_filler = 5;
}
......@@ -1119,18 +1113,18 @@ message SigmoidParameter {
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 1 [default = DEFAULT];
optional Engine engine = 1 [ default = DEFAULT ];
}
message SliceParameter {
// The axis along which to slice -- may be negative to index from the end
// (e.g., -1 for the last axis).
// By default, SliceLayer concatenates blobs along the "channels" axis (1).
optional int32 axis = 3 [default = 1];
optional int32 axis = 3 [ default = 1 ];
repeated uint32 slice_point = 2;
// DEPRECATED: alias for "axis" -- does not support negative indexing.
optional uint32 slice_dim = 1 [default = 1];
optional uint32 slice_dim = 1 [ default = 1 ];
}
// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer
......@@ -1140,12 +1134,12 @@ message SoftmaxParameter {
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 1 [default = DEFAULT];
optional Engine engine = 1 [ default = DEFAULT ];
// The axis along which to perform the softmax -- may be negative to index
// from the end (e.g., -1 for the last axis).
// Any other axes will be evaluated as independent softmaxes.
optional int32 axis = 2 [default = 1];
optional int32 axis = 2 [ default = 1 ];
}
message TanHParameter {
......@@ -1154,13 +1148,13 @@ message TanHParameter {
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 1 [default = DEFAULT];
optional Engine engine = 1 [ default = DEFAULT ];
}
// Message that stores parameters used by TileLayer
message TileParameter {
// The index of the axis to tile.
optional int32 axis = 1 [default = 1];
optional int32 axis = 1 [ default = 1 ];
// The number of copies (tiles) of the blob to output.
optional int32 tiles = 2;
......@@ -1170,7 +1164,7 @@ message TileParameter {
// Message that stores parameters used by ThresholdLayer
message ThresholdParameter {
optional float threshold = 1 [default = 0]; // Strictly positive values
optional float threshold = 1 [ default = 0 ]; // Strictly positive values
}
message WindowDataParameter {
......@@ -1179,31 +1173,31 @@ message WindowDataParameter {
// For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried
// out before scaling.
optional float scale = 2 [default = 1];
optional float scale = 2 [ default = 1 ];
optional string mean_file = 3;
// Specify the batch size.
optional uint32 batch_size = 4;
// Specify if we would like to randomly crop an image.
optional uint32 crop_size = 5 [default = 0];
optional uint32 crop_size = 5 [ default = 0 ];
// Specify if we want to randomly mirror data.
optional bool mirror = 6 [default = false];
optional bool mirror = 6 [ default = false ];
// Foreground (object) overlap threshold
optional float fg_threshold = 7 [default = 0.5];
optional float fg_threshold = 7 [ default = 0.5 ];
// Background (non-object) overlap threshold
optional float bg_threshold = 8 [default = 0.5];
optional float bg_threshold = 8 [ default = 0.5 ];
// Fraction of batch that should be foreground objects
optional float fg_fraction = 9 [default = 0.25];
optional float fg_fraction = 9 [ default = 0.25 ];
// Amount of contextual padding to add around a window
// (used only by the window_data_layer)
optional uint32 context_pad = 10 [default = 0];
optional uint32 context_pad = 10 [ default = 0 ];
// Mode for cropping out a detection window
// warp: cropped window is warped to a fixed size and aspect ratio
// square: the tightest square around the window is cropped
optional string crop_mode = 11 [default = "warp"];
optional string crop_mode = 11 [ default = "warp" ];
// cache_images: will load all images in memory for faster access
optional bool cache_images = 12 [default = false];
optional bool cache_images = 12 [ default = false ];
// append root_folder to locate images
optional string root_folder = 13 [default = ""];
optional string root_folder = 13 [ default = "" ];
}
message SPPParameter {
......@@ -1213,13 +1207,13 @@ message SPPParameter {
STOCHASTIC = 2;
}
optional uint32 pyramid_height = 1;
optional PoolMethod pool = 2 [default = MAX]; // The pooling method
optional PoolMethod pool = 2 [ default = MAX ]; // The pooling method
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 6 [default = DEFAULT];
optional Engine engine = 6 [ default = DEFAULT ];
}
// DEPRECATED: use LayerParameter.
......@@ -1323,40 +1317,40 @@ message V0LayerParameter {
// Parameters to specify layers with inner products.
optional uint32 num_output = 3; // The number of outputs for the layer
optional bool biasterm = 4 [default = true]; // whether to have bias terms
optional FillerParameter weight_filler = 5; // The filler for the weight
optional FillerParameter bias_filler = 6; // The filler for the bias
optional uint32 pad = 7 [default = 0]; // The padding size
optional uint32 kernelsize = 8; // The kernel size
optional uint32 group = 9 [default = 1]; // The group size for group conv
optional uint32 stride = 10 [default = 1]; // The stride
optional bool biasterm = 4 [ default = true ]; // whether to have bias terms
optional FillerParameter weight_filler = 5; // The filler for the weight
optional FillerParameter bias_filler = 6; // The filler for the bias
optional uint32 pad = 7 [ default = 0 ]; // The padding size
optional uint32 kernelsize = 8; // The kernel size
optional uint32 group = 9 [ default = 1 ]; // The group size for group conv
optional uint32 stride = 10 [ default = 1 ]; // The stride
enum PoolMethod {
MAX = 0;
AVE = 1;
STOCHASTIC = 2;
}
optional PoolMethod pool = 11 [default = MAX]; // The pooling method
optional float dropout_ratio = 12 [default = 0.5]; // dropout ratio
optional PoolMethod pool = 11 [ default = MAX ]; // The pooling method
optional float dropout_ratio = 12 [ default = 0.5 ]; // dropout ratio
optional uint32 local_size = 13 [default = 5]; // for local response norm
optional float alpha = 14 [default = 1.]; // for local response norm
optional float beta = 15 [default = 0.75]; // for local response norm
optional float k = 22 [default = 1.];
optional uint32 local_size = 13 [ default = 5 ]; // for local response norm
optional float alpha = 14 [ default = 1. ]; // for local response norm
optional float beta = 15 [ default = 0.75 ]; // for local response norm
optional float k = 22 [ default = 1. ];
// For data layers, specify the data source
optional string source = 16;
// For data pre-processing, we can do simple scaling and subtracting the
// data mean, if provided. Note that the mean subtraction is always carried
// out before scaling.
optional float scale = 17 [default = 1];
optional float scale = 17 [ default = 1 ];
optional string meanfile = 18;
// For data layers, specify the batch size.
optional uint32 batchsize = 19;
// For data layers, specify if we would like to randomly crop an image.
optional uint32 cropsize = 20 [default = 0];
optional uint32 cropsize = 20 [ default = 0 ];
// For data layers, specify if we want to randomly mirror data.
optional bool mirror = 21 [default = false];
optional bool mirror = 21 [ default = false ];
// The blobs containing the numeric parameters of the layer
repeated BlobProto blobs = 50;
......@@ -1370,41 +1364,41 @@ message V0LayerParameter {
// to avoid all asynchronous sgd clients to start at the same point. The skip
// point would be set as rand_skip * rand(0,1). Note that rand_skip should not
// be larger than the number of keys in the database.
optional uint32 rand_skip = 53 [default = 0];
optional uint32 rand_skip = 53 [ default = 0 ];
// Fields related to detection (det_*)
// foreground (object) overlap threshold
optional float det_fg_threshold = 54 [default = 0.5];
optional float det_fg_threshold = 54 [ default = 0.5 ];
// background (non-object) overlap threshold
optional float det_bg_threshold = 55 [default = 0.5];
optional float det_bg_threshold = 55 [ default = 0.5 ];
// Fraction of batch that should be foreground objects
optional float det_fg_fraction = 56 [default = 0.25];
optional float det_fg_fraction = 56 [ default = 0.25 ];
// optional bool OBSOLETE_can_clobber = 57 [default = true];
// Amount of contextual padding to add around a window
// (used only by the window_data_layer)
optional uint32 det_context_pad = 58 [default = 0];
optional uint32 det_context_pad = 58 [ default = 0 ];
// Mode for cropping out a detection window
// warp: cropped window is warped to a fixed size and aspect ratio
// square: the tightest square around the window is cropped
optional string det_crop_mode = 59 [default = "warp"];
optional string det_crop_mode = 59 [ default = "warp" ];
// For ReshapeLayer, one needs to specify the new dimensions.
optional int32 new_num = 60 [default = 0];
optional int32 new_channels = 61 [default = 0];
optional int32 new_height = 62 [default = 0];
optional int32 new_width = 63 [default = 0];
optional int32 new_num = 60 [ default = 0 ];
optional int32 new_channels = 61 [ default = 0 ];
optional int32 new_height = 62 [ default = 0 ];
optional int32 new_width = 63 [ default = 0 ];
// Whether or not ImageLayer should shuffle the list of files at every epoch.
// It will also resize images if new_height or new_width are not zero.
optional bool shuffle_images = 64 [default = false];
optional bool shuffle_images = 64 [ default = false ];
// For ConcatLayer, one needs to specify the dimension for concatenation, and
// the other dimensions must be the same for all the bottom blobs.
// By default it will concatenate blobs along the channels dimension.
optional uint32 concat_dim = 65 [default = 1];
optional uint32 concat_dim = 65 [ default = 1 ];
optional HDF5OutputParameter hdf5_output_param = 1001;
}
......@@ -1416,14 +1410,14 @@ message PReLUParameter {
// Initial value of a_i. Default is a_i=0.25 for all i.
optional FillerParameter filler = 1;
// Whether or not slope paramters are shared across channels.
optional bool channel_shared = 2 [default = false];
optional bool channel_shared = 2 [ default = false ];
}
message SmoothL1LossParameter {
// SmoothL1Loss(x) =
// 0.5 * (sigma * x) ** 2 -- if x < 1.0 / sigma / sigma
// |x| - 0.5 / sigma / sigma -- otherwise
optional float sigma = 1 [default = 1];
optional float sigma = 1 [ default = 1 ];
}
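
A NumPy reference for the piecewise formula above; note the branch condition is written here with |x|, following the common Fast R-CNN formulation, which the comment leaves implicit:

import numpy as np

def smooth_l1_ref(x, sigma=1.0):
    # 0.5 * (sigma * x)^2   if |x| < 1 / sigma^2
    # |x| - 0.5 / sigma^2   otherwise
    beta = 1.0 / (sigma * sigma)
    ax = np.abs(x)
    return np.where(ax < beta, 0.5 * np.square(sigma * x), ax - 0.5 * beta)

smooth_l1_ref(np.array([-2.0, 0.1, 3.0]))  # -> [1.5, 0.005, 2.5]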
message PermuteParameter {
......@@ -1434,20 +1428,18 @@ message PermuteParameter {
}
message NormalizeParameter {
optional bool across_spatial = 1 [default = true];
optional bool across_spatial = 1 [ default = true ];
// Initial value of scale. Default is 1.0 for all
optional FillerParameter scale_filler = 2;
// Whether or not scale parameters are shared across channels.
optional bool channel_shared = 3 [default = true];
optional bool channel_shared = 3 [ default = true ];
// Epsilon for not dividing by zero while normalizing variance
optional float eps = 4 [default = 1e-5];
optional float eps = 4 [ default = 1e-12 ];
}
message GroupNormParameter {
optional float eps = 1 [default = 1e-5];
optional int32 group = 2 [default = 32];
optional float eps = 1 [ default = 1e-5 ];
optional int32 group = 2 [ default = 32 ];
}
message CastParameter {
optional string dtype = 1;
}
message CastParameter { optional string dtype = 1; }
......@@ -9,7 +9,7 @@
#
# ------------------------------------------------------------
"""Implementation for the ``Solver`` C++ class."""
"""The solver to update parameters."""
from __future__ import absolute_import
from __future__ import division
......@@ -19,9 +19,12 @@ import time
from google.protobuf import text_format
from dragon import updaters
from dragon.core.autograph import def_function
from dragon.core.framework import workspace
from dragon.core.training.adam import Adam
from dragon.core.training.rmsprop import RMSprop
from dragon.core.training.sgd import SGD
from dragon.core.training.sgd import Nesterov
from dragon.vm.caffe.net import Net
from dragon.vm.caffe.proto import caffe_pb2
......@@ -47,10 +50,10 @@ class Solver(object):
if self._param.iter_size > 1:
raise NotImplementedError('GradientAccum is deprecated.')
self._arguments = {
'scale_gradient': 1. / self._param.iter_size,
'clip_gradient': float(self._param.clip_gradients),
'l2_decay': float(self._param.weight_decay)
if str(self._param.regularization_type) == 'L2' else -1.,
'scale': 1. / self._param.iter_size,
'clip_norm': float(self._param.clip_gradients),
'weight_decay': float(self._param.weight_decay)
if str(self._param.regularization_type) == 'L2' else 0,
}
self._optimizer = None
self._net, self._test_nets = None, []
......@@ -415,7 +418,7 @@ class AdamSolver(Solver):
self._arguments['beta1'] = self._param.momentum
self._arguments['beta2'] = self._param.momentum2
self._arguments['eps'] = self._param.delta
self._optimizer = updaters.Adam(**self._arguments)
self._optimizer = Adam(**self._arguments)
class NesterovSolver(Solver):
......@@ -447,7 +450,7 @@ class NesterovSolver(Solver):
super(NesterovSolver, self).__init__(solver_file, is_root)
self._arguments['base_lr'] = self._param.base_lr
self._arguments['momentum'] = self._param.momentum
self._optimizer = updaters.Nesterov(**self._arguments)
self._optimizer = Nesterov(**self._arguments)
class RMSPropSolver(Solver):
......@@ -481,7 +484,7 @@ class RMSPropSolver(Solver):
self._arguments['base_lr'] = self._param.base_lr
self._arguments['decay'] = self._param.rms_decay
self._arguments['eps'] = self._param.delta
self._optimizer = updaters.RMSProp(**self._arguments)
self._optimizer = RMSprop(**self._arguments)
class SGDSolver(Solver):
......@@ -513,4 +516,4 @@ class SGDSolver(Solver):
super(SGDSolver, self).__init__(solver_file, is_root)
self._arguments['base_lr'] = self._param.base_lr
self._arguments['momentum'] = self._param.momentum
self._optimizer = updaters.SGD(**self._arguments)
self._optimizer = SGD(**self._arguments)
......@@ -9,9 +9,6 @@ dragon.math
`abs(...) <math/abs.html>`_
: Compute the absolute value of input.
`accumulate(...) <math/accumulate.html>`_
: Compute the element-wise accumulation from input to output.
`add(...) <math/add.html>`_
: Compute the element-wise addition.
......@@ -24,6 +21,9 @@ dragon.math
`argmin(...) <math/argmin.html>`_
: Compute the indices of minimum elements along the given axis.
`axpby(...) <math/axpby.html>`_
: Compute the element-wise addition from input to output.
`ceil(...) <math/ceil.html>`_
: Compute the smallest integer not less than input.
......@@ -96,9 +96,6 @@ dragon.math
`moments(...) <math/moments.html>`_
: Compute the mean and variance of input along the given axes.
`moving_average(...) <math/moving_average.html>`_
: Compute the moving average of input to output.
`mul(...) <math/mul.html>`_
: Compute the element-wise multiplication.
......@@ -148,11 +145,11 @@ dragon.math
:hidden:
math/abs
math/accumulate
math/add
math/affine
math/argmax
math/argmin
math/axpby
math/ceil
math/clip
math/cos
......@@ -177,7 +174,6 @@ dragon.math
math/min
math/minimum
math/moments
math/moving_average
math/mul
math/negative
math/not_equal
......
accumulate
==========
axpby
=====
.. autofunction:: dragon.math.accumulate
.. autofunction:: dragon.math.axpby
.. raw:: html
......
moving_average
==============
.. autofunction:: dragon.math.moving_average
.. raw:: html
<style>
h1:before {
content: "dragon.math.";
color: #103d3e;
}
</style>
dragon.updaters
===============
dragon.optimizers
=================
.. only:: html
Classes
-------
`class Adam <updaters/Adam.html>`_
: The updater which implements Adam algorithm.
`class Adam <optimizers/Adam.html>`_
: The optimizer to apply the Adam algorithm.
`[Kingma & Ba, 2014] <https://arxiv.org/abs/1412.6980>`_.
`class Nesterov <updaters/Nesterov.html>`_
: The updater which implements NesterovSGD algorithm.
`class Nesterov <optimizers/Nesterov.html>`_
: The optimizer to apply the NesterovSGD algorithm.
`[Sutskever et al., 2013] <http://www.cs.toronto.edu/~hinton/absps/momentum.pdf>`_.
`class RMSProp <updaters/RMSProp.html>`_
: The updater which implements RMSprop algorithm.
`class RMSProp <optimizers/RMSprop.html>`_
: The optimizer to apply the RMSprop algorithm.
`[Hinton et al., 2013] <http://www.cs.utoronto.ca/~bonner/courses/2016s/csc321/lectures/lec6.pdf>`_.
`class SGD <updaters/SGD.html>`_
: The updater which implements MomentumSGD algorithm.
`class SGD <optimizers/SGD.html>`_
: The optimizer to apply the MomentumSGD algorithm.
`[Polyak, 1964] <https://doi.org/10.1016/0041-5553(64)90137-5>`_.
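As a quick orientation for the renamed classes, a hedged usage sketch; the keyword arguments simply mirror what the solver classes in this commit pass ('base_lr', 'momentum', 'beta1', 'beta2', 'eps') and are assumptions about the public signature rather than documented API:

from dragon import optimizers

# Hypothetical construction; argument names are taken from the solver hunks above.
sgd = optimizers.SGD(base_lr=0.01, momentum=0.9)
adam = optimizers.Adam(base_lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8)
# Both expose Optimizer.apply_gradients(...), documented in the method sections below.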
.. toctree::
:hidden:
updaters/Adam
updaters/Nesterov
updaters/RMSProp
updaters/SGD
optimizers/Adam
optimizers/Nesterov
optimizers/Optimizer
optimizers/RMSprop
optimizers/SGD
.. raw:: html
......
Adam
====
.. autoclass:: dragon.updaters.Adam
.. autoclass:: dragon.optimizers.Adam
__init__
--------
.. automethod:: dragon.updaters.Adam.__init__
.. automethod:: dragon.optimizers.Adam.__init__
Methods
-------
apply_gradients
################
.. automethod:: dragon.updaters.Updater.apply_gradients
.. automethod:: dragon.optimizers.Optimizer.apply_gradients
:noindex:
.. raw:: html
<style>
h1:before {
content: "dragon.updaters.";
content: "dragon.optimizers.";
color: #103d3e;
}
</style>
Nesterov
========
.. autoclass:: dragon.updaters.Nesterov
.. autoclass:: dragon.optimizers.Nesterov
__init__
--------
.. automethod:: dragon.updaters.Nesterov.__init__
.. automethod:: dragon.optimizers.Nesterov.__init__
Methods
-------
apply_gradients
################
.. automethod:: dragon.updaters.Updater.apply_gradients
.. automethod:: dragon.optimizers.Optimizer.apply_gradients
:noindex:
.. raw:: html
<style>
h1:before {
content: "dragon.updaters.";
content: "dragon.optimizers.";
color: #103d3e;
}
</style>
Optimizer
=========
.. autoclass:: dragon.optimizers.Optimizer
__init__
--------
.. automethod:: dragon.optimizers.Optimizer.__init__
Methods
-------
apply_gradients
################
.. automethod:: dragon.optimizers.Optimizer.apply_gradients
.. raw:: html
<style>
h1:before {
content: "dragon.optimizers.";
color: #103d3e;
}
</style>
RMSProp
RMSprop
=======
.. autoclass:: dragon.updaters.RMSProp
.. autoclass:: dragon.optimizers.RMSprop
__init__
--------
.. automethod:: dragon.updaters.RMSProp.__init__
.. automethod:: dragon.optimizers.RMSprop.__init__
Methods
-------
apply_gradients
################
.. automethod:: dragon.updaters.Updater.apply_gradients
.. automethod:: dragon.optimizers.Optimizer.apply_gradients
:noindex:
.. raw:: html
<style>
h1:before {
content: "dragon.updaters.";
content: "dragon.optimizers.";
color: #103d3e;
}
</style>
SGD
===
.. autoclass:: dragon.updaters.SGD
.. autoclass:: dragon.optimizers.SGD
__init__
--------
.. automethod:: dragon.updaters.SGD.__init__
.. automethod:: dragon.optimizers.SGD.__init__
Methods
-------
apply_gradients
################
.. automethod:: dragon.updaters.Updater.apply_gradients
.. automethod:: dragon.optimizers.Optimizer.apply_gradients
:noindex:
.. raw:: html
<style>
h1:before {
content: "dragon.updaters.";
content: "dragon.optimizers.";
color: #103d3e;
}
</style>
......@@ -14,15 +14,15 @@ For using it, import as follows:
However, it alone will not help you much if you do not want to learn it.
We have extended it with following programming styles:
To address this, we have designed several diverse programming styles for you:
Dragon
######
*Dragon* takes a very light-weight programming style.
*Dragon* is designed as a light-weight but professional style.
Our goal is to reduce unnecessary structures and interfaces. Therefore,
besides feeding or fetching data, the only remaining task is to design a function.
Native interfaces are encouraged to manipulate the backend engine
to perform the computation flexibly with data feeding or fetching.
This style involves the following components:
......@@ -38,15 +38,15 @@ Dragon
* `dragon.math <dragon/math.html>`_
* `dragon.metrics <dragon/metrics.html>`_
* `dragon.nn <dragon/nn.html>`_
* `dragon.optimizers <dragon/optimizers.html>`_
* `dragon.random <dragon/random.html>`_
* `dragon.updaters <dragon/updaters.html>`_
* `dragon.vision <dragon/vision.html>`_
* `dragon.workspace <dragon/workspace.html>`_
* `dragon.vision <dragon/vision.html>`_
Caffe
#####
*Caffe* is one of the most famous deep learning framework for Computer Vision.
*Caffe* is one of the most famous frameworks for computer vision.
Our work is very different from the official Python wrappers, a.k.a. the *PyCaffe*,
which comes from the exports of *BoostPython*
......@@ -102,7 +102,7 @@ PyTorch
*PyTorch* provides straightforward operations for research prototyping.
To bridge it, our *JIT* traces and dispatches the expressions,
To bridge it, our *JIT* traces and dispatches the operations,
as well as rewriting the *GC* (Garbage Collection) to reuse
memory and operators in turn.
......@@ -168,52 +168,52 @@ Modules
.. only:: html
`Module autograph <dragon/autograph.html>`_
: Public API for ``dragon.autograph`` namespace.
: Native API for ``dragon.autograph`` namespace.
`Module bitwise <dragon/bitwise.html>`_
: Public API for ``dragon.bitwise`` namespace.
: Native API for ``dragon.bitwise`` namespace.
`Module cuda <dragon/cuda.html>`_
: Public API for ``dragon.cuda`` namespace.
: Native API for ``dragon.cuda`` namespace.
`Module distributed <dragon/distributed.html>`_
: Public API for ``dragon.distributed`` namespace.
: Native API for ``dragon.distributed`` namespace.
`Module dlpack <dragon/dlpack.html>`_
: Public API for ``dragon.dlpack`` namespace.
: Native API for ``dragon.dlpack`` namespace.
`Module io <dragon/io.html>`_
: Public API for ``dragon.io`` namespace.
: Native API for ``dragon.io`` namespace.
`Module logging <dragon/logging.html>`_
: Public API for ``dragon.logging`` namespace.
: Native API for ``dragon.logging`` namespace.
`Module losses <dragon/losses.html>`_
: Public API for ``dragon.losses`` namespace.
: Native API for ``dragon.losses`` namespace.
`Module math <dragon/math.html>`_
: Public API for ``dragon.math`` namespace.
: Native API for ``dragon.math`` namespace.
`Module metrics <dragon/metrics.html>`_
: Public API for ``dragon.metrics`` namespace.
: Native API for ``dragon.metrics`` namespace.
`Module nn <dragon/nn.html>`_
: Public API for ``dragon.nn`` namespace.
: Native API for ``dragon.nn`` namespace.
`Module optimizers <dragon/optimizers.html>`_
: Native API for ``dragon.optimizers`` namespace.
`Module random <dragon/random.html>`_
: Public API for ``dragon.random`` namespace.
: Native API for ``dragon.random`` namespace.
`Module updaters <dragon/updaters.html>`_
: Public API for ``dragon.updaters`` namespace.
`Module workspace <dragon/workspace.html>`_
: Native API for ``dragon.workspace`` namespace.
`Module vision <dragon/vision.html>`_
: Public API for ``dragon.vision`` namespace.
: Native API for ``dragon.vision`` namespace.
`Module workspace <dragon/workspace.html>`_
: Public API for ``dragon.workspace`` namespace.
: Native API for ``dragon.workspace`` namespace.
`Module vm.caffe <caffe.html>`_
: Virtual API for ``caffe`` namespace.
......@@ -317,10 +317,10 @@ Modules
dragon/math
dragon/metrics
dragon/nn
dragon/optimizers
dragon/random
dragon/updaters
dragon/vision
dragon/workspace
dragon/vision
caffe
caffe/layers
dali
......
......@@ -30,9 +30,6 @@ vm.torch
`abs(...) <torch/abs.html>`_
: Compute the absolute value of input.
`accumulate(...) <torch/accumulate.html>`_
: Compute the element-wise accumulation from input to output.
`add(...) <torch/add.html>`_
: Compute the element-wise addition.
......@@ -45,6 +42,9 @@ vm.torch
`argmin(...) <torch/argmin.html>`_
: Return the indices of minimum elements along the given axis.
`axpby(...) <torch/axpby.html>`_
: Compute the element-wise addition from input to output.
`bitwise_not(...) <torch/bitwise_not.html>`_
: Compute the element-wise NOT bitwise operation.
......@@ -254,11 +254,11 @@ vm.torch
:hidden:
torch/abs
torch/accumulate
torch/add
torch/arange
torch/argmax
torch/argmin
torch/axpby
torch/bitwise_not
torch/bitwise_xor
torch/cat
......
accumulate
==========
axpby
=====
.. autofunction:: dragon.vm.torch.accumulate
.. autofunction:: dragon.vm.torch.axpby
.. raw:: html
......
......@@ -50,18 +50,18 @@ class CUDAObject {
*/
if (stream) cudaStreamDestroy(stream);
}
for (auto& e : cublas_handles_[i])
if (e) {
CUBLAS_CHECK(cublasDestroy_v2(e));
for (auto& handle : cublas_handles_[i])
if (handle) {
CUBLAS_CHECK(cublasDestroy(handle));
}
#ifdef USE_CUDNN
for (auto& e : cudnn_handles_[i])
if (e) {
CUDNN_CHECK(cudnnDestroy(e));
for (auto& handle : cudnn_handles_[i])
if (handle) {
CUDNN_CHECK(cudnnDestroy(handle));
}
#endif
#ifdef USE_NCCL
for (auto& e : nccl_comms_[i]) {
for (auto& comm : nccl_comms_[i]) {
/*!
* Temporarily disable the comm destroying,
* to avoid an unhandled error.
......@@ -74,17 +74,18 @@ class CUDAObject {
/*! \brief Return the specified cublas handle */
cublasHandle_t cublas_handle(int device_id, int stream_id) {
auto& handles = cublas_handles_[device_id];
if (handles.size() <= (unsigned)stream_id)
if (handles.size() <= (unsigned)stream_id) {
handles.resize(stream_id + 1, nullptr);
}
if (!handles[stream_id]) {
CUDADeviceGuard guard(device_id);
CUBLAS_CHECK(cublasCreate_v2(&handles[stream_id]));
CUBLAS_CHECK(
cublasSetStream_v2(handles[stream_id], stream(device_id, stream_id)));
CUBLAS_CHECK(cublasCreate(&handles[stream_id]));
auto& handle = handles[stream_id];
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasSetStream(handle, stream(device_id, stream_id)));
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(
cublasSetMathMode(handles[stream_id], CUBLAS_TENSOR_OP_MATH));
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
}
#endif
}
......@@ -95,13 +96,14 @@ class CUDAObject {
#ifdef USE_CUDNN
cudnnHandle_t cudnn_handle(int device_id, int stream_id) {
auto& handles = cudnn_handles_[device_id];
if (handles.size() <= (unsigned)stream_id)
if (handles.size() <= (unsigned)stream_id) {
handles.resize(stream_id + 1, nullptr);
}
if (!handles[stream_id]) {
CUDADeviceGuard guard(device_id);
CUDNN_CHECK(cudnnCreate(&handles[stream_id]));
CUDNN_CHECK(
cudnnSetStream(handles[stream_id], stream(device_id, stream_id)));
auto& handle = handles[stream_id];
CUDNN_CHECK(cudnnSetStream(handle, stream(device_id, stream_id)));
}
return handles[stream_id];
}
......@@ -144,7 +146,7 @@ class CUDAObject {
if (!streams[stream_id]) {
CUDADeviceGuard guard(device_id);
unsigned int flags =
!stream_id ? cudaStreamDefault : cudaStreamNonBlocking;
stream_id == 0 ? cudaStreamDefault : cudaStreamNonBlocking;
CUDA_CHECK(cudaStreamCreateWithFlags(&streams[stream_id], flags));
}
return streams[stream_id];
......
......@@ -80,7 +80,7 @@ Tensor* OperatorBase::Output(int i, const vec32_t& inputs) {
}
Tensor* OperatorBase::Buffer(const string& name) {
return ws()->CreateTensor(unique_name(name));
return ws()->CreateTensor("/share/buffer/" + handle_ + "/" + name);
}
string OperatorBase::TypeString(const Tensor& tensor, const Set<string>& types)
......
......@@ -133,11 +133,6 @@ class DRAGON_API OperatorBase {
return handle_;
}
/*! \brief Return the unique name in this operator */
const string unique_name(const string& name) const {
return "/mnt/" + handle_ + "/" + name;
}
/*! \brief Return the stored def */
const OperatorDef& def() const {
return def_;
......@@ -268,7 +263,6 @@ OperatorBase* NewOperator(const OperatorDef&, Workspace*);
using OperatorBase::dtype; \
using OperatorBase::data_format; \
using OperatorBase::handle; \
using OperatorBase::unique_name; \
using OperatorBase::def; \
using OperatorBase::ws
......@@ -277,17 +271,18 @@ OperatorBase* NewOperator(const OperatorDef&, Workspace*);
using Operator<Context>::allow_run; \
using Operator<Context>::ctx
#define STORE_INPUT_SPEC(i) \
*(ws()->CreateTensor(unique_name("Input[" + std::to_string(i) + "]")) \
->ReshapeLike(Input(i)) \
#define STORE_INPUT_SPEC(i) \
*(Buffer("X_spec:" + std::to_string(i)) \
->ReshapeLike(Input(i)) \
->set_meta(Input(i).meta()))
#define RESTORE_INPUT_SPEC(i) \
*(ws()->GetTensor(unique_name("Input[" + std::to_string(i) + "]")))
*(ws()->GetTensor( \
"/share/buffer/" + handle() + "/X_spec:" + std::to_string(i)))
/* Dispatchers */
#define XIsType(x, type) x.template IsType<type>()
#define XIsType(X, type) X.template IsType<type>()
template <typename... Types>
struct TensorTypes {};
......
......@@ -53,14 +53,11 @@ __global__ void _EluGrad(
const T* y,
T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = dy[i] *
(
#if __CUDA_ARCH__ >= 350
__ldg(y + i) > T(0) ? T(1) : alpha + __ldg(y + i)
dx[i] = dy[i] * (__ldg(y + i) > T(0) ? T(1) : alpha + __ldg(y + i));
#else
y[i] > T(0) ? T(1) : (alpha + y[i])
dx[i] = dy[i] * (y[i] > T(0) ? T(1) : (alpha + y[i]));
#endif
);
}
}
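Both branches of the #if above compute the same ELU backward rule; a scalar reference sketch (using that y = alpha * (exp(x) - 1) for x <= 0, so the local gradient alpha * exp(x) equals alpha + y):

def elu_grad_ref(dy, y, alpha=1.0):
    # Local gradient is 1 in the positive region, alpha + y otherwise.
    return dy * (1.0 if y > 0 else alpha + y)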
......
......@@ -14,28 +14,28 @@ void _Softmax(
const int inner_dim,
const T* x,
T* y) {
int row_ofs, col_ofs, yi;
int row_offset, col_offset, yi;
auto x_stride = axis_dim * inner_dim;
for (int i = 0; i < outer_dim; ++i) {
row_ofs = i * axis_dim * inner_dim;
row_offset = i * axis_dim * inner_dim;
for (int j = 0; j < inner_dim; ++j) {
col_ofs = row_ofs + j;
T val = x[col_ofs];
col_offset = row_offset + j;
T val = x[col_offset];
for (int k = 1; k < axis_dim; ++k) {
yi = col_ofs + k * inner_dim;
yi = col_offset + k * inner_dim;
val = std::max(val, x[yi]);
}
for (int k = 0; k < axis_dim; ++k) {
yi = col_ofs + k * inner_dim;
yi = col_offset + k * inner_dim;
y[yi] = std::exp(x[yi] - val);
}
val = y[col_ofs];
val = y[col_offset];
for (int k = 1; k < axis_dim; ++k) {
yi = col_ofs + k * inner_dim;
yi = col_offset + k * inner_dim;
val += y[yi];
}
for (int k = 0; k < axis_dim; ++k) {
yi = col_ofs + k * inner_dim;
yi = col_offset + k * inner_dim;
y[yi] /= val;
}
}
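The loop above is the standard numerically-stable softmax along the axis dimension: subtract the running maximum, exponentiate, then normalize by the sum. A compact sketch over one (outer, inner) slice:

import math

def softmax_slice(vals):
    m = max(vals)                        # subtract the max for numerical stability
    exps = [math.exp(v - m) for v in vals]
    s = sum(exps)
    return [e / s for e in exps]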
......@@ -60,19 +60,19 @@ void _SoftmaxGrad(
const T* dy,
const T* y,
T* dx) {
int row_ofs, col_ofs, yi;
int row_offset, col_offset, yi;
auto x_stride = axis_dim * inner_dim;
for (int i = 0; i < outer_dim; ++i) {
row_ofs = i * axis_dim * inner_dim;
row_offset = i * axis_dim * inner_dim;
for (int j = 0; j < inner_dim; ++j) {
col_ofs = row_ofs + j;
T val = dy[col_ofs] * y[col_ofs];
col_offset = row_offset + j;
T val = dy[col_offset] * y[col_offset];
for (int k = 1; k < axis_dim; ++k) {
yi = col_ofs + k * inner_dim;
yi = col_offset + k * inner_dim;
val += dy[yi] * y[yi];
}
for (int k = 0; k < axis_dim; ++k) {
yi = col_ofs + k * inner_dim;
yi = col_offset + k * inner_dim;
dx[yi] = (dy[yi] - val) * y[yi];
}
}
......
......@@ -53,11 +53,11 @@ void _CumSumReverse(
CPUContext* ctx) {
const int kStart = axis_dim - 1;
for (int n = 0; n < outer_dim; ++n) {
const int n_ofs = n * axis_dim;
const int n_offset = n * axis_dim;
for (int m = kStart; m >= 0; --m) {
const int nm_ofs = (n_ofs + m) * inner_dim;
const int nm_offset = (n_offset + m) * inner_dim;
for (int k = 0; k < inner_dim; ++k) {
const int i = nm_ofs + k;
const int i = nm_offset + k;
if (m < kStart) {
const int j = i + inner_dim;
y[i] = y[j] + x[exclusive ? j : i];
......
......@@ -25,9 +25,9 @@ void _SetEye(const int n, const int m, const int k, T* y) {
const int n, const int m, const int k, T* y, CPUContext* ctx) { \
math::Set(n* m, cast::to<T>(0.f), y, ctx); \
if (k > 0) { \
_SetEye(n - k, m, k, y); \
if (m - k > 0) _SetEye(m - k, m, k, y); \
} else { \
_SetEye(n + k, m, 0, y - k * m); \
if (n + k > 0) _SetEye(n + k, m, 0, y - k * m); \
} \
}
......
......@@ -20,9 +20,9 @@ __global__ void _SetEye(const int n, const int m, const int k, T* y) {
template <>
__global__ void _SetEye<half>(const int n, const int m, const int k, half* y) {
const half kZero = __float2half(1.f);
const half kOne = __float2half(1.f);
CUDA_1D_KERNEL_LOOP(i, n) {
y[i * m + k + i] = kZero;
y[i * m + k + i] = kOne;
}
}
......@@ -39,26 +39,34 @@ void Eye<float16, CUDAContext>(
CUDAContext* ctx) {
math::Set(n * m, cast::to<float16>(0.f), y, ctx);
if (k > 0) {
_SetEye<<<CUDA_BLOCKS(n - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n - k, m, k, reinterpret_cast<half*>(y));
if (m - k > 0) {
_SetEye<<<CUDA_BLOCKS(m - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
m - k, m, k, reinterpret_cast<half*>(y));
}
} else {
_SetEye<<<CUDA_BLOCKS(n + k), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n + k, m, 0, reinterpret_cast<half*>(y - k * m));
if (n + k > 0) {
_SetEye<<<CUDA_BLOCKS(n + k), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n + k, m, 0, reinterpret_cast<half*>(y - k * m));
}
}
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Eye<T, CUDAContext>( \
const int n, const int m, const int k, T* y, CUDAContext* ctx) { \
math::Set(n* m, T(0), y, ctx); \
if (k > 0) { \
_SetEye<<<CUDA_BLOCKS(n - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
n - k, m, k, y); \
} else { \
_SetEye<<<CUDA_BLOCKS(n + k), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
n + k, m, 0, y - k * m); \
} \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Eye<T, CUDAContext>( \
const int n, const int m, const int k, T* y, CUDAContext* ctx) { \
math::Set(n* m, T(0), y, ctx); \
if (k > 0) { \
if (m - k > 0) { \
_SetEye<<<CUDA_BLOCKS(m - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
m - k, m, k, y); \
} \
} else { \
if (n + k > 0) { \
_SetEye<<<CUDA_BLOCKS(n + k), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
n + k, m, 0, y - k * m); \
} \
} \
}
DEFINE_KERNEL_LAUNCHER(bool);
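The added m - k > 0 / n + k > 0 guards skip empty launches when the diagonal offset pushes the band completely outside the matrix. A plain-Python reference of the intended result, assuming numpy.eye(n, m, k)-style semantics for the op:

def eye(n, m, k=0):
    """Return an n x m matrix with ones on the k-th diagonal."""
    y = [[0.0] * m for _ in range(n)]
    for i in range(n):
        j = i + k
        if 0 <= j < m:
            y[i][j] = 1.0
    return y

# eye(3, 3, k=4) and eye(3, 3, k=-4) contain no ones at all, which is why the
# kernels above must not launch when the remaining element count is non-positive.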
......
......@@ -35,21 +35,22 @@ void _BroadcastLossGrad<float16>(
} // namespace
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ReduceLoss<T, CPUContext>( \
const int count, \
const int num_masks, \
const float normalizer, \
const T* x, \
const int* mask, \
T* y, \
CPUContext* ctx) { \
float inv_scale = std::max( \
1e-5F, \
num_masks > 0 ? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \
y[0] = math::Sum(count, 1.f / inv_scale, x, ctx); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ReduceLoss<T, CPUContext>( \
const int count, \
const int num_masks, \
const float normalizer, \
const T* x, \
const int* mask, \
T* y, \
CPUContext* ctx) { \
float inv_scale = std::max( \
1e-5F, \
num_masks > 0 && normalizer < 0.f \
? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \
y[0] = math::Sum(count, 1.f / inv_scale, x, ctx); \
}
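The updated launcher only derives the scale from the mask when no explicit normalizer is given (signalled by a negative value); a minimal sketch of the new normalization rule, with names mirroring the macro arguments:

def reduce_loss(losses, mask=None, normalizer=-1.0):
    # Use the number of valid (non-ignored) elements when normalizer < 0,
    # otherwise trust the caller-provided normalizer.
    if mask is not None and normalizer < 0.0:
        scale = float(sum(mask))
    else:
        scale = normalizer
    scale = max(scale, 1e-5)  # avoid dividing by zero
    return sum(losses) / scale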
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
......@@ -64,8 +65,9 @@ void _BroadcastLossGrad<float16>(
CPUContext* ctx) { \
float inv_scale = std::max( \
1e-5F, \
num_masks > 0 ? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \
num_masks > 0 && normalizer < 0.f \
? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \
math::Scale(count, cast::to<float>(dy[0]) / inv_scale, dx, dx, ctx); \
} \
template <> \
......
......@@ -152,15 +152,15 @@ __global__ void _ReduceLossGradWithMask<half>(
template <typename T>
__global__ void _BroadcastLossGrad(
const int nthreads,
const int rows,
const int cols,
const int dim1,
const int dim2,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
dx[i] *= __ldg(dy + (i / rows) * cols + (i % cols));
dx[i] *= __ldg(dy + (i / dim1) * dim2 + (i % dim2));
#else
dx[i] *= dy[(i / rows) * cols + (i % cols)];
dx[i] *= dy[(i / dim1) * dim2 + (i % dim2)];
#endif
}
}
......@@ -168,18 +168,18 @@ __global__ void _BroadcastLossGrad(
template <>
__global__ void _BroadcastLossGrad<half>(
const int nthreads,
const int rows,
const int cols,
const int dim1,
const int dim2,
const half* dy,
half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
dx[i] = __float2half(
__half2float(dx[i]) *
__half2float(__ldg(dy + (i / rows) * cols + (i % cols))));
__half2float(__ldg(dy + (i / dim1) * dim2 + (i % dim2))));
#else
dx[i] = __float2half(
__half2float(dx[i]) * __half2float(dy[(i / rows) * cols + (i % cols)]));
__half2float(dx[i]) * __half2float(dy[(i / dim1) * dim2 + (i % dim2)]));
#endif
}
}
......@@ -197,7 +197,7 @@ void ReduceLoss<float16, CUDAContext>(
const int* mask,
float16* y,
CUDAContext* ctx) {
if (num_masks > 0) {
if (num_masks > 0 && normalizer < 0.f) {
_ReduceLossWithMask<<<1, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
num_masks,
reinterpret_cast<const half*>(x),
......@@ -221,7 +221,7 @@ void ReduceLossGrad<float16, CUDAContext>(
const int* mask,
float16* dx,
CUDAContext* ctx) {
if (num_masks > 0) {
if (num_masks > 0 && normalizer < 0.f) {
_ReduceMask<<<1, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
num_masks, const_cast<int*>(mask));
_ReduceLossGradWithMask<<<
......@@ -254,16 +254,15 @@ void BroadcastLossGrad<float16, CUDAContext>(
const float16* dy,
float16* dx,
CUDAContext* ctx) {
auto rows = outer_dim * axis_dim, cols = inner_dim;
auto nthreads = rows * cols;
auto nthreads = outer_dim * axis_dim * inner_dim;
_BroadcastLossGrad<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
rows,
cols,
axis_dim * inner_dim,
inner_dim,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
} // BroadcastLossGrad
......@@ -278,7 +277,7 @@ void BroadcastLossGrad<float16, CUDAContext>(
const int* mask, \
T* y, \
CUDAContext* ctx) { \
if (num_masks > 0) { \
if (num_masks > 0 && normalizer < 0.f) { \
_ReduceLossWithMask<<<1, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
num_masks, x, mask, y); \
} else { \
......@@ -297,7 +296,7 @@ void BroadcastLossGrad<float16, CUDAContext>(
const int* mask, \
T* dx, \
CUDAContext* ctx) { \
if (num_masks > 0) { \
if (num_masks > 0 && normalizer < 0.f) { \
_ReduceMask<<<1, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
num_masks, const_cast<int*>(mask)); \
_ReduceLossGradWithMask<<< \
......@@ -322,13 +321,13 @@ void BroadcastLossGrad<float16, CUDAContext>(
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
auto rows = outer_dim * axis_dim, cols = inner_dim; \
auto nthreads = rows * cols; \
auto nthreads = outer_dim * axis_dim * inner_dim; \
_BroadcastLossGrad<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>(nthreads, rows, cols, dy, dx); \
ctx->cuda_stream()>>>( \
nthreads, axis_dim * inner_dim, inner_dim, dy, dx); \
}
DEFINE_KERNEL_LAUNCHER(float);
......
......@@ -7,31 +7,31 @@ namespace dragon {
namespace kernel {
template <>
void MixedPrecL2Decay<float16, CPUContext>(
void MixedPrecL2Penalty<float16, CPUContext>(
const int count,
const float alpha,
const float16* w,
const float16* x,
float* dx,
CPUContext* ctx) {
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx[i] += (cast::to<float>(w[i]) * alpha);
dx[i] += (cast::to<float>(x[i]) * alpha);
}
}
template <>
void MixedPrecUpdate<float16, CPUContext>(
const int count,
const float* updates,
float16* w,
const float* dx,
float16* x,
CPUContext* ctx) {
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
w[i] = cast::to<float16>(cast::to<float>(w[i]) - updates[i]);
x[i] = cast::to<float16>(cast::to<float>(x[i]) - dx[i]);
}
}
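Together the two kernels implement the usual mixed-precision pattern: the fp16 weights feed an fp32 gradient buffer (the L2 penalty is accumulated in fp32), and the final update is applied in fp32 before casting back to fp16. A scalar sketch (here dx stands for the already-scaled update produced by the optimizer in between):

import numpy as np

def mixed_prec_step(x_fp16, dx_fp32, alpha):
    # MixedPrecL2Penalty: accumulate the L2 term into the fp32 gradient.
    dx_fp32 = dx_fp32 + np.float32(x_fp16) * alpha
    # MixedPrecUpdate: apply the update in fp32, then cast back to fp16.
    x_fp16 = np.float16(np.float32(x_fp16) - dx_fp32)
    return x_fp16, dx_fp32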
......
......@@ -9,24 +9,19 @@ namespace kernel {
namespace {
__global__ void _MixedPrecL2DecayHalf(
__global__ void _MixedPrecL2Penalty(
const int nthreads,
const float alpha,
const half* w,
const half* x,
float* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
dx[i] += __half2float(w[i]) * alpha;
#endif
dx[i] += __half2float(x[i]) * alpha;
}
}
__global__ void
_MixedPrecUpdateHalf(const int nthreads, const float* updates, half* w) {
__global__ void _MixedPrecUpdate(const int nthreads, const float* dx, half* x) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
w[i] = __float2half(__half2float(w[i]) - updates[i]);
#endif
x[i] = __float2half(__half2float(x[i]) - dx[i]);
}
}
......@@ -35,30 +30,27 @@ _MixedPrecUpdateHalf(const int nthreads, const float* updates, half* w) {
/* ------------------- Launcher Separator ------------------- */
template <>
void MixedPrecL2Decay<float16, CUDAContext>(
void MixedPrecL2Penalty<float16, CUDAContext>(
const int count,
const float alpha,
const float16* w,
const float16* x,
float* dx,
CUDAContext* ctx) {
_MixedPrecL2DecayHalf<<<
_MixedPrecL2Penalty<<<
CUDA_BLOCKS(count),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(count, alpha, reinterpret_cast<const half*>(w), dx);
ctx->cuda_stream()>>>(count, alpha, reinterpret_cast<const half*>(x), dx);
}
template <>
void MixedPrecUpdate<float16, CUDAContext>(
const int count,
const float* updates,
float16* w,
const float* dx,
float16* x,
CUDAContext* ctx) {
_MixedPrecUpdateHalf<<<
CUDA_BLOCKS(count),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(count, updates, reinterpret_cast<half*>(w));
_MixedPrecUpdate<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, dx, reinterpret_cast<half*>(x));
}
} // namespace kernel
......
......@@ -116,15 +116,13 @@ __global__ void _AvgPool2dGradNCHW(
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(xi, nthreads) {
const int w = xi % W;
const int h = (xi / W) % H;
const int w = xi % W + pad_w;
const int h = (xi / W) % H + pad_h;
const int c = (xi / W / H) % C;
const int n = xi / W / H / C;
const int phstart =
(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int pwstart =
(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int phend = min(h / stride_h + 1, out_h);
const int pwend = min(w / stride_w + 1, out_w);
......@@ -164,14 +162,12 @@ __global__ void _AvgPool2dGradNHWC(
T* dx) {
CUDA_1D_KERNEL_LOOP(xi, nthreads) {
const int c = xi % C;
const int w = (xi / C) % W;
const int h = (xi / C / W) % H;
const int w = (xi / C) % W + pad_w;
const int h = (xi / C / W) % H + pad_h;
const int n = xi / C / W / H;
const int phstart =
(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
const int pwstart =
(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int phend = min(h / stride_h + 1, out_h);
const int pwend = min(w / stride_w + 1, out_w);
......
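Folding the padding into h and w up front leaves the pooling-window range computation unchanged; a small sketch of the equivalence for one axis (hypothetical helper, not from the source):

def pool_window_range(h, pad, kernel, stride, out_size):
    """Output rows whose pooling window covers padded input row h."""
    hp = h + pad  # the new code adds the pad once, up front
    start = 0 if hp < kernel else (hp - kernel) // stride + 1
    end = min(hp // stride + 1, out_size)
    return start, end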
......@@ -30,8 +30,8 @@ void _Im2Col2dNCHW(
const T* im,
T* col) {
int ih, iw;
const int im_ofs = H * W;
for (int c = 0; c < C; ++c, im += im_ofs) {
const int im_offset = H * W;
for (int c = 0; c < C; ++c, im += im_offset) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
ih = -pad_h + kh * dilation_h;
......@@ -117,8 +117,8 @@ void _Col2Im2dNCHW(
const T* col,
T* im) {
int ih, iw;
const int im_ofs = H * W;
for (int c = 0; c < C; ++c, im += im_ofs) {
const int im_offset = H * W;
for (int c = 0; c < C; ++c, im += im_offset) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
ih = -pad_h + kh * dilation_h;
......
......@@ -27,13 +27,13 @@ void _DepthwiseConv2dNCHW(
T* y) {
T sum_val;
int ih, iw, xi, wi;
int yc_ofs, xc_start, yc_start;
int yc_offset, xc_start, yc_start;
int ih_start, yh_start, iw_start;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
yc_ofs = n * C + c;
xc_start = yc_ofs * H * W;
yc_start = yc_ofs * out_h;
yc_offset = n * C + c;
xc_start = yc_offset * H * W;
yc_start = yc_offset * out_h;
for (int oh = 0; oh < out_h; ++oh) {
ih_start = oh * stride_h - pad_h;
yh_start = (yc_start + oh) * out_w;
......
......@@ -46,7 +46,7 @@ void _ResizeLinearNCHW(
std::array<int, 4> idx = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w};
float h_in, w_in, u, v, t, b, tl, tr, bl, br;
int ti, bi, li, ri, ofs, h_max = H - 1, w_max = W - 1;
int ti, bi, li, ri, offset, h_max = H - 1, w_max = W - 1;
for (int i = 0; i < count; ++i) {
h_in = TransformCoordinate(idx[2], scale_h, align_corners);
w_in = TransformCoordinate(idx[3], scale_w, align_corners);
......@@ -54,11 +54,11 @@ void _ResizeLinearNCHW(
bi = (h_in < h_max) ? std::ceil(h_in) : h_max;
ri = (w_in < w_max) ? std::ceil(w_in) : w_max;
v = h_in - ti, u = w_in - li;
ofs = (idx[0] * C + idx[1]) * H;
tl = (float)x[(ofs + ti) * W + li];
tr = (float)x[(ofs + ti) * W + ri];
bl = (float)x[(ofs + bi) * W + li];
br = (float)x[(ofs + bi) * W + ri];
offset = (idx[0] * C + idx[1]) * H;
tl = (float)x[(offset + ti) * W + li];
tr = (float)x[(offset + ti) * W + ri];
bl = (float)x[(offset + bi) * W + li];
br = (float)x[(offset + bi) * W + ri];
t = tl + (tr - tl) * u;
b = bl + (br - bl) * u;
y[i] = static_cast<T>(t + (b - t) * v);
......@@ -83,7 +83,7 @@ void _ResizeLinearNHWC(
std::array<int, 4> idx = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C};
float h_in, w_in, u, v, t, b, tl, tr, bl, br;
int ti, bi, li, ri, ofs, h_max = H - 1, w_max = W - 1;
int ti, bi, li, ri, offset, h_max = H - 1, w_max = W - 1;
for (int i = 0; i < count; ++i) {
h_in = TransformCoordinate(idx[1], scale_h, align_corners);
w_in = TransformCoordinate(idx[2], scale_w, align_corners);
......@@ -91,11 +91,11 @@ void _ResizeLinearNHWC(
bi = (h_in < h_max) ? std::ceil(h_in) : h_max;
ri = (w_in < w_max) ? std::ceil(w_in) : w_max;
v = h_in - ti, u = w_in - li;
ofs = idx[0] * H;
tl = (float)x[((ofs + ti) * W + li) * C + idx[3]];
tr = (float)x[((ofs + ti) * W + ri) * C + idx[3]];
bl = (float)x[((ofs + bi) * W + li) * C + idx[3]];
br = (float)x[((ofs + bi) * W + ri) * C + idx[3]];
offset = idx[0] * H;
tl = (float)x[((offset + ti) * W + li) * C + idx[3]];
tr = (float)x[((offset + ti) * W + ri) * C + idx[3]];
bl = (float)x[((offset + bi) * W + li) * C + idx[3]];
br = (float)x[((offset + bi) * W + ri) * C + idx[3]];
t = tl + (tr - tl) * u;
b = bl + (br - bl) * u;
y[i] = static_cast<T>(t + (b - t) * v);
......@@ -120,7 +120,7 @@ void _ResizeLinearGradNCHW(
std::array<int, 4> idx = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w};
float h_in, w_in, u, v, dt, db, tl, tr, bl, br;
int ti, bi, li, ri, ofs, h_max = H - 1, w_max = W - 1;
int ti, bi, li, ri, offset, h_max = H - 1, w_max = W - 1;
for (int i = 0; i < count; ++i) {
h_in = TransformCoordinate(idx[2], scale_h, align_corners);
w_in = TransformCoordinate(idx[3], scale_w, align_corners);
......@@ -128,13 +128,13 @@ void _ResizeLinearGradNCHW(
bi = (h_in < h_max) ? std::ceil(h_in) : h_max;
ri = (w_in < w_max) ? std::ceil(w_in) : w_max;
v = h_in - ti, u = w_in - li;
ofs = (idx[0] * C + idx[1]) * H;
offset = (idx[0] * C + idx[1]) * H;
dt = (1.f - v) * static_cast<float>(dy[i]);
db = v * static_cast<float>(dy[i]);
dx[(ofs + ti) * W + li] += (1.f - u) * dt; // tl
dx[(ofs + ti) * W + ri] += u * dt; // tr
dx[(ofs + bi) * W + li] += (1.f - u) * db; // bl
dx[(ofs + bi) * W + ri] += u * db; // br
dx[(offset + ti) * W + li] += (1.f - u) * dt; // tl
dx[(offset + ti) * W + ri] += u * dt; // tr
dx[(offset + bi) * W + li] += (1.f - u) * db; // bl
dx[(offset + bi) * W + ri] += u * db; // br
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
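All of the resize kernels share the same bilinear formula: sample the four neighbours, interpolate horizontally, then vertically. For reference:

def bilinear(tl, tr, bl, br, u, v):
    """u, v are the fractional offsets from the top-left neighbour."""
    t = tl + (tr - tl) * u   # top edge
    b = bl + (br - bl) * u   # bottom edge
    return t + (b - t) * v   # blend the two rows

The gradient kernels scatter dy back onto the same four neighbours with weights (1-u)(1-v), u(1-v), (1-u)v and uv.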
......@@ -156,7 +156,7 @@ void _ResizeLinearGradNHWC(
std::array<int, 4> idx = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C};
float h_in, w_in, u, v, dt, db, tl, tr, bl, br;
int ti, bi, li, ri, ofs, h_max = H - 1, w_max = W - 1;
int ti, bi, li, ri, offset, h_max = H - 1, w_max = W - 1;
for (int i = 0; i < count; ++i) {
h_in = TransformCoordinate(idx[1], scale_h, align_corners);
w_in = TransformCoordinate(idx[2], scale_w, align_corners);
......@@ -164,13 +164,13 @@ void _ResizeLinearGradNHWC(
bi = (h_in < h_max) ? std::ceil(h_in) : h_max;
ri = (w_in < w_max) ? std::ceil(w_in) : w_max;
v = h_in - ti, u = w_in - li;
ofs = idx[0] * H;
offset = idx[0] * H;
dt = (1.f - v) * static_cast<float>(dy[i]);
db = v * static_cast<float>(dy[i]);
dx[((ofs + ti) * W + li) * C + idx[3]] += (1.f - u) * dt; // tl
dx[((ofs + ti) * W + ri) * C + idx[3]] += u * dt; // tr
dx[((ofs + bi) * W + li) * C + idx[3]] += (1.f - u) * db; // bl
dx[((ofs + bi) * W + ri) * C + idx[3]] += u * db; // br
dx[((offset + ti) * W + li) * C + idx[3]] += (1.f - u) * dt; // tl
dx[((offset + ti) * W + ri) * C + idx[3]] += u * dt; // tr
dx[((offset + bi) * W + li) * C + idx[3]] += (1.f - u) * db; // bl
dx[((offset + bi) * W + ri) * C + idx[3]] += u * db; // br
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......
......@@ -61,17 +61,17 @@ __global__ void _ResizeLinearNCHW(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int ofs = (n * C + c) * H;
const int offset = (n * C + c) * H;
#if __CUDA_ARCH__ >= 350
const float tl = __ldg(x + ((ofs + ti) * W + li));
const float tr = __ldg(x + ((ofs + ti) * W + ri));
const float bl = __ldg(x + ((ofs + bi) * W + li));
const float br = __ldg(x + ((ofs + bi) * W + ri));
const float tl = __ldg(x + ((offset + ti) * W + li));
const float tr = __ldg(x + ((offset + ti) * W + ri));
const float bl = __ldg(x + ((offset + bi) * W + li));
const float br = __ldg(x + ((offset + bi) * W + ri));
#else
const float tl = x[(ofs + ti) * W + li];
const float tr = x[(ofs + ti) * W + ri];
const float bl = x[(ofs + bi) * W + li];
const float br = x[(ofs + bi) * W + ri];
const float tl = x[(offset + ti) * W + li];
const float tr = x[(offset + ti) * W + ri];
const float bl = x[(offset + bi) * W + li];
const float br = x[(offset + bi) * W + ri];
#endif
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
......@@ -109,11 +109,11 @@ __global__ void _ResizeLinearNCHW<half>(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int ofs = (n * C + c) * H;
const float tl = __half2float(__ldg(x + ((ofs + ti) * W + li)));
const float tr = __half2float(__ldg(x + ((ofs + ti) * W + ri)));
const float bl = __half2float(__ldg(x + ((ofs + bi) * W + li)));
const float br = __half2float(__ldg(x + ((ofs + bi) * W + ri)));
const int offset = (n * C + c) * H;
const float tl = __half2float(__ldg(x + ((offset + ti) * W + li)));
const float tr = __half2float(__ldg(x + ((offset + ti) * W + ri)));
const float bl = __half2float(__ldg(x + ((offset + bi) * W + li)));
const float br = __half2float(__ldg(x + ((offset + bi) * W + ri)));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
......@@ -151,17 +151,17 @@ __global__ void _ResizeLinearNHWC(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int ofs = n * H;
const int offset = n * H;
#if __CUDA_ARCH__ >= 350
const float tl = __ldg(x + (((ofs + ti) * W + li) * C + c));
const float tr = __ldg(x + (((ofs + ti) * W + ri) * C + c));
const float bl = __ldg(x + (((ofs + bi) * W + li) * C + c));
const float br = __ldg(x + (((ofs + bi) * W + ri) * C + c));
const float tl = __ldg(x + (((offset + ti) * W + li) * C + c));
const float tr = __ldg(x + (((offset + ti) * W + ri) * C + c));
const float bl = __ldg(x + (((offset + bi) * W + li) * C + c));
const float br = __ldg(x + (((offset + bi) * W + ri) * C + c));
#else
const float tl = x[((ofs + ti) * W + li) * C + c];
const float tr = x[((ofs + ti) * W + ri) * C + c];
const float bl = x[((ofs + bi) * W + li) * C + c];
const float br = x[((ofs + bi) * W + ri) * C + c];
const float tl = x[((offset + ti) * W + li) * C + c];
const float tr = x[((offset + ti) * W + ri) * C + c];
const float bl = x[((offset + bi) * W + li) * C + c];
const float br = x[((offset + bi) * W + ri) * C + c];
#endif
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
......@@ -199,11 +199,15 @@ __global__ void _ResizeLinearNHWC<half>(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const int ofs = n * H;
const float tl = __half2float(__ldg(x + (((ofs + ti) * W + li) * C + c)));
const float tr = __half2float(__ldg(x + (((ofs + ti) * W + ri) * C + c)));
const float bl = __half2float(__ldg(x + (((ofs + bi) * W + li) * C + c)));
const float br = __half2float(__ldg(x + (((ofs + bi) * W + ri) * C + c)));
const int offset = n * H;
const float tl =
__half2float(__ldg(x + (((offset + ti) * W + li) * C + c)));
const float tr =
__half2float(__ldg(x + (((offset + ti) * W + ri) * C + c)));
const float bl =
__half2float(__ldg(x + (((offset + bi) * W + li) * C + c)));
const float br =
__half2float(__ldg(x + (((offset + bi) * W + ri) * C + c)));
const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u;
......@@ -249,11 +253,11 @@ __global__ void _ResizeLinearGradNCHW(
const float db = v * ((float)dy[yi]);
#endif
const int ofs = (n * C + c) * H;
atomicAdd(&dx[(ofs + ti) * W + li], (1.f - u) * dt);
atomicAdd(&dx[(ofs + ti) * W + ri], u * dt);
atomicAdd(&dx[(ofs + bi) * W + li], (1.f - u) * db);
atomicAdd(&dx[(ofs + bi) * W + ri], u * db);
const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
atomicAdd(&dx[(offset + ti) * W + ri], u * dt);
atomicAdd(&dx[(offset + bi) * W + li], (1.f - u) * db);
atomicAdd(&dx[(offset + bi) * W + ri], u * db);
}
}
......@@ -290,11 +294,11 @@ __global__ void _ResizeLinearGradNCHW<half>(
const float dt = (1.f - v) * __half2float(__ldg(dy + yi));
const float db = v * __half2float(__ldg(dy + yi));
const int ofs = (n * C + c) * H;
atomicAdd(&dx[(ofs + ti) * W + li], (1.f - u) * dt);
atomicAdd(&dx[(ofs + ti) * W + ri], u * dt);
atomicAdd(&dx[(ofs + bi) * W + li], (1.f - u) * db);
atomicAdd(&dx[(ofs + bi) * W + ri], u * db);
const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
atomicAdd(&dx[(offset + ti) * W + ri], u * dt);
atomicAdd(&dx[(offset + bi) * W + li], (1.f - u) * db);
atomicAdd(&dx[(offset + bi) * W + ri], u * db);
#endif
}
}
......@@ -336,11 +340,11 @@ __global__ void _ResizeLinearGradNHWC(
const float db = v * ((float)dy[yi]);
#endif
const int ofs = n * H;
atomicAdd(&dx[((ofs + ti) * W + li) * C + c], (1.f - u) * dt);
atomicAdd(&dx[((ofs + ti) * W + ri) * C + c], u * dt);
atomicAdd(&dx[((ofs + bi) * W + li) * C + c], (1.f - u) * db);
atomicAdd(&dx[((ofs + bi) * W + ri) * C + c], u * db);
const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
atomicAdd(&dx[((offset + ti) * W + ri) * C + c], u * dt);
atomicAdd(&dx[((offset + bi) * W + li) * C + c], (1.f - u) * db);
atomicAdd(&dx[((offset + bi) * W + ri) * C + c], u * db);
}
}
......
......@@ -11,7 +11,6 @@ template <typename T>
void CuDNNEluOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
CuDNNSetTensorDesc<T>(&input_desc_, X.dims());
CUDNN_CHECK(cudnnActivationForward(
ctx()->cudnn_handle(),
act_desc_,
......@@ -33,7 +32,6 @@ template <typename T>
void CuDNNEluGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
CuDNNSetTensorDesc<T>(&input_desc_, Y.dims());
CUDNN_CHECK(cudnnActivationBackward(
ctx()->cudnn_handle(),
act_desc_,
......
......@@ -7,7 +7,6 @@ template <class Context>
template <typename T>
void ReluOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
if (max_value_ > 0.f) {
kernel::ReluN(
X.count(),
......@@ -34,7 +33,6 @@ template <class Context>
template <typename T>
void ReluGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
if (max_value_ > 0.f) {
kernel::ReluNGrad(
Y.count(),
......
......@@ -9,7 +9,6 @@ template <typename T>
void CuDNNReluOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
CuDNNSetTensorDesc<T>(&input_desc_, X.dims());
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx()->cudnn_handle(),
......@@ -47,7 +46,6 @@ template <typename T>
void CuDNNReluGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
CuDNNSetTensorDesc<T>(&input_desc_, Y.dims());
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx()->cudnn_handle(),
......
......@@ -9,7 +9,6 @@ template <typename T>
void CuDNNSigmoidOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
CuDNNSetTensorDesc<T>(&input_desc_, X.dims());
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx()->cudnn_handle(),
......@@ -43,7 +42,6 @@ template <typename T>
void CuDNNSigmoidGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
CuDNNSetTensorDesc<T>(&input_desc_, Y.dims());
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx()->cudnn_handle(),
......
......@@ -9,10 +9,8 @@ template <typename T>
void CuDNNSoftmaxOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
CANONICALIZE_AXIS_WITH_TENSOR(X);
CuDNNSetTensorDesc<T>(
&input_desc_, {X.count(0, axis), X.dim(axis), X.count(axis + 1)});
CUDNN_CHECK(cudnnSoftmaxForward(
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
......@@ -35,10 +33,8 @@ template <typename T>
void CuDNNSoftmaxGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(Y);
CuDNNSetTensorDesc<T>(
&input_desc_, {Y.count(0, axis), Y.dim(axis), Y.count(axis + 1)});
CUDNN_CHECK(cudnnSoftmaxBackward(
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
......
......@@ -9,7 +9,6 @@ template <typename T>
void CuDNNTanhOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
CuDNNSetTensorDesc<T>(&input_desc_, X.dims());
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx()->cudnn_handle(),
......@@ -43,7 +42,6 @@ template <typename T>
void CuDNNTanhGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
CuDNNSetTensorDesc<T>(&input_desc_, Y.dims());
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx()->cudnn_handle(),
......
......@@ -64,9 +64,7 @@ void CastOp<Context>::RunOnDevice() {
STORE_INPUT_SPEC(0);
DISPATCH_WITH_TENSOR(Input(0));
} else {
Buffer("X[" + std::to_string(0) + "]")
->ReshapeLike(*Output(0))
->set_meta(Output(0)->meta());
Buffer("X_spec:0")->ReshapeLike(*Output(0))->set_meta(Output(0)->meta());
DISPATCH_WITH_TENSOR((*Output(0)));
};
}
......
......@@ -26,7 +26,9 @@ namespace dragon {
axes_(OpArgs<int64_t>("axes")), \
keep_dims_(OpArg<int64_t>("keep_dims", 0)) {} \
USE_OPERATOR_FUNCTIONS; \
\
void RunOnDevice() override; \
\
template <typename T> \
void DoRunWithType(); \
\
......@@ -41,7 +43,9 @@ namespace dragon {
public: \
SIMPLE_CTOR_DTOR(name##GradientOp); \
USE_OPERATOR_FUNCTIONS; \
\
void RunOnDevice() override; \
\
template <typename T> \
void DoRunWithType(); \
};
......
......@@ -15,9 +15,8 @@ void SoftmaxCrossEntropyOp<Context>::DoRunWithType() {
auto inner_dim = X.count(axis + 1);
auto num_preds = outer_dim * inner_dim;
CHECK_EQ(num_preds, Input(1).count())
CHECK_EQ(X.count(), Input(1).count())
<< "\nNumber of preds must match the number of targets.";
Buffer("prob")->ReshapeLike(X);
auto* loss = ws()->template data<T, Context>({X.count()})[0];
......
......@@ -17,6 +17,8 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
CHECK_EQ(num_preds, Input(1).count())
<< "\nNumber of preds must match the number of targets.";
auto* X_prob = Buffer("prob")->ReshapeLike(X);
auto* prob = X_prob->template mutable_data<LogitType, Context>();
auto scratches = ws()->template data<Context>({
num_preds * sizeof(LogitType), // loss
......@@ -25,10 +27,6 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<int*>(scratches[1]);
auto* prob = Buffer("prob")
->ReshapeLike(X)
->template mutable_data<LogitType, Context>();
kernel::Softmax(
outer_dim,
X.dim(axis),
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_MATH_ACCUMULATE_OP_H_
#define DRAGON_OPERATORS_MATH_ACCUMULATE_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class AccumulateOp final : public Operator<Context> {
public:
AccumulateOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
alpha_(OpArg<float>("alpha", 1.f)),
beta_(OpArg<float>("beta", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType(Tensor* X, Tensor* Y);
protected:
float alpha_, beta_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_MATH_ACCUMULATE_OP_H_
#include "dragon/core/workspace.h"
#include "dragon/operators/math/accumulate_op.h"
#include "dragon/operators/math/elementwise_ops.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void AccumulateOp<Context>::DoRunWithType(Tensor* X, Tensor* Y) {
void AxpbyOp<Context>::DoRunWithType(Tensor* X, Tensor* Y) {
CHECK_EQ(X->count(), Y->count());
auto* x = X->template data<T, Context>();
auto* y = Y->template mutable_data<T, Context>();
......@@ -26,41 +26,42 @@ void AccumulateOp<Context>::DoRunWithType(Tensor* X, Tensor* Y) {
}
template <class Context>
void AccumulateOp<Context>::RunOnDevice() {
void AxpbyOp<Context>::RunOnDevice() {
for (int i = 0; i < InputSize(); i++) {
Output(i)->ReshapeLike(Input(i));
if (XIsType(Input(i), int8_t)) {
DoRunWithType<int8_t>(&Input(i), Output(i));
} else if (XIsType(Input(i), uint8_t)) {
DoRunWithType<uint8_t>(&Input(i), Output(i));
} else if (XIsType(Input(i), int)) {
DoRunWithType<int>(&Input(i), Output(i));
} else if (XIsType(Input(i), int64_t)) {
DoRunWithType<int64_t>(&Input(i), Output(i));
} else if (XIsType(Input(i), float16)) {
DoRunWithType<float16>(&Input(i), Output(i));
} else if (XIsType(Input(i), float)) {
DoRunWithType<float>(&Input(i), Output(i));
} else if (XIsType(Input(i), double)) {
DoRunWithType<double>(&Input(i), Output(i));
auto &X = Input(i), *Y = Output(i);
Y->ReshapeLike(X);
if (XIsType(X, int8_t)) {
DoRunWithType<int8_t>(&X, Y);
} else if (XIsType(X, uint8_t)) {
DoRunWithType<uint8_t>(&X, Y);
} else if (XIsType(X, int)) {
DoRunWithType<int>(&X, Y);
} else if (XIsType(X, int64_t)) {
DoRunWithType<int64_t>(&X, Y);
} else if (XIsType(X, float16)) {
DoRunWithType<float16>(&X, Y);
} else if (XIsType(X, float)) {
DoRunWithType<float>(&X, Y);
} else if (XIsType(X, double)) {
DoRunWithType<double>(&X, Y);
} else
LOG(FATAL) << TypeString(
Input(i),
X,
{"int8", "uint8", "int32", "int64", "float16", "float32", "float64"});
}
}
DEPLOY_CPU(Accumulate);
DEPLOY_CPU(Axpby);
#ifdef USE_CUDA
DEPLOY_CUDA(Accumulate);
DEPLOY_CUDA(Axpby);
#endif
OPERATOR_SCHEMA(Accumulate)
OPERATOR_SCHEMA(Axpby)
/* X1, ... */
.NumInputs(1, INT_MAX)
/* Y1, ... */
.NumOutputs(1, INT_MAX);
NO_GRADIENT(Accumulate);
NO_GRADIENT(Axpby);
} // namespace dragon
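The renamed op keeps the accumulate semantics; per element it computes alpha * x + beta * y, with the alpha/beta defaults of 1 taken from the old AccumulateOp header above. A reference sketch (assuming the conventional BLAS-style axpby meaning):

def axpby(x, y, alpha=1.0, beta=1.0):
    """Elementwise y = alpha * x + beta * y, matching the op's argument names."""
    return [alpha * xi + beta * yi for xi, yi in zip(x, y)]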
#include "dragon/operators/math/dot_op.h"
#include "dragon/operators/math/elementwise_ops.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void DotOp<Context>::DotImpl() {
CHECK_EQ(Input(0).dim(0), Input(1).dim(0))
<< "\nTensor(" << Input(0).name() << "): " << Input(0).DimString()
<< " can not Dot with Tensor"
<< "(" << Input(1).name() << "): " << Input(1).DimString();
Output(0)->Reshape({});
auto* a = Input(0).template data<T, Context>();
auto* b = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
T yHost;
math::Dot(Input(0).count(), a, b, &yHost, ctx());
ctx()->template Copy<T, Context, CPUContext>(1, y, &yHost);
}
template <class Context>
template <typename T>
void DotOp<Context>::GemmImpl() {
K1_ = transA_ ? Input(0).dim(0) : Input(0).dim(-1);
K2_ = transB_ ? Input(1).dim(1) : Input(1).dim(0);
N_ = transB_ ? Input(1).dim(0) : Input(1).dim(1);
M_ = Input(0).count() / K1_;
CHECK_EQ(K1_, K2_) << "\nTensor(" << Input(0).name()
<< "): " << Input(0).DimString()
<< " can not Dot with Tensor"
<< "(" << Input(1).name() << "): " << Input(1).DimString();
auto out_dims = Input(0).dims();
if (!transA_) {
out_dims.pop_back();
} else {
out_dims.erase(out_dims.begin());
}
out_dims.push_back(N_);
Output(0)->Reshape(out_dims);
auto* a = Input(0).template data<T, Context>();
auto* b = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Gemm(
transA_ ? CblasTrans : CblasNoTrans,
transB_ ? CblasTrans : CblasNoTrans,
M_,
N_,
K1_,
1.f,
a,
b,
0.f,
y,
ctx());
}
template <class Context>
template <typename T>
void DotOp<Context>::GemvImpl() {
N_ = transA_ ? Input(0).dim(0) : Input(0).dim(-1);
M_ = Input(0).count() / N_;
CHECK_EQ(N_, Input(1).dim(0))
<< "\nTensor(" << Input(0).name() << "): " << Input(0).DimString()
<< " can not Dot with Tensor"
<< "(" << Input(1).name() << "): " << Input(1).DimString();
auto out_dims = Input(0).dims();
if (!transA_) {
out_dims.pop_back();
} else {
out_dims.erase(out_dims.begin());
}
Output(0)->Reshape(out_dims);
auto* a = Input(0).template data<T, Context>();
auto* b = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Gemv(
transA_ ? CblasTrans : CblasNoTrans,
transA_ ? N_ : M_,
transA_ ? M_ : N_,
1.f,
a,
b,
0.f,
y,
ctx());
}
template <class Context>
template <typename T>
void DotOp<Context>::DoRunWithType() {
if (Input(0).ndim() == 1 && Input(1).ndim() == 1) {
DotImpl<T>();
} else if (Input(0).ndim() >= 2 && Input(1).ndim() == 2) {
GemmImpl<T>();
} else if (Input(0).ndim() >= 2 && Input(1).ndim() == 1) {
GemvImpl<T>();
auto &A = Input(0), &B = Input(1), *Y = Output(0);
if (A.ndim() == 1 && B.ndim() == 1) {
// Compute vector product
CHECK_EQ(A.count(), B.count())
<< "\nShapes " << A.DimString() << " and " << B.DimString()
<< " not aligned: " << A.count() << " (dim 0) != " << B.count()
<< " (dim 0)";
math::Dot(
A.count(),
A.template data<T, Context>(),
B.template data<T, Context>(),
Y->Reshape({})->template mutable_data<T, Context>(),
ctx());
} else if (A.ndim() == 2 && B.ndim() == 2) {
// Compute matrix multiplication
CHECK_EQ(A.dim(1), B.dim(0))
<< "\nShapes " << A.DimString() << " and " << B.DimString()
<< " not aligned: " << A.dim(1) << " (dim 1) != " << B.dim(0)
<< " (dim 0)";
math::Gemm(
CblasNoTrans,
CblasNoTrans,
A.dim(0),
B.dim(1),
A.dim(1),
1.f,
A.template data<T, Context>(),
B.template data<T, Context>(),
0.f,
Y->Reshape({A.dim(0), B.dim(1)})->template mutable_data<T, Context>(),
ctx());
} else if (A.ndim() == 0 && B.ndim() == 0) {
// Compute elementwise multiplication
math::Mul(
1,
A.template data<T, Context>(),
B.template data<T, Context>(),
Y->Reshape({})->template mutable_data<T, Context>(),
ctx());
} else if (A.ndim() >= 2 && B.ndim() == 1) {
// Compute matrix-vector multiplication
CHECK_EQ(A.dim(-1), B.dim(0))
<< "\nShapes " << A.DimString() << " and " << B.DimString()
<< " not aligned: " << A.dim(-1) << " (dim -1) != " << B.dim(0)
<< " (dim 0)";
vec64_t Y_dims(A.dims().begin(), A.dims().end() - 1);
math::Gemv(
CblasNoTrans,
A.dim(0),
A.dim(-1),
1.f,
A.template data<T, Context>(),
B.template data<T, Context>(),
0.f,
Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
} else {
LOG(FATAL) << "\nTensor(" << Input(0).name()
<< "): " << Input(0).DimString() << " can not dot with Tensor"
<< "(" << Input(1).name() << "): " << Input(1).DimString();
LOG(FATAL) << "\nShapes " << A.DimString() << " and " << B.DimString()
<< " not aligned.";
}
}
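The rewritten DotOp dispatches on the ranks of A and B, mirroring numpy.dot-style rules; a sketch of the shape logic implied by the cases above (semantics only):

def dot_output_shape(a_shape, b_shape):
    a_nd, b_nd = len(a_shape), len(b_shape)
    if a_nd == 1 and b_nd == 1:      # vector product -> scalar
        assert a_shape[0] == b_shape[0]
        return ()
    if a_nd == 2 and b_nd == 2:      # matrix multiplication
        assert a_shape[1] == b_shape[0]
        return (a_shape[0], b_shape[1])
    if a_nd == 0 and b_nd == 0:      # scalar * scalar
        return ()
    if a_nd >= 2 and b_nd == 1:      # matrix-vector product
        assert a_shape[-1] == b_shape[0]
        return tuple(a_shape[:-1])
    raise ValueError('Shapes not aligned.')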
......@@ -120,164 +76,119 @@ void DotOp<Context>::RunOnDevice() {
template <class Context>
template <typename T>
void DotGradientOp<Context>::DotImpl() {
CHECK_EQ(Input(0).count(), Input(1).count())
<< "\nTensor(" << Input(0).name() << "): " << Input(0).DimString()
<< " can not Dot with Tensor"
<< "(" << Input(1).name() << "): " << Input(1).DimString();
auto* a = Input(0).template data<T, Context>();
auto* b = Input(1).template data<T, Context>();
auto* dy = Input(-1).template data<T, CPUContext>();
if (Output(0)->has_name()) {
auto* da = Output(0)->template mutable_data<T, Context>();
math::Scale(Output(0)->count(), cast::to<float>(dy[0]), b, da, ctx());
}
if (Output(1)->has_name()) {
auto* db = Output(1)->template mutable_data<T, Context>();
math::Scale(Output(0)->count(), cast::to<float>(dy[0]), a, db, ctx());
}
}
template <class Context>
template <typename T>
void DotGradientOp<Context>::GemmImpl() {
K1_ = transA_ ? Input(0).dim(0) : Input(0).dim(-1);
K2_ = transB_ ? Input(1).dim(1) : Input(1).dim(0);
N_ = transB_ ? Input(1).dim(0) : Input(1).dim(1);
M_ = Input(0).count() / K1_;
CHECK_EQ(K1_, K2_) << "\nTensor(" << Input(0).name()
<< "): " << Input(0).DimString()
<< " can not Dot with Tensor"
<< "(" << Input(1).name() << "): " << Input(1).DimString();
auto* a = Input(0).template data<T, Context>();
auto* b = Input(1).template data<T, Context>();
auto* dy = Input(-1).template data<T, Context>();
if (Output(0)->has_name()) {
auto* da = Output(0)->template mutable_data<T, Context>();
if (transA_) {
void DotGradientOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), &dY = Input(2);
auto *dA = Output(0), *dB = Output(1);
if (A.ndim() == 1 && B.ndim() == 1) {
// Gradient of vector product
if (dA->has_name()) {
math::Mul(
dY.ndim(),
dY.dims().data(),
B.ndim(),
B.dims().data(),
dY.template data<T, Context>(),
B.template data<T, Context>(),
dA->ReshapeLike(A)->template mutable_data<T, Context>(),
ctx());
}
if (dB->has_name()) {
math::Mul(
dY.ndim(),
dY.dims().data(),
A.ndim(),
A.dims().data(),
dY.template data<T, Context>(),
A.template data<T, Context>(),
dB->ReshapeLike(B)->template mutable_data<T, Context>(),
ctx());
}
} else if (A.ndim() == 2 && B.ndim() == 2) {
// Gradient of matrix multiplication
if (dA->has_name()) {
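// dA = dY * B^T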
math::Gemm(
transB_ ? CblasTrans : CblasNoTrans,
CblasNoTrans,
CblasTrans,
K1_,
M_,
N_,
A.dim(0),
A.dim(1),
B.dim(1),
1.f,
b,
dy,
dY.template data<T, Context>(),
B.template data<T, Context>(),
0.f,
da,
dA->ReshapeLike(A)->template mutable_data<T, Context>(),
ctx());
} else {
}
if (dB->has_name()) {
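// dB = A^T * dY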
math::Gemm(
CblasTrans,
CblasNoTrans,
transB_ ? CblasNoTrans : CblasTrans,
M_,
K1_,
N_,
A.dim(1),
B.dim(1),
A.dim(0),
1.f,
dy,
b,
A.template data<T, Context>(),
dY.template data<T, Context>(),
0.f,
da,
dB->ReshapeLike(B)->template mutable_data<T, Context>(),
ctx());
}
}
if (Output(1)->has_name()) {
auto* db = Output(1)->template mutable_data<T, Context>();
if (transB_) {
} else if (A.ndim() == 0 && B.ndim() == 0) {
// Gradient of elementwise multiplication
if (dA->has_name()) {
math::Mul(
1,
dY.template data<T, Context>(),
B.template data<T, Context>(),
dA->ReshapeLike(A)->template mutable_data<T, Context>(),
ctx());
}
if (dB->has_name()) {
math::Mul(
1,
dY.template data<T, Context>(),
A.template data<T, Context>(),
dB->ReshapeLike(B)->template mutable_data<T, Context>(),
ctx());
}
} else if (A.ndim() >= 2 && B.ndim() == 1) {
// Gradient of matrix-vector multiplication
if (dA->has_name()) {
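// dA is the outer product: dY (M x 1) * B (1 x N)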
math::Gemm(
CblasTrans,
transA_ ? CblasTrans : CblasNoTrans,
N_,
K1_,
M_,
CblasNoTrans,
CblasNoTrans,
A.dim(0),
A.dim(1),
1,
1.f,
dy,
a,
dY.template data<T, Context>(),
B.template data<T, Context>(),
0.f,
db,
dA->ReshapeLike(A)->template mutable_data<T, Context>(),
ctx());
} else {
math::Gemm(
transA_ ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K1_,
N_,
M_,
}
if (dB->has_name()) {
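// dB = A^T * dY, computed as a transposed GEMV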
math::Gemv(
CblasTrans,
A.dim(0),
A.dim(1),
1.f,
a,
dy,
A.template data<T, Context>(),
dY.template data<T, Context>(),
0.f,
db,
dB->ReshapeLike(B)->template mutable_data<T, Context>(),
ctx());
}
}
}
template <class Context>
template <typename T>
void DotGradientOp<Context>::GemvImpl() {
N_ = transA_ ? Input(0).dim(0) : Input(0).dim(-1);
M_ = Input(0).count() / N_;
CHECK_EQ(N_, Input(1).dim(0))
<< "\nTensor(" << Input(0).name() << "): " << Input(0).DimString()
<< " can not Dot with Tensor"
<< "(" << Input(1).name() << "): " << Input(1).DimString();
auto* a = Input(0).template data<T, Context>();
auto* b = Input(1).template data<T, Context>();
auto* dy = Input(-1).template data<T, Context>();
if (Output(0)->has_name()) {
auto* da = Output(0)->template mutable_data<T, Context>();
math::Gemm(
CblasNoTrans, CblasNoTrans, M_, N_, 1, 1.f, dy, b, 0.f, da, ctx());
}
if (Output(1)->has_name()) {
auto* db = Output(1)->template mutable_data<T, Context>();
math::Gemv(
transA_ ? CblasNoTrans : CblasTrans,
transA_ ? N_ : M_,
transA_ ? M_ : N_,
1.f,
a,
dy,
0.f,
db,
ctx());
}
}
template <class Context>
template <typename T>
void DotGradientOp<Context>::DoRunWithType() {
if (Input(0).ndim() == 1 && Input(1).ndim() == 1) {
DotImpl<T>();
} else if (Input(0).ndim() >= 2 && Input(1).ndim() == 2) {
GemmImpl<T>();
} else if (Input(0).ndim() >= 2 && Input(1).ndim() == 1) {
GemvImpl<T>();
} else {
LOG(FATAL) << "\nTensor(" << Input(0).name()
<< "): " << Input(0).DimString() << " can not Dot with Tensor"
<< "(" << Input(1).name() << "): " << Input(1).DimString();
LOG(FATAL) << "\nShapes " << A.DimString() << " and " << B.DimString()
<< " not aligned.";
}
}
template <class Context>
void DotGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
Output(1)->ReshapeLike(Input(1));
DispatchHelper<FloatingTensorTypes>::Call(this, Input(-1));
DispatchHelper<FloatingTensorTypes>::Call(this, Input(2));
}
DEPLOY_CPU(Dot);
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_MATH_DOT_OP_H_
#define DRAGON_OPERATORS_MATH_DOT_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class DotOp final : public Operator<Context> {
public:
DotOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
transA_(OpArg<bool>("transA", false)),
transB_(OpArg<bool>("transB", false)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DotImpl();
template <typename T>
void GemmImpl();
template <typename T>
void GemvImpl();
template <typename T>
void DoRunWithType();
protected:
int64_t transA_, transB_;
int64_t M_, K1_, K2_, N_;
int64_t M1_, N1_, M2_, N2_;
};
template <class Context>
class DotGradientOp final : public Operator<Context> {
public:
DotGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
transA_(OpArg<bool>("transA", false)),
transB_(OpArg<bool>("transB", false)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DotImpl();
template <typename T>
void GemmImpl();
template <typename T>
void GemvImpl();
template <typename T>
void DoRunWithType();
protected:
int64_t transA_, transB_;
int64_t M_, K1_, K2_, N_;
int64_t M1_, N1_, M2_, N2_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_MATH_DOT_OP_H_
......@@ -18,7 +18,7 @@
namespace dragon {
#define DECLARE_SIMPLE_UNARY_OP(name) \
#define DECLARE_ELEMENTWISE_OP(name) \
template <class Context> \
class name##Op final : public Operator<Context> { \
public: \
......@@ -31,18 +31,23 @@ namespace dragon {
void DoRunWithType(); \
};
#define DECLARE_SIMPLE_BINARY_OP(name) \
template <class Context> \
class name##Op final : public Operator<Context> { \
public: \
SIMPLE_CTOR_DTOR(name##Op); \
USE_OPERATOR_FUNCTIONS; \
\
void RunOnDevice() override; \
\
template <typename T> \
void DoRunWithType(); \
};
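// Axpby computes Y = alpha * X + beta * Y, following the BLAS-style naming.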
template <class Context>
class AxpbyOp final : public Operator<Context> {
public:
AxpbyOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
alpha_(OpArg<float>("alpha", 1.f)),
beta_(OpArg<float>("beta", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType(Tensor* X, Tensor* Y);
protected:
float alpha_, beta_;
};
inline vec32_t CheckOutputAliases(
const Tensor& A,
......@@ -64,87 +69,61 @@ inline vec32_t CheckOutputAliases(
return available_aliases;
}
inline void IsBroadcast(
const Tensor& A,
const Tensor& B,
int& rows,
int& cols,
int& kind,
Tensor* Y = nullptr) {
kind = -2;
if (A.count() == B.count()) {
if (Y != nullptr) Y->ReshapeLike(A);
kind = -1;
} else if (B.count() < A.count()) {
if (Y != nullptr) Y->ReshapeLike(A);
if (utils::math::IsRowwiseBroadcast(A.dims(), B.dims(), &rows, &cols)) {
kind = 0;
} else if (utils::math::IsColwiseBroadcast(
A.dims(), B.dims(), &rows, &cols)) {
kind = 1;
}
} else {
if (Y != nullptr) Y->ReshapeLike(B);
if (utils::math::IsRowwiseBroadcast(A.dims(), B.dims(), &rows, &cols)) {
kind = 2;
} else if (utils::math::IsColwiseBroadcast(
A.dims(), B.dims(), &rows, &cols)) {
kind = 3;
}
}
}
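// 'kind' encodes the broadcasting pattern: -1 for equal counts (plain
// elementwise), 0/1 for row-/col-wise broadcasting of B onto A, 2/3 for
// row-/col-wise broadcasting of A onto B, and -2 when no pattern matches.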
// Unary ElementwiseOp
DECLARE_ELEMENTWISE_OP(Abs);
DECLARE_ELEMENTWISE_OP(Ceil);
DECLARE_ELEMENTWISE_OP(Cos);
DECLARE_ELEMENTWISE_OP(Exp);
DECLARE_ELEMENTWISE_OP(Floor);
DECLARE_ELEMENTWISE_OP(IsInf);
DECLARE_ELEMENTWISE_OP(IsNaN);
DECLARE_ELEMENTWISE_OP(Log);
DECLARE_ELEMENTWISE_OP(Neg);
DECLARE_ELEMENTWISE_OP(Invert);
DECLARE_ELEMENTWISE_OP(Reciprocal);
DECLARE_ELEMENTWISE_OP(Round);
DECLARE_ELEMENTWISE_OP(Rsqrt);
DECLARE_ELEMENTWISE_OP(Sign);
DECLARE_ELEMENTWISE_OP(Sin);
DECLARE_ELEMENTWISE_OP(Sqrt);
DECLARE_ELEMENTWISE_OP(Square);
DECLARE_ELEMENTWISE_OP(AbsGradient);
DECLARE_ELEMENTWISE_OP(CosGradient);
DECLARE_ELEMENTWISE_OP(ExpGradient);
DECLARE_ELEMENTWISE_OP(LogGradient);
DECLARE_ELEMENTWISE_OP(NegGradient);
DECLARE_ELEMENTWISE_OP(ReciprocalGradient);
DECLARE_ELEMENTWISE_OP(RsqrtGradient);
DECLARE_ELEMENTWISE_OP(SignGradient);
DECLARE_ELEMENTWISE_OP(SinGradient);
DECLARE_ELEMENTWISE_OP(SqrtGradient);
DECLARE_ELEMENTWISE_OP(SquareGradient);
DECLARE_SIMPLE_UNARY_OP(Abs);
DECLARE_SIMPLE_UNARY_OP(Ceil);
DECLARE_SIMPLE_UNARY_OP(Cos);
DECLARE_SIMPLE_UNARY_OP(Exp);
DECLARE_SIMPLE_UNARY_OP(Floor);
DECLARE_SIMPLE_UNARY_OP(IsInf);
DECLARE_SIMPLE_UNARY_OP(IsNaN);
DECLARE_SIMPLE_UNARY_OP(Log);
DECLARE_SIMPLE_UNARY_OP(Neg);
DECLARE_SIMPLE_UNARY_OP(Invert);
DECLARE_SIMPLE_UNARY_OP(Reciprocal);
DECLARE_SIMPLE_UNARY_OP(Round);
DECLARE_SIMPLE_UNARY_OP(Rsqrt);
DECLARE_SIMPLE_UNARY_OP(Sign);
DECLARE_SIMPLE_UNARY_OP(Sin);
DECLARE_SIMPLE_UNARY_OP(Sqrt);
DECLARE_SIMPLE_UNARY_OP(Square);
DECLARE_SIMPLE_UNARY_OP(AbsGradient);
DECLARE_SIMPLE_UNARY_OP(CosGradient);
DECLARE_SIMPLE_UNARY_OP(ExpGradient);
DECLARE_SIMPLE_UNARY_OP(LogGradient);
DECLARE_SIMPLE_UNARY_OP(NegGradient);
DECLARE_SIMPLE_UNARY_OP(ReciprocalGradient);
DECLARE_SIMPLE_UNARY_OP(RsqrtGradient);
DECLARE_SIMPLE_UNARY_OP(SignGradient);
DECLARE_SIMPLE_UNARY_OP(SinGradient);
DECLARE_SIMPLE_UNARY_OP(SqrtGradient);
DECLARE_SIMPLE_UNARY_OP(SquareGradient);
#undef DECLARE_SIMPLE_UNARY_OP
// Binary ElementwiseOp
DECLARE_ELEMENTWISE_OP(Add);
DECLARE_ELEMENTWISE_OP(Sub);
DECLARE_ELEMENTWISE_OP(Mul);
DECLARE_ELEMENTWISE_OP(Div);
DECLARE_ELEMENTWISE_OP(Pow);
DECLARE_ELEMENTWISE_OP(Dot);
DECLARE_ELEMENTWISE_OP(Minimum);
DECLARE_ELEMENTWISE_OP(Maximum);
DECLARE_ELEMENTWISE_OP(Equal);
DECLARE_ELEMENTWISE_OP(NotEqual);
DECLARE_ELEMENTWISE_OP(Less);
DECLARE_ELEMENTWISE_OP(LessEqual);
DECLARE_ELEMENTWISE_OP(Greater);
DECLARE_ELEMENTWISE_OP(GreaterEqual);
DECLARE_ELEMENTWISE_OP(AddGradient);
DECLARE_ELEMENTWISE_OP(SubGradient);
DECLARE_ELEMENTWISE_OP(MulGradient);
DECLARE_ELEMENTWISE_OP(DivGradient);
DECLARE_ELEMENTWISE_OP(PowGradient);
DECLARE_ELEMENTWISE_OP(DotGradient);
DECLARE_ELEMENTWISE_OP(MinimumGradient);
DECLARE_ELEMENTWISE_OP(MaximumGradient);
DECLARE_SIMPLE_BINARY_OP(Add);
DECLARE_SIMPLE_BINARY_OP(Sub);
DECLARE_SIMPLE_BINARY_OP(Mul);
DECLARE_SIMPLE_BINARY_OP(Div);
DECLARE_SIMPLE_BINARY_OP(Pow);
DECLARE_SIMPLE_BINARY_OP(Minimum);
DECLARE_SIMPLE_BINARY_OP(Maximum);
DECLARE_SIMPLE_BINARY_OP(Equal);
DECLARE_SIMPLE_BINARY_OP(NotEqual);
DECLARE_SIMPLE_BINARY_OP(Less);
DECLARE_SIMPLE_BINARY_OP(LessEqual);
DECLARE_SIMPLE_BINARY_OP(Greater);
DECLARE_SIMPLE_BINARY_OP(GreaterEqual);
DECLARE_SIMPLE_BINARY_OP(AddGradient);
DECLARE_SIMPLE_BINARY_OP(SubGradient);
DECLARE_SIMPLE_BINARY_OP(MulGradient);
DECLARE_SIMPLE_BINARY_OP(DivGradient);
DECLARE_SIMPLE_BINARY_OP(PowGradient);
DECLARE_SIMPLE_BINARY_OP(MinimumGradient);
DECLARE_SIMPLE_BINARY_OP(MaximumGradient);
#undef DECLARE_SIMPLE_BINARY_OP
#undef DECLARE_ELEMENTWISE_OP
} // namespace dragon
......
......@@ -13,14 +13,14 @@ void FullyConnectedOp<Context>::DoRunWithType() {
// Determine the number of output channels
int64_t M = X.count(0, axis), K = X.count(axis), N;
if (num_output_ <= 0) {
if (out_channels_ <= 0) {
// Infer the "N" from the weights shape
N = W.count() / K;
CHECK_GT(N, 0) << "\nFailed to infer the N from "
<< "the weights shape: " << W.DimString();
} else {
// Use a fixed "N" from the argument
N = num_output_;
N = out_channels_;
}
vec64_t Y_dims(axis + 1);
......@@ -82,14 +82,14 @@ void FullyConnectedGradientOp<Context>::DoRunWithType() {
// Determine the number of output channels
int64_t M = X.count(0, axis), K = X.count(axis), N;
if (num_output_ <= 0) {
if (out_channels_ <= 0) {
// Infer the "N" from the weights shape
N = W.count() / K;
CHECK_GT(N, 0) << "\nFailed to infer the N from "
<< "the weights shape: " << W.DimString();
} else {
// Use a fixed "N" from the argument
N = num_output_;
N = out_channels_;
}
if (dX->has_name()) {
......
......@@ -22,7 +22,7 @@ class FullyConnectedOp final : public Operator<Context> {
public:
FullyConnectedOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
num_output_(OpArg<int64_t>("num_output", 0)),
out_channels_(OpArg<int64_t>("out_channels", 0)),
transW_(OpArg<int64_t>("transW", 1)) {}
USE_OPERATOR_FUNCTIONS;
......@@ -32,7 +32,7 @@ class FullyConnectedOp final : public Operator<Context> {
void DoRunWithType();
protected:
int64_t num_output_, transW_;
int64_t out_channels_, transW_;
};
template <class Context>
......@@ -40,7 +40,7 @@ class FullyConnectedGradientOp final : public Operator<Context> {
public:
FullyConnectedGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
num_output_(OpArg<int64_t>("num_output", 0)),
out_channels_(OpArg<int64_t>("out_channels", 0)),
transW_(OpArg<int64_t>("transW", 1)) {}
USE_OPERATOR_FUNCTIONS;
......@@ -50,7 +50,7 @@ class FullyConnectedGradientOp final : public Operator<Context> {
void DoRunWithType();
protected:
int64_t num_output_, transW_;
int64_t out_channels_, transW_;
};
} // namespace dragon
......
......@@ -5,7 +5,7 @@ namespace dragon {
template <class Context>
template <typename T>
void MatmulOp<Context>::DoRunWithType() {
void MatMulOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), *Y = Output(0);
CHECK_GE(A.ndim(), 2) << "\nTensor(" << A.name() + ") must be a matrix"
......@@ -51,13 +51,13 @@ void MatmulOp<Context>::DoRunWithType() {
}
template <class Context>
void MatmulOp<Context>::RunOnDevice() {
void MatMulOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void MatmulGradientOp<Context>::DoRunWithType() {
void MatMulGradientOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), &dY = Input(2);
auto *dA = Output(0), *dB = Output(1);
......@@ -154,32 +154,32 @@ void MatmulGradientOp<Context>::DoRunWithType() {
}
template <class Context>
void MatmulGradientOp<Context>::RunOnDevice() {
void MatMulGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU(Matmul);
DEPLOY_CPU(MatMul);
#ifdef USE_CUDA
DEPLOY_CUDA(Matmul);
DEPLOY_CUDA(MatMul);
#endif
DEPLOY_CPU(MatmulGradient);
DEPLOY_CPU(MatMulGradient);
#ifdef USE_CUDA
DEPLOY_CUDA(MatmulGradient);
DEPLOY_CUDA(MatMulGradient);
#endif
OPERATOR_SCHEMA(Matmul)
OPERATOR_SCHEMA(MatMul)
/* A, B */
.NumInputs(2)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(MatmulGradient)
OPERATOR_SCHEMA(MatMulGradient)
/* A, B, dY */
.NumInputs(3)
/* dA, dB */
.NumOutputs(2);
REGISTER_GRADIENT(Matmul, GenericGradientMaker);
REGISTER_GRADIENT(MatMul, GenericGradientMaker);
} // namespace dragon
......@@ -18,9 +18,9 @@
namespace dragon {
template <class Context>
class MatmulOp final : public Operator<Context> {
class MatMulOp final : public Operator<Context> {
public:
MatmulOp(const OperatorDef& def, Workspace* ws)
MatMulOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
transA_(OpArg<int64_t>("transA", 0)),
transB_(OpArg<int64_t>("transB", 0)) {}
......@@ -36,9 +36,9 @@ class MatmulOp final : public Operator<Context> {
};
template <class Context>
class MatmulGradientOp final : public Operator<Context> {
class MatMulGradientOp final : public Operator<Context> {
public:
MatmulGradientOp(const OperatorDef& def, Workspace* ws)
MatMulGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
transA_(OpArg<int64_t>("transA", 0)),
transB_(OpArg<int64_t>("transB", 0)) {}
......
......@@ -94,8 +94,8 @@ void GroupNormGradientOp<Context>::DoRunWithType() {
template <class Context>
void GroupNormGradientOp<Context>::RunOnDevice() {
DetermineBaseArguments();
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) {
DoRunWithType<float, float>();
} else if (XIsType(Input(0), float16)) {
......
......@@ -42,9 +42,6 @@ class GroupNormOpBase : public Operator<Context> {
// Check the channels and groups
CHECK_EQ(C_ % G_, 0) << "\nThe " << C_ << " channels "
<< "can not be split into " << G_ << " groups.";
if (G_ == C_ && X.ndim() == 2) {
LOG(WARNING) << "The 2d input will output all zeros.";
}
}
protected:
......
#include "dragon/operators/training/adam_update_op.h"
#include "dragon/core/workspace.h"
#include "dragon/operators/training/update_ops.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
void AdamUpdateOp<Context>::Compute(Tensor* dX) {
auto* m = ws()->CreateTensor("/mnt/" + slot() + "/m")
->ReshapeLike(*dX)
->template mutable_data<float, Context>();
auto* v = ws()->CreateTensor("/mnt/" + slot() + "/v")
->ReshapeLike(*dX)
->template mutable_data<float, Context>();
void AdamUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
t_++;
auto beta1 = param("beta1");
auto beta2 = param("beta2");
auto beta1 = Parameter("beta1"), beta2 = Parameter("beta2");
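// Fold both Adam bias corrections into a single scale on the learning rate:
// lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)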
auto coef = sqrt(1.f - pow(beta2, t_)) / (1.f - pow(beta1, t_));
kernel::AdamUpdate(
dX->count(),
param("base_lr") * coef * lr_mult(),
Parameter("base_lr") * coef * this->lr_mult_,
beta1,
beta2,
param("eps"),
Parameter("eps"),
dX->template mutable_data<float, Context>(),
m,
v,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
}
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_TRAINING_ADAM_UPDATE_OP_H_
#define DRAGON_OPERATORS_TRAINING_ADAM_UPDATE_OP_H_
#include "dragon/operators/training/update_op_base.h"
namespace dragon {
template <class Context>
class AdamUpdateOp final : public UpdateOpBase<Context> {
public:
AdamUpdateOp(const OperatorDef& def, Workspace* ws)
: UpdateOpBase<Context>(def, ws), t_(0) {}
USE_OPERATOR_FUNCTIONS;
USE_PARAM_UPDATE_FUNCTIONS;
void Compute(Tensor* dX) override;
protected:
int t_;
// float lr_, beta1_, beta2_, eps_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_ADAM_UPDATE_OP_H_
#include "dragon/operators/training/nesterov_update_op.h"
#include "dragon/core/workspace.h"
#include "dragon/operators/training/update_ops.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
void NesterovUpdateOp<Context>::Compute(Tensor* dX) {
auto* m = ws()->CreateTensor("/mnt/" + slot() + "/m")
->ReshapeLike(*dX)
->template mutable_data<float, Context>();
void NesterovUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
kernel::NesterovUpdate(
dX->count(),
param("base_lr") * lr_mult(),
param("momentum"),
Parameter("base_lr") * this->lr_mult_,
Parameter("momentum"),
dX->template mutable_data<float, Context>(),
m,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
}
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_TRAINING_NESTEROV_UPDATE_OP_H_
#define DRAGON_OPERATORS_TRAINING_NESTEROV_UPDATE_OP_H_
#include "dragon/operators/training/update_op_base.h"
namespace dragon {
template <class Context>
class NesterovUpdateOp final : public UpdateOpBase<Context> {
public:
NesterovUpdateOp(const OperatorDef& def, Workspace* ws)
: UpdateOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_PARAM_UPDATE_FUNCTIONS;
void Compute(Tensor* dX) override;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_NESTEROV_UPDATE_OP_H_
#include "dragon/operators/training/rmsprop_update_op.h"
#include "dragon/core/workspace.h"
#include "dragon/operators/training/update_ops.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
void RMSPropUpdateOp<Context>::Compute(Tensor* dX) {
auto* m = ws()->CreateTensor("/mnt/" + slot() + "/m")
->ReshapeLike(*dX)
->template mutable_data<float, Context>();
auto* v = ws()->CreateTensor("/mnt/" + slot() + "/v")
->ReshapeLike(*dX)
->template mutable_data<float, Context>();
void RMSpropUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
kernel::RMSPropUpdate(
dX->count(),
param("base_lr") * lr_mult(),
param("momentum"),
param("decay"),
param("eps"),
Parameter("base_lr") * this->lr_mult_,
Parameter("momentum"),
Parameter("decay"),
Parameter("eps"),
dX->template mutable_data<float, Context>(),
m,
v,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
}
DEPLOY_CPU(RMSPropUpdate);
DEPLOY_CPU(RMSpropUpdate);
#ifdef USE_CUDA
DEPLOY_CUDA(RMSPropUpdate);
DEPLOY_CUDA(RMSpropUpdate);
#endif
OPERATOR_SCHEMA(RMSPropUpdate)
OPERATOR_SCHEMA(RMSpropUpdate)
/* dX */
.NumInputs(1)
/* X */
.NumOutputs(1);
NO_GRADIENT(RMSPropUpdate);
NO_GRADIENT(RMSpropUpdate);
} // namespace dragon
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_TRAINING_RMSPROP_UPDATE_OP_H_
#define DRAGON_OPERATORS_TRAINING_RMSPROP_UPDATE_OP_H_
#include "dragon/operators/training/update_op_base.h"
namespace dragon {
template <class Context>
class RMSPropUpdateOp final : public UpdateOpBase<Context> {
public:
RMSPropUpdateOp(const OperatorDef& def, Workspace* ws)
: UpdateOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_PARAM_UPDATE_FUNCTIONS;
void Compute(Tensor* dX) override;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_RMSPROP_UPDATE_OP_H_
#include "dragon/operators/training/sgd_update_op.h"
#include "dragon/core/workspace.h"
#include "dragon/operators/training/update_ops.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
void SGDUpdateOp<Context>::Compute(Tensor* dX) {
auto* m = ws()->CreateTensor("/mnt/" + slot() + "/m")
->ReshapeLike(*dX)
->template mutable_data<float, Context>();
void SGDUpdateOp<Context>::ComputeUpdate(Tensor* dX) {
// Momentum Correction, See arXiv:1706.02677
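// When the learning rate changes, the momentum term is scaled by
// lr_t / lr_{t-1} so the running buffer stays consistent with the new step.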
auto lr = param("base_lr") * lr_mult();
auto lr = Parameter("base_lr") * this->lr_mult_;
if (last_lr_ > 0) correction_ = lr / last_lr_;
last_lr_ = lr; // Record the last value
kernel::SGDUpdate(
dX->count(),
lr,
param("momentum") * correction_,
Parameter("momentum") * correction_,
dX->template mutable_data<float, Context>(),
m,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
}
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_TRAINING_SGD_UPDATE_OP_H_
#define DRAGON_OPERATORS_TRAINING_SGD_UPDATE_OP_H_
#include "dragon/operators/training/update_op_base.h"
namespace dragon {
template <class Context>
class SGDUpdateOp final : public UpdateOpBase<Context> {
public:
SGDUpdateOp(const OperatorDef& def, Workspace* ws)
: UpdateOpBase<Context>(def, ws), last_lr_(-1.f), correction_(1.f) {}
USE_OPERATOR_FUNCTIONS;
USE_PARAM_UPDATE_FUNCTIONS;
void Compute(Tensor* dX) override;
protected:
float last_lr_, correction_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_SGD_UPDATE_OP_H_
#include "dragon/operators/training/update_op_base.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/cast.h"
#include "dragon/operators/training/update_ops.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
float UpdateOpBase<Context>::param(const string& name) const {
return ws()
->GetTensor(slot_ + "/" + name)
->template mutable_data<float, CPUContext>()[0];
Tensor* UpdateOpBase<Context>::Slot(const string& name) {
return Buffer(Output(0)->name() + "/" + name);
}
template <class Context>
float UpdateOpBase<Context>::Parameter(const string& name) const {
auto* P = ws()->GetTensor("/share/hyper/" + handle() + "/" + name);
return P->template mutable_data<float, CPUContext>()[0];
}
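// Slot() returns a per-parameter state tensor (e.g. a momentum buffer) keyed
// by the output name; Parameter() reads a scalar hyper-parameter shared
// through the workspace.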
template <class Context>
template <typename T>
void UpdateOpBase<Context>::Process(Tensor* dX, Tensor* X) {
void UpdateOpBase<Context>::AdjustGradient(Tensor* dX, Tensor* X) {
// Scale
auto scale_factor = param("scale_gradient");
if (scale_factor != 1.f) {
auto scale = Parameter("scale");
if (scale != 1.f) {
auto* dx = dX->template mutable_data<T, Context>();
math::Scale(dX->count(), scale_factor, dx, dx, ctx());
math::Scale(dX->count(), scale, dx, dx, ctx());
}
// Clip
auto clip_thresh = param("clip_gradient");
if (clip_thresh > 0.f) {
T sumsq_grad;
auto clip_norm = Parameter("clip_norm");
if (clip_norm > 0.f) {
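// Clip by the global L2 norm: if ||dX|| exceeds clip_norm,
// rescale dX by clip_norm / ||dX||.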
auto* dx = dX->template mutable_data<T, Context>();
math::Dot(dX->count(), dx, dx, &sumsq_grad, ctx());
auto l2_norm = sqrt(cast::to<float>(sumsq_grad));
if (l2_norm > clip_thresh) {
math::Scale(dX->count(), clip_thresh / l2_norm, dx, dx, ctx());
auto grad_norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (grad_norm > clip_norm) {
math::Scale(dX->count(), clip_norm / grad_norm, dx, dx, ctx());
}
}
// L2 Decay
auto l2_decay = param("l2_decay") * decay_mult_;
if (l2_decay > 0) {
// Penalty
auto weight_decay = Parameter("weight_decay");
if (weight_decay > 0.f) {
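// Apply the L2 penalty: dX += weight_decay * decay_mult * X.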
if (XIsType((*X), float16)) {
kernel::MixedPrecL2Decay(
kernel::MixedPrecL2Penalty(
X->count(),
l2_decay,
weight_decay * decay_mult_,
X->template data<float16, Context>(),
dX->template mutable_data<float, Context>(),
ctx());
} else {
math::Axpy(
X->count(),
l2_decay,
weight_decay * decay_mult_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
......@@ -56,7 +57,7 @@ void UpdateOpBase<Context>::Process(Tensor* dX, Tensor* X) {
template <class Context>
template <typename T>
void UpdateOpBase<Context>::Apply(Tensor* dX, Tensor* X) {
void UpdateOpBase<Context>::ApplyUpdate(Tensor* dX, Tensor* X) {
if (XIsType((*X), float16)) {
kernel::MixedPrecUpdate(
X->count(),
......@@ -64,9 +65,9 @@ void UpdateOpBase<Context>::Apply(Tensor* dX, Tensor* X) {
X->template mutable_data<float16, Context>(),
ctx());
} else {
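// Apply the update in place: X <- X - dX.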
math::Axpy(
math::Sub(
X->count(),
-1.f,
X->template data<T, Context>(),
dX->template data<T, Context>(),
X->template mutable_data<T, Context>(),
ctx());
......@@ -85,19 +86,19 @@ void UpdateOpBase<Context>::RunOnDevice() {
<< "\nGot" << X->DimString() << " and " << dX.DimString();
if (XIsType(dX, float)) {
Process<float>(&dX, X);
Compute(&dX);
Apply<float>(&dX, X);
AdjustGradient<float>(&dX, X);
ComputeUpdate(&dX);
ApplyUpdate<float>(&dX, X);
} else if (XIsType(dX, float16)) {
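// float16 gradients are cast to float32 first; the update pipeline then
// runs in float32 while the parameter itself stays in float16.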
auto* dX_fp32 = ws()->CreateTensor(dX.name() + "/fp32");
auto* dX_cast = ws()->CreateTensor(dX.name() + "[float32]");
kernel::Cast(
dX.count(),
dX.template data<float16, Context>(),
dX_fp32->ReshapeLike(dX)->template mutable_data<float, Context>(),
dX_cast->ReshapeLike(dX)->template mutable_data<float, Context>(),
ctx());
Process<float>(dX_fp32, X);
Compute(dX_fp32);
Apply<float>(dX_fp32, X);
AdjustGradient<float>(dX_cast, X);
ComputeUpdate(dX_cast);
ApplyUpdate<float>(dX_cast, X);
} else {
LOG(FATAL) << TypeString(dX, {"float16", "float32"});
}
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_TRAINING_UPDATE_OP_BASE_H_
#define DRAGON_OPERATORS_TRAINING_UPDATE_OP_BASE_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class UpdateOpBase : public Operator<Context> {
public:
UpdateOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
lr_mult_(OpArg<float>("lr_mult", 1.f)),
decay_mult_(OpArg<float>("decay_mult", 1.f)),
slot_(OpArg<string>("slot", "")) {
CHECK(!slot_.empty()) << "\nRequired a non-empty slot";
}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
virtual void Compute(Tensor* dX) = 0;
template <typename T>
void Process(Tensor* dX, Tensor* X);
template <typename T>
void Apply(Tensor* dX, Tensor* X);
string slot() {
return slot_ + "/" + Output(0)->name();
}
float param(const string& name) const;
float lr_mult() const {
return lr_mult_;
}
protected:
string slot_;
float lr_mult_, decay_mult_;
};
#define USE_PARAM_UPDATE_FUNCTIONS \
using UpdateOpBase<Context>::slot; \
using UpdateOpBase<Context>::param; \
using UpdateOpBase<Context>::lr_mult
} // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_UPDATE_OP_BASE_H_
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_TRAINING_UPDATE_OPS_H_
#define DRAGON_OPERATORS_TRAINING_UPDATE_OPS_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class UpdateOpBase : public Operator<Context> {
public:
UpdateOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
lr_mult_(OpArg<float>("lr_mult", 1.f)),
decay_mult_(OpArg<float>("decay_mult", 1.f)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
virtual void ComputeUpdate(Tensor* dX) = 0;
template <typename T>
void AdjustGradient(Tensor* dX, Tensor* X);
template <typename T>
void ApplyUpdate(Tensor* dX, Tensor* X);
Tensor* Slot(const string& name);
float Parameter(const string& name) const;
protected:
float lr_mult_, decay_mult_;
};
#define USE_PARAM_UPDATE_FUNCTIONS \
using UpdateOpBase<Context>::Slot; \
using UpdateOpBase<Context>::Parameter
template <class Context>
class SGDUpdateOp final : public UpdateOpBase<Context> {
public:
SGDUpdateOp(const OperatorDef& def, Workspace* ws)
: UpdateOpBase<Context>(def, ws), last_lr_(-1.f), correction_(1.f) {}
USE_OPERATOR_FUNCTIONS;
USE_PARAM_UPDATE_FUNCTIONS;
void ComputeUpdate(Tensor* dX) override;
protected:
float last_lr_, correction_;
};
template <class Context>
class NesterovUpdateOp final : public UpdateOpBase<Context> {
public:
NesterovUpdateOp(const OperatorDef& def, Workspace* ws)
: UpdateOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_PARAM_UPDATE_FUNCTIONS;
void ComputeUpdate(Tensor* dX) override;
};
template <class Context>
class RMSpropUpdateOp final : public UpdateOpBase<Context> {
public:
RMSpropUpdateOp(const OperatorDef& def, Workspace* ws)
: UpdateOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_PARAM_UPDATE_FUNCTIONS;
void ComputeUpdate(Tensor* dX) override;
};
template <class Context>
class AdamUpdateOp final : public UpdateOpBase<Context> {
public:
AdamUpdateOp(const OperatorDef& def, Workspace* ws)
: UpdateOpBase<Context>(def, ws), t_(0) {}
USE_OPERATOR_FUNCTIONS;
USE_PARAM_UPDATE_FUNCTIONS;
void ComputeUpdate(Tensor* dX) override;
protected:
int t_;
};
#undef USE_PARAM_UPDATE_FUNCTIONS
} // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_UPDATE_OPS_H_
......@@ -59,9 +59,9 @@ void BiasAddGradientOp<Context>::DoRunWithType() {
dB->Reshape({dY.dim(-1)});
}
math::ReduceSum(
3,
dims.size(),
dims.data(),
2,
axes.size(),
axes.data(),
1.f,
dY.template data<T, Context>(),
......
......@@ -16,7 +16,7 @@ void Conv2dOp<Context>::DoRunWithType() {
auto* y = Y->template mutable_data<T, Context>();
for (int i = 0; i < X.dim(0); ++i) {
Wx(x + i * x_ofs_, w, y + i * y_ofs_);
Wx(x + i * x_offset_, w, y + i * y_offset_);
}
if (HasBias()) {
......@@ -46,7 +46,7 @@ void Conv2dGradientOp<Context>::DoRunWithType() {
auto* w = W.template data<T, Context>();
auto* dx = dX->template mutable_data<T, Context>();
for (int i = 0; i < X.dim(0); ++i) {
Dx(dy + i * y_ofs_, w, dx + i * x_ofs_);
Dx(dy + i * y_offset_, w, dx + i * x_offset_);
}
}
......@@ -55,7 +55,7 @@ void Conv2dGradientOp<Context>::DoRunWithType() {
auto* x = X.template data<T, Context>();
auto* dw = dW->template mutable_data<T, Context>();
for (int i = 0; i < X.dim(0); ++i) {
Dw(dy + i * y_ofs_, x + i * x_ofs_, dw, i > 0);
Dw(dy + i * y_offset_, x + i * x_offset_, dw, i > 0);
}
}
......
......@@ -73,8 +73,8 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
filter_desc_,
CuDNNType<T>::type,
format_,
num_output_ / cudnn_group_,
channels_ / group_,
out_channels_ / cudnn_group_,
in_channels_ / group_,
kshape_[0],
kshape_[1]));
#else
......@@ -82,14 +82,15 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
filter_desc_,
CuDNNType<T>::type,
format_,
num_output_ / cudnn_group_,
channels_ / group_,
out_channels_ / cudnn_group_,
in_channels_ / group_,
kshape_[0],
kshape_[1]));
#endif
// Determine the bias shape
if (HasBias()) {
CuDNNSetBiasDesc<T>(&bias_desc_, X.ndim(), num_output_, data_format());
CuDNNSetBiasDesc<T>(
&bias_desc_, X.ndim(), out_channels_, data_format());
}
}
// Set the conv configuration
......@@ -179,16 +180,16 @@ void CuDNNConv2dOp<Context>::DoRunWithType() {
ctx()->cudnn_handle(),
CuDNNType<T>::one,
input_desc_,
x + x_ofs_ * g,
x + x_offset_ * g,
filter_desc_,
w + w_ofs_ * g,
w + w_offset_ * g,
conv_desc_,
fwd_algo_,
scratch,
cudnn_ws_nbytes_,
CuDNNType<T>::zero,
output_desc_,
y + y_ofs_ * g));
y + y_offset_ * g));
}
if (HasBias()) {
......@@ -217,11 +218,11 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
ConvOpBase<Context>::Reshape();
if (data_format() == "NCHW") {
x_ofs_ = Input(0).stride(0) / cudnn_group_;
y_ofs_ = Output(0)->stride(0) / cudnn_group_;
x_offset_ = Input(0).stride(0) / cudnn_group_;
y_offset_ = Output(0)->stride(0) / cudnn_group_;
} else if (data_format() == "NHWC") {
x_ofs_ = Input(0).dim(-1) / cudnn_group_;
y_ofs_ = Output(0)->dim(-1) / cudnn_group_;
x_offset_ = Input(0).dim(-1) / cudnn_group_;
y_offset_ = Output(0)->dim(-1) / cudnn_group_;
}
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
......@@ -294,8 +295,8 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
filter_desc_,
CuDNNType<T>::type,
format_,
num_output_ / cudnn_group_,
channels_ / group_,
out_channels_ / cudnn_group_,
in_channels_ / group_,
kshape_[0],
kshape_[1]));
#else
......@@ -303,14 +304,15 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
filter_desc_,
CuDNNType<T>::type,
format_,
num_output_ / cudnn_group_,
channels_ / group_,
out_channels_ / cudnn_group_,
in_channels_ / group_,
kshape_[0],
kshape_[1]));
#endif
// Determine the bias shape
if (HasBias()) {
CuDNNSetBiasDesc<T>(&bias_desc_, X.ndim(), num_output_, data_format());
CuDNNSetBiasDesc<T>(
&bias_desc_, X.ndim(), out_channels_, data_format());
}
}
// Set the conv configuration
......@@ -470,16 +472,16 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
ctx()->cudnn_handle(),
CuDNNType<T>::one,
output_desc_,
x + x_ofs_ * g,
x + x_offset_ * g,
input_desc_,
dy + y_ofs_ * g,
dy + y_offset_ * g,
conv_desc_,
bwd_filter_algo_,
scratch,
cudnn_ws_nbytes_,
CuDNNType<T>::zero,
filter_desc_,
dw + w_ofs_ * g));
dw + w_offset_ * g));
}
}
......@@ -491,16 +493,16 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
ctx()->cudnn_handle(),
CuDNNType<T>::one,
filter_desc_,
w + w_ofs_ * g,
w + w_offset_ * g,
input_desc_,
dy + y_ofs_ * g,
dy + y_offset_ * g,
conv_desc_,
bwd_data_algo_,
scratch,
cudnn_ws_nbytes_,
CuDNNType<T>::zero,
output_desc_,
dx + x_ofs_ * g));
dx + x_offset_ * g));
}
}
}
......@@ -518,11 +520,11 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
ConvOpBase<Context>::Reshape(true);
if (data_format() == "NCHW") {
x_ofs_ = Input(0).stride(0) / cudnn_group_;
y_ofs_ = Input(-1).stride(0) / cudnn_group_;
x_offset_ = Input(0).stride(0) / cudnn_group_;
y_offset_ = Input(-1).stride(0) / cudnn_group_;
} else if (data_format() == "NHWC") {
x_ofs_ = Input(0).dim(-1) / cudnn_group_;
y_ofs_ = Input(-1).dim(-1) / cudnn_group_;
x_offset_ = Input(0).dim(-1) / cudnn_group_;
y_offset_ = Input(-1).dim(-1) / cudnn_group_;
}
DispatchHelper<FloatingTensorTypes>::Call(this, Input(-1));
......
......@@ -8,12 +8,7 @@ template <class Context>
template <typename T>
void ConvTranspose2dOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), *Y = Output(0);
ConvOpBase<Context>::Reshape();
// Fix the output shape for im2col/col2im
for (int i = 0; i < num_axes_; i++) {
out_shape_[i] = X.dim(axis_ + i);
}
TENSOR_FILL(W, w_shape_);
auto* x = X.template data<T, Context>();
......@@ -21,7 +16,7 @@ void ConvTranspose2dOp<Context>::DoRunWithType() {
auto* y = Y->template mutable_data<T, Context>();
for (int i = 0; i < X.dim(0); ++i) {
Dx(x + i * x_ofs_, w, y + i * y_ofs_);
Dx(x + i * x_offset_, w, y + i * y_offset_);
}
if (HasBias()) {
......@@ -44,19 +39,14 @@ template <typename T>
void ConvTranspose2dGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
ConvOpBase<Context>::Reshape(true);
// Fix the output shape for im2col/col2im
for (int i = 0; i < num_axes_; i++) {
out_shape_[i] = X.dim(axis_ + i);
}
if (dX->has_name()) {
auto* dy = dY.template data<T, Context>();
auto* w = W.template data<T, Context>();
auto* dx = dX->template mutable_data<T, Context>();
for (int i = 0; i < X.dim(0); ++i) {
Wx(dy + i * y_ofs_, w, dx + i * x_ofs_);
Wx(dy + i * y_offset_, w, dx + i * x_offset_);
}
}
......@@ -65,7 +55,7 @@ void ConvTranspose2dGradientOp<Context>::DoRunWithType() {
auto* dy = dY.template data<T, Context>();
auto* dw = dW->template mutable_data<T, Context>();
for (int i = 0; i < X.dim(0); ++i) {
Dw(x + i * x_ofs_, dy + i * y_ofs_, dw, i > 0);
Dw(x + i * x_offset_, dy + i * y_offset_, dw, i > 0);
}
}
......
......@@ -71,8 +71,8 @@ void CuDNNConvTranspose2dOp<Context>::ResetDesc() {
filter_desc_,
CuDNNType<T>::type,
format_,
channels_ / cudnn_group_,
num_output_ / group_,
in_channels_ / cudnn_group_,
out_channels_ / group_,
kshape_[0],
kshape_[1]));
#else
......@@ -80,14 +80,15 @@ void CuDNNConvTranspose2dOp<Context>::ResetDesc() {
filter_desc_,
CuDNNType<T>::type,
format_,
channels_ / cudnn_group_,
num_output_ / group_,
in_channels_ / cudnn_group_,
out_channels_ / group_,
kshape_[0],
kshape_[1]));
#endif
// Determine the bias shape
if (HasBias()) {
CuDNNSetBiasDesc<T>(&bias_desc_, X.ndim(), num_output_, data_format());
CuDNNSetBiasDesc<T>(
&bias_desc_, X.ndim(), out_channels_, data_format());
}
}
// Set the conv configuration
......@@ -180,16 +181,16 @@ void CuDNNConvTranspose2dOp<Context>::DoRunWithType() {
ctx()->cudnn_handle(),
CuDNNType<T>::one,
filter_desc_,
w + w_ofs_ * g,
w + w_offset_ * g,
input_desc_,
x + x_ofs_ * g,
x + x_offset_ * g,
conv_desc_,
fwd_algo_,
scratch,
cudnn_ws_nbytes_,
CuDNNType<T>::zero,
output_desc_,
y + y_ofs_ * g));
y + y_offset_ * g));
}
if (HasBias()) {
......@@ -218,11 +219,11 @@ void CuDNNConvTranspose2dOp<Context>::RunOnDevice() {
ConvOpBase<Context>::Reshape();
if (data_format() == "NCHW") {
x_ofs_ = Input(0).stride(0) / cudnn_group_;
y_ofs_ = Output(0)->stride(0) / cudnn_group_;
x_offset_ = Input(0).stride(0) / cudnn_group_;
y_offset_ = Output(0)->stride(0) / cudnn_group_;
} else if (data_format() == "NHWC") {
x_ofs_ = Input(0).dim(-1) / cudnn_group_;
y_ofs_ = Output(0)->dim(-1) / cudnn_group_;
x_offset_ = Input(0).dim(-1) / cudnn_group_;
y_offset_ = Output(0)->dim(-1) / cudnn_group_;
}
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
......@@ -293,8 +294,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
filter_desc_,
CuDNNType<T>::type,
format_,
channels_ / cudnn_group_,
num_output_ / group_,
in_channels_ / cudnn_group_,
out_channels_ / group_,
kshape_[0],
kshape_[1]));
#else
......@@ -302,14 +303,15 @@ void CuDNNConvTranspose2dGradientOp<Context>::ResetDesc() {
filter_desc,
CuDNNType<T>::type,
format_,
channels_ / cudnn_group_,
num_output_ / group_,
in_channels_ / cudnn_group_,
out_channels_ / group_,
kshape_[0],
kshape_[1]));
#endif
// Determine the bias shape
if (HasBias()) {
CuDNNSetBiasDesc<T>(&bias_desc_, X.ndim(), num_output_, data_format());
CuDNNSetBiasDesc<T>(
&bias_desc_, X.ndim(), out_channels_, data_format());
}
}
// Set the conv configuration
......@@ -466,16 +468,16 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
ctx()->cudnn_handle(),
CuDNNType<T>::one,
input_desc_,
dy + y_ofs_ * g,
dy + y_offset_ * g,
output_desc_,
x + x_ofs_ * g,
x + x_offset_ * g,
conv_desc_,
bwd_filter_algo_,
scratch,
cudnn_ws_nbytes_,
CuDNNType<T>::zero,
filter_desc_,
dw + w_ofs_ * g));
dw + w_offset_ * g));
}
}
......@@ -487,16 +489,16 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
ctx()->cudnn_handle(),
CuDNNType<T>::one,
input_desc_,
dy + y_ofs_ * g,
dy + y_offset_ * g,
filter_desc_,
w + w_ofs_ * g,
w + w_offset_ * g,
conv_desc_,
bwd_data_algo_,
scratch,
cudnn_ws_nbytes_,
CuDNNType<T>::zero,
output_desc_,
dx + x_ofs_ * g));
dx + x_offset_ * g));
}
}
}
......@@ -514,11 +516,11 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunOnDevice() {
ConvOpBase<Context>::Reshape(true);
if (data_format() == "NCHW") {
x_ofs_ = Input(0).stride(0) / cudnn_group_;
y_ofs_ = Input(-1).stride(0) / cudnn_group_;
x_offset_ = Input(0).stride(0) / cudnn_group_;
y_offset_ = Input(-1).stride(0) / cudnn_group_;
} else if (data_format() == "NHWC") {
x_ofs_ = Input(0).dim(-1) / cudnn_group_;
y_ofs_ = Input(-1).dim(-1) / cudnn_group_;
x_offset_ = Input(0).dim(-1) / cudnn_group_;
y_offset_ = Input(-1).dim(-1) / cudnn_group_;
}
DispatchHelper<FloatingTensorTypes>::Call(this, Input(-1));
......
......@@ -10,10 +10,11 @@ namespace dragon {
template <class Context>
void ConvOpBase<Context>::ComputeOutShape() {
auto X_dims = Input(0).dims();
out_shape_.clear();
for (int i = 0; i < num_axes_; i++) {
if (!Transposed()) {
auto idm = x_shape_[axis_ + i];
auto idm = X_dims[axis_ + i];
auto dk = dilation_[i] * (kshape_[i] - 1) + 1;
if (!str::find(padding_, "SAME")) {
// Explicit pads
......@@ -32,7 +33,7 @@ void ConvOpBase<Context>::ComputeOutShape() {
} // SAME_LOWER or SAME
}
} else {
auto idm = x_shape_[axis_ + i];
auto idm = X_dims[axis_ + i];
auto dk = dilation_[i] * (kshape_[i] - 1) + 1;
if (!str::find(padding_, "SAME")) {
// Explicit pads
......@@ -79,13 +80,11 @@ template <class Context>
template <typename T>
void ConvOpBase<Context>::Wx(const T* x, const T* w, T* y, bool skip) {
auto* col = x;
if (!is_1x1_) {
auto* scratch = ws()->template data<T, Context>({col_dim_})[0];
if (!skip) Im2Col(x, scratch);
col = scratch;
}
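// For each group, Y_g = W_g * col_g, where col is the im2col buffer
// (or the input itself for 1x1 kernels).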
for (int g = 0; g < group_; g++) {
if (data_format() == "NCHW") {
math::Gemm(
......@@ -95,10 +94,10 @@ void ConvOpBase<Context>::Wx(const T* x, const T* w, T* y, bool skip) {
conv_out_dim_,
kernel_dim_,
1.f,
w + w_ofs_ * g,
col + col_ofs_ * g,
w + w_offset_ * g,
col + col_offset_ * g,
0.f,
y + output_ofs_ * g,
y + out_offset_ * g,
ctx());
} else if (data_format() == "NHWC") {
math::Gemm(
......@@ -121,10 +120,11 @@ template <class Context>
template <typename T>
void ConvOpBase<Context>::Pb(const T* bias, T* y) {
if (data_format() == "NCHW") {
kernel::BiasAdd(Input(0).dim(0), num_output_, out_dim_, y, bias, y, ctx());
kernel::BiasAdd(
Input(0).dim(0), out_channels_, out_dim_, y, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::BiasAdd(
Input(0).dim(0) * out_dim_, num_output_, 1, y, bias, y, ctx());
Input(0).dim(0) * out_dim_, out_channels_, 1, y, bias, y, ctx());
}
}
......@@ -141,10 +141,10 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* w, T* dx) {
conv_out_dim_,
conv_out_channels_ / group_,
1.f,
w + w_ofs_ * g,
dy + output_ofs_ * g,
w + w_offset_ * g,
dy + out_offset_ * g,
0.f,
col + col_ofs_ * g,
col + col_offset_ * g,
ctx());
} else if (data_format() == "NHWC") {
math::Gemm(
......@@ -168,13 +168,11 @@ template <class Context>
template <typename T>
void ConvOpBase<Context>::Dw(const T* dy, const T* x, T* dw, bool accum) {
auto* col = x;
if (!is_1x1_) {
auto* scratch = ws()->template data<T, Context>({col_dim_})[0];
Im2Col(x, scratch);
col = scratch;
}
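// Per group, dW_g accumulates dY_g * col_g^T; 'accum' keeps the
// contribution from previous items in the batch.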
for (int g = 0; g < group_; g++) {
if (data_format() == "NCHW") {
math::Gemm(
......@@ -184,10 +182,10 @@ void ConvOpBase<Context>::Dw(const T* dy, const T* x, T* dw, bool accum) {
kernel_dim_,
conv_out_dim_,
1.f,
dy + output_ofs_ * g,
col + col_ofs_ * g,
dy + out_offset_ * g,
col + col_offset_ * g,
accum ? 1.f : 0.f,
dw + w_ofs_ * g,
dw + w_offset_ * g,
ctx());
} else if (data_format() == "NHWC") {
math::Gemm(
......@@ -211,10 +209,10 @@ template <typename T>
void ConvOpBase<Context>::Db(const T* dy, T* db) {
vec32_t dims, axes;
if (data_format() == "NCHW") {
dims = {(int)Input(0).dim(0), (int)num_output_, (int)out_dim_};
dims = {(int)Input(0).dim(0), (int)out_channels_, (int)out_dim_};
axes = {0, 2};
} else if (data_format() == "NHWC") {
dims = {(int)Input(0).dim(0), (int)out_dim_, (int)num_output_};
dims = {(int)Input(0).dim(0), (int)out_dim_, (int)out_channels_};
axes = {0, 1};
}
math::ReduceSum(3, dims.data(), 2, axes.data(), 1.f, dy, db, ctx());
......@@ -223,16 +221,15 @@ void ConvOpBase<Context>::Db(const T* dy, T* db) {
template <class Context>
void ConvOpBase<Context>::Setup(int num_axes) {
num_axes_ = num_axes;
auto at = [&](const vec64_t& vec, int i) {
return i < vec.size() ? vec[i] : vec[0];
};
auto pads = OpArgs<int64_t>("pads");
auto strides = OpArgs<int64_t>("strides");
auto kshape = OpArgs<int64_t>("kernel_shape");
auto dilations = OpArgs<int64_t>("dilations");
auto at = [&](const vec64_t& vec, int i) {
return i < vec.size() ? vec[i] : vec[0];
};
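// A single scalar given for pads/strides/kernel_shape/dilations
// is broadcast to every spatial axis.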
for (int i = 0; i < num_axes; i++) {
pad_l_.push_back(at(pads, i));
stride_.push_back(at(strides, i));
......@@ -241,8 +238,9 @@ void ConvOpBase<Context>::Setup(int num_axes) {
}
if ((int64_t)pads.size() == (num_axes * 2)) {
for (int i = 0; i < num_axes; i++)
for (int i = 0; i < num_axes; i++) {
pad_r_.push_back(pads[num_axes + i]);
}
} else {
pad_r_.assign(pad_l_.begin(), pad_l_.end());
}
......@@ -264,63 +262,56 @@ void ConvOpBase<Context>::Reshape(bool backward) {
auto* Y_ref = backward ? &Input(-1) : Output(0);
// Determine the in/out channels
channels_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
if (num_output_ <= 0) {
in_channels_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
if (out_channels_ <= 0) {
// Infer the out channels from the weights shape
num_output_ = W.count() / channels_;
for (int i = 0; i < num_axes_; i++)
num_output_ /= kshape_[i];
CHECK_GT(num_output_, 0) << "\nFailed to infer the out channels "
<< "from weights: " << W.DimString();
out_channels_ = W.count() / (in_channels_ / group_);
for (int i = 0; i < num_axes_; i++) {
out_channels_ /= kshape_[i];
}
CHECK_GT(out_channels_, 0) << "\nFailed to infer the out channels "
<< "from weights: " << W.DimString();
}
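// Transposed convolution swaps the roles of the in/out channels
// for the underlying GEMM.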
if (Transposed()) {
conv_out_channels_ = channels_;
conv_in_channels_ = num_output_;
conv_out_channels_ = in_channels_;
conv_in_channels_ = out_channels_;
} else {
conv_out_channels_ = num_output_;
conv_in_channels_ = channels_;
conv_out_channels_ = out_channels_;
conv_in_channels_ = in_channels_;
}
// Determine the weight and bias shape
// The weight shape is stored in NCHW order regardless of the data format,
// so that the fans can be computed correctly.
w_shape_ = {conv_out_channels_, conv_in_channels_ / group_};
for (int i = 0; i < num_axes_; i++)
for (int i = 0; i < num_axes_; i++) {
w_shape_.push_back(kshape_[i]);
b_shape_ = {num_output_};
}
b_shape_ = {out_channels_};
// Determine the Y shape
x_shape_ = X.dims();
// Determine the output shape
ComputeOutShape();
if (backward) {
if (Output(0)->has_name()) Output(0)->ReshapeLike(X);
if (Output(1)->has_name()) Output(1)->ReshapeLike(W);
if (Output(2)->has_name()) Output(2)->Reshape({num_output_});
if (Output(2)->has_name()) Output(2)->Reshape({out_channels_});
} else {
vec64_t Y_dims{X.dim(0)};
if (data_format() == "NCHW") {
y_shape_ = {X.dim(0), num_output_};
for (int i = 0; i < num_axes_; i++)
y_shape_.push_back(out_shape_[i]);
Y_dims.push_back(out_channels_);
for (int i = 0; i < num_axes_; i++) {
Y_dims.push_back(out_shape_[i]);
}
} else if (data_format() == "NHWC") {
y_shape_ = {X.dim(0)};
for (int i = 0; i < num_axes_; i++)
y_shape_.push_back(out_shape_[i]);
y_shape_.push_back(num_output_);
}
Output(0)->Reshape(y_shape_);
}
// Determine the input shape for im2col/col2im
in_shape_.clear();
for (int i = 0; i < num_axes_; i++) {
if (Transposed()) {
in_shape_.push_back(Y_ref->dim(axis_ + i));
} else {
in_shape_.push_back(X.dim(axis_ + i));
for (int i = 0; i < num_axes_; i++) {
Y_dims.push_back(out_shape_[i]);
}
Y_dims.push_back(out_channels_);
}
Output(0)->Reshape(Y_dims);
}
// Determine the out spatial dim
// Determine the output dim
auto end_axis = X.ndim() - 1;
if (data_format() == "NCHW") {
if (Transposed()) {
......@@ -338,25 +329,31 @@ void ConvOpBase<Context>::Reshape(bool backward) {
out_dim_ = Y_ref->count(axis_, end_axis);
}
// Determine the misc
x_ofs_ = X.stride(0);
y_ofs_ = Y_ref->stride(0);
// Compute the miscellaneous offsets and dimensions
x_offset_ = X.stride(0);
y_offset_ = Y_ref->stride(0);
kernel_dim_ = conv_in_channels_ / group_;
for (int i = 0; i < num_axes_; i++)
for (int i = 0; i < num_axes_; i++) {
kernel_dim_ *= kshape_[i];
col_ofs_ = kernel_dim_ * conv_out_dim_;
w_ofs_ = conv_out_channels_ * kernel_dim_ / group_;
output_ofs_ = conv_out_channels_ * conv_out_dim_ / group_;
}
col_offset_ = kernel_dim_ * conv_out_dim_;
w_offset_ = conv_out_channels_ * kernel_dim_ / group_;
out_offset_ = conv_out_channels_ * conv_out_dim_ / group_;
// Determine the workspace size for col buffer
col_dim_ = kernel_dim_ * group_;
// Compute the arguments for im2col/col2im
in_shape_.clear();
for (int i = 0; i < num_axes_; i++) {
if (Transposed()) {
col_dim_ *= x_shape_[axis_ + i];
in_shape_.push_back(Y_ref->dim(axis_ + i));
out_shape_[i] = X.dim(axis_ + i);
} else {
col_dim_ *= out_shape_[i];
in_shape_.push_back(X.dim(axis_ + i));
}
}
col_dim_ = kernel_dim_ * group_;
for (int i = 0; i < num_axes_; i++) {
col_dim_ *= out_shape_[i];
}
}
#define INSTANTIATE_API(Context, T) \
......
......@@ -25,7 +25,7 @@ class ConvOpBase : public Operator<Context> {
ConvOpBase(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
padding_(OpArg<string>("padding", "VALID")),
num_output_(OpArg<int64_t>("num_output", 0)),
out_channels_(OpArg<int64_t>("out_channels", 0)),
group_(OpArg<int64_t>("group", 1)) {
if (data_format() == "NCHW") {
axis_ = 2;
......@@ -42,18 +42,13 @@ class ConvOpBase : public Operator<Context> {
vec64_t kshape_, stride_;
vec64_t pad_l_, pad_r_, dilation_;
vec64_t in_shape_, out_shape_;
vec64_t x_shape_, y_shape_;
vec64_t w_shape_, b_shape_;
vec64_t in_shape_, w_shape_, b_shape_, out_shape_;
string padding_;
int64_t is_1x1_, num_output_, group_;
int64_t group_;
int64_t axis_, num_axes_;
int64_t channels_, out_dim_;
int64_t conv_in_channels_, conv_out_channels_;
int64_t conv_out_dim_, kernel_dim_, col_dim_;
int64_t col_ofs_, output_ofs_;
int64_t w_ofs_, x_ofs_, y_ofs_;
int64_t in_channels_, out_channels_, out_dim_;
int64_t x_offset_, w_offset_, y_offset_;
DECLARE_ARGS_WITH_DESC(int64_t, output_shape);
DECLARE_ARGS_WITH_DESC(int64_t, output_padding);
......@@ -133,37 +128,42 @@ class ConvOpBase : public Operator<Context> {
LOG(FATAL) << "ConvNd has not been implemented.";
}
}
int64_t is_1x1_;
int64_t kernel_dim_, col_dim_;
int64_t col_offset_, out_offset_;
int64_t conv_in_channels_, conv_out_channels_, conv_out_dim_;
};
DEFINE_ARGS_WITH_DESC(int64_t, ConvOpBase, output_shape);
DEFINE_ARGS_WITH_DESC(int64_t, ConvOpBase, output_padding);
#define USE_CONVOLUTION_FUNCTIONS \
using ConvOpBase<Context>::Setup; \
using ConvOpBase<Context>::Reshape; \
using ConvOpBase<Context>::Transposed; \
using ConvOpBase<Context>::HasBias; \
using ConvOpBase<Context>::Wx; \
using ConvOpBase<Context>::Pb; \
using ConvOpBase<Context>::Dx; \
using ConvOpBase<Context>::Dw; \
using ConvOpBase<Context>::Db; \
using ConvOpBase<Context>::kshape_; \
using ConvOpBase<Context>::stride_; \
using ConvOpBase<Context>::pad_l_; \
using ConvOpBase<Context>::pad_r_; \
using ConvOpBase<Context>::dilation_; \
using ConvOpBase<Context>::group_; \
using ConvOpBase<Context>::channels_; \
using ConvOpBase<Context>::num_output_; \
using ConvOpBase<Context>::axis_; \
using ConvOpBase<Context>::num_axes_; \
using ConvOpBase<Context>::x_ofs_; \
using ConvOpBase<Context>::y_ofs_; \
using ConvOpBase<Context>::w_ofs_; \
using ConvOpBase<Context>::w_shape_; \
using ConvOpBase<Context>::b_shape_; \
using ConvOpBase<Context>::in_shape_; \
#define USE_CONVOLUTION_FUNCTIONS \
using ConvOpBase<Context>::Setup; \
using ConvOpBase<Context>::Reshape; \
using ConvOpBase<Context>::Transposed; \
using ConvOpBase<Context>::HasBias; \
using ConvOpBase<Context>::Wx; \
using ConvOpBase<Context>::Pb; \
using ConvOpBase<Context>::Dx; \
using ConvOpBase<Context>::Dw; \
using ConvOpBase<Context>::Db; \
using ConvOpBase<Context>::kshape_; \
using ConvOpBase<Context>::stride_; \
using ConvOpBase<Context>::pad_l_; \
using ConvOpBase<Context>::pad_r_; \
using ConvOpBase<Context>::dilation_; \
using ConvOpBase<Context>::group_; \
using ConvOpBase<Context>::in_channels_; \
using ConvOpBase<Context>::out_channels_; \
using ConvOpBase<Context>::axis_; \
using ConvOpBase<Context>::num_axes_; \
using ConvOpBase<Context>::x_offset_; \
using ConvOpBase<Context>::w_offset_; \
using ConvOpBase<Context>::y_offset_; \
using ConvOpBase<Context>::in_shape_; \
using ConvOpBase<Context>::w_shape_; \
using ConvOpBase<Context>::b_shape_; \
using ConvOpBase<Context>::out_shape_
} // namespace dragon
......
......@@ -10,14 +10,15 @@ template <typename T>
void DepthwiseConv2dOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), *Y = Output(0);
group_ = channels_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
CHECK_EQ(channels_, num_output_) << "\nExcepted in/out channels unchanged.";
group_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
ConvOpBase<Context>::Reshape();
CHECK_EQ(in_channels_, out_channels_)
<< "\nExcepted in/out channels to be same.";
TENSOR_FILL(W, w_shape_);
kernel::DepthwiseConv2d(
Input(0).dim(0),
channels_,
in_channels_,
in_shape_[0],
in_shape_[1],
out_shape_[0],
......@@ -54,13 +55,13 @@ void DepthwiseConv2dGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
group_ = channels_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
group_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
ConvOpBase<Context>::Reshape(true);
if (dX->has_name()) {
kernel::DepthwiseConv2dGrad(
X.dim(0),
channels_,
in_channels_,
in_shape_[0],
in_shape_[1],
out_shape_[0],
......@@ -83,7 +84,7 @@ void DepthwiseConv2dGradientOp<Context>::DoRunWithType() {
if (dW->has_name()) {
kernel::DepthwiseConv2dWGrad(
X.dim(0),
channels_,
in_channels_,
in_shape_[0],
in_shape_[1],
out_shape_[0],
......
......@@ -12,14 +12,15 @@ template <typename T>
void CuDNNDepthwiseConv2dOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), *Y = Output(0);
group_ = channels_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
CHECK_EQ(channels_, num_output_) << "\nExcepted in/out channels unchanged.";
group_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
ConvOpBase<Context>::Reshape();
CHECK_EQ(in_channels_, out_channels_)
<< "\nExcepted in/out channels to be same.";
TENSOR_FILL(W, w_shape_);
kernel::DepthwiseConv2d(
X.dim(0),
channels_,
in_channels_,
in_shape_[0],
in_shape_[1],
out_shape_[0],
......@@ -40,7 +41,7 @@ void CuDNNDepthwiseConv2dOp<Context>::DoRunWithType() {
if (HasBias()) {
TENSOR_FILL(Input(2), b_shape_);
CuDNNSetBiasDesc<T>(&bias_desc_, 4, num_output_, data_format());
CuDNNSetBiasDesc<T>(&bias_desc_, 4, out_channels_, data_format());
CuDNNSetTensorDesc<T>(&output_desc_, Y->dims(), data_format());
CUDNN_CHECK(cudnnAddTensor(
ctx()->cudnn_handle(),
......@@ -64,13 +65,13 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
group_ = channels_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
group_ = data_format() == "NCHW" ? X.dim(1) : X.dim(-1);
ConvOpBase<Context>::Reshape(true);
if (dX->has_name()) {
kernel::DepthwiseConv2dGrad(
X.dim(0),
channels_,
in_channels_,
in_shape_[0],
in_shape_[1],
out_shape_[0],
......@@ -93,7 +94,7 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::DoRunWithType() {
if (dW->has_name()) {
kernel::DepthwiseConv2dWGrad(
X.dim(0),
channels_,
in_channels_,
in_shape_[0],
in_shape_[1],
out_shape_[0],
......@@ -115,7 +116,7 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::DoRunWithType() {
if (dB->has_name()) {
CuDNNSetTensorDesc<T>(&input_desc_, Input(-1).dims(), data_format());
CuDNNSetBiasDesc<T>(&bias_desc_, 4, num_output_, data_format());
CuDNNSetBiasDesc<T>(&bias_desc_, 4, out_channels_, data_format());
CUDNN_CHECK(cudnnConvolutionBackwardBias(
ctx()->cudnn_handle(),
CuDNNType<T>::one,
......
......@@ -50,7 +50,7 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
if (data_format() == "NCHW") {
for (int i = 0; i < num_axes; i++) {
perm.insert(perm.begin() + 1, perm.back());
perm.pop_back(); // CRD mode
perm.pop_back(); // DCR mode
}
}
......
......@@ -10,61 +10,65 @@ package dragon;
// Store the serialized Tensor objects.
message TensorProto {
repeated int32 dims = 1;
enum DataType {
UNDEFINED = 0;
// Basic types.
FLOAT = 1;
INT32 = 2;
BYTE = 3;
STRING = 4;
// Less-commonly used data types.
BOOL = 5;
UINT8 = 6;
INT8 = 7;
UINT16 = 8;
INT16 = 9;
INT64 = 10;
FLOAT16 = 12;
DOUBLE = 13;
}
optional DataType data_type = 2 [default = FLOAT];
// For float.
repeated float float_data = 3 [packed = true];
// For int32, uint8, int8, uint16, int16, bool, and float16
// Note about float16: in storage we will basically convert float16 byte-wise
// to unsigned short and then store them in the int32_data field.
repeated int32 int32_data = 4 [packed = true];
// For bytes.
optional bytes byte_data = 5;
// For strings.
repeated bytes string_data = 6;
// For double.
repeated double double_data = 9 [packed = true];
// For int64.
repeated int64 int64_data = 10 [packed = true];
// Store the raw data, contents are serialized as little-endian.
optional bytes raw_data = 13;
// Optionally, a name for the tensor.
optional string name = 7;
repeated int32 dims = 1;
enum DataType {
UNDEFINED = 0;
// Basic types.
FLOAT = 1;
INT32 = 2;
BYTE = 3;
STRING = 4;
// Less-commonly used data types.
BOOL = 5;
UINT8 = 6;
INT8 = 7;
UINT16 = 8;
INT16 = 9;
INT64 = 10;
FLOAT16 = 12;
DOUBLE = 13;
}
optional DataType data_type = 2 [default = FLOAT];
// For float.
repeated float float_data = 3 [packed = true];
// For int32, uint8, int8, uint16, int16, bool, and float16
// Note about float16: in storage we will basically convert float16 byte-wise
// to unsigned short and then store them in the int32_data field.
repeated int32 int32_data = 4 [packed = true];
// For bytes.
optional bytes byte_data = 5;
// For strings.
repeated bytes string_data = 6;
// For double.
repeated double double_data = 9 [packed = true];
// For int64.
repeated int64 int64_data = 10 [packed = true];
// Store the raw data, contents are serialized as little-endian.
optional bytes raw_data = 13;
// Optionally, a name for the tensor.
optional string name = 7;
}
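A minimal sketch (assuming NumPy) of the float16 convention noted above: values are reinterpreted byte-wise as uint16 and carried in the int32_data field.

```python
import numpy as np

values = np.array([1.0, -2.5, 0.125], dtype=np.float16)
int32_data = values.view(np.uint16).astype(np.int32).tolist()       # what gets serialized
restored = np.array(int32_data, dtype=np.uint16).view(np.float16)   # what a reader recovers
assert (restored == values).all()
```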
// Record the filler of Tensor.
// This structure is kept for backward compatibility
// with caffe1, which relies on implicit initializers.
message TensorFillerProto {
optional string tensor = 1;
optional string type = 2 [default = 'constant'];
optional float value = 3 [default = 0];
optional float low = 4 [default = 0];
optional float high = 5 [default = 1];
optional float mean = 6 [default = 0];
optional float std = 7 [default = 1];
optional float scale = 8 [default = 3];
enum VarianceNorm { FAN_IN = 0; FAN_OUT = 1; FAN_AVG=2; }
optional VarianceNorm variance_norm = 9 [default = FAN_IN];
optional string tensor = 1;
optional string type = 2 [default = 'constant'];
optional float value = 3 [default = 0];
optional float low = 4 [default = 0];
optional float high = 5 [default = 1];
optional float mean = 6 [default = 0];
optional float std = 7 [default = 1];
optional float scale = 8 [default = 3];
enum VarianceNorm {
FAN_IN = 0;
FAN_OUT = 1;
FAN_AVG = 2;
}
optional VarianceNorm variance_norm = 9 [default = FAN_IN];
}
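A short sketch of such a filler in protobuf text format; the generated module path `dragon.proto.dragon_pb2` and the filler values are assumptions for illustration.

```python
from google.protobuf import text_format
from dragon.proto import dragon_pb2  # assumed generated module path

filler = text_format.Parse("""
    tensor: "conv1/param:0"
    type: "xavier"
    variance_norm: FAN_AVG
""", dragon_pb2.TensorFillerProto())
```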
// Store multiple TensorProto objects in one single proto.
......@@ -74,99 +78,99 @@ message TensorProtos {
// DeviceType that Dragon currently supports.
enum DeviceTypeProto {
// The default device.
PROTO_CPU = 0;
// NVIDIA's CUDA Environment.
PROTO_CUDA = 1;
// CAMBRICON's CNML Environment.
PROTO_CNML = 2;
// The default device.
PROTO_CPU = 0;
// NVIDIA's CUDA Environment.
PROTO_CUDA = 1;
// CAMBRICON's CNML Environment.
PROTO_CNML = 2;
}
// Device-specific options.
message DeviceOption {
// The type of device to dispatch executions.
optional DeviceTypeProto device_type = 1 [default = PROTO_CPU];
// The index of this device.
optional int32 device_id = 2 [default = 0];
// The random seed to start the random generator.
optional uint32 random_seed = 3 [default = 3];
// The type of device to dispatch executions.
optional DeviceTypeProto device_type = 1 [default = PROTO_CPU];
// The index of this device.
optional int32 device_id = 2 [default = 0];
// The random seed to start the random generator.
optional uint32 random_seed = 3 [default = 3];
}
// A named argument containing either singular float, integer and string
// values, or repeated float, int and string arrays.
message Argument {
// The name of this argument.
optional string name = 1;
// Store the float32 value.
optional float f = 2;
// Store the bool, int32, int64 value.
optional int64 i = 3;
// Store the string value.
optional bytes s = 4;
// Store the float32 values.
repeated float floats = 7;
// Store the bool, int32, int64 values.
repeated int64 ints = 8;
// Store the string values.
repeated bytes strings = 9;
// The name of this argument.
optional string name = 1;
// Store the float32 value.
optional float f = 2;
// Store the bool, int32, int64 value.
optional int64 i = 3;
// Store the string value.
optional bytes s = 4;
// Store the float32 values.
repeated float floats = 7;
// Store the bool, int32, int64 values.
repeated int64 ints = 8;
// Store the string values.
repeated bytes strings = 9;
}
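A brief sketch of filling a singular vs. a repeated field of `Argument` (again assuming the generated module is importable as `dragon.proto.dragon_pb2`):

```python
from dragon.proto import dragon_pb2  # assumed generated module path

scalar_arg = dragon_pb2.Argument(name='axis', i=1)                     # singular int
repeated_arg = dragon_pb2.Argument(name='kernel_shape', ints=[3, 3])   # repeated ints
```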
// Operator Definition
message OperatorDef {
// The name of inputs.
repeated string input = 1;
// The name of outputs.
repeated string output = 2;
// The optional name of this operator.
optional string name = 3;
// The operator type.
optional string type = 4;
// The arguments.
repeated Argument arg = 5;
// The device option that the operator should run under.
optional DeviceOption device_option = 6;
// The optional unique key for this operator.
// Set it to persist operators in the eager mode.
optional string cache_key = 7;
// The name of inputs.
repeated string input = 1;
// The name of outputs.
repeated string output = 2;
// The optional name of this operator.
optional string name = 3;
// The operator type.
optional string type = 4;
// The arguments.
repeated Argument arg = 5;
// The device option that the operator should run under.
optional DeviceOption device_option = 6;
// The optional unique key for this operator.
// Set it to persist operators in the eager mode.
optional string cache_key = 7;
}
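These fields map onto the helper used later in this commit, `proto_util.make_operator_def`. A brief sketch of building one operator definition; the tensor and op names are illustrative only.

```python
from dragon.core.framework import proto_util

op_def = proto_util.make_operator_def(
    name='Conv_1',                     # hypothetical op name
    op_type='Conv2d',                  # hypothetical op type
    inputs=['data', 'conv1/param:0'],
    outputs=['conv1_out'],
    device_option=proto_util.get_default_device_option(),
)
```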
// Record the gradient information
message GradientProto {
// The derivative target.
optional string cost = 1;
// The target with respect to?
optional string wrt = 2;
// The external gradient
optional string external = 3;
// The derivative target.
optional string cost = 1;
// The target with respect to?
optional string wrt = 2;
// The external gradient
optional string external = 3;
}
// Graph Definition
message GraphDef {
// The graph name.
optional string name = 1;
// The graph name.
optional string name = 1;
// The operators to execute.
repeated OperatorDef op = 2;
// The operators to execute.
repeated OperatorDef op = 2;
// The type of graph.
optional string graph_type = 3;
// The type of graph.
optional string graph_type = 3;
// The device option for this graph.
optional DeviceOption device_option = 5;
// The device option for this graph.
optional DeviceOption device_option = 5;
// The arguments.
repeated Argument arg = 6;
// The arguments.
repeated Argument arg = 6;
// The name of inputs.
repeated string input = 7;
// The name of outputs.
repeated string output = 8;
// The name of inputs.
repeated string input = 7;
// The name of outputs.
repeated string output = 8;
// The gradients information.
repeated GradientProto gradient = 9;
// The gradients information.
repeated GradientProto gradient = 9;
}
......@@ -2,7 +2,6 @@
// WARNING: This file is automatically generated! Please edit onnx.in.proto.
//
// Copyright (c) Facebook Inc. and Microsoft Corporation.
// Licensed under the MIT license.
......@@ -19,11 +18,12 @@ package onnx_dragon;
// 3) Definitions of built-in operators.
//
// This document describes the syntax of models and their computation graphs,
// as well as the standard data types. Together, they are referred to as the ONNX
// Intermediate Representation, or 'IR' for short.
// as well as the standard data types. Together, they are referred to as the
// ONNX Intermediate Representation, or 'IR' for short.
//
// The normative semantic specification of the ONNX IR is found in docs/IR.md.
// Definitions of the built-in neural network operators may be found in docs/Operators.md.
// Definitions of the built-in neural network operators may be found in
// docs/Operators.md.
// Notes
//
......@@ -35,10 +35,11 @@ package onnx_dragon;
// by sharing our working version of ONNX.
//
// Protobuf compatibility
//
// To simplify framework compatibility, ONNX is defined using the subset of protobuf
// that is compatible with both protobuf v2 and v3. This means that we do not use any
// protobuf features that are only available in one of the two versions.
//
// To simplify framework compatibility, ONNX is defined using the subset of
// protobuf that is compatible with both protobuf v2 and v3. This means that we
// do not use any protobuf features that are only available in one of the two
// versions.
//
// Here are the most notable contortions we have to carry out to work around
// these limitations:
......@@ -47,10 +48,10 @@ package onnx_dragon;
// of key-value pairs, where order does not matter and duplicates
// are not allowed.
// Versioning
//
// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md
// ONNX versioning is specified in docs/IR.md and elaborated on in
// docs/Versioning.md
//
// To be compatible with both proto2 and proto3, we will use a version number
// that is not defined by the default value but an explicit enum number.
......@@ -60,8 +61,8 @@ enum Version {
_START_VERSION = 0;
// The version field is always serialized and we will use it to store the
// version that the graph is generated from. This helps us set up version
// control.
// For the IR, we are using simple numbers starting with with 0x00000001,
// control.
// For the IR, we are using simple numbers starting with 0x00000001,
// which was the version we published on Oct 10, 2017.
IR_VERSION_2017_10_10 = 0x0000000000000001;
......@@ -80,13 +81,13 @@ enum Version {
// Attributes
//
// A named attribute containing either singular float, integer, string, graph,
// and tensor values, or repeated float, integer, string, graph, and tensor values.
// An AttributeProto MUST contain the name field, and *only one* of the
// and tensor values, or repeated float, integer, string, graph, and tensor
// values. An AttributeProto MUST contain the name field, and *only one* of the
// following content fields, effectively enforcing a C/C++ union equivalent.
message AttributeProto {
// Note: this enum is structurally identical to the OpSchema::AttrType
// enum defined in schema.h. If you rev one, you likely need to rev the other.
// enum defined in schema.h. If you rev one, you likely need to rev the
// other.
enum AttributeType {
UNDEFINED = 0;
FLOAT = 1;
......@@ -103,12 +104,12 @@ message AttributeProto {
}
// The name field MUST be present for this version of the IR.
optional string name = 1; // namespace Attribute
// if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function.
// In this case, this AttributeProto does not contain data, and it's a reference of attribute
// in parent scope.
// NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph.
optional string name = 1; // namespace Attribute
// if ref_attr_name is not empty, ref_attr_name is the attribute name in
// parent function. In this case, this AttributeProto does not contain data,
// and it's a reference of attribute in parent scope. NOTE: This should ONLY
// be used in function (sub-graph). It's invalid to be used in main graph.
optional string ref_attr_name = 21;
// A human-readable documentation for this attribute. Markdown is allowed.
......@@ -120,16 +121,19 @@ message AttributeProto {
// which value field was in use. For IR_VERSION 0.0.2 or later, this
// field MUST be set and match the f|i|s|t|... field in use. This
// change was made to accommodate proto3 implementations.
optional AttributeType type = 20; // discriminator that indicates which field below is in use
// Exactly ONE of the following fields must be present for this version of the IR
optional float f = 2; // float
optional int64 i = 3; // int
optional bytes s = 4; // UTF-8 string
optional TensorProto t = 5; // tensor value
optional GraphProto g = 6; // graph
optional AttributeType type =
20; // discriminator that indicates which field below is in use
// Exactly ONE of the following fields must be present for this version of the
// IR
optional float f = 2; // float
optional int64 i = 3; // int
optional bytes s = 4; // UTF-8 string
optional TensorProto t = 5; // tensor value
optional GraphProto g = 6; // graph
// Do not use field below, it's deprecated.
// optional ValueProto v = 12; // value - subsumes everything but graph
// optional ValueProto v = 12; // value - subsumes everything but
// graph
repeated float floats = 7; // list of floats
repeated int64 ints = 8; // list of ints
......@@ -142,7 +146,7 @@ message AttributeProto {
// the shape of the value.
message ValueInfoProto {
// This field MUST be present in this version of the IR.
optional string name = 1; // namespace Value
optional string name = 1; // namespace Value
// This field MUST be present in this version of the IR.
optional TypeProto type = 2;
// A human-readable documentation for this value. Markdown is allowed.
......@@ -154,20 +158,20 @@ message ValueInfoProto {
// Computation graphs are made up of a DAG of nodes, which represent what is
// commonly called a "layer" or "pipeline stage" in machine learning frameworks.
//
// For example, it can be a node of type "Conv" that takes in an image, a filter
// For example, it can be a node of type "Conv" that takes in an image, a filter
// tensor and a bias tensor, and produces the convolved output.
message NodeProto {
repeated string input = 1; // namespace Value
repeated string output = 2; // namespace Value
repeated string input = 1; // namespace Value
repeated string output = 2; // namespace Value
// An optional identifier for this node in a graph.
// This field MAY be absent in this version of the IR.
optional string name = 3; // namespace Node
optional string name = 3; // namespace Node
// The symbolic identifier of the Operator to execute.
optional string op_type = 4; // namespace Operator
// The domain of the OperatorSet that specifies the operator named by op_type.
optional string domain = 7; // namespace Domain
optional string domain = 7; // namespace Domain
// Additional named attributes.
repeated AttributeProto attribute = 5;
......@@ -198,21 +202,21 @@ message ModelProto {
repeated OperatorSetIdProto opset_import = 8;
// The name of the framework or tool used to generate this model.
// This field SHOULD be present to indicate which implementation/tool/framework
// emitted the model.
// This field SHOULD be present to indicate which
// implementation/tool/framework emitted the model.
optional string producer_name = 2;
// The version of the framework or tool used to generate this model.
// This field SHOULD be present to indicate which implementation/tool/framework
// emitted the model.
// This field SHOULD be present to indicate which
// implementation/tool/framework emitted the model.
optional string producer_version = 3;
// Domain name of the model.
// We use reverse domain names as name space indicators. For example:
// `com.facebook.fair` or `com.microsoft.cognitiveservices`
//
// Together with `model_version` and GraphProto.name, this forms the unique identity of
// the graph.
// Together with `model_version` and GraphProto.name, this forms the unique
// identity of the graph.
optional string domain = 4;
// The version of the graph encoded. See Version enum below.
......@@ -232,25 +236,25 @@ message ModelProto {
// See https://developers.google.com/protocol-buffers/docs/proto3#maps
message StringStringEntryProto {
optional string key = 1;
optional string value= 2;
optional string value = 2;
};
// Graphs
//
// A graph defines the computational logic of a model and is comprised of a parameterized
// list of nodes that form a directed acyclic graph based on their inputs and outputs.
// This is the equivalent of the "network" or "graph" in many deep learning
// frameworks.
// A graph defines the computational logic of a model and is comprised of a
// parameterized list of nodes that form a directed acyclic graph based on their
// inputs and outputs. This is the equivalent of the "network" or "graph" in
// many deep learning frameworks.
message GraphProto {
// The nodes in the graph, sorted topologically.
repeated NodeProto node = 1;
// The name of the graph.
optional string name = 2; // namespace Graph
optional string name = 2; // namespace Graph
// A list of named tensor values, used to specify constant inputs of the graph.
// Each TensorProto entry must have a distinct name (within the list) that
// also appears in the input list.
// A list of named tensor values, used to specify constant inputs of the
// graph. Each TensorProto entry must have a distinct name (within the list)
// that also appears in the input list.
repeated TensorProto initializer = 5;
// A human-readable documentation for this graph. Markdown is allowed.
......@@ -264,13 +268,10 @@ message GraphProto {
// must be distinct. It is optional for a value to appear in value_info list.
repeated ValueInfoProto value_info = 13;
// DO NOT USE the following fields, they were deprecated from earlier versions.
// repeated string input = 3;
// repeated string output = 4;
// optional int64 ir_version = 6;
// optional int64 producer_version = 7;
// optional string producer_tag = 8;
// optional string domain = 9;
// DO NOT USE the following fields, they were deprecated from earlier
// versions. repeated string input = 3; repeated string output = 4; optional
// int64 ir_version = 6; optional int64 producer_version = 7; optional string
// producer_tag = 8; optional string domain = 9;
}
// Tensors
......@@ -297,8 +298,8 @@ message TensorProto {
DOUBLE = 11;
UINT32 = 12;
UINT64 = 13;
COMPLEX64 = 14; // complex with float32 real and imaginary components
COMPLEX128 = 15; // complex with float64 real and imaginary components
COMPLEX64 = 14; // complex with float32 real and imaginary components
COMPLEX128 = 15; // complex with float64 real and imaginary components
// Non-IEEE floating-point format based on IEEE754 single-precision
// floating-point number truncated to 16 bits.
......@@ -356,7 +357,7 @@ message TensorProto {
repeated int64 int64_data = 7 [packed = true];
// Optionally, a name for the tensor.
optional string name = 8; // namespace Value
optional string name = 8; // namespace Value
// A human-readable documentation for this tensor. Markdown is allowed.
optional string doc_string = 12;
......@@ -368,14 +369,16 @@ message TensorProto {
// When this raw_data field is used to store tensor value, elements MUST
// be stored in as fixed-width, little-endian order.
// Floating-point data types MUST be stored in IEEE 754 format.
// Complex64 elements must be written as two consecutive FLOAT values, real component first.
// Complex128 elements must be written as two consecutive DOUBLE values, real component first.
// Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false).
// Complex64 elements must be written as two consecutive FLOAT values, real
// component first. Complex128 elements must be written as two consecutive
// DOUBLE values, real component first. Boolean type MUST be written one byte
// per tensor element (00000001 for true, 00000000 for false).
//
// Note: the advantage of specific field rather than the raw_data field is
// that in some cases (e.g. int data), protobuf does a better packing via
// variable length storage, and may lead to smaller binary footprint.
// When this field is present, the data_type field MUST NOT be STRING or UNDEFINED
// When this field is present, the data_type field MUST NOT be STRING or
// UNDEFINED
optional bytes raw_data = 9;
// For double
......@@ -384,7 +387,8 @@ message TensorProto {
// and the corresponding imaginary component appearing in the
// subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i]
// is encoded as [1.0, 2.0, 3.0, 4.0])
// When this field is present, the data_type field MUST be DOUBLE or COMPLEX128
// When this field is present, the data_type field MUST be DOUBLE or
// COMPLEX128
repeated double double_data = 10 [packed = true];
// For uint64 and uint32 values
......@@ -400,12 +404,13 @@ message TensorShapeProto {
message Dimension {
oneof value {
int64 dim_value = 1;
string dim_param = 2; // namespace Shape
string dim_param = 2; // namespace Shape
};
// Standard denotation can optionally be used to denote tensor
// dimensions with standard semantic descriptions to ensure
// that operations are applied to the correct axis of a tensor.
// Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition
// Refer to
// https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition
// for pre-defined dimension denotations.
optional string denotation = 3;
};
......@@ -416,7 +421,6 @@ message TensorShapeProto {
//
// The standard ONNX data types.
message TypeProto {
message Tensor {
// This field MUST NOT have the value of UNDEFINED
// This field MUST be present for this version of the IR.
......@@ -424,16 +428,15 @@ message TypeProto {
optional TensorShapeProto shape = 2;
}
oneof value {
// The type of a tensor.
Tensor tensor_type = 1;
}
// An optional denotation can be used to denote the whole
// type with a standard semantic description as to what is
// stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition
// An optional denotation can be used to denote the whole
// type with a standard semantic description as to what is
// stored inside. Refer to
// https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition
// for pre-defined type denotations.
optional string denotation = 6;
}
......@@ -445,7 +448,8 @@ message OperatorSetIdProto {
// The domain of the operator set being identified.
// The empty string ("") or absence of this field implies the operator
// set that is defined as part of the ONNX specification.
// This field MUST be present in this version of the IR when referring to any other operator set.
// This field MUST be present in this version of the IR when referring to any
// other operator set.
optional string domain = 1;
// The version of the operator set being identified.
......
......@@ -28,10 +28,10 @@ from dragon._api import losses
from dragon._api import math
from dragon._api import metrics
from dragon._api import nn
from dragon._api import optimizers
from dragon._api import random
from dragon._api import updaters
from dragon._api import vision
from dragon._api import workspace
from dragon._api import vision
# Virtual API
from dragon import vm
......@@ -56,7 +56,7 @@ from dragon.core.framework.context import name_scope
from dragon.core.framework.workspace import get_workspace
from dragon.core.framework.workspace import reset_workspace
from dragon.core.ops import tensorbind_eager as _
from dragon.core.ops import tensorbind_symbolic as _
from dragon.core.ops import tensorbind_symbol as _
from dragon.core.ops.array_ops import arange
from dragon.core.ops.array_ops import broadcast_to
from dragon.core.ops.array_ops import cast
......
......@@ -24,9 +24,9 @@ from dragon.core.ops.array_ops import min
from dragon.core.ops.array_ops import moments
from dragon.core.ops.array_ops import sum
from dragon.core.ops.math_ops import abs
from dragon.core.ops.math_ops import accumulate
from dragon.core.ops.math_ops import add
from dragon.core.ops.math_ops import affine
from dragon.core.ops.math_ops import axpby
from dragon.core.ops.math_ops import ceil
from dragon.core.ops.math_ops import clip
from dragon.core.ops.math_ops import cos
......@@ -45,7 +45,6 @@ from dragon.core.ops.math_ops import log
from dragon.core.ops.math_ops import matmul
from dragon.core.ops.math_ops import maximum
from dragon.core.ops.math_ops import minimum
from dragon.core.ops.math_ops import moving_average
from dragon.core.ops.math_ops import mul
from dragon.core.ops.math_ops import negative
from dragon.core.ops.math_ops import not_equal
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import as _absolute_import
from __future__ import division as _division
from __future__ import print_function as _print_function
from dragon.core.training.adam import Adam
from dragon.core.training.optimizer import Optimizer
from dragon.core.training.rmsprop import RMSprop
from dragon.core.training.sgd import Nesterov
from dragon.core.training.sgd import SGD
__all__ = [_s for _s in dir() if not _s.startswith('_')]
......@@ -29,7 +29,7 @@ from dragon.core.eager import context as eager_context
from dragon.core.eager.tensor import EagerTensor
from dragon.core.framework import context
from dragon.core.framework import workspace
from dragon.core.training import updater
from dragon.core.training import optimizer
from dragon.core.util import decorator
from dragon.core.util import inspect
from dragon.core.util import nest
......@@ -265,7 +265,7 @@ class FunctionGuard(object):
dummies.append(obj)
executables = [function_lib.create_function(inputs, outputs)]
for obj in dummies:
if isinstance(obj, updater.Updater):
if isinstance(obj, optimizer.Optimizer):
executables.append(function_lib.create_function(updater=obj))
self.inputs = inputs
self.outputs = returns
......
......@@ -78,22 +78,22 @@ def add_phase(graph_def, targets):
graph_def.arg.extend([proto_util.make_argument('phase', phase)])
def add_update_ops(graph_def, updater):
def add_update_ops(graph_def, optimizer):
"""Add the update operators for graph."""
if updater is None:
if optimizer is None:
return
grads, update_ops = [], []
extra_arguments = updater._extra_kwargs
extra_arguments['slot'] = updater._slot
extra_arguments = optimizer._extra_kwargs
extra_arguments['handle'] = optimizer._op_handle
# Generate update operators according to the updater.
for e in updater._param_group:
for e in optimizer._param_group:
(param, grad), arguments = e
if workspace.has_tensor(grad):
grads.append(grad)
arguments = dict(arguments, **extra_arguments)
update_ops.append(
proto_util.make_operator_def(
op_type=updater._op_type,
op_type=optimizer._op_type,
inputs=[grad],
outputs=[param],
name=OpDef.get_name(),
......@@ -102,7 +102,7 @@ def add_update_ops(graph_def, updater):
else:
logging.info('Skip to update Tensor({}).'.format(param))
# Insert a reduce op if the process group is found.
process_group = updater._process_group
process_group = optimizer._process_group
if process_group is not None:
update_ops.insert(
0, proto_util.make_operator_def(
......@@ -139,12 +139,15 @@ class Function(object):
# Collect the forward operators.
requires_grad = False
for output in outputs:
for i, output in enumerate(outputs):
op_info.merge_from(output)
op_info.add_target(output.id)
if output._grad is not None and \
output._grad.required():
requires_grad = True
try:
grad_info = output._grad
if grad_info and grad_info.required():
requires_grad = True
except AttributeError:
raise ValueError('Output[%d] is not a symbolic tensor.' % i)
# Handle givens.
if givens is not None:
......@@ -169,23 +172,23 @@ class Function(object):
])
del op_def.input[:len(op_def.input) // 2]
# Sort out the topology of states.
# Sort out the states.
op_defs = sorted(op_info._defs.items(), key=lambda d: d[0])
forward_ops = copy.deepcopy([v for k, v in op_defs])
# Generate the backward operators.
if requires_grad:
input_grads = {}
input_grads, grad_targets = {}, []
for output in outputs:
if hasattr(output, '_grad'):
grad_info = output._grad
if grad_info is not None:
if grad_info.input is not None:
input_grads[output.id] = grad_info.input.id
grad_info = output._grad
if grad_info is not None:
if grad_info.input is not None:
input_grads[output.id] = output._grad.input.id
grad_targets.append(output.id)
forward_ops, gradient_ops, _ = \
grad_maker.GradientMaker.make(
forward_ops=forward_ops,
targets=list(op_info._targets),
targets=grad_targets,
input_grads=input_grads,
)
else:
......
......@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.autograph.tensor import RefTensor
from dragon.core.autograph.tensor import TensorRef
from dragon.core.eager import context
from dragon.core.util import nest
......@@ -120,19 +120,12 @@ def gradients(ys, xs, grad_ys=None):
if grad_ys is not None:
y._grad.set_input(grad_ys[i])
for x in xs:
if not hasattr(x, '_grad') or \
x._grad is None:
if not hasattr(x, '_grad') or x._grad is None:
x._grad = GradientInfo(x)
y._grad.add_wrt(x.id)
x._grad.add_cost(y)
if i == 0:
dxs.append(
RefTensor(
name=x.id + '_grad',
shape=x.shape,
dtype=x.dtype,
)
)
dxs.append(TensorRef(x.id + '_grad', x.shape, x.dtype))
# Return the packed gradients.
return dxs
......@@ -15,7 +15,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.autograph.tensor import RefTensor
from dragon.core.autograph.tensor import TensorRef
from dragon.core.autograph import op_spec
from dragon.core.framework import context
from dragon.core.framework import proto_util
......@@ -76,26 +76,24 @@ class OpDef(object):
outputs = []
name_scope = context.get_name_scope()
for i in range(num_outputs):
outputs.append(RefTensor(
outputs.append(TensorRef(
workspace.get_dummy_name(
name_scope + (name if name else op_type),
suffix=':{}'.format(i),
domain='Tensor'))
)
domain='Tensor')))
else:
outputs = nest.flatten(outputs)
num_outputs = len(outputs)
# Construct Def.
op_idx, op_name = OpDef.get_index_and_name()
op_info._defs[op_idx] = \
proto_util.make_operator_def(
name=op_name,
op_type=op_type,
inputs=[input.id for input in inputs],
outputs=[output.id for output in outputs],
device_option=proto_util.get_default_device_option(),
**kwargs)
op_info._defs[op_idx] = proto_util.make_operator_def(
name=op_name,
op_type=op_type,
inputs=[input.id for input in inputs],
outputs=[output.id for output in outputs],
device_option=proto_util.get_default_device_option(),
**kwargs)
# Blend the op for outputs.
for output in outputs:
......
......@@ -147,7 +147,7 @@ def cast_spec(args, inputs, outputs):
outputs[0].dtype = args['dtype']
try:
outputs[0].shape = inputs[0].shape[:]
except TypeError:
except (TypeError, IndexError):
pass
return outputs
......@@ -192,7 +192,10 @@ def conv_spec(args, inputs, outputs):
out_shape = inputs[0].shape[:]
channel_axis = 1 if args['data_format'] == 'NCHW' else -1
spatial_axis = 2 if args['data_format'] == 'NCHW' else 1
out_shape[channel_axis] = args['num_output']
if 'out_channels' in args:
out_shape[channel_axis] = args['out_channels']
else:
out_shape[channel_axis] = inputs[1].shape[0]
for i in range(len(out_shape) - 2):
input_size = out_shape[i + spatial_axis]
k = args['kernel_shape'][i]
......@@ -219,7 +222,10 @@ def conv_transpose_spec(args, inputs, outputs):
out_shape = inputs[0].shape[:]
channel_axis = 1 if args['data_format'] == 'NCHW' else -1
spatial_axis = 2 if args['data_format'] == 'NCHW' else 1
out_shape[channel_axis] = args['num_output']
if 'out_channels' in args:
out_shape[channel_axis] = args['out_channels']
else:
out_shape[channel_axis] = inputs[1].shape[1]
for i in range(len(out_shape) - 2):
k = args['kernel_shape'][i]
s = args['strides'][i]
......@@ -274,20 +280,16 @@ def depth_to_space_spec(args, inputs, outputs):
@register('Dot')
def dot_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
ta, tb = args['transA'], args['transB']
try:
if len(inputs[0].shape) == 1:
a_shape, b_shape = inputs[0].shape[:], inputs[1].shape[:]
if len(a_shape) == 1 and len(b_shape) == 1:
outputs[0].shape = []
return outputs
except TypeError:
pass
try:
if len(inputs[0].shape) >= 2 and len(inputs[1].shape) in (1, 2):
out_shape = inputs[0].shape[1:] if ta else inputs[0].shape[:-1]
if len(inputs[1].shape) == 2:
out_shape.append(inputs[1].shape[0] if tb else inputs[1].shape[1])
outputs[0].shape = out_shape
return outputs
elif len(a_shape) == 2 and len(b_shape) == 2:
outputs[0].shape = [a_shape[0], b_shape[1]]
elif len(a_shape) == 0 and len(b_shape) == 0:
outputs[0].shape = []
elif len(a_shape) >= 2 and len(b_shape) == 1:
outputs[0].shape = a_shape[:-1]
except TypeError:
pass
return outputs
......@@ -298,6 +300,7 @@ def dot_spec(args, inputs, outputs):
'L1Loss',
'L2Loss',
'SigmoidCrossEntropy',
'SigmoidFocalLoss',
'SmoothL1Loss',
])
def eltwise_loss_spec(args, inputs, outputs):
......@@ -426,22 +429,22 @@ def flatten_spec(args, inputs, outputs):
@register('FullyConnected')
def fully_connected_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
axis, num_output = args['axis'], args['num_output']
axis, out_channels = args['axis'], args.get('out_channels', None)
while axis < 0:
try:
axis += len(inputs[0].shape)
except TypeError:
return outputs
outputs[0].shape = [None] * (axis + 1)
if num_output is None:
if out_channels is None:
try:
if args['transW']:
num_output = inputs[1].shape[0]
out_channels = inputs[1].shape[0]
else:
num_output = inputs[1].shape[1]
out_channels = inputs[1].shape[1]
except (TypeError, IndexError):
num_output = None
outputs[0].shape[axis] = num_output
out_channels = None
outputs[0].shape[axis] = out_channels
try:
outputs[0].shape[:axis] = inputs[0].shape[:axis]
except TypeError:
......@@ -488,7 +491,7 @@ def index_select_spec(args, inputs, outputs):
return outputs
@register(['IsInf', 'InNaN'])
@register(['IsInf', 'IsNaN'])
def is_spec(args, inputs, outputs):
_ = locals()
outputs[0].dtype = 'bool'
......@@ -507,7 +510,7 @@ def masked_select_spec(args, inputs, outputs):
return outputs
@register('Matmul')
@register('MatMul')
def matmul_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
ta, tb = args['transA'], args['transB']
......@@ -758,7 +761,7 @@ def resize_spec(args, inputs, outputs):
@register(['RoiPool', 'RoiAlign'])
def roi_pool_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
pool_h, pool_w = args['pool_h'], args['pool_w']
pool_h, pool_w = args['pooled_h'], args['pooled_w']
out_shape = None
try:
out_shape = inputs[0].shape[:]
......@@ -814,7 +817,6 @@ def slice_spec(args, inputs, outputs):
@register([
'NLLLoss',
'SigmoidFocalLoss',
'SoftmaxCrossEntropy',
'SparseSoftmaxCrossEntropy',
])
......
......@@ -420,7 +420,7 @@ class Tensor(types.TensorMetaclass):
The constant contains the value.
"""
return RefTensor('', dtype=dtype)._from_constant(value, name)
return Tensor('', dtype=dtype)._from_constant(value, name)
def _register_as(self, type, **kwargs):
"""Fill self with the specific type of filler."""
......@@ -463,13 +463,12 @@ class Tensor(types.TensorMetaclass):
"""Convert the value to a tensor."""
if not isinstance(value, numpy.ndarray):
value = numpy.array(value, self.dtype if self.dtype else 'float32')
return RefTensor(
return TensorRef(
name=workspace.get_dummy_name(
basename=context.get_name_scope() +
(name if name else 'Const'),
suffix=':0',
domain='Tensor'
),
domain='Tensor'),
shape=list(value.shape),
dtype=str(value.dtype),
).set_value(value)
......@@ -560,8 +559,8 @@ class Tensor(types.TensorMetaclass):
return self.__div__(other)
class RefTensor(object):
"""Create a reference tensor not involved with name scope."""
class TensorRef(object):
"""Create a reference not involved with name scope."""
def __new__(cls, name, shape=None, dtype=None):
tensor = Tensor('', shape=shape, dtype=dtype)
......
......@@ -9,7 +9,7 @@
#
# ------------------------------------------------------------
"""Some useful mappings are defined here."""
"""Constant mappings."""
from __future__ import absolute_import
from __future__ import division
......
......@@ -104,7 +104,7 @@ class Operator(object):
"""Generate the OpDef from attributes."""
attributes = self.attributes()
self._def = proto_util.make_operator_cdef(
name='Generic',
name=attributes.get('name', 'GenericOp'),
cache_key=self._cache_key,
op_type=attributes['op_type'],
device_option=proto_util.get_device_option(
......
......@@ -134,10 +134,14 @@ def make_operator_cdef(
op_def = backend.OperatorDef()
op_def.ParseFrom(
make_operator_def(
op_type, inputs, outputs, name,
cache_key, device_option, arg, **kwargs
).SerializeToString()
)
op_type,
inputs,
outputs,
name,
cache_key,
device_option,
arg,
**kwargs).SerializeToString())
return op_def
......
......@@ -9,12 +9,7 @@
#
# ------------------------------------------------------------
"""Wrappers for the Workspace of C++ backend.
Flexible API is provided to manage the global resources
between the Python threads (quite different from C++).
"""
"""Generic interfaces of current default workspace."""
from __future__ import absolute_import
from __future__ import division
......
......@@ -268,7 +268,7 @@ def leaky_relu(inputs, alpha=0.2, **kwargs):
@OpSchema.num_inputs(1)
def log_softmax(inputs, axis=1, **kwargs):
def log_softmax(inputs, axis=-1, **kwargs):
r"""Apply the composite of logarithm and softmax.
The **LogSoftmax** function is defined as:
......@@ -287,7 +287,7 @@ def log_softmax(inputs, axis=1, **kwargs):
----------
inputs : dragon.Tensor
The input tensor.
axis : int, optional, default=1
axis : int, optional, default=-1
The axis to reduce.
Returns
......@@ -351,7 +351,7 @@ def prelu(inputs, channel_shared=False, data_format='NCHW', **kwargs):
if context.executing_eagerly():
return op_lib \
.instantiate(data_format=data_format) \
.apply([inputs])
.apply(inputs)
else:
return op_lib.blend(**args)
......@@ -373,7 +373,7 @@ def relu(inputs, **kwargs):
Examples:
```python
x = dragon.constant([-1, 0, 1], 'float32')
x = dragon.constant([-1., 0., 1.])
print(dragon.nn.relu(x, inplace=False))
```
......@@ -449,10 +449,10 @@ def selu(inputs, alpha=1.67326, gamma=1.0507, **kwargs):
.. math::
\text{SELU}(x) = \gamma *
\begin{cases}
x, & \text{ if } x \geq 0 \\
\alpha * (e^{x} - 1), & \text{ otherwise }
\end{cases}
\begin{cases}
x, & \text{ if } x \geq 0 \\
\alpha * (e^{x} - 1), & \text{ otherwise }
\end{cases}
Examples:
......@@ -561,9 +561,8 @@ def softmax(inputs, axis=-1, **kwargs):
op_lib = activation_ops_lib.Softmax
if context.executing_eagerly():
return op_lib \
.instantiate(
axis=axis,
).apply([inputs], inplace=inplace)
.instantiate(axis=axis) \
.apply([inputs], inplace=inplace)
else:
return op_lib.blend(**args)
......
......@@ -64,11 +64,14 @@ def arange(start, stop=None, step=1, dtype='int64', **kwargs):
"""
args = parse_args(locals())
args['dtype'] = args['dtype'].lower()
op_lib = array_ops_lib.Arange
if stop is None:
args['slice'] = (start, step)
args['slice'] = (float(start), float(step))
else:
args['slice'] = (start, stop, step)
args['slice'] = (float(start), float(stop), float(step))
args.pop('start')
args.pop('stop')
args.pop('step')
op_lib = array_ops_lib.Arange
trainable = args.pop('trainable') if 'trainable' in args else False
if context.executing_eagerly():
return op_lib.instantiate(
......@@ -269,6 +272,8 @@ def cast(inputs, dtype, **kwargs):
.instantiate(dtype=dtype) \
.apply([inputs], inplace=inplace)
else:
if inputs.dtype == dtype:
return inputs
if inplace:
args['inputs'], args['outputs'] = [], [inputs]
return op_lib.blend(**args)
......@@ -627,16 +632,14 @@ def index_select(inputs, indices, axis=0, **kwargs):
return op_lib.blend(**args)
@OpSchema.num_inputs(1)
def masked_select(inputs, mask, **kwargs):
@OpSchema.num_inputs(2)
def masked_select(inputs, **kwargs):
"""Select the elements where the given mask is **1**.
Parameters
----------
inputs : dragon.Tensor
The input tensor.
mask : dragon.Tensor
The mask, with the same size as ``inputs``.
inputs : Sequence[dragon.Tensor]
The input and mask tensor.
Returns
-------
......@@ -647,9 +650,8 @@ def masked_select(inputs, mask, **kwargs):
args = parse_args(locals())
op_lib = array_ops_lib.MaskedSelect
if context.executing_eagerly():
return op_lib.instantiate().apply([inputs, mask])
return op_lib.instantiate().apply(inputs)
else:
args['inputs'] = [args['inputs'], args.pop('mask')]
return op_lib.blend(**args)
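A usage sketch of the new two-input signature, where the mask is passed as the second element of `inputs`; exposing the op as `dragon.masked_select` is an assumption here.

```python
import dragon

x = dragon.constant([1, 2, 3, 4], 'int64')
mask = dragon.constant([0, 1, 0, 1], 'bool')
y = dragon.masked_select([x, mask])  # keeps the elements where mask is 1
```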
......@@ -1047,7 +1049,7 @@ def pad(inputs, pads, mode='constant', value=0, **kwargs):
.instantiate(
ndim=len(pads_begin),
value=args['value'],
mode=mode,
mode=args['mode'],
).apply([inputs], args['pads'])
else:
return op_lib.blend(**args)
......@@ -1278,7 +1280,9 @@ def split(
size_splits = None
if slice_points is not None:
if len(slice_points) + 1 != num_splits:
raise ValueError('Excepted %d values for <slice_points>.')
raise ValueError(
'Expected %d values for <slice_points>.'
% len(slice_points))
if context.executing_eagerly():
return op_lib \
.instantiate(
......
......@@ -61,38 +61,36 @@ def assign(inputs, starts=None, sizes=None, **kwargs):
@OpSchema.num_inputs(1, 2)
def copy(inputs, **kwargs):
r"""Copy the value to ref.
.. math:: \text{Ref}[:] = \text{Value}[:]
"""Copy the input.
Examples:
```python
# Copy the content from ``x`` to ``xx``
# Copy ``x`` to ``y``
x = dragon.ones(shape=(2, 3))
xx = dragon.zeros(shape=(2, 4))
dragon.copy([xx, x])
y = dragon.zeros(shape=(2, 4))
dragon.copy([x, y])
# Create a new tensor initialized from ``x``
xxx = dragon.copy(x)
# Copy to a new tensor from ``x``
y = dragon.copy(x)
```
Parameters
----------
inputs : Sequence[dragon.Tensor]
The **ref** and **value**.
inputs : Union[dragon.Tensor, Sequence[dragon.Tensor]]
The input tensor.
Returns
-------
dragon.Tensor
The **ref**.
The output tensor.
"""
args = parse_args(locals())
inputs = nest.flatten(inputs)
if len(inputs) == 2:
args['inputs'] = [inputs[1]]
args['outputs'] = [inputs[0]]
args['inputs'] = nest.flatten(inputs)
if len(args['inputs']) == 2:
args['outputs'] = [args['inputs'][1]]
args['inputs'] = [args['inputs'][0]]
else:
args['outputs'] = None
op_lib = control_flow_ops_lib.Copy
......@@ -104,8 +102,8 @@ def copy(inputs, **kwargs):
return op_lib.blend('Copy', **args)
@OpSchema.num_inputs(2)
def masked_assign(inputs, mask, **kwargs):
@OpSchema.num_inputs(3)
def masked_assign(inputs, **kwargs):
r"""Assign the value to ref where mask is **1**.
.. math::
......@@ -118,24 +116,22 @@ def masked_assign(inputs, mask, **kwargs):
Parameters
----------
inputs : Sequence[dragon.Tensor]
The **ref** and **value**.
mask : dragon.Tensor
The mask, with the same size as **ref**.
The **ref**, **value** and **mask** tensor.
Returns
-------
dragon.Tensor
The **ref**.
The **ref** tensor.
"""
args = parse_args(locals())
inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
op_lib = control_flow_ops_lib.MaskedAssign
if context.executing_eagerly():
return op_lib.instantiate().apply(inputs, mask)
return op_lib.instantiate().apply(inputs)
else:
args.update({
'outputs': [args['inputs'][0]],
'inputs': [args['inputs'][1], mask],
'inputs': args['inputs'][1:],
})
return op_lib.blend(**args)
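A usage sketch of the three-input form, with `ref`, `value` and `mask` packed into `inputs`; the public name `dragon.masked_assign` is an assumption.

```python
import dragon

ref = dragon.zeros(shape=(4,))
value = dragon.ones(shape=(4,))
mask = dragon.constant([0, 1, 0, 1], 'bool')
dragon.masked_assign([ref, value, mask])  # ref takes value where mask is 1
```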
......@@ -47,7 +47,7 @@ class Assign(Operator):
sizes[i], 'int64',
)
def forward(self, ws, inputs, starts, sizes):
def forward(self, inputs, starts, sizes):
return self.dispatch(
[inputs[1]], [inputs[0]],
callback=lambda ws, handle:
......@@ -75,5 +75,5 @@ class MaskedAssign(Operator):
def attributes(self):
return {'op_type': 'MaskedAssign', 'arguments': {}}
def forward(self, inputs, mask):
return self.dispatch([inputs[1], mask], [inputs[0]], no_grad=True)
def forward(self, inputs):
return self.dispatch(inputs[1:], [inputs[0]], no_grad=True)
......@@ -88,9 +88,7 @@ def l1_loss(inputs, reduction='mean', **kwargs):
op_lib = loss_ops_lib.L1Loss
if context.executing_eagerly():
return op_lib \
.instantiate(
reduction=args['reduction'],
).apply(inputs)
.instantiate(reduction=args['reduction']).apply(inputs)
else:
return op_lib.blend(**args)
......
......@@ -46,55 +46,13 @@ def abs(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib.instantiate(op_type='Abs').apply(inputs)
return op_lib.instantiate(op_type='Abs').apply([inputs])
else:
return op_lib.blend('Abs', **args)
@OpSchema.num_inputs(1, 2147483647)
def accumulate(inputs, outputs=None, alpha=1., beta=1., **kwargs):
r"""Compute the element-wise accumulation from input to output.
.. math:: y = \alpha x + \beta y
If ``outputs`` is not provided, **zeros** will be used instead.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The tensor :math:`x`.
outputs : Sequence[dragon.Tensor], optional
The tensor :math:`y`.
alpha : number, optional, default=1.
The value of :math:`\alpha`.
beta : number, optional, default=1.
The value of :math:`\beta`.
Returns
-------
Sequence[dragon.Tensor]
The tensor :math:`y`.
"""
args = parse_args(locals())
args['alpha'], args['beta'] = float(alpha), float(beta)
if types.is_tensor(inputs):
inputs = [inputs]
if outputs is not None and types.is_tensor(outputs):
args['outputs'] = [outputs]
op_lib = math_ops_lib.Accumulate
if context.executing_eagerly():
return op_lib \
.instantiate(
alpha=args['alpha'],
beta=args['beta'],
).apply(inputs, args['outputs'])
else:
return op_lib.blend(**args)
@OpSchema.num_inputs(2)
def add(inputs, **kwargs):
r"""Compute the element-wise addition.
......@@ -123,11 +81,9 @@ def add(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Add') \
.apply(inputs)
return op_lib.instantiate(op_type='Add').apply(inputs)
else:
return op_lib.blend('Add', **args)
......@@ -173,6 +129,48 @@ def affine(inputs, axis=1, num_axes=1, **kwargs):
return op_lib.blend(**args)
@OpSchema.num_inputs(1, 2147483647)
def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs):
r"""Compute the element-wise addition from input to output.
.. math:: y = \alpha x + \beta y
If ``outputs`` is not provided, **zeros** will be used instead.
Parameters
----------
inputs : Union[dragon.Tensor, Sequence[dragon.Tensor]]
The tensor :math:`x`.
outputs : Union[dragon.Tensor, Sequence[dragon.Tensor]], optional
The tensor :math:`y`.
alpha : number, optional, default=1.
The value of :math:`\alpha`.
beta : number, optional, default=1.
The value of :math:`\beta`.
Returns
-------
Union[dragon.Tensor, Sequence[dragon.Tensor]]
The tensor :math:`y`.
"""
args = parse_args(locals())
args['alpha'], args['beta'] = float(alpha), float(beta)
if types.is_tensor(inputs):
inputs = [inputs]
if outputs is not None and types.is_tensor(outputs):
args['outputs'] = [outputs]
op_lib = math_ops_lib.Axpby
if context.executing_eagerly():
return op_lib \
.instantiate(
alpha=args['alpha'],
beta=args['beta'],
).apply(inputs, args['outputs'])
else:
return op_lib.blend(**args)
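A usage sketch of `axpby`; exposing it under `dragon.math` is assumed from the API imports touched earlier in this commit.

```python
import dragon

x = dragon.constant([1., 2., 3.])
y = dragon.math.axpby(x, alpha=2., beta=1.)  # no ``outputs`` given: y = 2 * x + 0
```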
@OpSchema.num_inputs(2)
def bitwise_and(inputs, **kwargs):
r"""Compute the element-wise AND bitwise operation.
......@@ -285,9 +283,9 @@ def ceil(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib.instantiate(op_type='Ceil').apply(inputs)
return op_lib.instantiate(op_type='Ceil').apply([inputs])
else:
return op_lib.blend('Ceil', **args)
......@@ -324,7 +322,7 @@ def clip(inputs, low=None, high=None, **kwargs):
.instantiate(
low=args['low'],
high=args['high'],
).apply(inputs)
).apply([inputs])
else:
return op_lib.blend(**args)
......@@ -354,11 +352,9 @@ def cos(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Cos') \
.apply(inputs)
return op_lib.instantiate(op_type='Cos').apply([inputs])
else:
return op_lib.blend('Cos', **args)
......@@ -391,56 +387,48 @@ def div(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Div') \
.apply(inputs)
return op_lib.instantiate(op_type='Div').apply(inputs)
else:
return op_lib.blend('Div', **args)
@OpSchema.num_inputs(2)
def dot(inputs, transA=False, transB=False, **kwargs):
def dot(inputs, **kwargs):
r"""Compute the dot product.
.. math:: \text{out} = a \cdot b
If ``rank(a)`` == ``rank(b)`` == 1, computes **Vector Dot**:
If ``rank(a)`` == ``rank(b)`` == 1, compute vector product:
```python
x = dragon.ones((4,))
y = dragon.ones((4,))
print(dragon.math.dot([x, y])) # 4.0
x = dragon.ones((2,))
y = dragon.ones((2,))
print(dragon.math.dot([x, y])) # 2.0
```
If ``rank(a)`` >= 2, ``rank(b)`` == 2, computes **Matrix-Matrix Multiplication**:
If ``rank(a)`` == ``rank(b)`` == 2, compute matrix multiplication:
```python
x = dragon.ones((1, 2, 3))
x = dragon.ones((2, 3))
y = dragon.ones((3, 2))
print(dragon.math.dot([x, y])) # [[[3. 3.], [3. 3.]]]
print(dragon.math.dot([x.reshape((2, 3)), y]).reshape((1, 2, 2))) # Equivalent
print(dragon.math.matmul([x.reshape((2, 3)), y]).reshape((1, 2, 2))) # Equivalent
print(dragon.math.matmul([x, y])) # Equivalent
```
If ``rank(a)`` >= 2, ``rank(b)`` == 1, computes **Matrix-Vector Multiplication**:
If ``rank(a)`` >= 2, ``rank(b)`` == 1, compute matrix-vector multiplication:
```python
x = dragon.ones((1, 2, 3))
x = dragon.ones((2, 3))
y = dragon.ones((3,))
print(dragon.math.dot([x, y])) # [3. 3.]
print(dragon.math.dot([x.reshape((2, 3)), y]).reshape((1, 2))) # Equivalent
```
Parameters
----------
inputs : Sequence[dragon.Tensor]
The tensor :math:`a` and :math:`b`.
transA : bool, optional, default=False
**True** to transpose :math:`a` before computation.
transB : bool, optional, default=False
**True** to transpose :math:`b` before computation.
Returns
-------
......@@ -449,15 +437,11 @@ def dot(inputs, transA=False, transB=False, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Dot
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(
transA=transA,
transB=transB,
).apply(inputs)
return op_lib.instantiate(op_type='Dot').apply(inputs)
else:
return op_lib.blend(**args)
return op_lib.blend('Dot', **args)
@OpSchema.num_inputs(2)
......@@ -489,11 +473,9 @@ def equal(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Equal') \
.apply(inputs)
return op_lib.instantiate(op_type='Equal').apply(inputs)
else:
return op_lib.blend('Equal', **args)
......@@ -523,11 +505,9 @@ def exp(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Exp') \
.apply(inputs)
return op_lib.instantiate(op_type='Exp').apply([inputs])
else:
return op_lib.blend('Exp', **args)
......@@ -557,38 +537,36 @@ def floor(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib.instantiate(op_type='Floor').apply(inputs)
return op_lib.instantiate(op_type='Floor').apply([inputs])
else:
return op_lib.blend('Floor', **args)
@OpSchema.num_inputs(2, 3)
def fully_connected(inputs, num_output=None, axis=1, transW=True, **kwargs):
def fully_connected(inputs, axis=1, transpose_w=True, **kwargs):
r"""Compute the dense matrix multiplication along the given axes.
.. math:: y = Wx + b
The column of input matrix is determined by:
.. math:: \text{Col} = \text{Dim}(\text{Input}, \text{Axis})
.. math:: \text{Col} = \text{DimSince}(\text{Input}, \text{Axis})
Parameters
----------
inputs : Sequence[dragon.Tensor]
The tensor :math:`x`, :math:`W` and :math:`b`.
num_output : int, optional
The optional output dim.
axis : int, optional, default=1
The start axis to compute, can be negative.
transW : bool, optional, default=True
transpose_w : bool, optional, default=True
**True** to transpose :math:`W` before computation.
Returns
-------
dragon.Tensor
The **y**.
The output tensor.
"""
args = parse_args(locals())
......@@ -597,9 +575,11 @@ def fully_connected(inputs, num_output=None, axis=1, transW=True, **kwargs):
return op_lib \
.instantiate(
axis=axis,
transW=transW,
transpose_w=transpose_w,
).apply(inputs)
else:
args.pop('transpose_w')
args['transW'] = transpose_w
return op_lib.blend('FullyConnected', **args)
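A usage sketch of the renamed argument; `dragon.nn.fully_connected` as the public path is an assumption. With `transpose_w=True`, the weight is laid out as (out_channels, in_dim).

```python
import dragon

x = dragon.ones(shape=(8, 32))    # (N, in_dim)
w = dragon.ones(shape=(64, 32))   # (out_channels, in_dim)
b = dragon.ones(shape=(64,))
y = dragon.nn.fully_connected([x, w, b], axis=1, transpose_w=True)  # -> (8, 64)
```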
......@@ -631,12 +611,10 @@ def greater(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
inputs = ops.remove_binary_scalar(inputs)
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Greater') \
.apply(inputs)
return op_lib.instantiate(op_type='Greater').apply(inputs)
else:
return op_lib.blend('Greater', **args)
......@@ -670,11 +648,9 @@ def greater_equal(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='GreaterEqual') \
.apply(inputs)
return op_lib.instantiate(op_type='GreaterEqual').apply(inputs)
else:
return op_lib.blend('GreaterEqual', **args)
......@@ -709,9 +685,9 @@ def invert(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib.instantiate(op_type='Invert').apply(inputs)
return op_lib.instantiate(op_type='Invert').apply([inputs])
else:
return op_lib.blend('Invert', **args)
......@@ -741,11 +717,9 @@ def is_inf(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='IsInf') \
.apply(inputs)
return op_lib.instantiate(op_type='IsInf').apply([inputs])
else:
return op_lib.blend('IsInf', **args)
......@@ -775,11 +749,9 @@ def is_nan(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='IsNaN') \
.apply(inputs)
return op_lib.instantiate(op_type='IsNaN').apply([inputs])
else:
return op_lib.blend('IsNaN', **args)
......@@ -809,11 +781,9 @@ def log(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Log') \
.apply(inputs)
return op_lib.instantiate(op_type='Log').apply([inputs])
else:
return op_lib.blend('Log', **args)
......@@ -847,11 +817,9 @@ def less(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Less') \
.apply(inputs)
return op_lib.instantiate(op_type='Less').apply(inputs)
else:
return op_lib.blend('Less', **args)
......@@ -885,17 +853,15 @@ def less_equal(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='LessEqual') \
.apply(inputs)
return op_lib.instantiate(op_type='LessEqual').apply(inputs)
else:
return op_lib.blend('LessEqual', **args)
@OpSchema.num_inputs(2)
def matmul(inputs, transA=False, transB=False, **kwargs):
def matmul(inputs, transpose_a=False, transpose_b=False, **kwargs):
r"""Compute the matrix multiplication.
.. math:: \text{out} = a \times b
......@@ -920,16 +886,16 @@ def matmul(inputs, transA=False, transB=False, **kwargs):
a = dragon.ones((3, 2), 'float32')
b = dragon.ones((3, 3), 'float32')
print(dragon.math.matmul([a, b])) # ``a`` takes the wrong dimensions
print(dragon.math.matmul([a, b], transA=True)) # Ok
print(dragon.math.matmul([a, b], transpose_a=True)) # Ok
```
Parameters
----------
inputs : Sequence[dragon.Tensor]
The matrix :math:`a` and :math:`b`.
transA : bool, optional, default=False
transpose_a : bool, optional, default=False
**True** to transpose :math:`a` before computation.
transB : bool, optional, default=False
transpose_b : bool, optional, default=False
**True** to transpose :math:`b` before computation.
Returns
......@@ -939,15 +905,17 @@ def matmul(inputs, transA=False, transB=False, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Matmul
op_lib = math_ops_lib.MatMul
if context.executing_eagerly():
return op_lib \
.instantiate(
transA=transA,
transB=transB,
transpose_a=transpose_a,
transpose_b=transpose_b,
).apply(inputs)
else:
return op_lib.blend(**args)
args.pop('transpose_a')
args.pop('transpose_b')
return op_lib.blend(transA=transpose_a, transB=transpose_b, **args)
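For readers tracking the keyword rename, a brief usage sketch mirroring the docstring example above (graph mode still lowers the new keywords to the legacy ``transA``/``transB`` arguments, as the branch above shows):

```python
import dragon

a = dragon.ones((3, 2), 'float32')
b = dragon.ones((3, 3), 'float32')
# ``a`` is transposed to (2, 3) before the product, giving a (2, 3) result.
y = dragon.math.matmul([a, b], transpose_a=True)
```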
@OpSchema.num_inputs(2)
......@@ -969,11 +937,9 @@ def maximum(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Maximum') \
.apply(inputs)
return op_lib.instantiate(op_type='Maximum').apply(inputs)
else:
return op_lib.blend('Maximum', **args)
......@@ -997,37 +963,13 @@ def minimum(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Minimum') \
.apply(inputs)
return op_lib.instantiate(op_type='Minimum').apply(inputs)
else:
return op_lib.blend('Minimum', **args)
@OpSchema.num_inputs(1, 2147483647)
def moving_average(inputs, decay, **kwargs):
r"""Compute the moving average of input to output.
.. math:: y = (1 - decay) * x + decay * y
Parameters
----------
inputs : Sequence[dragon.Tensor]
The **x**.
decay : float, required
The decay factor.
Returns
-------
Sequence[dragon.Tensor]
The **y**.
"""
return accumulate(inputs, 1. - decay, decay, **kwargs)
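Since this helper is removed by the commit, here is a minimal NumPy sketch of the exponential moving average it computed (illustrative only; the new ``Axpby`` operator below covers the same update with ``alpha = 1 - decay`` and ``beta = decay``):

```python
import numpy as np

def moving_average(x, y, decay):
    # y <- (1 - decay) * x + decay * y
    return (1.0 - decay) * x + decay * y

x = np.array([1.0, 2.0, 3.0], dtype='float32')  # new statistic
y = np.zeros(3, dtype='float32')                # running buffer
print(moving_average(x, y, decay=0.9))          # [0.1 0.2 0.3]
```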
@OpSchema.num_inputs(2)
def mul(inputs, **kwargs):
r"""Compute the element-wise multiplication.
......@@ -1056,11 +998,9 @@ def mul(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Mul') \
.apply(inputs)
return op_lib.instantiate(op_type='Mul').apply(inputs)
else:
return op_lib.blend('Mul', **args)
......@@ -1088,11 +1028,9 @@ def negative(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Neg') \
.apply(inputs)
return op_lib.instantiate(op_type='Neg').apply([inputs])
else:
return op_lib.blend('Neg', **args)
......@@ -1126,11 +1064,9 @@ def not_equal(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='NotEqual') \
.apply(inputs)
return op_lib.instantiate(op_type='NotEqual').apply(inputs)
else:
return op_lib.blend('NotEqual', **args)
......@@ -1163,11 +1099,9 @@ def pow(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Pow') \
.apply(inputs)
return op_lib.instantiate(op_type='Pow').apply(inputs)
else:
return op_lib.blend('Pow', **args)
......@@ -1197,11 +1131,9 @@ def reciprocal(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Reciprocal') \
.apply(inputs)
return op_lib.instantiate(op_type='Reciprocal').apply([inputs])
else:
return op_lib.blend('Reciprocal', **args)
......@@ -1231,9 +1163,9 @@ def round(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib.instantiate(op_type='Round').apply(inputs)
return op_lib.instantiate(op_type='Round').apply([inputs])
else:
return op_lib.blend('Round', **args)
......@@ -1263,11 +1195,9 @@ def rsqrt(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Rsqrt') \
.apply(inputs)
return op_lib.instantiate(op_type='Rsqrt').apply([inputs])
else:
return op_lib.blend('Rsqrt', **args)
......@@ -1303,9 +1233,9 @@ def sign(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib.instantiate(op_type='Sign').apply(inputs)
return op_lib.instantiate(op_type='Sign').apply([inputs])
else:
return op_lib.blend('Sign', **args)
......@@ -1335,11 +1265,9 @@ def sin(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Sin') \
.apply(inputs)
return op_lib.instantiate(op_type='Sin').apply([inputs])
else:
return op_lib.blend('Sin', **args)
......@@ -1369,11 +1297,9 @@ def sqrt(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Sqrt') \
.apply(inputs)
return op_lib.instantiate(op_type='Sqrt').apply([inputs])
else:
return op_lib.blend('Sqrt', **args)
......@@ -1403,11 +1329,9 @@ def square(inputs, **kwargs):
"""
args = parse_args(locals())
op_lib = math_ops_lib.Unary
op_lib = math_ops_lib.UnaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Square') \
.apply(inputs)
return op_lib.instantiate(op_type='Square').apply([inputs])
else:
return op_lib.blend('Square', **args)
......@@ -1440,10 +1364,8 @@ def sub(inputs, **kwargs):
"""
args = parse_args(locals())
inputs = ops.remove_binary_scalar(inputs)
op_lib = math_ops_lib.Binary
op_lib = math_ops_lib.BinaryOp
if context.executing_eagerly():
return op_lib \
.instantiate(op_type='Sub') \
.apply(inputs)
return op_lib.instantiate(op_type='Sub').apply(inputs)
else:
return op_lib.blend('Sub', **args)
......@@ -16,49 +16,49 @@ from __future__ import print_function
from dragon.core.framework.ops import Operator
class Accumulate(Operator):
class Affine(Operator):
def __init__(self, key, dev, **kwargs):
super(Accumulate, self).__init__(key, dev, **kwargs)
self.alpha = kwargs.get('alpha', 1.)
self.beta = kwargs.get('beta', 1.)
super(Affine, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 1)
self.num_axes = kwargs.get('num_axes', 1)
def attributes(self):
return {
'op_type': 'Accumulate',
'op_type': 'Affine',
'arguments': {
'alpha': self.alpha,
'beta': self.beta,
'axis': self.axis,
'num_axes': self.num_axes,
}
}
def forward(self, inputs, outputs=None):
if outputs is None:
outputs = [self.alloc() for _ in range(len(inputs))]
return self.dispatch(inputs, outputs, no_grad=True)
def forward(self, inputs):
return self.dispatch(inputs, [self.alloc()])
class Affine(Operator):
class Axpby(Operator):
def __init__(self, key, dev, **kwargs):
super(Affine, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 1)
self.num_axes = kwargs.get('num_axes', 1)
super(Axpby, self).__init__(key, dev, **kwargs)
self.alpha = kwargs.get('alpha', 1.)
self.beta = kwargs.get('beta', 1.)
def attributes(self):
return {
'op_type': 'Affine',
'op_type': 'Axpby',
'arguments': {
'axis': self.axis,
'num_axes': self.num_axes,
'alpha': self.alpha,
'beta': self.beta,
}
}
def forward(self, inputs):
return self.dispatch(inputs, [self.alloc()])
def forward(self, inputs, outputs=None):
if outputs is None:
outputs = [self.alloc() for _ in range(len(inputs))]
return self.dispatch(inputs, outputs, no_grad=True)
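A framework-free sketch of the update this operator is assumed to perform, following the conventional axpby semantics ``y = alpha * x + beta * y`` applied to each input/output pair (the real op allocates fresh output tensors when none are given; zero buffers stand in for them here):

```python
import numpy as np

def axpby(xs, ys=None, alpha=1.0, beta=1.0):
    if ys is None:
        ys = [np.zeros_like(x) for x in xs]  # stand-in for self.alloc()
    return [alpha * x + beta * y for x, y in zip(xs, ys)]

print(axpby([np.array([1.0, 2.0])], alpha=0.5, beta=2.0))  # [array([0.5, 1. ])]
```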
class Binary(Operator):
class BinaryOp(Operator):
def __init__(self, key, dev, **kwargs):
super(Binary, self).__init__(key, dev, **kwargs)
super(BinaryOp, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', '')
def attributes(self):
......@@ -95,37 +95,18 @@ class Clip(Operator):
return self.dispatch(inputs, [self.alloc()])
class Dot(Operator):
def __init__(self, key, dev, **kwargs):
super(Dot, self).__init__(key, dev, **kwargs)
self.transA = kwargs.get('transA', False)
self.transB = kwargs.get('transB', False)
def attributes(self):
return {
'op_type': 'Dot',
'arguments': {
'transA': self.transA,
'transB': self.transB,
}
}
def forward(self, inputs):
return self.dispatch(inputs, [self.alloc()])
class FullyConnected(Operator):
def __init__(self, key, dev, **kwargs):
super(FullyConnected, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 1)
self.transW = kwargs.get('transW', True)
self.transpose_w = kwargs.get('transpose_w', True)
def attributes(self):
return {
'op_type': 'FullyConnected',
'arguments': {
'axis': self.axis,
'transW': self.transW,
'transW': self.transpose_w,
}
}
......@@ -133,18 +114,18 @@ class FullyConnected(Operator):
return self.dispatch(inputs, [self.alloc()])
class Matmul(Operator):
class MatMul(Operator):
def __init__(self, key, dev, **kwargs):
super(Matmul, self).__init__(key, dev, **kwargs)
self.transA = kwargs.get('transA', False)
self.transB = kwargs.get('transB', False)
super(MatMul, self).__init__(key, dev, **kwargs)
self.transpose_a = kwargs.get('transpose_a', False)
self.transpose_b = kwargs.get('transpose_b', False)
def attributes(self):
return {
'op_type': 'Matmul',
'op_type': 'MatMul',
'arguments': {
'transA': self.transA,
'transB': self.transB,
'transA': self.transpose_a,
'transB': self.transpose_b,
}
}
......@@ -152,9 +133,9 @@ class Matmul(Operator):
return self.dispatch(inputs, [self.alloc()])
class Unary(Operator):
class UnaryOp(Operator):
def __init__(self, key, dev, **kwargs):
super(Unary, self).__init__(key, dev, **kwargs)
super(UnaryOp, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', '')
def attributes(self):
......
......@@ -38,10 +38,10 @@ def batch_norm(
.. math::
y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
The moving average of stats are calculated as:
    The running average of statistics is calculated as:
.. math::
x_{moving} \leftarrow momentum * x_{moving} + (1 - momentum) * x_{stat}
x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}}
Note that the number of inputs should be **5**, i.e.,
    this operator is implemented as the fused version.
......@@ -56,11 +56,11 @@ def batch_norm(
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.9
The momentum of moving average.
        The momentum for the running average.
eps : float, optional, default=1e-5
The epsilon.
The value of :math:`\epsilon`.
use_stats : int, optional, default=-1
Whether to use global stats.
Whether to use estimated statistics or not.
Returns
-------
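A NumPy sketch of the documented behaviour (not the fused kernel), assuming a simple ``(N, C)`` input with channels on the last axis:

```python
import numpy as np

momentum, eps = 0.9, 1e-5
x = np.random.randn(8, 3).astype('float32')
gamma, beta = np.ones(3, 'float32'), np.zeros(3, 'float32')
running_mean, running_var = np.zeros(3, 'float32'), np.ones(3, 'float32')

# Normalize with the batch statistics (training mode).
mean, var = x.mean(axis=0), x.var(axis=0)
y = (x - mean) / np.sqrt(var + eps) * gamma + beta

# Running averages updated as in the formula above.
running_mean = momentum * running_mean + (1 - momentum) * mean
running_var = momentum * running_var + (1 - momentum) * var
```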
......@@ -168,7 +168,7 @@ def instance_norm(inputs, axis=-1, eps=1e-5, **kwargs):
@OpSchema.num_inputs(1)
def lp_normalize(inputs, axis=None, p=2, eps=1e-5, reduction='sum', **kwargs):
def lp_normalize(inputs, axis=None, p=2, eps=1e-12, reduction='sum', **kwargs):
r"""Apply the lp normalization.
The **Lp-Normalization** is defined as:
......@@ -200,7 +200,7 @@ def lp_normalize(inputs, axis=None, p=2, eps=1e-5, reduction='sum', **kwargs):
The order of the normalization.
axis : Union[int, Sequence[int]], optional
The axis to compute the norm.
eps : float, optional, default=1e-5
eps : float, optional, default=1e-12
The value of :math:`\epsilon`.
reduction : {'sum', 'mean'}, optional
The reduction method for norm.
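As a rough illustration of the default L2 case, a NumPy sketch under the assumption that ``eps`` floors the reduced squared norm (the exact placement of ``eps`` in the kernel may differ):

```python
import numpy as np

def l2_normalize(x, axis=None, eps=1e-12):
    norm = np.sqrt(np.maximum(np.square(x).sum(axis=axis, keepdims=True), eps))
    return x / norm

print(l2_normalize(np.array([[3.0, 4.0]]), axis=1))  # [[0.6 0.8]]
```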
......@@ -326,9 +326,9 @@ def local_response_norm(
beta=args['beta'],
bias=args['bias'],
data_format=data_format,
).apply(inputs)
).apply([inputs])
else:
return op_lib.blend(**args)
return op_lib.blend('LRN', **args)
@OpSchema.num_inputs(5)
......@@ -349,10 +349,10 @@ def sync_batch_norm(
.. math::
\text{out} = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
The moving average of statistics are calculated as:
    The running average of statistics is calculated as:
.. math::
x_{moving} \leftarrow momentum * x_{moving} + (1 - momentum) * x_{stat}
x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}}
Note that the number of inputs should be **5**, i.e.,
    this operator is implemented as the fused version.
......@@ -367,11 +367,11 @@ def sync_batch_norm(
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.9
The momentum of moving average.
        The momentum for the running average.
eps : float, optional, default=1e-5
The epsilon.
The value of :math:`\epsilon`.
use_stats : int, optional, default=-1
Whether to use global stats.
Whether to use estimated statistics or not.
process_group : ProcessGroup, optional
The group for communication.
......
......@@ -83,7 +83,7 @@ class LpNormalize(Operator):
}
}
def forward(self, inputs):
    def forward(self, inputs):
return self.dispatch(inputs, [self.alloc()])
......@@ -94,6 +94,7 @@ class LocalResponseNorm(Operator):
self.alpha = kwargs.get('alpha', 0.0001)
self.beta = kwargs.get('beta', 0.75)
self.bias = kwargs.get('bias', 1.)
self.data_format = kwargs.get('data_format', 'NCHW')
def attributes(self):
return {
......@@ -103,6 +104,7 @@ class LocalResponseNorm(Operator):
'alpha': self.alpha,
'beta': self.beta,
'bias': self.bias,
'data_format': self.data_format,
}
}
......
......@@ -69,9 +69,7 @@ def astype(self, dtype, inplace=False):
"""
return array_ops_lib.Cast \
.instantiate(
dtype=dtype,
).apply([self], inplace)
.instantiate(dtype=dtype).apply([self], inplace)
def constant(self, value=0):
......@@ -701,7 +699,7 @@ def uniform(self, low=0, high=1):
def _binary_op(a, b, op_type, outputs=None):
"""Apply the general binary operation."""
return math_ops_lib.Binary \
return math_ops_lib.BinaryOp \
.instantiate(op_type=op_type) \
.apply(ops.remove_binary_scalar([a, b]), outputs)
......@@ -710,7 +708,7 @@ def _masked_assign(ref, value, mask):
"""Apply the mask-assign operation."""
value = ops.scalar_to_tensor(value, ref.dtype)
return control_flow_ops_lib.MaskedAssign \
.instantiate().apply([ref, value], mask)
.instantiate().apply([ref, value, mask])
def _masked_select(x, mask):
......@@ -764,22 +762,20 @@ def _section_assign(ref, value, starts, sizes):
"""Apply the section-assign operation."""
value = ops.scalar_to_tensor(value, ref.dtype)
return control_flow_ops_lib.Assign \
.instantiate(
ndim=len(starts) if starts is not None else 0,
).apply([ref, value], starts, sizes)
.instantiate(ndim=len(starts) if starts is not None else 0) \
.apply([ref, value], starts, sizes)
def _section_select(x, starts, sizes):
"""Apply the section-select operation."""
return array_ops_lib.Slice \
.instantiate(
ndim=len(starts),
).apply([x], starts, sizes)
.instantiate(ndim=len(starts)).apply([x], starts, sizes)
def _unary_op(x, op_type):
"""Apply the general unary operation."""
return math_ops_lib.Unary.instantiate(op_type=op_type).apply(x)
return math_ops_lib.UnaryOp \
.instantiate(op_type=op_type).apply([x])
# Aliases
......@@ -801,13 +797,15 @@ EagerTensor.__iadd__ = iadd
EagerTensor.__idiv__ = idiv
EagerTensor.__imul__ = imul
EagerTensor.__isub__ = isub
EagerTensor.__itruediv__ = idiv
EagerTensor.__le__ = le
EagerTensor.__lt__ = lt
EagerTensor.__mul__ = mul
EagerTensor.__neg__ = neg
EagerTensor.__radd__ = radd
EagerTensor.__rdiv__ = rdiv
EagerTensor.__rmul__ = rmul
EagerTensor.__rtruediv__ = rdiv
EagerTensor.__rsub__ = rsub
EagerTensor.__rtruediv__ = rdiv
EagerTensor.__setitem__ = setitem
EagerTensor.__sub__ = sub
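With the newly registered ``__itruediv__``/``__rtruediv__`` aliases, Python 3 true division is routed through the same eager ops as ``__idiv__``/``__rdiv__``; a short sketch, assuming ``dragon.EagerTensor`` accepts NumPy data as it does elsewhere in this commit:

```python
import numpy as np
import dragon

x = dragon.EagerTensor(np.array([2.0, 4.0], 'float32'), copy=True)
x /= 2.0      # dispatched via __itruediv__
y = 1.0 / x   # dispatched via __rtruediv__
```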
......@@ -66,7 +66,9 @@ def astype(self, dtype, inplace=False):
`dragon.cast(...)`_ : Cast the data type of input.
"""
inputs, outputs = ([], [self]) if inplace else ([self], [])
if self.dtype == dtype:
return self
inputs, outputs = ([], [self]) if inplace else ([self], None)
return OpDef.apply('Cast', inputs, outputs, dtype=dtype)
......
......@@ -19,18 +19,18 @@ from dragon.core.framework.ops import Operator
class ParamUpdate(Operator):
def __init__(self, key, dev, **kwargs):
super(ParamUpdate, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', 'ParamUpdate')
self.lr_mult = kwargs.get('lr_mult', 1.)
self.decay_mult = kwargs.get('decay_mult', 1.)
self.slot = kwargs.get('slot', '')
self.op_type = kwargs.get('op_type', '')
self.op_handle = kwargs.get('op_handle', '')
self.lr_mult = kwargs.get('lr_mult', 1)
self.decay_mult = kwargs.get('decay_mult', 1)
def attributes(self):
return {
'name': self.op_handle,
'op_type': self.op_type,
'arguments': {
'lr_mult': self.lr_mult,
'decay_mult': self.decay_mult,
'slot': self.slot,
'lr_mult': float(self.lr_mult),
'decay_mult': float(self.decay_mult),
},
}
......
......@@ -28,7 +28,7 @@ def bias_add(inputs, data_format='NCHW', **kwargs):
Parameters
----------
inputs : Sequence[dragon.Tensor]
The tensor ``input`` and ``bias``.
The ``input`` and ``bias``.
data_format : {'NCHW', 'NHWC'}, optional
The optional data format.
......@@ -53,7 +53,6 @@ def bias_add(inputs, data_format='NCHW', **kwargs):
@OpSchema.num_inputs(2, 3)
def conv2d(
inputs,
num_output=None,
kernel_shape=3,
strides=1,
pads=0,
......@@ -65,24 +64,12 @@ def conv2d(
):
r"""Apply the 2d convolution.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{DK}_{size}) / stride + 1
\end{cases}
    Setting ``padding`` to **VALID** will use the value of ``pads``.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The tensor ``x``, ``weight`` and ``bias``.
num_output : int, optional
The optional number of output channels.
kernel_shape : Sequence[int], optional, default=3
The shape of convolution kernel.
strides : Sequence[int], optional, default=1
......@@ -114,9 +101,7 @@ def conv2d(
args[key] = _normalize_pads(args[key], 2)
else:
args[key] = _normalize_tuple(args[key], 2)
op_lib = vision_ops_lib.Conv2d
if context.executing_eagerly():
weight_shape = inputs[1].shape
return op_lib \
......@@ -142,7 +127,6 @@ def conv2d(
@ArgHelper.repeated_desc('output_shape')
def conv2d_transpose(
inputs,
num_output=None,
kernel_shape=3,
strides=1,
pads=0,
......@@ -156,24 +140,12 @@ def conv2d_transpose(
):
r"""Apply the 2d deconvolution.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} - 1) *
stride + \text{DK}_{size} - 2 * pad
\end{cases}
    Setting ``padding`` to **VALID** will use the value of ``pads``.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The tensor ``x``, ``weight`` and ``bias``.
num_output : int, optional
The optional number of output channels.
kernel_shape : Sequence[int], optional, default=3
The shape of convolution kernel.
strides : Sequence[int], optional, default=1
......@@ -200,7 +172,6 @@ def conv2d_transpose(
"""
args = parse_args(locals())
if padding not in ('VALID', 'SAME', 'SAME_UPPER', 'SAME_LOWER'):
raise ValueError('Unsupported padding algorithm: %s' % padding)
if data_format not in ('NCHW', 'NHWC'):
......@@ -212,9 +183,7 @@ def conv2d_transpose(
args[key] = _normalize_pads(args[key], 2)
else:
args[key] = _normalize_tuple(args[key], 2)
op_lib = vision_ops_lib.ConvTranspose2d
if context.executing_eagerly():
weight_shape = inputs[1].shape
return op_lib \
......@@ -240,7 +209,6 @@ def conv2d_transpose(
@OpSchema.num_inputs(2, 3)
def depthwise_conv2d(
inputs,
num_output=None,
kernel_shape=3,
strides=1,
pads=0,
......@@ -252,24 +220,12 @@ def depthwise_conv2d(
r"""Apply the 2d depthwise convolution.
`[Chollet, 2016] <https://arxiv.org/abs/1610.02357>`_.
The spatial output dimension is computed as:
.. math::
\begin{cases}
\text{DK}_{size} = dilation *
(\text{K}_{size} - 1) + 1 \\
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{DK}_{size}) / stride + 1
\end{cases}
    Setting ``padding`` to **VALID** will use the value of ``pads``.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The tensor ``x``, ``weight`` and ``bias``.
num_output : int, optional
The optional number of output channels.
kernel_shape : Sequence[int], optional, default=3
The size(s) of convolution kernel.
strides : Sequence[int], optional, default=1
......@@ -290,7 +246,6 @@ def depthwise_conv2d(
"""
args = parse_args(locals())
if padding not in ('VALID', 'SAME', 'SAME_UPPER', 'SAME_LOWER'):
raise ValueError('Unsupported padding algorithm: %s' % padding)
if data_format not in ('NCHW', 'NHWC'):
......@@ -300,9 +255,7 @@ def depthwise_conv2d(
args[key] = _normalize_pads(args[key], 2)
else:
args[key] = _normalize_tuple(args[key], 2)
op_lib = vision_ops_lib.DepthwiseConv2d
if context.executing_eagerly():
weight_shape = inputs[1].shape
return op_lib \
......@@ -361,7 +314,7 @@ def depth_to_space(inputs, block_size, data_format='NCHW', **kwargs):
.instantiate(
block_size=block_size,
data_format=data_format,
).apply(inputs)
).apply([inputs])
else:
return op_lib.blend(**args)
......@@ -374,19 +327,13 @@ def pool2d(
pads=0,
padding='VALID',
ceil_mode=False,
mode='MAX',
mode='max',
data_format='NCHW',
global_pooling=False,
**kwargs
):
r"""Apply the 2d pooling.
The spatial output dimension is computed as:
.. math::
\text{Dim}_{out} = (\text{Dim}_{in} +
2 * pad - \text{K}_{size}) / stride + 1
    Setting ``padding`` to **VALID** will use the value of ``pads``.
If ``global_pooling`` is **True**, ``strides`` and ``pads`` will be set to **1** and **0**.
......@@ -410,7 +357,7 @@ def pool2d(
data_format : {'NCHW', 'NHWC'}, optional
The optional data format.
global_pooling : bool, optional, default=False
Whether to use global pooling.
        Whether to apply global pooling.
Returns
-------
......@@ -419,7 +366,6 @@ def pool2d(
"""
args = parse_args(locals())
if mode not in ('MAX', 'AVG'):
raise ValueError('Unsupported pooling mode: %s' % mode)
if padding not in ('VALID', 'SAME', 'SAME_UPPER', 'SAME_LOWER'):
......@@ -431,9 +377,7 @@ def pool2d(
args[key] = _normalize_pads(args[key], 2)
else:
args[key] = _normalize_tuple(args[key], 2)
op_lib = vision_ops_lib.Pool2d
if context.executing_eagerly():
return op_lib \
.instantiate(
......@@ -445,7 +389,7 @@ def pool2d(
mode=mode,
data_format=data_format,
global_pooling=global_pooling,
).apply(inputs)
).apply([inputs])
else:
return op_lib.blend(**args)
......@@ -526,7 +470,7 @@ def resize(
num_sizes=len(args['sizes']) if sizes is not None else 0,
num_scales=len(args['scales']) if scales is not None else 0,
data_format=data_format,
).apply(inputs, args['sizes'], args['scales'])
).apply([inputs], args['sizes'], args['scales'])
else:
return op_lib.blend(**args)
......@@ -668,7 +612,7 @@ def space_to_depth(inputs, block_size, data_format='NCHW', **kwargs):
.instantiate(
block_size=block_size,
data_format=data_format,
).apply(inputs)
).apply([inputs])
else:
return op_lib.blend(**args)
......
......@@ -32,7 +32,6 @@ class _ConvNd(Operator):
return {
'op_type': self.__class__.__name__,
'arguments': {
'num_output': self.num_output,
'kernel_shape': self.kernel_shape,
'strides': self.strides,
'pads': self.pads,
......@@ -113,7 +112,6 @@ class ConvTranspose2d(_ConvNd):
return {
'op_type': self.__class__.__name__,
'arguments': {
'num_output': self.num_output,
'kernel_shape': self.kernel_shape,
'strides': self.strides,
'pads': self.pads,
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unittest
import argparse
import dragon
parser = argparse.ArgumentParser(add_help=False)
TEST_CUDA = dragon.cuda.is_available()
def run_tests(argv=None):
"""Run tests under the current ``__main__``."""
if argv is None:
args, remaining = parser.parse_known_args()
argv = [sys.argv[0]] + remaining
unittest.main(argv=argv)
......@@ -9,17 +9,17 @@
#
# ------------------------------------------------------------
"""Define the Adam updaters."""
"""The Adam optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.training import updater
from dragon.core.training import optimizer
class Adam(updater.Updater):
r"""The updater which implements Adam algorithm.
class Adam(optimizer.Optimizer):
r"""The optimizer to apply Adam algorithm.
`[Kingma & Ba, 2014] <https://arxiv.org/abs/1412.6980>`_.
The **Adam** update is defined as:
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""The optimizer to update parameters."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core import distributed
from dragon.core.eager import context
from dragon.core.framework import workspace
from dragon.core.ops import distributed_ops_lib
from dragon.core.ops import training_ops_lib
class Optimizer(object):
"""The base class of optimizers."""
# Store for the global unique handle
_DEFAULT_UNIQUE_HANDLE_INDEX = 0
def __init__(
self,
scale=1,
clip_norm=0,
weight_decay=0,
name=None,
):
"""Create a ``Optimizer``.
Parameters
----------
scale : float, optional, default=1
            The scaling factor applied to the gradients.
        clip_norm : float, optional, default=0
            The maximum L2 norm used to clip the gradients.
        weight_decay : float, optional, default=0
            The L2 penalty factor applied to the weights.
name : str, optional
The optional name for shared slots.
"""
self._defaults = {
'scale': float(scale),
'clip_norm': float(clip_norm),
'weight_decay': float(weight_decay),
}
self._param_group = []
if name:
self._op_handle = name
else:
            Optimizer._DEFAULT_UNIQUE_HANDLE_INDEX += 1
            self._op_handle = 'Optimizer_{}'.format(
                Optimizer._DEFAULT_UNIQUE_HANDLE_INDEX)
self._op_type = self.__class__.__name__ + 'Update'
self._process_group = distributed.get_group()
self._extra_kwargs = {}
def apply_gradients(
self,
values_and_grads,
lr_mult=None,
decay_mult=None,
):
"""Apply the gradients on values.
Parameters
----------
values_and_grads : Sequence[Sequence[dragon.Tensor]]
The values and grads.
lr_mult : number, optional
The multiplier to learning rate.
decay_mult : number, optional
The multiplier to weight decay.
"""
if context.executing_eagerly():
            # Filter out the values whose grads are missing.
values, grads = [], []
for v, g in values_and_grads:
if g is not None:
values.append(v)
grads.append(g)
# Accumulate grads from the current process group.
if self._process_group is not None:
distributed_ops_lib.Collective \
.instantiate(
operation='MEAN',
communication='ALLREDUCE',
group=self._process_group,
).apply(grads)
# Apply the updates.
for v, g in zip(values, grads):
self._run_update(v, g, lr_mult, decay_mult)
else:
# Store for the lazy compilation.
for v, g in values_and_grads:
self._add_update(v, g, lr_mult, decay_mult)
return self
def _init_set_defaults(self, extra=None):
"""Initialize the defaults into current workspace."""
if extra is not None:
self._defaults = dict(self._defaults, **extra)
for k, v in self._defaults.items():
workspace.feed_tensor(
'/share/hyper/%s/%s' % (self._op_handle, k), v,
dtype='float32', enforce_cpu=True,
)
def _add_update(self, param, grad, lr_mult=None, decay_mult=None):
"""Add a symbolic operator for updating."""
        pair = tuple(v.id if hasattr(v, 'id') else v for v in (param, grad))
self._param_group.append(
(pair, {
'lr_mult': float(lr_mult) if lr_mult is not None else 1.,
'decay_mult': float(decay_mult) if decay_mult is not None else 1.,
})
)
def _run_update(self, param, grad, lr_mult=None, decay_mult=None):
"""Run an eager operation for updating."""
return training_ops_lib.ParamUpdate \
.instantiate(
op_type=self._op_type,
op_handle=self._op_handle,
lr_mult=float(lr_mult) if lr_mult is not None else 1.,
decay_mult=float(decay_mult) if decay_mult is not None else 1.,
).apply(grad, param)
def __getattr__(self, item):
defaults = self.__dict__.get('_defaults')
if item in defaults:
return workspace.fetch_tensor(
'/share/hyper/%s/%s' % (self._op_handle, item))
return self.__dict__[item]
def __setattr__(self, key, value):
defaults = self.__dict__.get('_defaults')
if defaults is not None and key in defaults:
workspace.feed_tensor(
'/share/hyper/%s/%s' % (self._op_handle, key), value,
dtype='float32', enforce_cpu=True)
else:
object.__setattr__(self, key, value)
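A minimal, framework-free sketch of the hyper-parameter pattern used above: the defaults live in a shared store keyed by the optimizer handle, and attribute access is routed through that store via ``__getattr__``/``__setattr__`` (illustrative only; the dict stands in for the Dragon workspace):

```python
class TinyOptimizer(object):
    _STORE = {}  # stands in for the shared workspace

    def __init__(self, name, **defaults):
        self.__dict__['_op_handle'] = name
        self.__dict__['_defaults'] = dict(defaults)
        for k, v in defaults.items():
            self._STORE['/share/hyper/%s/%s' % (name, k)] = float(v)

    def __getattr__(self, item):
        if item in self.__dict__.get('_defaults', {}):
            return self._STORE['/share/hyper/%s/%s' % (self._op_handle, item)]
        raise AttributeError(item)

    def __setattr__(self, key, value):
        if key in self.__dict__.get('_defaults', {}):
            self._STORE['/share/hyper/%s/%s' % (self._op_handle, key)] = float(value)
        else:
            object.__setattr__(self, key, value)


opt = TinyOptimizer('Optimizer_1', scale=1, clip_norm=0, weight_decay=0)
opt.clip_norm = 5.0
print(opt.clip_norm)  # 5.0
```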
......@@ -9,17 +9,17 @@
#
# ------------------------------------------------------------
"""Define the RMSprop updater."""
"""The RMSprop optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.training import updater
from dragon.core.training import optimizer
class RMSProp(updater.Updater):
r"""The updater which implements RMSprop algorithm.
class RMSprop(optimizer.Optimizer):
r"""The optimizer to apply RMSprop algorithm.
`[Hinton et.al, 2013] <http://www.cs.utoronto.ca/~bonner/courses/2016s/csc321/lectures/lec6.pdf>`_.
The **RMSprop** update is defined as:
......@@ -43,7 +43,7 @@ class RMSProp(updater.Updater):
eps=1e-8,
**kwargs
):
r"""Create a ``RMSProp`` updater.
r"""Create a ``RMSProp`` optimizer.
Parameters
----------
......@@ -57,7 +57,7 @@ class RMSProp(updater.Updater):
The initial value for :math:`\epsilon`.
"""
super(RMSProp, self).__init__(**kwargs)
super(RMSprop, self).__init__(**kwargs)
self._init_set_defaults({
'base_lr': base_lr,
'momentum': momentum,
......
......@@ -9,17 +9,17 @@
#
# ------------------------------------------------------------
"""Define the SGD updaters."""
"""The SGD optimizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.training import updater
from dragon.core.training import optimizer
class SGD(updater.Updater):
r"""The updater which implements MomentumSGD algorithm.
class SGD(optimizer.Optimizer):
r"""The optimizer to apply MomentumSGD algorithm.
`[Polyak, 1964] <https://doi.org/10.1016/0041-5553(64)90137-5>`_.
The **MomentumSGD** update is defined as:
......@@ -46,8 +46,8 @@ class SGD(updater.Updater):
})
class Nesterov(updater.Updater):
r"""The updater which implements NesterovSGD algorithm.
class Nesterov(optimizer.Optimizer):
r"""The optimizer to apply NesterovSGD algorithm.
`[Sutskever et.al, 2013] <http://www.cs.toronto.edu/~hinton/absps/momentum.pdf>`_.
The **NesterovSGD** update is defined as:
......@@ -60,7 +60,7 @@ class Nesterov(updater.Updater):
"""
def __init__(self, base_lr=0.01, momentum=0.9, **kwargs):
r"""Create a ``Nesterov`` updater.
r"""Create a ``Nesterov`` optimizer.
Parameters
----------
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unittest
import argparse
import dragon
from dragon.vm import torch as torch_vm
parser = argparse.ArgumentParser(add_help=False)
TEST_CUDA = dragon.cuda.is_available()
def new_tensor(data, constructor='EagerTensor', execution=None):
if execution is not None:
if execution == 'GRAPH_MODE':
return dragon.Tensor(
shape=data.shape,
dtype=str(data.dtype),
).set_value(data)
else:
return dragon.EagerTensor(data, copy=True)
if constructor == 'EagerTensor':
return dragon.EagerTensor(data, copy=True)
elif constructor == 'Tensor':
return dragon.Tensor(
shape=data.shape,
dtype=str(data.dtype),
).set_value(data)
elif constructor == 'torch.Tensor':
return torch_vm.tensor(data)
else:
raise ValueError('Unknown constructor:', constructor)
def run_tests(argv=None):
"""Run tests under the current ``__main__``."""
if argv is None:
args, remaining = parser.parse_known_args()
argv = [sys.argv[0]] + remaining
unittest.main(argv=argv)
......@@ -14,7 +14,7 @@
#define DRAGON_UTILS_CUDA_DEVICE_H_
#ifdef USE_CUDA
#include <cublas.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
......
......@@ -55,6 +55,10 @@ using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap2 =
Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>;
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
......
......@@ -108,11 +108,16 @@ DRAGON_API void Dot<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_DOT_FUNC(T) \
template <> \
DRAGON_API void Dot<T, CPUContext>( \
int n, const T* a, const T* b, T* y, CPUContext* ctx) { \
*y = ConstEigenVectorMap<T>(a, n).dot(ConstEigenVectorMap<T>(b, n)); \
#define DEFINE_DOT_FUNC(T) \
template <> \
DRAGON_API void Dot<T, CPUContext>( \
int n, const T* a, const T* b, T* y, CPUContext* ctx) { \
*y = ConstEigenVectorMap<T>(a, n).dot(ConstEigenVectorMap<T>(b, n)); \
} \
template <> \
DRAGON_API T Dot<T, CPUContext>( \
int n, const T* a, const T* b, CPUContext* ctx) { \
return ConstEigenVectorMap<T>(a, n).dot(ConstEigenVectorMap<T>(b, n)); \
}
DEFINE_DOT_FUNC(float);
......@@ -121,6 +126,11 @@ DEFINE_DOT_FUNC(double);
#define DEFINE_ASUM_FUNC(T) \
template <> \
DRAGON_API void ASum<T, CPUContext>( \
const int n, const T* x, T* y, CPUContext* ctx) { \
*y = ConstEigenVectorArrayMap<T>(x, n).abs().sum(); \
} \
template <> \
DRAGON_API T ASum<T, CPUContext>(const int n, const T* x, CPUContext* ctx) { \
return ConstEigenVectorArrayMap<T>(x, n).abs().sum(); \
}
......
......@@ -94,13 +94,15 @@ DEFINE_SCALE_FUNC(int64_t);
template <> \
DRAGON_API void Scale<T, CUDAContext>( \
const int n, const float alpha, const T* x, T* y, CUDAContext* ctx) { \
T _alpha_ = (T)alpha; \
if (x != y) { \
CUDA_CHECK(cudaMemcpyAsync( \
y, x, sizeof(T) * n, cudaMemcpyDeviceToDevice, ctx->cuda_stream())); \
} \
if (_alpha_ != T(1)) { \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, &_alpha_, y, 1)); \
if (alpha != 1.f) { \
T scale = (T)alpha; \
CUBLAS_CHECK(cublasSetPointerMode( \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, &scale, y, 1)); \
} \
}
......@@ -120,6 +122,8 @@ DRAGON_API void Scale<float16, CUDAContext>(
ctx->cuda_stream()));
}
if (alpha != 1.f) {
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasScalEx(
ctx->cublas_handle(),
n,
......@@ -132,8 +136,8 @@ DRAGON_API void Scale<float16, CUDAContext>(
}
}
DEFINE_SCALE_FUNC(float, cublasSscal_v2);
DEFINE_SCALE_FUNC(double, cublasDscal_v2);
DEFINE_SCALE_FUNC(float, cublasSscal);
DEFINE_SCALE_FUNC(double, cublasDscal);
#undef DEFINE_SCALE_FUNC
#define DEFINE_COPY_FUNC(T) \
......@@ -170,12 +174,14 @@ DEFINE_AXPY_FUNC(int);
DEFINE_AXPY_FUNC(int64_t);
#undef DEFINE_AXPY_FUNC
#define DEFINE_AXPY_FUNC(T, cublas_func) \
template <> \
DRAGON_API void Axpy<T, CUDAContext>( \
const int n, const float alpha, const T* x, T* y, CUDAContext* ctx) { \
T _alpha_ = (T)alpha; \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, &_alpha_, x, 1, y, 1)); \
#define DEFINE_AXPY_FUNC(T, cublas_func) \
template <> \
DRAGON_API void Axpy<T, CUDAContext>( \
const int n, const float alpha, const T* x, T* y, CUDAContext* ctx) { \
T scale = (T)alpha; \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, &scale, x, 1, y, 1)); \
}
template <>
......@@ -185,6 +191,8 @@ DRAGON_API void Axpy<float16, CUDAContext>(
const float16* x,
float16* y,
CUDAContext* ctx) {
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasAxpyEx(
ctx->cublas_handle(),
n,
......@@ -199,8 +207,8 @@ DRAGON_API void Axpy<float16, CUDAContext>(
CUDA_R_32F));
}
DEFINE_AXPY_FUNC(float, cublasSaxpy_v2);
DEFINE_AXPY_FUNC(double, cublasDaxpy_v2);
DEFINE_AXPY_FUNC(float, cublasSaxpy);
DEFINE_AXPY_FUNC(double, cublasDaxpy);
#undef DEFINE_AXPY_FUNC
#define DEFINE_AXPBY_FUNC(T) \
......@@ -249,12 +257,22 @@ DEFINE_AXPBY_FUNC(float);
DEFINE_AXPBY_FUNC(double);
#undef DEFINE_AXPBY_FUNC
#define DEFINE_DOT_FUNC(T, cublas_func) \
template <> \
DRAGON_API void Dot<T, CUDAContext>( \
const int n, const T* a, const T* b, T* y, CUDAContext* ctx) { \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, a, 1, b, 1, y)); \
ctx->FinishDeviceComputation(); \
#define DEFINE_DOT_FUNC(T, cublas_func) \
template <> \
DRAGON_API void Dot<T, CUDAContext>( \
const int n, const T* a, const T* b, T* y, CUDAContext* ctx) { \
CUBLAS_CHECK(cublasSetPointerMode( \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, a, 1, b, 1, y)); \
} \
template <> \
DRAGON_API T Dot<T, CUDAContext>( \
const int n, const T* a, const T* b, CUDAContext* ctx) { \
T y_host; \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, a, 1, b, 1, &y_host)); \
return y_host; \
}
template <>
......@@ -264,6 +282,8 @@ DRAGON_API void Dot<float16, CUDAContext>(
const float16* b,
float16* y,
CUDAContext* ctx) {
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE));
CUBLAS_CHECK(cublasDotEx(
ctx->cublas_handle(),
n,
......@@ -276,18 +296,28 @@ DRAGON_API void Dot<float16, CUDAContext>(
y,
CUDA_R_16F,
CUDA_R_32F));
ctx->FinishDeviceComputation();
}
DEFINE_DOT_FUNC(float, cublasSdot_v2);
DEFINE_DOT_FUNC(double, cublasDdot_v2);
DEFINE_DOT_FUNC(float, cublasSdot);
DEFINE_DOT_FUNC(double, cublasDdot);
#undef DEFINE_DOT_FUNC
#define DEFINE_ASUM_FUNC(T, cublas_func) \
template <> \
DRAGON_API T ASum<T, CUDAContext>( \
const int n, const T* x, CUDAContext* ctx) { \
return cublas_func(n, x, 1); \
#define DEFINE_ASUM_FUNC(T, cublas_func) \
template <> \
DRAGON_API void ASum<T, CUDAContext>( \
const int n, const T* x, T* y, CUDAContext* ctx) { \
CUBLAS_CHECK(cublasSetPointerMode( \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
cublas_func(ctx->cublas_handle(), n, x, 1, y); \
} \
template <> \
DRAGON_API T ASum<T, CUDAContext>( \
const int n, const T* x, CUDAContext* ctx) { \
T y_host; \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
cublas_func(ctx->cublas_handle(), n, x, 1, &y_host); \
return y_host; \
}
DEFINE_ASUM_FUNC(float, cublasSasum);
......@@ -312,7 +342,8 @@ DRAGON_API void Gemv<float16, CUDAContext>(
int k = cuTransA == CUBLAS_OP_N ? M : N;
int LDA = cuTransA == CUBLAS_OP_N ? m : k;
int LDC = m;
const float _alpha_ = alpha, _beta_ = beta;
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
if (math_type == "float32") {
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
......@@ -324,14 +355,14 @@ DRAGON_API void Gemv<float16, CUDAContext>(
m,
1,
k,
&_alpha_,
&alpha,
A,
CUDA_R_16F,
LDA,
x,
CUDA_R_16F,
k,
&_beta_,
&beta,
y,
CUDA_R_16F,
LDC,
......@@ -346,14 +377,14 @@ DRAGON_API void Gemv<float16, CUDAContext>(
m,
1,
k,
&_alpha_,
&alpha,
A,
CUDA_R_16F,
LDA,
x,
CUDA_R_16F,
k,
&_beta_,
&beta,
y,
CUDA_R_16F,
LDC));
......@@ -366,21 +397,21 @@ DRAGON_API void Gemv<float16, CUDAContext>(
m,
1,
k,
&_alpha_,
&alpha,
A,
CUDA_R_16F,
LDA,
x,
CUDA_R_16F,
k,
&_beta_,
&beta,
y,
CUDA_R_16F,
LDC));
#endif
} else if (math_type == "float16") {
const half _alpha_ = cast::to<half>(alpha);
const half _beta_ = cast::to<half>(beta);
const half alpha_half = cast::to<half>(alpha);
const half beta_half = cast::to<half>(beta);
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMV + MATH16 + TENSOR-CORE
......@@ -391,14 +422,14 @@ DRAGON_API void Gemv<float16, CUDAContext>(
m,
1,
k,
&_alpha_,
&alpha_half,
A,
CUDA_R_16F,
LDA,
x,
CUDA_R_16F,
k,
&_beta_,
&beta_half,
y,
CUDA_R_16F,
LDC,
......@@ -413,12 +444,12 @@ DRAGON_API void Gemv<float16, CUDAContext>(
m,
1,
k,
&_alpha_,
&alpha_half,
reinterpret_cast<const half*>(A),
LDA,
reinterpret_cast<const half*>(x),
k,
&_beta_,
&beta_half,
reinterpret_cast<half*>(y),
LDC));
}
......@@ -430,12 +461,12 @@ DRAGON_API void Gemv<float16, CUDAContext>(
m,
1,
k,
&_alpha_,
&alpha_half,
reinterpret_cast<const half*>(A),
LDA,
reinterpret_cast<const half*>(x),
k,
&_beta_,
&beta_half,
reinterpret_cast<half*>(y),
LDC));
#endif
......@@ -458,20 +489,10 @@ DRAGON_API void Gemv<float, CUDAContext>(
const string math_type) {
cublasOperation_t cuTransA =
TransA == CblasNoTrans ? CUBLAS_OP_T : CUBLAS_OP_N;
const float _alpha_ = alpha, _beta_ = beta;
CUBLAS_CHECK(cublasSgemv_v2(
ctx->cublas_handle(),
cuTransA,
N,
M,
&_alpha_,
A,
N,
x,
1,
&_beta_,
y,
1));
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasSgemv(
ctx->cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1));
}
template <>
......@@ -488,18 +509,21 @@ DRAGON_API void Gemv<double, CUDAContext>(
const string math_type) {
cublasOperation_t cuTransA =
TransA == CblasNoTrans ? CUBLAS_OP_T : CUBLAS_OP_N;
const double _alpha_ = alpha, _beta_ = beta;
CUBLAS_CHECK(cublasDgemv_v2(
const auto alpha64 = static_cast<double>(alpha);
const auto beta64 = static_cast<double>(beta);
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasDgemv(
ctx->cublas_handle(),
cuTransA,
N,
M,
&_alpha_,
&alpha64,
A,
N,
x,
1,
&_beta_,
&beta64,
y,
1));
}
......@@ -524,8 +548,9 @@ DRAGON_API void Gemm<float16, CUDAContext>(
TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
if (math_type == "float32") {
const float _alpha_ = alpha, _beta_ = beta;
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMM + MATH32 + TENSOR-CORE
......@@ -536,14 +561,14 @@ DRAGON_API void Gemm<float16, CUDAContext>(
N,
M,
K,
&_alpha_,
&alpha,
B,
CUDA_R_16F,
ldb,
A,
CUDA_R_16F,
lda,
&_beta_,
&beta,
C,
CUDA_R_16F,
N,
......@@ -558,14 +583,14 @@ DRAGON_API void Gemm<float16, CUDAContext>(
N,
M,
K,
&_alpha_,
&alpha,
B,
CUDA_R_16F,
ldb,
A,
CUDA_R_16F,
lda,
&_beta_,
&beta,
C,
CUDA_R_16F,
N));
......@@ -578,21 +603,21 @@ DRAGON_API void Gemm<float16, CUDAContext>(
N,
M,
K,
&_alpha_,
&alpha,
B,
CUDA_R_16F,
ldb,
A,
CUDA_R_16F,
lda,
&_beta_,
&beta,
C,
CUDA_R_16F,
N));
#endif
} else if (math_type == "float16") {
const half _alpha_ = cast::to<half>(alpha);
const half _beta_ = cast::to<half>(beta);
const half alpha_half = cast::to<half>(alpha);
const half beta_half = cast::to<half>(beta);
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMM + MATH16 + TENSOR-CORE
......@@ -603,14 +628,14 @@ DRAGON_API void Gemm<float16, CUDAContext>(
N,
M,
K,
&_alpha_,
&alpha_half,
B,
CUDA_R_16F,
ldb,
A,
CUDA_R_16F,
lda,
&_beta_,
&beta_half,
C,
CUDA_R_16F,
N,
......@@ -625,12 +650,12 @@ DRAGON_API void Gemm<float16, CUDAContext>(
N,
M,
K,
&_alpha_,
&alpha_half,
reinterpret_cast<const half*>(B),
ldb,
reinterpret_cast<const half*>(A),
lda,
&_beta_,
&beta_half,
reinterpret_cast<half*>(C),
N));
}
......@@ -642,12 +667,12 @@ DRAGON_API void Gemm<float16, CUDAContext>(
N,
M,
K,
&_alpha_,
&alpha_half,
reinterpret_cast<const half*>(B),
ldb,
reinterpret_cast<const half*>(A),
lda,
&_beta_,
&beta_half,
reinterpret_cast<half*>(C),
N));
#endif
......@@ -676,7 +701,9 @@ DRAGON_API void Gemm<float, CUDAContext>(
TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBLAS_CHECK(cublasSgemm_v2(
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasSgemm(
ctx->cublas_handle(),
cuTransB,
cuTransA,
......@@ -713,20 +740,23 @@ DRAGON_API void Gemm<double, CUDAContext>(
TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
const double _alpha_ = alpha, _beta_ = beta;
CUBLAS_CHECK(cublasDgemm_v2(
const auto alpha64 = static_cast<double>(alpha);
const auto beta64 = static_cast<double>(beta);
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasDgemm(
ctx->cublas_handle(),
cuTransB,
cuTransA,
N,
M,
K,
&_alpha_,
&alpha64,
B,
ldb,
A,
lda,
&_beta_,
&beta64,
C,
N));
}
......
......@@ -48,6 +48,12 @@ template <typename T, class Context>
DRAGON_API void Dot(const int n, const T* a, const T* b, T* y, Context* ctx);
template <typename T, class Context>
DRAGON_API T Dot(const int n, const T* a, const T* b, Context* ctx);
template <typename T, class Context>
DRAGON_API void ASum(const int n, const T* x, T* y, Context* ctx);
template <typename T, class Context>
DRAGON_API T ASum(const int n, const T* x, Context* ctx);
template <typename T, class Context>
......
......@@ -213,7 +213,7 @@ DEFINE_BROADCAST_2ND_FUNC(Div, double, /);
void _Colwise##name<TIn, true>( \
const int rows, const int cols, const TIn* a, const TIn* b, TOut* y) { \
EigenArrayMap<TOut>(y, cols, rows) = \
ConstEigenVectorArrayMap<TIn>(a, rows).colwise().replicate(cols) \
ConstEigenVectorArrayMap2<TIn>(a, rows).colwise().replicate(cols) \
expr ConstEigenArrayMap<TIn>(b, cols, rows); \
} \
template <> \
......@@ -230,7 +230,7 @@ DEFINE_BROADCAST_2ND_FUNC(Div, double, /);
const int rows, const int cols, const TIn* a, const TIn* b, TOut* y) { \
EigenArrayMap<TOut>(y, cols, rows) = \
ConstEigenArrayMap<TIn>(a, cols, rows) \
expr ConstEigenVectorArrayMap<TIn>(b, rows) \
expr ConstEigenVectorArrayMap2<TIn>(b, rows) \
.colwise() \
.replicate(cols); \
}
......@@ -273,36 +273,36 @@ DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(GreaterEqual, float, bool, >=);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(GreaterEqual, double, bool, >=);
#undef DEFINE_ROWWISE_COLWISE_BIANRY_FUNC
#define DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(name, T, func) \
template <> \
void _Rowwise##name<T, true>( \
const int rows, const int cols, const T* a, const T* b, T* y) { \
EigenArrayMap<T>(y, cols, rows) = \
ConstEigenVectorArrayMap<T>(a, cols).rowwise().replicate(rows).func( \
ConstEigenArrayMap<T>(b, cols, rows)); \
} \
template <> \
void _Colwise##name<T, true>( \
const int rows, const int cols, const T* a, const T* b, T* y) { \
EigenArrayMap<T>(y, cols, rows) = \
ConstEigenVectorArrayMap<T>(a, rows).colwise().replicate(cols).func( \
ConstEigenArrayMap<T>(b, cols, rows)); \
} \
template <> \
void _Rowwise##name<T, false>( \
const int rows, const int cols, const T* a, const T* b, T* y) { \
EigenArrayMap<T>(y, cols, rows) = \
ConstEigenArrayMap<T>(a, cols, rows) \
.func(ConstEigenVectorArrayMap<T>(b, cols).rowwise().replicate( \
rows)); \
} \
template <> \
void _Colwise##name<T, false>( \
const int rows, const int cols, const T* a, const T* b, T* y) { \
EigenArrayMap<T>(y, cols, rows) = \
ConstEigenArrayMap<T>(a, cols, rows) \
.func(ConstEigenVectorArrayMap<T>(b, rows).colwise().replicate( \
cols)); \
#define DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(name, T, func) \
template <> \
void _Rowwise##name<T, true>( \
const int rows, const int cols, const T* a, const T* b, T* y) { \
EigenArrayMap<T>(y, cols, rows) = \
ConstEigenVectorArrayMap<T>(a, cols).rowwise().replicate(rows).func( \
ConstEigenArrayMap<T>(b, cols, rows)); \
} \
template <> \
void _Colwise##name<T, true>( \
const int rows, const int cols, const T* a, const T* b, T* y) { \
EigenArrayMap<T>(y, cols, rows) = \
ConstEigenVectorArrayMap2<T>(a, rows).colwise().replicate(cols).func( \
ConstEigenArrayMap<T>(b, cols, rows)); \
} \
template <> \
void _Rowwise##name<T, false>( \
const int rows, const int cols, const T* a, const T* b, T* y) { \
EigenArrayMap<T>(y, cols, rows) = \
ConstEigenArrayMap<T>(a, cols, rows) \
.func(ConstEigenVectorArrayMap<T>(b, cols).rowwise().replicate( \
rows)); \
} \
template <> \
void _Colwise##name<T, false>( \
const int rows, const int cols, const T* a, const T* b, T* y) { \
EigenArrayMap<T>(y, cols, rows) = \
ConstEigenArrayMap<T>(a, cols, rows) \
.func(ConstEigenVectorArrayMap2<T>(b, rows).colwise().replicate( \
cols)); \
}
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Pow, float, pow);
......
......@@ -211,7 +211,7 @@ void ComputeBinaryBroadcastStrides(
Y_dims.resize(num_dims);
int64_t A_stride = 1;
int64_t B_stride = 1;
for (int i = 0; i < num_dims; ++i) {
for (int i = num_dims - 1; i >= 0; --i) {
A_broadcast_strides[i] = A_broadcast_dims[i] == 1 ? 0 : A_stride;
B_broadcast_strides[i] = B_broadcast_dims[i] == 1 ? 0 : B_stride;
Y_dims[i] = std::max(A_broadcast_dims[i], B_broadcast_dims[i]);
......
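The loop-direction fix above matters because row-major strides accumulate from the innermost axis outward; a Python sketch of the corrected computation (illustrative, not the C++ helper):

```python
def broadcast_strides(dims):
    # Row-major strides; broadcast (size-1) axes contribute stride 0.
    strides, stride = [0] * len(dims), 1
    for i in range(len(dims) - 1, -1, -1):  # innermost axis first
        strides[i] = 0 if dims[i] == 1 else stride
        stride *= dims[i]
    return strides

print(broadcast_strides([2, 1, 4]))  # [4, 0, 1]
```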
......@@ -1025,18 +1025,18 @@ void SGDUpdate(
T* m,
Context* ctx);
/* training.op_base */
/* training.mixed_prec_update */
template <typename T, class Context>
void MixedPrecL2Decay(
void MixedPrecL2Penalty(
const int count,
const float alpha,
const T* w,
const T* x,
float* dx,
Context* ctx);
template <typename T, class Context>
void MixedPrecUpdate(const int count, const float* updates, T* w, Context* ctx);
void MixedPrecUpdate(const int count, const float* dx, T* x, Context* ctx);
/* vision.bias_add */
......
......@@ -49,7 +49,6 @@ from dragon.vm.tensorflow.core.framework.dtypes import qint32
from dragon.vm.tensorflow.core.framework.dtypes import qint8
from dragon.vm.tensorflow.core.framework.dtypes import quint16
from dragon.vm.tensorflow.core.framework.dtypes import quint8
from dragon.vm.tensorflow.core.framework.dtypes import resource
from dragon.vm.tensorflow.core.framework.dtypes import string
from dragon.vm.tensorflow.core.framework.dtypes import uint16
from dragon.vm.tensorflow.core.framework.dtypes import uint32
......
......@@ -33,7 +33,6 @@ from dragon.vm.tensorflow.core.framework.dtypes import qint32
from dragon.vm.tensorflow.core.framework.dtypes import qint8
from dragon.vm.tensorflow.core.framework.dtypes import quint16
from dragon.vm.tensorflow.core.framework.dtypes import quint8
from dragon.vm.tensorflow.core.framework.dtypes import resource
from dragon.vm.tensorflow.core.framework.dtypes import string
from dragon.vm.tensorflow.core.framework.dtypes import uint16
from dragon.vm.tensorflow.core.framework.dtypes import uint32
......
......@@ -15,7 +15,7 @@ from __future__ import print_function
import numpy
from dragon.core.autograph.tensor import RefTensor
from dragon.core.autograph.tensor import TensorRef
from dragon.core.eager import context as eager_context
from dragon.core.eager.tensor import EagerTensor
from dragon.core.framework import context
......@@ -80,7 +80,7 @@ def constant(value, dtype=None, shape=None, name='Const'):
if eager_context.executing_eagerly():
return EagerTensor(value, name=name + ':0')
else:
return RefTensor(
return TensorRef(
name=workspace.get_dummy_name(name, ':0', 'Tensor'),
shape=list(value.shape),
dtype=str(value.dtype),
......
......@@ -19,7 +19,31 @@ from __future__ import print_function
import numpy as np
from dragon.vm.tensorflow.core.proto import types_pb2
# Predefine the type enumerations
# to avoid to import the tensorflow proto
DT_INVALID = 0
DT_FLOAT = 1
DT_DOUBLE = 2
DT_INT32 = 3
DT_UINT8 = 4
DT_INT16 = 5
DT_INT8 = 6
DT_STRING = 7
DT_COMPLEX64 = 8
DT_INT64 = 9
DT_BOOL = 10
DT_QINT8 = 11
DT_QUINT8 = 12
DT_QINT32 = 13
DT_BFLOAT16 = 14
DT_QINT16 = 15
DT_QUINT16 = 16
DT_UINT16 = 17
DT_COMPLEX128 = 18
DT_HALF = 19
DT_VARIANT = 21
DT_UINT32 = 22
DT_UINT64 = 23
class DType(object):
......@@ -69,8 +93,6 @@ class DType(object):
* ``tf.qint32``: Quantized 32-bit signed integer.
* ``tf.resource``: Handle to a mutable resource.
* ``tf.variant``: Values of arbitrary types.
"""
......@@ -81,13 +103,12 @@ class DType(object):
Parameters
----------
type_enum : DataType
The ``types_pb2.DataType`` value.
The ``DataType`` value.
"""
type_enum = int(type_enum)
if (type_enum not in types_pb2.DataType.values()
or type_enum == types_pb2.DT_INVALID):
raise TypeError('<type_enum> is not a valid types_pb2.DataType.')
if type_enum == DT_INVALID:
raise TypeError('<type_enum> is not a valid DataType.')
self._type_enum = type_enum
@property
......@@ -106,8 +127,7 @@ class DType(object):
@property
def is_numpy_compatible(self):
return (self._type_enum != types_pb2.DT_RESOURCE and
self._type_enum != types_pb2.DT_RESOURCE_REF)
return self._type_enum in _TF_TO_NP
@property
def as_numpy_dtype(self):
......@@ -230,55 +250,53 @@ dtype_range = {np.bool_: (False, True),
np.float64: (-1, 1)}
# Define standard wrappers for the types_pb2.DataType enum.
resource = DType(types_pb2.DT_RESOURCE)
float16 = DType(types_pb2.DT_HALF)
# Define standard wrappers for the DataType enum.
float16 = DType(DT_HALF)
half = float16
float32 = DType(types_pb2.DT_FLOAT)
float64 = DType(types_pb2.DT_DOUBLE)
float32 = DType(DT_FLOAT)
float64 = DType(DT_DOUBLE)
double = float64
int32 = DType(types_pb2.DT_INT32)
uint8 = DType(types_pb2.DT_UINT8)
uint16 = DType(types_pb2.DT_UINT16)
uint64 = DType(types_pb2.DT_UINT32)
uint32 = DType(types_pb2.DT_UINT64)
int16 = DType(types_pb2.DT_INT16)
int8 = DType(types_pb2.DT_INT8)
string = DType(types_pb2.DT_STRING)
complex64 = DType(types_pb2.DT_COMPLEX64)
complex128 = DType(types_pb2.DT_COMPLEX128)
int64 = DType(types_pb2.DT_INT64)
bool = DType(types_pb2.DT_BOOL)
qint8 = DType(types_pb2.DT_QINT8)
quint8 = DType(types_pb2.DT_QUINT8)
qint16 = DType(types_pb2.DT_QINT16)
quint16 = DType(types_pb2.DT_QUINT16)
qint32 = DType(types_pb2.DT_QINT32)
bfloat16 = DType(types_pb2.DT_BFLOAT16)
variant = DType(types_pb2.DT_VARIANT)
# Standard mappings between types_pb2.DataType values and string names.
int32 = DType(DT_INT32)
uint8 = DType(DT_UINT8)
uint16 = DType(DT_UINT16)
uint64 = DType(DT_UINT64)
uint32 = DType(DT_UINT32)
int16 = DType(DT_INT16)
int8 = DType(DT_INT8)
string = DType(DT_STRING)
complex64 = DType(DT_COMPLEX64)
complex128 = DType(DT_COMPLEX128)
int64 = DType(DT_INT64)
bool = DType(DT_BOOL)
qint8 = DType(DT_QINT8)
quint8 = DType(DT_QUINT8)
qint16 = DType(DT_QINT16)
quint16 = DType(DT_QUINT16)
qint32 = DType(DT_QINT32)
bfloat16 = DType(DT_BFLOAT16)
variant = DType(DT_VARIANT)
# Standard mappings between DataType values and string names.
_TYPE_TO_STRING = {
types_pb2.DT_HALF: "float16",
types_pb2.DT_FLOAT: "float32",
types_pb2.DT_DOUBLE: "float64",
types_pb2.DT_INT32: "int32",
types_pb2.DT_UINT8: "uint8",
types_pb2.DT_UINT16: "uint16",
types_pb2.DT_INT16: "int16",
types_pb2.DT_INT8: "int8",
types_pb2.DT_STRING: "string",
types_pb2.DT_COMPLEX64: "complex64",
types_pb2.DT_COMPLEX128: "complex128",
types_pb2.DT_INT64: "int64",
types_pb2.DT_BOOL: "bool",
types_pb2.DT_QINT8: "qint8",
types_pb2.DT_QUINT8: "quint8",
types_pb2.DT_QINT16: "qint16",
types_pb2.DT_QUINT16: "quint16",
types_pb2.DT_QINT32: "qint32",
types_pb2.DT_BFLOAT16: "bfloat16",
types_pb2.DT_RESOURCE: "resource",
DT_HALF: "float16",
DT_FLOAT: "float32",
DT_DOUBLE: "float64",
DT_INT32: "int32",
DT_UINT8: "uint8",
DT_UINT16: "uint16",
DT_INT16: "int16",
DT_INT8: "int8",
DT_STRING: "string",
DT_COMPLEX64: "complex64",
DT_COMPLEX128: "complex128",
DT_INT64: "int64",
DT_BOOL: "bool",
DT_QINT8: "qint8",
DT_QUINT8: "quint8",
DT_QINT16: "qint16",
DT_QUINT16: "quint16",
DT_QINT32: "qint32",
DT_BFLOAT16: "bfloat16",
}
# Numpy representation for quantized dtypes.
......@@ -314,51 +332,50 @@ _NP_TO_TF = {
}
_TF_TO_NP = {
types_pb2.DT_HALF: np.float16,
types_pb2.DT_FLOAT: np.float32,
types_pb2.DT_DOUBLE: np.float64,
types_pb2.DT_INT32: np.int32,
types_pb2.DT_UINT8: np.uint8,
types_pb2.DT_UINT16: np.uint16,
types_pb2.DT_INT16: np.int16,
types_pb2.DT_INT8: np.int8,
types_pb2.DT_STRING: np.object,
types_pb2.DT_COMPLEX64: np.complex64,
types_pb2.DT_COMPLEX128: np.complex128,
types_pb2.DT_INT64: np.int64,
types_pb2.DT_BOOL: np.bool,
types_pb2.DT_QINT8: _np_qint8,
types_pb2.DT_QUINT8: _np_quint8,
types_pb2.DT_QINT16: _np_qint16,
types_pb2.DT_QUINT16: _np_quint16,
types_pb2.DT_QINT32: _np_qint32,
types_pb2.DT_BFLOAT16: np.uint16,
DT_HALF: np.float16,
DT_FLOAT: np.float32,
DT_DOUBLE: np.float64,
DT_INT32: np.int32,
DT_UINT8: np.uint8,
DT_UINT16: np.uint16,
DT_INT16: np.int16,
DT_INT8: np.int8,
DT_STRING: np.object,
DT_COMPLEX64: np.complex64,
DT_COMPLEX128: np.complex128,
DT_INT64: np.int64,
DT_BOOL: np.bool,
DT_QINT8: _np_qint8,
DT_QUINT8: _np_quint8,
DT_QINT16: _np_qint16,
DT_QUINT16: _np_quint16,
DT_QINT32: _np_qint32,
DT_BFLOAT16: np.uint16,
}
_INTERN_TABLE = {
types_pb2.DT_HALF: float16,
types_pb2.DT_FLOAT: float32,
types_pb2.DT_DOUBLE: float64,
types_pb2.DT_INT32: int32,
types_pb2.DT_UINT8: uint8,
types_pb2.DT_UINT16: uint16,
types_pb2.DT_UINT32: uint32,
types_pb2.DT_UINT64: uint64,
types_pb2.DT_INT16: int16,
types_pb2.DT_INT8: int8,
types_pb2.DT_STRING: string,
types_pb2.DT_COMPLEX64: complex64,
types_pb2.DT_COMPLEX128: complex128,
types_pb2.DT_INT64: int64,
types_pb2.DT_BOOL: bool,
types_pb2.DT_QINT8: qint8,
types_pb2.DT_QUINT8: quint8,
types_pb2.DT_QINT16: qint16,
types_pb2.DT_QUINT16: quint16,
types_pb2.DT_QINT32: qint32,
types_pb2.DT_BFLOAT16: bfloat16,
types_pb2.DT_RESOURCE: resource,
types_pb2.DT_VARIANT: variant,
DT_HALF: float16,
DT_FLOAT: float32,
DT_DOUBLE: float64,
DT_INT32: int32,
DT_UINT8: uint8,
DT_UINT16: uint16,
DT_UINT32: uint32,
DT_UINT64: uint64,
DT_INT16: int16,
DT_INT8: int8,
DT_STRING: string,
DT_COMPLEX64: complex64,
DT_COMPLEX128: complex128,
DT_INT64: int64,
DT_BOOL: bool,
DT_QINT8: qint8,
DT_QUINT8: quint8,
DT_QINT16: qint16,
DT_QUINT16: quint16,
DT_QINT32: qint32,
DT_BFLOAT16: bfloat16,
DT_VARIANT: variant,
}
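As a quick sanity check of the renamed wrappers, a minimal sketch is shown below. It assumes the module is importable under the path used elsewhere in this commit and that ``as_numpy_dtype`` resolves through ``_TF_TO_NP``; neither assumption is verified by this hunk.
import numpy as np
from dragon.vm.tensorflow.core.framework import dtypes
# The module-level wrappers are interned singletons keyed by the DataType enum.
assert dtypes.half is dtypes.float16                  # alias defined above
assert dtypes.float32.as_numpy_dtype == np.float32    # expected lookup via _TF_TO_NP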
_STRING_TO_TF = {
......
......@@ -62,7 +62,6 @@ class Adam(optimizer.Optimizer):
"""
super(Adam, self).__init__(name, **kwargs)
self._op_type = 'AdamUpdate'
self._set_hyper('learning_rate', kwargs.get('lr', learning_rate), 'base_lr')
self._set_hyper('beta_1', beta_1, 'beta1')
self._set_hyper('beta_2', beta_2, 'beta2')
......
......@@ -21,7 +21,7 @@ from dragon.core.eager import context as eager_context
from dragon.core.framework import context
from dragon.core.framework import types
from dragon.core.framework import workspace
from dragon.core.training import updater
from dragon.core.training import optimizer as optimizer_v1
from dragon.core.util import six
from dragon.vm.tensorflow.core.framework import dtypes
from dragon.vm.tensorflow.core.keras import initializers
......@@ -29,7 +29,7 @@ from dragon.vm.tensorflow.core.keras.utils import generic_utils
from dragon.vm.tensorflow.core.ops import variables
class Optimizer(updater.Updater):
class Optimizer(optimizer_v1.Optimizer):
"""The base class for optimizers."""
BASE_WEIGHT_DECAY = 0.0001
......@@ -46,9 +46,9 @@ class Optimizer(updater.Updater):
self._init_set_name(name)
super(Optimizer, self).__init__(
name=self._name,
l2_decay=self.BASE_WEIGHT_DECAY,
weight_decay=self.BASE_WEIGHT_DECAY,
)
allowed_kwargs = {'clipnorm', 'clipvalue', 'lr', 'decay'}
allowed_kwargs = {'scale', 'clipnorm', 'lr'}
for k in kwargs:
if k not in allowed_kwargs:
raise TypeError('Unexpected keyword argument:', str(k))
......@@ -61,11 +61,12 @@ class Optimizer(updater.Updater):
self._iterations = 0
# Register the common hyper parameters.
if 'scale' in kwargs:
self._defaults['scale'] = kwargs.pop('scale')
if 'clipnorm' in kwargs:
self._defaults['clip_gradient'] = kwargs.pop('clipnorm')
self._defaults['clip_norm'] = kwargs.pop('clipnorm')
for k, v in self._defaults.items():
self._set_hyper(k, v, k)
self._hypers_created = False
@property
......@@ -196,7 +197,7 @@ class Optimizer(updater.Updater):
else:
self._hyper[name] = value
if alias and name not in self._alias:
self._alias[name] = self._slot + '/' + alias
self._alias[name] = '/share/hyper/%s/%s' % (self._op_handle, alias)
def __getattr__(self, item):
if item == 'lr':
......
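The alias change above, together with the torch-side template later in this commit, puts every shared hyper-parameter under a single workspace prefix. A small illustration of the resulting tensor name (the handle value is hypothetical):
op_handle, alias = 'Optimizer_1', 'base_lr'            # hypothetical handle and alias
print('/share/hyper/%s/%s' % (op_handle, alias))       # -> /share/hyper/Optimizer_1/base_lr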
......@@ -59,7 +59,6 @@ class RMSprop(optimizer.Optimizer):
"""
super(RMSprop, self).__init__(name, **kwargs)
self._op_type = 'RMSPropUpdate'
self._set_hyper('learning_rate', kwargs.get('lr', learning_rate), 'base_lr')
self._set_hyper('rho', rho, 'decay')
self._set_hyper('momentum', momentum, 'momentum')
......
......@@ -19,7 +19,7 @@ from __future__ import print_function
import numpy
from dragon.core.autograph.tensor import RefTensor
from dragon.core.autograph.tensor import TensorRef
from dragon.core.framework import context
from dragon.core.framework import workspace
from dragon.core.ops import array_ops
......@@ -477,7 +477,7 @@ def placeholder(dtype=None, shape=None, name=None):
"""
# Construct a tensor from the explicit name
return RefTensor(
return TensorRef(
workspace.get_dummy_name(
context.get_name_scope() + name
if name else 'Placeholder',
......
......@@ -716,8 +716,8 @@ def matmul(
"""
return math_ops.matmul(
[a, b],
transA=transpose_a,
transB=transpose_b,
transpose_a=transpose_a,
transpose_b=transpose_b,
name=name,
)
......
......@@ -18,7 +18,6 @@ import functools
from dragon.core.framework import types
from dragon.core.ops import activation_ops
from dragon.core.ops import loss_ops
from dragon.core.ops import math_ops
from dragon.core.ops import normalization_ops
from dragon.core.ops import vision_ops
from dragon.core.util import nest
......@@ -205,7 +204,6 @@ def convolution(
return getattr(vision_ops, '{}{}d'.format(
kwargs.get('conv_type', 'conv'), num_spatial_dims))(
[input, filters],
num_output=filters.shape[0],
kernel_shape=filters.shape[2:],
strides=strides[start_axis:start_axis + num_spatial_dims],
dilations=dilations[start_axis:start_axis + num_spatial_dims],
......@@ -288,7 +286,6 @@ def conv_transpose(
return getattr(vision_ops, 'conv{}d_transpose'.format(num_spatial_dims))(
[input, filters],
num_output=filters.shape[1],
kernel_shape=filters.shape[2:],
strides=strides[start_axis:start_axis + num_spatial_dims],
dilations=dilations[start_axis:start_axis + num_spatial_dims],
......@@ -897,29 +894,6 @@ def sparse_softmax_cross_entropy_with_logits(
)
def xw_plus_b(x, weights, biases, name=None):
if weights.shape is None:
raise ValueError('weights must have a valid shape.')
else:
if len(weights.shape) != 2:
raise ValueError('weights must be a 2D Tensor')
if biases.shape is None:
raise ValueError('biases must have a valid shape.')
else:
if len(biases.shape) != 1:
raise ValueError('biases must be a 1D Tensor')
if weights.shape[1] != biases.shape[0]:
raise ValueError('the shapes of weights and biases are incompatible.')
return math_ops.fully_connected(
[x, weights, biases],
num_output=weights.shape[1],
transW=False,
name=name,
)
def _normalize_spatial_args(
name,
values,
......
syntax = "proto2";
package tensorflow;
message GPUOptions {
// A value between 0 and 1 that indicates what fraction of the
// available GPU memory to pre-allocate for each process. 1 means
// to pre-allocate all of the GPU memory, 0.5 means the process
// allocates ~50% of the available GPU memory.
optional double per_process_gpu_memory_fraction = 1;
// The type of GPU allocation strategy to use.
//
// Allowed values:
// "": The empty string (default) uses a system-chosen default
// which may change over time.
//
// "BFC": A "Best-fit with coalescing" algorithm, simplified from a
// version of dlmalloc.
optional string allocator_type = 2;
// Delay deletion of up to this many bytes to reduce the number of
// interactions with gpu driver code. If 0, the system chooses
// a reasonable default (several MBs).
optional int64 deferred_deletion_bytes = 3;
// If true, the allocator does not pre-allocate the entire specified
// GPU memory region, instead starting small and growing as needed.
optional bool allow_growth = 4;
// A comma-separated list of GPU ids that determines the 'visible'
// to 'virtual' mapping of GPU devices. For example, if TensorFlow
// can see 8 GPU devices in the process, and one wanted to map
// visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", then one
// would specify this field as "5,3". This field is similar in
// spirit to the CUDA_VISIBLE_DEVICES environment variable, except
// it applies to the visible GPU devices in the process.
//
// NOTE: The GPU driver provides the process with the visible GPUs
// in an order which is not guaranteed to have any correlation to
// the *physical* GPU id in the machine. This field is used for
// remapping "visible" to "virtual", which means this operates only
// after the process starts. Users are required to use vendor
// specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
// physical to visible device mapping prior to invoking TensorFlow.
optional string visible_device_list = 5;
// In the event polling loop sleep this many microseconds between
// PollEvents calls, when the queue is not empty. If value is not
// set or set to 0, gets set to a non-zero default.
optional int32 polling_active_delay_usecs = 6;
// In the event polling loop sleep this many milliseconds between
// PollEvents calls, when the queue is empty. If value is not
// set or set to 0, gets set to a non-zero default.
optional int32 polling_inactive_delay_msecs = 7;
// Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
// enabling this option forces all CPU tensors to be allocated with Cuda
// pinned memory. Normally, TensorFlow will infer which tensors should be
// allocated as the pinned memory. But in case where the inference is
// incomplete, this option can significantly speed up the cross-device memory
// copy performance as long as it fits the memory.
// Note that this option is not something that should be
// enabled by default for unknown or very large models, since all Cuda pinned
// memory is unpageable, having too much pinned memory might negatively impact
// the overall host system performance.
optional bool force_gpu_compatible = 8;
}
message GraphOptions {
// If true, use control flow to schedule the activation of Recv nodes.
// (Currently ignored.)
optional bool enable_recv_scheduling = 2;
// Options controlling how graph is optimized.
// OptimizerOptions optimizer_options = 3;
// The number of steps to run before returning a cost model detailing
// the memory usage and performance of each node of the graph. 0 means
// no cost model.
optional int64 build_cost_model = 4;
// The number of steps to skip before collecting statistics for the
// cost model.
optional int64 build_cost_model_after = 9;
// Annotate each Node with Op output shape data, to the extent it can
// be statically inferred.
optional bool infer_shapes = 5;
// Only place the subgraphs that are run, rather than the entire graph.
//
// This is useful for interactive graph building, where one might
// produce graphs that cannot be placed during the debugging
// process. In particular, it allows the client to continue work in
// a session after adding a node to a graph whose placement
// constraints are unsatisfiable.
optional bool place_pruned_graph = 6;
// If true, transfer float values between processes as bfloat16.
optional bool enable_bfloat16_sendrecv = 7;
// If > 0, record a timeline every this many steps.
// EXPERIMENTAL: This currently has no effect in MasterSession.
optional int32 timeline_step = 8;
// Options that control the type and amount of graph rewriting.
// Not currently configurable via the public Python API (i.e. there is no API
// stability guarantee if you import RewriterConfig explicitly).
// RewriterConfig rewrite_options = 10;
}
message ConfigProto {
// Map from device type name (e.g., "CPU" or "GPU" ) to maximum
// number of devices of that type to use. If a particular device
// type is not found in the map, the system picks an appropriate
// number.
// map<string, int32> device_count = 1;
// The execution of an individual op (for some op types) can be
// parallelized on a pool of intra_op_parallelism_threads.
// 0 means the system picks an appropriate number.
optional int32 intra_op_parallelism_threads = 2;
// Nodes that perform blocking operations are enqueued on a pool of
// inter_op_parallelism_threads available in each process.
//
// 0 means the system picks an appropriate number.
//
// Note that the first Session created in the process sets the
// number of threads for all future sessions unless use_per_session_threads is
// true or session_inter_op_thread_pool is configured.
optional int32 inter_op_parallelism_threads = 5;
// If true, use a new set of threads for this session rather than the global
// pool of threads. Only supported by direct sessions.
//
// If false, use the global threads created by the first session, or the
// per-session thread pools configured by session_inter_op_thread_pool.
//
// This option is deprecated. The same effect can be achieved by setting
// session_inter_op_thread_pool to have one element, whose num_threads equals
// inter_op_parallelism_threads.
optional bool use_per_session_threads = 9;
// This option is experimental - it may be replaced with a different mechanism
// in the future.
//
// Configures session thread pools. If this is configured, then RunOptions for
// a Run call can select the thread pool to use.
//
// The intended use is for when some session invocations need to run in a
// background pool limited to a small number of threads:
// - For example, a session may be configured to have one large pool (for
// regular compute) and one small pool (for periodic, low priority work);
// using the small pool is currently the mechanism for limiting the inter-op
// parallelism of the low priority work. Note that it does not limit the
// parallelism of work spawned by a single op kernel implementation.
// - Using this setting is normally not needed in training, but may help some
// serving use cases.
// - It is also generally recommended to set the global_name field of this
// proto, to avoid creating multiple large pools. It is typically better to
// run the non-low-priority work, even across sessions, in a single large
// pool.
// repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
// Assignment of Nodes to Devices is recomputed every placement_period
// steps until the system warms up (at which point the recomputation
// typically slows down automatically).
optional int32 placement_period = 3;
// When any filters are present sessions will ignore all devices which do not
// match the filters. Each filter can be partially specified, e.g. "/job:ps"
// "/job:worker/replica:3", etc.
repeated string device_filters = 4;
// Options that apply to all GPUs.
optional GPUOptions gpu_options = 6;
// Whether soft placement is allowed. If allow_soft_placement is true,
// an op will be placed on CPU if
// 1. there's no GPU implementation for the OP
// or
// 2. no GPU devices are known or registered
// or
// 3. need to co-locate with reftype input(s) which are from CPU.
optional bool allow_soft_placement = 7;
// Whether device placements should be logged.
optional bool log_device_placement = 8;
// Options that apply to all graphs.
optional GraphOptions graph_options = 10;
// Global timeout for all blocking operations in this session. If non-zero,
// and not overridden on a per-operation basis, this value will be used as the
// deadline for all blocking operations.
optional int64 operation_timeout_in_ms = 11;
// Options that apply when this session uses the distributed runtime.
// RPCOptions rpc_options = 13;
// Optional list of all workers to use in this session.
// ClusterDef cluster_def = 14;
// Next: 15
}
\ No newline at end of file
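For reference, a hedged sketch of how this config is typically populated from Python. The generated-module path below is an assumption (the compiled proto's package layout is not shown in this diff); the field names come directly from the messages above.
from dragon.vm.tensorflow.core.protobuf import config_pb2  # hypothetical generated-module path

config = config_pb2.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5   # pre-allocate ~50% of GPU memory
config.gpu_options.allow_growth = True                      # start small and grow on demand
config.allow_soft_placement = True                           # fall back to CPU when no GPU kernel exists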
syntax = "proto2";
package tensorflow;
enum DataType {
// Not a legal value for DataType. Used to indicate a DataType field
// has not been set.
DT_INVALID = 0;
// Data types that all computation devices are expected to be
// capable to support.
DT_FLOAT = 1;
DT_DOUBLE = 2;
DT_INT32 = 3;
DT_UINT8 = 4;
DT_INT16 = 5;
DT_INT8 = 6;
DT_STRING = 7;
DT_COMPLEX64 = 8; // Single-precision complex
DT_INT64 = 9;
DT_BOOL = 10;
DT_QINT8 = 11; // Quantized int8
DT_QUINT8 = 12; // Quantized uint8
DT_QINT32 = 13; // Quantized int32
DT_BFLOAT16 = 14; // Float32 truncated to 16 bits. Only for cast ops.
DT_QINT16 = 15; // Quantized int16
DT_QUINT16 = 16; // Quantized uint16
DT_UINT16 = 17;
DT_COMPLEX128 = 18; // Double-precision complex
DT_HALF = 19;
DT_RESOURCE = 20;
DT_VARIANT = 21; // Arbitrary C++ data types
DT_UINT32 = 22;
DT_UINT64 = 23;
// Do not use! These are only for parameters. Every enum above
// should have a corresponding value below (verified by types_test).
DT_FLOAT_REF = 101;
DT_DOUBLE_REF = 102;
DT_INT32_REF = 103;
DT_UINT8_REF = 104;
DT_INT16_REF = 105;
DT_INT8_REF = 106;
DT_STRING_REF = 107;
DT_COMPLEX64_REF = 108;
DT_INT64_REF = 109;
DT_BOOL_REF = 110;
DT_QINT8_REF = 111;
DT_QUINT8_REF = 112;
DT_QINT32_REF = 113;
DT_BFLOAT16_REF = 114;
DT_QINT16_REF = 115;
DT_QUINT16_REF = 116;
DT_UINT16_REF = 117;
DT_COMPLEX128_REF = 118;
DT_HALF_REF = 119;
DT_RESOURCE_REF = 120;
DT_VARIANT_REF = 121;
DT_UINT32_REF = 122;
DT_UINT64_REF = 123;
}
\ No newline at end of file
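One detail that makes the enum easier to audit: every reference variant is its base value offset by 100, as the listed values show. A trivial illustration (values copied from the enum above):
DT_FLOAT, DT_FLOAT_REF = 1, 101
DT_VARIANT, DT_VARIANT_REF = 21, 121
assert DT_FLOAT_REF - DT_FLOAT == DT_VARIANT_REF - DT_VARIANT == 100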
......@@ -122,7 +122,7 @@ class Layer(module.Module):
"""
self._built = True
def forward(self, inputs, **kwargs):
def forward(self, inputs):
"""Method to define the forward operations.
Parameters
......
......@@ -85,10 +85,8 @@ class Conv2d(layer.Layer):
self.W_init = W_init
self.b_init = b_init
self.in_channels = in_channels
self.W = None
self.b = None
if self.in_channels:
self.build(None)
self._built = True
......@@ -116,7 +114,6 @@ class Conv2d(layer.Layer):
self.in_channels = inputs_shape[-1]
else:
self.in_channels = inputs_shape[1]
# Fake shape with ``channels_first`` format,
# to indicate the backend to compute fans correctly.
filter_shape = [self.n_filter, self.in_channels] + self.filter_size
......@@ -135,10 +132,8 @@ class Conv2d(layer.Layer):
def forward(self, inputs, **kwargs):
data_format = conv_utils.convert_data_format(self.data_format)
padding, pads = conv_utils.normalize_2d_args('padding', self.padding)
outputs = vision_ops.conv2d(
[inputs, self.W] + ([self.b] if self.b_init else []),
num_output=self.n_filter,
kernel_shape=self.filter_size,
strides=self.strides,
pads=pads,
......@@ -148,5 +143,4 @@ class Conv2d(layer.Layer):
)
if self.act:
outputs = self.act(outputs)
return outputs
......@@ -106,11 +106,7 @@ class Dense(layer.Layer):
def forward(self, inputs):
outputs = math_ops.fully_connected(
[inputs, self.W] + ([self.b] if self.b_init else []),
num_output=self.n_units,
axis=1,
transW=True,
)
[inputs, self.W] + ([self.b] if self.b_init else []), axis=1)
if self.act:
outputs = self.act(outputs)
return outputs
This diff could not be displayed because it is too large.
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import subprocess
import argparse
TESTS_AND_SOURCES = [
('dragon/core/test_ops', 'dragon.core.ops'),
]
TESTS = [t[0] for t in TESTS_AND_SOURCES]
SOURCES = [t[1] for t in TESTS_AND_SOURCES]
def parse_args():
parser = argparse.ArgumentParser(
description='Run the unittests',
epilog='where TESTS is any of: {}'.format(', '.join(TESTS)))
parser.add_argument(
'-v',
'--verbose',
action='store_true',
help='print verbose information')
parser.add_argument(
'-q',
'--quiet',
action='store_true',
help='print error information only')
parser.add_argument(
'-c',
'--coverage',
action='store_true',
help='run coverage for unittests')
return parser.parse_args()
def get_base_command(args):
"""Return the base running command."""
if args.coverage:
executable = ['coverage', 'run', '--parallel-mode']
else:
executable = [sys.executable]
return executable
def main():
"""The main procedure."""
args = parse_args()
base_command = get_base_command(args)
for i, test in enumerate(TESTS):
command = base_command[:]
if args.coverage:
if SOURCES[i]:
command.extend(['--source', SOURCES[i]])
command.append(test + '.py')
if args.verbose:
command.append('--verbose')
elif args.quiet:
command.append('--quiet')
subprocess.call(' '.join(command), shell=True)
if args.coverage:
subprocess.call(['coverage', 'combine'])
subprocess.call(['coverage', 'html'])
if __name__ == '__main__':
main()
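As a usage note: assuming this runner is saved as run_tests.py (the filename is not shown in the diff), invoking it with --coverage expands each entry to roughly "coverage run --parallel-mode --source dragon.core.ops dragon/core/test_ops.py", and then runs "coverage combine" and "coverage html" to merge and report the results.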
......@@ -80,8 +80,8 @@ from dragon.vm.torch.ops.init.functional import uniform
from dragon.vm.torch.ops.init.functional import zeros
from dragon.vm.torch.ops.init.functional import zeros_like
from dragon.vm.torch.ops.math.functional import abs
from dragon.vm.torch.ops.math.functional import accumulate
from dragon.vm.torch.ops.math.functional import add
from dragon.vm.torch.ops.math.functional import axpby
from dragon.vm.torch.ops.math.functional import bitwise_not
from dragon.vm.torch.ops.math.functional import bitwise_xor
from dragon.vm.torch.ops.math.functional import ceil
......
......@@ -98,7 +98,7 @@ class Function(object):
"""Generate the OpDef from attributes."""
attributes = self.attributes()
self._def = proto_util.make_operator_cdef(
name='Generic',
name=attributes.get('name', 'GenericOp'),
cache_key=self._cache_key,
op_type=attributes['op_type'],
device_option=proto_util.get_device_option(
......
......@@ -46,7 +46,6 @@ class _ConvNd(function.Function):
return {
'op_type': self.__class__.__name__,
'arguments': {
'num_output': self.num_output,
'kernel_shape': self.kernel_shape,
'strides': self.strides,
'pads': self.pads,
......
......@@ -16,15 +16,15 @@ from __future__ import print_function
from dragon.vm.torch.autograd import function
class Accumulate(function.Function):
class Axpby(function.Function):
def __init__(self, key, dev, **kwargs):
super(Accumulate, self).__init__(key, dev, **kwargs)
super(Axpby, self).__init__(key, dev, **kwargs)
self.alpha = kwargs.get('alpha', 1.)
self.beta = kwargs.get('beta', 1.)
def attributes(self):
return {
'op_type': 'Accumulate',
'op_type': 'Axpby',
'arguments': {
'alpha': self.alpha,
'beta': self.beta,
......@@ -36,9 +36,9 @@ class Accumulate(function.Function):
return self.dispatch([input], [out], no_grad=True)
class Binary(function.Function):
class BinaryFunc(function.Function):
def __init__(self, key, dev, **kwargs):
super(Binary, self).__init__(key, dev, **kwargs)
super(BinaryFunc, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', '')
def attributes(self):
......@@ -73,9 +73,9 @@ class Clip(function.Function):
return self.dispatch([input], [out])
class Unary(function.Function):
class UnaryFunc(function.Function):
def __init__(self, key, dev, **kwargs):
super(Unary, self).__init__(key, dev, **kwargs)
super(UnaryFunc, self).__init__(key, dev, **kwargs)
self.op_type = kwargs.get('op_type', '')
def attributes(self):
......@@ -86,18 +86,18 @@ class Unary(function.Function):
return self.dispatch([input], [out])
class MM(function.Function):
class MatMul(function.Function):
def __init__(self, key, dev, **kwargs):
super(MM, self).__init__(key, dev, **kwargs)
self.transA = kwargs.get('transA', False)
self.transB = kwargs.get('transB', False)
super(MatMul, self).__init__(key, dev, **kwargs)
self.transpose_a = kwargs.get('transpose_a', False)
self.transpose_b = kwargs.get('transpose_b', False)
def attributes(self):
return {
'op_type': 'Matmul',
'op_type': 'MatMul',
'arguments': {
'transA': self.transA,
'transB': self.transB,
'transA': self.transpose_a,
'transB': self.transpose_b,
},
}
......
......@@ -44,8 +44,8 @@ def abs(input, out=None):
return _unary_func(input, 'Abs', out)
def accumulate(input, alpha=1., beta=1., out=None):
r"""Compute the element-wise accumulation from input to output.
def axpby(input, alpha=1., beta=1., out=None):
r"""Compute the element-wise addition from input to output.
.. math:: \text{out} = \alpha * \text{input} + \beta * \text{out}
......@@ -66,7 +66,7 @@ def accumulate(input, alpha=1., beta=1., out=None):
The output tensor.
"""
return _functions.Accumulate \
return _functions.Axpby \
.instantiate(
input.device,
alpha=alpha,
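A hedged numpy illustration of the semantics above (not the vm API itself): axpby scales the input by alpha, scales the existing output by beta, and sums them element-wise.
import numpy as np

x = np.full((2, 3), 1.)
y = np.full((2, 3), 1.)
y = 2. * x + 3. * y   # what axpby(x, alpha=2., beta=3., out=y) is expected to leave in y
print(y)              # every element equals 5.0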
......@@ -555,7 +555,7 @@ def maximum(input, other, out=None):
"""
input, other = utils \
.remove_binary_scalar(input, other)
return _functions.Binary \
return _functions.BinaryFunc \
.instantiate(
input.device,
op_type='Maximum',
......@@ -584,28 +584,28 @@ def minimum(input, other, out=None):
"""
input, other = utils \
.remove_binary_scalar(input, other)
return _functions.Binary \
return _functions.BinaryFunc \
.instantiate(
input.device,
op_type='Minimum',
).apply(input, other, out)
def mm(input, mat2, transA=False, transB=False, out=None):
def mm(input, mat2, transpose_a=False, transpose_b=False, out=None):
r"""Compute matrix-matrix multiplication.
.. math:: \text{out} = AB
.. math:: \text{out} = a \times b
Parameters
----------
input : dragon.vm.torch.Tensor
The matrix :math:`A`.
The matrix :math:`a`.
mat2 : dragon.vm.torch.Tensor
The matrix :math:`B`.
transA : bool, optional, default=False
**True** to transpose :math:`A` before computation.
transB : bool, optional, default=False
**True** to transpose :math:`B` before computation.
The matrix :math:`b`.
transpose_a : bool, optional, default=False
**True** to transpose :math:`a` before computation.
transpose_b : bool, optional, default=False
**True** to transpose :math:`b` before computation.
out : dragon.vm.torch.Tensor, optional
The optional output.
......@@ -615,11 +615,11 @@ def mm(input, mat2, transA=False, transB=False, out=None):
The output tensor.
"""
return _functions.MM \
return _functions.MatMul \
.instantiate(
utils.unify_devices([input, mat2]),
transA=transA,
transB=transB,
transpose_a=transpose_a,
transpose_b=transpose_b,
).apply(input, mat2, out)
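A hedged numpy illustration of the renamed flags (shapes only; the vm tensors are not constructed here): transpose_a transposes the first operand before the product, transpose_b the second.
import numpy as np

a = np.random.rand(5, 2)
b = np.random.rand(5, 3)
out = a.T @ b          # the result mm(a, b, transpose_a=True) is expected to match
print(out.shape)       # (2, 3)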
......@@ -922,17 +922,13 @@ def sub(input, value, out=None):
def _binary_func(input, value, op_type='', out=None):
"""Generic binary function."""
input, value = utils.remove_binary_scalar(input, value)
return _functions.Binary \
.instantiate(
input.device,
op_type=op_type,
).apply(input, value, out)
return _functions.BinaryFunc \
.instantiate(input.device, op_type=op_type) \
.apply(input, value, out)
def _unary_func(input, op_type='', out=None):
"""Generic unary function."""
return _functions.Unary \
.instantiate(
input.device,
op_type=op_type,
).apply(input, out)
return _functions.UnaryFunc \
.instantiate(input.device, op_type=op_type) \
.apply(input, out)
......@@ -19,18 +19,18 @@ from dragon.vm.torch.autograd import function
class ParamUpdate(function.Function):
def __init__(self, key, dev, **kwargs):
super(ParamUpdate, self).__init__(key, dev, **kwargs)
self.slot = kwargs.get('slot', '')
self.lr_mult = kwargs.get('lr_mult', 1.)
self.decay_mult = kwargs.get('decay_mult', 1.)
self.op_type = kwargs.get('op_type', 'Update')
self.op_type = kwargs.get('op_type', '')
self.op_handle = kwargs.get('op_handle', '')
self.lr_mult = kwargs.get('lr_mult', 1)
self.decay_mult = kwargs.get('decay_mult', 1)
def attributes(self):
return {
'name': self.op_handle,
'op_type': self.op_type,
'arguments': {
'lr_mult': self.lr_mult,
'decay_mult': self.decay_mult,
'slot': self.slot,
'lr_mult': float(self.lr_mult),
'decay_mult': float(self.decay_mult),
},
}
......@@ -49,11 +49,8 @@ class GradAccumulate(function.Function):
def attributes(self):
return {
'op_type': 'Accumulate',
'arguments': {
'alpha': 1.,
'beta': 1.,
},
'op_type': 'Axpby',
'arguments': {'alpha': 1., 'beta': 1.},
}
def forward(self, grads):
......
......@@ -23,25 +23,23 @@ def grad_accumulate(grads):
if len(grads) == 0:
return
return _functions.GradAccumulate \
.instantiate(
grads[0].device,
).apply(grads)
.instantiate(grads[0].device).apply(grads)
def param_update(
param,
grad,
op_type,
slot,
lr_mult=1.,
decay_mult=1.,
op_handle,
lr_mult=1,
decay_mult=1,
):
"""Apply the param update."""
return _functions.ParamUpdate \
.instantiate(
param.device,
op_type=op_type,
slot=slot,
op_handle=op_handle,
lr_mult=lr_mult,
decay_mult=decay_mult,
).apply(param, grad)
......@@ -40,13 +40,12 @@ class Adam(Optimizer):
self,
params,
lr=1e-3,
beta1=0.9,
beta2=0.999,
betas=(0.9, 0.999),
eps=1e-8,
weight_decay=0,
amsgrad=False,
scale_gradient=1.,
clip_gradient=-1.,
scale=1,
clip_norm=0,
):
r"""Create an ``Adam`` optimizer.
......@@ -56,50 +55,47 @@ class Adam(Optimizer):
The parameters to optimize.
lr : float, required
The initial value for :math:`\text{lr}`.
beta1 : float, optional, default=0.9
The initial value for :math:`\beta_{1}`.
beta2 : float, optional, default=0.999
The initial value for :math:`\beta_{2}`.
betas : Tuple[float, float], optional, default=(0.9, 0.999)
The initial value for :math:`\beta_{1}` and :math:`\beta_{2}`.
eps : float, optional, default=1e-8
The initial value of :math:`\epsilon`.
weight_decay : float, optional, default=-1.
The factor of L2 penalty.
The initial value for :math:`\epsilon`.
weight_decay : float, optional, default=0
The L2 penalty factor to weight.
amsgrad : bool, optional, default=False
**True** to switch to **AMSGrad** optimizer.
scale_gradient : float, optional, default=1.
The factor to scale gradients.
clip_gradient : float, optional, default=-1.
The norm thresh to clip gradients.
scale : float, optional, default=1
The scaling factor to gradient.
clip_norm : float, optional, default=0
The maximum L2 norm to clip gradient.
"""
if not 0. <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0. <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0. <= beta1 < 1.:
raise ValueError("Invalid beta parameter at index 0: {}".format(beta1))
if not 0. <= beta2 < 1.:
raise ValueError("Invalid beta parameter at index 1: {}".format(beta2))
if not 0. <= betas[0] < 1.:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0. <= betas[1] < 1.:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
if amsgrad:
raise NotImplementedError()
defaults = dict(
lr=lr,
beta1=beta1,
beta2=beta2,
beta1=betas[0],
beta2=betas[1],
eps=eps,
weight_decay=weight_decay,
amsgrad=amsgrad,
scale_gradient=scale_gradient,
clip_gradient=clip_gradient,
scale=scale,
clip_norm=clip_norm,
weight_decay=weight_decay,
)
super(Adam, self).__init__(params, defaults)
self._update_op_type = 'AdamUpdate'
self._shared_args = {
'lr': 'base_lr',
'beta1': 'beta1',
'beta2': 'beta2',
'eps': 'eps',
'weight_decay': 'l2_decay',
'clip_gradient': 'clip_gradient',
'scale_gradient': 'scale_gradient',
'scale': 'scale',
'clip_norm': 'clip_norm',
'weight_decay': 'weight_decay',
}
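A hedged migration sketch for callers of this optimizer; the argument names are taken from the old and new signatures above, the values are illustrative only, and ``params`` stands in for any iterable of trainable tensors.
# old: Adam(params, lr=1e-3, beta1=0.9, beta2=0.999,
#           scale_gradient=1., clip_gradient=5.)
# new: Adam(params, lr=1e-3, betas=(0.9, 0.999),
#           scale=1, clip_norm=5.)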
......@@ -43,8 +43,8 @@ class Optimizer(object):
"""
# Store the global unique slot index.
_DEFAULT_UNIQUE_SLOT_ID = 0
# Store for the global unique handle
_DEFAULT_UNIQUE_HANDLE_INDEX = 0
def __init__(self, params, defaults):
"""Create a ``Optimizer``.
......@@ -69,7 +69,7 @@ class Optimizer(object):
param_groups = [{'params': param_groups}]
for param_group in param_groups:
self.add_param_group(param_group)
self._update_op_type = None
self._op_type = self.__class__.__name__ + 'Update'
self._process_group = distributed.get_group()
self._shared_args = {}
......@@ -113,8 +113,8 @@ class Optimizer(object):
# A group inherits the defaults while using ``multiplier``
param_group2 = {
'params': [],
'lr_mult': 1.,
'decay_mult': 1.,
'lr_mult': 1,
'decay_mult': 1,
}
```
......@@ -124,25 +124,21 @@ class Optimizer(object):
The param group to add.
"""
assert isinstance(param_group, dict), "Param group must be a dict."
if not isinstance(param_group, dict):
raise TypeError('Param group must be a dict.')
params = param_group['params']
if isinstance(params, Tensor):
param_group['params'] = [params]
elif isinstance(params, set):
raise TypeError(
'Optimizer parameters need to be organized in ordered collections,'
'\nbut the ordering of tensors in sets will change between runs.'
'\nPlease use a list instead.'
)
elif isinstance(params, (set, dict)):
raise TypeError('Parameters should be organized in a sequence.')
else:
param_group['params'] = list(params)
for param in param_group['params']:
if not param.requires_grad:
raise ValueError(
"Optimizing a Parameter that "
"Optimizing a parameter that "
"doesn't require gradients."
)
......@@ -155,17 +151,18 @@ class Optimizer(object):
else:
param_group.setdefault(name, default)
if 'slot' not in param_group:
Optimizer._DEFAULT_UNIQUE_SLOT_ID += 1
param_group['slot'] = 'Optimizer/Slot:{}'.format(
Optimizer._DEFAULT_UNIQUE_SLOT_ID)
if 'name' not in param_group:
Optimizer._DEFAULT_UNIQUE_HANDLE_INDEX += 1
param_group['name'] = 'Optimizer_{}'.format(
Optimizer._DEFAULT_UNIQUE_HANDLE_INDEX)
param_set = set()
for group in self.param_groups:
param_set.update(set(group['params']))
if not param_set.isdisjoint(set(param_group['params'])):
raise ValueError("Some parameters appear in more than one parameter group")
raise ValueError('Some parameters appear in '
'more than one parameter group.')
self.param_groups.append(param_group)
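A hedged usage sketch of the stricter checks above; ``w1``/``w2`` stand in for trainable tensors and ``SGD`` for any concrete subclass.
# opt = SGD([w1], lr=0.1)
# opt.add_param_group({'params': [w2], 'lr_mult': 0.1})  # ok: inherits the other defaults
# opt.add_param_group({'params': {w1, w2}})              # TypeError: must be a sequence
# opt.add_param_group({'params': [w1]})                  # ValueError: already in a group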
......@@ -224,7 +221,7 @@ class Optimizer(object):
def _init_set_defaults(self, group):
"""Initialize the defaults into current workspace."""
template = group['slot'] + '/{}'
template = '/share/hyper/%s/{}' % group['name']
for k, v in group.items():
if k in self._shared_args:
workspace.feed_tensor(
......@@ -256,10 +253,10 @@ class Optimizer(object):
for p, g in zip(params, grads):
training_funcs.param_update(
p, g,
slot=group['slot'],
op_type=self._update_op_type,
lr_mult=group.get('lr_mult', 1.),
decay_mult=group.get('decay_mult', 1.),
op_type=self._op_type,
op_handle=group['name'],
lr_mult=group.get('lr_mult', 1),
decay_mult=group.get('decay_mult', 1),
)
@staticmethod
......
......@@ -46,8 +46,8 @@ class RMSprop(Optimizer):
weight_decay=0,
momentum=0,
centered=False,
scale_gradient=1.,
clip_gradient=-1.,
scale=1,
clip_norm=0,
):
r"""Create a ``RMSprop`` optimizer.
......@@ -61,14 +61,14 @@ class RMSprop(Optimizer):
The initial value for :math:`\alpha`.
eps : float, optional, default=1e-7
The initial value for :math:`\epsilon`.
weight_decay : float, optional, default=-1.
The factor of L2 penalty.
weight_decay : float, optional, default=0
The L2 penalty factor to weight.
momentum : float, optional, default=0
The initial value for :math:`\text{momentum}`.
scale_gradient : float, optional, default=1.
The factor to scale gradients.
clip_gradient : float, optional, default=-1.
The norm thresh to clip gradients.
scale : float, optional, default=1
The scaling factor to gradient.
clip_norm : float, optional, default=0
The maximum L2 norm to clip gradient.
"""
if not 0. <= lr:
......@@ -85,18 +85,17 @@ class RMSprop(Optimizer):
alpha=alpha,
eps=eps,
centered=centered,
scale=scale,
clip_norm=clip_norm,
weight_decay=weight_decay,
scale_gradient=scale_gradient,
clip_gradient=clip_gradient,
)
super(RMSprop, self).__init__(params, defaults)
self._update_op_type = 'RMSPropUpdate'
self._shared_args = {
'lr': 'base_lr',
'momentum': 'momentum',
'alpha': 'decay',
'eps': 'eps',
'weight_decay': 'l2_decay',
'clip_gradient': 'clip_gradient',
'scale_gradient': 'scale_gradient',
'scale': 'scale',
'clip_norm': 'clip_norm',
'weight_decay': 'weight_decay',
}
......@@ -67,10 +67,10 @@ class SGD(Optimizer):
lr=required,
momentum=0,
dampening=0,
weight_decay=-1.,
weight_decay=0,
nesterov=False,
scale_gradient=1.,
clip_gradient=-1.,
scale=1,
clip_norm=0,
):
r"""Create a ``SGD`` optimizer.
......@@ -84,37 +84,37 @@ class SGD(Optimizer):
The initial value for :math:`\text{momentum}`.
dampening : float, optional, default=0
The dampening for :math:`\text{momentum}`.
weight_decay : float, optional, default=-1.
The factor of L2 penalty.
weight_decay : float, optional, default=0
The L2 penalty factor to weight.
nesterov : bool, optional, default=False
**True** to switch to **NesterovSGD** optimizer.
scale_gradient : float, optional, default=1.
The factor to scale gradients.
clip_gradient : float, optional, default=-1.
The norm thresh to clip gradients.
scale : float, optional, default=1
The scaling factor to gradient.
clip_norm : float, optional, default=0
The maximum L2 norm to clip gradient.
"""
if lr is not required and lr < 0.:
raise ValueError("Invalid learning rate: {}".format(lr))
raise ValueError('Invalid learning rate: {}'.format(lr))
if momentum < 0.:
raise ValueError("Invalid momentum value: {}".format(momentum))
raise ValueError('Invalid momentum value: {}'.format(momentum))
defaults = dict(
lr=lr,
momentum=momentum,
dampening=dampening,
weight_decay=weight_decay,
nesterov=nesterov,
scale_gradient=scale_gradient,
clip_gradient=clip_gradient,
scale=scale,
clip_norm=clip_norm,
weight_decay=weight_decay,
)
if nesterov and (momentum <= 0. or dampening != 0.):
raise ValueError("Nesterov momentum requires a momentum and zero dampening.")
raise ValueError('Nesterov momentum requires a momentum and zero dampening.')
super(SGD, self).__init__(params, defaults)
self._update_op_type = 'NesterovUpdate' if nesterov else 'SGDUpdate'
self._op_type = ('Nesterov' if nesterov else 'SGD') + 'Update'
self._shared_args = {
'lr': 'base_lr',
'momentum': 'momentum',
'weight_decay': 'l2_decay',
'clip_gradient': 'clip_gradient',
'scale_gradient': 'scale_gradient',
'scale': 'scale',
'clip_norm': 'clip_norm',
'weight_decay': 'weight_decay',
}
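As with Adam above, a hedged migration sketch for the renamed SGD arguments (values illustrative; ``params`` is any iterable of trainable tensors).
# old: SGD(params, lr=0.1, momentum=0.9, weight_decay=-1.,
#          scale_gradient=1., clip_gradient=-1.)
# new: SGD(params, lr=0.1, momentum=0.9, weight_decay=0,
#          scale=1, clip_norm=0)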