Refactor Fundamental Ops

Ting PAN
Commit f47e53cf authored Jul 14, 2018 by Ting PAN
Showing with 2426 additions and 1840 deletions
Dragon/include/core/common.h
Dragon/include/core/graph.h
Dragon/include/core/workspace.h
Dragon/include/operators/arithmetic/add_op.h
Dragon/include/operators/arithmetic/affine_op.h
Dragon/include/operators/arithmetic/div_op.h
Dragon/include/operators/arithmetic/fundamental_op.h
Dragon/include/operators/arithmetic/mul_op.h
Dragon/include/operators/arithmetic/sub_op.h
Dragon/include/operators/loss/sigmoid_cross_entropy_op.h
Dragon/include/operators/loss/sigmoid_focal_loss_op.h
Dragon/include/operators/loss/softmax_cross_entropy_op.h
Dragon/include/operators/loss/sparse_softmax_focal_loss_op.h → Dragon/include/operators/loss/softmax_focal_loss_op.h
Dragon/include/operators/loss/sparse_softmax_cross_entropy_op.h
Dragon/include/operators/misc/python_op.h
Dragon/include/operators/norm/instance_norm_op.h
Dragon/include/operators/norm/l2_norm_op.h
Dragon/include/utils/op_kernel.h
Dragon/modules/python/py_tensor.h
Dragon/python/dragon/core/tensor.py
--- a/Dragon/include/core/common.h
+++ b/Dragon/include/core/common.h
@@ -52,9 +52,9 @@ using Set = std::unordered_set<Value> ;
 /*
 * Define the Kernel version.
 *
- * | Major(2) | Minor(2) | Patch(07) |
+ * | Major(2) | Minor(2) | Patch(08) |
 */
-#define DRAGON_VERSION 2207
+#define DRAGON_VERSION 2208

 /*
 * Define the default random seed.

--- a/Dragon/include/core/graph.h
+++ b/Dragon/include/core/graph.h
@@ -23,7 +23,7 @@ class GraphBase {
        vector<string> parents;
        vector<string> childs;
        int op_idx = -1;
-        string op_type;
+        OperatorDef op_def;
    };

    GraphBase(

--- a/Dragon/include/core/workspace.h
+++ b/Dragon/include/core/workspace.h
@@ -27,7 +27,7 @@ class Workspace {
    typedef Map<string, unique_ptr<OperatorBase> > OperatorMap;
    typedef Map<string, unique_ptr<GraphBase> > GraphMap;
    typedef Map<string, TensorFiller> FillerMap;
-    typedef Map<string, string> RenameMap;
+    typedef Map<string, string> ProxyMap;

    Workspace(const string& name) : name_(name) { InitWorkspace(); }

@@ -56,9 +56,9 @@ class Workspace {

    inline Workspace* MoveWorkspace(Workspace* ws) {
        CHECK(ws) << "The given Workspace is invalid.";
-        if (workspace_map_.count(ws->name()))
-            return workspace_map_[ws->name()];
-        return workspace_map_[ws->name()] = ws;
+        if (ws_map_.count(ws->name()))
+            return ws_map_[ws->name()];
+        return ws_map_[ws->name()] = ws;
    }

    inline void ClearWorkspace() {
@@ -70,8 +70,8 @@ class Workspace {
    /******************** Tensor ********************/

    inline string GetTensorName(const string& name) {
-        if (rename_map_.count(name) > 0) {
-            return rename_map_[name];
+        if (proxy_map_.count(name) > 0) {
+            return proxy_map_[name];
        } else { return name; }
    }

@@ -84,7 +84,7 @@ class Workspace {
            return tensor_map_[query].get();
        if (use_remote) {
            //  search remote workspace
-            for (auto& it : workspace_map_) {
+            for (auto& it : ws_map_) {
                if (it.second->HasTensor(query))
                    return it.second->GetTensor(query);
            }
@@ -129,7 +129,7 @@ class Workspace {
        for (auto& it : tensor_map_)
            names.push_back(it.first);
        //  serach remote workspace
-        for (auto& it : workspace_map_) {
+        for (auto& it : ws_map_) {
            vector<string> sub_names = it.second->GetTensors();
            names.insert(names.end(),
                sub_names.begin(), sub_names.end());
@@ -147,7 +147,7 @@ class Workspace {
        if (!use_remote) return result;

        //  search remote workspace
-        for (auto& it : workspace_map_)
+        for (auto& it : ws_map_)
            result |= it.second->HasFiller(name);
        return result;
    }
@@ -167,7 +167,7 @@ class Workspace {
            return &filler_map_[name];

        //  search remote workspace
-        for (auto& it : workspace_map_) {
+        for (auto& it : ws_map_) {
            if (it.second->HasFiller(name))
                return it.second->GetFiller(name);
        }
@@ -274,20 +274,23 @@ class Workspace {

    /******************** Utility ********************/

-    inline void CreateRename(
-        const string&           old_tensor,
-        const string&           new_tensor) {
-        rename_map_[old_tensor] = new_tensor;
+    inline bool SetProxy(
+        const string&           key,
+        const string&           proxy) {
+        //  bind <key> to <proxy> with a single lookup; an existing
+        //  binding is kept, and false is returned only if it differs
+        auto res = proxy_map_.emplace(key, proxy);
+        return res.second || res.first->second == proxy;
    }

 private:
    string name_;
-    WorkspaceMap workspace_map_;
+    WorkspaceMap ws_map_;
    TensorMap tensor_map_;
    OperatorMap op_map_;
    GraphMap graph_map_;
    FillerMap filler_map_;
-    RenameMap rename_map_;
+    ProxyMap proxy_map_;
 };

 }    // namespace dragon

--- a/Dragon/include/operators/arithmetic/add_op.h
+++ b/Dragon/include/operators/arithmetic/add_op.h
-// ------------------------------------------------------------
-// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-//
-// Licensed under the BSD 2-Clause License.
-// You should have received a copy of the BSD 2-Clause License
-// along with the software. If not, See,
-//
-//      <https://opensource.org/licenses/BSD-2-Clause>
-//
-// ------------------------------------------------------------
-
-#ifndef DRAGON_OPERATORS_ARITHMETIC_ADD_OP_H_
-#define DRAGON_OPERATORS_ARITHMETIC_ADD_OP_H_
-
-#include "core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class AddOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(AddOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class AddGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(AddGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class RAddOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(RAddOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class RAddGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(RAddGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-}    // namespace dragon
-
-#endif    // DRAGON_OPERATORS_ARITHMETIC_ADD_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/arithmetic/affine_op.h
+++ b/Dragon/include/operators/arithmetic/affine_op.h
@@ -80,17 +80,17 @@ class CuDNNAffineOpBase : public Operator<Context> {
    }

    template <typename T>
-    void ResetDesc() {
+    void ResetDesc(const Tensor& X) {
        //  determine the range of affine
        start_axis = axis;
-        if (start_axis < 0) start_axis += (int)Input(0).ndim();
-        if (num_axes == -1) num_axes = (int)Input(0).ndim() - start_axis;
+        if (start_axis < 0) start_axis += (int)X.ndim();
+        if (num_axes == -1) num_axes = (int)X.ndim() - start_axis;
        else if (num_axes == 0) num_axes = 1;
        end_axis = start_axis + num_axes;
-        CHECK_LT(start_axis, (int)Input(0).ndim());
-        CHECK_LE(start_axis + num_axes, (int)Input(0).ndim());
+        CHECK_LT(start_axis, (int)X.ndim());
+        CHECK_LE(start_axis + num_axes, (int)X.ndim());
        //  determine the input desc
-        vector<TIndex> input_dims = Input(0).dims();
+        vector<TIndex> input_dims = X.dims();
        //  cudnn requires ndimensions range from [4, 5]
        if (input_dims.size() < 4) input_dims.resize(4, 1);
        else if (input_dims.size() > 5) 
@@ -98,7 +98,8 @@ class CuDNNAffineOpBase : public Operator<Context> {
        cudnnSetTensorDesc<T>(&input_desc, input_dims);
        //  determine the scale desc
        vector<TIndex> param_dims(input_dims.size(), 1);
-        for (int i = start_axis; i < end_axis; i++) param_dims[i] = input_dims[i];
+        for (int i = start_axis; i < end_axis; i++)
+            param_dims[i] = input_dims[i];
        cudnnSetTensorDesc<T>(&param_desc, param_dims);
    }


--- a/Dragon/include/operators/arithmetic/div_op.h
+++ b/Dragon/include/operators/arithmetic/div_op.h
-// ------------------------------------------------------------
-// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-//
-// Licensed under the BSD 2-Clause License.
-// You should have received a copy of the BSD 2-Clause License
-// along with the software. If not, See,
-//
-//      <https://opensource.org/licenses/BSD-2-Clause>
-//
-// ------------------------------------------------------------
-
-#ifndef DRAGON_OPERATORS_ARITHMETIC_DIV_OP_H_
-#define DRAGON_OPERATORS_ARITHMETIC_DIV_OP_H_
-
-#include "core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class DivOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(DivOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class DivGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(DivGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class RDivOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(RDivOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class RDivGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(RDivGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-}    // namepsace dragon
-
-#endif    // DRAGON_OPERATORS_ARITHMETIC_DIV_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/arithmetic/fundamental_op.h
+++ b/Dragon/include/operators/arithmetic/fundamental_op.h
+// ------------------------------------------------------------
+// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
+//
+// Licensed under the BSD 2-Clause License.
+// You should have received a copy of the BSD 2-Clause License
+// along with the software. If not, See,
+//
+//      <https://opensource.org/licenses/BSD-2-Clause>
+//
+// ------------------------------------------------------------
+
+#ifndef DRAGON_OPERATORS_ARITHMETIC_FUNDAMENTAL_OP_H_
+#define DRAGON_OPERATORS_ARITHMETIC_FUNDAMENTAL_OP_H_
+
+#include "core/operator.h"
+
+namespace dragon {
+
+/*********************************************
+*                                            *
+*                    Add                     *
+*                                            *
+**********************************************/
+
+// AddOp: declaration only; no member is defined in this header.
+// RunOnDevice() overrides a virtual of the base class, and the two
+// helpers are templated on the element type T.
+// NOTE(review): <type> is presumably the broadcast mode (0, 1 or 2)
+// selected by RunByX1X2 -- confirm against the implementation.
+template <class Context>
+class AddOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(AddOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// AddGradientOp: declaration only; no member is defined in this header.
+// It exposes the same three entry points as AddOp and is the only
+// class in this header that also keeps two Tensor pointers.
+// NOTE(review): presumably the gradient counterpart of AddOp -- confirm.
+template <class Context>
+class AddGradientOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(AddGradientOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+
+ protected:
+    //  NOTE(review): DefineX1X2 declares locals with these same names,
+    //  which would shadow the members wherever it is expanded inside
+    //  a member function -- verify which of the two is intended.
+    Tensor *X1, *X2;
+};
+
+/*********************************************
+*                                            *
+*                    RAdd                    *
+*                                            *
+**********************************************/
+
+// RAddOp: declaration only; no member is defined in this header.
+// RunOnDevice() overrides a virtual of the base class, and the two
+// helpers are templated on the element type T.
+// NOTE(review): presumably the variant of AddOp dispatched through
+// RRunByX1X2 (X1 is the broadcast operand) -- confirm.
+template <class Context>
+class RAddOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(RAddOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// RAddGradientOp: declaration only; no member is defined in this header.
+// It exposes the same three entry points as RAddOp.
+// NOTE(review): presumably the gradient counterpart of RAddOp -- confirm.
+template <class Context>
+class RAddGradientOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(RAddGradientOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+/*********************************************
+*                                            *
+*                     Sub                    *
+*                                            *
+**********************************************/
+
+// SubOp: declaration only; no member is defined in this header.
+// RunOnDevice() overrides a virtual of the base class, and the two
+// helpers are templated on the element type T.
+// NOTE(review): <type> is presumably the broadcast mode (0, 1 or 2)
+// selected by RunByX1X2 -- confirm against the implementation.
+template <class Context>
+class SubOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(SubOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// SubGradientOp: declaration only; no member is defined in this header.
+// It exposes the same three entry points as SubOp.
+// NOTE(review): presumably the gradient counterpart of SubOp -- confirm.
+template <class Context>
+class SubGradientOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(SubGradientOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+/*********************************************
+*                                            *
+*                    RSub                    *
+*                                            *
+**********************************************/
+
+// RSubOp: declaration only; no member is defined in this header.
+// RunOnDevice() overrides a virtual of the base class, and the two
+// helpers are templated on the element type T.
+// NOTE(review): presumably the variant of SubOp dispatched through
+// RRunByX1X2 (X1 is the broadcast operand) -- confirm.
+template <class Context>
+class RSubOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(RSubOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// RSubGradientOp: declaration only; no member is defined in this header.
+// It exposes the same three entry points as RSubOp.
+// NOTE(review): presumably the gradient counterpart of RSubOp -- confirm.
+template <class Context>
+class RSubGradientOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(RSubGradientOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+/*********************************************
+*                                            *
+*                     Mul                    *
+*                                            *
+**********************************************/
+
+// MulOp: declaration only; no member is defined in this header.
+// RunOnDevice() overrides a virtual of the base class, and the two
+// helpers are templated on the element type T.
+// NOTE(review): <type> is presumably the broadcast mode (0, 1 or 2)
+// selected by RunByX1X2 -- confirm against the implementation.
+template <class Context>
+class MulOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(MulOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// MulGradientOp: declaration only; no member is defined in this header.
+// It exposes the same three entry points as MulOp.
+// NOTE(review): presumably the gradient counterpart of MulOp -- confirm.
+template <class Context>
+class MulGradientOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(MulGradientOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+/*********************************************
+*                                            *
+*                     RMul                   *
+*                                            *
+**********************************************/
+
+// RMulOp: declaration only; no member is defined in this header.
+// RunOnDevice() overrides a virtual of the base class, and the two
+// helpers are templated on the element type T.
+// NOTE(review): presumably the variant of MulOp dispatched through
+// RRunByX1X2 (X1 is the broadcast operand) -- confirm.
+template <class Context>
+class RMulOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(RMulOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// RMulGradientOp: declaration only; no member is defined in this header.
+// It exposes the same three entry points as RMulOp.
+// NOTE(review): presumably the gradient counterpart of RMulOp -- confirm.
+template <class Context>
+class RMulGradientOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(RMulGradientOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+/*********************************************
+*                                            *
+*                    Div                     *
+*                                            *
+**********************************************/
+
+// DivOp: declaration only; no member is defined in this header.
+// RunOnDevice() overrides a virtual of the base class, and the two
+// helpers are templated on the element type T.
+// NOTE(review): <type> is presumably the broadcast mode (0, 1 or 2)
+// selected by RunByX1X2 -- confirm against the implementation.
+template <class Context>
+class DivOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(DivOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// DivGradientOp: declaration only; no member is defined in this header.
+// It exposes the same three entry points as DivOp.
+// NOTE(review): presumably the gradient counterpart of DivOp -- confirm.
+template <class Context>
+class DivGradientOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(DivGradientOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+/*********************************************
+*                                            *
+*                    RDiv                    *
+*                                            *
+**********************************************/
+
+// RDivOp: declaration only; no member is defined in this header.
+// RunOnDevice() overrides a virtual of the base class, and the two
+// helpers are templated on the element type T.
+// NOTE(review): presumably the variant of DivOp dispatched through
+// RRunByX1X2 (X1 is the broadcast operand) -- confirm.
+template <class Context>
+class RDivOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(RDivOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// RDivGradientOp: declaration only; no member is defined in this header.
+// It exposes the same three entry points as RDivOp.
+// NOTE(review): presumably the gradient counterpart of RDivOp -- confirm.
+template <class Context>
+class RDivGradientOp final : public Operator<Context> {
+ public:
+    USE_SIMPLE_CTOR_DTOR(RDivGradientOp);
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void EltwiseRunWithType();
+    template <typename T> void BroadcastRunWithType(int type);
+};
+
+// DeclareX1X2: asks the workspace for two tensors named
+// "/mnt/<anchor>/fundamental/X1" and ".../X2" (via CreateTensor) and
+// reshapes them like Input(0) and Input(1) respectively.
+// Expands to two statements and leaves the final ';' to the caller,
+// so it must not be used as the unbraced body of an if/else/loop.
+// NOTE(review): presumably these shape-only tensors let other ops
+// recover the input shapes later -- confirm against the callers.
+#define DeclareX1X2 \
+    ws()->CreateTensor( \
+        "/mnt/" + anchor() + "/fundamental/X1") \
+        ->ReshapeLike(Input(0)); \
+    ws()->CreateTensor( \
+        "/mnt/" + anchor() + "/fundamental/X2") \
+        ->ReshapeLike(Input(1))
+
+// DefineX1X2: declares two local pointers, X1 and X2, bound to the
+// tensors named "/mnt/<anchor>/fundamental/X1" and ".../X2" (looked up
+// with GetTensor). Because it introduces named locals into the
+// enclosing scope, it can be expanded at most once per scope, and the
+// locals hide any class members that are also called X1 / X2.
+#define DefineX1X2 \
+    Tensor* X1 = ws()->GetTensor( \
+        "/mnt/" + anchor() + "/fundamental/X1"); \
+    Tensor* X2 = ws()->GetTensor( \
+        "/mnt/" + anchor() + "/fundamental/X2")
+
+// RunByX1X2(dtype): expands DefineX1X2, then picks a kernel by
+// comparing the shapes of X1 and X2 (first match wins):
+//   - identical dims                              -> EltwiseRunWithType
+//   - equal dim(0) and X2->count(1) == 1          -> BroadcastRunWithType(2)
+//   - equal dim(-1) and X2->count(0, last) == 1   -> BroadcastRunWithType(1)
+//   - X2 is 1-d with a single element             -> BroadcastRunWithType(0)
+//   - anything else                               -> LOG(FATAL)
+// The expansion is several statements that are not wrapped in a block
+// and it leaves X1 / X2 declared in the caller's scope, so use it only
+// inside braces and at most once per scope.
+#define RunByX1X2(dtype) \
+    DefineX1X2; \
+    if (X1->dims() == X2->dims()) { \
+        EltwiseRunWithType<dtype>(); \
+    } else if (X1->dim(0) == X2->dim(0) && \
+        X2->count(1) == 1) { \
+        BroadcastRunWithType<dtype>(2); \
+    } else if (X1->dim(-1) == X2->dim(-1) && \
+        X2->count(0, X2->axis(-1)) == 1) { \
+        BroadcastRunWithType<dtype>(1); \
+    } else if (X2->ndim() == 1 && X2->dim(0) == 1) { \
+        BroadcastRunWithType<dtype>(0); \
+    } else { \
+        LOG(FATAL) << "Could not broadcast with shapes " \
+                   << X1->DimString() << "  " \
+                   << X2->DimString(); \
+    }
+
+// RRunByX1X2(dtype): same dispatch as RunByX1X2, except that the
+// broadcast tests inspect X1 instead of X2 (first match wins):
+//   - identical dims                              -> EltwiseRunWithType
+//   - equal dim(0) and X1->count(1) == 1          -> BroadcastRunWithType(2)
+//   - equal dim(-1) and X1->count(0, last) == 1   -> BroadcastRunWithType(1)
+//   - X1 is 1-d with a single element             -> BroadcastRunWithType(0)
+//   - anything else                               -> LOG(FATAL)
+// The expansion is several statements that are not wrapped in a block
+// and it leaves X1 / X2 declared in the caller's scope, so use it only
+// inside braces and at most once per scope.
+#define RRunByX1X2(dtype) \
+    DefineX1X2; \
+    if (X1->dims() == X2->dims()) { \
+        EltwiseRunWithType<dtype>(); \
+    } else if (X1->dim(0) == X2->dim(0) && \
+        X1->count(1) == 1) { \
+        BroadcastRunWithType<dtype>(2); \
+    } else if (X1->dim(-1) == X2->dim(-1) && \
+        X1->count(0, X1->axis(-1)) == 1) { \
+        BroadcastRunWithType<dtype>(1); \
+    } else if (X1->ndim() == 1 && X1->dim(0) == 1) { \
+        BroadcastRunWithType<dtype>(0); \
+    } else { \
+        LOG(FATAL) << "Could not broadcast with shapes " \
+                   << X1->DimString() << "  " \
+                   << X2->DimString(); \
+    }
+
+}    // namespace dragon
+
+#endif    // DRAGON_OPERATORS_ARITHMETIC_FUNDAMENTAL_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/arithmetic/mul_op.h
+++ b/Dragon/include/operators/arithmetic/mul_op.h
-// ------------------------------------------------------------
-// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-//
-// Licensed under the BSD 2-Clause License.
-// You should have received a copy of the BSD 2-Clause License
-// along with the software. If not, See,
-//
-//      <https://opensource.org/licenses/BSD-2-Clause>
-//
-// ------------------------------------------------------------
-
-#ifndef DRAGON_OPERATORS_ARITHMETIC_MUL_OP_H_
-#define DRAGON_OPERATORS_ARITHMETIC_MUL_OP_H_
-
-#include "core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class MulOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(MulOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class MulGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(MulGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class RMulOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(RMulOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class RMulGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(RMulGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-}    // namespace dragon
-
-#endif    // DRAGON_OPERATORS_ARITHMETIC_MUL_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/arithmetic/sub_op.h
+++ b/Dragon/include/operators/arithmetic/sub_op.h
-// ------------------------------------------------------------
-// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
-//
-// Licensed under the BSD 2-Clause License.
-// You should have received a copy of the BSD 2-Clause License
-// along with the software. If not, See,
-//
-//      <https://opensource.org/licenses/BSD-2-Clause>
-//
-// ------------------------------------------------------------
-
-#ifndef DRAGON_OPERATORS_ARITHMETIC_SUB_OP_H_
-#define DRAGON_OPERATORS_ARITHMETIC_SUB_OP_H_
-
-#include "core/operator.h"
-
-namespace dragon {
-
-template <class Context>
-class SubOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(SubOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class SubGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(SubGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class RSubOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(RSubOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-template <class Context>
-class RSubGradientOp final : public Operator<Context> {
- public:
-    USE_SIMPLE_CTOR_DTOR(RSubGradientOp);
-    USE_OPERATOR_FUNCTIONS;
-
-    void RunOnDevice() override;
-    template <typename T> void EltwiseRunWithType();
-    template <typename T> void BroadcastRunWithType(int type);
-};
-
-}    // namespace dragon
-
-#endif    // DRAGON_OPERATORS_ARITHMETIC_SUB_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/loss/sigmoid_cross_entropy_op.h
+++ b/Dragon/include/operators/loss/sigmoid_cross_entropy_op.h
@@ -17,9 +17,12 @@
 namespace dragon {

 template <class Context>
-class SigmoidCrossEntropyOp final : public Operator<Context> {
+class SigmoidCrossEntropyOp
+    final : public Operator<Context> {
 public:
-    SigmoidCrossEntropyOp(const OperatorDef& def, Workspace* ws)
+    SigmoidCrossEntropyOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
        : Operator<Context>(def, ws),
          normalization(OperatorBase::Arg<string>(
              "normalization", "VALID")) {}
@@ -29,14 +32,17 @@ class SigmoidCrossEntropyOp final : public Operator<Context> {
    template <typename T> void RunWithType();

 protected:
-    Tensor valid, losses;
+    Tensor losses, flags;
    string normalization;
 };

 template <class Context>
-class SigmoidCrossEntropyGradientOp final : public Operator<Context> {
+class SigmoidCrossEntropyGradientOp
+    final : public Operator<Context> {
 public:
-    SigmoidCrossEntropyGradientOp(const OperatorDef& def, Workspace* ws)
+    SigmoidCrossEntropyGradientOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
        : Operator<Context>(def, ws),
          normalization(OperatorBase::Arg<string>(
              "normalization", "VALID")) {}
@@ -46,7 +52,7 @@ class SigmoidCrossEntropyGradientOp final : public Operator<Context> {
    template <typename T> void RunWithType();

 protected:
-    Tensor valid;
+    Tensor flags;
    string normalization;
 };


--- a/Dragon/include/operators/loss/sigmoid_focal_loss_op.h
+++ b/Dragon/include/operators/loss/sigmoid_focal_loss_op.h
+// ------------------------------------------------------------
+// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
+//
+// Licensed under the BSD 2-Clause License.
+// You should have received a copy of the BSD 2-Clause License
+// along with the software. If not, See,
+//
+//      <https://opensource.org/licenses/BSD-2-Clause>
+//
+// -------------------------------------------------------------
+
+#ifndef DRAGON_OPERATORS_LOSS_SIGMOID_FOCAL_LOSS_OP_H_
+#define DRAGON_OPERATORS_LOSS_SIGMOID_FOCAL_LOSS_OP_H_
+
+#include "core/operator.h"
+
+namespace dragon {
+
+// SigmoidFocalLossOp: declaration of the sigmoid focal loss operator.
+//
+// Arguments read from the OperatorDef (default in parentheses):
+//   alpha (0.25), gamma (2.0), axis (1), neg_id (0),
+//   normalization ("VALID").
+// pos_alpha is initialized to alpha and neg_alpha to (1 - alpha).
+// NOTE(review): outer_dim / axis_dim / inner_dim are not set by the
+// constructor; presumably RunOnDevice() assigns them -- confirm.
+template <class Context>
+class SigmoidFocalLossOp
+    final : public Operator<Context> {
+ public:
+    SigmoidFocalLossOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
+        : Operator<Context>(def, ws),
+          //  written in member declaration order, which is the
+          //  order the members are initialized in regardless
+          alpha(OperatorBase::Arg<float>("alpha", 0.25f)),
+          gamma(OperatorBase::Arg<float>("gamma", 2.f)),
+          pos_alpha(alpha),
+          neg_alpha(1.f - alpha),
+          axis(OperatorBase::Arg<int>("axis", 1)),
+          neg_id(OperatorBase::Arg<int>("neg_id", 0)),
+          normalization(OperatorBase::Arg<string>(
+              "normalization", "VALID")) {}
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void RunWithType();
+
+ protected:
+    float alpha, gamma, pos_alpha, neg_alpha;
+    TIndex axis, neg_id, outer_dim, axis_dim, inner_dim;
+    Tensor losses, flags;
+    string normalization;
+};
+
+// SigmoidFocalLossGradientOp: declaration of the operator paired with
+// SigmoidFocalLossOp; it reads the same arguments with the same
+// defaults:
+//   alpha (0.25), gamma (2.0), axis (1), neg_id (0),
+//   normalization ("VALID").
+// pos_alpha is initialized to alpha and neg_alpha to (1 - alpha).
+// NOTE(review): outer_dim / axis_dim / inner_dim are not set by the
+// constructor; presumably RunOnDevice() assigns them -- confirm.
+template <class Context>
+class SigmoidFocalLossGradientOp
+    final : public Operator<Context> {
+ public:
+    SigmoidFocalLossGradientOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
+        : Operator<Context>(def, ws),
+          //  written in member declaration order, which is the
+          //  order the members are initialized in regardless
+          alpha(OperatorBase::Arg<float>("alpha", 0.25f)),
+          gamma(OperatorBase::Arg<float>("gamma", 2.f)),
+          pos_alpha(alpha),
+          neg_alpha(1.f - alpha),
+          axis(OperatorBase::Arg<int>("axis", 1)),
+          neg_id(OperatorBase::Arg<int>("neg_id", 0)),
+          normalization(OperatorBase::Arg<string>(
+              "normalization", "VALID")) {}
+    USE_OPERATOR_FUNCTIONS;
+
+    void RunOnDevice() override;
+    template <typename T> void RunWithType();
+
+ protected:
+    float alpha, gamma, pos_alpha, neg_alpha;
+    TIndex axis, neg_id, outer_dim, axis_dim, inner_dim;
+    Tensor flags;
+    string normalization;
+};
+
+}    // namespace dragon
+
+#endif    // DRAGON_OPERATORS_LOSS_SIGMOID_FOCAL_LOSS_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/loss/softmax_cross_entropy_op.h
+++ b/Dragon/include/operators/loss/softmax_cross_entropy_op.h
@@ -17,9 +17,12 @@
 namespace dragon {

 template <class Context>
-class SoftmaxCrossEntropyOp final : public Operator<Context> {
+class SoftmaxCrossEntropyOp
+    final : public Operator<Context> {
 public:
-    SoftmaxCrossEntropyOp(const OperatorDef& def, Workspace* ws)
+    SoftmaxCrossEntropyOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
        : Operator<Context>(def, ws),
          axis(OperatorBase::Arg<int>("axis", 1)),
          normalization(OperatorBase::Arg<string>(
@@ -39,9 +42,12 @@ class SoftmaxCrossEntropyOp final : public Operator<Context> {
 };

 template <class Context>
-class SoftmaxCrossEntropyGradientOp final : public Operator<Context> {
+class SoftmaxCrossEntropyGradientOp
+    final : public Operator<Context> {
 public:
-    SoftmaxCrossEntropyGradientOp(const OperatorDef& def, Workspace* ws)
+    SoftmaxCrossEntropyGradientOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
        : Operator<Context>(def, ws),
          axis(OperatorBase::Arg<int>("axis", 1)),
          normalization(OperatorBase::Arg<string>(

--- a/Dragon/include/operators/loss/sparse_softmax_focal_loss_op.h
+++ b/Dragon/include/operators/loss/sparse_softmax_focal_loss_op.h
@@ -9,18 +9,20 @@
 //
 // -------------------------------------------------------------

-#ifndef DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_FOCAL_LOSS_OP_H_
-#define DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_FOCAL_LOSS_OP_H_
+#ifndef DRAGON_OPERATORS_LOSS_SOFTMAX_FOCAL_LOSS_OP_H_
+#define DRAGON_OPERATORS_LOSS_SOFTMAX_FOCAL_LOSS_OP_H_

 #include "operators/loss/sparse_softmax_cross_entropy_op.h"

 namespace dragon {

 template <class Context>
-class SparseSoftmaxFocalLossOp final
-    : public SparseSoftmaxCrossEntropyOp<Context> {
+class SoftmaxFocalLossOp
+    final : public SparseSoftmaxCrossEntropyOp<Context> {
 public:
-    SparseSoftmaxFocalLossOp(const OperatorDef& def, Workspace* ws)
+    SoftmaxFocalLossOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
        : SparseSoftmaxCrossEntropyOp<Context>(def, ws),
           axis(OperatorBase::Arg<int>("axis", 1)),
           normalization(OperatorBase::Arg<string>(
@@ -44,10 +46,12 @@ class SparseSoftmaxFocalLossOp final
 };

 template <class Context>
-class SparseSoftmaxFocalLossGradientOp final
-    : public SparseSoftmaxCrossEntropyGradientOp<Context> {
+class SoftmaxFocalLossGradientOp
+    final : public SparseSoftmaxCrossEntropyGradientOp<Context> {
 public:
-    SparseSoftmaxFocalLossGradientOp(const OperatorDef& def, Workspace* ws)
+    SoftmaxFocalLossGradientOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
         : SparseSoftmaxCrossEntropyGradientOp<Context>(def, ws),
           axis(OperatorBase::Arg<int>("axis", 1)),
           normalization(OperatorBase::Arg<string>(
@@ -72,4 +76,4 @@ class SparseSoftmaxFocalLossGradientOp final

 }    // namespace dragon

-#endif    // DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_FOCAL_LOSS_OP_H_
\ No newline at end of file
+#endif    // DRAGON_OPERATORS_LOSS_SOFTMAX_FOCAL_LOSS_OP_H_
\ No newline at end of file
--- a/Dragon/include/operators/loss/sparse_softmax_cross_entropy_op.h
+++ b/Dragon/include/operators/loss/sparse_softmax_cross_entropy_op.h
@@ -19,7 +19,9 @@ namespace dragon {
 template <class Context>
 class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
 public:
-    SparseSoftmaxCrossEntropyOp(const OperatorDef& def, Workspace* ws)
+    SparseSoftmaxCrossEntropyOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
        : Operator<Context>(def, ws),
          axis(OperatorBase::Arg<int>("axis", 1)),
          normalization(OperatorBase::Arg<string>(
@@ -47,9 +49,12 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
 };

 template <class Context>
-class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> {
+class SparseSoftmaxCrossEntropyGradientOp
+    : public Operator<Context> {
 public:
-    SparseSoftmaxCrossEntropyGradientOp(const OperatorDef& def, Workspace* ws)
+    SparseSoftmaxCrossEntropyGradientOp(
+        const OperatorDef&          def,
+        Workspace*                  ws)
        : Operator<Context>(def, ws),
          axis(OperatorBase::Arg<int>("axis", 1)),
          normalization(OperatorBase::Arg<string>(

--- a/Dragon/include/operators/misc/python_op.h
+++ b/Dragon/include/operators/misc/python_op.h
@@ -29,6 +29,7 @@ class RunOp : public Operator<Context> {
    void RunOnDevice() override;

 protected:
+    string CallMethodHelper(const string& method);
    PyObject* self, *inputs, *outputs;
    string module, op, param_str;
 };

--- a/Dragon/include/operators/norm/instance_norm_op.h
+++ b/Dragon/include/operators/norm/instance_norm_op.h
--- a/Dragon/include/operators/norm/l2_norm_op.h
+++ b/Dragon/include/operators/norm/l2_norm_op.h
@@ -23,7 +23,7 @@ class L2NormOp final : public Operator<Context> {
        : Operator<Context>(def, ws),
          axis(OperatorBase::Arg<int>("axis", 0)),
          num_axes(OperatorBase::Arg<int>("num_axes", -1)),
-          eps(OperatorBase::Arg<float>("eps", 1e-5f)),
+          eps(OperatorBase::Arg<float>("eps", 1e-3f)),
          mode(OperatorBase::Arg<string>("mode", "SUM")) {}
    USE_OPERATOR_FUNCTIONS;


--- a/Dragon/include/utils/op_kernel.h
+++ b/Dragon/include/utils/op_kernel.h
@@ -247,18 +247,52 @@ void AbsGrad(
 template <typename T, class Context>
 void SigmoidCrossEntropy(
    const int               count,
-    const T*                x,
-    const T*                target,
-    T*                      loss,
-    T*                      valid);
+    const T*                logits,
+    const T*                targets,
+    T*                      losses,
+    T*                      flags,
+    Context*                ctx);

 template <typename T, class Context>
 void SigmoidCrossEntropyGrad(
    const int               count,
-    const T*                x,
-    const T*                target,
-    T*                      dx,
-    T*                      valid);
+    const T*                logits,
+    const T*                targets,
+    T*                      dlogits,
+    T*                      flags,
+    Context*                ctx);
+
+/******************** loss.sigmoid_focal_loss ********************/
+
+template <typename T, class Context>
+void SigmoidFocalLoss(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const float*            logits,
+    const float*            targets,
+    float*                  losses,
+    float*                  flags,
+    Context*                ctx);
+
+template <typename T, class Context>
+void SigmoidFocalLossGradient(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const float*            logits,
+    const float*            targets,
+    float*                  dlogits,
+    float*                  flags,
+    Context*                ctx);

 /******************** loss.smooth_l1_loss ********************/

@@ -285,38 +319,10 @@ void SoftmaxCrossEntropy(
    const T*                target,
    T*                      loss);

-/******************** loss.sparse_softmax_cross_entropy ********************/
-
-template <typename Tx, typename Ty, class Context>
-void SparseSoftmaxCrossEntropy(
-    const int               outer_dim,
-    const int               axis_dim,
-    const int               inner_dim,
-    const Tx*               prob,
-    const Ty*               labels,
-    const int*              ignores,
-    const int               num_ignores,
-    Tx*                     losses,
-    Tx*                     flags,
-    Context*                ctx);
-
-template <typename Tx, typename Ty, class Context>
-void SparseSoftmaxCrossEntropyGrad(
-    const int               outer_dim,
-    const int               axis_dim,
-    const int               inner_dim,
-    const Tx*               prob,
-    const Ty*               labels,
-    const int*              ignores,
-    const int               num_ignores,
-    Tx*                     dx,
-    Tx*                     flags,
-    Context*                ctx);
-
-/******************** loss.sparse_softmax_focal_loss ********************/
+/******************** loss.softmax_focal_loss ********************/

 template <typename T, class Context>
-void SparseSoftmaxFocalLoss(
+void SoftmaxFocalLoss(
    const int               outer_dim,
    const int               axis_dim,
    const int               inner_dim,
@@ -329,11 +335,11 @@ void SparseSoftmaxFocalLoss(
    const int*              ignores,
    const int               num_ignores,
    T*                      losses,
-    T*                      flags ,
+    T*                      flags,
    Context*                ctx);

 template <typename T, class Context>
-void SparseSoftmaxFocalLossGrad(
+void SoftmaxFocalLossGrad(
    const int               outer_dim,
    const int               axis_dim,
    const int               inner_dim,
@@ -349,6 +355,34 @@ void SparseSoftmaxFocalLossGrad(
    T*                      flags,
    Context*                ctx);

+/******************** loss.sparse_softmax_cross_entropy ********************/
+
+template <typename Tx, typename Ty, class Context>
+void SparseSoftmaxCrossEntropy(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const Tx*               prob,
+    const Ty*               labels,
+    const int*              ignores,
+    const int               num_ignores,
+    Tx*                     losses,
+    Tx*                     flags,
+    Context*                ctx);
+
+template <typename Tx, typename Ty, class Context>
+void SparseSoftmaxCrossEntropyGrad(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const Tx*               prob,
+    const Ty*               labels,
+    const int*              ignores,
+    const int               num_ignores,
+    Tx*                     dx,
+    Tx*                     flags,
+    Context*                ctx);
+
 /******************** misc.astype ********************/

 template <typename Ta, typename Tb, class Context>

--- a/Dragon/modules/python/py_tensor.h
+++ b/Dragon/modules/python/py_tensor.h
@@ -69,7 +69,7 @@ inline PyObject* RenameTensorCC(PyObject* self, PyObject* args) {
        PyErr_SetString(PyExc_ValueError, err_msg.c_str());
        return nullptr;
    }
-    ws()->CreateRename(ori_name, tar_name);
+    ws()->SetProxy(ori_name, tar_name);
    Py_RETURN_TRUE;
 }


--- a/Dragon/python/dragon/core/tensor.py
+++ b/Dragon/python/dragon/core/tensor.py
@@ -947,6 +947,7 @@ class Tensor(object):

        Examples
        --------
+        >>> import dragon as dg
        >>> a = Tensor().Variable()
        >>> b = Tensor().Variable()
        >>> c = Tensor.CreateOperator(inputs=[a, b], op_type='Add', nout=1)
@@ -956,12 +957,10 @@ class Tensor(object):
        >>> c = Tensor().Variable()
        >>> c = Tensor.CreateOperator(inputs=[a, b], op_type='Add', existing_outputs=c)

-        >>> import dragon.core.workspace as ws
-        >>> import dragon.vm.theano as theano
        >>> dynamic_shape = Tensor().Variable()
-        >>> ws.FeedTensor(dynamic_shape, [1, 2, 3, 4])
-        >>> a = ops.Fill(shape=dynamic_shape, value=5.0)
-        >>> print theano.function(outputs=a)
+        >>> dg.workspace.FeedTensor(dynamic_shape, [1, 2, 3, 4])
+        >>> a = dg.Fill(shape=dynamic_shape, value=5.0)
+        >>> print dg.function(outputs=a)
        >>> [[ 5.  5.  5.]
             [ 5.  5.  5.]]


--- a/Dragon/python/dragon/core/workspace.py
+++ b/Dragon/python/dragon/core/workspace.py
@@ -450,7 +450,7 @@ def FetchTensor(tensor):
    return FetchTensorCC(_stringify_tensor(tensor))


-def FeedTensor(tensor, ndarray, force_cpu=False, dtype=None):
+def FeedTensor(tensor, array, force_cpu=False, dtype=None):
    """Feed the values to the given tensor.

    Parameters
@@ -461,8 +461,8 @@ def FeedTensor(tensor, ndarray, force_cpu=False, dtype=None):
        The values to feed.
    force_cpu : boolean
        Whether force to feed to cpu context.
-    dtype : np.dtype or None
-        The data type. If ``None``, np.float32 will be used instead.
+    dtype : str
+        The data type. If ``None``, ``float32`` will be used instead.

    Returns
    -------
@@ -470,14 +470,14 @@ def FeedTensor(tensor, ndarray, force_cpu=False, dtype=None):

    Examples
    --------
-    >>> import dragon.core.workspace as ws
-    >>> a = Tensor().Variable()
-    >>> ws.FeedTensor(a, 1)
-    >>> a_value = ws.FetchTensor(a)
+    >>> import dragon as dg
+    >>> a = dg.Tensor().Variable()
+    >>> dg.workspace.FeedTensor(a, 1)
+    >>> a_value = dg.workspace.FetchTensor(a)
    >>> a_value, a_value.dtype
    >>> [ 1.], float32

-    >>> ws.FeedTensor(a, [[1, 2, 3]], dtype=np.float16)
+    >>> dg.workspace.FeedTensor(a, [[1, 2, 3]], dtype='float16')
    >>> a_value = a.get_value()
    >>> a_value, a_value.dtype
    >>> [[ 1.  2.  3.]], float16
@@ -504,24 +504,24 @@ def FeedTensor(tensor, ndarray, force_cpu=False, dtype=None):
            elif option['device'] == 'CPU':
                dev = utils.MakeDeviceOption(0, 0)

-    if not isinstance(ndarray, np.ndarray):
-        if not isinstance(ndarray, list):
-            ndarray = [ndarray]
-        auto_dtype = np.float32 if dtype is None else dtype
+    if not isinstance(array, np.ndarray):
+        if not isinstance(array, list):
+            array = [array]
+        auto_data_type = np.float32 if dtype is None else dtype
    else:
-        auto_dtype = ndarray.dtype if dtype is None else dtype
+        auto_data_type = array.dtype if dtype is None else dtype

    if hasattr(tensor, 'dtype') and tensor.dtype is not None:
        if tensor.dtype not in _DATA_TYPES:
            raise TypeError('Unsupported data types: {}.'.format(tensor.dtype))
-        preset_dtype = _DATA_TYPES[tensor.dtype]
+        preset_data_type = _DATA_TYPES[tensor.dtype]
        if dtype is not None:
-            if dtype != preset_dtype:
+            if dtype != preset_data_type:
                raise TypeError('The preset data type is {}, but force to {}.'.
-                                format(preset_dtype, dtype))
-        auto_dtype = preset_dtype
-    ndarray = np.array(ndarray, dtype=auto_dtype, copy=False)
-    FeedTensorCC(name, ndarray, _stringify_proto(dev))
+                                format(preset_data_type, dtype))
+        auto_data_type = preset_data_type
+    nd_array = np.array(array, dtype=auto_data_type, copy=False)
+    FeedTensorCC(name, nd_array, _stringify_proto(dev))


 stages = {
@@ -729,7 +729,10 @@ def ExportMetaGraph(meta_graph):
        logger.info('Export meta graph into: {}'.format(filepath))


-def Snapshot(tensors, filename, prefix='', suffix='.bin', format='default'):
+def Snapshot(
+        tensors, filename,
+        prefix='', suffix='.bin',
+        format='default'):
    """Snapshot tensors into a binary file.

    Parameters
@@ -751,42 +754,42 @@ def Snapshot(tensors, filename, prefix='', suffix='.bin', format='default'):

    Notes
    -----
-    The full filepath will be:  ``prefix`` + ``filename`` + ``suffix``.
+    The full file path will be:  ``prefix`` + ``filename`` + ``suffix``.

    Available formats: ['default', 'caffe'].

    """
    from dragon.config import logger
-    filepath = prefix + filename + suffix
+    file_path = prefix + filename + suffix
    if mpi.Is_Init():
        if not mpi.AllowSnapshot(): return
-        filepath = filepath + '.rank.{}'.format(mpi.Rank())
+        file_path = file_path + '.rank.{}'.format(mpi.Rank())

-    dir = os.path.split(filepath)[0]
+    dir = os.path.split(file_path)[0]
    if len(dir) > 0 and not os.path.exists(dir): os.makedirs(dir)

    if format == 'default':
-        content = {}
+        state_dict = {}
        for tensor in tensors:
-            content[tensor.name] = FetchTensor(tensor)
-        with open(filepath, 'wb') as f:
-            cPickle.dump(content, f, cPickle.HIGHEST_PROTOCOL)
-        logger.info('Snapshot Model@: ' + filepath)
+            state_dict[tensor.name] = FetchTensor(tensor)
+        with open(file_path, 'wb') as f:
+            cPickle.dump(state_dict, f, cPickle.HIGHEST_PROTOCOL)
+        logger.info('Snapshot Model@: ' + file_path)
        logger.info('Model Format: cPickle')

    elif format is 'caffe':
        names = [tensor.name for tensor in tensors]
-        SnapshotCC(filepath, names, 1)
+        SnapshotCC(file_path, names, 1)

    else: raise TypeError('Unknown binary format: {}'.format(format))


-def Restore(filepath, format='default'):
+def Restore(binary_file, format='default'):
    """Restore tensors from a binary file.

    Parameters
    ----------
-    filepath : str
+    binary_file : str
        The path of binary file.
    format : str
        The format of this binary file.
@@ -801,25 +804,27 @@ def Restore(filepath, format='default'):

    """
    from dragon.config import logger
-    assert os.path.exists(filepath), 'model of path({}) does not exist.'.format(filepath)
+    assert os.path.exists(binary_file), \
+        'Binary file({}) does not exist.'.format(binary_file)
+
    if format == 'default':
        try:
-            content = cPickle.load(open(filepath, 'rb'))
+            state_dict = cPickle.load(open(binary_file, 'rb'))
        except UnicodeDecodeError:
-            content = cPickle.load(open(filepath, 'rb'), encoding='iso-8859-1')
-        logger.info('Restore From Model@: ' + filepath)
+            state_dict = cPickle.load(open(binary_file, 'rb'), encoding='iso-8859-1')
+        logger.info('Restore From Model@: ' + binary_file)
        logger.info('Model Format: cPickle')
-        for key, ndarray in content.items():
-            if not HasTensor(key):
-                logger.info('[Warning]:  Tensor({}) of model does not exist in any Graphs, skip.'.format(key))
+        for k, v in state_dict.items():
+            if not HasTensor(k):
+                logger.info('[Warning]: Tensor({}) does not exist in any Graphs, skip.'.format(k))
            else:
-                logger.info('[Info]: Tensor({}) restored.'.format(key))
-                FeedTensor(key, ndarray)
+                FeedTensor(k, v)
+                logger.info('[Info]: Tensor({}) is restored.'.format(k))

    elif format == 'caffe':
-        # TODO(PhyscalX): caffemodel can't save the tensor name
+        # TODO(PhyscalX): caffe models can't save the tensor name
        # TODO(PhyscalX): we simply use layer_name + @paramX
-        RestoreCC(filepath, 1)
+        RestoreCC(binary_file, 1)

    else:
        raise TypeError('Unknown binary format: {}'.format(format))
\ No newline at end of file
--- a/Dragon/python/dragon/memonger.py
+++ b/Dragon/python/dragon/memonger.py
@@ -63,13 +63,12 @@ def Drop(op_func, *args, **kwargs):

    Examples
    --------
-    >>> from dragon.core.tensor import Tensor
-    >>> import dragon.ops as ops
+    >>> import dragon as dg
    >>> import dragon.memonger as opt
-    >>> data = Tensor().Variable()
-    >>> conv_1 = ops.Conv2d(data, num_output=8)
-    >>> conv_1_bn = opt.Drop(ops.BatchNorm, [conv_1, Tensor().Variable(), Tensor.Variable()])
-    >>> conv_1_relu = opt.Drop(ops.Relu, conv_1_bn)
+    >>> data = dg.Tensor().Variable()
+    >>> conv_1 = dg.Conv2d(data, num_output=8)
+    >>> conv_1_bn = opt.Drop(dg.BatchNorm, [conv_1, dg.Tensor().Variable(), dg.Tensor().Variable()])
+    >>> conv_1_relu = opt.Drop(dg.Relu, conv_1_bn)

    """
    kwargs['mirror_stage'] = True

--- a/Dragon/python/dragon/operators/loss.py
+++ b/Dragon/python/dragon/operators/loss.py
@@ -217,7 +217,52 @@ def L2Loss(inputs, normalization='BATCH_SIZE', **kwargs):
    return output


-def SparseSoftmaxFocalLoss(inputs, axis=1, normalization='VALID', ignore_labels=(),
+def SigmoidFocalLoss(inputs, axis=1, normalization='VALID',
+                     alpha=0.25, gamma=2.0, neg_id=0, **kwargs):
+    """SigmoidFocalLoss with sparse labels. `[Lin et.al, 2017] <https://arxiv.org/abs/1708.02002>`_.
+
+    Parameters
+    ----------
+    inputs : list of Tensor
+        The inputs, represent [input, sparse_labels].
+    axis : int
+        The axis of classes.
+    normalization : str
+        The normalization, ``UNIT``, ``FULL``, ``VALID``, ``BATCH_SIZE`` or ``NONE``.
+    alpha : float
+        The scale factor on the rare class. Default is ``0.25``.
+    gamma : float
+        The exponential decay factor on the easy examples. Default is ``2.0``.
+    neg_id : int
+        The negative id. Default is ``0``.
+
+    Returns
+    -------
+    Tensor
+        The loss.
+
+    Notes
+    -----
+    Setting the normalization to ``UNIT`` will return unreduced losses.
+
+    """
+    CheckInputs(inputs, 2)
+    arguments = ParseArguments(locals())
+
+    output = Tensor.CreateOperator(nout=1, op_type='SigmoidFocalLoss', **arguments)
+
+    if inputs[0].shape is not None:
+        if normalization != 'UNIT': output.shape = [1]
+        elif all(dim is not None for dim in inputs[0].shape):
+            outer_dim = int(np.prod(inputs[0].shape[0 : axis]))
+            inner_dim = int(np.prod(inputs[0].shape[axis + 1 :]))
+            output.shape = [outer_dim * inner_dim]
+        else: output.shape = [None]
+
+    return output
+
+
+def SoftmaxFocalLoss(inputs, axis=1, normalization='VALID', ignore_labels=(),
                     alpha=0.25, gamma=2.0, neg_id=0, **kwargs):
    """SoftmaxFocalLoss with sparse labels. `[Lin et.al, 2017] <https://arxiv.org/abs/1708.02002>`_.

@@ -251,7 +296,7 @@ def SparseSoftmaxFocalLoss(inputs, axis=1, normalization='VALID', ignore_labels=
    CheckInputs(inputs, 2)
    arguments = ParseArguments(locals())

-    output = Tensor.CreateOperator(nout=1, op_type='SparseSoftmaxFocalLoss', **arguments)
+    output = Tensor.CreateOperator(nout=1, op_type='SoftmaxFocalLoss', **arguments)

    if inputs[0].shape is not None:
        if normalization != 'UNIT': output.shape = [1]

--- a/Dragon/python/dragon/ops.py
+++ b/Dragon/python/dragon/ops.py
@@ -76,7 +76,8 @@ SoftmaxCrossEntropy = loss.SoftmaxCrossEntropy
 SmoothL1Loss = loss.SmoothL1Loss
 L1Loss = loss.L1Loss
 L2Loss = loss.L2Loss
-SparseSoftmaxFocalLoss = loss.SparseSoftmaxFocalLoss
+SigmoidFocalLoss = loss.SigmoidFocalLoss
+SoftmaxFocalLoss = loss.SoftmaxFocalLoss
 CTCLoss = loss.CTCLoss

 # arithmetic

--- a/Dragon/python/dragon/version.py
+++ b/Dragon/python/dragon/version.py
@@ -14,7 +14,7 @@ from __future__ import division
 from __future__ import print_function

 version = '0.2.2'
-full_version = '0.2.2.7'
+full_version = '0.2.2.8'
 release = False

 if not release:

--- a/Dragon/python/dragon/vm/caffe/layers/__init__.py
+++ b/Dragon/python/dragon/vm/caffe/layers/__init__.py
@@ -34,6 +34,7 @@ from .loss import SoftmaxWithLossLayer, \
                  SigmoidCrossEntropyLossLayer, \
                  L2LossLayer, \
                  SmoothL1LossLayer, \
+                  SigmoidWithFocalLossLayer, \
                  SoftmaxWithFocalLossLayer

 from .mpi import MPIBroadcastLayer,\

--- a/Dragon/python/dragon/vm/caffe/layers/loss.py
+++ b/Dragon/python/dragon/vm/caffe/layers/loss.py
@@ -138,6 +138,48 @@ class SmoothL1LossLayer(Layer):
        return loss


+class SigmoidWithFocalLossLayer(Layer):
+    """The implementation of ``SigmoidWithFocalLossLayer``.
+
+    Parameters
+    ----------
+    axis : int
+        The axis of softmax. Refer `SoftmaxParameter.axis`_.
+    alpha : float
+        The scale on the rare class. Refer `FocalLossParameter.alpha`_.
+    gamma : float
+        The exponential decay. Refer `FocalLossParameter.gamma`_.
+    neg_id : int
+        The negative id. Refer `FocalLossParameter.neg_id`_.
+    normalization : NormalizationMode
+        The normalization. Refer `LossParameter.normalization`_.
+    normalize : boolean
+        Whether to normalize. Refer `LossParameter.normalize`_.
+
+    """
+    def __init__(self, LayerParameter):
+        super(SigmoidWithFocalLossLayer, self).__init__(LayerParameter)
+        param = LayerParameter.loss_param
+        softmax_param = LayerParameter.softmax_param
+        focal_loss_param = LayerParameter.focal_loss_param
+        norm_mode = {0: 'FULL', 1: 'VALID', 2: 'BATCH_SIZE', 3: 'NONE', 4: 'UNIT'}
+        normalization = 'VALID'
+        if param.HasField('normalize'):
+            if not param.normalize: normalization = 'BATCH_SIZE'
+        else: normalization = norm_mode[param.normalization]
+        self._param = {'axis': softmax_param.axis,
+                       'normalization': normalization,
+                       'alpha': float(focal_loss_param.alpha),
+                       'gamma': float(focal_loss_param.gamma),
+                       'neg_id': focal_loss_param.neg_id}
+
+    def Setup(self, bottom):
+        super(SigmoidWithFocalLossLayer, self).Setup(bottom)
+        loss = ops.SigmoidFocalLoss(bottom, **self._param)
+        if self._loss_weight is not None: loss *= self._loss_weight
+        return loss
+
+
 class SoftmaxWithFocalLossLayer(Layer):
    """The implementation of ``SoftmaxWithFocalLossLayer``.

@@ -176,6 +218,6 @@ class SoftmaxWithFocalLossLayer(Layer):

    def Setup(self, bottom):
        super(SoftmaxWithFocalLossLayer, self).Setup(bottom)
-        loss = ops.SparseSoftmaxFocalLoss(bottom, **self._param)
+        loss = ops.SoftmaxFocalLoss(bottom, **self._param)
        if self._loss_weight is not None: loss *= self._loss_weight
        return loss
\ No newline at end of file
--- a/Dragon/python/dragon/vm/caffe/net.py
+++ b/Dragon/python/dragon/vm/caffe/net.py
@@ -34,12 +34,12 @@ class Net(object):
    especially when extending the modern architectures of `ConvNets`.
    """
    def __init__(self, *args):
-        """Construct a Net by the ``prototxt`` file.
+        """Construct a Net by the ``proto_txt`` file.

        Parameters
        ----------
-        prototxt : str
-            The path of ``.prototxt`` file.
+        proto_txt : str
+            The path of the ``.prototxt`` file.
        model : str
            (Optional) The path of the ``.caffemodel`` file.
        phase : str
@@ -58,22 +58,22 @@ class Net(object):

        References
        ----------
-        `NetInit(prototxt, phase)`_ - Construct a Net.
+        `NetInit(proto_txt, phase)`_ - Construct a Net.

-        `NetInitLoad(prototxt, model, phase)`_ - Construct a Net and load the model.
+        `NetInitLoad(proto_txt, model, phase)`_ - Construct a Net and load the model.

        """
        if len(args) == 2:
            self.NetInit(args[0], args[1])
        else: self.NetInitLoad(args[0], args[1], args[2])

-    def NetInit(self, prototxt, phase='TRAIN'):
-        """Construct a Net by the ``prototxt`` file.
+    def NetInit(self, proto_txt, phase='TRAIN'):
+        """Construct a Net by the ``proto_txt`` file.

        Parameters
        ----------
-        prototxt : str
-            The path of ``.prototxt`` file.
+        proto_txt : str
+            The path of ``proto_txt`` file.
        phase : str
            The phase, ``TRAIN`` or ``TEST``.

@@ -88,11 +88,11 @@ class Net(object):

        """
        self._net = pb.NetParameter()
-        Parse(open(prototxt,'r').read(), self._net)
+        Parse(open(proto_txt,'r').read(), self._net)
        self._phase = phase
        self._layers = []
        if not hasattr(self, '_blobs'): self._blobs = {}
-        self._params = {};
+        self._params = {}
        self._swap_tensors = {}
        self._inputs_to_tensors = {}
        self._costs = []; self._wrts = []
@@ -115,13 +115,13 @@ class Net(object):
            if not self.FilterLayer(layer): continue
            self.CheckBackward(layer)

-    def NetInitLoad(self, prototxt, model, phase='TRAIN'):
-        """Construct a Net by the ``prototxt`` file.
+    def NetInitLoad(self, proto_txt, model, phase='TRAIN'):
+        """Construct a Net by the ``proto_txt`` file.

        Parameters
        ----------
-        prototxt : str
-            The path of ``.prototxt`` file.
+        proto_txt : str
+            The path of ``proto_txt`` file.
        model : str
            (Optional) The path of the ``.caffemodel`` file.
        phase : str
@@ -137,7 +137,7 @@ class Net(object):
        The implementation of `Net_Init_Load(_caffe.cpp, L137)`_.

        """
-        self.NetInit(prototxt, phase)
+        self.NetInit(proto_txt, phase)
        self._model = model  # lazy-loading model

    def FilterLayer(self, LayerParameter):
@@ -518,7 +518,7 @@ class Net(object):
        Examples
        --------
        >>> import dragon.core.workspace as ws
-        >>> ws.Snapshot(net.store_params(), filename='xxx', suffix='.caffeomdel')
+        >>> ws.Snapshot(self.store_params(), filename='xxx', suffix='.caffemodel')

        """
        params = []
@@ -577,8 +577,8 @@ class Net(object):
        --------
        >>> import dragon.ops as ops
        >>> data, label = ops.LMDBData()
-        >>> net.replace(net.blobs['data'].data, data)
-        >>> net.replace(net.blobs['label'].data, label)
+        >>> self.replace(self.blobs['data'].data, data)
+        >>> self.replace(self.blobs['label'].data, label)

        """
        self._swap_tensors[A] = B
@@ -590,7 +590,7 @@ class PartialNet(Net):
    Examples
    --------
    >>> from dragon.core.tensor import Tensor
-    >>> net = PartialNet('xxx.prototxt', 'TEST', **{'blob_name': Tensor().Variable()})
+    >>> net = PartialNet('xxx.proto_txt', 'TEST', **{'blob_name': Tensor().Variable()})

    """
    def __init__(self, *args, **kwargs):

--- a/Dragon/python/dragon/vm/caffe/proto/caffe.proto
+++ b/Dragon/python/dragon/vm/caffe/proto/caffe.proto
@@ -1456,7 +1456,7 @@ message NormalizeParameter {
  // Whether or not scale parameters are shared across channels.
  optional bool channel_shared = 3 [default = true];
  // Epsilon for not dividing by zero while normalizing variance
-  optional float eps = 4 [default = 1e-10];
+  optional float eps = 4 [default = 1e-3];
 }

 message ParallelParameter {

--- a/Dragon/python/dragon/vm/caffe/proto/caffe_pb2.py
+++ b/Dragon/python/dragon/vm/caffe/proto/caffe_pb2.py
@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default()
 DESCRIPTOR = _descriptor.FileDescriptor(
  name='caffe.proto',
  package='caffe',
-  serialized_pb=_b('\n\x0b\x63\x61\x66\x66\x65.proto\x12\x05\x63\x61\x66\x66\x65\"\x1c\n\tBlobShape\x12\x0f\n\x03\x64im\x18\x01 \x03(\x03\x42\x02\x10\x01\"\xcc\x01\n\tBlobProto\x12\x1f\n\x05shape\x18\x07 \x01(\x0b\x32\x10.caffe.BlobShape\x12\x10\n\x04\x64\x61ta\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04\x64iff\x18\x06 \x03(\x02\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_data\x18\x08 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_diff\x18\t \x03(\x01\x42\x02\x10\x01\x12\x0e\n\x03num\x18\x01 \x01(\x05:\x01\x30\x12\x13\n\x08\x63hannels\x18\x02 \x01(\x05:\x01\x30\x12\x11\n\x06height\x18\x03 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\x05:\x01\x30\"2\n\x0f\x42lobProtoVector\x12\x1f\n\x05\x62lobs\x18\x01 \x03(\x0b\x32\x10.caffe.BlobProto\"\x91\x01\n\x05\x44\x61tum\x12\x10\n\x08\x63hannels\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x04 \x01(\x0c\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x12\n\nfloat_data\x18\x06 \x03(\x02\x12\x16\n\x07\x65ncoded\x18\x07 \x01(\x08:\x05\x66\x61lse\x12\x0e\n\x06labels\x18\x08 \x03(\x05\"\x8a\x02\n\x0f\x46illerParameter\x12\x16\n\x04type\x18\x01 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x02 \x01(\x02:\x01\x30\x12\x0e\n\x03min\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03max\x18\x04 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x05 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x06 \x01(\x02:\x01\x31\x12\x12\n\x06sparse\x18\x07 \x01(\x05:\x02-1\x12\x42\n\rvariance_norm\x18\x08 \x01(\x0e\x32#.caffe.FillerParameter.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x41VERAGE\x10\x02\"\x8e\x02\n\x0cNetParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12%\n\x0binput_shape\x18\x08 \x03(\x0b\x32\x10.caffe.BlobShape\x12\x11\n\tinput_dim\x18\x04 \x03(\x05\x12\x1d\n\x0e\x66orce_backward\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x1e\n\x05state\x18\x06 
\x01(\x0b\x32\x0f.caffe.NetState\x12\x19\n\ndebug_info\x18\x07 \x01(\x08:\x05\x66\x61lse\x12$\n\x05layer\x18\x64 \x03(\x0b\x32\x15.caffe.LayerParameter\x12\'\n\x06layers\x18\x02 \x03(\x0b\x32\x17.caffe.V1LayerParameter\"\xc9\n\n\x0fSolverParameter\x12\x0b\n\x03net\x18\x18 \x01(\t\x12&\n\tnet_param\x18\x19 \x01(\x0b\x32\x13.caffe.NetParameter\x12\x11\n\ttrain_net\x18\x01 \x01(\t\x12\x10\n\x08test_net\x18\x02 \x03(\t\x12,\n\x0ftrain_net_param\x18\x15 \x01(\x0b\x32\x13.caffe.NetParameter\x12+\n\x0etest_net_param\x18\x16 \x03(\x0b\x32\x13.caffe.NetParameter\x12$\n\x0btrain_state\x18\x1a \x01(\x0b\x32\x0f.caffe.NetState\x12#\n\ntest_state\x18\x1b \x03(\x0b\x32\x0f.caffe.NetState\x12\x11\n\ttest_iter\x18\x03 \x03(\x05\x12\x18\n\rtest_interval\x18\x04 \x01(\x05:\x01\x30\x12 \n\x11test_compute_loss\x18\x13 \x01(\x08:\x05\x66\x61lse\x12!\n\x13test_initialization\x18  \x01(\x08:\x04true\x12\x0f\n\x07\x62\x61se_lr\x18\x05 \x01(\x02\x12\x10\n\x08stage_lr\x18\x32 \x03(\x02\x12\x12\n\nstage_iter\x18\x33 \x03(\x05\x12\x0f\n\x07\x64isplay\x18\x06 \x01(\x05\x12\x17\n\x0c\x61verage_loss\x18! 
\x01(\x05:\x01\x31\x12\x10\n\x08max_iter\x18\x07 \x01(\x05\x12\x14\n\titer_size\x18$ \x01(\x05:\x01\x31\x12\x11\n\tlr_policy\x18\x08 \x01(\t\x12\r\n\x05gamma\x18\t \x01(\x02\x12\r\n\x05power\x18\n \x01(\x02\x12\x10\n\x08momentum\x18\x0b \x01(\x02\x12\x14\n\x0cweight_decay\x18\x0c \x01(\x02\x12\x1f\n\x13regularization_type\x18\x1d \x01(\t:\x02L2\x12\x10\n\x08stepsize\x18\r \x01(\x05\x12\x11\n\tstepvalue\x18\" \x03(\x05\x12\x1a\n\x0e\x63lip_gradients\x18# \x01(\x02:\x02-1\x12\x13\n\x08snapshot\x18\x0e \x01(\x05:\x01\x30\x12\x17\n\x0fsnapshot_prefix\x18\x0f \x01(\t\x12\x1c\n\rsnapshot_diff\x18\x10 \x01(\x08:\x05\x66\x61lse\x12K\n\x0fsnapshot_format\x18% \x01(\x0e\x32%.caffe.SolverParameter.SnapshotFormat:\x0b\x42INARYPROTO\x12;\n\x0bsolver_mode\x18\x11 \x01(\x0e\x32!.caffe.SolverParameter.SolverMode:\x03GPU\x12\x14\n\tdevice_id\x18\x12 \x01(\x05:\x01\x30\x12\x17\n\x0brandom_seed\x18\x14 \x01(\x03:\x02-1\x12\x11\n\x04type\x18( \x01(\t:\x03SGD\x12\x15\n\x05\x64\x65lta\x18\x1f \x01(\x02:\x06\x31\x65-008\x12\x18\n\tmomentum2\x18\' \x01(\x02:\x05\x30.999\x12\x17\n\trms_decay\x18& \x01(\x02:\x04\x30.99\x12\x19\n\ndebug_info\x18\x17 \x01(\x08:\x05\x66\x61lse\x12\"\n\x14snapshot_after_train\x18\x1c \x01(\x08:\x04true\x12;\n\x0bsolver_type\x18\x1e \x01(\x0e\x32!.caffe.SolverParameter.SolverType:\x03SGD\"+\n\x0eSnapshotFormat\x12\x08\n\x04HDF5\x10\x00\x12\x0f\n\x0b\x42INARYPROTO\x10\x01\"\x1e\n\nSolverMode\x12\x07\n\x03\x43PU\x10\x00\x12\x07\n\x03GPU\x10\x01\"U\n\nSolverType\x12\x07\n\x03SGD\x10\x00\x12\x0c\n\x08NESTEROV\x10\x01\x12\x0b\n\x07\x41\x44\x41GRAD\x10\x02\x12\x0b\n\x07RMSPROP\x10\x03\x12\x0c\n\x08\x41\x44\x41\x44\x45LTA\x10\x04\x12\x08\n\x04\x41\x44\x41M\x10\x05\"l\n\x0bSolverState\x12\x0c\n\x04iter\x18\x01 \x01(\x05\x12\x13\n\x0blearned_net\x18\x02 \x01(\t\x12!\n\x07history\x18\x03 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x17\n\x0c\x63urrent_step\x18\x04 \x01(\x05:\x01\x30\"N\n\x08NetState\x12!\n\x05phase\x18\x01 
\x01(\x0e\x32\x0c.caffe.Phase:\x04TEST\x12\x10\n\x05level\x18\x02 \x01(\x05:\x01\x30\x12\r\n\x05stage\x18\x03 \x03(\t\"\x85\x01\n\x0cNetStateRule\x12\x1b\n\x05phase\x18\x01 \x01(\x0e\x32\x0c.caffe.Phase\x12\x11\n\tmin_level\x18\x02 \x01(\x05\x12\x11\n\tmax_level\x18\x03 \x01(\x05\x12\r\n\x05stage\x18\x04 \x03(\t\x12\x11\n\tnot_stage\x18\x05 \x03(\t\x12\x10\n\x08mpi_rank\x18\x06 \x03(\r\"\xa3\x01\n\tParamSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x31\n\nshare_mode\x18\x02 \x01(\x0e\x32\x1d.caffe.ParamSpec.DimCheckMode\x12\x12\n\x07lr_mult\x18\x03 \x01(\x02:\x01\x31\x12\x15\n\ndecay_mult\x18\x04 \x01(\x02:\x01\x31\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xcb\x19\n\x0eLayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06\x62ottom\x18\x03 \x03(\t\x12\x0b\n\x03top\x18\x04 \x03(\t\x12\x1c\n\x0cmirror_stage\x18\xa2\x01 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x05phase\x18\n \x01(\x0e\x32\x0c.caffe.Phase\x12\x13\n\x0bloss_weight\x18\x05 \x03(\x02\x12\x1f\n\x05param\x18\x06 \x03(\x0b\x32\x10.caffe.ParamSpec\x12\x1f\n\x05\x62lobs\x18\x07 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x16\n\x0epropagate_down\x18\x0b \x03(\x08\x12$\n\x07include\x18\x08 \x03(\x0b\x32\x13.caffe.NetStateRule\x12$\n\x07\x65xclude\x18\t \x03(\x0b\x32\x13.caffe.NetStateRule\x12\x37\n\x0ftransform_param\x18\x64 \x01(\x0b\x32\x1e.caffe.TransformationParameter\x12(\n\nloss_param\x18\x65 \x01(\x0b\x32\x14.caffe.LossParameter\x12\x30\n\x0e\x61\x63\x63uracy_param\x18\x66 \x01(\x0b\x32\x18.caffe.AccuracyParameter\x12,\n\x0c\x61rgmax_param\x18g \x01(\x0b\x32\x16.caffe.ArgMaxParameter\x12\x34\n\x10\x62\x61tch_norm_param\x18\x8b\x01 \x01(\x0b\x32\x19.caffe.BatchNormParameter\x12)\n\nbias_param\x18\x8d\x01 \x01(\x0b\x32\x14.caffe.BiasParameter\x12,\n\x0c\x63oncat_param\x18h \x01(\x0b\x32\x16.caffe.ConcatParameter\x12?\n\x16\x63ontrastive_loss_param\x18i \x01(\x0b\x32\x1f.caffe.ContrastiveLossParameter\x12\x36\n\x11\x63onvolution_param\x18j 
\x01(\x0b\x32\x1b.caffe.ConvolutionParameter\x12)\n\ncrop_param\x18\x90\x01 \x01(\x0b\x32\x14.caffe.CropParameter\x12(\n\ndata_param\x18k \x01(\x0b\x32\x14.caffe.DataParameter\x12.\n\rdropout_param\x18l \x01(\x0b\x32\x17.caffe.DropoutParameter\x12\x33\n\x10\x64ummy_data_param\x18m \x01(\x0b\x32\x19.caffe.DummyDataParameter\x12.\n\reltwise_param\x18n \x01(\x0b\x32\x17.caffe.EltwiseParameter\x12\'\n\telu_param\x18\x8c\x01 \x01(\x0b\x32\x13.caffe.ELUParameter\x12+\n\x0b\x65mbed_param\x18\x89\x01 \x01(\x0b\x32\x15.caffe.EmbedParameter\x12&\n\texp_param\x18o \x01(\x0b\x32\x13.caffe.ExpParameter\x12/\n\rflatten_param\x18\x87\x01 \x01(\x0b\x32\x17.caffe.FlattenParameter\x12\x31\n\x0fhdf5_data_param\x18p \x01(\x0b\x32\x18.caffe.HDF5DataParameter\x12\x35\n\x11hdf5_output_param\x18q \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\x12\x33\n\x10hinge_loss_param\x18r \x01(\x0b\x32\x19.caffe.HingeLossParameter\x12\x33\n\x10image_data_param\x18s \x01(\x0b\x32\x19.caffe.ImageDataParameter\x12\x39\n\x13infogain_loss_param\x18t \x01(\x0b\x32\x1c.caffe.InfogainLossParameter\x12\x39\n\x13inner_product_param\x18u \x01(\x0b\x32\x1c.caffe.InnerProductParameter\x12+\n\x0binput_param\x18\x8f\x01 \x01(\x0b\x32\x15.caffe.InputParameter\x12\'\n\tlog_param\x18\x86\x01 \x01(\x0b\x32\x13.caffe.LogParameter\x12&\n\tlrn_param\x18v \x01(\x0b\x32\x13.caffe.LRNParameter\x12\x35\n\x11memory_data_param\x18w \x01(\x0b\x32\x1a.caffe.MemoryDataParameter\x12&\n\tmvn_param\x18x \x01(\x0b\x32\x13.caffe.MVNParameter\x12\x33\n\x0fparameter_param\x18\x91\x01 \x01(\x0b\x32\x19.caffe.ParameterParameter\x12.\n\rpooling_param\x18y \x01(\x0b\x32\x17.caffe.PoolingParameter\x12*\n\x0bpower_param\x18z \x01(\x0b\x32\x15.caffe.PowerParameter\x12+\n\x0bprelu_param\x18\x83\x01 \x01(\x0b\x32\x15.caffe.PReLUParameter\x12-\n\x0cpython_param\x18\x82\x01 \x01(\x0b\x32\x16.caffe.PythonParameter\x12\x33\n\x0freduction_param\x18\x88\x01 \x01(\x0b\x32\x19.caffe.ReductionParameter\x12(\n\nrelu_param\x18{ 
\x01(\x0b\x32\x14.caffe.ReLUParameter\x12/\n\rreshape_param\x18\x85\x01 \x01(\x0b\x32\x17.caffe.ReshapeParameter\x12+\n\x0bscale_param\x18\x8e\x01 \x01(\x0b\x32\x15.caffe.ScaleParameter\x12.\n\rsigmoid_param\x18| \x01(\x0b\x32\x17.caffe.SigmoidParameter\x12.\n\rsoftmax_param\x18} \x01(\x0b\x32\x17.caffe.SoftmaxParameter\x12\'\n\tspp_param\x18\x84\x01 \x01(\x0b\x32\x13.caffe.SPPParameter\x12*\n\x0bslice_param\x18~ \x01(\x0b\x32\x15.caffe.SliceParameter\x12(\n\ntanh_param\x18\x7f \x01(\x0b\x32\x14.caffe.TanHParameter\x12\x33\n\x0fthreshold_param\x18\x80\x01 \x01(\x0b\x32\x19.caffe.ThresholdParameter\x12)\n\ntile_param\x18\x8a\x01 \x01(\x0b\x32\x14.caffe.TileParameter\x12\x36\n\x11window_data_param\x18\x81\x01 \x01(\x0b\x32\x1a.caffe.WindowDataParameter\x12\x36\n\x11roi_pooling_param\x18\x97\x01 \x01(\x0b\x32\x1a.caffe.ROIPoolingParameter\x12;\n\x14smooth_l1_loss_param\x18\x98\x01 \x01(\x0b\x32\x1c.caffe.SmoothL1LossParameter\x12\'\n\tmpi_param\x18\x99\x01 \x01(\x0b\x32\x13.caffe.MPIParameter\x12/\n\rpermute_param\x18\x9a\x01 \x01(\x0b\x32\x17.caffe.PermuteParameter\x12\x33\n\x0fnormalize_param\x18\x9b\x01 \x01(\x0b\x32\x19.caffe.NormalizeParameter\x12\x31\n\x0eparallel_param\x18\x9d\x01 \x01(\x0b\x32\x18.caffe.ParallelParameter\x12-\n\x0cresize_param\x18\x9e\x01 \x01(\x0b\x32\x16.caffe.ResizeParameter\x12\x36\n\x11\x65xpand_dims_param\x18\x9f\x01 \x01(\x0b\x32\x1a.caffe.ExpandDimsParameter\x12\x31\n\x0eproposal_param\x18\xa0\x01 \x01(\x0b\x32\x18.caffe.ProposalParameter\x12\x38\n\x12\x62\x61tch_renorm_param\x18\xa1\x01 \x01(\x0b\x32\x1b.caffe.BatchRenormParameter\x12\x38\n\x12\x64\x65nse_concat_param\x18\xa3\x01 \x01(\x0b\x32\x1b.caffe.DenseConcatParameter\x12\x34\n\x10\x66ocal_loss_param\x18\xa4\x01 \x01(\x0b\x32\x19.caffe.FocalLossParameter\x12-\n\x0cgather_param\x18\xa5\x01 \x01(\x0b\x32\x16.caffe.GatherParameter\x12\x34\n\x10group_norm_param\x18\xa6\x01 \x01(\x0b\x32\x19.caffe.GroupNormParameter\"\xa7\x02\n\x17TransformationParameter\x12\x10\n\x05scale\x18\x01 
\x01(\x02:\x01\x31\x12\x15\n\x06mirror\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x14\n\tcrop_size\x18\x03 \x01(\r:\x01\x30\x12\x12\n\x07padding\x18\x0b \x01(\r:\x01\x30\x12\x11\n\tmean_file\x18\x04 \x01(\t\x12\x12\n\nmean_value\x18\x05 \x03(\x02\x12\x1a\n\x0b\x66orce_color\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\nforce_gray\x18\x07 \x01(\x08:\x05\x66\x61lse\x12!\n\x12\x63olor_augmentation\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x10min_random_scale\x18\t \x01(\x02:\x01\x31\x12\x1b\n\x10max_random_scale\x18\n \x01(\x02:\x01\x31\"\xf5\x01\n\rLossParameter\x12\x14\n\x0cignore_label\x18\x01 \x01(\x05\x12\x44\n\rnormalization\x18\x03 \x01(\x0e\x32&.caffe.LossParameter.NormalizationMode:\x05VALID\x12\x11\n\tnormalize\x18\x02 \x01(\x08\x1a\'\n\x13\x45xpandDimsParameter\x12\x10\n\x04\x61xis\x18\x01 \x01(\x05:\x02-1\"L\n\x11NormalizationMode\x12\x08\n\x04\x46ULL\x10\x00\x12\t\n\x05VALID\x10\x01\x12\x0e\n\nBATCH_SIZE\x10\x02\x12\x08\n\x04NONE\x10\x03\x12\x08\n\x04UNIT\x10\x04\"L\n\x11\x41\x63\x63uracyParameter\x12\x10\n\x05top_k\x18\x01 \x01(\r:\x01\x31\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x14\n\x0cignore_label\x18\x03 \x01(\x05\"M\n\x0f\x41rgMaxParameter\x12\x1a\n\x0bout_max_val\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x05top_k\x18\x02 \x01(\r:\x01\x31\x12\x0c\n\x04\x61xis\x18\x03 \x01(\x05\"9\n\x0f\x43oncatParameter\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x15\n\nconcat_dim\x18\x01 \x01(\r:\x01\x31\"h\n\x12\x42\x61tchNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\"]\n\rBiasParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12&\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\"L\n\x18\x43ontrastiveLossParameter\x12\x11\n\x06margin\x18\x01 \x01(\x02:\x01\x31\x12\x1d\n\x0elegacy_version\x18\x02 
\x01(\x08:\x05\x66\x61lse\"\xfc\x03\n\x14\x43onvolutionParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x0b\n\x03pad\x18\x03 \x03(\r\x12\x13\n\x0bkernel_size\x18\x04 \x03(\r\x12\x0e\n\x06stride\x18\x06 \x03(\r\x12\x10\n\x08\x64ilation\x18\x12 \x03(\r\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x10\n\x08kernel_h\x18\x0b \x01(\r\x12\x10\n\x08kernel_w\x18\x0c \x01(\r\x12\x10\n\x08stride_h\x18\r \x01(\r\x12\x10\n\x08stride_w\x18\x0e \x01(\r\x12\x10\n\x05group\x18\x05 \x01(\r:\x01\x31\x12-\n\rweight_filler\x18\x07 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x08 \x01(\x0b\x32\x16.caffe.FillerParameter\x12;\n\x06\x65ngine\x18\x0f \x01(\x0e\x32\".caffe.ConvolutionParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x10 \x01(\x05:\x01\x31\x12\x1e\n\x0f\x66orce_nd_im2col\x18\x11 \x01(\x08:\x05\x66\x61lse\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"0\n\rCropParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x32\x12\x0e\n\x06offset\x18\x02 \x03(\r\"\xa4\x02\n\rDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x31\n\x07\x62\x61\x63kend\x18\x08 \x01(\x0e\x32\x17.caffe.DataParameter.DB:\x07LEVELDB\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\"\n\x13\x66orce_encoded_color\x18\t \x01(\x08:\x05\x66\x61lse\x12\x13\n\x08prefetch\x18\n \x01(\r:\x01\x35\"\x1b\n\x02\x44\x42\x12\x0b\n\x07LEVELDB\x10\x00\x12\x08\n\x04LMDB\x10\x01\"I\n\x10\x44ropoutParameter\x12\x1a\n\rdropout_ratio\x18\x01 \x01(\x02:\x03\x30.5\x12\x19\n\x0bscale_train\x18\x02 \x01(\x08:\x04true\"\xa0\x01\n\x12\x44ummyDataParameter\x12+\n\x0b\x64\x61ta_filler\x18\x01 
\x03(\x0b\x32\x16.caffe.FillerParameter\x12\x1f\n\x05shape\x18\x06 \x03(\x0b\x32\x10.caffe.BlobShape\x12\x0b\n\x03num\x18\x02 \x03(\r\x12\x10\n\x08\x63hannels\x18\x03 \x03(\r\x12\x0e\n\x06height\x18\x04 \x03(\r\x12\r\n\x05width\x18\x05 \x03(\r\"\xa5\x01\n\x10\x45ltwiseParameter\x12\x39\n\toperation\x18\x01 \x01(\x0e\x32!.caffe.EltwiseParameter.EltwiseOp:\x03SUM\x12\r\n\x05\x63oeff\x18\x02 \x03(\x02\x12\x1e\n\x10stable_prod_grad\x18\x03 \x01(\x08:\x04true\"\'\n\tEltwiseOp\x12\x08\n\x04PROD\x10\x00\x12\x07\n\x03SUM\x10\x01\x12\x07\n\x03MAX\x10\x02\" \n\x0c\x45LUParameter\x12\x10\n\x05\x61lpha\x18\x01 \x01(\x02:\x01\x31\"\xac\x01\n\x0e\x45mbedParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x11\n\tinput_dim\x18\x02 \x01(\r\x12\x17\n\tbias_term\x18\x03 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x04 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\"D\n\x0c\x45xpParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"9\n\x10\x46lattenParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x14\n\x08\x65nd_axis\x18\x02 \x01(\x05:\x02-1\"O\n\x11HDF5DataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x16\n\x07shuffle\x18\x03 \x01(\x08:\x05\x66\x61lse\"(\n\x13HDF5OutputParameter\x12\x11\n\tfile_name\x18\x01 \x01(\t\"^\n\x12HingeLossParameter\x12\x30\n\x04norm\x18\x01 \x01(\x0e\x32\x1e.caffe.HingeLossParameter.Norm:\x02L1\"\x16\n\x04Norm\x12\x06\n\x02L1\x10\x01\x12\x06\n\x02L2\x10\x02\"\x97\x02\n\x12ImageDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\nbatch_size\x18\x04 \x01(\r:\x01\x31\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x16\n\x07shuffle\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnew_height\x18\t \x01(\r:\x01\x30\x12\x14\n\tnew_width\x18\n \x01(\r:\x01\x30\x12\x16\n\x08is_color\x18\x0b \x01(\x08:\x04true\x12\x10\n\x05scale\x18\x02 
\x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\x0c \x01(\t:\x00\"\'\n\x15InfogainLossParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\"\xcb\x01\n\x15InnerProductParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x04 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x0f\n\x04\x61xis\x18\x05 \x01(\x05:\x01\x31\x12\x18\n\ttranspose\x18\x06 \x01(\x08:\x05\x66\x61lse\"1\n\x0eInputParameter\x12\x1f\n\x05shape\x18\x01 \x03(\x0b\x32\x10.caffe.BlobShape\"D\n\x0cLogParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xb8\x02\n\x0cLRNParameter\x12\x15\n\nlocal_size\x18\x01 \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x02 \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x03 \x01(\x02:\x04\x30.75\x12\x44\n\x0bnorm_region\x18\x04 \x01(\x0e\x32\x1e.caffe.LRNParameter.NormRegion:\x0f\x41\x43ROSS_CHANNELS\x12\x0c\n\x01k\x18\x05 \x01(\x02:\x01\x31\x12\x33\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1a.caffe.LRNParameter.Engine:\x07\x44\x45\x46\x41ULT\"5\n\nNormRegion\x12\x13\n\x0f\x41\x43ROSS_CHANNELS\x10\x00\x12\x12\n\x0eWITHIN_CHANNEL\x10\x01\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xbd\x01\n\x13MemoryDataParameter\x12\x12\n\nbatch_size\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\x12\x0e\n\x06height\x18\x03 \x01(\r\x12\r\n\x05width\x18\x04 \x01(\r\x12;\n\x05\x64type\x18\x05 \x01(\x0e\x32#.caffe.MemoryDataParameter.DataType:\x07\x46LOAT32\"$\n\x08\x44\x61taType\x12\x0b\n\x07\x46LOAT32\x10\x00\x12\x0b\n\x07\x46LOAT16\x10\x01\"e\n\x0cMVNParameter\x12 \n\x12normalize_variance\x18\x01 
\x01(\x08:\x04true\x12\x1e\n\x0f\x61\x63ross_channels\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x13\n\x03\x65ps\x18\x03 \x01(\x02:\x06\x31\x65-009\"5\n\x12ParameterParameter\x12\x1f\n\x05shape\x18\x01 \x01(\x0b\x32\x10.caffe.BlobShape\"\xa2\x03\n\x10PoolingParameter\x12\x35\n\x04pool\x18\x01 \x01(\x0e\x32\".caffe.PoolingParameter.PoolMethod:\x03MAX\x12\x0e\n\x03pad\x18\x04 \x01(\r:\x01\x30\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x02 \x01(\r\x12\x10\n\x08kernel_h\x18\x05 \x01(\r\x12\x10\n\x08kernel_w\x18\x06 \x01(\r\x12\x11\n\x06stride\x18\x03 \x01(\r:\x01\x31\x12\x10\n\x08stride_h\x18\x07 \x01(\r\x12\x10\n\x08stride_w\x18\x08 \x01(\r\x12\x37\n\x06\x65ngine\x18\x0b \x01(\x0e\x32\x1e.caffe.PoolingParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x1d\n\x0eglobal_pooling\x18\x0c \x01(\x08:\x05\x66\x61lse\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"Y\n\x13ROIPoolingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"F\n\x0ePowerParameter\x12\x10\n\x05power\x18\x01 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"g\n\x0fPythonParameter\x12\x0e\n\x06module\x18\x01 \x01(\t\x12\r\n\x05layer\x18\x02 \x01(\t\x12\x13\n\tparam_str\x18\x03 \x01(\t:\x00\x12 \n\x11share_in_parallel\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xad\x01\n\x12ReductionParameter\x12=\n\toperation\x18\x01 \x01(\x0e\x32%.caffe.ReductionParameter.ReductionOp:\x03SUM\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x10\n\x05\x63oeff\x18\x03 
\x01(\x02:\x01\x31\"5\n\x0bReductionOp\x12\x07\n\x03SUM\x10\x01\x12\x08\n\x04\x41SUM\x10\x02\x12\t\n\x05SUMSQ\x10\x03\x12\x08\n\x04MEAN\x10\x04\"\x8d\x01\n\rReLUParameter\x12\x19\n\x0enegative_slope\x18\x01 \x01(\x02:\x01\x30\x12\x34\n\x06\x65ngine\x18\x02 \x01(\x0e\x32\x1b.caffe.ReLUParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"Z\n\x10ReshapeParameter\x12\x1f\n\x05shape\x18\x01 \x01(\x0b\x32\x10.caffe.BlobShape\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\x08num_axes\x18\x03 \x01(\x05:\x02-1\"\xa5\x01\n\x0eScaleParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12&\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x18\n\tbias_term\x18\x04 \x01(\x08:\x05\x66\x61lse\x12+\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\"x\n\x10SigmoidParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.caffe.SigmoidParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"L\n\x0eSliceParameter\x12\x0f\n\x04\x61xis\x18\x03 \x01(\x05:\x01\x31\x12\x13\n\x0bslice_point\x18\x02 \x03(\r\x12\x14\n\tslice_dim\x18\x01 \x01(\r:\x01\x31\"\x89\x01\n\x10SoftmaxParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.caffe.SoftmaxParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"r\n\rTanHParameter\x12\x34\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1b.caffe.TanHParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"T\n\rTileParameter\x12\x0f\n\x04\x61xis\x18\x01 
\x01(\x05:\x01\x31\x12\r\n\x05tiles\x18\x02 \x01(\x05\x12#\n\tmultiples\x18\x03 \x01(\x0b\x32\x10.caffe.BlobShape\"*\n\x12ThresholdParameter\x12\x14\n\tthreshold\x18\x01 \x01(\x02:\x01\x30\"\xc1\x02\n\x13WindowDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0c\x66g_threshold\x18\x07 \x01(\x02:\x03\x30.5\x12\x19\n\x0c\x62g_threshold\x18\x08 \x01(\x02:\x03\x30.5\x12\x19\n\x0b\x66g_fraction\x18\t \x01(\x02:\x04\x30.25\x12\x16\n\x0b\x63ontext_pad\x18\n \x01(\r:\x01\x30\x12\x17\n\tcrop_mode\x18\x0b \x01(\t:\x04warp\x12\x1b\n\x0c\x63\x61\x63he_images\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\r \x01(\t:\x00\"\xeb\x01\n\x0cSPPParameter\x12\x16\n\x0epyramid_height\x18\x01 \x01(\r\x12\x31\n\x04pool\x18\x02 \x01(\x0e\x32\x1e.caffe.SPPParameter.PoolMethod:\x03MAX\x12\x33\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1a.caffe.SPPParameter.Engine:\x07\x44\x45\x46\x41ULT\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xe0\x13\n\x10V1LayerParameter\x12\x0e\n\x06\x62ottom\x18\x02 \x03(\t\x12\x0b\n\x03top\x18\x03 \x03(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12$\n\x07include\x18  \x03(\x0b\x32\x13.caffe.NetStateRule\x12$\n\x07\x65xclude\x18! 
\x03(\x0b\x32\x13.caffe.NetStateRule\x12/\n\x04type\x18\x05 \x01(\x0e\x32!.caffe.V1LayerParameter.LayerType\x12\x1f\n\x05\x62lobs\x18\x06 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x0e\n\x05param\x18\xe9\x07 \x03(\t\x12>\n\x0f\x62lob_share_mode\x18\xea\x07 \x03(\x0e\x32$.caffe.V1LayerParameter.DimCheckMode\x12\x10\n\x08\x62lobs_lr\x18\x07 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x08 \x03(\x02\x12\x13\n\x0bloss_weight\x18# \x03(\x02\x12\x30\n\x0e\x61\x63\x63uracy_param\x18\x1b \x01(\x0b\x32\x18.caffe.AccuracyParameter\x12,\n\x0c\x61rgmax_param\x18\x17 \x01(\x0b\x32\x16.caffe.ArgMaxParameter\x12,\n\x0c\x63oncat_param\x18\t \x01(\x0b\x32\x16.caffe.ConcatParameter\x12?\n\x16\x63ontrastive_loss_param\x18( \x01(\x0b\x32\x1f.caffe.ContrastiveLossParameter\x12\x36\n\x11\x63onvolution_param\x18\n \x01(\x0b\x32\x1b.caffe.ConvolutionParameter\x12(\n\ndata_param\x18\x0b \x01(\x0b\x32\x14.caffe.DataParameter\x12.\n\rdropout_param\x18\x0c \x01(\x0b\x32\x17.caffe.DropoutParameter\x12\x33\n\x10\x64ummy_data_param\x18\x1a \x01(\x0b\x32\x19.caffe.DummyDataParameter\x12.\n\reltwise_param\x18\x18 \x01(\x0b\x32\x17.caffe.EltwiseParameter\x12&\n\texp_param\x18) \x01(\x0b\x32\x13.caffe.ExpParameter\x12\x31\n\x0fhdf5_data_param\x18\r \x01(\x0b\x32\x18.caffe.HDF5DataParameter\x12\x35\n\x11hdf5_output_param\x18\x0e \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\x12\x33\n\x10hinge_loss_param\x18\x1d \x01(\x0b\x32\x19.caffe.HingeLossParameter\x12\x33\n\x10image_data_param\x18\x0f \x01(\x0b\x32\x19.caffe.ImageDataParameter\x12\x39\n\x13infogain_loss_param\x18\x10 \x01(\x0b\x32\x1c.caffe.InfogainLossParameter\x12\x39\n\x13inner_product_param\x18\x11 \x01(\x0b\x32\x1c.caffe.InnerProductParameter\x12&\n\tlrn_param\x18\x12 \x01(\x0b\x32\x13.caffe.LRNParameter\x12\x35\n\x11memory_data_param\x18\x16 \x01(\x0b\x32\x1a.caffe.MemoryDataParameter\x12&\n\tmvn_param\x18\" \x01(\x0b\x32\x13.caffe.MVNParameter\x12.\n\rpooling_param\x18\x13 \x01(\x0b\x32\x17.caffe.PoolingParameter\x12*\n\x0bpower_param\x18\x15 
\x01(\x0b\x32\x15.caffe.PowerParameter\x12(\n\nrelu_param\x18\x1e \x01(\x0b\x32\x14.caffe.ReLUParameter\x12.\n\rsigmoid_param\x18& \x01(\x0b\x32\x17.caffe.SigmoidParameter\x12.\n\rsoftmax_param\x18\' \x01(\x0b\x32\x17.caffe.SoftmaxParameter\x12*\n\x0bslice_param\x18\x1f \x01(\x0b\x32\x15.caffe.SliceParameter\x12(\n\ntanh_param\x18% \x01(\x0b\x32\x14.caffe.TanHParameter\x12\x32\n\x0fthreshold_param\x18\x19 \x01(\x0b\x32\x19.caffe.ThresholdParameter\x12\x35\n\x11window_data_param\x18\x14 \x01(\x0b\x32\x1a.caffe.WindowDataParameter\x12\x37\n\x0ftransform_param\x18$ \x01(\x0b\x32\x1e.caffe.TransformationParameter\x12(\n\nloss_param\x18* \x01(\x0b\x32\x14.caffe.LossParameter\x12&\n\x05layer\x18\x01 \x01(\x0b\x32\x17.caffe.V0LayerParameter\"\xd8\x04\n\tLayerType\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06\x41\x42SVAL\x10#\x12\x0c\n\x08\x41\x43\x43URACY\x10\x01\x12\n\n\x06\x41RGMAX\x10\x1e\x12\x08\n\x04\x42NLL\x10\x02\x12\n\n\x06\x43ONCAT\x10\x03\x12\x14\n\x10\x43ONTRASTIVE_LOSS\x10%\x12\x0f\n\x0b\x43ONVOLUTION\x10\x04\x12\x08\n\x04\x44\x41TA\x10\x05\x12\x11\n\rDECONVOLUTION\x10\'\x12\x0b\n\x07\x44ROPOUT\x10\x06\x12\x0e\n\nDUMMY_DATA\x10 
\x12\x12\n\x0e\x45UCLIDEAN_LOSS\x10\x07\x12\x0b\n\x07\x45LTWISE\x10\x19\x12\x07\n\x03\x45XP\x10&\x12\x0b\n\x07\x46LATTEN\x10\x08\x12\r\n\tHDF5_DATA\x10\t\x12\x0f\n\x0bHDF5_OUTPUT\x10\n\x12\x0e\n\nHINGE_LOSS\x10\x1c\x12\n\n\x06IM2COL\x10\x0b\x12\x0e\n\nIMAGE_DATA\x10\x0c\x12\x11\n\rINFOGAIN_LOSS\x10\r\x12\x11\n\rINNER_PRODUCT\x10\x0e\x12\x07\n\x03LRN\x10\x0f\x12\x0f\n\x0bMEMORY_DATA\x10\x1d\x12\x1d\n\x19MULTINOMIAL_LOGISTIC_LOSS\x10\x10\x12\x07\n\x03MVN\x10\"\x12\x0b\n\x07POOLING\x10\x11\x12\t\n\x05POWER\x10\x1a\x12\x08\n\x04RELU\x10\x12\x12\x0b\n\x07SIGMOID\x10\x13\x12\x1e\n\x1aSIGMOID_CROSS_ENTROPY_LOSS\x10\x1b\x12\x0b\n\x07SILENCE\x10$\x12\x0b\n\x07SOFTMAX\x10\x14\x12\x10\n\x0cSOFTMAX_LOSS\x10\x15\x12\t\n\x05SPLIT\x10\x16\x12\t\n\x05SLICE\x10!\x12\x08\n\x04TANH\x10\x17\x12\x0f\n\x0bWINDOW_DATA\x10\x18\x12\r\n\tTHRESHOLD\x10\x1f\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xfd\x07\n\x10V0LayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x12\n\nnum_output\x18\x03 \x01(\r\x12\x16\n\x08\x62iasterm\x18\x04 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x06 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x0e\n\x03pad\x18\x07 \x01(\r:\x01\x30\x12\x12\n\nkernelsize\x18\x08 \x01(\r\x12\x10\n\x05group\x18\t \x01(\r:\x01\x31\x12\x11\n\x06stride\x18\n \x01(\r:\x01\x31\x12\x35\n\x04pool\x18\x0b \x01(\x0e\x32\".caffe.V0LayerParameter.PoolMethod:\x03MAX\x12\x1a\n\rdropout_ratio\x18\x0c \x01(\x02:\x03\x30.5\x12\x15\n\nlocal_size\x18\r \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x0e \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x0f \x01(\x02:\x04\x30.75\x12\x0c\n\x01k\x18\x16 \x01(\x02:\x01\x31\x12\x0e\n\x06source\x18\x10 \x01(\t\x12\x10\n\x05scale\x18\x11 \x01(\x02:\x01\x31\x12\x10\n\x08meanfile\x18\x12 \x01(\t\x12\x11\n\tbatchsize\x18\x13 \x01(\r\x12\x13\n\x08\x63ropsize\x18\x14 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x15 
\x01(\x08:\x05\x66\x61lse\x12\x1f\n\x05\x62lobs\x18\x32 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x10\n\x08\x62lobs_lr\x18\x33 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x34 \x03(\x02\x12\x14\n\trand_skip\x18\x35 \x01(\r:\x01\x30\x12\x1d\n\x10\x64\x65t_fg_threshold\x18\x36 \x01(\x02:\x03\x30.5\x12\x1d\n\x10\x64\x65t_bg_threshold\x18\x37 \x01(\x02:\x03\x30.5\x12\x1d\n\x0f\x64\x65t_fg_fraction\x18\x38 \x01(\x02:\x04\x30.25\x12\x1a\n\x0f\x64\x65t_context_pad\x18: \x01(\r:\x01\x30\x12\x1b\n\rdet_crop_mode\x18; \x01(\t:\x04warp\x12\x12\n\x07new_num\x18< \x01(\x05:\x01\x30\x12\x17\n\x0cnew_channels\x18= \x01(\x05:\x01\x30\x12\x15\n\nnew_height\x18> \x01(\x05:\x01\x30\x12\x14\n\tnew_width\x18? \x01(\x05:\x01\x30\x12\x1d\n\x0eshuffle_images\x18@ \x01(\x08:\x05\x66\x61lse\x12\x15\n\nconcat_dim\x18\x41 \x01(\r:\x01\x31\x12\x36\n\x11hdf5_output_param\x18\xe9\x07 \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"W\n\x0ePReLUParameter\x12&\n\x06\x66iller\x18\x01 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x1d\n\x0e\x63hannel_shared\x18\x02 \x01(\x08:\x05\x66\x61lse\")\n\x15SmoothL1LossParameter\x12\x10\n\x05sigma\x18\x01 \x01(\x02:\x01\x31\"H\n\x0cMPIParameter\x12\x0f\n\x04root\x18\x01 \x01(\r:\x01\x30\x12\x12\n\x07\x63omm_id\x18\x02 \x01(\x04:\x01\x30\x12\x13\n\x08group_id\x18\x03 \x01(\x04:\x01\x30\"!\n\x10PermuteParameter\x12\r\n\x05order\x18\x01 \x03(\r\"\x93\x01\n\x12NormalizeParameter\x12\x1c\n\x0e\x61\x63ross_spatial\x18\x01 \x01(\x08:\x04true\x12,\n\x0cscale_filler\x18\x02 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x1c\n\x0e\x63hannel_shared\x18\x03 \x01(\x08:\x04true\x12\x13\n\x03\x65ps\x18\x04 \x01(\x02:\x06\x31\x65-010\"d\n\x11ParallelParameter\x12\x1d\n\x0emultiple_nodes\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x16\n\x07shuffle\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x18\n\tpartition\x18\x03 \x01(\x08:\x05\x66\x61lse\"R\n\x0fResizeParameter\x12\x1f\n\x05shape\x18\x01 
\x01(\x0b\x32\x10.caffe.BlobShape\x12\x0e\n\x02\x66x\x18\x02 \x01(\x02:\x02-1\x12\x0e\n\x02\x66y\x18\x03 \x01(\x02:\x02-1\"\'\n\x13\x45xpandDimsParameter\x12\x10\n\x04\x61xis\x18\x01 \x01(\x05:\x02-1\"\x90\x02\n\x11ProposalParameter\x12\x0e\n\x06stride\x18\x01 \x03(\x05\x12\r\n\x05ratio\x18\x02 \x03(\x02\x12\r\n\x05scale\x18\x03 \x03(\x02\x12\x1b\n\rpre_nms_top_n\x18\x04 \x01(\r:\x04\x36\x30\x30\x30\x12\x1b\n\x0epost_nms_top_n\x18\x05 \x01(\r:\x03\x33\x30\x30\x12\x17\n\nnms_thresh\x18\x06 \x01(\x02:\x03\x30.7\x12\x14\n\x08min_size\x18\x07 \x01(\r:\x02\x31\x36\x12\x14\n\tmin_level\x18\x08 \x01(\x05:\x01\x32\x12\x14\n\tmax_level\x18\t \x01(\x05:\x01\x35\x12\x1c\n\x0f\x63\x61nonical_scale\x18\n \x01(\x05:\x03\x32\x32\x34\x12\x1a\n\x0f\x63\x61nonical_level\x18\x0b \x01(\x05:\x01\x34\"\xa6\x01\n\x14\x42\x61tchRenormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\x12\x10\n\x05r_max\x18\x04 \x01(\x02:\x01\x33\x12\x10\n\x05\x64_max\x18\x05 \x01(\x02:\x01\x35\x12\x16\n\x07t_delta\x18\x06 \x01(\x02:\x05\x30.001\"?\n\x14\x44\x65nseConcatParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x16\n\x0bgrowth_rate\x18\x02 \x01(\x05:\x01\x30\"N\n\x12\x46ocalLossParameter\x12\x13\n\x05\x61lpha\x18\x01 \x01(\x02:\x04\x30.25\x12\x10\n\x05gamma\x18\x02 \x01(\x02:\x01\x32\x12\x11\n\x06neg_id\x18\x03 \x01(\x05:\x01\x30\"\"\n\x0fGatherParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x30\"{\n\x12GroupNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\x12\x11\n\x05group\x18\x05 \x01(\r:\x02\x33\x32*\x1c\n\x05Phase\x12\t\n\x05TRAIN\x10\x00\x12\x08\n\x04TEST\x10\x01')
+  serialized_pb=_b('\n\x0b\x63\x61\x66\x66\x65.proto\x12\x05\x63\x61\x66\x66\x65\"\x1c\n\tBlobShape\x12\x0f\n\x03\x64im\x18\x01 \x03(\x03\x42\x02\x10\x01\"\xcc\x01\n\tBlobProto\x12\x1f\n\x05shape\x18\x07 \x01(\x0b\x32\x10.caffe.BlobShape\x12\x10\n\x04\x64\x61ta\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04\x64iff\x18\x06 \x03(\x02\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_data\x18\x08 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_diff\x18\t \x03(\x01\x42\x02\x10\x01\x12\x0e\n\x03num\x18\x01 \x01(\x05:\x01\x30\x12\x13\n\x08\x63hannels\x18\x02 \x01(\x05:\x01\x30\x12\x11\n\x06height\x18\x03 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\x05:\x01\x30\"2\n\x0f\x42lobProtoVector\x12\x1f\n\x05\x62lobs\x18\x01 \x03(\x0b\x32\x10.caffe.BlobProto\"\x91\x01\n\x05\x44\x61tum\x12\x10\n\x08\x63hannels\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x04 \x01(\x0c\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x12\n\nfloat_data\x18\x06 \x03(\x02\x12\x16\n\x07\x65ncoded\x18\x07 \x01(\x08:\x05\x66\x61lse\x12\x0e\n\x06labels\x18\x08 \x03(\x05\"\x8a\x02\n\x0f\x46illerParameter\x12\x16\n\x04type\x18\x01 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x02 \x01(\x02:\x01\x30\x12\x0e\n\x03min\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03max\x18\x04 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x05 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x06 \x01(\x02:\x01\x31\x12\x12\n\x06sparse\x18\x07 \x01(\x05:\x02-1\x12\x42\n\rvariance_norm\x18\x08 \x01(\x0e\x32#.caffe.FillerParameter.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x41VERAGE\x10\x02\"\x8e\x02\n\x0cNetParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12%\n\x0binput_shape\x18\x08 \x03(\x0b\x32\x10.caffe.BlobShape\x12\x11\n\tinput_dim\x18\x04 \x03(\x05\x12\x1d\n\x0e\x66orce_backward\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x1e\n\x05state\x18\x06 
\x01(\x0b\x32\x0f.caffe.NetState\x12\x19\n\ndebug_info\x18\x07 \x01(\x08:\x05\x66\x61lse\x12$\n\x05layer\x18\x64 \x03(\x0b\x32\x15.caffe.LayerParameter\x12\'\n\x06layers\x18\x02 \x03(\x0b\x32\x17.caffe.V1LayerParameter\"\xc9\n\n\x0fSolverParameter\x12\x0b\n\x03net\x18\x18 \x01(\t\x12&\n\tnet_param\x18\x19 \x01(\x0b\x32\x13.caffe.NetParameter\x12\x11\n\ttrain_net\x18\x01 \x01(\t\x12\x10\n\x08test_net\x18\x02 \x03(\t\x12,\n\x0ftrain_net_param\x18\x15 \x01(\x0b\x32\x13.caffe.NetParameter\x12+\n\x0etest_net_param\x18\x16 \x03(\x0b\x32\x13.caffe.NetParameter\x12$\n\x0btrain_state\x18\x1a \x01(\x0b\x32\x0f.caffe.NetState\x12#\n\ntest_state\x18\x1b \x03(\x0b\x32\x0f.caffe.NetState\x12\x11\n\ttest_iter\x18\x03 \x03(\x05\x12\x18\n\rtest_interval\x18\x04 \x01(\x05:\x01\x30\x12 \n\x11test_compute_loss\x18\x13 \x01(\x08:\x05\x66\x61lse\x12!\n\x13test_initialization\x18  \x01(\x08:\x04true\x12\x0f\n\x07\x62\x61se_lr\x18\x05 \x01(\x02\x12\x10\n\x08stage_lr\x18\x32 \x03(\x02\x12\x12\n\nstage_iter\x18\x33 \x03(\x05\x12\x0f\n\x07\x64isplay\x18\x06 \x01(\x05\x12\x17\n\x0c\x61verage_loss\x18! 
\x01(\x05:\x01\x31\x12\x10\n\x08max_iter\x18\x07 \x01(\x05\x12\x14\n\titer_size\x18$ \x01(\x05:\x01\x31\x12\x11\n\tlr_policy\x18\x08 \x01(\t\x12\r\n\x05gamma\x18\t \x01(\x02\x12\r\n\x05power\x18\n \x01(\x02\x12\x10\n\x08momentum\x18\x0b \x01(\x02\x12\x14\n\x0cweight_decay\x18\x0c \x01(\x02\x12\x1f\n\x13regularization_type\x18\x1d \x01(\t:\x02L2\x12\x10\n\x08stepsize\x18\r \x01(\x05\x12\x11\n\tstepvalue\x18\" \x03(\x05\x12\x1a\n\x0e\x63lip_gradients\x18# \x01(\x02:\x02-1\x12\x13\n\x08snapshot\x18\x0e \x01(\x05:\x01\x30\x12\x17\n\x0fsnapshot_prefix\x18\x0f \x01(\t\x12\x1c\n\rsnapshot_diff\x18\x10 \x01(\x08:\x05\x66\x61lse\x12K\n\x0fsnapshot_format\x18% \x01(\x0e\x32%.caffe.SolverParameter.SnapshotFormat:\x0b\x42INARYPROTO\x12;\n\x0bsolver_mode\x18\x11 \x01(\x0e\x32!.caffe.SolverParameter.SolverMode:\x03GPU\x12\x14\n\tdevice_id\x18\x12 \x01(\x05:\x01\x30\x12\x17\n\x0brandom_seed\x18\x14 \x01(\x03:\x02-1\x12\x11\n\x04type\x18( \x01(\t:\x03SGD\x12\x15\n\x05\x64\x65lta\x18\x1f \x01(\x02:\x06\x31\x65-008\x12\x18\n\tmomentum2\x18\' \x01(\x02:\x05\x30.999\x12\x17\n\trms_decay\x18& \x01(\x02:\x04\x30.99\x12\x19\n\ndebug_info\x18\x17 \x01(\x08:\x05\x66\x61lse\x12\"\n\x14snapshot_after_train\x18\x1c \x01(\x08:\x04true\x12;\n\x0bsolver_type\x18\x1e \x01(\x0e\x32!.caffe.SolverParameter.SolverType:\x03SGD\"+\n\x0eSnapshotFormat\x12\x08\n\x04HDF5\x10\x00\x12\x0f\n\x0b\x42INARYPROTO\x10\x01\"\x1e\n\nSolverMode\x12\x07\n\x03\x43PU\x10\x00\x12\x07\n\x03GPU\x10\x01\"U\n\nSolverType\x12\x07\n\x03SGD\x10\x00\x12\x0c\n\x08NESTEROV\x10\x01\x12\x0b\n\x07\x41\x44\x41GRAD\x10\x02\x12\x0b\n\x07RMSPROP\x10\x03\x12\x0c\n\x08\x41\x44\x41\x44\x45LTA\x10\x04\x12\x08\n\x04\x41\x44\x41M\x10\x05\"l\n\x0bSolverState\x12\x0c\n\x04iter\x18\x01 \x01(\x05\x12\x13\n\x0blearned_net\x18\x02 \x01(\t\x12!\n\x07history\x18\x03 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x17\n\x0c\x63urrent_step\x18\x04 \x01(\x05:\x01\x30\"N\n\x08NetState\x12!\n\x05phase\x18\x01 
\x01(\x0e\x32\x0c.caffe.Phase:\x04TEST\x12\x10\n\x05level\x18\x02 \x01(\x05:\x01\x30\x12\r\n\x05stage\x18\x03 \x03(\t\"\x85\x01\n\x0cNetStateRule\x12\x1b\n\x05phase\x18\x01 \x01(\x0e\x32\x0c.caffe.Phase\x12\x11\n\tmin_level\x18\x02 \x01(\x05\x12\x11\n\tmax_level\x18\x03 \x01(\x05\x12\r\n\x05stage\x18\x04 \x03(\t\x12\x11\n\tnot_stage\x18\x05 \x03(\t\x12\x10\n\x08mpi_rank\x18\x06 \x03(\r\"\xa3\x01\n\tParamSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x31\n\nshare_mode\x18\x02 \x01(\x0e\x32\x1d.caffe.ParamSpec.DimCheckMode\x12\x12\n\x07lr_mult\x18\x03 \x01(\x02:\x01\x31\x12\x15\n\ndecay_mult\x18\x04 \x01(\x02:\x01\x31\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xcb\x19\n\x0eLayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06\x62ottom\x18\x03 \x03(\t\x12\x0b\n\x03top\x18\x04 \x03(\t\x12\x1c\n\x0cmirror_stage\x18\xa2\x01 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x05phase\x18\n \x01(\x0e\x32\x0c.caffe.Phase\x12\x13\n\x0bloss_weight\x18\x05 \x03(\x02\x12\x1f\n\x05param\x18\x06 \x03(\x0b\x32\x10.caffe.ParamSpec\x12\x1f\n\x05\x62lobs\x18\x07 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x16\n\x0epropagate_down\x18\x0b \x03(\x08\x12$\n\x07include\x18\x08 \x03(\x0b\x32\x13.caffe.NetStateRule\x12$\n\x07\x65xclude\x18\t \x03(\x0b\x32\x13.caffe.NetStateRule\x12\x37\n\x0ftransform_param\x18\x64 \x01(\x0b\x32\x1e.caffe.TransformationParameter\x12(\n\nloss_param\x18\x65 \x01(\x0b\x32\x14.caffe.LossParameter\x12\x30\n\x0e\x61\x63\x63uracy_param\x18\x66 \x01(\x0b\x32\x18.caffe.AccuracyParameter\x12,\n\x0c\x61rgmax_param\x18g \x01(\x0b\x32\x16.caffe.ArgMaxParameter\x12\x34\n\x10\x62\x61tch_norm_param\x18\x8b\x01 \x01(\x0b\x32\x19.caffe.BatchNormParameter\x12)\n\nbias_param\x18\x8d\x01 \x01(\x0b\x32\x14.caffe.BiasParameter\x12,\n\x0c\x63oncat_param\x18h \x01(\x0b\x32\x16.caffe.ConcatParameter\x12?\n\x16\x63ontrastive_loss_param\x18i \x01(\x0b\x32\x1f.caffe.ContrastiveLossParameter\x12\x36\n\x11\x63onvolution_param\x18j 
\x01(\x0b\x32\x1b.caffe.ConvolutionParameter\x12)\n\ncrop_param\x18\x90\x01 \x01(\x0b\x32\x14.caffe.CropParameter\x12(\n\ndata_param\x18k \x01(\x0b\x32\x14.caffe.DataParameter\x12.\n\rdropout_param\x18l \x01(\x0b\x32\x17.caffe.DropoutParameter\x12\x33\n\x10\x64ummy_data_param\x18m \x01(\x0b\x32\x19.caffe.DummyDataParameter\x12.\n\reltwise_param\x18n \x01(\x0b\x32\x17.caffe.EltwiseParameter\x12\'\n\telu_param\x18\x8c\x01 \x01(\x0b\x32\x13.caffe.ELUParameter\x12+\n\x0b\x65mbed_param\x18\x89\x01 \x01(\x0b\x32\x15.caffe.EmbedParameter\x12&\n\texp_param\x18o \x01(\x0b\x32\x13.caffe.ExpParameter\x12/\n\rflatten_param\x18\x87\x01 \x01(\x0b\x32\x17.caffe.FlattenParameter\x12\x31\n\x0fhdf5_data_param\x18p \x01(\x0b\x32\x18.caffe.HDF5DataParameter\x12\x35\n\x11hdf5_output_param\x18q \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\x12\x33\n\x10hinge_loss_param\x18r \x01(\x0b\x32\x19.caffe.HingeLossParameter\x12\x33\n\x10image_data_param\x18s \x01(\x0b\x32\x19.caffe.ImageDataParameter\x12\x39\n\x13infogain_loss_param\x18t \x01(\x0b\x32\x1c.caffe.InfogainLossParameter\x12\x39\n\x13inner_product_param\x18u \x01(\x0b\x32\x1c.caffe.InnerProductParameter\x12+\n\x0binput_param\x18\x8f\x01 \x01(\x0b\x32\x15.caffe.InputParameter\x12\'\n\tlog_param\x18\x86\x01 \x01(\x0b\x32\x13.caffe.LogParameter\x12&\n\tlrn_param\x18v \x01(\x0b\x32\x13.caffe.LRNParameter\x12\x35\n\x11memory_data_param\x18w \x01(\x0b\x32\x1a.caffe.MemoryDataParameter\x12&\n\tmvn_param\x18x \x01(\x0b\x32\x13.caffe.MVNParameter\x12\x33\n\x0fparameter_param\x18\x91\x01 \x01(\x0b\x32\x19.caffe.ParameterParameter\x12.\n\rpooling_param\x18y \x01(\x0b\x32\x17.caffe.PoolingParameter\x12*\n\x0bpower_param\x18z \x01(\x0b\x32\x15.caffe.PowerParameter\x12+\n\x0bprelu_param\x18\x83\x01 \x01(\x0b\x32\x15.caffe.PReLUParameter\x12-\n\x0cpython_param\x18\x82\x01 \x01(\x0b\x32\x16.caffe.PythonParameter\x12\x33\n\x0freduction_param\x18\x88\x01 \x01(\x0b\x32\x19.caffe.ReductionParameter\x12(\n\nrelu_param\x18{ 
\x01(\x0b\x32\x14.caffe.ReLUParameter\x12/\n\rreshape_param\x18\x85\x01 \x01(\x0b\x32\x17.caffe.ReshapeParameter\x12+\n\x0bscale_param\x18\x8e\x01 \x01(\x0b\x32\x15.caffe.ScaleParameter\x12.\n\rsigmoid_param\x18| \x01(\x0b\x32\x17.caffe.SigmoidParameter\x12.\n\rsoftmax_param\x18} \x01(\x0b\x32\x17.caffe.SoftmaxParameter\x12\'\n\tspp_param\x18\x84\x01 \x01(\x0b\x32\x13.caffe.SPPParameter\x12*\n\x0bslice_param\x18~ \x01(\x0b\x32\x15.caffe.SliceParameter\x12(\n\ntanh_param\x18\x7f \x01(\x0b\x32\x14.caffe.TanHParameter\x12\x33\n\x0fthreshold_param\x18\x80\x01 \x01(\x0b\x32\x19.caffe.ThresholdParameter\x12)\n\ntile_param\x18\x8a\x01 \x01(\x0b\x32\x14.caffe.TileParameter\x12\x36\n\x11window_data_param\x18\x81\x01 \x01(\x0b\x32\x1a.caffe.WindowDataParameter\x12\x36\n\x11roi_pooling_param\x18\x97\x01 \x01(\x0b\x32\x1a.caffe.ROIPoolingParameter\x12;\n\x14smooth_l1_loss_param\x18\x98\x01 \x01(\x0b\x32\x1c.caffe.SmoothL1LossParameter\x12\'\n\tmpi_param\x18\x99\x01 \x01(\x0b\x32\x13.caffe.MPIParameter\x12/\n\rpermute_param\x18\x9a\x01 \x01(\x0b\x32\x17.caffe.PermuteParameter\x12\x33\n\x0fnormalize_param\x18\x9b\x01 \x01(\x0b\x32\x19.caffe.NormalizeParameter\x12\x31\n\x0eparallel_param\x18\x9d\x01 \x01(\x0b\x32\x18.caffe.ParallelParameter\x12-\n\x0cresize_param\x18\x9e\x01 \x01(\x0b\x32\x16.caffe.ResizeParameter\x12\x36\n\x11\x65xpand_dims_param\x18\x9f\x01 \x01(\x0b\x32\x1a.caffe.ExpandDimsParameter\x12\x31\n\x0eproposal_param\x18\xa0\x01 \x01(\x0b\x32\x18.caffe.ProposalParameter\x12\x38\n\x12\x62\x61tch_renorm_param\x18\xa1\x01 \x01(\x0b\x32\x1b.caffe.BatchRenormParameter\x12\x38\n\x12\x64\x65nse_concat_param\x18\xa3\x01 \x01(\x0b\x32\x1b.caffe.DenseConcatParameter\x12\x34\n\x10\x66ocal_loss_param\x18\xa4\x01 \x01(\x0b\x32\x19.caffe.FocalLossParameter\x12-\n\x0cgather_param\x18\xa5\x01 \x01(\x0b\x32\x16.caffe.GatherParameter\x12\x34\n\x10group_norm_param\x18\xa6\x01 \x01(\x0b\x32\x19.caffe.GroupNormParameter\"\xa7\x02\n\x17TransformationParameter\x12\x10\n\x05scale\x18\x01 
\x01(\x02:\x01\x31\x12\x15\n\x06mirror\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x14\n\tcrop_size\x18\x03 \x01(\r:\x01\x30\x12\x12\n\x07padding\x18\x0b \x01(\r:\x01\x30\x12\x11\n\tmean_file\x18\x04 \x01(\t\x12\x12\n\nmean_value\x18\x05 \x03(\x02\x12\x1a\n\x0b\x66orce_color\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\nforce_gray\x18\x07 \x01(\x08:\x05\x66\x61lse\x12!\n\x12\x63olor_augmentation\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x10min_random_scale\x18\t \x01(\x02:\x01\x31\x12\x1b\n\x10max_random_scale\x18\n \x01(\x02:\x01\x31\"\xf5\x01\n\rLossParameter\x12\x14\n\x0cignore_label\x18\x01 \x01(\x05\x12\x44\n\rnormalization\x18\x03 \x01(\x0e\x32&.caffe.LossParameter.NormalizationMode:\x05VALID\x12\x11\n\tnormalize\x18\x02 \x01(\x08\x1a\'\n\x13\x45xpandDimsParameter\x12\x10\n\x04\x61xis\x18\x01 \x01(\x05:\x02-1\"L\n\x11NormalizationMode\x12\x08\n\x04\x46ULL\x10\x00\x12\t\n\x05VALID\x10\x01\x12\x0e\n\nBATCH_SIZE\x10\x02\x12\x08\n\x04NONE\x10\x03\x12\x08\n\x04UNIT\x10\x04\"L\n\x11\x41\x63\x63uracyParameter\x12\x10\n\x05top_k\x18\x01 \x01(\r:\x01\x31\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x14\n\x0cignore_label\x18\x03 \x01(\x05\"M\n\x0f\x41rgMaxParameter\x12\x1a\n\x0bout_max_val\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x05top_k\x18\x02 \x01(\r:\x01\x31\x12\x0c\n\x04\x61xis\x18\x03 \x01(\x05\"9\n\x0f\x43oncatParameter\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x15\n\nconcat_dim\x18\x01 \x01(\r:\x01\x31\"h\n\x12\x42\x61tchNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\"]\n\rBiasParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12&\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\"L\n\x18\x43ontrastiveLossParameter\x12\x11\n\x06margin\x18\x01 \x01(\x02:\x01\x31\x12\x1d\n\x0elegacy_version\x18\x02 
\x01(\x08:\x05\x66\x61lse\"\xfc\x03\n\x14\x43onvolutionParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x0b\n\x03pad\x18\x03 \x03(\r\x12\x13\n\x0bkernel_size\x18\x04 \x03(\r\x12\x0e\n\x06stride\x18\x06 \x03(\r\x12\x10\n\x08\x64ilation\x18\x12 \x03(\r\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x10\n\x08kernel_h\x18\x0b \x01(\r\x12\x10\n\x08kernel_w\x18\x0c \x01(\r\x12\x10\n\x08stride_h\x18\r \x01(\r\x12\x10\n\x08stride_w\x18\x0e \x01(\r\x12\x10\n\x05group\x18\x05 \x01(\r:\x01\x31\x12-\n\rweight_filler\x18\x07 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x08 \x01(\x0b\x32\x16.caffe.FillerParameter\x12;\n\x06\x65ngine\x18\x0f \x01(\x0e\x32\".caffe.ConvolutionParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x10 \x01(\x05:\x01\x31\x12\x1e\n\x0f\x66orce_nd_im2col\x18\x11 \x01(\x08:\x05\x66\x61lse\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"0\n\rCropParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x32\x12\x0e\n\x06offset\x18\x02 \x03(\r\"\xa4\x02\n\rDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x31\n\x07\x62\x61\x63kend\x18\x08 \x01(\x0e\x32\x17.caffe.DataParameter.DB:\x07LEVELDB\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\"\n\x13\x66orce_encoded_color\x18\t \x01(\x08:\x05\x66\x61lse\x12\x13\n\x08prefetch\x18\n \x01(\r:\x01\x35\"\x1b\n\x02\x44\x42\x12\x0b\n\x07LEVELDB\x10\x00\x12\x08\n\x04LMDB\x10\x01\"I\n\x10\x44ropoutParameter\x12\x1a\n\rdropout_ratio\x18\x01 \x01(\x02:\x03\x30.5\x12\x19\n\x0bscale_train\x18\x02 \x01(\x08:\x04true\"\xa0\x01\n\x12\x44ummyDataParameter\x12+\n\x0b\x64\x61ta_filler\x18\x01 
\x03(\x0b\x32\x16.caffe.FillerParameter\x12\x1f\n\x05shape\x18\x06 \x03(\x0b\x32\x10.caffe.BlobShape\x12\x0b\n\x03num\x18\x02 \x03(\r\x12\x10\n\x08\x63hannels\x18\x03 \x03(\r\x12\x0e\n\x06height\x18\x04 \x03(\r\x12\r\n\x05width\x18\x05 \x03(\r\"\xa5\x01\n\x10\x45ltwiseParameter\x12\x39\n\toperation\x18\x01 \x01(\x0e\x32!.caffe.EltwiseParameter.EltwiseOp:\x03SUM\x12\r\n\x05\x63oeff\x18\x02 \x03(\x02\x12\x1e\n\x10stable_prod_grad\x18\x03 \x01(\x08:\x04true\"\'\n\tEltwiseOp\x12\x08\n\x04PROD\x10\x00\x12\x07\n\x03SUM\x10\x01\x12\x07\n\x03MAX\x10\x02\" \n\x0c\x45LUParameter\x12\x10\n\x05\x61lpha\x18\x01 \x01(\x02:\x01\x31\"\xac\x01\n\x0e\x45mbedParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x11\n\tinput_dim\x18\x02 \x01(\r\x12\x17\n\tbias_term\x18\x03 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x04 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\"D\n\x0c\x45xpParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"9\n\x10\x46lattenParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x14\n\x08\x65nd_axis\x18\x02 \x01(\x05:\x02-1\"O\n\x11HDF5DataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x16\n\x07shuffle\x18\x03 \x01(\x08:\x05\x66\x61lse\"(\n\x13HDF5OutputParameter\x12\x11\n\tfile_name\x18\x01 \x01(\t\"^\n\x12HingeLossParameter\x12\x30\n\x04norm\x18\x01 \x01(\x0e\x32\x1e.caffe.HingeLossParameter.Norm:\x02L1\"\x16\n\x04Norm\x12\x06\n\x02L1\x10\x01\x12\x06\n\x02L2\x10\x02\"\x97\x02\n\x12ImageDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\nbatch_size\x18\x04 \x01(\r:\x01\x31\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x16\n\x07shuffle\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnew_height\x18\t \x01(\r:\x01\x30\x12\x14\n\tnew_width\x18\n \x01(\r:\x01\x30\x12\x16\n\x08is_color\x18\x0b \x01(\x08:\x04true\x12\x10\n\x05scale\x18\x02 
\x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\x0c \x01(\t:\x00\"\'\n\x15InfogainLossParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\"\xcb\x01\n\x15InnerProductParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x04 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x0f\n\x04\x61xis\x18\x05 \x01(\x05:\x01\x31\x12\x18\n\ttranspose\x18\x06 \x01(\x08:\x05\x66\x61lse\"1\n\x0eInputParameter\x12\x1f\n\x05shape\x18\x01 \x03(\x0b\x32\x10.caffe.BlobShape\"D\n\x0cLogParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xb8\x02\n\x0cLRNParameter\x12\x15\n\nlocal_size\x18\x01 \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x02 \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x03 \x01(\x02:\x04\x30.75\x12\x44\n\x0bnorm_region\x18\x04 \x01(\x0e\x32\x1e.caffe.LRNParameter.NormRegion:\x0f\x41\x43ROSS_CHANNELS\x12\x0c\n\x01k\x18\x05 \x01(\x02:\x01\x31\x12\x33\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1a.caffe.LRNParameter.Engine:\x07\x44\x45\x46\x41ULT\"5\n\nNormRegion\x12\x13\n\x0f\x41\x43ROSS_CHANNELS\x10\x00\x12\x12\n\x0eWITHIN_CHANNEL\x10\x01\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xbd\x01\n\x13MemoryDataParameter\x12\x12\n\nbatch_size\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\x12\x0e\n\x06height\x18\x03 \x01(\r\x12\r\n\x05width\x18\x04 \x01(\r\x12;\n\x05\x64type\x18\x05 \x01(\x0e\x32#.caffe.MemoryDataParameter.DataType:\x07\x46LOAT32\"$\n\x08\x44\x61taType\x12\x0b\n\x07\x46LOAT32\x10\x00\x12\x0b\n\x07\x46LOAT16\x10\x01\"e\n\x0cMVNParameter\x12 \n\x12normalize_variance\x18\x01 
\x01(\x08:\x04true\x12\x1e\n\x0f\x61\x63ross_channels\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x13\n\x03\x65ps\x18\x03 \x01(\x02:\x06\x31\x65-009\"5\n\x12ParameterParameter\x12\x1f\n\x05shape\x18\x01 \x01(\x0b\x32\x10.caffe.BlobShape\"\xa2\x03\n\x10PoolingParameter\x12\x35\n\x04pool\x18\x01 \x01(\x0e\x32\".caffe.PoolingParameter.PoolMethod:\x03MAX\x12\x0e\n\x03pad\x18\x04 \x01(\r:\x01\x30\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x02 \x01(\r\x12\x10\n\x08kernel_h\x18\x05 \x01(\r\x12\x10\n\x08kernel_w\x18\x06 \x01(\r\x12\x11\n\x06stride\x18\x03 \x01(\r:\x01\x31\x12\x10\n\x08stride_h\x18\x07 \x01(\r\x12\x10\n\x08stride_w\x18\x08 \x01(\r\x12\x37\n\x06\x65ngine\x18\x0b \x01(\x0e\x32\x1e.caffe.PoolingParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x1d\n\x0eglobal_pooling\x18\x0c \x01(\x08:\x05\x66\x61lse\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"Y\n\x13ROIPoolingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"F\n\x0ePowerParameter\x12\x10\n\x05power\x18\x01 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"g\n\x0fPythonParameter\x12\x0e\n\x06module\x18\x01 \x01(\t\x12\r\n\x05layer\x18\x02 \x01(\t\x12\x13\n\tparam_str\x18\x03 \x01(\t:\x00\x12 \n\x11share_in_parallel\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xad\x01\n\x12ReductionParameter\x12=\n\toperation\x18\x01 \x01(\x0e\x32%.caffe.ReductionParameter.ReductionOp:\x03SUM\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x10\n\x05\x63oeff\x18\x03 
\x01(\x02:\x01\x31\"5\n\x0bReductionOp\x12\x07\n\x03SUM\x10\x01\x12\x08\n\x04\x41SUM\x10\x02\x12\t\n\x05SUMSQ\x10\x03\x12\x08\n\x04MEAN\x10\x04\"\x8d\x01\n\rReLUParameter\x12\x19\n\x0enegative_slope\x18\x01 \x01(\x02:\x01\x30\x12\x34\n\x06\x65ngine\x18\x02 \x01(\x0e\x32\x1b.caffe.ReLUParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"Z\n\x10ReshapeParameter\x12\x1f\n\x05shape\x18\x01 \x01(\x0b\x32\x10.caffe.BlobShape\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\x08num_axes\x18\x03 \x01(\x05:\x02-1\"\xa5\x01\n\x0eScaleParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12&\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x18\n\tbias_term\x18\x04 \x01(\x08:\x05\x66\x61lse\x12+\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\"x\n\x10SigmoidParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.caffe.SigmoidParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"L\n\x0eSliceParameter\x12\x0f\n\x04\x61xis\x18\x03 \x01(\x05:\x01\x31\x12\x13\n\x0bslice_point\x18\x02 \x03(\r\x12\x14\n\tslice_dim\x18\x01 \x01(\r:\x01\x31\"\x89\x01\n\x10SoftmaxParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.caffe.SoftmaxParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"r\n\rTanHParameter\x12\x34\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1b.caffe.TanHParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"T\n\rTileParameter\x12\x0f\n\x04\x61xis\x18\x01 
\x01(\x05:\x01\x31\x12\r\n\x05tiles\x18\x02 \x01(\x05\x12#\n\tmultiples\x18\x03 \x01(\x0b\x32\x10.caffe.BlobShape\"*\n\x12ThresholdParameter\x12\x14\n\tthreshold\x18\x01 \x01(\x02:\x01\x30\"\xc1\x02\n\x13WindowDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0c\x66g_threshold\x18\x07 \x01(\x02:\x03\x30.5\x12\x19\n\x0c\x62g_threshold\x18\x08 \x01(\x02:\x03\x30.5\x12\x19\n\x0b\x66g_fraction\x18\t \x01(\x02:\x04\x30.25\x12\x16\n\x0b\x63ontext_pad\x18\n \x01(\r:\x01\x30\x12\x17\n\tcrop_mode\x18\x0b \x01(\t:\x04warp\x12\x1b\n\x0c\x63\x61\x63he_images\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\r \x01(\t:\x00\"\xeb\x01\n\x0cSPPParameter\x12\x16\n\x0epyramid_height\x18\x01 \x01(\r\x12\x31\n\x04pool\x18\x02 \x01(\x0e\x32\x1e.caffe.SPPParameter.PoolMethod:\x03MAX\x12\x33\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1a.caffe.SPPParameter.Engine:\x07\x44\x45\x46\x41ULT\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xe0\x13\n\x10V1LayerParameter\x12\x0e\n\x06\x62ottom\x18\x02 \x03(\t\x12\x0b\n\x03top\x18\x03 \x03(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12$\n\x07include\x18  \x03(\x0b\x32\x13.caffe.NetStateRule\x12$\n\x07\x65xclude\x18! 
\x03(\x0b\x32\x13.caffe.NetStateRule\x12/\n\x04type\x18\x05 \x01(\x0e\x32!.caffe.V1LayerParameter.LayerType\x12\x1f\n\x05\x62lobs\x18\x06 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x0e\n\x05param\x18\xe9\x07 \x03(\t\x12>\n\x0f\x62lob_share_mode\x18\xea\x07 \x03(\x0e\x32$.caffe.V1LayerParameter.DimCheckMode\x12\x10\n\x08\x62lobs_lr\x18\x07 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x08 \x03(\x02\x12\x13\n\x0bloss_weight\x18# \x03(\x02\x12\x30\n\x0e\x61\x63\x63uracy_param\x18\x1b \x01(\x0b\x32\x18.caffe.AccuracyParameter\x12,\n\x0c\x61rgmax_param\x18\x17 \x01(\x0b\x32\x16.caffe.ArgMaxParameter\x12,\n\x0c\x63oncat_param\x18\t \x01(\x0b\x32\x16.caffe.ConcatParameter\x12?\n\x16\x63ontrastive_loss_param\x18( \x01(\x0b\x32\x1f.caffe.ContrastiveLossParameter\x12\x36\n\x11\x63onvolution_param\x18\n \x01(\x0b\x32\x1b.caffe.ConvolutionParameter\x12(\n\ndata_param\x18\x0b \x01(\x0b\x32\x14.caffe.DataParameter\x12.\n\rdropout_param\x18\x0c \x01(\x0b\x32\x17.caffe.DropoutParameter\x12\x33\n\x10\x64ummy_data_param\x18\x1a \x01(\x0b\x32\x19.caffe.DummyDataParameter\x12.\n\reltwise_param\x18\x18 \x01(\x0b\x32\x17.caffe.EltwiseParameter\x12&\n\texp_param\x18) \x01(\x0b\x32\x13.caffe.ExpParameter\x12\x31\n\x0fhdf5_data_param\x18\r \x01(\x0b\x32\x18.caffe.HDF5DataParameter\x12\x35\n\x11hdf5_output_param\x18\x0e \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\x12\x33\n\x10hinge_loss_param\x18\x1d \x01(\x0b\x32\x19.caffe.HingeLossParameter\x12\x33\n\x10image_data_param\x18\x0f \x01(\x0b\x32\x19.caffe.ImageDataParameter\x12\x39\n\x13infogain_loss_param\x18\x10 \x01(\x0b\x32\x1c.caffe.InfogainLossParameter\x12\x39\n\x13inner_product_param\x18\x11 \x01(\x0b\x32\x1c.caffe.InnerProductParameter\x12&\n\tlrn_param\x18\x12 \x01(\x0b\x32\x13.caffe.LRNParameter\x12\x35\n\x11memory_data_param\x18\x16 \x01(\x0b\x32\x1a.caffe.MemoryDataParameter\x12&\n\tmvn_param\x18\" \x01(\x0b\x32\x13.caffe.MVNParameter\x12.\n\rpooling_param\x18\x13 \x01(\x0b\x32\x17.caffe.PoolingParameter\x12*\n\x0bpower_param\x18\x15 
\x01(\x0b\x32\x15.caffe.PowerParameter\x12(\n\nrelu_param\x18\x1e \x01(\x0b\x32\x14.caffe.ReLUParameter\x12.\n\rsigmoid_param\x18& \x01(\x0b\x32\x17.caffe.SigmoidParameter\x12.\n\rsoftmax_param\x18\' \x01(\x0b\x32\x17.caffe.SoftmaxParameter\x12*\n\x0bslice_param\x18\x1f \x01(\x0b\x32\x15.caffe.SliceParameter\x12(\n\ntanh_param\x18% \x01(\x0b\x32\x14.caffe.TanHParameter\x12\x32\n\x0fthreshold_param\x18\x19 \x01(\x0b\x32\x19.caffe.ThresholdParameter\x12\x35\n\x11window_data_param\x18\x14 \x01(\x0b\x32\x1a.caffe.WindowDataParameter\x12\x37\n\x0ftransform_param\x18$ \x01(\x0b\x32\x1e.caffe.TransformationParameter\x12(\n\nloss_param\x18* \x01(\x0b\x32\x14.caffe.LossParameter\x12&\n\x05layer\x18\x01 \x01(\x0b\x32\x17.caffe.V0LayerParameter\"\xd8\x04\n\tLayerType\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06\x41\x42SVAL\x10#\x12\x0c\n\x08\x41\x43\x43URACY\x10\x01\x12\n\n\x06\x41RGMAX\x10\x1e\x12\x08\n\x04\x42NLL\x10\x02\x12\n\n\x06\x43ONCAT\x10\x03\x12\x14\n\x10\x43ONTRASTIVE_LOSS\x10%\x12\x0f\n\x0b\x43ONVOLUTION\x10\x04\x12\x08\n\x04\x44\x41TA\x10\x05\x12\x11\n\rDECONVOLUTION\x10\'\x12\x0b\n\x07\x44ROPOUT\x10\x06\x12\x0e\n\nDUMMY_DATA\x10 
\x12\x12\n\x0e\x45UCLIDEAN_LOSS\x10\x07\x12\x0b\n\x07\x45LTWISE\x10\x19\x12\x07\n\x03\x45XP\x10&\x12\x0b\n\x07\x46LATTEN\x10\x08\x12\r\n\tHDF5_DATA\x10\t\x12\x0f\n\x0bHDF5_OUTPUT\x10\n\x12\x0e\n\nHINGE_LOSS\x10\x1c\x12\n\n\x06IM2COL\x10\x0b\x12\x0e\n\nIMAGE_DATA\x10\x0c\x12\x11\n\rINFOGAIN_LOSS\x10\r\x12\x11\n\rINNER_PRODUCT\x10\x0e\x12\x07\n\x03LRN\x10\x0f\x12\x0f\n\x0bMEMORY_DATA\x10\x1d\x12\x1d\n\x19MULTINOMIAL_LOGISTIC_LOSS\x10\x10\x12\x07\n\x03MVN\x10\"\x12\x0b\n\x07POOLING\x10\x11\x12\t\n\x05POWER\x10\x1a\x12\x08\n\x04RELU\x10\x12\x12\x0b\n\x07SIGMOID\x10\x13\x12\x1e\n\x1aSIGMOID_CROSS_ENTROPY_LOSS\x10\x1b\x12\x0b\n\x07SILENCE\x10$\x12\x0b\n\x07SOFTMAX\x10\x14\x12\x10\n\x0cSOFTMAX_LOSS\x10\x15\x12\t\n\x05SPLIT\x10\x16\x12\t\n\x05SLICE\x10!\x12\x08\n\x04TANH\x10\x17\x12\x0f\n\x0bWINDOW_DATA\x10\x18\x12\r\n\tTHRESHOLD\x10\x1f\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xfd\x07\n\x10V0LayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x12\n\nnum_output\x18\x03 \x01(\r\x12\x16\n\x08\x62iasterm\x18\x04 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x06 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x0e\n\x03pad\x18\x07 \x01(\r:\x01\x30\x12\x12\n\nkernelsize\x18\x08 \x01(\r\x12\x10\n\x05group\x18\t \x01(\r:\x01\x31\x12\x11\n\x06stride\x18\n \x01(\r:\x01\x31\x12\x35\n\x04pool\x18\x0b \x01(\x0e\x32\".caffe.V0LayerParameter.PoolMethod:\x03MAX\x12\x1a\n\rdropout_ratio\x18\x0c \x01(\x02:\x03\x30.5\x12\x15\n\nlocal_size\x18\r \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x0e \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x0f \x01(\x02:\x04\x30.75\x12\x0c\n\x01k\x18\x16 \x01(\x02:\x01\x31\x12\x0e\n\x06source\x18\x10 \x01(\t\x12\x10\n\x05scale\x18\x11 \x01(\x02:\x01\x31\x12\x10\n\x08meanfile\x18\x12 \x01(\t\x12\x11\n\tbatchsize\x18\x13 \x01(\r\x12\x13\n\x08\x63ropsize\x18\x14 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x15 
\x01(\x08:\x05\x66\x61lse\x12\x1f\n\x05\x62lobs\x18\x32 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x10\n\x08\x62lobs_lr\x18\x33 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x34 \x03(\x02\x12\x14\n\trand_skip\x18\x35 \x01(\r:\x01\x30\x12\x1d\n\x10\x64\x65t_fg_threshold\x18\x36 \x01(\x02:\x03\x30.5\x12\x1d\n\x10\x64\x65t_bg_threshold\x18\x37 \x01(\x02:\x03\x30.5\x12\x1d\n\x0f\x64\x65t_fg_fraction\x18\x38 \x01(\x02:\x04\x30.25\x12\x1a\n\x0f\x64\x65t_context_pad\x18: \x01(\r:\x01\x30\x12\x1b\n\rdet_crop_mode\x18; \x01(\t:\x04warp\x12\x12\n\x07new_num\x18< \x01(\x05:\x01\x30\x12\x17\n\x0cnew_channels\x18= \x01(\x05:\x01\x30\x12\x15\n\nnew_height\x18> \x01(\x05:\x01\x30\x12\x14\n\tnew_width\x18? \x01(\x05:\x01\x30\x12\x1d\n\x0eshuffle_images\x18@ \x01(\x08:\x05\x66\x61lse\x12\x15\n\nconcat_dim\x18\x41 \x01(\r:\x01\x31\x12\x36\n\x11hdf5_output_param\x18\xe9\x07 \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"W\n\x0ePReLUParameter\x12&\n\x06\x66iller\x18\x01 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x1d\n\x0e\x63hannel_shared\x18\x02 \x01(\x08:\x05\x66\x61lse\")\n\x15SmoothL1LossParameter\x12\x10\n\x05sigma\x18\x01 \x01(\x02:\x01\x31\"H\n\x0cMPIParameter\x12\x0f\n\x04root\x18\x01 \x01(\r:\x01\x30\x12\x12\n\x07\x63omm_id\x18\x02 \x01(\x04:\x01\x30\x12\x13\n\x08group_id\x18\x03 \x01(\x04:\x01\x30\"!\n\x10PermuteParameter\x12\r\n\x05order\x18\x01 \x03(\r\"\x92\x01\n\x12NormalizeParameter\x12\x1c\n\x0e\x61\x63ross_spatial\x18\x01 \x01(\x08:\x04true\x12,\n\x0cscale_filler\x18\x02 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x1c\n\x0e\x63hannel_shared\x18\x03 \x01(\x08:\x04true\x12\x12\n\x03\x65ps\x18\x04 \x01(\x02:\x05\x30.001\"d\n\x11ParallelParameter\x12\x1d\n\x0emultiple_nodes\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x16\n\x07shuffle\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x18\n\tpartition\x18\x03 \x01(\x08:\x05\x66\x61lse\"R\n\x0fResizeParameter\x12\x1f\n\x05shape\x18\x01 
\x01(\x0b\x32\x10.caffe.BlobShape\x12\x0e\n\x02\x66x\x18\x02 \x01(\x02:\x02-1\x12\x0e\n\x02\x66y\x18\x03 \x01(\x02:\x02-1\"\'\n\x13\x45xpandDimsParameter\x12\x10\n\x04\x61xis\x18\x01 \x01(\x05:\x02-1\"\x90\x02\n\x11ProposalParameter\x12\x0e\n\x06stride\x18\x01 \x03(\x05\x12\r\n\x05ratio\x18\x02 \x03(\x02\x12\r\n\x05scale\x18\x03 \x03(\x02\x12\x1b\n\rpre_nms_top_n\x18\x04 \x01(\r:\x04\x36\x30\x30\x30\x12\x1b\n\x0epost_nms_top_n\x18\x05 \x01(\r:\x03\x33\x30\x30\x12\x17\n\nnms_thresh\x18\x06 \x01(\x02:\x03\x30.7\x12\x14\n\x08min_size\x18\x07 \x01(\r:\x02\x31\x36\x12\x14\n\tmin_level\x18\x08 \x01(\x05:\x01\x32\x12\x14\n\tmax_level\x18\t \x01(\x05:\x01\x35\x12\x1c\n\x0f\x63\x61nonical_scale\x18\n \x01(\x05:\x03\x32\x32\x34\x12\x1a\n\x0f\x63\x61nonical_level\x18\x0b \x01(\x05:\x01\x34\"\xa6\x01\n\x14\x42\x61tchRenormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\x12\x10\n\x05r_max\x18\x04 \x01(\x02:\x01\x33\x12\x10\n\x05\x64_max\x18\x05 \x01(\x02:\x01\x35\x12\x16\n\x07t_delta\x18\x06 \x01(\x02:\x05\x30.001\"?\n\x14\x44\x65nseConcatParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x16\n\x0bgrowth_rate\x18\x02 \x01(\x05:\x01\x30\"N\n\x12\x46ocalLossParameter\x12\x13\n\x05\x61lpha\x18\x01 \x01(\x02:\x04\x30.25\x12\x10\n\x05gamma\x18\x02 \x01(\x02:\x01\x32\x12\x11\n\x06neg_id\x18\x03 \x01(\x05:\x01\x30\"\"\n\x0fGatherParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x30\"{\n\x12GroupNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\x12\x11\n\x05group\x18\x05 \x01(\r:\x02\x33\x32*\x1c\n\x05Phase\x12\t\n\x05TRAIN\x10\x00\x12\x08\n\x04TEST\x10\x01')
 )
 _sym_db.RegisterFileDescriptor(DESCRIPTOR)

@@ -40,8 +40,8 @@ _PHASE = _descriptor.EnumDescriptor(
  ],
  containing_type=None,
  options=None,
-  serialized_start=17642,
-  serialized_end=17670,
+  serialized_start=17641,
+  serialized_end=17669,
 )
 _sym_db.RegisterEnumDescriptor(_PHASE)

@@ -5492,7 +5492,7 @@ _NORMALIZEPARAMETER = _descriptor.Descriptor(
    _descriptor.FieldDescriptor(
      name='eps', full_name='caffe.NormalizeParameter.eps', index=3,
      number=4, type=2, cpp_type=6, label=1,
-      has_default_value=True, default_value=1e-010,
+      has_default_value=True, default_value=0.001,
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
@@ -5508,7 +5508,7 @@ _NORMALIZEPARAMETER = _descriptor.Descriptor(
  oneofs=[
  ],
  serialized_start=16516,
-  serialized_end=16663,
+  serialized_end=16662,
 )


@@ -5551,8 +5551,8 @@ _PARALLELPARAMETER = _descriptor.Descriptor(
  extension_ranges=[],
  oneofs=[
  ],
-  serialized_start=16665,
-  serialized_end=16765,
+  serialized_start=16664,
+  serialized_end=16764,
 )


@@ -5595,8 +5595,8 @@ _RESIZEPARAMETER = _descriptor.Descriptor(
  extension_ranges=[],
  oneofs=[
  ],
-  serialized_start=16767,
-  serialized_end=16849,
+  serialized_start=16766,
+  serialized_end=16848,
 )


@@ -5725,8 +5725,8 @@ _PROPOSALPARAMETER = _descriptor.Descriptor(
  extension_ranges=[],
  oneofs=[
  ],
-  serialized_start=16893,
-  serialized_end=17165,
+  serialized_start=16892,
+  serialized_end=17164,
 )


@@ -5790,8 +5790,8 @@ _BATCHRENORMPARAMETER = _descriptor.Descriptor(
  extension_ranges=[],
  oneofs=[
  ],
-  serialized_start=17168,
-  serialized_end=17334,
+  serialized_start=17167,
+  serialized_end=17333,
 )


@@ -5827,8 +5827,8 @@ _DENSECONCATPARAMETER = _descriptor.Descriptor(
  extension_ranges=[],
  oneofs=[
  ],
-  serialized_start=17336,
-  serialized_end=17399,
+  serialized_start=17335,
+  serialized_end=17398,
 )


@@ -5871,8 +5871,8 @@ _FOCALLOSSPARAMETER = _descriptor.Descriptor(
  extension_ranges=[],
  oneofs=[
  ],
-  serialized_start=17401,
-  serialized_end=17479,
+  serialized_start=17400,
+  serialized_end=17478,
 )


@@ -5901,8 +5901,8 @@ _GATHERPARAMETER = _descriptor.Descriptor(
  extension_ranges=[],
  oneofs=[
  ],
-  serialized_start=17481,
-  serialized_end=17515,
+  serialized_start=17480,
+  serialized_end=17514,
 )


@@ -5952,8 +5952,8 @@ _GROUPNORMPARAMETER = _descriptor.Descriptor(
  extension_ranges=[],
  oneofs=[
  ],
-  serialized_start=17517,
-  serialized_end=17640,
+  serialized_start=17516,
+  serialized_end=17639,
 )

 _BLOBPROTO.fields_by_name['shape'].message_type = _BLOBSHAPE

--- a/Dragon/python/setup.py
+++ b/Dragon/python/setup.py
@@ -42,7 +42,7 @@ find_modules()


 setup(name = 'dragon',
-      version='0.2.2.7',
+      version='0.2.2.8',
      description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
      url='https://github.com/seetaresearch/Dragon',
      author='Ting Pan',

--- a/Dragon/src/core/graph.cc
+++ b/Dragon/src/core/graph.cc
@@ -56,10 +56,14 @@ void Graph::ForwardShareDyeing(string u, string ancestor) {
    if (renamed_.count(u)) return;
    renamed_[u] = ancestor;
    if (dag_[u].childs.size() == 1) {
-        string op_type = dag_[dag_[u].childs[0]].op_type;
-        auto* schema = OpSchemaRegistry::Schema(op_type);
+        auto& v = dag_[u].childs[0];
+        auto& op_def = dag_[v].op_def;
+        auto* schema = OpSchemaRegistry::Schema(op_def.type());
        if (schema->AllowInplace())
-            ForwardShareDyeing(dag_[u].childs[0], ancestor);
+            for (int i = 0; i < op_def.input_size(); i++)
+                if (op_def.input(i) == u &&
+                        schema->CheckInplace(i, 0))
+                            ForwardShareDyeing(v, ancestor);
    }
 }

@@ -95,7 +99,7 @@ void Graph::BackwardPruneDyeing(string v) {

 GraphDef Graph::Prune(const GraphDef& meta_graph) {
    dag_.clear(); colored_.clear();
-    //  build Graph
+    //  build DAG
    for (int i = 0; i < meta_graph.op_size(); i++) {
        const OperatorDef& op = meta_graph.op(i);
        for (auto& v : op.output()) {
@@ -108,7 +112,7 @@ GraphDef Graph::Prune(const GraphDef& meta_graph) {
                dag_[u].childs.push_back(v);
                dag_[v].op_idx = i;
            }
-            dag_[v].op_type = op.type();
+            dag_[v].op_def = op;
        }
    }

@@ -150,7 +154,7 @@ GraphDef Graph::Prune(const GraphDef& meta_graph) {
    //  check if having feeded tensors
    for (auto& tensor : ws()->GetTensors()) outputs.insert(tensor);
    //  note that we use map to keep topo-order
-    map<int, OperatorDef> ops_final;
+    map<int, OperatorDef> final_sequence;

    for (auto it : selected_op_indices) {
        OperatorDef op_def;
@@ -167,19 +171,56 @@ GraphDef Graph::Prune(const GraphDef& meta_graph) {
            if (!colored_[output]) *op_def.mutable_output(i) = "ignore";
            else outputs.insert(op_def.output(i));
        }
-        ops_final[it].CopyFrom(op_def);
-    }
-
-    //  build the pruned graph
-    GraphDef pruned_graph;
-    pruned_graph.CopyFrom(meta_graph);
-    pruned_graph.clear_op();
-    for (auto it : ops_final) pruned_graph.add_op()->CopyFrom(it.second);
-    return pruned_graph;
+        //  handle handcraft cases
+        if (op_def.type() == "AffineGradient") {
+            //  trigger in-place if not solving dAlpha
+            if (op_def.output(1) == "ignore")
+                *op_def.mutable_input(0) = "ignore";
+        } else if (op_def.type() == "MulGradient" ||
+                   op_def.type() == "RMulGradient") {
+            if (op_def.output(0) == "ignore")
+                *op_def.mutable_input(1) = "ignore";
+            if (op_def.output(1) == "ignore")
+                *op_def.mutable_input(0) = "ignore";
+        } else if (op_def.type() == "DivGradient" ||
+                   op_def.type() == "RDivGradient") {
+            //  dX2 requires both X1 and X2
+            if (op_def.output(1) == "ignore") {
+                *op_def.mutable_input(0) = "ignore";
+                if (op_def.output(0) == "ignore")
+                    *op_def.mutable_input(1) = "ignore";
+            }
+        }
+        //  push into the final sequence
+        final_sequence[it].CopyFrom(op_def);
+    }
+
+    //  done!
+    GraphDef g;
+    g.CopyFrom(meta_graph); g.clear_op();
+    for (auto it : final_sequence)
+        g.add_op()->CopyFrom(it.second);
+    return g;
 }

 GraphDef Graph::Share(const GraphDef& optimized_graph) {
-    renamed_.clear();
+    dag_.clear(); renamed_.clear();
+    //  build DAG
+    for (int i = 0; i < optimized_graph.op_size(); i++) {
+        const OperatorDef& op = optimized_graph.op(i);
+        for (auto& v : op.output()) {
+            vector<string> sp_u;
+            if (!op.input_size()) sp_u.resize(op.output_size(), "");
+            else sp_u.assign(op.input().begin(), op.input().end());
+            for (auto& u : sp_u) {
+                if (u == "ignore") continue;
+                dag_[v].parents.push_back(u);
+                dag_[u].childs.push_back(v);
+                dag_[v].op_idx = i;
+            }
+            dag_[v].op_def = op;
+        }
+    }

    //  forward dyeing to search available tensors that be shared
    for (int i = 0; i < optimized_graph.op_size(); i++) {
@@ -188,28 +229,27 @@ GraphDef Graph::Share(const GraphDef& optimized_graph) {
        for (auto& v : op.output()) ForwardShareDyeing(v, v);
    }

-    GraphDef shared_graph;
-    shared_graph.CopyFrom(optimized_graph);
+    GraphDef g; g.CopyFrom(optimized_graph);

    //  rename to create in-place
    for (int i = 0; i < optimized_graph.op_size(); i++) {
        const OperatorDef& op = optimized_graph.op(i);
        for (int j = 0; j < op.input_size(); j++) {
-            if (renamed_.count(op.input(j))) {
-                *shared_graph.mutable_op(i)->
-                    mutable_input(j) = renamed_[op.input(j)];
-                ws()->CreateRename(op.input(j), renamed_[op.input(j)]);
-            }
+            if (renamed_.count(op.input(j)) &&
+                ws()->SetProxy(op.input(j), renamed_[op.input(j)]))
+                    *g.mutable_op(i)->mutable_input(j)
+                        = renamed_[op.input(j)];
        }
        for (int j = 0; j < op.output_size(); j++) {
-            if (renamed_.count(op.output(j))) {
-                *shared_graph.mutable_op(i)->
-                    mutable_output(j) = renamed_[op.output(j)];
-                ws()->CreateRename(op.output(j), renamed_[op.output(j)]);
+            if (renamed_.count(op.output(j)) &&
+                ws()->SetProxy(op.output(j), renamed_[op.output(j)]))
+                    *g.mutable_op(i)->mutable_output(j)
+                        = renamed_[op.output(j)];
        }
    }
-    }
-    return shared_graph;
+
+    //  done!
+    return g;
 }

 void Graph::ShareGrads(GraphDef& optimized_graph) {

--- a/Dragon/src/operators/activation/cudnn_dropout_op.cc
+++ b/Dragon/src/operators/activation/cudnn_dropout_op.cc
-#include "operators/activation/dropout_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/dropout_op.h"

 #ifdef WITH_CUDNN


--- a/Dragon/src/operators/activation/dropout_op.cc
+++ b/Dragon/src/operators/activation/dropout_op.cc
-#include "operators/activation/dropout_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/dropout_op.h"

 namespace dragon {


--- a/Dragon/src/operators/activation/elu_op.cc
+++ b/Dragon/src/operators/activation/elu_op.cc
-#include "operators/activation/elu_op.h"
 #include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/elu_op.h"

 namespace dragon {


--- a/Dragon/src/operators/activation/prelu_op.cc
+++ b/Dragon/src/operators/activation/prelu_op.cc
-#include "operators/activation/prelu_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
 #include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/prelu_op.h"

 namespace dragon {


--- a/Dragon/src/operators/activation/relu_op.cc
+++ b/Dragon/src/operators/activation/relu_op.cc
-#include "operators/activation/relu_op.h"
 #include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/relu_op.h"

 namespace dragon {


--- a/Dragon/src/operators/activation/selu_op.cc
+++ b/Dragon/src/operators/activation/selu_op.cc
-#include "operators/activation/selu_op.h"
 #include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/selu_op.h"

 namespace dragon {


--- a/Dragon/src/operators/activation/sigmoid_op.cc
+++ b/Dragon/src/operators/activation/sigmoid_op.cc
-#include "operators/activation/sigmoid_op.h"
 #include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/sigmoid_op.h"

 namespace dragon {


--- a/Dragon/src/operators/activation/softmax_op.cc
+++ b/Dragon/src/operators/activation/softmax_op.cc
-#include "operators/activation/softmax_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/softmax_op.h"

 namespace dragon {


--- a/Dragon/src/operators/activation/tanh_op.cc
+++ b/Dragon/src/operators/activation/tanh_op.cc
-#include "operators/activation/tanh_op.h"
 #include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "operators/activation/tanh_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/add_op.cc
+++ b/Dragon/src/operators/arithmetic/add_op.cc
-#include "operators/arithmetic/add_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/fundamental_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void AddOp<Context>::EltwiseRunWithType() {
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Add<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    math::Add<T, Context>(Output(0)->count(), x1, x2, y);
 }

 template <class Context> template <typename T>
 void AddOp<Context>::BroadcastRunWithType(int type) {
    TIndex outer_dim, inner_dim;
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();

    ctx().template Copy<T, Context, Context>(
-        Input(0).count(), Ydata, X1data);
+        Output(0)->count(), y, x1);

    if (type == 0 || type == 1) {
        if (type == 0) {
@@ -34,158 +34,127 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, multiplier, X2data,
-                        1.0, Ydata, &ctx());
-    } 
-    else if (type == 2) {
+                    1.0, multiplier, x2,
+                        1.0, y, &ctx());
+    } else if (type == 2) {
        outer_dim = Input(0).dim(0);
        inner_dim = Input(0).count(1);
        DECLARE_MULTIPLIER(multiplier, inner_dim);
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, X2data, multiplier,
-                        1.0, Ydata, &ctx());
+                    1.0, x2, multiplier,
+                        1.0, y, &ctx());
    }
 }

 template <class Context>
 void AddOp<Context>::RunOnDevice() {
+    DeclareX1X2;
    Output(0)->ReshapeLike(Input(0));

    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims()) 
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1) 
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) && 
-                 Input(1).count(0, Input(1).axis(-1)) == 1)  
-            BroadcastRunWithType<float>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
+        RunByX1X2(float);
    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-            Input(1).count(0, Input(1).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+        RunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(0),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(Add);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(Add);
 #endif
-OPERATOR_SCHEMA(Add).NumInputs(2).NumOutputs(1).Inplace({ { 0, 0 }, { 1, 0 } });
+OPERATOR_SCHEMA(Add)
+    .NumInputs(2).NumOutputs(1)
+    .Inplace({ { 0, 0 } });

 template <class Context> template <typename T>
 void AddGradientOp<Context>::EltwiseRunWithType() {
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();

    if (Output(1)->name() != "ignore") {
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        ctx().template Copy<T, Context, Context>(
-            Output(1)->count(), dX2data, dYdata);
+            Output(1)->count(), dx2, dy);
    }

    if (Output(0)->name() != "ignore") {
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx().template Copy<T, Context, Context>(
-            Output(0)->count(), dX1data, dYdata);
+            Output(0)->count(), dx1, dy);
    }
 }

 template <class Context> template <typename T>
 void AddGradientOp<Context>::BroadcastRunWithType(int type) {
+    DefineX1X2;
    TIndex outer_dim, inner_dim;
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();

    if (Output(1)->name() != "ignore") {
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        if (type == 0 || type == 1) {
            if (type == 0) {
-                outer_dim = Input(-1).count();
+                outer_dim = X1->count();
                inner_dim = 1;
            } else {
-                outer_dim = Input(-1).count(0, Input(-1).axis(-1));
-                inner_dim = Input(-1).dim(-1);
+                outer_dim = X1->count(0, X1->axis(-1));
+                inner_dim = X1->dim(-1);
            }
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemv<T, Context>(
                CblasTrans, outer_dim, inner_dim,
-                    1.0, dYdata, multiplier,
-                        0.0, dX2data, &ctx());
-        }
-        else if (type == 2) {
-            outer_dim = Input(-1).dim(0);
-            inner_dim = Input(-1).count(1);
+                    1.0, dy, multiplier,
+                        0.0, dx2, &ctx());
+        } else if (type == 2) {
+            outer_dim = X1->dim(0);
+            inner_dim = X1->count(1);
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemv<T, Context>(
                CblasNoTrans, outer_dim, inner_dim,
-                    1.0, dYdata, multiplier,
-                        0.0, dX2data, &ctx());
+                    1.0, dy, multiplier,
+                        0.0, dx2, &ctx());
        }
    }

    if (Output(0)->name() != "ignore") {
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx().template Copy<T, Context, Context>(
-            Output(0)->count(), dX1data, dYdata);
+            X1->count(), dx1, dy);
    }
 }

 template <class Context>
 void AddGradientOp<Context>::RunOnDevice() {
-    Output(0)->ReshapeLike(Input(-1));
-    Output(1)->ReshapeLike(Input(0));
-
-    if (XIsType(Input(0), float)) {
-        if (Input(-1).dims() == Input(0).dims()) 
-            EltwiseRunWithType<float>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else if (XIsType(Input(0), float16)) {
-        if (Input(-1).dims() == Input(0).dims()) 
-            EltwiseRunWithType<float16>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+    DefineX1X2;
+    Output(0)->ReshapeLike(*X1);
+    Output(1)->ReshapeLike(*X2);
+
+    if (XIsType(Input(-1), float)) {
+        RunByX1X2(float);
+    } else if (XIsType(Input(-1), float16)) {
+        RunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(-1),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(AddGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(AddGradient);
 #endif
-OPERATOR_SCHEMA(AddGradient).NumInputs(2).NumOutputs(2).Inplace({ { 1, 0 } });
+OPERATOR_SCHEMA(AddGradient).NumInputs(1).NumOutputs(2);

 class GetAddGradient : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetAddGradient);
    vector<OperatorDef> MakeDefs() override {
        return SingleDef(def.type() + "Gradient", "",
-            vector<string> {I(1), GO(0)},
+            vector<string> {GO(0)},
            vector<string> {GI(0), GI(1)});
    }
 };

--- a/Dragon/src/operators/arithmetic/affine_op.cc
+++ b/Dragon/src/operators/arithmetic/affine_op.cc
-#include "operators/arithmetic/affine_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
 #include "utils/op_kernel.h"
+#include "operators/arithmetic/affine_op.h"

 namespace dragon {

@@ -50,7 +50,7 @@ DEPLOY_CPU(Affine);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(Affine);
 #endif
-OPERATOR_SCHEMA(Affine).NumInputs(2, 3).NumOutputs(1);
+OPERATOR_SCHEMA(Affine).NumInputs(2, 3).NumOutputs(1).Inplace({ { 0, 0 } });

 template <class Context> template <typename T>
 void AffineGradientOp<Context>::BiasRunWithType() {
@@ -71,12 +71,12 @@ void AffineGradientOp<Context>::BiasRunWithType() {

 template <class Context> template <typename T>
 void AffineGradientOp<Context>::ScaleRunWithType() {
-    Output(0)->ReshapeLike(Input(0));
+    Output(0)->ReshapeLike(Input(-1));
    Output(1)->ReshapeLike(Input(1));
    DECLARE_MULTIPLIER(multiplier, sum_dim);

    sum_result.Reshape({ outer_dim * scale_dim });
-    bool is_eltwise = (Input(0).count() == Input(1).count());
+    bool is_eltwise = (Input(-1).count() == Input(1).count());
    auto* dYdata = Input(-1).template data<T, Context>();
    auto* Xdata = Input(0).template data<T, Context>();
    auto* dScale = Output(1)->template mutable_data<T, Context>();
@@ -123,7 +123,7 @@ void AffineGradientOp<Context>::ScaleRunWithType() {

 template <class Context> template <typename T>
 void AffineGradientOp<Context>::RunWithType() {
-    Output(0)->ReshapeLike(Input(0));
+    Output(0)->ReshapeLike(Input(-1));

    auto* dYdata = Input(-1).template data<T, Context>();
    auto* Adata = Input(1).template data<T, Context>();
@@ -137,25 +137,25 @@ void AffineGradientOp<Context>::RunWithType() {
 template <class Context>
 void AffineGradientOp<Context>::RunOnDevice() {
    start_axis = axis;
-    if (start_axis < 0) start_axis += (int)Input(0).ndim();
-    if (num_axes == -1) num_axes = (int)Input(0).ndim() - start_axis;
+    if (start_axis < 0) start_axis += (int)Input(-1).ndim();
+    if (num_axes == -1) num_axes = (int)Input(-1).ndim() - start_axis;
    else if (num_axes == 0) num_axes = 1;

-    CHECK_LT(start_axis, (int)Input(0).ndim());
-    CHECK_LE(start_axis + num_axes, (int)Input(0).ndim());
+    CHECK_LT(start_axis, (int)Input(-1).ndim());
+    CHECK_LE(start_axis + num_axes, (int)Input(-1).ndim());

-    outer_dim = Input(0).count(0, start_axis);
-    inner_dim = Input(0).count(start_axis + num_axes);
+    outer_dim = Input(-1).count(0, start_axis);
+    inner_dim = Input(-1).count(start_axis + num_axes);
    scale_dim = Input(1).count();
    sum_dim = std::max(outer_dim, inner_dim);
    dim = scale_dim * inner_dim;

-    if (XIsType(Input(0), float)) {
+    if (XIsType(Input(-1), float)) {
        if (Output(2)->name() != "ignore") BiasRunWithType<float>();
        if (Output(1)->name() != "ignore") ScaleRunWithType<float>();
        if (Output(0)->name() != "ignore") RunWithType<float>();
    } else {
-        LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
+        LOG(FATAL) << DTypeHelper(Input(-1), { "float32" });
    }
 }


--- a/Dragon/src/operators/arithmetic/clip_op.cc
+++ b/Dragon/src/operators/arithmetic/clip_op.cc
-#include "operators/arithmetic/clip_op.h"
+#include "core/workspace.h"
 #include "utils/op_kernel.h"
 #include "utils/math_functions.h"
-#include "core/workspace.h"
+#include "operators/arithmetic/clip_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/cudnn_affine_op.cc
+++ b/Dragon/src/operators/arithmetic/cudnn_affine_op.cc
 #ifdef WITH_CUDNN

-#include "operators/arithmetic/affine_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/affine_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void CuDNNAffineOp<Context>::RunWithType() {
-    this->template ResetDesc<T>();
+    this->template ResetDesc<T>(Input(0));
    const auto& dim_start = Input(0).dims().begin() + start_axis;
    const auto& dim_end = dim_start + num_axes;
    vector<TIndex> param_dims(dim_start, dim_end);
@@ -56,13 +56,13 @@ DEPLOY_CUDNN(Affine);

 template <class Context> template <typename T>
 void CuDNNAffineGradientOp<Context>::RunWithType() {
-    this->template ResetDesc<T>();
-    outer_dim = Input(0).count(0, start_axis);
-    inner_dim = Input(0).count(start_axis + num_axes);
+    this->template ResetDesc<T>(Input(-1));
+    outer_dim = Input(-1).count(0, start_axis);
+    inner_dim = Input(-1).count(start_axis + num_axes);
    scale_dim = Input(1).count();
    sum_dim = std::max(outer_dim, inner_dim);
    dim = scale_dim * inner_dim;
-    Output(0)->ReshapeLike(Input(0));
+    Output(0)->ReshapeLike(Input(-1));

    auto* dYdata = Input(-1).template data<T, Context>();
    auto* Adata = Input(1).template data<T, Context>();
@@ -99,7 +99,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
        Output(2)->ReshapeLike(Input(1));
        auto* dBdata = Output(2)->template mutable_data<T, Context>();
        //  eltwise
-        if (Input(0).count() == Input(1).count()) {
+        if (Input(-1).count() == Input(1).count()) {
            math::Axpy<T, Context>(Output(2)->count(),
                1.f, dYdata, dBdata, &ctx());
        } else {
@@ -212,8 +212,8 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient_v2(

 template <class Context>
 void CuDNNAffineGradientOp<Context>::RunOnDevice() {
-    if (XIsType(Input(0), float)) RunWithType<float>();
-    else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
+    if (XIsType(Input(-1), float)) RunWithType<float>();
+    else LOG(FATAL) << DTypeHelper(Input(-1), { "float32" });
 }

 DEPLOY_CUDNN(AffineGradient);

--- a/Dragon/src/operators/arithmetic/div_op.cc
+++ b/Dragon/src/operators/arithmetic/div_op.cc
-#include "operators/arithmetic/div_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/fundamental_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void DivOp<Context>::EltwiseRunWithType() {
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Div<T, Context>(Input(0).count(), X1data, X2data, Ydata);
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    math::Div<T, Context>(Output(0)->count(), x1, x2, y);
 }

 template <class Context> template <typename T>
 void DivOp<Context>::BroadcastRunWithType(int type) {
    TIndex outer_dim, inner_dim;
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    auto* c = ws()->template caches<T, Context>({
+        Output(0)->count() })[0];

    if (type == 0 || type == 1) {
        if (type == 0) {
@@ -31,175 +33,146 @@ void DivOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, multiplier, X2data,
-                        0.0, Ydata, &ctx());
-        math::Div<T, Context>(Input(0).count(), X1data, Ydata, Ydata);
-    } 
-    else if (type == 2) {
+                    1.0, multiplier, x2,
+                        0.0, c, &ctx());
+        math::Div<T, Context>(Output(0)->count(), x1, c, y);
+    } else if (type == 2) {
        outer_dim = Input(0).dim(0);
        inner_dim = Input(0).count(1);
        DECLARE_MULTIPLIER(multiplier, inner_dim);
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, X2data, multiplier,
-                        0.0, Ydata, &ctx());
-        math::Div<T, Context>(Input(0).count(), X1data, Ydata, Ydata);
+                    1.0, x2, multiplier,
+                        0.0, c, &ctx());
+        math::Div<T, Context>(Output(0)->count(), x1, c, y);
    }
 }

 template <class Context>
 void DivOp<Context>::RunOnDevice() {
+    DeclareX1X2;
    Output(0)->ReshapeLike(Input(0));

    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims()) 
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1) 
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) && 
-                 Input(1).count(0, Input(1).axis(-1)) == 1)  
-            BroadcastRunWithType<float>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
+        RunByX1X2(float);
    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-            Input(1).count(0, Input(1).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+        RunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(0),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(Div);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(Div);
 #endif
-OPERATOR_SCHEMA(Div).NumInputs(2).NumOutputs(1);
+OPERATOR_SCHEMA(Div)
+    .NumInputs(2).NumOutputs(1)
+    .Inplace({ { 0, 0 } });

 template <class Context> template <typename T>
 void DivGradientOp<Context>::EltwiseRunWithType() {
-    auto* dYdata = Input(2).template data<T, Context>();
+    DefineX1X2;
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (Output(1)->name() != "ignore") {
-        auto* X1data = Input(0).template data<T, Context>();
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        math::Mul<T, Context>(Input(-1).count(), dYdata, X1data, dX1data); // dY * X_{1}
-        math::Square<T, Context>(Input(1).count(), X2data, dX2data); // X_{2}^{2}
-        math::Inv<T, Context>(Input(1).count(), -1.0, dX2data, dX2data); // -1 / X_{2}^{2}
-        math::Mul<T, Context>(Input(1).count(), dX1data, dX2data, dX2data);
+        auto* x1 = Input(0).template data<T, Context>();
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
+        math::Mul<T,Context>(X1->count(), dy, x1, c); // dY * X1
+        math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
+        math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2}
+        math::Mul<T, Context>(X2->count(), c, dx2, dx2);
    }
+
    if (Output(0)->name() != "ignore") {
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        math::Div<T, Context>(Input(0).count(), dYdata, X2data, dX1data);
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
+        math::Div<T, Context>(X1->count(), dy, x2, dx1);
    }
 }

 template <class Context> template <typename T>
 void DivGradientOp<Context>::BroadcastRunWithType(int type) {
+    DefineX1X2;
    TIndex outer_dim, inner_dim;
-    auto* dYdata = Input(2).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (type == 0) {
-        outer_dim = Input(0).count();
+        outer_dim = X1->count();
        inner_dim = 1;
    } else if (type == 1) {
-        outer_dim = Input(0).count(0, Input(0).axis(-1));
-        inner_dim = Input(0).dim(-1);
+        outer_dim = X1->count(0, X1->axis(-1));
+        inner_dim = X1->dim(-1);
    } else if (type == 2) {
-        outer_dim = Input(0).dim(0);
-        inner_dim = Input(0).count(1);
+        outer_dim = X1->dim(0);
+        inner_dim = X1->count(1);
    }

    if (Output(1)->name() != "ignore") {
-        auto* X1data = Input(0).template data<T, Context>();
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        auto* Bdata = ws()->template caches<T, Context>({ Input(1).count() })[0];
-        math::Mul<T, Context>(Input(-1).count(), dYdata, X1data, dX1data); // dY * X_{1}
+        auto* x1 = Input(0).template data<T, Context>();
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        auto cs = ws()->template caches<T, Context>(
+            { X1->count(), X2->count() });
+        math::Mul<T, Context>(X1->count(), dy, x1, cs[0]); // dY * X1
+        math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
+        math::Inv<T, Context>(X2->count(), -1.0, dx2, dx2); // -1 / X2^{2}
        if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
-            math::Square<T, Context>(Input(1).count(), X2data, dX2data); // X_{2}^{2}
-            math::Inv<T, Context>(Input(1).count(), -1.0, dX2data, dX2data); // -1 / X_{2}^{2}
            math::Gemv<T, Context>(
                CblasTrans, outer_dim, inner_dim,
-                    1.0, dX1data, multiplier,
-                        0.0, Bdata, &ctx());
-        }
-        else if (type == 2) {
+                    1.0, cs[0], multiplier,
+                        0.0, cs[1], &ctx());
+        } else if (type == 2) {
            DECLARE_MULTIPLIER(multiplier, inner_dim);
-            math::Square<T, Context>(Input(1).count(), X2data, dX2data); // X_{2}^{2}
-            math::Inv<T, Context>(Input(1).count(), -1.0, dX2data, dX2data); // -1 / X_{2}^{2}
            math::Gemv<T, Context>(
                CblasNoTrans, outer_dim, inner_dim,
-                    1.0, dX1data, multiplier,
-                        0.0, Bdata, &ctx());
+                    1.0, cs[0], multiplier,
+                        0.0, cs[1], &ctx());
        }
-        math::Mul<T, Context>(Input(1).count(), Bdata, dX2data, dX2data);
+        math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2);
    }

    if (Output(0)->name() != "ignore") {
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemm<T, Context>(
                CblasNoTrans, CblasNoTrans,
                    outer_dim, inner_dim, 1,
-                        1.0, multiplier, X2data,
-                            0.0, dX1data, &ctx());
+                        1.0, multiplier, x2,
+                            0.0, dx1, &ctx());
        } else if (type == 2) {
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemm<T, Context>(
                CblasNoTrans, CblasNoTrans,
                    outer_dim, inner_dim, 1,
-                        1.0, X2data, multiplier,
-                            0.0, dX1data, &ctx());
+                        1.0, x2, multiplier,
+                            0.0, dx1, &ctx());
        }
-        math::Div<T, Context>(Output(0)->count(), dYdata, dX1data, dX1data);
+        math::Div<T, Context>(X1->count(), dy, dx1, dx1);
    }
 }

 template <class Context>
 void DivGradientOp<Context>::RunOnDevice() {
-    Output(0)->ReshapeLike(Input(0));
-    Output(1)->ReshapeLike(Input(1));
-
-    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-            Input(1).count(0, Input(1).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-            Input(1).count(0, Input(1).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+    DefineX1X2;
+    Output(0)->ReshapeLike(*X1);
+    Output(1)->ReshapeLike(*X2);
+
+    if (XIsType(Input(-1), float)) {
+        RunByX1X2(float);
+    } else if (XIsType(Input(-1), float16)) {
+        RunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(-1),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(DivGradient);

--- a/Dragon/src/operators/arithmetic/dot_op.cc
+++ b/Dragon/src/operators/arithmetic/dot_op.cc
-#include "operators/arithmetic/dot_op.h"
-#include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/dot_op.h"

 namespace dragon {

@@ -99,9 +98,9 @@ void DotGradientOp<Context>::DotRunWithType() {
    auto* dYdata = Input(2).template data<T, CPUContext>();
    auto* dX1data = Output(0)->template mutable_data<T, Context>();
    auto* dX2data = Output(1)->template mutable_data<T, Context>();
-    this->ctx().template Copy<T, Context, Context>(
+    ctx().template Copy<T, Context, Context>(
        Output(0)->count(), dX1data, X2data);
-    this->ctx().template Copy<T, Context, Context>(
+    ctx().template Copy<T, Context, Context>(
        Output(1)->count(), dX2data, X1data);
    math::MulScalar<T, Context>(Output(0)->count(), dYdata[0], dX1data);
    math::MulScalar<T, Context>(Output(1)->count(), dYdata[0], dX2data);

--- a/Dragon/src/operators/arithmetic/eltwise_op.cc
+++ b/Dragon/src/operators/arithmetic/eltwise_op.cc
-#include "operators/arithmetic/eltwise_op.h"
-#include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/eltwise_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/exp_op.cc
+++ b/Dragon/src/operators/arithmetic/exp_op.cc
-#include "operators/arithmetic/exp_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/exp_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/gram_matrix_op.cc
+++ b/Dragon/src/operators/arithmetic/gram_matrix_op.cc
-#include "operators/arithmetic/gram_matrix_op.h"
-#include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/gram_matrix_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/inner_product_op.cc
+++ b/Dragon/src/operators/arithmetic/inner_product_op.cc
-#include "operators/arithmetic/inner_product_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
+#include "operators/arithmetic/inner_product_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/log_op.cc
+++ b/Dragon/src/operators/arithmetic/log_op.cc
-#include "operators/arithmetic/log_op.h"
-#include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/log_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/matmul_op.cc
+++ b/Dragon/src/operators/arithmetic/matmul_op.cc
-#include "operators/arithmetic/matmul_op.h"
-#include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/matmul_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/mul_op.cc
+++ b/Dragon/src/operators/arithmetic/mul_op.cc
-#include "operators/arithmetic/mul_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/fundamental_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void MulOp<Context>::EltwiseRunWithType() {
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Mul<T, Context>(Input(0).count(), X1data, X2data, Ydata);
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
 }

 template <class Context> template <typename T>
 void MulOp<Context>::BroadcastRunWithType(int type) {
    TIndex outer_dim, inner_dim;
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    auto* c = ws()->template caches<T, Context>({ 
+        Output(0)->count() })[0];

    if (type == 0 || type == 1) {
        if (type == 0) {
@@ -31,165 +33,135 @@ void MulOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, multiplier, X2data,
-                        0.0, Ydata, &ctx());
-        math::Mul<T, Context>(Input(0).count(), X1data, Ydata, Ydata);
-    } 
-    else if (type == 2) {
+                    1.0, multiplier, x2,
+                        0.0, c, &ctx());
+        math::Mul<T, Context>(Output(0)->count(), x1, c, y);
+    } else if (type == 2) {
        outer_dim = Input(0).dim(0);
        inner_dim = Input(0).count(1);
        DECLARE_MULTIPLIER(multiplier, inner_dim);
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, X2data, multiplier,
-                        0.0, Ydata, &ctx());
-        math::Mul<T, Context>(Input(0).count(), X1data, Ydata, Ydata);
+                    1.0, x2, multiplier,
+                        0.0, c, &ctx());
+        math::Mul<T, Context>(Output(0)->count(), x1, c, y);
    }
 }

 template <class Context>
 void MulOp<Context>::RunOnDevice() {
+    DeclareX1X2;
    Output(0)->ReshapeLike(Input(0));

    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims()) 
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1) 
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) && 
-                 Input(1).count(0, Input(1).axis(-1)) == 1)  
-            BroadcastRunWithType<float>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
+        RunByX1X2(float);
    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-            Input(1).count(0, Input(1).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+        RunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(0),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(Mul);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(Mul);
 #endif
-OPERATOR_SCHEMA(Mul).NumInputs(2).NumOutputs(1);
+OPERATOR_SCHEMA(Mul)
+    .NumInputs(2).NumOutputs(1)
+    .Inplace({ { 0, 0 } });

 template <class Context> template <typename T>
 void MulGradientOp<Context>::EltwiseRunWithType() {
-    auto* dYdata = Input(2).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (Output(1)->name() != "ignore") {
-        auto* X1data = Input(0).template data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        math::Mul<T, Context>(Input(0).count(), dYdata, X1data, dX2data);
+        auto* x1 = Input(0).template data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
    }
+
    if (Output(0)->name() != "ignore") {
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        math::Mul<T, Context>(Input(0).count(), dYdata, X2data, dX1data);
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
+        math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
    }
 }

 template <class Context> template <typename T>
 void MulGradientOp<Context>::BroadcastRunWithType(int type) {
+    DefineX1X2;
    TIndex outer_dim, inner_dim;
-    auto* dYdata = Input(2).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (type == 0) {
-        outer_dim = Input(0).count();
+        outer_dim = X1->count();
        inner_dim = 1;
    } else if (type == 1) {
-        outer_dim = Input(0).count(0, Input(0).axis(-1));
-        inner_dim = Input(0).dim(-1);
+        outer_dim = X1->count(0, X1->axis(-1));
+        inner_dim = X1->dim(-1);
    } else if (type == 2) {
-        outer_dim = Input(0).dim(0);
-        inner_dim = Input(0).count(1);
+        outer_dim = X1->dim(0);
+        inner_dim = X1->count(1);
    }

    if (Output(1)->name() != "ignore") {
-        auto* X1data = Input(0).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* x1 = Input(0).template data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
+        math::Mul<T, Context>(X1->count(), dy, x1, c);
        if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier,  outer_dim);
-            math::Mul<T, Context>(Input(-1).count(), dYdata, X1data, dX1data);
            math::Gemv<T, Context>(
                CblasTrans, outer_dim, inner_dim,
-                    1.0, dX1data, multiplier,
-                        0.0, dX2data, &ctx());
+                    1.0, c, multiplier,
+                        0.0, dx2, &ctx());
        } else if (type == 2) {
-            outer_dim = Input(0).dim(0);
-            inner_dim = Input(0).count(1);
            DECLARE_MULTIPLIER(multiplier, inner_dim);
-            math::Mul<T, Context>(Input(-1).count(), dYdata, X1data, dX1data);
            math::Gemv<T, Context>(
                CblasNoTrans, outer_dim, inner_dim,
-                    1.0, X1data, multiplier,
-                        0.0, dX2data, &ctx());
+                    1.0, c, multiplier,
+                        0.0, dx2, &ctx());
        }
    }

    if (Output(0)->name() != "ignore") {
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemm<T, Context>(
                CblasNoTrans, CblasNoTrans,
                    outer_dim, inner_dim, 1,
-                        1.0, multiplier, X2data,
-                            0.0, dX1data, &ctx());
+                        1.0, multiplier, x2,
+                            0.0, dx1, &ctx());
        } else if (type == 2) {
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemm<T, Context>(
                CblasNoTrans, CblasNoTrans,
                    outer_dim, inner_dim, 1,
-                        1.0, X2data, multiplier,
-                            0.0, dX1data, &ctx());
+                        1.0, x2, multiplier,
+                            0.0, dx1, &ctx());
        }
-        math::Mul<T, Context>(Output(0)->count(), dYdata, dX1data, dX1data);
+        math::Mul<T, Context>(X1->count(), dy, dx1, dx1);
    }
 }

 template <class Context>
 void MulGradientOp<Context>::RunOnDevice() {
-    Output(0)->ReshapeLike(Input(0));
-    Output(1)->ReshapeLike(Input(1));
-
-    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims()) 
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(1).count(0, Input(1).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-            Input(1).count(0, Input(1).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+    DefineX1X2;
+    Output(0)->ReshapeLike(*X1);
+    Output(1)->ReshapeLike(*X2);
+
+    if (XIsType(Input(-1), float)) {
+        RunByX1X2(float);
+    } else if (XIsType(Input(-1), float16)) {
+        RunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(-1),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(MulGradient);

--- a/Dragon/src/operators/arithmetic/pow_op.cc
+++ b/Dragon/src/operators/arithmetic/pow_op.cc
-#include "operators/arithmetic/pow_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/pow_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/radd_op.cc
+++ b/Dragon/src/operators/arithmetic/radd_op.cc
-#include "operators/arithmetic/add_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/fundamental_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void RAddOp<Context>::EltwiseRunWithType() {
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Add<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    math::Add<T, Context>(Output(0)->count(), x1, x2, y);
 }

 template <class Context> template <typename T>
 void RAddOp<Context>::BroadcastRunWithType(int type) {
    TIndex outer_dim, inner_dim;
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+
    ctx().template Copy<T, Context, Context>(
-        Input(1).count(), Ydata, X2data);
+        Output(0)->count(), y, x2);

    if (type == 0 || type == 1) {
        if (type == 0) {
@@ -33,155 +34,127 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, multiplier, X1data,
-                        1.0, Ydata, &ctx());
-    } 
-    else if (type == 2) {
+                    1.0, multiplier, x1,
+                        1.0, y, &ctx());
+    } else if (type == 2) {
        outer_dim = Input(1).dim(0);
        inner_dim = Input(1).count(1);
        DECLARE_MULTIPLIER(multiplier, inner_dim);
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, X1data, multiplier,
-                        1.0, Ydata, &ctx());
+                    1.0, x1, multiplier,
+                        1.0, y, &ctx());
    }
 }

 template <class Context>
 void RAddOp<Context>::RunOnDevice() {
+    DeclareX1X2;
    Output(0)->ReshapeLike(Input(1));

    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
+        RRunByX1X2(float);
    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+        RRunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(0),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(RAdd);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(RAdd);
 #endif
-OPERATOR_SCHEMA(RAdd).NumInputs(2).NumOutputs(1);
+OPERATOR_SCHEMA(RAdd)
+    .NumInputs(2).NumOutputs(1)
+    .Inplace({ { 1, 0 } });

 template <class Context> template <typename T>
 void RAddGradientOp<Context>::EltwiseRunWithType() {
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (Output(1)->name() != "ignore") {
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        ctx().template Copy<T, Context, Context>(
-            Output(1)->count(), dX2data, dYdata);
+            Output(1)->count(), dx2, dy);
    }
+
    if (Output(0)->name() != "ignore") {
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx().template Copy<T, Context, Context>(
-            Output(0)->count(), dX1data, dYdata);
+            Output(0)->count(), dx1, dy);
    }
 }

 template <class Context> template <typename T>
 void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
+    DefineX1X2;
    TIndex outer_dim, inner_dim;
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();

    if (Output(0)->name() != "ignore") {
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        if (type == 0 || type == 1) {
            if (type == 0) {
-                outer_dim = Input(-1).count();
+                outer_dim = X2->count();
                inner_dim = 1;
            } else {
-                outer_dim = Input(-1).count(0, Input(-1).axis(-1));
-                inner_dim = Input(-1).dim(-1);
+                outer_dim = X2->count(0, X2->axis(-1));
+                inner_dim = X2->dim(-1);
            }
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemv<T, Context>(
                CblasTrans, outer_dim, inner_dim,
-                    1.0, dYdata, multiplier,
-                        0.0, dX1data, &ctx());
-        }
-        else if (type == 2) {
-            outer_dim = Input(-1).dim(0);
-            inner_dim = Input(-1).count(1);
+                    1.0, dy, multiplier,
+                        0.0, dx1, &ctx());
+        } else if (type == 2) {
+            outer_dim = X2->dim(0);
+            inner_dim = X2->count(1);
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemv<T, Context>(
                CblasNoTrans, outer_dim, inner_dim,
-                    1.0, dYdata, multiplier,
-                        0.0, dX1data, &ctx());
+                    1.0, dy, multiplier,
+                        0.0, dx1, &ctx());
        }
    }

    if (Output(1)->name() != "ignore") {
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        ctx().template Copy<T, Context, Context>(Output(1)->count(), dX2data, dYdata);
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        ctx().template Copy<T, Context, Context>(
+            X2->count(), dx2, dy);
    }
 }

 template <class Context>
 void RAddGradientOp<Context>::RunOnDevice() {
-    Output(1)->ReshapeLike(Input(-1));
-    Output(0)->ReshapeLike(Input(0));
-
-    if (XIsType(Input(0), float)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else if (XIsType(Input(0), float16)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-            << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+    DefineX1X2;
+    Output(0)->ReshapeLike(*X1);
+    Output(1)->ReshapeLike(*X2);
+
+    if (XIsType(Input(-1), float)) {
+        RRunByX1X2(float);
+    } else if (XIsType(Input(-1), float16)) {
+        RRunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(-1),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(RAddGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(RAddGradient);
 #endif
-OPERATOR_SCHEMA(RAddGradient).NumInputs(2).NumOutputs(2);
+OPERATOR_SCHEMA(RAddGradient).NumInputs(1).NumOutputs(2);

 class GetRAddGradient : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetRAddGradient);
    vector<OperatorDef> MakeDefs() override {
        return SingleDef(def.type() + "Gradient", "",
-            vector<string> {I(0), GO(0)},
+            vector<string> {GO(0)},
            vector<string> {GI(0), GI(1)});
    }
 };

--- a/Dragon/src/operators/arithmetic/rdiv_op.cc
+++ b/Dragon/src/operators/arithmetic/rdiv_op.cc
-#include "operators/arithmetic/div_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/fundamental_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void RDivOp<Context>::EltwiseRunWithType() {
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Div<T, Context>(Input(0).count(), X1data, X2data, Ydata);
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    math::Div<T, Context>(Output(0)->count(), x1, x2, y);
 }

 template <class Context> template <typename T>
 void RDivOp<Context>::BroadcastRunWithType(int type) {
    TIndex outer_dim, inner_dim;
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    auto* c = ws()->template caches<T, Context>({
+        Output(0)->count() })[0];

    if (type == 0 || type == 1) {
        if (type == 0) {
@@ -31,171 +33,144 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, multiplier, X1data,
-                        0.0, Ydata, &ctx());
-        math::Div<T, Context>(Input(1).count(), Ydata, X2data, Ydata);
-    } 
-    else if (type == 2) {
+                    1.0, multiplier, x1,
+                        0.0, c, &ctx());
+        math::Div<T, Context>(Output(0)->count(), c, x2, y);
+    } else if (type == 2) {
        outer_dim = Input(1).dim(0);
        inner_dim = Input(1).count(1);
        DECLARE_MULTIPLIER(multiplier, inner_dim);
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, X1data, multiplier,
-                        0.0, Ydata, &ctx());
-        math::Div<T, Context>(Input(1).count(), Ydata, X2data, Ydata);
+                    1.0, x1, multiplier,
+                        0.0, c, &ctx());
+        math::Div<T, Context>(Output(0)->count(), c, x2, y);
    }
 }

 template <class Context>
 void RDivOp<Context>::RunOnDevice() {
+    DeclareX1X2;
    Output(0)->ReshapeLike(Input(1));

    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
+        RRunByX1X2(float);
    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+        RRunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(0),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(RDiv);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(RDiv);
 #endif
-OPERATOR_SCHEMA(RDiv).NumInputs(2).NumOutputs(1);
+OPERATOR_SCHEMA(RDiv)
+    .NumInputs(2).NumOutputs(1)
+    .Inplace({ { 1, 0 } });

 template <class Context> template <typename T>
 void RDivGradientOp<Context>::EltwiseRunWithType() {
-    auto* dYdata = Input(-1).template data<T, Context>();
+    DefineX1X2;
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (Output(1)->name() != "ignore") {
-        auto* X1data = Input(0).template data<T, Context>();
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        math::Mul<T, Context>(Input(-1).count(), dYdata, X1data, dX1data); // dY * X_{1}
-        math::Square<T, Context>(Input(1).count(), X2data, dX2data); // X_{2}^{2}
-        math::Inv<T, Context>(Input(1).count(), -1.0, dX2data, dX2data); // -1 / X_{2}^{2}
-        math::Mul<T, Context>(Input(1).count(), dX1data, dX2data, dX2data);
+        auto* x1 = Input(0).template data<T, Context>();
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
+        math::Mul<T, Context>(X1->count(), dy, x1, c); // dY * X1
+        math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
+        math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2}
+        math::Mul<T, Context>(X2->count(), c, dx2, dx2);
    }
+
    if (Output(0)->name() != "ignore") {
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        math::Div<T, Context>(Input(0).count(), dYdata, X2data, dX1data);
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
+        math::Div<T, Context>(X1->count(), dy, x2, dx1);
    }
 }

 template <class Context> template <typename T>
 void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
+    DefineX1X2;
    TIndex outer_dim, inner_dim;
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (type == 0) {
-        outer_dim = Input(-1).count();
+        outer_dim = X2->count();
        inner_dim = 1;
    } else if (type == 1) {
-        outer_dim = Input(-1).count(0, Input(-1).axis(-1));
-        inner_dim = Input(-1).dim(-1);
+        outer_dim = X2->count(0, X2->axis(-1));
+        inner_dim = X2->dim(-1);
    } else if (type == 2) {
-        outer_dim = Input(-1).dim(0);
-        inner_dim = Input(-1).count(1);
+        outer_dim = X2->dim(0);
+        inner_dim = X2->count(1);
    }

    if (Output(0)->name() != "ignore") {
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        math::Div<T, Context>(Input(-1).count(), dYdata, X2data, dX2data); // dY * X_{2}
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
+        auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
+        math::Div<T, Context>(X2->count(), dy, x2, c);
        if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemv<T, Context>(
                CblasTrans, outer_dim, inner_dim,
-                    1.0, dX2data, multiplier,
-                        0.0, dX1data, &ctx());
-        }
-        else if (type == 2) {
+                    1.0, c, multiplier,
+                        0.0, dx1, &ctx());
+        } else if (type == 2) {
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemv<T, Context>(
                CblasNoTrans, outer_dim, inner_dim,
-                    1.0, dX2data, multiplier,
-                        0.0, dX1data, &ctx());
+                    1.0, c, multiplier,
+                        0.0, dx1, &ctx());
        }
    }

    if (Output(1)->name() != "ignore") {
-        auto* X1data = Input(0).template data<T, Context>();
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* x1 = Input(0).template data<T, Context>();
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemm<T, Context>(
                CblasNoTrans, CblasNoTrans,
                    outer_dim, inner_dim, 1,
-                        -1.0, multiplier, X1data,
-                            0.0, dX2data, &ctx());
+                        -1.0, multiplier, x1,
+                            0.0, dx2, &ctx());
        } else if (type == 2) {
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemm<T, Context>(
                CblasNoTrans, CblasNoTrans,
                    outer_dim, inner_dim, 1,
-                        -1.0, X1data, multiplier,
-                            0.0, dX2data, &ctx());
+                        -1.0, x1, multiplier,
+                            0.0, dx2, &ctx());
        }
-        math::Mul<T, Context>(Input(-1).count(), dYdata, dX2data, dX2data); // -dY * X_{1}
-        math::Div<T, Context>(Output(1)->count(), dX2data, X2data, dX2data);
-        math::Div<T, Context>(Output(1)->count(), dX2data, X2data, dX2data);
+        math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
+        math::Div<T, Context>(X2->count(), dx2, x2, dx2);
+        math::Div<T, Context>(X2->count(), dx2, x2, dx2);
    }
 }

 template <class Context>
 void RDivGradientOp<Context>::RunOnDevice() {
-    Output(0)->ReshapeLike(Input(0));
-    Output(1)->ReshapeLike(Input(1));
-
-    if (XIsType(Input(0), float)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else if (XIsType(Input(0), float16)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-            << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+    DefineX1X2;
+    Output(0)->ReshapeLike(*X1);
+    Output(1)->ReshapeLike(*X2);
+
+    if (XIsType(Input(-1), float)) {
+        RRunByX1X2(float);
+    } else if (XIsType(Input(-1), float16)) {
+        RRunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(-1),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(RDivGradient);

--- a/Dragon/src/operators/arithmetic/rmul_op.cc
+++ b/Dragon/src/operators/arithmetic/rmul_op.cc
-#include "operators/arithmetic/mul_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/fundamental_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void RMulOp<Context>::EltwiseRunWithType() {
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Mul<T, Context>(Input(0).count(), X1data, X2data, Ydata);
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
 }

 template <class Context> template <typename T>
 void RMulOp<Context>::BroadcastRunWithType(int type) {
    TIndex outer_dim, inner_dim;
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    auto* c = ws()->template caches<T, Context>({
+        Output(0)->count() })[0];

    if (type == 0 || type == 1) {
        if (type == 0) {
@@ -31,163 +33,135 @@ void RMulOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, multiplier, X1data,
-                        0.0, Ydata, &ctx());
-        math::Mul<T, Context>(Input(1).count(), X2data, Ydata, Ydata);
-    } 
-    else if (type == 2) {
+                    1.0, multiplier, x1,
+                        0.0, c, &ctx());
+        math::Mul<T, Context>(Output(0)->count(), c, x2, y);
+    } else if (type == 2) {
        outer_dim = Input(1).dim(0);
        inner_dim = Input(1).count(1);
        DECLARE_MULTIPLIER(multiplier, inner_dim);
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, X1data, multiplier,
-                        0.0, Ydata, &ctx());
-        math::Mul<T, Context>(Input(1).count(), X2data, Ydata, Ydata);
+                    1.0, x1, multiplier,
+                        0.0, c, &ctx());
+        math::Mul<T, Context>(Output(0)->count(), c, x2, y);
    }
 }

 template <class Context>
 void RMulOp<Context>::RunOnDevice() {
+    DeclareX1X2;
    Output(0)->ReshapeLike(Input(1));

    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
+        RRunByX1X2(float);
    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+        RRunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(0),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(RMul);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(RMul);
 #endif
-OPERATOR_SCHEMA(RMul).NumInputs(2).NumOutputs(1);
+OPERATOR_SCHEMA(RMul)
+    .NumInputs(2).NumOutputs(1)
+    .Inplace({ { 1, 0 } });

 template <class Context> template <typename T>
 void RMulGradientOp<Context>::EltwiseRunWithType() {
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (Output(1)->name() != "ignore") {
-        auto* X1data = Input(0).template data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        math::Mul<T, Context>(Input(0).count(), dYdata, X1data, dX2data);
+        auto* x1 = Input(0).template data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
    }
+
    if (Output(0)->name() != "ignore") {
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        math::Mul<T, Context>(Input(0).count(), dYdata, X2data, dX1data);
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
+        math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
    }
 }

 template <class Context> template <typename T>
 void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
+    DefineX1X2;
    TIndex outer_dim, inner_dim;
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (type == 0) {
-        outer_dim = Input(-1).count();
+        outer_dim = X2->count();
        inner_dim = 1;
    } else if (type == 1) {
-        outer_dim = Input(-1).count(0, Input(-1).axis(-1));
-        inner_dim = Input(-1).dim(-1);
+        outer_dim = X2->count(0, X2->axis(-1));
+        inner_dim = X2->dim(-1);
    } else if (type == 2) {
-        outer_dim = Input(-1).dim(0);
-        inner_dim = Input(-1).count(1);
+        outer_dim = X2->dim(0);
+        inner_dim = X2->count(1);
    }

    if (Output(0)->name() != "ignore") {
-        auto* X2data = Input(1).template data<T, Context>();
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* x2 = Input(1).template data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
+        auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
+        math::Mul<T, Context>(X2->count(), dy, x2, c);
        if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
-            math::Mul<T, Context>(Input(-1).count(), dYdata, X2data, dX2data);
            math::Gemv<T, Context>(
                CblasTrans, outer_dim, inner_dim,
-                    1.0, dX2data, multiplier,
-                        0.0, dX1data, &ctx());
+                    1.0, c, multiplier,
+                        0.0, dx1, &ctx());
        } else if (type == 2) {
            DECLARE_MULTIPLIER(multiplier, inner_dim);
-            math::Mul<T, Context>(Input(-1).count(), dYdata, X2data, dX2data);
            math::Gemv<T, Context>(
                CblasNoTrans, outer_dim, inner_dim,
-                    1.0, dX2data, multiplier,
-                        0.0, dX1data, &ctx());
+                    1.0, c, multiplier,
+                        0.0, dx1, &ctx());
        }
    }

    if (Output(1)->name() != "ignore") {
-        auto* X1data = Input(0).template data<T, Context>();
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* x1 = Input(0).template data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemm<T, Context>(
                CblasNoTrans, CblasNoTrans,
                    outer_dim, inner_dim, 1,
-                        1.0, multiplier, X1data,
-                            0.0, dX2data, &ctx());
+                        1.0, multiplier, x1,
+                            0.0, dx2, &ctx());
        } else if (type == 2) {
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemm<T, Context>(
                CblasNoTrans, CblasNoTrans,
                    outer_dim, inner_dim, 1,
-                        1.0, X1data, multiplier,
-                            0.0, dX2data, &ctx());
+                        1.0, x1, multiplier,
+                            0.0, dx2, &ctx());
        }
-        math::Mul<T, Context>(Input(-1).count(), dYdata, dX2data, dX2data);
+        math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
    }
 }

 template <class Context>
 void RMulGradientOp<Context>::RunOnDevice() {
-    Output(0)->ReshapeLike(Input(0));
-    Output(1)->ReshapeLike(Input(1));
-
-    if (XIsType(Input(0), float)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else if (XIsType(Input(0), float16)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-            << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+    DefineX1X2;
+    Output(0)->ReshapeLike(*X1);
+    Output(1)->ReshapeLike(*X2);
+
+    if (XIsType(Input(-1), float)) {
+        RRunByX1X2(float);
+    } else if (XIsType(Input(-1), float16)) {
+        RRunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(-1),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(RMulGradient);

--- a/Dragon/src/operators/arithmetic/rsub_op.cc
+++ b/Dragon/src/operators/arithmetic/rsub_op.cc
-#include "operators/arithmetic/sub_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/fundamental_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void RSubOp<Context>::EltwiseRunWithType() {
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Sub<T, Context>(Input(0).count(), X1data, X2data, Ydata);
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+    math::Sub<T, Context>(Output(0)->count(), x1, x2, y);
 }

 template <class Context> template <typename T>
 void RSubOp<Context>::BroadcastRunWithType(int type) {
    TIndex outer_dim, inner_dim;
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+
    ctx().template Copy<T, Context, Context>(
-        Input(1).count(), Ydata, X2data);
+        Output(0)->count(), y, x2);

    if (type == 0 || type == 1) {
        if (type == 0) {
@@ -33,157 +34,127 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, multiplier, X1data, 
-                        -1.0, Ydata, &ctx());
-    } 
-    else if (type == 2) {
+                    1.0, multiplier, x1,
+                        -1.0, y, &ctx());
+    } else if (type == 2) {
        outer_dim = Input(1).dim(0);
        inner_dim = Input(1).count(1);
        DECLARE_MULTIPLIER(multiplier, inner_dim);
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    1.0, X1data, multiplier,
-                        -1.0, Ydata, &ctx());
+                    1.0, x1, multiplier,
+                        -1.0, y, &ctx());
    }
 }

 template <class Context>
 void RSubOp<Context>::RunOnDevice() {
+    DeclareX1X2;
    Output(0)->ReshapeLike(Input(1));

    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
+        RRunByX1X2(float);
    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+        RRunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(0),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(RSub);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(RSub);
 #endif
-OPERATOR_SCHEMA(RSub).NumInputs(2).NumOutputs(1);
+OPERATOR_SCHEMA(RSub)
+    .NumInputs(2).NumOutputs(1)
+    .Inplace({ { 1, 0 } });

 template <class Context> template <typename T>
 void RSubGradientOp<Context>::EltwiseRunWithType() {
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (Output(1)->name() != "ignore") {
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        math::Scale<T, Context>(Output(1)->count(),
-            -1.0, dYdata, dX2data, &ctx());
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        math::Scale<T, Context>(
+            Output(1)->count(), -1, dy, dx2, &ctx());
    }
+
    if (Output(0)->name() != "ignore") {
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx().template Copy<T, Context, Context>(
-            Output(0)->count(), dX1data, dYdata);
+            Output(0)->count(), dx1, dy);
    }
 }

 template <class Context> template <typename T>
 void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
+    DefineX1X2;
    TIndex outer_dim, inner_dim;
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();

    if (Output(0)->name() != "ignore") {
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        if (type == 0 || type == 1) {
            if (type == 0) {
-                outer_dim = Input(-1).count();
+                outer_dim = X2->count();
                inner_dim = 1;
            } else {
-                outer_dim = Input(-1).count(0, Input(-1).axis(-1));
-                inner_dim = Input(-1).dim(-1);
+                outer_dim = X2->count(0, X2->axis(-1));
+                inner_dim = X2->dim(-1);
            }
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemv<T, Context>(
                CblasTrans, outer_dim, inner_dim,
-                    1.0, dYdata, multiplier,
-                        0.0, dX1data, &ctx());
-        }
-        else if (type == 2) {
-            outer_dim = Input(-1).dim(0);
-            inner_dim = Input(-1).count(1);
+                    1.0, dy, multiplier,
+                        0.0, dx1, &ctx());
+        } else if (type == 2) {
+            outer_dim = X2->dim(0);
+            inner_dim = X2->count(1);
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemv<T, Context>(
                CblasNoTrans, outer_dim, inner_dim,
-                    1.0, dYdata, multiplier,
-                        0.0, dX1data, &ctx());
+                    1.0, dy, multiplier,
+                        0.0, dx1, &ctx());
        }
    }

    if (Output(1)->name() != "ignore") {
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
-        ctx().template Copy<T, Context, Context>(
-            Output(1)->count(), dX2data, dYdata);
-        math::MulScalar<T, Context>(Output(1)->count(), -1.f, dX2data);
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
+        math::Scale<T, Context>(
+            X2->count(), -1, dy, dx2, &ctx());
    }
 }

 template <class Context>
 void RSubGradientOp<Context>::RunOnDevice() {
-    Output(1)->ReshapeLike(Input(-1));
-    Output(0)->ReshapeLike(Input(0));
-
-    if (XIsType(Input(0), float)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else if (XIsType(Input(0), float16)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-            << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+    DefineX1X2;
+    Output(0)->ReshapeLike(*X1);
+    Output(1)->ReshapeLike(*X2);
+
+    if (XIsType(Input(-1), float)) {
+        RRunByX1X2(float);
+    } else if (XIsType(Input(-1), float16)) {
+        RRunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(-1),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(RSubGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(RSubGradient);
 #endif
-OPERATOR_SCHEMA(RSubGradient).NumInputs(2).NumOutputs(2);
+OPERATOR_SCHEMA(RSubGradient).NumInputs(1).NumOutputs(2);

 class GetRSubGradient : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetRSubGradient);
    vector<OperatorDef> MakeDefs() override {
        return SingleDef(def.type() + "Gradient", "",
-            vector<string> {I(0), GO(0)},
+            vector<string> {GO(0)},
            vector<string> {GI(0), GI(1)});
    }
 };

--- a/Dragon/src/operators/arithmetic/square_op.cc
+++ b/Dragon/src/operators/arithmetic/square_op.cc
-#include "operators/arithmetic/square_op.h"
-#include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/square_op.h"

 namespace dragon {


--- a/Dragon/src/operators/arithmetic/sub_op.cc
+++ b/Dragon/src/operators/arithmetic/sub_op.cc
-#include "operators/arithmetic/sub_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/arithmetic/fundamental_op.h"

 namespace dragon {

@@ -9,17 +9,18 @@ void SubOp<Context>::EltwiseRunWithType() {
    auto* X1data = Input(0).template data<T, Context>();
    auto* X2data = Input(1).template data<T, Context>();
    auto* Ydata = Output(0)->template mutable_data<T, Context>();
-    math::Sub<T, Context>(Input(0).count(), X1data, X2data, Ydata);
+    math::Sub<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
 }

 template <class Context> template <typename T>
 void SubOp<Context>::BroadcastRunWithType(int type) {
    TIndex outer_dim, inner_dim;
-    auto* X1data = Input(0).template data<T, Context>();
-    auto* X2data = Input(1).template data<T, Context>();
-    auto* Ydata = Output(0)->template mutable_data<T, Context>();
+    auto* x1 = Input(0).template data<T, Context>();
+    auto* x2 = Input(1).template data<T, Context>();
+    auto* y = Output(0)->template mutable_data<T, Context>();
+
    ctx().template Copy<T, Context, Context>(
-        Input(0).count(), Ydata, X1data);
+        Output(0)->count(), y, x1);

    if (type == 0 || type == 1) {
        if (type == 0) {
@@ -33,8 +34,8 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    -1.0, multiplier, X2data,
-                        1.0, Ydata, &ctx());
+                    -1.0, multiplier, x2,
+                        1.0, y, &ctx());
    } 
    else if (type == 2) {
        outer_dim = Input(0).dim(0);
@@ -43,146 +44,118 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
        math::Gemm<T, Context>(
            CblasNoTrans, CblasNoTrans,
                outer_dim, inner_dim, 1,
-                    -1.0, X2data, multiplier,
-                        1.0, Ydata, &ctx());
+                    -1.0, x2, multiplier,
+                        1.0, y, &ctx());
    }
 }

 template <class Context>
 void SubOp<Context>::RunOnDevice() {
+    DeclareX1X2;
    Output(0)->ReshapeLike(Input(0));

    if (XIsType(Input(0), float)) {
-        if (Input(0).dims() == Input(1).dims()) 
-            EltwiseRunWithType<float>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1) 
-            BroadcastRunWithType<float>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) && 
-                 Input(1).count(0, Input(1).axis(-1)) == 1)  
-            BroadcastRunWithType<float>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
+        RunByX1X2(float);
    } else if (XIsType(Input(0), float16)) {
-        if (Input(0).dims() == Input(1).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(0).dim(0) == Input(1).dim(0) && Input(1).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(0).dim(-1) == Input(1).dim(-1) &&
-            Input(1).count(0, Input(1).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(1).ndim() == 1 && Input(1).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(0).DimString() << "  " << Input(1).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+        RunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(0),
+            { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(Sub);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(Sub);
 #endif
-OPERATOR_SCHEMA(Sub).NumInputs(2).NumOutputs(1).Inplace({ { 0, 0 }, { 1, 0 } });
+OPERATOR_SCHEMA(Sub)
+    .NumInputs(2).NumOutputs(1)
+    .Inplace({ { 0, 0 } });

 template <class Context> template <typename T>
 void SubGradientOp<Context>::EltwiseRunWithType() {
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();
+
    if (Output(1)->name() != "ignore") {
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        math::Scale<T, Context>(Output(1)->count(),
-            -1.0, dYdata, dX2data, &ctx());
+            -1.0, dy, dx2, &ctx());
    }
+
    if (Output(0)->name() != "ignore") {
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx().template Copy<T, Context, Context>(
-            Output(0)->count(), dX1data, dYdata);
+            Output(0)->count(), dx1, dy);
    }
 }

 template <class Context> template <typename T>
 void SubGradientOp<Context>::BroadcastRunWithType(int type) {
+    DefineX1X2;
    TIndex outer_dim, inner_dim;
-    auto* dYdata = Input(-1).template data<T, Context>();
+    auto* dy = Input(-1).template data<T, Context>();

    if (Output(1)->name() != "ignore") {
-        auto* dX2data = Output(1)->template mutable_data<T, Context>();
+        auto* dx2 = Output(1)->template mutable_data<T, Context>();
        if (type == 0 || type == 1) {
            if (type == 0) {
-                outer_dim = Input(-1).count();
+                outer_dim = X1->count();
                inner_dim = 1;
            } else {
-                outer_dim = Input(-1).count(0, Input(-1).axis(-1));
-                inner_dim = Input(-1).dim(-1);
+                outer_dim = X1->count(0, X1->axis(-1));
+                inner_dim = X1->dim(-1);
            }
            DECLARE_MULTIPLIER(multiplier, outer_dim);
            math::Gemv<T, Context>(
                CblasTrans, outer_dim, inner_dim,
-                    -1.0, dYdata, multiplier,
-                        0.0, dX2data, &ctx());
-        }
-        else if (type == 2) {
-            outer_dim = Input(-1).dim(0);
-            inner_dim = Input(-1).count(1);
+                    -1.0, dy, multiplier,
+                        0.0, dx2, &ctx());
+        } else if (type == 2) {
+            outer_dim = X1->dim(0);
+            inner_dim = X1->count(1);
            DECLARE_MULTIPLIER(multiplier, inner_dim);
            math::Gemv<T, Context>(
                CblasNoTrans, outer_dim, inner_dim,
-                    -1.0, dYdata, multiplier,
-                        0.0, dX2data, &ctx());
+                    -1.0, dy, multiplier,
+                        0.0, dx2, &ctx());
        }
    }

    if (Output(0)->name() != "ignore") {
-        auto* dX1data = Output(0)->template mutable_data<T, Context>();
+        auto* dx1 = Output(0)->template mutable_data<T, Context>();
        ctx().template Copy<T, Context, Context>(
-            Output(0)->count(), dX1data, dYdata);
+            X1->count(), dx1, dy);
    }
 }

 template <class Context>
 void SubGradientOp<Context>::RunOnDevice() {
-    Output(0)->ReshapeLike(Input(-1));
-    Output(1)->ReshapeLike(Input(0));
-
-    if (XIsType(Input(0), float)) {
-        if (Input(-1).dims() == Input(0).dims()) 
-            EltwiseRunWithType<float>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else if (XIsType(Input(0), float16)) {
-        if (Input(-1).dims() == Input(0).dims())
-            EltwiseRunWithType<float16>();
-        else if (Input(-1).dim(0) == Input(0).dim(0) && Input(0).count(1) == 1)
-            BroadcastRunWithType<float16>(2);
-        else if (Input(-1).dim(-1) == Input(0).dim(-1) &&
-                 Input(0).count(0, Input(0).axis(-1)) == 1)
-            BroadcastRunWithType<float16>(1);
-        else if (Input(0).ndim() == 1 && Input(0).dim(0) == 1)
-            BroadcastRunWithType<float16>(0);
-        else LOG(FATAL) << "Could not be broadcast together with shapes "
-                        << Input(-1).DimString() << "  " << Input(0).DimString();
-    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
+    DefineX1X2;
+    Output(0)->ReshapeLike(*X1);
+    Output(1)->ReshapeLike(*X2);
+
+    if (XIsType(Input(-1), float)) {
+        RunByX1X2(float);
+    } else if (XIsType(Input(-1), float16)) {
+        RunByX1X2(float16);
+    } else {
+        LOG(FATAL) << DTypeHelper(Input(-1),
+        { "float32", "float16" });
+    }
 }

 DEPLOY_CPU(SubGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(SubGradient);
 #endif
-OPERATOR_SCHEMA(SubGradient).NumInputs(2).NumOutputs(2).Inplace({ { 1, 0 } });
+OPERATOR_SCHEMA(SubGradient).NumInputs(1).NumOutputs(2);

 class GetSubGradient : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetSubGradient);
    vector<OperatorDef> MakeDefs() override {
        return SingleDef(def.type() + "Gradient", "",
-            vector<string> {I(1), GO(0)},
+            vector<string> {GO(0)},
            vector<string> {GI(0), GI(1)});
    }
 };

--- a/Dragon/src/operators/control_flow/compare_op.cc
+++ b/Dragon/src/operators/control_flow/compare_op.cc
-#include "operators/control_flow/compare_op.h"
 #include "utils/op_kernel.h"
+#include "operators/control_flow/compare_op.h"

 namespace dragon {


--- a/Dragon/src/operators/loss/ctc_loss_op.cc
+++ b/Dragon/src/operators/loss/ctc_loss_op.cc
-#include "operators/loss/ctc_loss_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
+#include "operators/loss/ctc_loss_op.h"

 namespace dragon {

@@ -12,7 +12,8 @@ OPERATOR_SCHEMA(CTCLoss).NumInputs(2).NumOutputs(1);

 template <class Context> template <typename T>
 void CTCLossGradientOp<Context>::RunWithType() {
-    auto* gradT = ws()->GetTensor("/mnt/" + anchor() + "/ctc/grads");
+    auto* gradT = ws()->GetTensor(
+        "/mnt/" + anchor() + "/ctc/grads");
    Output(0)->ReshapeLike(*gradT);

    auto* Gdata = gradT->template data<T, Context>();

--- a/Dragon/src/operators/loss/cudnn_ctc_loss_op.cc
+++ b/Dragon/src/operators/loss/cudnn_ctc_loss_op.cc
-#include "operators/loss/ctc_loss_op.h"
 #include "core/workspace.h"
+#include "operators/loss/ctc_loss_op.h"

 #ifdef WITH_CUDNN

@@ -15,7 +15,8 @@ void CuDNNCTCLossOp<Context>::WrapIO() {
    const auto batch_size = Input(0).dim(1);
    const auto max_num_labels = Input(1).dim(1);
    CHECK_EQ(batch_size, Input(1).dim(0))
-        << "\nExcepted " << batch_size << " groups(i.e. batch_size) of labels,"
+        << "\nExcepted " << batch_size
+        << " groups(i.e. batch_size) of labels,"
        << "\nbut got " << Input(1).dim(0) << ".";
    //  CuDNN currently does not support variable input lengths
    input_lengths = vector<int>(batch_size, max_seq_len);
@@ -23,12 +24,16 @@ void CuDNNCTCLossOp<Context>::WrapIO() {
    auto* Ldata = Input(1).template data<int, CPUContext>();
    for (int n = 0; n < batch_size; ++n) {
        auto start = Ldata + n * max_num_labels;
-        auto res = std::find(start, start + max_num_labels, (int)padding_mask);
+        auto res = std::find(
+            start, start + max_num_labels,
+                (int)padding_mask);
        int len = std::distance(start, res);
        CHECK_LE(len, CUDNN_LABEL_LENGTH_LIMIT)
-            << "\nThe max label length is " << CUDNN_LABEL_LENGTH_LIMIT
+            << "\nThe max label length is "
+            << CUDNN_LABEL_LENGTH_LIMIT
            << ", but got " << len << ".";
-        std::copy(start, start + len, std::back_inserter(packed_labels));
+        std::copy(start, start + len,
+            std::back_inserter(packed_labels));
        label_lengths[n] = len;
    }
    Output(0)->Reshape(vector<TIndex>({ 1 }));

--- a/Dragon/src/operators/loss/l1_loss_op.cc
+++ b/Dragon/src/operators/loss/l1_loss_op.cc
-#include "operators/loss/l1_loss_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/loss/l1_loss_op.h"

 namespace dragon {

@@ -19,10 +19,13 @@ void L1LossOp<Context>::RunWithType() {
        math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
    }

-    T normalizer;
-    if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = Input(0).count();
-    else if (normalization == "NONE") normalizer = 1;
+    T normalizer = 1;
+    if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = Input(0).count();
+    }
+
    T loss = math::ASum<T, Context>(diff->count(), diff_data);
    math::Set<T, Context>(1, loss / normalizer, Ydata);
 }
@@ -52,11 +55,13 @@ void L1LossGradientOp<Context>::RunWithType() {
        1, &dYdata_host, dYdata);
    kernel::AbsGrad<T, Context>(diff->count(), diff_data, diff_data);

-    T alpha = dYdata_host, normalizer;
-    if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = Input(0).count();
-    else if (normalization == "NONE") normalizer = 1;
-    alpha = alpha / normalizer;
+    T alpha = dYdata_host, normalizer = 1;
+    if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = Input(0).count();
+    } alpha = alpha / normalizer;
+
    for (int i = 0; i < 2; i++) {
        if (Output(i)->name() == "ignore") continue;
        Output(i)->ReshapeLike(Input(i));

--- a/Dragon/src/operators/loss/l2_loss_op.cc
+++ b/Dragon/src/operators/loss/l2_loss_op.cc
-#include "operators/loss/l2_loss_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/loss/l2_loss_op.h"

 namespace dragon {

@@ -17,10 +17,13 @@ void L2LossOp<Context>::RunWithType() {
        math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
    }

-    T normalizer;
-    if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = Input(0).count();
-    else if (normalization == "NONE") normalizer = 1;
+    T normalizer = 1;
+    if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = Input(0).count();
+    }
+
    T loss = T(0.5) * math::Dot<T, Context>(diff->count(),
        diff_data, diff_data, &ctx());
    math::Set<T, Context>(1, loss / normalizer, Ydata);
@@ -50,11 +53,13 @@ void L2LossGradientOp<Context>::RunWithType() {
    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
        1, &dYdata_host, dYdata);

-    T alpha = dYdata_host, normalizer;
-    if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = Input(0).count();
-    else if (normalization == "NONE") normalizer = 1;
-    alpha = alpha / normalizer;
+    T alpha = dYdata_host, normalizer = 1;
+    if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = Input(0).count();
+    } alpha = alpha / normalizer;
+
    for (int i = 0; i < 2; i++) {
        if (Output(i)->name() == "ignore") continue;
        Output(i)->ReshapeLike(Input(i));

--- a/Dragon/src/operators/loss/sigmoid_cross_entropy_op.cc
+++ b/Dragon/src/operators/loss/sigmoid_cross_entropy_op.cc
-#include "operators/loss/sigmoid_cross_entropy_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/loss/sigmoid_cross_entropy_op.h"

 namespace dragon {

@@ -10,10 +10,10 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
    auto* Xdata = Input(0).template data<T, Context>();
    auto* Tdata = Input(1).template data<T, Context>();
    auto* Ldata = losses.template mutable_data<T, Context>();
-    auto* Vdata = valid.template mutable_data<T, Context>();
+    auto* Fdata = flags.template mutable_data<T, Context>();

    kernel::SigmoidCrossEntropy<T, Context>(
-        Input(0).count(), Xdata, Tdata, Ldata, Vdata);
+        Input(0).count(), Xdata, Tdata, Ldata, Fdata, &ctx());

    if (normalization == "UNIT") {
        Output(0)->ReshapeLike(losses);
@@ -21,13 +21,17 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
        return;
    }

-    T normalizer;
-    if (normalization == "VALID")
+    T normalizer = 1;
+    if (normalization == "VALID") {
        normalizer = std::max(
-            math::ASum<T, Context>(valid.count(), Vdata), 1.f);
-    else if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = Input(0).count();
-    else if (normalization == "NONE") normalizer = 1;
+            math::ASum<T, Context>(
+                flags.count(), Fdata), 1.f);
+    } else if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = Input(0).count();
+    }
+
    T loss = math::ASum<T, Context>(losses.count(), Ldata);
    Output(0)->Reshape({ 1 });
    auto* Ydata = Output(0)->template mutable_data<T, Context>();
@@ -39,7 +43,7 @@ void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
    CHECK_EQ(Input(0).count(), Input(1).count())
        << "\nNumber of predictions must match the number of labels.";
    losses.ReshapeLike(Input(0));
-    valid.ReshapeLike(Input(0));
+    flags.ReshapeLike(Input(0));

    if (XIsType(Input(0), float)) RunWithType<float>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
@@ -55,11 +59,11 @@ template <class Context> template <typename T>
 void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
    auto* Xdata = Input(0).template data<T, Context>();
    auto* Tdata = Input(1).template data<T, Context>();
-    auto* Vdata = valid.template mutable_data<T, Context>();
    auto* dXdata = Output(0)->template mutable_data<T, Context>();
+    auto* Fdata = flags.template mutable_data<T, Context>();

    kernel::SigmoidCrossEntropyGrad<T, Context>(
-        Input(0).count(), Xdata, Tdata, dXdata, Vdata);
+        Input(0).count(), Xdata, Tdata, dXdata, Fdata, &ctx());

    if (normalization == "UNIT") {
        auto* dYdata = Input(-1).template data<T, Context>();
@@ -67,13 +71,17 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
            dYdata, dXdata, dXdata); return;
    }

-    T normalizer;
-    if (normalization == "VALID")
+    T normalizer = 1;
+    if (normalization == "VALID") {
        normalizer = std::max(
-            math::ASum<T, Context>(valid.count(), Vdata), 1.f);
-    else if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = Input(0).count();
-    else if (normalization == "NONE") normalizer = 1;
+            math::ASum<T, Context>(
+                flags.count(), Fdata), 1.f);
+    } else if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = Input(0).count();
+    }
+
    auto* dYdata = Input(-1).template data<T, Context>();
    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
        1, &dYdata_host, dYdata);
@@ -84,7 +92,7 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
 template <class Context>
 void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
    Output(0)->ReshapeLike(Input(0));
-    valid.ReshapeLike(Input(0));
+    flags.ReshapeLike(Input(0));

    if (XIsType(Input(0), float)) RunWithType<float>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
@@ -96,7 +104,8 @@ DEPLOY_CUDA(SigmoidCrossEntropyGradient);
 #endif
 OPERATOR_SCHEMA(SigmoidCrossEntropyGradient).NumInputs(3).NumOutputs(1);

-class GetSigmoidCrossEntropyGradient final : public GradientMakerBase {
+class GetSigmoidCrossEntropyGradient
+    final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetSigmoidCrossEntropyGradient);
    vector<OperatorDef> MakeDefs() override {
@@ -105,6 +114,9 @@ class GetSigmoidCrossEntropyGradient final : public GradientMakerBase {
            vector<string> {GI(0)});
    }
 };
-REGISTER_GRADIENT(SigmoidCrossEntropy, GetSigmoidCrossEntropyGradient);
+REGISTER_GRADIENT(
+    SigmoidCrossEntropy,
+    GetSigmoidCrossEntropyGradient
+);

 }    // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/loss/sigmoid_focal_loss_op.cc
+++ b/Dragon/src/operators/loss/sigmoid_focal_loss_op.cc
+#include "core/workspace.h"
+#include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/loss/sigmoid_focal_loss_op.h"
+
+namespace dragon {
+
+//  Forward pass of the sigmoid focal loss for element type <T>.
+//  Runs the element-wise kernel over Input(0) / Input(1), then either
+//  exports the un-reduced losses ("UNIT") or a single scalar:
+//  sum(losses) / divisor, the divisor being selected by <normalization>.
+template <class Context> template <typename T>
+void SigmoidFocalLossOp<Context>::RunWithType() {
+    auto* logits = Input(0).template data<T, Context>();
+    auto* targets = Input(1).template data<T, Context>();
+    auto* loss_buf = losses.template mutable_data<T, Context>();
+    auto* flag_buf = flags.template mutable_data<T, Context>();
+    kernel::SigmoidFocalLoss<T, Context>(
+        outer_dim, axis_dim, inner_dim,
+        pos_alpha, neg_alpha, gamma, neg_id,
+        logits, targets, loss_buf, flag_buf, &ctx());
+
+    //  "UNIT": hand back the per-element losses untouched.
+    if (normalization == "UNIT") {
+        Output(0)->ReshapeLike(losses);
+        Output(0)->template Copy<Context, Context>(losses);
+        return;
+    }
+
+    //  Any mode not listed below leaves the divisor at 1.
+    T denom = 1;
+    if (normalization == "FULL") {
+        denom = outer_dim * inner_dim;
+    } else if (normalization == "BATCH_SIZE") {
+        denom = Input(0).dim(0);
+    } else if (normalization == "VALID") {
+        //  Clamp to 1 so that an all-zero <flags> never divides by zero.
+        const auto flag_sum = math::ASum<T, Context>(
+            flags.count(), flag_buf);
+        denom = std::max(flag_sum, 1.f);
+    }
+
+    //  Collapse the loss buffer into a one-element output.
+    T total = math::ASum<T, Context>(losses.count(), loss_buf);
+    Output(0)->Reshape({ 1 });
+    auto* out = Output(0)->template mutable_data<T, Context>();
+    math::Set<T, Context>(1, total / denom, out);
+}
+
+//  Entry point: derives the (outer, axis, inner) split of Input(0)
+//  around <axis>, validates the label count, sizes the scratch
+//  buffers and dispatches on the input data type (float32 only).
+template <class Context>
+void SigmoidFocalLossOp<Context>::RunOnDevice() {
+    auto& logits = Input(0);
+
+    outer_dim = logits.count(0, axis);
+    axis_dim = logits.dim(axis);
+    inner_dim = logits.count(axis + 1);
+    CHECK_EQ(outer_dim * inner_dim, Input(1).count())
+        << "\nNumber of predictions must match the number of labels.";
+
+    //  Both scratch tensors mirror the shape of the logits.
+    losses.ReshapeLike(logits);
+    flags.ReshapeLike(logits);
+
+    if (!XIsType(logits, float)) {
+        LOG(FATAL) << DTypeHelper(logits, { "float32" });
+    } else {
+        RunWithType<float>();
+    }
+}
+
+DEPLOY_CPU(SigmoidFocalLoss);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(SigmoidFocalLoss);
+#endif
+OPERATOR_SCHEMA(SigmoidFocalLoss).NumInputs(2).NumOutputs(1);
+
+//  Backward pass of the sigmoid focal loss for element type <T>.
+//  The kernel produces the un-normalized gradient in Output(0); it is
+//  then scaled by the incoming gradient Input(-1) and by the same
+//  divisor that the forward op applies for the active <normalization>.
+template <class Context> template <typename T>
+void SigmoidFocalLossGradientOp<Context>::RunWithType() {
+    auto* Xdata = Input(0).template data<T, Context>();
+    auto* Tdata = Input(1).template data<T, Context>();
+    auto* dXdata = Output(0)->template mutable_data<T, Context>();
+    auto* Fdata = flags.template mutable_data<T, Context>();
+
+    kernel::SigmoidFocalLossGradient<T, Context>(
+        outer_dim, axis_dim, inner_dim,
+            pos_alpha, neg_alpha, gamma, neg_id,
+                Xdata, Tdata, dXdata, Fdata, &ctx());
+
+    //  "UNIT": the forward output was not reduced, so the incoming
+    //  gradient is multiplied element-wise and no divisor applies.
+    if (normalization == "UNIT") {
+        auto* dYdata = Input(-1).template data<T, Context>();
+        math::Mul<T, Context>(Output(0)->count(),
+            dYdata, dXdata, dXdata); return;
+    }
+
+    //  Any mode not listed below leaves the divisor at 1.
+    T normalizer = 1;
+    if (normalization == "VALID") {
+        //  Clamp to 1 so that an all-zero <flags> never divides by zero.
+        normalizer = std::max(
+            math::ASum<T, Context>(
+                flags.count(), Fdata), 1.f);
+    } else if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        //  Must equal the forward divisor (outer_dim * inner_dim);
+        //  Input(0).count() would additionally include axis_dim and
+        //  shrink the gradient relative to the loss that was computed.
+        normalizer = outer_dim * inner_dim;
+    }
+
+    //  The reduced forward output is a scalar: fetch its gradient to
+    //  the host and fold it together with the divisor into one scale.
+    auto* dYdata = Input(-1).template data<T, Context>();
+    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
+        1, &dYdata_host, dYdata);
+    math::Scal<T, Context>(Output(0)->count(),
+        dYdata_host / normalizer, dXdata, &ctx());
+}
+
+//  Entry point: derives the (outer, axis, inner) split of Input(0)
+//  around <axis>, sizes the gradient output and the flags buffer,
+//  then dispatches on the input data type (float32 only).
+template <class Context>
+void SigmoidFocalLossGradientOp<Context>::RunOnDevice() {
+    auto& logits = Input(0);
+
+    outer_dim = logits.count(0, axis);
+    axis_dim = logits.dim(axis);
+    inner_dim = logits.count(axis + 1);
+
+    //  The gradient and the flags share the shape of the logits.
+    Output(0)->ReshapeLike(logits);
+    flags.ReshapeLike(logits);
+
+    if (!XIsType(logits, float)) {
+        LOG(FATAL) << DTypeHelper(logits, { "float32" });
+    } else {
+        RunWithType<float>();
+    }
+}
+
+DEPLOY_CPU(SigmoidFocalLossGradient);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(SigmoidFocalLossGradient);
+#endif
+OPERATOR_SCHEMA(SigmoidFocalLossGradient).NumInputs(3).NumOutputs(1);
+
+//  Gradient maker for SigmoidFocalLoss: emits a single
+//  "<forward type>Gradient" def that consumes both forward inputs
+//  plus the gradient of output 0, and yields the gradient of input 0.
+class GetSigmoidFocalLossGradient final
+    : public GradientMakerBase {
+ public:
+    GRADIENT_MAKER_CTOR(GetSigmoidFocalLossGradient);
+
+    vector<OperatorDef> MakeDefs() override {
+        const string grad_type = def.type() + "Gradient";
+        const vector<string> grad_inputs({ I(0), I(1), GO(0) });
+        const vector<string> grad_outputs({ GI(0) });
+        return SingleDef(grad_type, "", grad_inputs, grad_outputs);
+    }
+};
+REGISTER_GRADIENT(SigmoidFocalLoss, GetSigmoidFocalLossGradient);
+
+}    // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/loss/smooth_l1_loss_op.cc
+++ b/Dragon/src/operators/loss/smooth_l1_loss_op.cc
-#include "operators/loss/smooth_l1_loss_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/loss/smooth_l1_loss_op.h"

 namespace dragon {

@@ -27,10 +27,13 @@ void SmoothL1LossOp<Context>::RunWithType() {
            outside_w_data, error_data, error_data);
    }

-    T normalizer;
-    if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = Input(0).count();
-    else if (normalization == "NONE") normalizer = 1;
+    T normalizer = 1;
+    if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = Input(0).count();
+    }
+
    T loss = math::ASum<T, Context>(error->count(), error_data);
    math::Set<T, Context>(1, loss / normalizer, Ydata);
 }
@@ -66,11 +69,12 @@ void SmoothL1LossGradientOp<Context>::RunWithType() {
    kernel::SmoothL1Grad<T, Context>(
        diff->count(), beta, diff_data, diff_data);

-    T alpha = dYdata_host, normalizer;
-    if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = Input(0).count();
-    else if (normalization == "NONE") normalizer = 1;
-    alpha = alpha / normalizer;
+    T alpha = dYdata_host, normalizer = 1;
+    if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = Input(0).count();
+    } alpha = alpha / normalizer;

    for (int i = 0; i < 2; i++) {
        if (Output(i)->name() == "ignore") continue;
@@ -107,7 +111,8 @@ DEPLOY_CUDA(SmoothL1LossGradient);
 #endif
 OPERATOR_SCHEMA(SmoothL1LossGradient).NumInputs(3, 5).NumOutputs(2);

-class GetSmoothL1LossGradient final : public GradientMakerBase {
+class GetSmoothL1LossGradient
+    final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetSmoothL1LossGradient);
    vector<OperatorDef> MakeDefs() override {
@@ -119,6 +124,9 @@ class GetSmoothL1LossGradient final : public GradientMakerBase {
                      vector<string> {GI(0), GI(1)});
    }
 };
-REGISTER_GRADIENT(SmoothL1Loss, GetSmoothL1LossGradient);
+REGISTER_GRADIENT(
+    SmoothL1Loss,
+    GetSmoothL1LossGradient
+);

 }    // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/loss/softmax_cross_entropy_op.cc
+++ b/Dragon/src/operators/loss/softmax_cross_entropy_op.cc
-#include "operators/activation/softmax_op.h"
-#include "operators/loss/softmax_cross_entropy_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
 #include "utils/proto_utils.h"
+#include "operators/activation/softmax_op.h"
+#include "operators/loss/softmax_cross_entropy_op.h"

 namespace dragon {

@@ -37,10 +37,13 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
                Ldata, Ydata); return;
    }

-    T normalizer;
-    if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
-    else if (normalization == "NONE") normalizer = 1;
+    T normalizer = 1;
+    if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = outer_dim * inner_dim;
+    }
+
    T loss = math::ASum<T, Context>(losses.count(), Ldata);
    Output(0)->Reshape({ 1 });
    auto* Ydata = Output(0)->template mutable_data<T, Context>();
@@ -85,10 +88,13 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
            Pdata, dXdata, dXdata); return;
    }

-    T normalizer;
-    if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
-    else if (normalization == "NONE") normalizer = 1;
+    T normalizer = 1;
+    if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = outer_dim * inner_dim;
+    }
+
    auto* dYdata = Input(-1).template data<T, Context>();
    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
        1, &dYdata_host, dYdata);
@@ -113,7 +119,8 @@ DEPLOY_CUDA(SoftmaxCrossEntropyGradient);
 #endif
 OPERATOR_SCHEMA(SoftmaxCrossEntropyGradient).NumInputs(3).NumOutputs(1);

-class GetSoftmaxCrossEntropyGradient final : public GradientMakerBase {
+class GetSoftmaxCrossEntropyGradient
+    final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetSoftmaxCrossEntropyGradient);
    vector<OperatorDef> MakeDefs() override {
@@ -122,6 +129,9 @@ class GetSoftmaxCrossEntropyGradient final : public GradientMakerBase {
            vector<string> {GI(0)});
    }
 };
-REGISTER_GRADIENT(SoftmaxCrossEntropy, GetSoftmaxCrossEntropyGradient);
+REGISTER_GRADIENT(
+    SoftmaxCrossEntropy,
+    GetSoftmaxCrossEntropyGradient
+);

 }    // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/loss/sparse_softmax_focal_loss_op.cc
+++ b/Dragon/src/operators/loss/sparse_softmax_focal_loss_op.cc
-#include "operators/activation/softmax_op.h"
-#include "operators/loss/sparse_softmax_focal_loss_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
 #include "utils/proto_utils.h"
+#include "operators/activation/softmax_op.h"
+#include "operators/loss/softmax_focal_loss_op.h"

 namespace dragon {

 template <class Context> template <typename T>
-void SparseSoftmaxFocalLossOp<Context>::RunWithType() {
+void SoftmaxFocalLossOp<Context>::RunWithType() {
    auto* Pdata = this->prob->template data<T, Context>();
    auto* Tdata = Input(1).template data<T, Context>();
    auto* Idata = !this->ignores.count() ? nullptr :
@@ -16,7 +16,7 @@ void SparseSoftmaxFocalLossOp<Context>::RunWithType() {
    auto* Ldata = losses.template mutable_data<T, Context>();
    auto* Fdata = flags.template mutable_data<T, Context>();

-    kernel::SparseSoftmaxFocalLoss<T, Context>(
+    kernel::SoftmaxFocalLoss<T, Context>(
        outer_dim, Input(0).dim(axis), inner_dim,
            pos_alpha, neg_alpha, gamma, neg_id,
                Pdata, Tdata, Idata, this->ignores.count(),
@@ -28,13 +28,17 @@ void SparseSoftmaxFocalLossOp<Context>::RunWithType() {
        return;
    }

-    T normalizer;
-    if (normalization == "VALID")
+    T normalizer = 1;
+    if (normalization == "VALID") {
        normalizer = std::max(
-            math::ASum<T, Context>(flags.count(), Fdata), 1.f);
-    else if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
-    else if (normalization == "NONE") normalizer = 1;
+            math::ASum<T, Context>(
+                flags.count(), Fdata), 1.f);
+    } else if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL"){
+        normalizer = outer_dim * inner_dim;
+    }
+
    T loss = math::ASum<T, Context>(losses.count(), Ldata);
    Output(0)->Reshape({ 1 });
    auto* Ydata = Output(0)->template mutable_data<T, Context>();
@@ -42,7 +46,7 @@ void SparseSoftmaxFocalLossOp<Context>::RunWithType() {
 }

 template <class Context>
-void SparseSoftmaxFocalLossOp<Context>::RunOnDevice() {
+void SoftmaxFocalLossOp<Context>::RunOnDevice() {
    outer_dim = Input(0).count(0, axis);
    inner_dim = Input(0).count(axis + 1);
    CHECK_EQ(outer_dim * inner_dim, Input(1).count())
@@ -57,14 +61,14 @@ void SparseSoftmaxFocalLossOp<Context>::RunOnDevice() {
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
 }

-DEPLOY_CPU(SparseSoftmaxFocalLoss);
+DEPLOY_CPU(SoftmaxFocalLoss);
 #ifdef WITH_CUDA
-DEPLOY_CUDA(SparseSoftmaxFocalLoss);
+DEPLOY_CUDA(SoftmaxFocalLoss);
 #endif
-OPERATOR_SCHEMA(SparseSoftmaxFocalLoss).NumInputs(2).NumOutputs(1);
+OPERATOR_SCHEMA(SoftmaxFocalLoss).NumInputs(2).NumOutputs(1);

 template <class Context> template <typename T>
-void SparseSoftmaxFocalLossGradientOp<Context>::RunWithType() {
+void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
    auto* Pdata = this->prob->template mutable_data<T, Context>();
    auto* Tdata = Input(1).template data<T, Context>();
    auto* Idata = !this->ignores.count() ? nullptr :
@@ -72,7 +76,7 @@ void SparseSoftmaxFocalLossGradientOp<Context>::RunWithType() {
    auto* dXdata = Output(0)->template mutable_data<T, Context>();
    auto* Fdata = flags.template mutable_data<T, Context>();

-    kernel::SparseSoftmaxFocalLossGrad<T, Context>(
+    kernel::SoftmaxFocalLossGrad<T, Context>(
        outer_dim, Output(0)->dim(axis), inner_dim,
            pos_alpha, neg_alpha, gamma, neg_id,
                Pdata, Tdata, Idata, this->ignores.count(),
@@ -88,13 +92,17 @@ void SparseSoftmaxFocalLossGradientOp<Context>::RunWithType() {
            Pdata, dXdata, dXdata); return;
    }

-    T normalizer;
-    if (normalization == "VALID")
+    T normalizer = 1;
+    if (normalization == "VALID") {
        normalizer = std::max(
-            math::ASum<T, Context>(flags.count(), Fdata), 1.f);
-    else if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
-    else if (normalization == "NONE") normalizer = 1;
+            math::ASum<T, Context>(
+                flags.count(), Fdata), 1.f);
+    } else if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = outer_dim * inner_dim;
+    }
+
    auto* dYdata = Input(-1).template data<T, Context>();
    T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
        1, &dYdata_host, dYdata);
@@ -103,7 +111,7 @@ void SparseSoftmaxFocalLossGradientOp<Context>::RunWithType() {
 }

 template <class Context>
-void SparseSoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
+void SoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
    this->prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
    outer_dim = this->prob->count(0, axis);
    inner_dim = this->prob->count(axis + 1);
@@ -114,21 +122,25 @@ void SparseSoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
 }

-DEPLOY_CPU(SparseSoftmaxFocalLossGradient);
+DEPLOY_CPU(SoftmaxFocalLossGradient);
 #ifdef WITH_CUDA
-DEPLOY_CUDA(SparseSoftmaxFocalLossGradient);
+DEPLOY_CUDA(SoftmaxFocalLossGradient);
 #endif
-OPERATOR_SCHEMA(SparseSoftmaxFocalLossGradient).NumInputs(3).NumOutputs(1);
+OPERATOR_SCHEMA(SoftmaxFocalLossGradient).NumInputs(3).NumOutputs(1);

-class GetSparseSoftmaxFocalLossGradient final : public GradientMakerBase {
+class GetSoftmaxFocalLossGradient
+    final : public GradientMakerBase {
 public:
-    GRADIENT_MAKER_CTOR(GetSparseSoftmaxFocalLossGradient);
+    GRADIENT_MAKER_CTOR(GetSoftmaxFocalLossGradient);
    vector<OperatorDef> MakeDefs() override {
        return SingleDef(def.type() + "Gradient", "",
            vector<string> {I(0), I(1), GO(0)},
            vector<string> {GI(0)});
    }
 };
-REGISTER_GRADIENT(SparseSoftmaxFocalLoss, GetSparseSoftmaxFocalLossGradient);
+REGISTER_GRADIENT(
+    SoftmaxFocalLoss,
+    GetSoftmaxFocalLossGradient
+);

 }    // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/loss/sparse_softmax_cross_entropy_op.cc
+++ b/Dragon/src/operators/loss/sparse_softmax_cross_entropy_op.cc
-#include "operators/activation/softmax_op.h"
-#include "operators/loss/sparse_softmax_cross_entropy_op.h"
-#include "core/workspace.h"
-#include "utils/math_functions.h"
+#include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
 #include "utils/proto_utils.h"
+#include "operators/activation/softmax_op.h"
+#include "operators/loss/sparse_softmax_cross_entropy_op.h"

 namespace dragon {

@@ -23,11 +23,13 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() {

 template <class Context>
 void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRunFP16() {
-    Tensor* XF32 = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/xf32");
+    Tensor* XF32 = ws()->CreateTensor(
+        "/mnt/" + anchor() + "/softmax/xf32");
    XF32->ReshapeLike(Input(0));
    auto* XdataF16 = Input(0).template data<float16, Context>();
    auto* XdataF32 = XF32->template mutable_data<float, Context>();
-    kernel::TypeA2B<float16, float, Context>(Input(0).count(), XdataF16, XdataF32);
+    kernel::TypeA2B<float16, float, Context>(
+        Input(0).count(), XdataF16, XdataF32);
    OperatorDef softmax_def = MakeOperatorDef("Softmax", "",
        vector<string>({ XF32->name() }),
        vector<string>({ "/mnt/" + anchor() + "/softmax/prob" }));
@@ -35,7 +37,8 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRunFP16() {
    if (def().has_device_option())
        softmax_def.mutable_device_option()
            ->CopyFrom(def().device_option());
-    if (!softmax_op) softmax_op.reset(CreateOperator(softmax_def, ws()));
+    if (!softmax_op) softmax_op.reset(
+        CreateOperator(softmax_def, ws()));
    else softmax_op->MutableOp(softmax_def);
    softmax_op->Run();
 }
@@ -60,13 +63,17 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
        return;
    }

-    Tx normalizer;
-    if (normalization == "VALID")
+    Tx normalizer = 1;
+    if (normalization == "VALID") {
        normalizer = std::max(
-            math::ASum<Tx, Context>(flags.count(), Fdata), (Tx)1.f);
-    else if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
-    else if (normalization == "NONE") normalizer = 1;
+            math::ASum<Tx, Context>(
+                flags.count(), Fdata), (Tx)1.f);
+    } else if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = outer_dim * inner_dim;
+    }
+
    Tx loss = math::ASum<Tx, Context>(losses.count(), Ldata);
    Output(0)->Reshape({ 1 });
    auto* Ydata = Output(0)->template mutable_data<Tx, Context>();
@@ -126,13 +133,17 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
        return;
    }

-    Tx normalizer;
-    if (normalization == "VALID")
+    Tx normalizer = 1;
+    if (normalization == "VALID") {
        normalizer = std::max(
-            math::ASum<Tx, Context>(flags.count(), Fdata), (Tx)1.f);
-    else if (normalization == "BATCH_SIZE") normalizer = Input(0).dim(0);
-    else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
-    else if (normalization == "NONE") normalizer = 1;
+            math::ASum<Tx, Context>(
+                flags.count(), Fdata), (Tx)1.f);
+    } else if (normalization == "BATCH_SIZE") {
+        normalizer = Input(0).dim(0);
+    } else if (normalization == "FULL") {
+        normalizer = outer_dim * inner_dim;
+    }
+
    auto* dYdata = Input(-1).template data<Tx, Context>();
    Tx dYdata_host; ctx().template Copy<Tx, CPUContext, Context>(
        1, &dYdata_host, dYdata);
@@ -167,7 +178,8 @@ DEPLOY_CUDA(SparseSoftmaxCrossEntropyGradient);
 #endif
 OPERATOR_SCHEMA(SparseSoftmaxCrossEntropyGradient).NumInputs(3).NumOutputs(1);

-class GetSparseSoftmaxCrossEntropyGradient final : public GradientMakerBase {
+class GetSparseSoftmaxCrossEntropyGradient
+    final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetSparseSoftmaxCrossEntropyGradient);
    vector<OperatorDef> MakeDefs() override {
@@ -176,6 +188,9 @@ class GetSparseSoftmaxCrossEntropyGradient final : public GradientMakerBase {
            vector<string> {GI(0)});
    }
 };
-REGISTER_GRADIENT(SparseSoftmaxCrossEntropy, GetSparseSoftmaxCrossEntropyGradient);
+REGISTER_GRADIENT(
+    SparseSoftmaxCrossEntropy,
+    GetSparseSoftmaxCrossEntropyGradient
+);

 }    // namespace dragon
\ No newline at end of file
--- a/Dragon/src/operators/misc/accuracy_op.cc
+++ b/Dragon/src/operators/misc/accuracy_op.cc
 #include <algorithm>
-#include "operators/misc/accuracy_op.h"
+
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
 #include "utils/math_functions.h"
-
+#include "operators/misc/accuracy_op.h"

 namespace dragon {


--- a/Dragon/src/operators/misc/astype_op.cc
+++ b/Dragon/src/operators/misc/astype_op.cc
-#include "operators/misc/astype_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/misc/astype_op.h"

 namespace dragon {


--- a/Dragon/src/operators/misc/gradient_op.cc
+++ b/Dragon/src/operators/misc/gradient_op.cc
-#include "operators/misc/gradient_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/misc/gradient_op.h"

 namespace dragon {


--- a/Dragon/src/operators/misc/image_data_op.cc
+++ b/Dragon/src/operators/misc/image_data_op.cc
-#include "operators/misc/image_data_op.h"
 #include "utils/op_kernel.h"
+#include "operators/misc/image_data_op.h"

 namespace dragon {


--- a/Dragon/src/operators/misc/initialize_op.cc
+++ b/Dragon/src/operators/misc/initialize_op.cc
-#include "operators/misc/initialize_op.h"
 #include "core/workspace.h"
+#include "operators/misc/initialize_op.h"

 namespace dragon {


--- a/Dragon/src/operators/misc/python_op.cc
+++ b/Dragon/src/operators/misc/python_op.cc
@@ -26,11 +26,11 @@ RunOp<Context>::RunOp(const OperatorDef& def, Workspace* ws)
    //  init interpreter & load module
    Py_Initialize();
    PyObject* py_module = PyImport_ImportModule(module.c_str());
-    CHECK(py_module) << "\nFail to import py module: " << module;
+    CHECK(py_module) << "\nFailed to Import Module: " << module;
    PyObject* py_dict = PyModule_GetDict(py_module);
    PyObject* py_op = PyDict_GetItemString(py_dict, op.c_str());
-    CHECK(py_op) << "\nFail not import operator: " << op
-                 << " from module: " << module;
+    CHECK(py_op) << "\nFailed to Import Operator: " << op
+                 << " from Module: " << module;
    self = PyObject_CallObject(py_op, NULL);

    //  wrap inputs and outputs
@@ -46,9 +46,22 @@ RunOp<Context>::RunOp(const OperatorDef& def, Workspace* ws)
    PyObject_SetAttr(self, Bytes("param_str_"), CS2Bytes(param_str));

    //  backward compatibility: self.setup(inputs, outputs)
-    if (PyObject_HasAttr(self, Bytes("setup"))) {
-        PyObject_CallMethod(self, "setup", "OO", inputs, outputs);
-    }
+    if (PyObject_HasAttr(self, Bytes("setup")))
+        CHECK(PyObject_CallMethod(
+            self, "setup", "OO", inputs, outputs))
+                << CallMethodHelper("setup");
+}
+
+template <class Context>
+string RunOp<Context>::CallMethodHelper(
+    const string&           method) {
+    std::stringstream ss;
+    ss <<"\nFailed to call: "
+       << "<" + module << "." << op
+       << "." << method << "(*args, **kwargs)>\n"
+       << "This is a FATAL error to terminate "
+       << "<" << name() << ">.";
+    return ss.str();
 }

 template <class Context>
@@ -58,14 +71,20 @@ void RunOp<Context>::RunOnDevice() {

    //  backward compatibility: reshape(inputs, outputs)
    if (PyObject_HasAttr(self, Bytes("reshape"))) {
-        PyObject_CallMethod(self, "reshape", "OO", inputs, outputs);
+        CHECK(PyObject_CallMethod(
+            self, "reshape", "OO", inputs, outputs))
+                << CallMethodHelper("reshape");
    }

    //  overloaded run inferfaces
    if (PyObject_HasAttr(self, Bytes("forward"))) {
-        PyObject_CallMethod(self, "forward", "OO", inputs, outputs);
+        CHECK(PyObject_CallMethod(
+            self, "forward", "OO", inputs, outputs))
+                << CallMethodHelper("forward");
    } else if (PyObject_HasAttr(self, Bytes("run"))) {
-        PyObject_CallMethod(self, "run", "OO", inputs, outputs);
+        CHECK(PyObject_CallMethod(
+            self, "run", "OO", inputs, outputs))
+                << CallMethodHelper("run");
    }
 }

@@ -85,17 +104,20 @@ void TemplateGradientOp<Context>::RunOnDevice() {

    //  backward compatibility: reshape(inputs, outputs)
    if (PyObject_HasAttr(this->self, Bytes("reshape"))) {
-        PyObject_CallMethod(this->self, "reshape",
-            "OO", this->inputs, this->outputs);
+        CHECK(PyObject_CallMethod(this->self, "reshape",
+            "OO", this->inputs, this->outputs))
+                << this->CallMethodHelper("reshape");
    }

    //  overloaded run inferfaces
    if (PyObject_HasAttr(this->self, Bytes("backward"))) {
-        PyObject_CallMethod(this->self, "forward",
-            "OO", this->inputs, this->outputs);
+        CHECK(PyObject_CallMethod(this->self, "backward",
+            "OO", this->inputs, this->outputs))
+                << this->CallMethodHelper("backward");
    } else if (PyObject_HasAttr(this->self, Bytes("grad"))) {
-        PyObject_CallMethod(this->self, "grad",
-            "OO", this->inputs, this->outputs);
+        CHECK(PyObject_CallMethod(this->self, "grad",
+            "OO", this->inputs, this->outputs))
+                << this->CallMethodHelper("grad");
    }
 }


--- a/Dragon/src/operators/mpi/mpi_broadcast_op.cc
+++ b/Dragon/src/operators/mpi/mpi_broadcast_op.cc
-#include "operators/mpi/mpi_broadcast_op.h"
 #include "utils/math_functions.h"
+#include "operators/mpi/mpi_broadcast_op.h"

 #ifdef WITH_MPI


--- a/Dragon/src/operators/mpi/mpi_gather_op.cc
+++ b/Dragon/src/operators/mpi/mpi_gather_op.cc
-#include "operators/mpi/mpi_gather_op.h"
 #include "utils/math_functions.h"
+#include "operators/mpi/mpi_gather_op.h"

 #ifdef WITH_MPI


--- a/Dragon/src/operators/ndarray/arange_op.cc
+++ b/Dragon/src/operators/ndarray/arange_op.cc
-#include "operators/ndarray/arange_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/ndarray/arange_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/argreduce_op.cc
+++ b/Dragon/src/operators/ndarray/argreduce_op.cc
-#include "operators/ndarray/argreduce_op.h"
 #include "utils/op_kernel.h"
+#include "operators/ndarray/argreduce_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/concat_op.cc
+++ b/Dragon/src/operators/ndarray/concat_op.cc
-#include "operators/ndarray/concat_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/ndarray/concat_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/crop_op.cc
+++ b/Dragon/src/operators/ndarray/crop_op.cc
-#include "operators/ndarray/crop_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/ndarray/crop_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/expand_dims_op.cc
+++ b/Dragon/src/operators/ndarray/expand_dims_op.cc
-#include "operators/ndarray/expand_dims_op.h"
 #include "core/workspace.h"
+#include "operators/ndarray/expand_dims_op.h"

 namespace dragon {

@@ -9,7 +9,8 @@ void ExpandDimsOp<Context>::RunOnDevice() {
    if (axis == -1 || axis >= (int)dims.size()) dims.push_back(1);
    else dims.insert(dims.begin() + axis, 1);
    //  save Xshape
-    Tensor* sv = ws()->CreateTensor("/mnt/" + anchor() + "/expand_dims/x_shape");
+    Tensor* sv = ws()->CreateTensor(
+        "/mnt/" + anchor() + "/expand_dims/x_shape");
    sv->Reshape({ (TIndex)Input(0).ndim() });
    auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
    for (int i = 0; i < Input(0).ndim(); i++) Sdata[i] = Input(0).dim(i);
@@ -22,11 +23,14 @@ DEPLOY_CPU(ExpandDims);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(ExpandDims);
 #endif
-OPERATOR_SCHEMA(ExpandDims).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
+OPERATOR_SCHEMA(ExpandDims)
+    .NumInputs(1).NumOutputs(1)
+    .Inplace({ { 0, 0 } });

 template <class Context>
 void ExpandDimsGradientOp<Context>::RunOnDevice() {
-    Tensor* sv = ws()->GetTensor("/mnt/" + anchor() + "/expand_dims/x_shape");
+    Tensor* sv = ws()->GetTensor(
+        "/mnt/" + anchor() + "/expand_dims/x_shape");
    auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
    vector<TIndex> x_shape(sv->count());
    for (int i = 0; i < sv->count(); i++) x_shape[i] = Sdata[i]; 
@@ -39,7 +43,9 @@ DEPLOY_CPU(ExpandDimsGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(ExpandDimsGradient);
 #endif
-OPERATOR_SCHEMA(ExpandDimsGradient).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
+OPERATOR_SCHEMA(ExpandDimsGradient)
+    .NumInputs(1).NumOutputs(1)
+    .Inplace({ { 0, 0 } });

 class GetExpandDimsGradient final : public GradientMakerBase {
 public:

--- a/Dragon/src/operators/ndarray/flatten_op.cc
+++ b/Dragon/src/operators/ndarray/flatten_op.cc
-#include "operators/ndarray/flatten_op.h"
 #include "core/workspace.h"
+#include "operators/ndarray/flatten_op.h"

 namespace dragon {

@@ -28,8 +28,12 @@ void FlattenOp<Context>::KeepRun() {
        << ", can not keep " + keep_axes << " .";
    vector<TIndex> output_dims;
    int i = 0;
-    for (; i < keep_axes - 1; i++) output_dims.push_back(Input(0).dim(i));
-    if (Input(0).count(i) != 1) output_dims.push_back(Input(0).count(i));
+    for (; i < keep_axes - 1; i++)
+        output_dims.push_back(Input(0).dim(i));
+
+    if (Input(0).count(i) != 1)
+        output_dims.push_back(Input(0).count(i));
+
    if (Output(0)->name() != Input(0).name())
        Output(0)->template Copy<Context, Context>(Input(0));
 }
@@ -37,10 +41,12 @@ void FlattenOp<Context>::KeepRun() {
 template <class Context>
 void FlattenOp<Context>::RunOnDevice() {
    //  save Xshape
-    Tensor* sv = ws()->CreateTensor("/mnt/" + anchor() + "/flatten/x_shape");
+    Tensor* sv = ws()->CreateTensor(
+        "/mnt/" + anchor() + "/flatten/x_shape");
    sv->Reshape({ (TIndex)Input(0).ndim() });
    auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
-    for (int i = 0; i < Input(0).ndim(); i++) Sdata[i] = Input(0).dim(i);
+    for (int i = 0; i < Input(0).ndim(); i++) 
+        Sdata[i] = Input(0).dim(i);
    if (keep_axes != INT_MAX) KeepRun();
    else SqueezeRun();
 }
@@ -49,12 +55,15 @@ DEPLOY_CPU(Flatten);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(Flatten);
 #endif
-OPERATOR_SCHEMA(Flatten).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
+OPERATOR_SCHEMA(Flatten)
+    .NumInputs(1).NumOutputs(1)
+    .Inplace({ { 0, 0 } });


 template <class Context>
 void FlattenGradientOp<Context>::RunOnDevice() {
-    Tensor* sv = ws()->GetTensor("/mnt/" + anchor() + "/flatten/x_shape");
+    Tensor* sv = ws()->GetTensor(
+        "/mnt/" + anchor() + "/flatten/x_shape");
    auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
    vector<TIndex> x_shape(sv->count());
    for (int i = 0; i < sv->count(); i++) x_shape[i] = Sdata[i];
@@ -67,7 +76,9 @@ DEPLOY_CPU(FlattenGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(FlattenGradient);
 #endif
-OPERATOR_SCHEMA(FlattenGradient).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
+OPERATOR_SCHEMA(FlattenGradient)
+    .NumInputs(1).NumOutputs(1)
+    .Inplace({ { 0, 0 } });

 class GetFlattenGradient final : public GradientMakerBase {
 public:

--- a/Dragon/src/operators/ndarray/gather_op.cc
+++ b/Dragon/src/operators/ndarray/gather_op.cc
-#include "operators/ndarray/gather_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/ndarray/gather_op.h"

 namespace dragon {

@@ -29,7 +29,8 @@ void GatherOp<Context>::RunOnDevice() {
    inner_dim = Input(0).count(axis + 1);
    Output(0)->Reshape(output_dims);

-    CHECK(Input(1).template IsType<int>()) << "\nThe type of indices should be int32.";
+    CHECK(Input(1).template IsType<int>()) 
+        << "\nThe type of indices should be int32.";
    if (XIsType(Input(0), float)) RunWithType<float>();
    else if (XIsType(Input(0), int)) RunWithType<int>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
@@ -62,7 +63,8 @@ void GatherGradientOp<Context>::RunOnDevice() {
    inner_dim = Input(0).count(axis + 1);
    Output(0)->ReshapeLike(Input(0));

-    CHECK(Input(1).template IsType<int>()) << "\nThe type of indices should be int32.";
+    CHECK(Input(1).template IsType<int>()) 
+        << "\nThe type of indices should be int32.";
    if (XIsType(Input(0), float)) RunWithType<float>();
    else if (XIsType(Input(0), int)) RunWithType<int>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });

--- a/Dragon/src/operators/ndarray/one_hot_op.cc
+++ b/Dragon/src/operators/ndarray/one_hot_op.cc
-#include "operators/ndarray/one_hot_op.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/ndarray/one_hot_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/pad_op.cc
+++ b/Dragon/src/operators/ndarray/pad_op.cc
-#include "operators/ndarray/pad_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/ndarray/pad_op.h"

 namespace dragon {

@@ -53,8 +53,10 @@ void PadOp<Context>::EdgeRunWithType() {
 template <class Context>
 void PadOp<Context>::RunOnDevice() {
    CHECK_EQ(Input(0).ndim(), pad_l.size())
-        << "\nThe padding is performed on " << pad_l.size() << " dimensions, "
-        << "but the num of dimensions of input is " << Input(0).ndim() << ".";
+        << "\nThe padding is performed on "
+        << pad_l.size() << " dimensions, "
+        << "but the num of dimensions of input is "
+        << Input(0).ndim() << ".";

    //  do nothing
    if (process_axes.size() == 0) {
@@ -80,11 +82,15 @@ void PadOp<Context>::RunOnDevice() {
            else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
        } else if (mode == "REFLECT") {
            CHECK_LE(pad_l[axis], dim + 1)
-                << "\nThe dimension of axis " << axis << " is " << dim << ","
-                << "\nwhile the excepted bounds of pad_l for reflecting are (0, " << dim + 1 << "].";
+                << "\nThe dimension of axis " << axis
+                << " is " << dim << ","
+                << "\nwhile the excepted bounds of pad_l "
+                << "for reflecting are (0, " << dim + 1 << "].";
            CHECK_LE(pad_r[axis], dim - 1)
-                << "\nThe dimension of axis " << axis << " is " << dim << ","
-                << "\nwhile the excepted bounds of pad_r for reflecting are (0, " << dim - 1 << "].";
+                << "\nThe dimension of axis " << axis
+                << " is " << dim << ","
+                << "\nwhile the excepted bounds of pad_r "
+                << "for reflecting are (0, " << dim - 1 << "].";
            if (XIsType(Input(0), float)) ReflectRunWithType<float>();
            else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
        } else if (mode == "EDGE")  {
@@ -161,8 +167,10 @@ void PadGradientOp<Context>::EdgeRunWithType() {
 template <class Context>
 void PadGradientOp<Context>::RunOnDevice() {
    CHECK_EQ(Input(0).ndim(), pad_l.size())
-        << "\nThe padding is performed on " << pad_l.size() << " dimensions, "
-        << "but the number of dimensions of input is " << Input(0).ndim() << ".";
+        << "\nThe padding is performed on "
+        << pad_l.size() << " dimensions, "
+        << "but the number of dimensions of input is "
+        << Input(0).ndim() << ".";

    //  do nothing 
    if (process_axes.size() == 0) {
@@ -188,11 +196,15 @@ void PadGradientOp<Context>::RunOnDevice() {
            else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
        } else if (mode == "REFLECT") {
            CHECK_LE(pad_l[axis], dim + 1)
-                << "\nThe dimension of axis " << axis << " is " << dim << ","
-                << "\nwhile the excepted bounds of pad_l for reflecting are (0, " << dim + 1 << "].";
+                << "\nThe dimension of axis " << axis
+                << " is " << dim << ","
+                << "\nwhile the excepted bounds of pad_l "
+                << "for reflecting are (0, " << dim + 1 << "].";
            CHECK_LE(pad_r[axis], dim - 1)
-                << "\nThe dimension of axis " << axis << " is " << dim << ","
-                << "\nwhile the excepted bounds of pad_r for reflecting are (0, " << dim - 1 << "].";
+                << "\nThe dimension of axis " << axis 
+                << " is " << dim << ","
+                << "\nwhile the excepted bounds of pad_r "
+                << "for reflecting are (0, " << dim - 1 << "].";
            if (XIsType(Input(0), float)) ReflectRunWithType<float>();
            else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
        } else if (mode == "EDGE")  {

--- a/Dragon/src/operators/ndarray/random_pick_op.cc
+++ b/Dragon/src/operators/ndarray/random_pick_op.cc
-#include "operators/ndarray/random_pick_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/ndarray/random_pick_op.h"

 namespace dragon {

 template <class Context> template <typename T>
 void RandomPickOp<Context>::RunWithType() {
    auto* indices = pick_indices->template mutable_data<int, CPUContext>();
-    /*
    for (int i = 0; i < pick_indices->count(); i++)
-        indices[i] = int((*ctx().rand_generator())() % x_slice_dim);  */
+        indices[i] = int((*ctx().rand_generator())() % x_slice_dim);

    auto* Xdata = Input(0).template data<T, Context>();
    indices = pick_indices->template mutable_data<int, Context>();
@@ -31,7 +30,8 @@ void RandomPickOp<Context>::RunOnDevice() {
    inner_dim = Input(0).count(axis + 1);
    Output(0)->Reshape(output_dims);

-    pick_indices = ws()->CreateTensor("/mnt/" + anchor() + "/pick/indices");
+    pick_indices = ws()->CreateTensor(
+        "/mnt/" + anchor() + "/pick/indices");
    pick_indices->Reshape({ max_samples });

    if (XIsType(Input(0), float)) RunWithType<float>();
@@ -64,7 +64,8 @@ void RandomPickGradientOp<Context>::RunWithType() {

 template <class Context>
 void RandomPickGradientOp<Context>::RunOnDevice() {
-    pick_indices = ws()->GetTensor("/mnt/" + anchor() + "/pick/indices");
+    pick_indices = ws()->GetTensor(
+        "/mnt/" + anchor() + "/pick/indices");

    x_slice_dim = Input(0).dim(axis);
    y_slice_dim = pick_indices->count();

--- a/Dragon/src/operators/ndarray/reduce_op.cc
+++ b/Dragon/src/operators/ndarray/reduce_op.cc
-#include "operators/ndarray/reduce_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/ndarray/reduce_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/repeat_op.cc
+++ b/Dragon/src/operators/ndarray/repeat_op.cc
-#include "operators/ndarray/repeat_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/ndarray/repeat_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/reshape_op.cc
+++ b/Dragon/src/operators/ndarray/reshape_op.cc
-#include "operators/ndarray/reshape_op.h"
 #include "core/workspace.h"
+#include "operators/ndarray/reshape_op.h"

 namespace dragon {

@@ -32,8 +32,8 @@ void ReshapeOp<Context>::RunOnDevice() {
        if (require_shape[i] == 0) {
            //  handle unchanged dim
            CHECK_LT(i, (int)Xdims.size())
-                << "\nDim(" << i << ") is out of the Xdims range of (0, "
-                << Xdims.size() << ").";
+                << "\nDim(" << i << ") is out of the Xdims "
+                << "range of (0, " << Xdims.size() << ").";
            new_shape[i] = Xdims[i];
        } else if (require_shape[i] > 0) {
            //  handle reseted dim
@@ -41,8 +41,8 @@ void ReshapeOp<Context>::RunOnDevice() {
        } else {
            //  handle inferred dim
            CHECK_EQ(infer_dim, -1)
-                << "\nDim(" << infer_dim << ") required infer before"
-                << "\ncould not infer for dim(" << i << ") both.";
+                << "\nCould not infer Dim( " << infer_dim << "), "
+                << "Dim(" << i << ") both.";
            new_shape[i] = -1;
            infer_dim = i;
        }
@@ -55,7 +55,8 @@ void ReshapeOp<Context>::RunOnDevice() {
            if (new_shape[i] == -1) {
                CHECK_EQ(Input(0).count() % total_count, 0)
                    << "\nCan not change the total size: "
-                    << Input(0).DimString() << " -> " << DimString(new_shape);
+                    << Input(0).DimString()
+                    << " -> " << DimString(new_shape);
                new_shape[i] = Input(0).count() / total_count;
                total_count *= new_shape[i];
                break;
@@ -64,9 +65,11 @@ void ReshapeOp<Context>::RunOnDevice() {
    }
    CHECK_EQ(total_count, Input(0).count())
        << "\nCan not change the total size."
-        << Input(0).DimString() << " -> " << DimString(new_shape);
+        << Input(0).DimString()
+        << " -> " << DimString(new_shape);
    //  save Xshape
-    Tensor* sv = ws()->CreateTensor("/mnt/" + anchor() + "/reshape/x_shape");
+    Tensor* sv = ws()->CreateTensor(
+        "/mnt/" + anchor() + "/reshape/x_shape");
    sv->Reshape({ (TIndex)Input(0).ndim() });
    auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
    for (int i = 0; i < Input(0).ndim(); i++) Sdata[i] = Input(0).dim(i);
@@ -79,12 +82,15 @@ DEPLOY_CPU(Reshape);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(Reshape);
 #endif
-OPERATOR_SCHEMA(Reshape).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
+OPERATOR_SCHEMA(Reshape)
+    .NumInputs(1).NumOutputs(1)
+    .Inplace({ { 0, 0 } });


 template <class Context>
 void ReshapeGradientOp<Context>::RunOnDevice() {
-    Tensor* sv = ws()->GetTensor("/mnt/" + anchor() + "/reshape/x_shape");
+    Tensor* sv = ws()->GetTensor(
+        "/mnt/" + anchor() + "/reshape/x_shape");
    auto* Sdata = sv->template mutable_data<TIndex, CPUContext>();
    vector<TIndex> x_shape(sv->count());
    for (int i = 0; i < sv->count(); i++) x_shape[i] = Sdata[i];

--- a/Dragon/src/operators/ndarray/slice_op.cc
+++ b/Dragon/src/operators/ndarray/slice_op.cc
-#include "operators/ndarray/slice_op.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/ndarray/slice_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/stack_op.cc
+++ b/Dragon/src/operators/ndarray/stack_op.cc
-#include "operators/ndarray/stack_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/ndarray/stack_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/tile_op.cc
+++ b/Dragon/src/operators/ndarray/tile_op.cc
-#include "operators/ndarray/tile_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/ndarray/tile_op.h"

 namespace dragon {


--- a/Dragon/src/operators/ndarray/transpose_op.cc
+++ b/Dragon/src/operators/ndarray/transpose_op.cc
-#include "operators/ndarray/transpose_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/ndarray/transpose_op.h"

 namespace dragon {


--- a/Dragon/src/operators/norm/batch_norm_op.cc
+++ b/Dragon/src/operators/norm/batch_norm_op.cc
-#include "operators/norm/batch_norm_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/filler.h"
+#include "utils/math_functions.h"
+#include "operators/norm/batch_norm_op.h"

 namespace dragon {


--- a/Dragon/src/operators/norm/batch_renorm_op.cc
+++ b/Dragon/src/operators/norm/batch_renorm_op.cc
-#include "operators/norm/batch_renorm_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/filler.h"
+#include "utils/math_functions.h"
+#include "operators/norm/batch_renorm_op.h"

 namespace dragon {


--- a/Dragon/src/operators/norm/cudnn_batch_norm_op.cc
+++ b/Dragon/src/operators/norm/cudnn_batch_norm_op.cc
-#include "operators/norm/batch_norm_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
+#include "operators/norm/batch_norm_op.h"

 #ifdef WITH_CUDNN


--- a/Dragon/src/operators/norm/fused_batch_norm.cc
+++ b/Dragon/src/operators/norm/fused_batch_norm.cc
-#include "operators/norm/batch_norm_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/filler.h"
+#include "utils/math_functions.h"
+#include "operators/norm/batch_norm_op.h"

 namespace dragon {


--- a/Dragon/src/operators/norm/fused_group_norm.cc
+++ b/Dragon/src/operators/norm/fused_group_norm.cc
-#include "operators/norm/group_norm_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/filler.h"
+#include "utils/math_functions.h"
+#include "operators/norm/group_norm_op.h"

 namespace dragon {


--- a/Dragon/src/operators/norm/group_norm_op.cc
+++ b/Dragon/src/operators/norm/group_norm_op.cc
-#include "operators/norm/group_norm_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/filler.h"
+#include "utils/math_functions.h"
+#include "operators/norm/group_norm_op.h"

 namespace dragon {


--- a/Dragon/src/operators/norm/instance_norm_op.cc
+++ b/Dragon/src/operators/norm/instance_norm_op.cc
-#include "operators/norm/instance_norm_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/filler.h"
+#include "utils/math_functions.h"
+#include "operators/norm/instance_norm_op.h"

 namespace dragon {


--- a/Dragon/src/operators/norm/l2_norm_op.cc
+++ b/Dragon/src/operators/norm/l2_norm_op.cc
-#include "operators/norm/l2_norm_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/norm/l2_norm_op.h"

 namespace dragon {

@@ -14,7 +14,8 @@ void L2NormOp<Context>::RunWithType() {
    buffer.Reshape(dims);

    //  normalize by inner_dim independently if not across it
-    norm = ws()->CreateTensor("/mnt/" + anchor() + "/l2norm/normalizer");
+    norm = ws()->CreateTensor(
+        "/mnt/" + anchor() + "/l2norm/normalizer");
    dims = Input(0).dims();
    for (int i = axis; i < end_axis; i++) dims[i] = 1;
    norm->Reshape(dims);

--- a/Dragon/src/operators/recurrent/cudnn_recurrent_op.cc
+++ b/Dragon/src/operators/recurrent/cudnn_recurrent_op.cc
-#include "operators/recurrent/cudnn_recurrent_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
+#include "operators/recurrent/cudnn_recurrent_op.h"

 #ifdef WITH_CUDNN


--- a/Dragon/src/operators/recurrent/lstm_cell_op.cc
+++ b/Dragon/src/operators/recurrent/lstm_cell_op.cc
-#include "operators/recurrent/lstm_cell_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/recurrent/lstm_cell_op.h"

 namespace dragon {

@@ -12,8 +12,8 @@ void LSTMCellOp<Context>::RunWithType() {
    auto* Hdata = Output(0)->template mutable_data<T, Context>();
    auto* Cdata = Output(1)->template mutable_data<T, Context>();

-    kernel::LSTMCell<T, Context>(Input(1).count(),
-        Input(1).dim(0), Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
+    kernel::LSTMCell<T, Context>(Input(1).count(), Input(1).dim(0),
+        Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
            CXdata, XAdata, Cdata, Hdata);
 }

@@ -42,8 +42,8 @@ void LSTMCellGradientOp<Context>::RunWithType() {
    auto* dXdata = Output(0)->template mutable_data<T, Context>();
    auto* dCXdata = Output(1)->template mutable_data<T, Context>();

-    kernel::LSTMCellGrad<T, Context>(Input(1).count(),
-        Input(1).dim(0), Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
+    kernel::LSTMCellGrad<T, Context>(Input(1).count(), Input(1).dim(0),
+        Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
            CXdata, XAdata, Cdata, dCdata, dHdata, dCXdata, dXdata);
 }


--- a/Dragon/src/operators/recurrent/recurrent_op.cc
+++ b/Dragon/src/operators/recurrent/recurrent_op.cc
-#include "operators/recurrent/recurrent_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/recurrent/recurrent_op.h"

 namespace dragon {


--- a/Dragon/src/operators/recurrent/rnn_param_op.cc
+++ b/Dragon/src/operators/recurrent/rnn_param_op.cc
-#include "operators/recurrent/rnn_param_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/recurrent/rnn_param_op.h"

 namespace dragon {


--- a/Dragon/src/operators/update/adam_update_op.cc
+++ b/Dragon/src/operators/update/adam_update_op.cc
-#include "operators/update/adam_update_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/update/adam_update_op.h"

 namespace dragon {


--- a/Dragon/src/operators/update/collective_update_op.cc
+++ b/Dragon/src/operators/update/collective_update_op.cc
-#include "operators/update/collective_update_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
+#include "operators/update/collective_update_op.h"

 namespace dragon {


--- a/Dragon/src/operators/update/moving_average_op.cc
+++ b/Dragon/src/operators/update/moving_average_op.cc
-#include "operators/update/moving_average_op.h"
 #include "utils/math_functions.h"
+#include "operators/update/moving_average_op.h"

 namespace dragon {


--- a/Dragon/src/operators/update/nesterov_update_op.cc
+++ b/Dragon/src/operators/update/nesterov_update_op.cc
-#include "operators/update/nesterov_update_op.h"
 #include "core/workspace.h"
 #include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "operators/update/nesterov_update_op.h"

 namespace dragon {


--- a/Dragon/src/operators/update/rmsprop_update_op.cc
+++ b/Dragon/src/operators/update/rmsprop_update_op.cc
-#include "operators/update/rmsprop_update_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/update/rmsprop_update_op.h"

 namespace dragon {


--- a/Dragon/src/operators/update/sgd_update_op.cc
+++ b/Dragon/src/operators/update/sgd_update_op.cc
-#include "operators/update/sgd_update_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/update/sgd_update_op.h"

 namespace dragon {


--- a/Dragon/src/operators/update/update_op_base.cc
+++ b/Dragon/src/operators/update/update_op_base.cc
-#include "operators/update/update_op_base.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/cast.h"
+#include "utils/math_functions.h"
+#include "operators/update/update_op_base.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/bias_add_op.cc
+++ b/Dragon/src/operators/vision/bias_add_op.cc
-#include "operators/vision/bias_add_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
 #include "utils/op_kernel.h"
+#include "operators/vision/bias_add_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/bilinear_resize_op.cc
+++ b/Dragon/src/operators/vision/bilinear_resize_op.cc
-#include "operators/vision/bilinear_resize_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/vision/bilinear_resize_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/conv2d_op.cc
+++ b/Dragon/src/operators/vision/conv2d_op.cc
-#include "operators/vision/conv_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
+#include "operators/vision/conv_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/conv2d_transpose_op.cc
+++ b/Dragon/src/operators/vision/conv2d_transpose_op.cc
-#include "operators/vision/conv_transpose_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
+#include "operators/vision/conv_transpose_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/conv_op_base.cc
+++ b/Dragon/src/operators/vision/conv_op_base.cc
-#include "operators/vision/conv_op_base.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
+#include "operators/vision/conv_op_base.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/cudnn_conv2d_op.cc
+++ b/Dragon/src/operators/vision/cudnn_conv2d_op.cc
 #ifdef WITH_CUDNN

-#include "operators/vision/conv_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
 #include "utils/op_kernel.h"
+#include "operators/vision/conv_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
+++ b/Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
 #ifdef WITH_CUDNN

-#include "operators/vision/conv_transpose_op.h"
 #include "core/workspace.h"
 #include "utils/filler.h"
 #include "utils/op_kernel.h"
+#include "operators/vision/conv_transpose_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/dense_concat_op.cc
+++ b/Dragon/src/operators/vision/dense_concat_op.cc
-#include "operators/vision/dense_concat_op.h"
 #include "core/workspace.h"
 #include "utils/op_kernel.h"
+#include "operators/vision/dense_concat_op.h"

 namespace dragon {

@@ -12,7 +12,8 @@ OPERATOR_SCHEMA(DenseConcat).NumInputs(2).NumOutputs(1);

 template <class Context> template <typename T>
 void DenseConcatGradientOp<Context>::RestoreX1() {
-    CHECK_GT(growth_rate, 0) << "\nInvalid growth rate, please preset it.";
+    CHECK_GT(growth_rate, 0)
+        << "\nInvalid growth rate, please preset it.";
    this->concat_dims = Input(-1).dims();
    this->y_concat_dim = this->concat_dims[this->axis];
    this->outer_dim = Input(-1).count(0, this->axis);

--- a/Dragon/src/operators/vision/lrn_op.cc
+++ b/Dragon/src/operators/vision/lrn_op.cc
-#include "operators/vision/lrn_op.h"
+#include "core/workspace.h"
+#include "utils/math_functions.h"
 #include "operators/arithmetic/pow_op.h"
 #include "operators/arithmetic/eltwise_op.h"
+#include "operators/vision/lrn_op.h"
 #include "operators/vision/pooling_op.h"
-#include "core/workspace.h"
-#include "utils/math_functions.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/nn_resize_op.cc
+++ b/Dragon/src/operators/vision/nn_resize_op.cc
-#include "operators/vision/nn_resize_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/vision/nn_resize_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/pooling2d_op.cc
+++ b/Dragon/src/operators/vision/pooling2d_op.cc
-#include "operators/vision/pooling_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/vision/pooling_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/roi_align_op.cc
+++ b/Dragon/src/operators/vision/roi_align_op.cc
-#include "operators/vision/roi_align_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/vision/roi_align_op.h"

 namespace dragon {


--- a/Dragon/src/operators/vision/roi_pooling_op.cc
+++ b/Dragon/src/operators/vision/roi_pooling_op.cc
-#include "operators/vision/roi_pooling_op.h"
 #include "core/workspace.h"
-#include "utils/math_functions.h"
 #include "utils/op_kernel.h"
+#include "utils/math_functions.h"
+#include "operators/vision/roi_pooling_op.h"

 namespace dragon {


--- a/Dragon/src/utils/math_functions.cu
+++ b/Dragon/src/utils/math_functions.cu
@@ -195,7 +195,7 @@ __global__ void _Exp(
    const T*                a,
    T*                      y) {
    CUDA_KERNEL_LOOP(idx, n) {
-        y[idx] = std::exp(a[idx]);
+        y[idx] = exp(a[idx]);
    }
 }

@@ -214,7 +214,7 @@ __global__ void _Log(
    const T*                a,
    T*                      y) {
    CUDA_KERNEL_LOOP(idx, n) {
-        y[idx] = std::log(a[idx]);
+        y[idx] = log(a[idx]);
    }
 }

@@ -252,7 +252,7 @@ __global__ void _Sqrt(
    const T*                x,
    T*                      y) {
    CUDA_KERNEL_LOOP(idx, n) {
-        y[idx] = std::sqrt(x[idx]);
+        y[idx] = sqrt(x[idx]);
    }
 }

@@ -272,7 +272,7 @@ __global__ void _Pow(
    const T*                a,
    T*                      y) {
    CUDA_KERNEL_LOOP(idx, n) {
-        y[idx] = std::pow(a[idx], alpha);
+        y[idx] = pow(a[idx], alpha);
    }
 }


--- a/Dragon/src/utils/op_kernel.cc
+++ b/Dragon/src/utils/op_kernel.cc
@@ -512,40 +512,135 @@ template<> void AbsGrad<float, CPUContext>(

 template <> void SigmoidCrossEntropy<float, CPUContext>(
    const int               count,
-    const float*            x,
-    const float*            target,
-    float*                  loss,
-    float*                  valid) {
+    const float*            logits,
+    const float*            targets,
+    float*                  losses,
+    float*                  flags,
+    CPUContext*             ctx) {
 #ifdef WITH_OMP
    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
 #endif
    for (int i = 0; i < count; ++i) {
-        if (target[i] < 0) {
-            loss[i] = valid[i] = 0.;
+        if (targets[i] < 0) {
+            losses[i] = flags[i] = 0;
        } else {
-            loss[i] = std::log(
-                1 + std::exp(x[i] - 2 * x[i] * (x[i] >= 0))
-            ) + x[i] * ((x[i] >= 0) - target[i]);
-            valid[i] = 1.;
+            losses[i] = std::log(
+                1 + std::exp(logits[i] - 2 * logits[i] * (logits[i] >= 0))
+            ) + logits[i] * ((logits[i] >= 0) - targets[i]);
+            flags[i] = 1;
        }
    }
 }

 template <> void SigmoidCrossEntropyGrad<float, CPUContext>(
    const int               count,
-    const float*            x,
-    const float*            target,
-    float*                  dx,
-    float*                  valid) {
+    const float*            logits,
+    const float*            targets,
+    float*                  dlogits,
+    float*                  flags,
+    CPUContext*             ctx) {
 #ifdef WITH_OMP
    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
 #endif
    for (int i = 0; i < count; ++i) {
-        if (target[i] < 0) {
-            dx[i] = valid[i] = 0.;
+        if (targets[i] < 0) {
+            dlogits[i] = flags[i] = 0;
        } else {
-            dx[i] = 1. / (1. + expf(-x[i])) - target[i];
-            valid[i] = 1.;
+            dlogits[i] = 1 / (1 + std::exp(-logits[i])) - targets[i];
+            flags[i] = 1;
+        }
+    }
+}
+
+/******************** loss.sigmoid_focal_loss ********************/
+
+template <> void SigmoidFocalLoss<float, CPUContext>(  //  float/CPUContext specialization: one loss value per logit
+    const int               outer_dim,  //  logits, losses and flags are indexed [outer, axis, inner]
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,  //  weight applied to the matched-class (positive) term
+    const float             neg_alpha,  //  weight applied to the unmatched-class (negative) term
+    const float             gamma,  //  exponent of the (1 - p) / p modulating factors
+    const int               neg_id,  //  non-zero: class id == aix; zero: class id == aix + 1
+    const float*            logits,
+    const float*            targets,  //  indexed [outer, inner]; a target of -1 contributes to neither term
+    float*                  losses,
+    float*                  flags,  //  receives 1 where the target matches the class, else 0
+    CPUContext*             ctx) {  //  ctx is not referenced in this body
+    for (int oix = 0; oix < outer_dim; ++oix) {
+        for (int aix = 0; aix < axis_dim; ++aix) {
+            TIndex offset = oix * axis_dim + aix;
+            for (int iix = 0; iix < inner_dim; ++iix) {
+                const TIndex i = offset * inner_dim + iix;
+                const int t = targets[oix * inner_dim + iix];  //  float target truncated to an integer class id
+                //  ``0`` is reserved for targets if neg id is zero
+                //  use ``aix + 1`` to match the targets
+                float c1 = (t == (aix + (neg_id ? 0 : 1)));  //  1 when this class is the target
+                float c2 = (t != -1) & (t != (aix + (neg_id ? 0 : 1)));  //  1 when the target is some other class
+                float p = 1 / (1 + std::exp(-logits[i]));  //  logit -> prob
+
+                //  (1 - p)^{gamma} * log(p)
+                float pos_term = std::pow(1 - p, gamma) * (
+                    std::log(std::max(p, FLT_MIN))  //  clamp keeps log() finite when p underflows to 0
+                );
+    
+                //  p^{gamma} * log(1 - p)
+                float neg_term = std::pow(p, gamma) * (
+                    -logits[i] * (logits[i] >= 0) - std::log(  //  log(1 - p) written in terms of the logit
+                        1 + std::exp(logits[i] - 2 * logits[i] * (logits[i] >= 0)))  //  exp() argument is -|logit|, never positive
+                );
+
+                losses[i] = 0.0;
+                losses[i] += -c1 * pos_term * pos_alpha;
+                losses[i] += -c2 * neg_term * neg_alpha;
+                flags[i] = c1;
+            }
+        }
+    }
+}
+
+template <> void SigmoidFocalLossGradient<float, CPUContext>(  //  float/CPUContext specialization: d(loss)/d(logit) per element
+    const int               outer_dim,  //  logits, dlogits and flags are indexed [outer, axis, inner]
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,  //  weight applied to the matched-class (positive) term
+    const float             neg_alpha,  //  weight applied to the unmatched-class (negative) term
+    const float             gamma,  //  exponent of the (1 - p) / p modulating factors
+    const int               neg_id,  //  non-zero: class id == aix; zero: class id == aix + 1
+    const float*            logits,
+    const float*            targets,  //  indexed [outer, inner]; a target of -1 contributes to neither term
+    float*                  dlogits,
+    float*                  flags,  //  receives 1 where the target matches the class, else 0
+    CPUContext*             ctx) {  //  ctx is not referenced in this body
+    for (int oix = 0; oix < outer_dim; ++oix) {
+        for (int aix = 0; aix < axis_dim; ++aix) {
+            TIndex offset = oix * axis_dim + aix;
+            for (int iix = 0; iix < inner_dim; ++iix) {
+                const TIndex i = offset * inner_dim + iix;
+                const int t = targets[oix * inner_dim + iix];  //  float target truncated to an integer class id
+                //  ``0`` is reserved for targets if neg id is zero
+                //  use ``aix + 1`` to match the targets
+                float c1 = (t == (aix + (neg_id ? 0 : 1)));  //  1 when this class is the target
+                float c2 = (t != -1) & (t != (aix + (neg_id ? 0 : 1)));  //  1 when the target is some other class
+                float p = 1 / (1 + std::exp(-logits[i]));  //  logit -> prob
+
+                // (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
+                float pos_term = std::pow((1 - p), gamma) * (
+                    1 - p - p * gamma * std::log(std::max(p, FLT_MIN))  //  clamp keeps log() finite when p underflows to 0
+                );
+
+                // p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
+                float neg_term = std::pow(p, gamma) * (
+                    (-logits[i] * (logits[i] >= 0) - std::log(  //  std:: keeps the float overloads, as in the forward kernel
+                        1 + std::exp(logits[i] - 2 * logits[i] * (logits[i] >= 0)))  //  exp() argument is -|logit|, never positive
+                    ) * (1 - p) * gamma - p
+                );
+
+                dlogits[i] = 0.0;
+                dlogits[i] += -c1 * pos_term * pos_alpha;
+                dlogits[i] += -c2 * neg_term * neg_alpha;
+                flags[i] = c1;
+            }
        }
    }
 }
@@ -600,6 +695,95 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(
    }
 }

+/******************** loss.softmax_focal_loss ********************/
+
+//  Softmax focal loss, forward pass (float / CPUContext specialization).
+//
+//  For every (oix, iix) position the label is read from
+//  ``labels[oix * inner_dim + iix]`` and truncated to int.
+//  If it equals an entry of ``ignores``, ``losses`` and ``flags`` at that
+//  position are set to 0. Otherwise, with
+//      t = (oix * axis_dim + label) * inner_dim + iix
+//  the outputs are
+//      losses = -alpha * (1 - prob[t])^{gamma} * log(max(prob[t], FLT_MIN))
+//      flags  = label > neg_id ? 1 : 0
+//  where alpha is ``pos_alpha`` if label > neg_id and ``neg_alpha`` otherwise.
+//  ``ctx`` is not referenced by this specialization.
+template <> void SoftmaxFocalLoss<float, CPUContext>(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const float*            prob,
+    const float*            labels,
+    const int*              ignores,
+    const int               num_ignores,
+    float*                  losses,
+    float*                  flags,
+    CPUContext*             ctx) {
+    for (int oix = 0; oix < outer_dim; ++oix) {
+        for (int iix = 0; iix < inner_dim; ++iix) {
+            const int idx = oix * inner_dim + iix;
+            const int label = labels[idx];
+            //  k stops before num_ignores only if the label is ignored
+            int k;
+            for (k = 0; k < num_ignores; ++k) {
+                if (label == ignores[k]) {
+                    losses[idx] = flags[idx] = 0;
+                    break;
+                }
+            }
+            if (k == num_ignores) {
+                const int t = (oix * axis_dim + label) * inner_dim + iix;
+                //  clamp the probability of the labeled class so that
+                //  log() never receives zero; this previously read the
+                //  variable being declared instead of prob[t]
+                float labeled_prob = std::max(prob[t], FLT_MIN);
+                float scale = std::pow((1.f - prob[t]), gamma);
+                scale = label > neg_id ?
+                    pos_alpha * scale :  neg_alpha * scale;
+                losses[idx] = -scale * std::log(labeled_prob);
+                flags[idx] = label > neg_id ? 1 : 0;
+            }
+        }
+    }
+}
+
+//  Softmax focal loss, backward pass (float / CPUContext specialization).
+//
+//  For every (oix, iix) position the label is read from
+//  ``labels[oix * inner_dim + iix]`` and truncated to int.
+//  If it equals an entry of ``ignores``, ``dx`` is zeroed for all
+//  ``axis_dim`` channels of that position. Otherwise ``dx`` is written
+//  for every channel c at (oix * axis_dim + c) * inner_dim + iix.
+//
+//  ``flags[0]`` is reset to 0 and then incremented once per non-ignored
+//  position whose label is greater than ``neg_id``; no other element of
+//  ``flags`` is written here.
+//  NOTE(review): the CUDA kernel _SoftmaxFocalLossGrad writes flags[idx]
+//  per position instead - confirm which layout the caller expects.
+//  ``ctx`` is not referenced by this specialization.
+template<> void SoftmaxFocalLossGrad<float, CPUContext>(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const float*            prob,
+    const float*            labels,
+    const int*              ignores,
+    const int               num_ignores,
+    float*                  dx,
+    float*                  flags,
+    CPUContext*             ctx) {
+    flags[0] = 0;
+    for (int oix = 0; oix < outer_dim; ++oix) {
+        for (int iix = 0; iix < inner_dim; ++iix) {
+            const int label = labels[oix * inner_dim + iix];
+            //  k stops before num_ignores only if the label is ignored
+            int k;
+            for (k = 0; k < num_ignores; ++k)
+                if (label == ignores[k]) break;
+            if (k != num_ignores) {
+                for (int c = 0; c < axis_dim; ++c)
+                    dx[(oix * axis_dim + c) * inner_dim + iix] = 0;
+            } else {
+                //  t addresses the probability of the labeled class
+                const int t = (oix * axis_dim + label) * inner_dim + iix;
+                float onemp = 1. - prob[t];
+                //  unstable if gamma is 0
+                //  grad = (1 - p)^{gamma}
+                //         - gamma * (1 - p)^{gamma - 1} * p * log(p)
+                float grad = -gamma * pow(onemp, gamma - 1)
+                                    * log(std::max(prob[t], FLT_MIN))
+                                    * prob[t] + pow(onemp, gamma);
+                grad = label > neg_id ?
+                    pos_alpha * grad : neg_alpha * grad;
+                //  labeled channel gets grad * (p - 1),
+                //  every other channel gets grad * prob of that channel
+                for (int c = 0; c < axis_dim; ++c) {
+                    const int i_ = (oix * axis_dim + c) * inner_dim + iix;
+                    if (c == label) {
+                        dx[i_] = grad * (prob[t] - 1);
+                    } else {
+                        dx[i_] = grad * prob[i_];
+                    }
+                }
+                if (label > neg_id) flags[0]++;
+            }
+        }
+    }
+}
+
 /******************** loss.sparse_softmax_cross_entropy ********************/

 template <typename Tx, typename Ty>
@@ -730,95 +914,6 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>(
                num_ignores, dx, flags);
 }

-/******************** loss.sparse_softmax_focal_loss ********************/
-
-template <> void SparseSoftmaxFocalLoss<float, CPUContext>(
-    const int               outer_dim,
-    const int               axis_dim,
-    const int               inner_dim,
-    const float             pos_alpha,
-    const float             neg_alpha,
-    const float             gamma,
-    const int               neg_id,
-    const float*            prob,
-    const float*            labels,
-    const int*              ignores,
-    const int               num_ignores,
-    float*                  losses,
-    float*                  flags,
-    CPUContext*             ctx) {
-    for (int oix = 0; oix < outer_dim; ++oix) {
-        for (int iix = 0; iix < inner_dim; ++iix) {
-            const int idx = oix * inner_dim + iix;
-            const int label = labels[idx];
-            int k;
-            for (k = 0; k < num_ignores; ++k) {
-                if (label == ignores[k]) {
-                    losses[idx] = flags[idx] = 0;
-                    break;
-                }
-            }
-            if (k == num_ignores) {
-                const int t = (oix * axis_dim + label) * inner_dim + iix;
-                float labeled_prob = std::max(labeled_prob, FLT_MIN);
-                float scale = std::pow((1.f - prob[t]), gamma);
-                scale = label > neg_id ?
-                    pos_alpha * scale :  neg_alpha * scale;
-                losses[idx] = -scale * std::log(labeled_prob);
-                flags[idx] = label > neg_id ? 1 : 0;
-            }
-        }
-    }
-}
-
-template<> void SparseSoftmaxFocalLossGrad<float, CPUContext>(
-    const int               outer_dim,
-    const int               axis_dim,
-    const int               inner_dim,
-    const float             pos_alpha,
-    const float             neg_alpha,
-    const float             gamma,
-    const int               neg_id,
-    const float*            prob,
-    const float*            labels,
-    const int*              ignores,
-    const int               num_ignores,
-    float*                  dx,
-    float*                  flags,
-    CPUContext*             ctx) {
-    flags[0] = 0;
-    for (int oix = 0; oix < outer_dim; ++oix) {
-        for (int iix = 0; iix < inner_dim; ++iix) {
-            const int label = labels[oix * inner_dim + iix];
-            int k;
-            for (k = 0; k < num_ignores; ++k)
-                if (label == ignores[k]) break;
-            if (k != num_ignores) {
-                for (int c = 0; c < axis_dim; ++c)
-                    dx[(oix * axis_dim + c) * inner_dim + iix] = 0;
-            } else {
-                const int t = (oix * axis_dim + label) * inner_dim + iix;
-                float onemp = 1. - prob[t];
-                //  unstable if gamma is 0
-                float grad = -gamma * pow(onemp, gamma - 1)
-                                    * log(std::max(prob[t], FLT_MIN))
-                                    * prob[t] + pow(onemp, gamma);
-                grad = label > neg_id ?
-                    pos_alpha * grad : neg_alpha * grad;
-                for (int c = 0; c < axis_dim; ++c) {
-                    const int i_ = (oix * axis_dim + c) * inner_dim + iix;
-                    if (c == label) {
-                        dx[i_] = grad * (prob[t] - 1);
-                    } else {
-                        dx[i_] = grad * prob[i_];
-                    }
-                }
-                if (label > neg_id) flags[0]++;
-            }
-        }
-    }
-}
-
 /******************** misc.astype ********************/

 template <typename Ta, typename Tb>

--- a/Dragon/src/utils/op_kernel.cu
+++ b/Dragon/src/utils/op_kernel.cu
@@ -282,7 +282,7 @@ __global__ void _Elu(
    T*                      y) {
    CUDA_KERNEL_LOOP(idx, count) {
        y[idx] = x[idx] > 0 ? x[idx] :
-            alpha * (std::exp(x[idx]) - 1);
+            alpha * (exp(x[idx]) - 1);
    }
 }

@@ -378,7 +378,7 @@ __global__ void _SElu(
    T*                      y) {
    CUDA_KERNEL_LOOP(idx, count) {
        y[idx] = x[idx] > 0 ? 1.0507 * x[idx] :
-            1.7581 * (std::exp(x[idx]) - 1);
+            1.7581 * (exp(x[idx]) - 1);
    }
 }

@@ -500,7 +500,7 @@ __global__ void _SoftmaxExp(
    const int               count,
    T*                      y) {
    CUDA_KERNEL_LOOP(idx, count) {
-        y[idx] = std::exp(y[idx]);
+        y[idx] = exp(y[idx]);
    }
 }

@@ -619,7 +619,7 @@ __global__ void _Tanh(
    const T*                x,
    T*                      y) {
    CUDA_KERNEL_LOOP(i, count) {
-        y[i] = std::tanh(x[i]);
+        y[i] = tanh(x[i]);
    }
 }

@@ -804,59 +804,192 @@ template<> void AbsGrad<float, CUDAContext>(
 template <typename T>
 __global__ void _SigmoidCrossEntropy(
    const int               count,
-    const T*                x,
-    const T*                target,
-    T*                      loss,
-    T*                      valid) {
+    const T*                logits,
+    const T*                targets,
+    T*                      losses,
+    T*                      flags) {
    CUDA_KERNEL_LOOP(idx, count) {
-        if (target[idx] < 0) {
-            loss[idx] = valid[idx] = 0.;
+        if (targets[idx] < 0) {
+            losses[idx] = flags[idx] = 0;
        } else {
-            loss[idx] = std::log(1 +
-                std::exp(x[idx] - 2 * x[idx] * (x[idx] >= 0))
-            ) + x[idx] * ((x[idx] >= 0) - target[idx]);
-            valid[idx] = 1.;
+            losses[idx] = log(1 +
+                exp(logits[idx] - 2 * logits[idx] * (logits[idx] >= 0))
+            ) + logits[idx] * ((logits[idx] >= 0) - targets[idx]);
+            flags[idx] = 1;
        }
    }
 }

 template <> void SigmoidCrossEntropy<float, CUDAContext>(
    const int               count,
-    const float*            x,
-    const float*            target,
-    float*                  loss,
-    float*                  valid) {
+    const float*            logits,
+    const float*            targets,
+    float*                  losses,
+    float*                  flags,
+    CUDAContext*            ctx) {
    _SigmoidCrossEntropy<float>
-        << <CUDA_BLOCKS(count), CUDA_THREADS >> >(
-            count, x, target, loss, valid);
+        << <CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >> >(
+                count, logits, targets, losses, flags);
 }

 template <typename T>
 __global__ void _SigmoidCrossEntropyGrad(
    const int               count,
-    const T*                x,
-    const T*                target,
-    T*                      dx,
-    T*                      valid) {
+    const T*                logits,
+    const T*                targets,
+    T*                      dlogits,
+    T*                      flags) {
    CUDA_KERNEL_LOOP(idx, count) {
-        if (target[idx] < 0) {
-            dx[idx] = valid[idx] = 0.;
+        if (targets[idx] < 0) {
+            dlogits[idx] = flags[idx] = 0;
        } else {
-            dx[idx] = 1. / (1. + expf(-x[idx])) - target[idx];
-            valid[idx] = 1.;
+            dlogits[idx] = 1 / (1 + exp(-logits[idx])) - targets[idx];
+            flags[idx] = 1;
        }
    }
 }

 template <> void SigmoidCrossEntropyGrad<float, CUDAContext>(
    const int               count,
-    const float*            x,
-    const float*            target,
-    float*                  dx,
-    float*                  valid) {
+    const float*            logits,
+    const float*            targets,
+    float*                  dlogits,
+    float*                  flags,
+    CUDAContext*            ctx) {
    _SigmoidCrossEntropyGrad<float>
-        << <CUDA_BLOCKS(count), CUDA_THREADS >> >(
-            count, x, target, dx, valid);
+        << <CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >> >(
+                count, logits, targets, dlogits, flags);
+}
+
+/******************** loss.sigmoid_focal_loss ********************/
+
+//  Device kernel: sigmoid focal loss, forward pass.
+//
+//  Each ``idx`` produced by CUDA_KERNEL_LOOP is split into
+//  (oix, aix, iix) using ``axis_dim`` and ``inner_dim``;
+//  ``logits`` / ``losses`` / ``flags`` are addressed by ``idx`` and
+//  ``targets`` by ``oix * inner_dim + iix`` (value truncated to int).
+//
+//  Per element, with p = sigmoid(logit):
+//    losses = -c1 * pos_alpha * pos_term - c2 * neg_alpha * neg_term
+//    flags  = c1
+//  where c1 / c2 are the 0/1 indicators computed inside the loop.
+template <typename T>
+__global__ void _SigmoidFocalLoss(
+    const int               count,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const T*                logits,
+    const T*                targets,
+    T*                      losses,
+    T*                      flags) {
+    CUDA_KERNEL_LOOP(idx, count) {
+        const int iix = idx % inner_dim;
+        const int aix = (idx / inner_dim) % axis_dim;
+        const int oix = idx / inner_dim / axis_dim;
+        const int t = targets[oix * inner_dim + iix];
+        //  ``0`` is reserved for targets if neg id is zero
+        //  use ``aix + 1`` to match the targets
+        //  c1 is 1 when the target selects this channel;
+        //  c2 is 1 when the target is neither -1 nor this channel
+        T c1 = (t == (aix + (neg_id ? 0 : 1)));
+        T c2 = (t != -1) & (t != (aix + (neg_id ? 0 : 1)));
+        T p = 1 / (1 + exp(-logits[idx]));  //  logit -> prob
+
+        // (1 - p)^{gamma} * log(p)
+        T pos_term = pow(1 - p, gamma) * log(max(p, FLT_MIN));
+
+        // p^{gamma} * log(1 - p)
+        //  log(1 - p) is written from the logit x as
+        //  -x * [x >= 0] - log(1 + exp(-|x|)),
+        //  since x - 2 * x * [x >= 0] equals -|x|
+        T neg_term = pow(p, gamma) * (
+            -logits[idx] * (logits[idx] >= 0) - log(
+                1 + exp(logits[idx] - 2 * logits[idx] * (logits[idx] >= 0)))
+       );
+
+        //  at most one of c1 / c2 is non-zero for a given element
+        losses[idx] = 0.0;
+        losses[idx] += -c1 * pos_term * pos_alpha;
+        losses[idx] += -c2 * neg_term * neg_alpha;
+        flags[idx] = c1;
+    }
+}
+
+//  Host launcher for _SigmoidFocalLoss<float> (CUDAContext specialization).
+//
+//  Launches the kernel over ``outer_dim * axis_dim * inner_dim`` elements
+//  with CUDA_BLOCKS(count) blocks of CUDA_THREADS threads, no dynamic
+//  shared memory, on the stream returned by ``ctx->cuda_stream()``.
+//  ``outer_dim`` itself is not passed to the kernel, only the total count.
+template <> void SigmoidFocalLoss<float, CUDAContext>(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const float*            logits,
+    const float*            targets,
+    float*                  losses,
+    float*                  flags,
+    CUDAContext*            ctx) {
+    //  the product is evaluated in int before being stored in TIndex
+    TIndex count = outer_dim * axis_dim * inner_dim;
+    _SigmoidFocalLoss<float>
+        << <CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >> >(
+                count, axis_dim, inner_dim,
+                    pos_alpha, neg_alpha, gamma, neg_id,
+                        logits, targets, losses, flags);
+}
+
+//  Device kernel: gradient of the sigmoid focal loss w.r.t. the logits.
+//
+//  Each ``idx`` produced by CUDA_KERNEL_LOOP is split into
+//  (oix, aix, iix) using ``axis_dim`` and ``inner_dim``;
+//  ``logits`` / ``dlogits`` / ``flags`` are addressed by ``idx`` and
+//  ``targets`` by ``oix * inner_dim + iix`` (value truncated to int).
+//
+//  Per element, with p = sigmoid(logit):
+//    dlogits = -c1 * pos_alpha * pos_term - c2 * neg_alpha * neg_term
+//    flags   = c1
+//  where c1 / c2 are the 0/1 indicators computed inside the loop.
+template <typename T>
+__global__ void _SigmoidFocalLossGradient(
+    const int               count,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const T*                logits,
+    const T*                targets,
+    T*                      dlogits,
+    T*                      flags) {
+    CUDA_KERNEL_LOOP(idx, count) {
+        const int iix = idx % inner_dim;
+        const int aix = (idx / inner_dim) % axis_dim;
+        const int oix = idx / inner_dim / axis_dim;
+        const int t = targets[oix * inner_dim + iix];
+        //  ``0`` is reserved for targets if neg id is zero
+        //  use ``aix + 1`` to match the targets
+        //  c1 is 1 when the target selects this channel;
+        //  c2 is 1 when the target is neither -1 nor this channel
+        T c1 = (t == (aix + (neg_id ? 0 : 1)));
+        T c2 = (t != -1) & (t != (aix + (neg_id ? 0 : 1)));
+        T p = 1 / (1 + exp(-logits[idx]));  //  logit -> prob
+
+        // (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
+        T pos_term = pow((1 - p), gamma) * (
+            1 - p - p * gamma * log(max(p, FLT_MIN))
+        );
+
+        // p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
+        //  log(1 - p) is written from the logit x as
+        //  -x * [x >= 0] - log(1 + exp(-|x|)),
+        //  since x - 2 * x * [x >= 0] equals -|x|
+        T neg_term = pow(p, gamma) * (
+            (-logits[idx] * (logits[idx] >= 0) - log(
+                1 + exp(logits[idx] - 2 * logits[idx] * (logits[idx] >= 0)))
+            ) * (1 - p) * gamma - p
+        );
+
+        //  at most one of c1 / c2 is non-zero for a given element
+        dlogits[idx] = 0.0;
+        dlogits[idx] += -c1 * pos_term * pos_alpha;
+        dlogits[idx] += -c2 * neg_term * neg_alpha;
+        flags[idx] = c1;
+    }
+}
+
+//  Host launcher for _SigmoidFocalLossGradient<float>
+//  (CUDAContext specialization).
+//
+//  Launches the kernel over ``outer_dim * axis_dim * inner_dim`` elements
+//  with CUDA_BLOCKS(count) blocks of CUDA_THREADS threads, no dynamic
+//  shared memory, on the stream returned by ``ctx->cuda_stream()``.
+//  ``outer_dim`` itself is not passed to the kernel, only the total count.
+template <> void SigmoidFocalLossGradient<float, CUDAContext>(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const float*            logits,
+    const float*            targets,
+    float*                  dlogits,
+    float*                  flags,
+    CUDAContext*            ctx) {
+    //  the product is evaluated in int before being stored in TIndex
+    TIndex count = outer_dim * axis_dim * inner_dim;
+    _SigmoidFocalLossGradient<float>
+        << <CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >> >(
+                count, axis_dim, inner_dim,
+                    pos_alpha, neg_alpha, gamma, neg_id,
+                        logits, targets, dlogits, flags);
 }

 /******************** loss.smooth_l1_loss ********************/
@@ -933,19 +1066,23 @@ template <> void SoftmaxCrossEntropy<float, CUDAContext>(
            count, prob, target, loss);
 }

-/******************** loss.sparse_softmax_cross_entropy ********************/
+/******************** loss.softmax_focal_loss ********************/

-template <typename Tx, typename Ty>
-__global__ void _SparseSoftmaxCrossEntropy(
+template <typename T>
+__global__ void _SoftmaxFocalLoss(
    const int               count,
    const int               axis_dim,
    const int               inner_dim,
-    const Tx*               prob,
-    const Ty*               labels,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const T*                prob,
+    const T*                labels,
    const int*              ignores,
    const int               num_ignores,
-    Tx*                     losses,
-    Tx*                     flags) {
+    T*                      losses,
+    T*                      flags) {
    CUDA_KERNEL_LOOP(idx, count) {
        const int oix = idx / inner_dim;
        const int iix = idx % inner_dim;
@@ -958,19 +1095,24 @@ __global__ void _SparseSoftmaxCrossEntropy(
            }
        }
        if (k == num_ignores) {
-            losses[idx] = -log(
-                max(prob[(oix * axis_dim + label)
-                    * inner_dim + iix], FLT_MIN)
-            );
-            flags[idx] = 1;
+            const int t = (oix * axis_dim + label) * inner_dim + iix;
+            T scale = pow(1.f - prob[t], gamma);
+            scale = label > neg_id ?
+                pos_alpha * scale : neg_alpha * scale;
+            losses[idx] = -scale * log(max(prob[t], FLT_MIN));
+            flags[idx] = label > neg_id ? 1 : 0;
        }
    }
 }

-template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
+template <> void SoftmaxFocalLoss<float, CUDAContext>(
    const int               outer_dim,
    const int               axis_dim,
    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
    const float*            prob,
    const float*            labels,
    const int*              ignores,
@@ -979,45 +1121,30 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
    float*                  flags,
    CUDAContext*            ctx) {
    const int num_preds = outer_dim * inner_dim;
-    _SparseSoftmaxCrossEntropy<float, float>
-        << <CUDA_BLOCKS(num_preds), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
-                num_preds, axis_dim, inner_dim,
-                    prob, labels, ignores, num_ignores,
-                        losses, flags);
-}
-
-template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
-    const int               outer_dim,
-    const int               axis_dim,
-    const int               inner_dim,
-    const float*            prob,
-    const int64_t*          labels,
-    const int*              ignores,
-    const int               num_ignores,
-    float*                  losses,
-    float*                  flags,
-    CUDAContext*            ctx) {
-    const int num_preds = outer_dim * inner_dim;
-    _SparseSoftmaxCrossEntropy<float, int64_t>
+    _SoftmaxFocalLoss<float>
        << <CUDA_BLOCKS(num_preds), CUDA_THREADS,
            0, ctx->cuda_stream() >> >(
                num_preds, axis_dim, inner_dim,
+                    pos_alpha, neg_alpha, gamma, neg_id,
                        prob, labels, ignores, num_ignores,
                            losses, flags);
 }

-template <typename Tx, typename Ty>
-__global__ void _SparseSoftmaxCrossEntropyGrad(
+template <typename T>
+__global__ void _SoftmaxFocalLossGrad(
    const int               count,
    const int               axis_dim,
    const int               inner_dim,
-    const Tx*               prob,
-    const Ty*               labels,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
+    const T*                prob,
+    const T*                labels,
    const int*              ignores,
    const int               num_ignores,
-    Tx*                     dx,
-    Tx*                     flags) {
+    T*                      dx,
+    T*                      flags) {
    CUDA_KERNEL_LOOP(idx, count) {
        const int oix = idx / inner_dim;
        const int iix = idx % inner_dim;
@@ -1030,16 +1157,35 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
                dx[(oix * axis_dim + c) * inner_dim + iix] = 0;
            flags[idx] = 0;
        } else {
-            dx[(oix * axis_dim + label) * inner_dim + iix] -= 1;
-            flags[idx] = 1;
+            const int t = (oix * axis_dim + label) * inner_dim + iix;
+            T onemp = 1. - prob[t];
+            //  unstable if gamma is 0
+            T grad = -gamma * pow(onemp, gamma - 1)
+                            * log(max(prob[t], FLT_MIN))
+                            * prob[t] + pow(onemp, gamma);
+            grad = label > neg_id ?
+                pos_alpha * grad : neg_alpha * grad;
+            for (int c = 0; c < axis_dim; c++) {
+                const int i = (oix * axis_dim + c) * inner_dim + iix;
+                if (c == label) {
+                    dx[i] = grad * (prob[t] - 1);
+                } else {
+                    dx[i] = grad * prob[i];
+                }
+            }
+            flags[idx] = label > neg_id ? 1 : 0;
        }
    }
 }

-template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
+template<> void SoftmaxFocalLossGrad<float, CUDAContext>(
    const int               outer_dim,
    const int               axis_dim,
    const int               inner_dim,
+    const float             pos_alpha,
+    const float             neg_alpha,
+    const float             gamma,
+    const int               neg_id,
    const float*            prob,
    const float*            labels,
    const int*              ignores,
@@ -1048,51 +1194,28 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
    float*                  flags,
    CUDAContext*            ctx) {
    const int num_preds = outer_dim * inner_dim;
-    _SparseSoftmaxCrossEntropyGrad<float, float>
-        << <CUDA_BLOCKS(num_preds), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
-                num_preds, axis_dim, inner_dim,
-                    prob, labels, ignores, num_ignores, 
-                        dx, flags);
-}
-
-template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
-    const int               outer_dim,
-    const int               axis_dim,
-    const int               inner_dim,
-    const float*            prob,
-    const int64_t*          labels,
-    const int*              ignores,
-    const int               num_ignores,
-    float*                  dx,
-    float*                  flags,
-    CUDAContext*            ctx) {
-    const int num_preds = outer_dim * inner_dim;
-    _SparseSoftmaxCrossEntropyGrad<float, int64_t>
+    _SoftmaxFocalLossGrad<float>
        << <CUDA_BLOCKS(num_preds), CUDA_THREADS,
            0, ctx->cuda_stream() >> >(
                num_preds, axis_dim, inner_dim,
+                    pos_alpha, neg_alpha, gamma, neg_id,
                        prob, labels, ignores, num_ignores,
                            dx, flags);
 }

-/******************** loss.sparse_softmax_focal_loss ********************/
+/******************** loss.sparse_softmax_cross_entropy ********************/

-template <typename T>
-__global__ void _SparseSoftmaxFocalLoss(
+template <typename Tx, typename Ty>
+__global__ void _SparseSoftmaxCrossEntropy(
    const int               count,
    const int               axis_dim,
    const int               inner_dim,
-    const float             pos_alpha,
-    const float             neg_alpha,
-    const float             gamma,
-    const int               neg_id,
-    const T*                prob,
-    const T*                labels,
+    const Tx*               prob,
+    const Ty*               labels,
    const int*              ignores,
    const int               num_ignores,
-    T*                      losses,
-    T*                      flags) {
+    Tx*                     losses,
+    Tx*                     flags) {
    CUDA_KERNEL_LOOP(idx, count) {
        const int oix = idx / inner_dim;
        const int iix = idx % inner_dim;
@@ -1105,24 +1228,19 @@ __global__ void _SparseSoftmaxFocalLoss(
            }
        }
        if (k == num_ignores) {
-            const int t = (oix * axis_dim + label) * inner_dim + iix;
-            T scale = pow(1.f - prob[t], gamma);
-            scale = label > neg_id ?
-                pos_alpha * scale : neg_alpha * scale;
-            losses[idx] = -scale * std::log(max(prob[t], FLT_MIN));
-            flags[idx] = label > neg_id ? 1 : 0;
+            losses[idx] = -log(
+                max(prob[(oix * axis_dim + label)
+                    * inner_dim + iix], FLT_MIN)
+            );
+            flags[idx] = 1;
        }
    }
 }

-template <> void SparseSoftmaxFocalLoss<float, CUDAContext>(
+template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
    const int               outer_dim,
    const int               axis_dim,
    const int               inner_dim,
-    const float             pos_alpha,
-    const float             neg_alpha,
-    const float             gamma,
-    const int               neg_id,
    const float*            prob,
    const float*            labels,
    const int*              ignores,
@@ -1131,30 +1249,45 @@ template <> void SparseSoftmaxFocalLoss<float, CUDAContext>(
    float*                  flags,
    CUDAContext*            ctx) {
    const int num_preds = outer_dim * inner_dim;
-    _SparseSoftmaxFocalLoss<float>
+    _SparseSoftmaxCrossEntropy<float, float>
        << <CUDA_BLOCKS(num_preds), CUDA_THREADS,
            0, ctx->cuda_stream() >> >(
                num_preds, axis_dim, inner_dim,
-                    pos_alpha, neg_alpha, gamma, neg_id,
                    prob, labels, ignores, num_ignores,
                        losses, flags);
 }

-template <typename T>
-__global__ void _SparseSoftmaxFocalLossGrad(
+template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float*            prob,
+    const int64_t*          labels,
+    const int*              ignores,
+    const int               num_ignores,
+    float*                  losses,
+    float*                  flags,
+    CUDAContext*            ctx) {
+    const int num_preds = outer_dim * inner_dim;
+    _SparseSoftmaxCrossEntropy<float, int64_t>
+        << <CUDA_BLOCKS(num_preds), CUDA_THREADS,
+            0, ctx->cuda_stream() >> >(
+                num_preds, axis_dim, inner_dim,
+                    prob, labels, ignores, num_ignores,
+                        losses, flags);
+}
+
+template <typename Tx, typename Ty>
+__global__ void _SparseSoftmaxCrossEntropyGrad(
    const int               count,
    const int               axis_dim,
    const int               inner_dim,
-    const float             pos_alpha,
-    const float             neg_alpha,
-    const float             gamma,
-    const int               neg_id,
-    const T*                prob,
-    const T*                labels,
+    const Tx*               prob,
+    const Ty*               labels,
    const int*              ignores,
    const int               num_ignores,
-    T*                      dx,
-    T*                      flags) {
+    Tx*                     dx,
+    Tx*                     flags) {
    CUDA_KERNEL_LOOP(idx, count) {
        const int oix = idx / inner_dim;
        const int iix = idx % inner_dim;
@@ -1167,35 +1300,16 @@ __global__ void _SparseSoftmaxFocalLossGrad(
                dx[(oix * axis_dim + c) * inner_dim + iix] = 0;
            flags[idx] = 0;
        } else {
-            const int t = (oix * axis_dim + label) * inner_dim + iix;
-            T onemp = 1. - prob[t];
-            //  unstable if gamma is 0
-            T grad = -gamma * pow(onemp, gamma - 1)
-                            * log(max(prob[t], FLT_MIN))
-                            * prob[t] + pow(onemp, gamma);
-            grad = label > neg_id ?
-                pos_alpha * grad : neg_alpha * grad;
-            for (int c = 0; c < axis_dim; c++) {
-                const int i = (oix * axis_dim + c) * inner_dim + iix;
-                if (c == label) {
-                    dx[i] = grad * (prob[t] - 1);
-                } else {
-                    dx[i] = grad * prob[i];
-                }
-            }
-            flags[idx] = label > neg_id ? 1 : 0;
+            dx[(oix * axis_dim + label) * inner_dim + iix] -= 1;
+            flags[idx] = 1;
        }
    }
 }

-template<> void SparseSoftmaxFocalLossGrad<float, CUDAContext>(
+template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
    const int               outer_dim,
    const int               axis_dim,
    const int               inner_dim,
-    const float             pos_alpha,
-    const float             neg_alpha,
-    const float             gamma,
-    const int               neg_id,
    const float*            prob,
    const float*            labels,
    const int*              ignores,
@@ -1204,11 +1318,30 @@ template<> void SparseSoftmaxFocalLossGrad<float, CUDAContext>(
    float*                  flags,
    CUDAContext*            ctx) {
    const int num_preds = outer_dim * inner_dim;
-    _SparseSoftmaxFocalLossGrad<float>
+    _SparseSoftmaxCrossEntropyGrad<float, float>
+        << <CUDA_BLOCKS(num_preds), CUDA_THREADS,
+            0, ctx->cuda_stream() >> >(
+                num_preds, axis_dim, inner_dim,
+                    prob, labels, ignores, num_ignores, 
+                        dx, flags);
+}
+
+template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
+    const int               outer_dim,
+    const int               axis_dim,
+    const int               inner_dim,
+    const float*            prob,
+    const int64_t*          labels,
+    const int*              ignores,
+    const int               num_ignores,
+    float*                  dx,
+    float*                  flags,
+    CUDAContext*            ctx) {
+    const int num_preds = outer_dim * inner_dim;
+    _SparseSoftmaxCrossEntropyGrad<float, int64_t>
        << <CUDA_BLOCKS(num_preds), CUDA_THREADS,
            0, ctx->cuda_stream() >> >(
                num_preds, axis_dim, inner_dim,
-                    pos_alpha, neg_alpha, gamma, neg_id,
                    prob, labels, ignores, num_ignores,
                        dx, flags);
 }
@@ -2355,7 +2488,7 @@ __global__ void _LSTMCellAct(
    CUDA_KERNEL_LOOP(idx, count) {
        const int offset = idx % x_offset;
        xact[idx] = offset < c_offset ?
-            _SigmoidUnit<float>(xact[idx]) : std::tanh(xact[idx]);
+            _SigmoidUnit<float>(xact[idx]) : tanh(xact[idx]);
    }
 }

@@ -2379,7 +2512,7 @@ __global__ void _LSTMCellGate(
        const T o = x[offset + o_offset];
        T c_ = x[offset + c_offset];
        c_ = c[idx] = f * cx[idx] + i * c_;
-        h[idx] = o * std::tanh(c_);
+        h[idx] = o * tanh(c_);
    }
 }