Commit ddb76e7b by Ting PAN

add memonger for Dragon

1 parent d64a3943
Showing with 1800 additions and 982 deletions
@@ -42,6 +42,7 @@ class Graph final : public GraphBase {
     GraphDef Prune(const GraphDef& graph_def);
     GraphDef Share(const GraphDef& graph_def);
     GraphDef MakeUpdate(const GraphDef& graph_def);
+    void RecomputingAware(const GraphDef& graph_def, Workspace* ws);

     inline Workspace* ws() const { return ws_; }
...
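The new RecomputingAware pass is the graph-side hook of the memonger: after a graph is optimized, it records which operators can rebuild each tensor whose storage will be shared away (see the Workspace recompute map further down). A sketch of a plausible call site, assuming the optimization passes run when the graph is created; illustrative only, not the committed implementation:

    // hypothetical wiring inside graph construction
    GraphDef opt_def = Prune(graph_def);   // drop ops that reach no target
    opt_def = Share(opt_def);              // reuse buffers where legal
    RecomputingAware(opt_def, ws());       // register recompute chains per tensor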
@@ -80,30 +80,35 @@ class Operator : public OperatorBase {
         allow_run_ = true;
         allow_run_ &= _MPICheck();
         allow_run_ &= (!(OutputSize() == 1 && output(0)->name() == "ignore"));
+        allow_share_grads_ = (!op_def.debug_mode());
+        allow_share_grads_ &= op_def.share_grads();
+        allow_share_grads_ &= (type().find("Gradient") != string::npos);
     }

     virtual void Run() final {
         if (!allow_run_) return;
+        MakeResource();
         ctx_.SwitchToDevice();
-        if (!op_def_.debug_mode()) ShareBeforeRun();
         MemorySwitch();
         RunOnDevice();
-        if (!op_def_.debug_mode()) ClearAfterRun();
         ctx_.FinishDeviceCompution();
+        CleanResource();
     }

+    virtual void ElimateCorruption();
+    virtual void ShareGradient();
+    virtual void MakeResource();
+    virtual void CleanResource();
+
     void MemorySwitch() {
         for (int i = 0; i < InputSize(); i++)
-            if (input(i).name() != "ignore")
-                input(i).SwitchToDevice();
+            if (input(i).name() != "ignore") input(i).SwitchToDevice();
         for (int i = 0; i < OutputSize(); i++)
-            if (output(i)->name() != "ignore")
-                output(i)->SwitchToDevice();
+            if (output(i)->name() != "ignore") output(i)->SwitchToDevice();
     }

-    virtual void ShareBeforeRun() { /*** share tensors here if necessary ***/ }
     virtual void RunOnDevice() = 0;
-    virtual void ClearAfterRun() { /*** clear tensors here if necessary ***/ }

     inline Context& ctx() { return ctx_; }
     inline string anchor() { return GetSingleArg("anchor", name()); }
@@ -111,7 +116,7 @@ class Operator : public OperatorBase {
 protected:
     Context ctx_;
-    bool allow_run_;
+    bool allow_run_, allow_share_grads_;

 private:
     bool _MPICheck() {
@@ -169,6 +174,9 @@ DECLARE_REGISTRY(CUDNNOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*);
         } \
     }

+#define DISABLE_SHARE_GRADIENT \
+    this->allow_share_grads_ = false
+
 #define INSTANTIATE_OPERATOR(name, context) \
     template class name##Op<context>;
...
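Taken together, the Operator changes replace the old ShareBeforeRun()/ClearAfterRun() hooks with a MakeResource()/CleanResource() pair that Run() always invokes, plus a ShareGradient() hook gated by allow_share_grads_. A sketch of what a gradient op looks like under the new lifecycle; MyGradientOp is a made-up name and only the declarations mirror this commit:

    template <class Context>
    class MyGradientOp final : public Operator<Context> {
     public:
        MyGradientOp(const OperatorDef& op_def, Workspace* ws)
            : Operator<Context>(op_def, ws) {
            // ops for which buffer reuse is unsafe opt out instead of
            // overriding ShareGradient():
            // DISABLE_SHARE_GRADIENT;
        }
        void ShareGradient() override;  // borrow a pooled "Grad" buffer
        void CleanResource() override;  // hand it back; Run() calls this last
        void RunOnDevice() override;
        template <typename T> void RunWithType();
    };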
@@ -30,7 +30,7 @@ class Tensor {
             CHECK_GT(d, 0);
             new_size *= d;
         }
-        if (size_ != new_size &&
+        if (size_ != new_size && own_mem_ &&
             capacity_ < TIndex(new_size * meta_.itemsize())) {
             memory_.reset();
             capacity_ = 0;
@@ -38,9 +38,7 @@ class Tensor {
         size_ = new_size;
     }

-    void ReshapeLike(const Tensor& other) {
-        Reshape(other.dims_);
-    }
+    void ReshapeLike(const Tensor& other) { Reshape(other.dims_); }

     inline const string& name() const { return name_; }
@@ -92,63 +90,86 @@ class Tensor {
         return ss.str();
     }

-    MixedMemory::State memory_state() const { return memory_->state(); }
-    MixedMemory* memory() const { return memory_.get(); }
-    void SwitchToDevice() { if (memory_) memory_->SwitchToDevice(); }
+    inline bool is_corrupted() const { return is_corrupted_; }
+    inline void Corrupt() { is_corrupted_ = true; }
+
+    MixedMemory* memory() const { return own_mem_ ? memory_.get() : ex_memory_; }
+    MixedMemory::State memory_state() const {
+        MixedMemory* mem = memory();
+        CHECK(mem) << "memory access before allocating.";
+        return memory()->state();
+    }
+    void SwitchToDevice() {
+        MixedMemory* mem = own_mem_ ? memory_.get() : ex_memory_;
+        if (mem) mem->SwitchToDevice();
+    }

     const TypeMeta& meta() const { return meta_; }
     void SetMeta(const TypeMeta& meta) { meta_ = meta; }
     template <typename T> inline bool IsType() { return meta_.Match<T>(); }

     template <class Context>
-    const void* raw_data() const {
-        CHECK(memory_.get()) << "memory access before allocating.";
-        if (TypeMeta::Id<Context>() == TypeMeta::Id<CPUContext>())
-            return memory_->cpu_data();
-        else if (TypeMeta::Id<Context>() == TypeMeta::Id<CUDAContext>())
-            return memory_->cuda_data();
-        else LOG(FATAL) << "unknown memory type access. only CPU or CUDA are supported.";
-        return nullptr;
-    }
-
-    template <typename T, class Context>
-    const T* data() const {
-        return static_cast<const T*>(raw_data<Context>());
-    }
-
-    template <class Context>
-    void active_data_ptr(void** data_ptr) {
-        if (!memory_) {
-            *data_ptr = nullptr;
-        } else {
-            if (TypeMeta::Id<Context>() == TypeMeta::Id<CPUContext>()) {
-                *data_ptr = memory_->mutable_cpu_data();
-            } else if (TypeMeta::Id<Context>() == TypeMeta::Id<CUDAContext>()) {
-                *data_ptr = memory_->mutable_cuda_data();
-            }
-        }
-    }
+    void mutable_data_ptr(void** data_ptr) {
+        MixedMemory* mem = memory();
+        if (!mem) {
+            *data_ptr = nullptr;
+        } else {
+            if (TypeMeta::Id<Context>() == TypeMeta::Id<CPUContext>()) {
+                *data_ptr = mem->mutable_cpu_data();
+            } else if (TypeMeta::Id<Context>() == TypeMeta::Id<CUDAContext>()) {
+                *data_ptr = mem->mutable_cuda_data();
+            } else {
+                LOG(FATAL) << "unknown memory type access. only CPU or CUDA are supported.";
+            }
+        }
+    }
+
+    template <class Context>
+    const void* const_data_ptr() const {
+        MixedMemory* mem = memory();
+        CHECK(mem) << "memory access before allocating.";
+        if (TypeMeta::Id<Context>() == TypeMeta::Id<CPUContext>()) {
+            return mem->cpu_data();
+        } else if (TypeMeta::Id<Context>() == TypeMeta::Id<CUDAContext>()) {
+            return mem->cuda_data();
+        } else {
+            LOG(FATAL) << "unknown memory type access. only CPU or CUDA are supported.";
+            return nullptr;
+        }
+    }
     template <class Context>
     void* raw_mutable_data(const TypeMeta& meta) {
         void* data_ptr;
-        active_data_ptr<Context>(&data_ptr);
-        if (meta_ == meta && data_ptr) {
-            return data_ptr;
-        } else {
-            meta_ = meta;         // copy-assign the meta
-            CHECK_GT(size_, 0);   // must specify a valid size
-            memory_.reset(new MixedMemory(meta, size_ * meta_.itemsize()));
-            // malloc
-            if (TypeMeta::Id<Context>() == TypeMeta::Id<CPUContext>())
-                data_ptr = memory_->mutable_cpu_data();
-            else if (TypeMeta::Id<Context>() == TypeMeta::Id<CUDAContext>())
-                data_ptr = memory_->mutable_cuda_data();
-            // init for each structured element if necessary
-            if (meta.ctor()) meta_.ctor()(data_ptr, size_);
-        }
-        capacity_ = size_ * meta_.itemsize();
-        return data_ptr;
+        if (own_mem_) {
+            mutable_data_ptr<Context>(&data_ptr);
+            if (meta_ == meta && data_ptr) {
+                return data_ptr;
+            } else {
+                meta_ = meta;
+                CHECK_GT(size_, 0);
+                memory_.reset(new MixedMemory(meta, size_ * meta_.itemsize()));
+                mutable_data_ptr<Context>(&data_ptr);    // malloc
+                if (meta.ctor()) meta_.ctor()(data_ptr, size_);
+            }
+            capacity_ = size_ * meta_.itemsize();
+            return data_ptr;
+        } else {
+            meta_ = meta;
+            CHECK_GT(size_, 0);
+            TIndex ex_capacity_ = ex_memory_->nbytes();
+            if (ex_capacity_ >= TIndex(size_ * meta.itemsize())) {
+                mutable_data_ptr<Context>(&data_ptr);
+            } else {
+                delete ex_memory_;
+                ex_memory_ = new MixedMemory(meta, size_ * meta_.itemsize());
+                mutable_data_ptr<Context>(&data_ptr);    // malloc
+                if (meta.ctor()) meta_.ctor()(data_ptr, size_);
+                capacity_ = size_ * meta.itemsize();
+            }
+            return data_ptr;
+        }
     }
     template <class Context>
@@ -159,22 +180,30 @@ class Tensor {
         return raw_mutable_data<Context>(meta_);
     }

+    template <class Context>
+    const void* raw_data() const { return const_data_ptr<Context>(); }
+
     template <typename T, class Context>
     T* mutable_data() {
         void* data_ptr;
-        active_data_ptr<Context>(&data_ptr);
+        mutable_data_ptr<Context>(&data_ptr);
         if (data_ptr && meta_ == TypeMeta::Make<T>()) return static_cast<T*>(data_ptr);
         return static_cast<T*>(raw_mutable_data<Context>(TypeMeta::Make<T>()));
     }

-    void Share(const Tensor& other) {
+    template <typename T, class Context>
+    const T* data() const {
+        return static_cast<const T*>(raw_data<Context>());
+    }
+
+    inline void Share(const Tensor& other) {
         CHECK_EQ(size_, other.size_);
         memory_ = other.memory_;
         meta_ = other.meta_;
         capacity_ = other.capacity_;
     }

-    void Replace(const Tensor& other) {
+    inline void Replace(const Tensor& other) {
         memory_ = other.memory_;
         meta_ = other.meta_;
         capacity_ = other.capacity_;
@@ -182,23 +211,27 @@ class Tensor {
         dims_ = other.dims_;
     }
+    inline void Move(MixedMemory* mem) {
+        if (mem != nullptr) ex_memory_ = mem;
+        else ex_memory_ = new MixedMemory(TypeMeta::Make<float>(), 4);
+        own_mem_ = false;
+    }
+
-    void Reset() {
+    inline void Reset() {
         size_ = capacity_ = 0;
         meta_ = TypeMeta();
         dims_.clear();
         memory_.reset();
     }

-    void Release() {
-        memory_.reset();
-    }
 private:
     vector<TIndex> dims_;
     TIndex size_ = 0, capacity_ = 0;
     TypeMeta meta_;
     string name_;
     shared_ptr<MixedMemory> memory_;
+    MixedMemory* ex_memory_ = nullptr;
+    bool is_corrupted_ = false, own_mem_ = true;
 };

 } // namespace dragon
...
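The ex_memory_/own_mem_ pair is what lets the memonger alias several tensors onto one physical buffer: Move() rebinds a tensor to memory it does not own, and Corrupt() flags a tensor whose bytes have been handed to someone else, so the graph knows to recompute it. A small sketch of the intended use, with illustrative names and a given Workspace* ws:

    // two tensors backed by the same external MixedMemory
    MixedMemory* shared = new MixedMemory(TypeMeta::Make<float>(), 1024 * sizeof(float));
    Tensor* a = ws->CreateTensor("conv1/out");   // names are illustrative
    Tensor* b = ws->CreateTensor("conv2/out");
    a->Move(shared);   // a no longer owns its storage
    b->Move(shared);   // b reuses the same bytes
    b->Corrupt();      // writes through a clobber b: mark b for recomputing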
@@ -13,23 +13,28 @@
 namespace dragon {

-#define WORKSPACE_MIN_BUFFER_SIZE 3
-#define WORKSPACE_MAX_BUFFER_SIZE 3
+#define WORKSPACE_COMMON_BUFFER_SIZE 2
+#define WORKSPACE_GRAD_BUFFER_SIZE 1
+#define WORKSPACE_MAX_CORRUPTED_SIZE 2

 class Workspace {
 public:
     typedef Map<string, unique_ptr<Tensor> > TensorMap;
+    typedef Map<string, stack<string> > BufferMap;
     typedef Map<string, unique_ptr<mutex> > LockMap;
     typedef Map<string, unique_ptr<GraphBase> > GraphMap;
     typedef Map<string, TensorFiller> FillerMap;
     typedef Map<string, string> RenameMap;
+    typedef Map<string, vector<OperatorBase*> > RecomputeMap;

     Workspace() : root_folder_(".") { init(); }
     Workspace(string root_folder) : root_folder_(root_folder) { init(); }
+    ~Workspace();

     void init() {
         CreateTensor("ignore");
-        for (int i = 0; i < WORKSPACE_MIN_BUFFER_SIZE; i++) CreateBuffer();
+        CreateBuffer("Common", WORKSPACE_COMMON_BUFFER_SIZE);
+        CreateBuffer("Grad", WORKSPACE_GRAD_BUFFER_SIZE);
     }

     /******************** Tensor ********************/
@@ -101,33 +106,39 @@ class Workspace {
     /******************** Buffer ********************/
-    inline Tensor* CreateBuffer() {
-        int buffer_idx = 1;
-        string name;
-        while (1) {
-            name = "_t_buffer_" + dragon_cast<string, int>(buffer_idx++);
-            if (!HasTensor(name)) break;
-        }
-        buffer_stack_.push(name);
-        return CreateTensor(name);
-    }
+    inline void CreateBuffer(string category, int num) {
+        CHECK(!buffer_map_.count(category));
+        buffer_map_[category] = stack<string>();
+        for (int i = 1; i <= num; i++) {
+            string name = "_t_" + category + "_buffer_" + dragon_cast<string, int>(i);
+            buffer_map_[category].push(name);
+            CreateTensor(name);
+        }
+    }

-    inline Tensor* GetBuffer() {
-        if (!buffer_stack_.empty()) {
-            string name = buffer_stack_.top();
-            buffer_stack_.pop();
+    inline Tensor* GetBuffer(string category = "Common") {
+        if (!buffer_map_[category].empty()) {
+            string name = buffer_map_[category].top();
+            buffer_map_[category].pop();
             return GetTensor(name);
         }
-        LOG(FATAL) << "buffers are not enough, add more if necessary.";
+        LOG(FATAL) << "buffers of [" << category << "] "
+                   << "are not enough, add more if necessary.";
         return nullptr;
     }
-    inline void ReleaseBuffer(Tensor* tensor, bool force_release = false) {
-        // release directly
-        if (buffer_stack_.size() >= WORKSPACE_MAX_BUFFER_SIZE || force_release) {
+    inline void ReleaseBuffer(Tensor* tensor,
+                              string category = "Common",
+                              bool enforce = false) {
+        static Map<string, int> limits = {
+            { "Common", WORKSPACE_COMMON_BUFFER_SIZE },
+            { "Grad", WORKSPACE_GRAD_BUFFER_SIZE }};
+        if (buffer_map_[category].size() >= limits[category] || enforce) {
+            // release directly
             ReleaseTensor(tensor->name());
-        } else { // recover as an available buffer
-            buffer_stack_.push(tensor->name());
+        } else {
+            // recover as an available buffer
+            buffer_map_[category].push(tensor->name());
         }
     }
@@ -158,14 +169,30 @@ class Workspace {
         rename_map_[old_tensor] = new_tensor;
     }

+    inline void AddRecompute(const string& tensor, OperatorBase* op) {
+        if (!recompute_map_.count(tensor)) {
+            recompute_map_[tensor] = vector<OperatorBase*>();
+        }
+        recompute_map_[tensor].push_back(op);
+    }
+
+    inline vector<OperatorBase*> GetRecompute(const string& tensor) {
+        if (recompute_map_.count(tensor)) {
+            return recompute_map_[tensor];
+        } else {
+            return vector<OperatorBase*>();
+        }
+    }
 private:
     TensorMap tensor_map_;
+    BufferMap buffer_map_;
     LockMap lock_map_;
     GraphMap graph_map_;
     FillerMap filler_map_;
     RenameMap rename_map_;
+    RecomputeMap recompute_map_;
     string root_folder_;
-    stack<string> buffer_stack_;
 };

 } // namespace dragon
...
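Buffers are now pooled per category ("Common" scratch space vs. a single shared "Grad" buffer), and the recompute map records, per tensor, the operators able to rebuild it. A sketch of the intended call pattern using only the API declared above; the tensor name and the loop around Run() are assumptions:

    Tensor* scratch = ws->GetBuffer();        // pops from the "Common" pool
    Tensor* dX = ws->GetBuffer("Grad");       // pops from the "Grad" pool
    /* ... use them ... */
    ws->ReleaseBuffer(scratch);               // recovered for later reuse
    ws->ReleaseBuffer(dX, "Grad", true);      // enforce: free the memory now

    // restoring a corrupted tensor by re-running its producers
    for (OperatorBase* op : ws->GetRecompute("conv1/out"))
        op->Run();    // assumes OperatorBase exposes Run()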
@@ -43,10 +43,11 @@ class DropoutGradientOp final : public Operator<Context> {
         threshold = static_cast<unsigned int>(UINT_MAX * prob);
         if (use_scale) scale = 1.0 / (1.0 - prob);
         else scale = 1.0;
+        DISABLE_SHARE_GRADIENT;
     }

     void RunOnDevice() override;
-    void ClearAfterRun() override;
+    void CleanResource() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -30,7 +30,9 @@ class ReluGradientOp : public Operator<Context> {
 public:
     ReluGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
-          slope(OperatorBase::GetSingleArg<float>("slope", 0.0)) {}
+          slope(OperatorBase::GetSingleArg<float>("slope", 0.0)) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
     template <typename T> void RunWithType();
...
@@ -23,7 +23,10 @@ class SigmoidOp final : public Operator<Context> {
 template <class Context>
 class SigmoidGradientOp final : public Operator<Context> {
 public:
-    USE_SIMPLE_CTOR_DTOR(SigmoidGradientOp);
+    SigmoidGradientOp(const OperatorDef& op_def, Workspace* ws)
+        : Operator<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
     template <typename T> void RunWithType();
...
@@ -32,7 +32,9 @@ class SoftmaxGradientOp final : public Operator<Context> {
 public:
     SoftmaxGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
-          axis(OperatorBase::GetSingleArg<int>("axis", 1)) {}
+          axis(OperatorBase::GetSingleArg<int>("axis", 1)) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
     template <typename T> void RunWithType();
...
@@ -23,7 +23,10 @@ class TanhOp final : public Operator<Context> {
 template <class Context>
 class TanhGradientOp final : public Operator<Context> {
 public:
-    USE_SIMPLE_CTOR_DTOR(TanhGradientOp);
+    TanhGradientOp(const OperatorDef& op_def, Workspace* ws)
+        : Operator<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
     template <typename T> void RunWithType();
...
@@ -29,9 +29,8 @@ class AddGradientOp final : public Operator<Context> {
 public:
     USE_SIMPLE_CTOR_DTOR(AddGradientOp);

-    void ShareBeforeRun() override;
+    void ShareGradient() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void EltwiseRunWithType();
     template <typename T> void BroadcastRunWithType(int type);
...
@@ -35,9 +35,7 @@ class BiasAddGradientOp final : public Operator<Context> {
         : Operator<Context>(op_def, ws),
           data_format(OperatorBase::GetSingleArg<string>("data_format", "NCHW")) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void NCHWRunWithType();
     template <typename T> void NHWCRunWithType();
...
@@ -33,9 +33,7 @@ class ClipGradientOp final : public Operator<Context> {
 public:
     USE_SIMPLE_CTOR_DTOR(ClipGradientOp);

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -29,9 +29,8 @@ class DivGradientOp final : public Operator<Context> {
 public:
     USE_SIMPLE_CTOR_DTOR(DivGradientOp);

-    void ShareBeforeRun() override;
+    void ShareGradient() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void EltwiseRunWithType();
     template <typename T> void BroadcastRunWithType(int type);
...
@@ -37,6 +37,7 @@ class DotGradientOp final : public Operator<Context> {
           transA(OperatorBase::GetSingleArg<bool>("TransA", false)),
           transB(OperatorBase::GetSingleArg<bool>("TransB", false)) {}

+    void ShareGradient() override;
     void RunOnDevice() override;
     template <typename T> void DotRunWithType();
     template <typename T> void GemmRunWithType();
...
@@ -48,9 +48,8 @@ class EltwiseGradientOp final : public Operator<Context> {
         } else coeffs.resize(InputSize(), float(1));
     }

-    void ShareBeforeRun() override;
+    void ShareGradient() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void SumRunWithType();
     template <typename T> void ProdRunWithType();
...
@@ -25,9 +25,7 @@ class ExpGradientOp final : public Operator<Context> {
 public:
     USE_SIMPLE_CTOR_DTOR(ExpGradientOp);

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();
 };
...
@@ -33,9 +33,7 @@ class GramMatrixGradientOp final : public Operator<Context> {
         : Operator<Context>(op_def, ws),
           axis(OperatorBase::GetSingleArg<int>("axis", 1)) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -39,9 +39,7 @@ class InnerProductGradientOp final : public Operator<Context> {
           num_output(OperatorBase::GetSingleArg<int>("num_output", 0)),
           transW(OperatorBase::GetSingleArg<bool>("TransW", true)) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -25,9 +25,7 @@ class LogGradientOp final : public Operator<Context> {
 public:
     USE_SIMPLE_CTOR_DTOR(LogGradientOp);

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();
 };
...
@@ -36,9 +36,8 @@ class MatmulGradientOp final : public Operator<Context> {
           transA(OperatorBase::GetSingleArg<bool>("TransA", false)),
           transB(OperatorBase::GetSingleArg<bool>("TransB", false)) {}

-    void ShareBeforeRun() override;
+    void ShareGradient() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -29,9 +29,8 @@ class MulGradientOp final : public Operator<Context> {
 public:
     USE_SIMPLE_CTOR_DTOR(MulGradientOp);

-    void ShareBeforeRun() override;
+    void ShareGradient() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void EltwiseRunWithType();
     template <typename T> void BroadcastRunWithType(int type);
...
@@ -40,9 +40,7 @@ class PowGradientOp final : public Operator<Context> {
         power_scale = power * scale;
     }

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -35,9 +35,7 @@ class ScaleGradientOp final : public Operator<Context> {
           axis(OperatorBase::GetSingleArg<int>("axis", 1)),
           num_axes(OperatorBase::GetSingleArg<int>("num_axes", -1)) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void BiasRunWithType();
     template <typename T> void ScaleRunWithType();
     template <typename T> void RunWithType();
...
@@ -25,9 +25,7 @@ class SquareGradientOp final : public Operator<Context> {
 public:
     USE_SIMPLE_CTOR_DTOR(SquareGradientOp);

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();
 };
...
@@ -29,9 +29,8 @@ class SubGradientOp final : public Operator<Context> {
 public:
     USE_SIMPLE_CTOR_DTOR(SubGradientOp);

-    void ShareBeforeRun() override;
+    void ShareGradient() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void EltwiseRunWithType();
     template <typename T> void BroadcastRunWithType(int type);
...
@@ -34,9 +34,7 @@ class AtGradientOp final : public Operator<Context> {
           axis(OperatorBase::GetSingleArg<int>("axis", 0)),
           acc_grad(OperatorBase::GetSingleArg<bool>("acc_gradient", false)) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -36,9 +36,8 @@ class ConcatGradientOp : public Operator<Context> {
           axis(OperatorBase::GetSingleArg<int>("axis", 1)),
           nin(OperatorBase::GetSingleArg<int>("num_input", 1)) {}

-    void ShareBeforeRun() override;
+    void ShareGradient() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -57,9 +57,7 @@ class CropGradientOp final : public Operator<Context> {
     }

     void ComputeOutputShape();
-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();
     template <typename T> void RecursiveRunWithType(vector<TIndex> idxs,
                                                     const vector<TIndex>& offsets,
...
@@ -27,7 +27,10 @@ class ExpandDimsOp final : public Operator<Context> {
 template <class Context>
 class ExpandDimsGradientOp final : public Operator<Context> {
 public:
-    USE_SIMPLE_CTOR_DTOR(ExpandDimsGradientOp);
+    ExpandDimsGradientOp(const OperatorDef& op_def, Workspace* ws)
+        : Operator<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
 };
...
@@ -28,7 +28,10 @@ class FlattenOp final : public Operator<Context> {
 template <class Context>
 class FlattenGradientOp final : public Operator<Context> {
 public:
-    USE_SIMPLE_CTOR_DTOR(FlattenGradientOp);
+    FlattenGradientOp(const OperatorDef& op_def, Workspace* ws)
+        : Operator<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
 };
...
@@ -43,7 +43,9 @@ template <class Context>
 class TemplateGradientOp : public TemplateOp<Context> {
 public:
     TemplateGradientOp(const OperatorDef& op_def, Workspace* ws)
-        : TemplateOp<Context>(op_def, ws) {}
+        : TemplateOp<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
 };
...
@@ -39,9 +39,7 @@ class ReduceGradientOp final : public Operator<Context> {
           axis(OperatorBase::GetSingleArg<int>("axis", -1)),
           operation(OperatorBase::GetSingleArg<string>("operation", "NONE")) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void SumRunWithType();
     template <typename T> void MeanRunWithType();
...
@@ -30,7 +30,10 @@ class ReshapeOp final : public Operator<Context> {
 template <class Context>
 class ReshapeGradientOp final : public Operator<Context> {
 public:
-    USE_SIMPLE_CTOR_DTOR(ReshapeGradientOp);
+    ReshapeGradientOp(const OperatorDef& op_def, Workspace* ws)
+        : Operator<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
 };
...
@@ -61,6 +61,8 @@ class ScanGradientOp final : public Operator<Context> {
         // handle GI(x)
         for (int i = 0; i < forward_inputs.size(); i++)
             terms[forward_inputs[i] + "_grad"] = output(i)->name();
+
+        DISABLE_SHARE_GRADIENT;
     }

     void RunOnDevice() override;
...
@@ -35,7 +35,9 @@ class SliceGradientOp final : public Operator<Context> {
     SliceGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
           axis(OperatorBase::GetSingleArg<int>("axis", 1)),
-          nout(OperatorBase::GetSingleArg<int>("num_output", 1)) {}
+          nout(OperatorBase::GetSingleArg<int>("num_output", 1)) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
     template <typename T> void RunWithType();
...
@@ -43,9 +43,7 @@ class TileGradientOp : public Operator<Context> {
         process_axes.push_back({ i, multiples[i] });
     }

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void TileRunWithType();

 protected:
...
@@ -33,9 +33,7 @@ class TransposeGradientOp final : public Operator<Context> {
     TransposeGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -16,14 +16,12 @@ class L1LossOp : public Operator<Context> {
 public:
     L1LossOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
-          coeff(OperatorBase::GetSingleArg<float>("coeff", 1.0)),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "BATCH_SIZE")) {}

     void RunOnDevice() override;
     template <typename T> void RunWithType();

 protected:
-    float coeff;
     Tensor* diff;
     string normalization;
 };
@@ -33,14 +31,13 @@ class L1LossGradientOp final : public Operator<Context> {
 public:
     L1LossGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
-          coeff(OperatorBase::GetSingleArg<float>("coeff", 1.0)),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "BATCH_SIZE")) {}

+    void ShareGradient() override;
     void RunOnDevice() override;
     template <typename T> void RunWithType();

 protected:
-    float coeff;
     Tensor* diff;
     string normalization;
 };
...
@@ -16,14 +16,12 @@ class L2LossOp : public Operator<Context> {
 public:
     L2LossOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
-          coeff(OperatorBase::GetSingleArg<float>("coeff", 1.0)),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "BATCH_SIZE")) {}

     void RunOnDevice() override;
     template <typename T> void RunWithType();

 protected:
-    float coeff;
     Tensor* diff;
     string normalization;
 };
@@ -33,14 +31,13 @@ class L2LossGradientOp final : public Operator<Context> {
 public:
     L2LossGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
-          coeff(OperatorBase::GetSingleArg<float>("coeff", 1.0)),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "BATCH_SIZE")) {}

+    void ShareGradient() override;
     void RunOnDevice() override;
     template <typename T> void RunWithType();

 protected:
-    float coeff;
     Tensor* diff;
     string normalization;
 };
...
@@ -4,19 +4,20 @@
 // Written by Ting Pan
 // --------------------------------------------------------

-#ifndef DRAGON_OPERATORS_LOSS_SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
-#define DRAGON_OPERATORS_LOSS_SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
+#ifndef DRAGON_OPERATORS_LOSS_SIGMOID_CROSS_ENTROPY_OP_H_
+#define DRAGON_OPERATORS_LOSS_SIGMOID_CROSS_ENTROPY_OP_H_

 #include "core/operator.h"

 namespace dragon {

 template <class Context>
-class SigmoidCrossEntropyLossOp final : public Operator<Context> {
+class SigmoidCrossEntropyOp final : public Operator<Context> {
 public:
-    SigmoidCrossEntropyLossOp(const OperatorDef& op_def, Workspace* ws)
+    SigmoidCrossEntropyOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "FULL")) {}

     void RunOnDevice() override;
     template <typename T> void RunWithType();
@@ -27,9 +28,9 @@ class SigmoidCrossEntropyLossOp final : public Operator<Context> {
 };

 template <class Context>
-class SigmoidCrossEntropyLossGradientOp final : public Operator<Context> {
+class SigmoidCrossEntropyGradientOp final : public Operator<Context> {
 public:
-    SigmoidCrossEntropyLossGradientOp(const OperatorDef& op_def, Workspace* ws)
+    SigmoidCrossEntropyGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "FULL")) {}
@@ -43,4 +44,4 @@ class SigmoidCrossEntropyLossGradientOp final : public Operator<Context> {
 } // namespace dragon

-#endif // DRAGON_OPERATORS_LOSS_SIGMOID_CROSS_ENTROPY_LOSS_OP_H_
+#endif // DRAGON_OPERATORS_LOSS_SIGMOID_CROSS_ENTROPY_OP_H_
\ No newline at end of file
@@ -4,17 +4,17 @@
 // Written by Ting Pan
 // --------------------------------------------------------

-#ifndef DRAGON_OPERATORS_LOSS_SOFTMAX_CROSS_ENTROPY_LOSS_OP_H_
-#define DRAGON_OPERATORS_LOSS_SOFTMAX_CROSS_ENTROPY_LOSS_OP_H_
+#ifndef DRAGON_OPERATORS_LOSS_SOFTMAX_CROSS_ENTROPY_OP_H_
+#define DRAGON_OPERATORS_LOSS_SOFTMAX_CROSS_ENTROPY_OP_H_

 #include "core/operator.h"

 namespace dragon {

 template <class Context>
-class SoftmaxCrossEntropyLossOp final : public Operator<Context> {
+class SoftmaxCrossEntropyOp final : public Operator<Context> {
 public:
-    SoftmaxCrossEntropyLossOp(const OperatorDef& op_def, Workspace* ws)
+    SoftmaxCrossEntropyOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
           axis(OperatorBase::GetSingleArg<int>("axis", 1)),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "FULL")) {
@@ -39,9 +39,9 @@ class SoftmaxCrossEntropyLossOp final : public Operator<Context> {
 };

 template <class Context>
-class SoftmaxCrossEntropyLossGradientOp final : public Operator<Context> {
+class SoftmaxCrossEntropyGradientOp final : public Operator<Context> {
 public:
-    SoftmaxCrossEntropyLossGradientOp(const OperatorDef& op_def, Workspace* ws)
+    SoftmaxCrossEntropyGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
           axis(OperatorBase::GetSingleArg<int>("axis", 1)),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "FULL")) {}
@@ -57,4 +57,4 @@ class SoftmaxCrossEntropyLossGradientOp final : public Operator<Context> {
 } // namespace dragon

-#endif // DRAGON_OPERATORS_LOSS_SOFTMAX_CROSS_ENTROPY_LOSS_OP_H_
+#endif // DRAGON_OPERATORS_LOSS_SOFTMAX_CROSS_ENTROPY_OP_H_
\ No newline at end of file
@@ -4,17 +4,17 @@
 // Written by Ting Pan
 // --------------------------------------------------------

-#ifndef DRAGON_OPERATORS_LOSS_SOFTMAX_LOSS_OP_H_
-#define DRAGON_OPERATORS_LOSS_SOFTMAX_LOSS_OP_H_
+#ifndef DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_CROSS_ENTROPY_OP_H_
+#define DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_CROSS_ENTROPY_OP_H_

 #include "core/operator.h"

 namespace dragon {

 template <class Context>
-class SoftmaxLossOp final : public Operator<Context> {
+class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
 public:
-    SoftmaxLossOp(const OperatorDef& op_def, Workspace* ws)
+    SparseSoftmaxCrossEntropyOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
           axis(OperatorBase::GetSingleArg<int>("axis", 1)),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "VALID")) {
@@ -45,9 +45,9 @@ class SoftmaxLossOp final : public Operator<Context> {
 };

 template <class Context>
-class SoftmaxLossGradientOp final : public Operator<Context> {
+class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> {
 public:
-    SoftmaxLossGradientOp(const OperatorDef& op_def, Workspace* ws)
+    SparseSoftmaxCrossEntropyGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws),
           axis(OperatorBase::GetSingleArg<int>("axis", 1)),
           normalization(OperatorBase::GetSingleArg<string>("normalization", "VALID")) {
@@ -71,4 +71,4 @@ class SoftmaxLossGradientOp final : public Operator<Context> {
 } // namespace dragon

-#endif // DRAGON_OPERATORS_LOSS_SOFTMAX_LOSS_OP_H_
+#endif // DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_CROSS_ENTROPY_OP_H_
\ No newline at end of file
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_FOCAL_LOSS_OP_H_
#define DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_FOCAL_LOSS_OP_H_
#include "operators/loss/sparse_softmax_cross_entropy_op.h"
namespace dragon {
template <class Context>
class SparseSoftmaxFocalLossOp final : public SparseSoftmaxCrossEntropyOp<Context> {
public:
SparseSoftmaxFocalLossOp(const OperatorDef& op_def, Workspace* ws)
: SparseSoftmaxCrossEntropyOp<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 1)),
normalization(OperatorBase::GetSingleArg<string>("normalization", "VALID")),
alpha(OperatorBase::GetSingleArg<float>("alpha", 1.0)),
gamma(OperatorBase::GetSingleArg<float>("gamma", 2.0)),
use_pseudo_metric(OperatorBase::GetSingleArg<bool>("use_pseudo_metric", true)) {
if (alpha == 1.0) use_pseudo_metric = false;
}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
float alpha, gamma;
bool use_pseudo_metric;
TIndex axis, outer_dim, inner_dim;
Tensor* scale;
string normalization;
};
template <class Context>
class SparseSoftmaxFocalLossGradientOp final : public SparseSoftmaxCrossEntropyGradientOp<Context> {
public:
SparseSoftmaxFocalLossGradientOp(const OperatorDef& op_def, Workspace* ws)
: SparseSoftmaxCrossEntropyGradientOp<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 1)),
normalization(OperatorBase::GetSingleArg<string>("normalization", "VALID")),
gamma(OperatorBase::GetSingleArg<float>("gamma", 2.0)),
eps(OperatorBase::GetSingleArg<float>("eps", float(1e-10))) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
float gamma, eps;
TIndex axis, outer_dim, inner_dim;
Tensor* scale;
string normalization;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_LOSS_SPARSE_SOFTMAX_FOCAL_LOSS_OP_H_
\ No newline at end of file
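For reference, the alpha/gamma arguments above follow the standard focal loss (Lin et al., 2017), which this commit does not restate:

    FL(p_t) = -\alpha \, (1 - p_t)^{\gamma} \, \log(p_t)

The defaults match \gamma = 2.0 and \alpha = 1.0; at \gamma = 0 the modulating factor vanishes and the loss reduces to the (alpha-weighted) cross entropy, which is why the op derives from SparseSoftmaxCrossEntropyOp.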
@@ -27,7 +27,9 @@ template <class Context>
 class MPIBroadcastGradientOp final : public ModelMPIBase<Context> {
 public:
     MPIBroadcastGradientOp(const OperatorDef& op_def, Workspace* ws)
-        : ModelMPIBase<Context>(op_def, ws) {}
+        : ModelMPIBase<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
     template <typename T> void RunWithType();
...
@@ -27,7 +27,9 @@ template <class Context>
 class MPIGatherGradientOp final : public ModelMPIBase<Context> {
 public:
     MPIGatherGradientOp(const OperatorDef& op_def, Workspace* ws)
-        : ModelMPIBase<Context>(op_def, ws) {}
+        : ModelMPIBase<Context>(op_def, ws) {
+        DISABLE_SHARE_GRADIENT;
+    }

     void RunOnDevice() override;
     template <typename T> void RunWithType();
...
@@ -30,7 +30,7 @@ class BatchNormOp : public Operator<Context> {
     Tensor* num_multiplier, *spatial_multiplier, *stddev, *var;
     TIndex num, channels, spatial_dim, nbychans;
     int use_stats;
-    bool use_global_stats, inplace;
+    bool use_global_stats, inplace, is_recomputing;
 };

 template <class Context>
@@ -40,9 +40,7 @@ class BatchNormGradientOp final : public Operator<Context> {
         : Operator<Context>(op_def, ws),
           use_stats(OperatorBase::GetSingleArg<int>("use_stats", -1)) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
@@ -68,7 +66,7 @@ class BNOp : public Operator<Context> {
 protected:
     float momentum, eps;
     int use_stats;
-    bool use_global_stats;
+    bool use_global_stats, is_recomputing;
 };

 template <class Context>
@@ -79,9 +77,8 @@ class BNGradientOp : public Operator<Context> {
           eps(OperatorBase::GetSingleArg<float>("eps", float(1e-3))),
           use_stats(OperatorBase::GetSingleArg<int>("use_stats", -1)) {}

-    void ShareBeforeRun() override;
+    void ShareGradient() override;
     void RunOnDevice() override { NOT_IMPLEMENTED; }
-    void ClearAfterRun() override;
     template <typename T> void RunWithType() { NOT_IMPLEMENTED; }

 protected:
@@ -115,7 +112,7 @@ class CuDNNBNOp final : public BNOp<Context> {
     cudnnTensorDescriptor_t input_desc, output_desc, bn_desc;
     TIndex num, channels, spatial_dim;
     Tensor* mean, *var;
-    bool use_global_stats;
+    bool use_global_stats, is_recomputing;
 };

 template <class Context>
...
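The is_recomputing flag threaded through the BatchNorm/BN variants exists because a memonger-triggered re-run of the forward pass must not fold the batch statistics into the running averages a second time. An illustrative fragment of how such a flag is typically consumed (not the committed body):

    // inside a BN-style RunWithType(); the flag is assumed to be set
    // from workspace state when the op is re-run to restore an output
    if (!use_global_stats && !is_recomputing) {
        // first (real) pass: update running mean/var with `momentum`
    }
    // recomputing pass: reuse the statistics already accumulated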
@@ -36,7 +36,7 @@ class BatchRenormOp : public Operator<Context> {
     Tensor* stddev, *r, *var, *x_norm;
     TIndex num, channels, spatial_dim, nbychans;
     int use_stats;
-    bool use_global_stats, inplace;
+    bool use_global_stats, inplace, is_recomputing;
 };

 template <class Context>
@@ -46,9 +46,7 @@ class BatchRenormGradientOp final : public Operator<Context> {
         : Operator<Context>(op_def, ws),
           use_stats(OperatorBase::GetSingleArg<int>("use_stats", -1)) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -36,9 +36,7 @@ class InstanceNormGradientOp final : public Operator<Context> {
     InstanceNormGradientOp(const OperatorDef& op_def, Workspace* ws)
         : Operator<Context>(op_def, ws) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -40,9 +40,7 @@ class L2NormGradientOp final : public Operator<Context> {
           axis(OperatorBase::GetSingleArg<int>("axis", 0)),
           num_axes(OperatorBase::GetSingleArg<int>("num_axes", -1)) {}

-    void ShareBeforeRun() override;
     void RunOnDevice() override;
-    void ClearAfterRun() override;
     template <typename T> void RunWithType();

 protected:
...
@@ -30,7 +30,10 @@ class LSTMUnitOp : public Operator<Context> {
 template <class Context>
 class LSTMUnitGradientOp : public Operator<Context> {
 public:
-    USE_SIMPLE_CTOR_DTOR(LSTMUnitGradientOp);
+    LSTMUnitGradientOp(const OperatorDef& op_def, Workspace* ws)
+        : Operator<Context>(op_def, ws) {
+        this->allow_share_grads_ = false;
+    }

     void RunOnDevice() override;
     template <typename T> void RunWithType();
...
@@ -4,23 +4,14 @@
 // Written by Ting Pan
 // --------------------------------------------------------

-#ifndef DRAGON_OPERATORS_COMMON_UTILS_OP_H_
-#define DRAGON_OPERATORS_COMMON_UTILS_OP_H_
+#ifndef DRAGON_OPERATORS_UTILS_ACCURACY_OP_H_
+#define DRAGON_OPERATORS_UTILS_ACCURACY_OP_H_

 #include "core/operator.h"

 namespace dragon {

 template <class Context>
-class CopyOp final : public Operator<Context> {
-public:
-    USE_SIMPLE_CTOR_DTOR(CopyOp);
-    void RunOnDevice() override;
-    template <typename T> void RunWithType();
-};
-
-template <class Context>
 class AccuracyOp final : public Operator<Context> {
 public:
     AccuracyOp(const OperatorDef& op_def, Workspace* ws)
@@ -42,22 +33,6 @@ class AccuracyOp final : public Operator<Context> {
     Tensor ignore_labels;
 };

-template <class Context>
-class OneHotOp final : public Operator<Context> {
-public:
-    OneHotOp(const OperatorDef& op_def, Workspace* ws)
-        : Operator<Context>(op_def, ws),
-          depth(OperatorBase::GetSingleArg<int>("depth", -1)),
-          on_value(OperatorBase::GetSingleArg<int>("on_value", 1)),
-          off_value(OperatorBase::GetSingleArg<int>("off_value", 0)) {}
-    void RunOnDevice() override;
-    template <typename T> void RunWithType();
-
-protected:
-    TIndex depth, on_value, off_value;
-};
-
 } // namespace dragon

-#endif // DRAGON_OPERATORS_COMMON_UTILS_OP_H_
+#endif // DRAGON_OPERATORS_UTILS_ACCURACY_OP_H_
\ No newline at end of file
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_COPY_OP_H_
#define DRAGON_OPERATORS_UTILS_COPY_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class CopyOp final : public Operator<Context> {
public:
USE_SIMPLE_CTOR_DTOR(CopyOp);
void RunOnDevice() override;
template <typename T> void RunWithType();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_COPY_OP_H_
\ No newline at end of file
...@@ -19,6 +19,7 @@ class GradientGenerateOp final: public Operator<Context> { ...@@ -19,6 +19,7 @@ class GradientGenerateOp final: public Operator<Context> {
defaults(OperatorBase::GetRepeatedArg<float>("defaults")) { defaults(OperatorBase::GetRepeatedArg<float>("defaults")) {
CHECK_EQ(InputSize(), OutputSize()); CHECK_EQ(InputSize(), OutputSize());
CHECK_EQ(defaults.size(), OutputSize()); CHECK_EQ(defaults.size(), OutputSize());
DISABLE_SHARE_GRADIENT;
} }
void RunOnDevice() override; void RunOnDevice() override;
...@@ -35,6 +36,7 @@ class GradientGatherOp final : public Operator<Context> { ...@@ -35,6 +36,7 @@ class GradientGatherOp final : public Operator<Context> {
: Operator<Context>(op_def, ws) { : Operator<Context>(op_def, ws) {
for (int i = 0; i < InputSize(); i++) for (int i = 0; i < InputSize(); i++)
if (input(i).name() != "ignore") indices.push_back(i); if (input(i).name() != "ignore") indices.push_back(i);
DISABLE_SHARE_GRADIENT;
} }
void RunOnDevice() override; void RunOnDevice() override;
...@@ -47,7 +49,11 @@ class GradientGatherOp final : public Operator<Context> { ...@@ -47,7 +49,11 @@ class GradientGatherOp final : public Operator<Context> {
template <class Context> template <class Context>
class StopGradientOp final : public Operator<Context> { class StopGradientOp final : public Operator<Context> {
public: public:
USE_SIMPLE_CTOR_DTOR(StopGradientOp); StopGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws) {
DISABLE_SHARE_GRADIENT;
}
void RunOnDevice() override; void RunOnDevice() override;
}; };
......
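The three operators above are exactly the ones that must opt out of gradient sharing: GradientGenerate and GradientGather keep several gradients alive at once, and StopGradient produces no real gradient at all. A minimal Python sketch of the hazard, using a toy pool whose names are illustrative and not Dragon's API:

```python
# Toy gradient-buffer pool; a sketch of the hazard, not Dragon's implementation.
class GradPool(object):
    def __init__(self):
        self.free, self.count = [], 0

    def get(self):
        if self.free:
            return self.free.pop()
        self.count += 1
        return 'grad_buffer_%d' % self.count

    def release(self, buf):
        self.free.append(buf)

pool = GradPool()
dy = pool.get()
dx = pool.get()      # a normal *Gradient op: write dX, then retire dY
pool.release(dy)

# A gather-style op sums N input gradients into one output; releasing any
# of them before the sum completes would let the next op overwrite live
# data, so such operators must keep sharing disabled.
```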
// --------------------------------------------------------
// Dragon
// Copyright(c) 2017 SeetaTech
// Written by Ting Pan
// --------------------------------------------------------
#ifndef DRAGON_OPERATORS_UTILS_ONE_HOT_OP_H_
#define DRAGON_OPERATORS_UTILS_ONE_HOT_OP_H_
#include "core/operator.h"
namespace dragon {
template <class Context>
class OneHotOp final : public Operator<Context> {
public:
OneHotOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws),
depth(OperatorBase::GetSingleArg<int>("depth", -1)),
on_value(OperatorBase::GetSingleArg<int>("on_value", 1)),
off_value(OperatorBase::GetSingleArg<int>("off_value", 0)) {}
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
TIndex depth, on_value, off_value;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_UTILS_ONE_HOT_OP_H_
\ No newline at end of file
...@@ -30,9 +30,7 @@ class ConvGradientOp : public ConvOp<Context> { ...@@ -30,9 +30,7 @@ class ConvGradientOp : public ConvOp<Context> {
ConvGradientOp(const OperatorDef& def, Workspace* ws) ConvGradientOp(const OperatorDef& def, Workspace* ws)
: ConvOp<Context>(def, ws) {} : ConvOp<Context>(def, ws) {}
void ShareBeforeRun() override;
void RunOnDevice() override; void RunOnDevice() override;
void ClearAfterRun() override;
template <typename T> void RunWithType(); template <typename T> void RunWithType();
}; };
......
...@@ -31,9 +31,7 @@ class DeConvGradientOp : public DeConvOp<Context> { ...@@ -31,9 +31,7 @@ class DeConvGradientOp : public DeConvOp<Context> {
DeConvGradientOp(const OperatorDef& def, Workspace* ws) : DeConvGradientOp(const OperatorDef& def, Workspace* ws) :
DeConvOp<Context>(def, ws) {} DeConvOp<Context>(def, ws) {}
void ShareBeforeRun() override;
void RunOnDevice() override; void RunOnDevice() override;
void ClearAfterRun() override;
template <typename T> void RunWithType(); template <typename T> void RunWithType();
}; };
......
...@@ -15,21 +15,21 @@ template <class Context> ...@@ -15,21 +15,21 @@ template <class Context>
class DenseConcatOp final : public ConcatOp<Context> { class DenseConcatOp final : public ConcatOp<Context> {
public: public:
DenseConcatOp(const OperatorDef& op_def, Workspace* ws) DenseConcatOp(const OperatorDef& op_def, Workspace* ws)
: ConcatOp<Context>(op_def, ws) { } : ConcatOp<Context>(op_def, ws) {}
void RunOnDevice() override;
}; };
template <class Context> template <class Context>
class DenseConcatGradientOp : public ConcatGradientOp<Context> { class DenseConcatGradientOp : public ConcatGradientOp<Context> {
public: public:
DenseConcatGradientOp(const OperatorDef& op_def, Workspace* ws) DenseConcatGradientOp(const OperatorDef& op_def, Workspace* ws)
: ConcatGradientOp<Context>(op_def, ws) {} : ConcatGradientOp<Context>(op_def, ws),
growth_rate(OperatorBase::GetSingleArg<int>("growth_rate", 0)) {}
void ElimateCorruption() override;
template <typename T> void RestoreX1();
void ShareBeforeRun() override; protected:
void RunOnDevice() override; TIndex growth_rate;
void ClearAfterRun() override;
template <typename T> void RunWithType();
}; };
......
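DenseConcatGradientOp now carries growth_rate so that the dropped input X1 can be restored from the concatenated output: with Y = concat(X1, X2) along `axis` and X2 contributing `growth_rate` channels, X1 is simply the leading slice of Y. A NumPy sketch of what RestoreX1 can do under that assumption:

```python
# Sketch of the DenseNet trick the growth_rate argument enables, assuming
# Y = concat(X1, X2) along `axis` where X2 has `growth_rate` channels.
import numpy as np

def restore_x1(y, growth_rate, axis=1):
    """Slice the first (C - growth_rate) channels of y back out as x1."""
    keep = y.shape[axis] - growth_rate
    index = [slice(None)] * y.ndim
    index[axis] = slice(0, keep)
    return y[tuple(index)]

x1 = np.random.randn(2, 64, 8, 8).astype('float32')
x2 = np.random.randn(2, 32, 8, 8).astype('float32')   # growth_rate = 32
y = np.concatenate([x1, x2], axis=1)
assert np.array_equal(restore_x1(y, growth_rate=32), x1)
```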
...@@ -35,9 +35,7 @@ class NNResizeGradientOp : public Operator<Context> { ...@@ -35,9 +35,7 @@ class NNResizeGradientOp : public Operator<Context> {
NNResizeGradientOp(const OperatorDef& op_def, Workspace* ws) NNResizeGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws) {} : Operator<Context>(op_def, ws) {}
void ShareBeforeRun() override;
void RunOnDevice() override; void RunOnDevice() override;
void ClearAfterRun() override;
template <typename T> void RunWithType(); template <typename T> void RunWithType();
}; };
......
...@@ -69,9 +69,7 @@ class PoolingGradientOp: public Operator<Context> { ...@@ -69,9 +69,7 @@ class PoolingGradientOp: public Operator<Context> {
} }
void Reshape(); void Reshape();
void ShareBeforeRun() override;
void RunOnDevice() override; void RunOnDevice() override;
void ClearAfterRun() override;
template <typename T> void MaxRunWithType(); template <typename T> void MaxRunWithType();
template <typename T> void AvgRunWithType(); template <typename T> void AvgRunWithType();
......
...@@ -44,9 +44,8 @@ class ROIAlignGradientOp : public Operator<Context> { ...@@ -44,9 +44,8 @@ class ROIAlignGradientOp : public Operator<Context> {
CHECK_GT(pool_w, 0) << "\npool_w must be > 0"; CHECK_GT(pool_w, 0) << "\npool_w must be > 0";
} }
void ShareBeforeRun() override;
void RunOnDevice() override; void RunOnDevice() override;
void ClearAfterRun() override; void CleanResource() override;
template <typename T> void RunWithType(); template <typename T> void RunWithType();
protected: protected:
......
...@@ -41,9 +41,8 @@ class ROIPoolingGradientOp final : public Operator<Context> { ...@@ -41,9 +41,8 @@ class ROIPoolingGradientOp final : public Operator<Context> {
pool_w(OperatorBase::GetSingleArg<int>("pool_w", 0)), pool_w(OperatorBase::GetSingleArg<int>("pool_w", 0)),
spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)) {} spatial_scale(OperatorBase::GetSingleArg<float>("spatial_scale", 1.0)) {}
void ShareBeforeRun() override;
void RunOnDevice() override; void RunOnDevice() override;
void ClearAfterRun() override; void CleanResource() override;
template <typename T> void RunWithType(); template <typename T> void RunWithType();
protected: protected:
......
...@@ -286,21 +286,12 @@ void TransposeGrad(const int count, ...@@ -286,21 +286,12 @@ void TransposeGrad(const int count,
const T* dy, const T* dy,
T* dx); T* dx);
/******************** common.utils ********************/
template <typename T, class Context>
void OneHot(const int count,
const int depth,
const int on_value,
const T* x,
T* y);
/******************** loss.l1_loss ********************/ /******************** loss.l1_loss ********************/
template <typename T, class Context> template <typename T, class Context>
void AbsGrad(const int count, const T* dy, T* dx); void AbsGrad(const int count, const T* dy, T* dx);
/******************** loss.sigmoid_cross_entropy_loss ********************/ /******************** loss.sigmoid_cross_entropy ********************/
template <typename T, class Context> template <typename T, class Context>
void SigmoidCrossEntropy(const int count, const T* x, const T* target, T* loss); void SigmoidCrossEntropy(const int count, const T* x, const T* target, T* loss);
...@@ -313,12 +304,12 @@ void SmoothL1(const int count, const float sigma2, const T* x, T* y); ...@@ -313,12 +304,12 @@ void SmoothL1(const int count, const float sigma2, const T* x, T* y);
template <typename T, class Context> template <typename T, class Context>
void SmoothL1Grad(const int count, const float sigma2, const T* dy, T* dx); void SmoothL1Grad(const int count, const float sigma2, const T* dy, T* dx);
/******************** loss.softmax_cross_entropy_loss ********************/ /******************** loss.softmax_cross_entropy ********************/
template <typename T, class Context> template <typename T, class Context>
void SoftmaxCrossEntropy(const int count, const T* prob, const T* target, T* loss); void SoftmaxCrossEntropy(const int count, const T* prob, const T* target, T* loss);
/******************** loss.softmax_loss ********************/ /******************** loss.sparse_softmax_cross_entropy ********************/
template <typename T, class Context> template <typename T, class Context>
void SparseSoftmaxCrossEntropy(const int count, void SparseSoftmaxCrossEntropy(const int count,
...@@ -332,12 +323,42 @@ void SparseSoftmaxCrossEntropy(const int count, ...@@ -332,12 +323,42 @@ void SparseSoftmaxCrossEntropy(const int count,
Tensor* ignore); Tensor* ignore);
template <typename T, class Context> template <typename T, class Context>
void SoftmaxLossGrad(const int count, void SparseSoftmaxCrossEntropyGrad(const int count,
const int classes, const int classes,
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const T* prob,
const T* labels, const T* labels,
T* valid,
Tensor* ignore,
T* dXdata);
/******************** loss.sparse_softmax_focal_loss ********************/
template <typename T, class Context>
void SparseSoftmaxFocalLoss(const int count,
const int classes,
const int outer_dim,
const int inner_dim,
const float alpha,
const float gamma,
const T* prob, const T* prob,
const T* labels,
T* scale,
T* loss,
T* valid,
Tensor* ignore);
template <typename T, class Context>
void SparseSoftmaxFocalLossGrad(const int count,
const int classes,
const int outer_dim,
const int inner_dim,
const float gamma,
const float eps,
const T* scale,
const T* prob,
const T* labels,
T* valid, T* valid,
Tensor* ignore, Tensor* ignore,
T* dXdata); T* dXdata);
...@@ -422,6 +443,15 @@ void MemoryData(const int count, ...@@ -422,6 +443,15 @@ void MemoryData(const int count,
const Tx* x, const Tx* x,
Ty* y); Ty* y);
/******************** utils.one_hot ********************/
template <typename T, class Context>
void OneHot(const int count,
const int depth,
const int on_value,
const T* x,
T* y);
/******************** vision.conv ********************/ /******************** vision.conv ********************/
template <typename T, class Context> template <typename T, class Context>
......
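The relocated utils.one_hot kernel scatters on_value at the positions indexed by x; a NumPy sketch of that contract (off_value is assumed to be pre-filled by the caller, here folded into the helper for completeness):

```python
import numpy as np

# NumPy sketch of the OneHot kernel's contract (illustrative only).
def one_hot(x, depth, on_value=1, off_value=0):
    y = np.full((x.size, depth), off_value, dtype=np.int64)
    y[np.arange(x.size), x] = on_value    # scatter on_value at the labels
    return y

print(one_hot(np.array([0, 2, 1]), depth=3))
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]]
```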
...@@ -20,7 +20,11 @@ option['device'] = 'CPU' ...@@ -20,7 +20,11 @@ option['device'] = 'CPU'
option['gpu_id'] = 0 option['gpu_id'] = 0
option['use_cudnn'] = False option['use_cudnn'] = False
option['random_seed'] = 3 option['random_seed'] = 3
option['debug_mode'] = True
# if True, disable Dragon-Memonger
option['debug_mode'] = False
option['share_grads'] = False # set by Dragon-Memonger
option['allow_mirrow_stage'] = True # default
def EnableCPU(): def EnableCPU():
global option global option
...@@ -32,8 +36,8 @@ def EnableCUDA(gpu_id=0, use_cudnn=True): ...@@ -32,8 +36,8 @@ def EnableCUDA(gpu_id=0, use_cudnn=True):
option['gpu_id'] = gpu_id option['gpu_id'] = gpu_id
option['use_cudnn'] = use_cudnn option['use_cudnn'] = use_cudnn
# TODO(Pan): please not use @setter # TODO(PhyscalX): please do not use @setter
# TODO(Pan): seems that it can't change the global value # TODO(PhyscalX): it seems that it can't change the global value
def SetRandomSeed(seed): def SetRandomSeed(seed):
global option global option
......
...@@ -176,6 +176,6 @@ def Restore(filename, format=0): ...@@ -176,6 +176,6 @@ def Restore(filename, format=0):
FeedTensor(key, ndarray) FeedTensor(key, ndarray)
elif format == 1: elif format == 1:
# TODO(pan): caffemodel can't save the tensor name # TODO(PhyscalX): caffemodel can't save the tensor name
# TODO(pan): we simply use 'Scope + LayerName + @paramX' # TODO(PhyscalX): we simply use 'Scope + LayerName + @paramX'
RestoreCC(filename, '', format) RestoreCC(filename, '', format)
\ No newline at end of file
# --------------------------------------------------------
# Dragon
# Copyright(c) 2017 SeetaTech
# Written by Ting Pan
# --------------------------------------------------------
def share_grads(enabled=True):
from dragon.config import option
option['share_grads'] = enabled
def drop(op_func, *args, **kwargs):
kwargs['mirrow_stage'] = True
return op_func(*args, **kwargs)
\ No newline at end of file
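A possible way to drive the two knobs this module exposes; the import path follows this commit's layout and the network snippet is hypothetical:

```python
import dragon.memonger as memonger
import dragon.ops as ops

# Let gradient operators reuse a pooled buffer for their output gradients.
memonger.share_grads(True)

# Wrap an op so its output may be overwritten (mirrow stage) and
# recomputed on demand; `x` stands for some upstream Tensor.
# y = memonger.drop(ops.Relu, x)
```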
...@@ -4,12 +4,12 @@ ...@@ -4,12 +4,12 @@
# Written by Ting Pan # Written by Ting Pan
# -------------------------------------------------------- # --------------------------------------------------------
from __future__ import print_function
import numpy as np import numpy as np
import dragon.core.workspace as ws import dragon.core.workspace as ws
import dragon.ops as ops import dragon.ops as ops
import dragon.vm.theano as theano import dragon.vm.theano as theano
from multiprocessing import Process, Queue from multiprocessing import Process, Queue
from dragon.config import logger
""" How to custom a RunOp in Dragon """ """ How to custom a RunOp in Dragon """
...@@ -32,7 +32,7 @@ class Fetcher(Process): ...@@ -32,7 +32,7 @@ class Fetcher(Process):
self.daemon = True self.daemon = True
def cleanup(): def cleanup():
logger.info('Terminating Fetcher......') print('Terminating Fetcher......')
self.terminate() self.terminate()
self.join() self.join()
...@@ -104,4 +104,4 @@ if __name__ == '__main__': ...@@ -104,4 +104,4 @@ if __name__ == '__main__':
foo() foo()
# fetch # fetch
logger.info('y \n-------------- \n', y.get_value(), '\n') print('y \n-------------- \n', y.get_value(), '\n')
\ No newline at end of file \ No newline at end of file
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
# Written by Ting Pan # Written by Ting Pan
# -------------------------------------------------------- # --------------------------------------------------------
from __future__ import print_function
import numpy as np import numpy as np
import dragon.core.workspace as ws import dragon.core.workspace as ws
import dragon.ops as ops import dragon.ops as ops
from dragon.core.tensor import Tensor from dragon.core.tensor import Tensor
import dragon.vm.theano.tensor as T import dragon.vm.theano.tensor as T
import dragon.vm.theano as theano import dragon.vm.theano as theano
from dragon.config import logger
""" How to custom a TemplateOp in Dragon """ """ How to custom a TemplateOp in Dragon """
...@@ -91,14 +91,14 @@ if __name__ == '__main__': ...@@ -91,14 +91,14 @@ if __name__ == '__main__':
foo = theano.function(outputs=y) foo = theano.function(outputs=y)
# feed # feed
ws.FeedTensor(x1, np.ones((5, 3))) ws.FeedTensor(x1, np.ones((5, 3), dtype=np.float32))
ws.FeedTensor(x2, np.ones((5, 3)) * 5.0) ws.FeedTensor(x2, np.ones((5, 3), dtype=np.float32) * 5.0)
# run # run
foo() foo()
# fetch # fetch
logger.info('y \n-------------- \n', y.get_value(), '\n') print('y \n-------------- \n', y.get_value(), '\n')
logger.info('dx1 \n-------------- \n', dx1.get_value(), '\n') print('dx1 \n-------------- \n', dx1.get_value(), '\n')
logger.info('dx2 \n-------------- \n', dx2.get_value(), '\n') print('dx2 \n-------------- \n', dx2.get_value(), '\n')
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
from dragon.core.tensor import Tensor from dragon.core.tensor import Tensor
import numpy as np import numpy as np
def SoftmaxLoss(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwargs): def SparseSoftmaxCrossEntropy(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwargs):
""" """
:param inputs: a list of Tensors containing [input, label] :param inputs: a list of Tensors containing [input, label]
:param axis: an int specifying the axis along which to compute softmax :param axis: an int specifying the axis along which to compute softmax
...@@ -17,12 +17,12 @@ def SoftmaxLoss(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwarg ...@@ -17,12 +17,12 @@ def SoftmaxLoss(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwarg
""" """
if not isinstance(inputs, list) or len(inputs) != 2: if not isinstance(inputs, list) or len(inputs) != 2:
raise RuntimeError('SoftmaxLoss Operator accepts a list of 2 Tensors') raise RuntimeError('SparseSoftmaxCrossEntropy Operator accepts a list of 2 Tensors')
args = locals(); kwargs = args['kwargs'] args = locals(); kwargs = args['kwargs']
del args['kwargs']; kwargs = dict(args, **kwargs) del args['kwargs']; kwargs = dict(args, **kwargs)
output = Tensor.CreateOperator(nout=1, op_type='SoftmaxLoss', **kwargs) output = Tensor.CreateOperator(nout=1, op_type='SparseSoftmaxCrossEntropy', **kwargs)
if inputs[0].shape is not None: if inputs[0].shape is not None:
if normalization != 'UNIT': output.shape = [1] if normalization != 'UNIT': output.shape = [1]
...@@ -35,7 +35,7 @@ def SoftmaxLoss(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwarg ...@@ -35,7 +35,7 @@ def SoftmaxLoss(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwarg
return output return output
def SigmoidCrossEntropyLoss(inputs, normalization='FULL', **kwargs): def SigmoidCrossEntropy(inputs, normalization='FULL', **kwargs):
""" """
:param inputs: a list of Tensors containing [input, label] :param inputs: a list of Tensors containing [input, label]
:param normalization: a str of (UNIT, FULL, BATCH_SIZE, NONE) :param normalization: a str of (UNIT, FULL, BATCH_SIZE, NONE)
...@@ -43,12 +43,12 @@ def SigmoidCrossEntropyLoss(inputs, normalization='FULL', **kwargs): ...@@ -43,12 +43,12 @@ def SigmoidCrossEntropyLoss(inputs, normalization='FULL', **kwargs):
""" """
if not isinstance(inputs, list) or len(inputs) != 2: if not isinstance(inputs, list) or len(inputs) != 2:
raise RuntimeError('SigmoidCrossEntropyLoss Operator accepts a list of 2 Tensors') raise RuntimeError('SigmoidCrossEntropy Operator accepts a list of 2 Tensors')
args = locals(); kwargs = args['kwargs'] args = locals(); kwargs = args['kwargs']
del args['kwargs']; kwargs = dict(args, **kwargs) del args['kwargs']; kwargs = dict(args, **kwargs)
output = Tensor.CreateOperator(nout=1, op_type='SigmoidCrossEntropyLoss', **kwargs) output = Tensor.CreateOperator(nout=1, op_type='SigmoidCrossEntropy', **kwargs)
if inputs[0].shape is not None: if inputs[0].shape is not None:
if normalization != 'UNIT': output.shape = [1] if normalization != 'UNIT': output.shape = [1]
...@@ -57,7 +57,7 @@ def SigmoidCrossEntropyLoss(inputs, normalization='FULL', **kwargs): ...@@ -57,7 +57,7 @@ def SigmoidCrossEntropyLoss(inputs, normalization='FULL', **kwargs):
return output return output
def SoftmaxCrossEntropyLoss(inputs, axis=1, normalization='FULL', **kwargs): def SoftmaxCrossEntropy(inputs, axis=1, normalization='FULL', **kwargs):
""" """
:param inputs: a list of Tensors containing [input, label] :param inputs: a list of Tensors containing [input, label]
:param normalization: a str of (UNIT, FULL, BATCH_SIZE, NONE) :param normalization: a str of (UNIT, FULL, BATCH_SIZE, NONE)
...@@ -65,12 +65,12 @@ def SoftmaxCrossEntropyLoss(inputs, axis=1, normalization='FULL', **kwargs): ...@@ -65,12 +65,12 @@ def SoftmaxCrossEntropyLoss(inputs, axis=1, normalization='FULL', **kwargs):
""" """
if not isinstance(inputs, list) or len(inputs) != 2: if not isinstance(inputs, list) or len(inputs) != 2:
raise RuntimeError('SoftmaxCrossEntropyLoss Operator accepts a list of 2 Tensors') raise RuntimeError('SoftmaxCrossEntropy Operator accepts a list of 2 Tensors')
args = locals(); kwargs = args['kwargs'] args = locals(); kwargs = args['kwargs']
del args['kwargs']; kwargs = dict(args, **kwargs) del args['kwargs']; kwargs = dict(args, **kwargs)
output = Tensor.CreateOperator(nout=1, op_type='SoftmaxCrossEntropyLoss', **kwargs) output = Tensor.CreateOperator(nout=1, op_type='SoftmaxCrossEntropy', **kwargs)
if inputs[0].shape is not None: if inputs[0].shape is not None:
if normalization != 'UNIT': output.shape = [1] if normalization != 'UNIT': output.shape = [1]
...@@ -96,7 +96,7 @@ def SmoothL1Loss(inputs, sigma=1.0, **kwargs): ...@@ -96,7 +96,7 @@ def SmoothL1Loss(inputs, sigma=1.0, **kwargs):
return output return output
def L1Loss(inputs, normalization='BATCH_SIZE', coeff=1.0, **kwargs): def L1Loss(inputs, normalization='BATCH_SIZE', **kwargs):
if not isinstance(inputs, list) or len(inputs) < 2: if not isinstance(inputs, list) or len(inputs) < 2:
raise RuntimeError('L1Loss Operator accepts a list of at least 2 Tensors') raise RuntimeError('L1Loss Operator accepts a list of at least 2 Tensors')
...@@ -109,7 +109,7 @@ def L1Loss(inputs, normalization='BATCH_SIZE', coeff=1.0, **kwargs): ...@@ -109,7 +109,7 @@ def L1Loss(inputs, normalization='BATCH_SIZE', coeff=1.0, **kwargs):
return output return output
def L2Loss(inputs, normalization='BATCH_SIZE', coeff=1.0, **kwargs): def L2Loss(inputs, normalization='BATCH_SIZE', **kwargs):
if not isinstance(inputs, list) or len(inputs) < 2: if not isinstance(inputs, list) or len(inputs) < 2:
raise RuntimeError('L2Loss Operator accepts a list of at least 2 Tensors') raise RuntimeError('L2Loss Operator accepts a list of at least 2 Tensors')
...@@ -120,3 +120,35 @@ def L2Loss(inputs, normalization='BATCH_SIZE', coeff=1.0, **kwargs): ...@@ -120,3 +120,35 @@ def L2Loss(inputs, normalization='BATCH_SIZE', coeff=1.0, **kwargs):
output = Tensor.CreateOperator(nout=1, op_type='L2Loss', **kwargs) output = Tensor.CreateOperator(nout=1, op_type='L2Loss', **kwargs)
if inputs[0].shape is not None: output.shape = [1] if inputs[0].shape is not None: output.shape = [1]
return output return output
def SparseSoftmaxFocalLoss(inputs, axis=1, normalization='VALID', ignore_labels=(),
alpha=0.25, gamma=2.0, eps=1e-10, use_pseudo_metric=True, **kwargs):
"""
:param inputs: a list of Tensors containing [input, label]
:param axis: an int specifying the axis along which to compute softmax
:param normalization: a str of (UNIT, FULL, VALID, BATCH_SIZE, NONE)
:param ignore_labels: a list of ints containing the labels to ignore
:param alpha: a float of the alpha value
:param gamma: a float of the gamma value
:param eps: a float of the eps value
:return: a Tensor of loss with the shape (1,)
"""
if not isinstance(inputs, list) or len(inputs) != 2:
raise RuntimeError('SparseSoftmaxFocalLoss Operator accepts a list of 2 Tensors')
args = locals(); kwargs = args['kwargs']
del args['kwargs']; kwargs = dict(args, **kwargs)
output = Tensor.CreateOperator(nout=1, op_type='SparseSoftmaxFocalLoss', **kwargs)
if inputs[0].shape is not None:
if normalization != 'UNIT': output.shape = [1]
elif all(dim is not None for dim in inputs[0].shape):
outer_dim = int(np.prod(inputs[0].shape[0 : axis]))
inner_dim = int(np.prod(inputs[0].shape[axis + 1 :]))
output.shape = [outer_dim * inner_dim]
else: output.shape = [None]
return output
\ No newline at end of file
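For reference, the loss these kernels compute is FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t), with p_t the softmax probability of the true class (Lin et al.). A standalone NumPy sketch whose defaults mirror the wrapper above:

```python
# Reference sketch of sparse softmax focal loss, independent of Dragon.
import numpy as np

def sparse_softmax_focal_loss(logits, labels, alpha=0.25, gamma=2.0, eps=1e-10):
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)
    p_t = probs[np.arange(len(labels)), labels]        # prob of the true class
    scale = alpha * np.power(1.0 - p_t, gamma)         # the 'scale' buffer above
    return -scale * np.log(np.maximum(p_t, eps))       # per-example loss

logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])
print(sparse_softmax_focal_loss(logits, np.array([0, 2])))
```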
...@@ -197,7 +197,7 @@ def BiasAdd(inputs, data_format='NCHW', **kwargs): ...@@ -197,7 +197,7 @@ def BiasAdd(inputs, data_format='NCHW', **kwargs):
return output return output
def DenseConcat(inputs, axis=1, **kwargs): def DenseConcat(inputs, growth_rate, axis=1, **kwargs):
if not isinstance(inputs, list) or len(inputs) != 2: if not isinstance(inputs, list) or len(inputs) != 2:
raise RuntimeError('DenseConcat Operator accepts 2 Tensors as inputs') raise RuntimeError('DenseConcat Operator accepts 2 Tensors as inputs')
...@@ -207,6 +207,7 @@ def DenseConcat(inputs, axis=1, **kwargs): ...@@ -207,6 +207,7 @@ def DenseConcat(inputs, axis=1, **kwargs):
kwargs['num_input'] = len(inputs) kwargs['num_input'] = len(inputs)
output = Tensor.CreateOperator(nout=1, op_type='DenseConcat', **kwargs) output = Tensor.CreateOperator(nout=1, op_type='DenseConcat', **kwargs)
if all(input.shape is not None for input in inputs): if all(input.shape is not None for input in inputs):
if all(input.shape[axis] is not None for input in inputs): if all(input.shape[axis] is not None for input in inputs):
output.shape = inputs[0].shape[:] output.shape = inputs[0].shape[:]
......
...@@ -52,12 +52,13 @@ Softmax = act.Softmax ...@@ -52,12 +52,13 @@ Softmax = act.Softmax
Dropout = act.Dropout Dropout = act.Dropout
# loss # loss
SoftmaxLoss = loss.SoftmaxLoss SparseSoftmaxCrossEntropy = loss.SparseSoftmaxCrossEntropy
SigmoidCrossEntropyLoss = loss.SigmoidCrossEntropyLoss SigmoidCrossEntropy = loss.SigmoidCrossEntropy
SoftmaxCrossEntropyLoss = loss.SoftmaxCrossEntropyLoss SoftmaxCrossEntropy = loss.SoftmaxCrossEntropy
SmoothL1Loss = loss.SmoothL1Loss SmoothL1Loss = loss.SmoothL1Loss
L1Loss = loss.L1Loss L1Loss = loss.L1Loss
L2Loss = loss.L2Loss L2Loss = loss.L2Loss
SparseSoftmaxFocalLoss = loss.SparseSoftmaxFocalLoss
# arithmetic # arithmetic
Add = math.Add Add = math.Add
......
...@@ -50,6 +50,7 @@ message OperatorDef { ...@@ -50,6 +50,7 @@ message OperatorDef {
repeated Argument arg= 5; repeated Argument arg= 5;
optional DeviceOption device_option = 6; optional DeviceOption device_option = 6;
optional bool debug_mode = 7 [default = false]; optional bool debug_mode = 7 [default = false];
optional bool share_grads = 8 [default = false];
} }
message GradientTarget { message GradientTarget {
...@@ -65,7 +66,6 @@ message UpdateTarget { ...@@ -65,7 +66,6 @@ message UpdateTarget {
repeated Argument arg = 4; repeated Argument arg = 4;
} }
// simply copy from caffe1
message TensorFiller { message TensorFiller {
optional string tensor = 1; optional string tensor = 1;
optional string type = 2 [default = 'constant']; optional string type = 2 [default = 'constant'];
...@@ -89,4 +89,5 @@ message GraphDef { ...@@ -89,4 +89,5 @@ message GraphDef {
repeated GradientTarget g_target = 8; repeated GradientTarget g_target = 8;
repeated UpdateTarget u_target = 9; repeated UpdateTarget u_target = 9;
optional bool debug_mode = 10 [default = false]; optional bool debug_mode = 10 [default = false];
optional bool share_grads = 11 [default = false];
} }
\ No newline at end of file
...@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default() ...@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor( DESCRIPTOR = _descriptor.FileDescriptor(
name='dragon.proto', name='dragon.proto',
package='', package='',
serialized_pb=_b('\n\x0c\x64ragon.proto\"\xf7\x01\n\x0bTensorProto\x12\x0c\n\x04\x64ims\x18\x01 \x03(\x05\x12/\n\tdata_type\x18\x02 \x01(\x0e\x32\x15.TensorProto.DataType:\x05\x46LOAT\x12\x16\n\nfloat_data\x18\x03 \x03(\x02\x42\x02\x10\x01\x12\x16\n\nint32_data\x18\x04 \x03(\x05\x42\x02\x10\x01\x12\x11\n\tbyte_data\x18\x05 \x01(\x0c\x12\x13\n\x0bstring_data\x18\x06 \x03(\x0c\x12\x0c\n\x04name\x18\x07 \x01(\t\"C\n\x08\x44\x61taType\x12\t\n\x05\x46LOAT\x10\x01\x12\t\n\x05INT32\x10\x02\x12\x08\n\x04\x42YTE\x10\x03\x12\n\n\x06STRING\x10\x04\x12\x0b\n\x07\x46LOAT16\x10\x0c\",\n\x0cTensorProtos\x12\x1c\n\x06protos\x18\x01 \x03(\x0b\x32\x0c.TensorProto\"\x80\x01\n\x08\x41rgument\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\t\n\x01\x66\x18\x02 \x01(\x02\x12\t\n\x01i\x18\x03 \x01(\x05\x12\x0b\n\x03i64\x18\t \x01(\x03\x12\t\n\x01s\x18\x04 \x01(\t\x12\t\n\x01\x62\x18\x08 \x01(\x08\x12\x0e\n\x06\x66loats\x18\x05 \x03(\x02\x12\x0c\n\x04ints\x18\x06 \x03(\x05\x12\x0f\n\x07strings\x18\x07 \x03(\t\"p\n\x0c\x44\x65viceOption\x12%\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\x0b.DeviceType:\x03\x43PU\x12\x11\n\x06gpu_id\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0brandom_seed\x18\x03 \x01(\r:\x01\x33\x12\x0e\n\x06\x65ngine\x18\x04 \x01(\t\"\xa1\x01\n\x0bOperatorDef\x12\r\n\x05input\x18\x01 \x03(\t\x12\x0e\n\x06output\x18\x02 \x03(\t\x12\x0c\n\x04name\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x16\n\x03\x61rg\x18\x05 \x03(\x0b\x32\t.Argument\x12$\n\rdevice_option\x18\x06 \x01(\x0b\x32\r.DeviceOption\x12\x19\n\ndebug_mode\x18\x07 \x01(\x08:\x05\x66\x61lse\"=\n\x0eGradientTarget\x12\x0c\n\x04\x63ost\x18\x01 \x01(\t\x12\x0b\n\x03wrt\x18\x02 \x01(\t\x12\x10\n\x08\x65xternal\x18\x03 \x01(\t\"R\n\x0cUpdateTarget\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06tensor\x18\x03 \x03(\t\x12\x16\n\x03\x61rg\x18\x04 \x03(\x0b\x32\t.Argument\"\x8d\x02\n\x0cTensorFiller\x12\x0e\n\x06tensor\x18\x01 \x01(\t\x12\x16\n\x04type\x18\x02 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03low\x18\x04 \x01(\x02:\x01\x30\x12\x0f\n\x04high\x18\x05 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x06 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x07 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x08 \x01(\x02:\x01\x33\x12\x39\n\rvariance_norm\x18\t \x01(\x0e\x32\x1a.TensorFiller.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x46\x41N_AVG\x10\x02\"\xf3\x01\n\x08GraphDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x18\n\x02op\x18\x02 \x03(\x0b\x32\x0c.OperatorDef\x12\x12\n\ngraph_type\x18\x03 \x01(\t\x12$\n\rdevice_option\x18\x05 \x01(\x0b\x32\r.DeviceOption\x12\x16\n\x03\x61rg\x18\x06 \x03(\x0b\x32\t.Argument\x12\x0e\n\x06target\x18\x07 \x03(\t\x12!\n\x08g_target\x18\x08 \x03(\x0b\x32\x0f.GradientTarget\x12\x1f\n\x08u_target\x18\t \x03(\x0b\x32\r.UpdateTarget\x12\x19\n\ndebug_mode\x18\n \x01(\x08:\x05\x66\x61lse*+\n\nDeviceType\x12\x07\n\x03\x43PU\x10\x00\x12\x08\n\x04\x43UDA\x10\x01\x12\n\n\x06OPENCL\x10\x02') serialized_pb=_b('\n\x0c\x64ragon.proto\"\xf7\x01\n\x0bTensorProto\x12\x0c\n\x04\x64ims\x18\x01 \x03(\x05\x12/\n\tdata_type\x18\x02 \x01(\x0e\x32\x15.TensorProto.DataType:\x05\x46LOAT\x12\x16\n\nfloat_data\x18\x03 \x03(\x02\x42\x02\x10\x01\x12\x16\n\nint32_data\x18\x04 \x03(\x05\x42\x02\x10\x01\x12\x11\n\tbyte_data\x18\x05 \x01(\x0c\x12\x13\n\x0bstring_data\x18\x06 \x03(\x0c\x12\x0c\n\x04name\x18\x07 
\x01(\t\"C\n\x08\x44\x61taType\x12\t\n\x05\x46LOAT\x10\x01\x12\t\n\x05INT32\x10\x02\x12\x08\n\x04\x42YTE\x10\x03\x12\n\n\x06STRING\x10\x04\x12\x0b\n\x07\x46LOAT16\x10\x0c\",\n\x0cTensorProtos\x12\x1c\n\x06protos\x18\x01 \x03(\x0b\x32\x0c.TensorProto\"\x80\x01\n\x08\x41rgument\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\t\n\x01\x66\x18\x02 \x01(\x02\x12\t\n\x01i\x18\x03 \x01(\x05\x12\x0b\n\x03i64\x18\t \x01(\x03\x12\t\n\x01s\x18\x04 \x01(\t\x12\t\n\x01\x62\x18\x08 \x01(\x08\x12\x0e\n\x06\x66loats\x18\x05 \x03(\x02\x12\x0c\n\x04ints\x18\x06 \x03(\x05\x12\x0f\n\x07strings\x18\x07 \x03(\t\"p\n\x0c\x44\x65viceOption\x12%\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\x0b.DeviceType:\x03\x43PU\x12\x11\n\x06gpu_id\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0brandom_seed\x18\x03 \x01(\r:\x01\x33\x12\x0e\n\x06\x65ngine\x18\x04 \x01(\t\"\xbd\x01\n\x0bOperatorDef\x12\r\n\x05input\x18\x01 \x03(\t\x12\x0e\n\x06output\x18\x02 \x03(\t\x12\x0c\n\x04name\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x16\n\x03\x61rg\x18\x05 \x03(\x0b\x32\t.Argument\x12$\n\rdevice_option\x18\x06 \x01(\x0b\x32\r.DeviceOption\x12\x19\n\ndebug_mode\x18\x07 \x01(\x08:\x05\x66\x61lse\x12\x1a\n\x0bshare_grads\x18\x08 \x01(\x08:\x05\x66\x61lse\"=\n\x0eGradientTarget\x12\x0c\n\x04\x63ost\x18\x01 \x01(\t\x12\x0b\n\x03wrt\x18\x02 \x01(\t\x12\x10\n\x08\x65xternal\x18\x03 \x01(\t\"R\n\x0cUpdateTarget\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06tensor\x18\x03 \x03(\t\x12\x16\n\x03\x61rg\x18\x04 \x03(\x0b\x32\t.Argument\"\x8d\x02\n\x0cTensorFiller\x12\x0e\n\x06tensor\x18\x01 \x01(\t\x12\x16\n\x04type\x18\x02 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03low\x18\x04 \x01(\x02:\x01\x30\x12\x0f\n\x04high\x18\x05 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x06 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x07 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x08 \x01(\x02:\x01\x33\x12\x39\n\rvariance_norm\x18\t \x01(\x0e\x32\x1a.TensorFiller.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x46\x41N_AVG\x10\x02\"\x8f\x02\n\x08GraphDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x18\n\x02op\x18\x02 \x03(\x0b\x32\x0c.OperatorDef\x12\x12\n\ngraph_type\x18\x03 \x01(\t\x12$\n\rdevice_option\x18\x05 \x01(\x0b\x32\r.DeviceOption\x12\x16\n\x03\x61rg\x18\x06 \x03(\x0b\x32\t.Argument\x12\x0e\n\x06target\x18\x07 \x03(\t\x12!\n\x08g_target\x18\x08 \x03(\x0b\x32\x0f.GradientTarget\x12\x1f\n\x08u_target\x18\t \x03(\x0b\x32\r.UpdateTarget\x12\x19\n\ndebug_mode\x18\n \x01(\x08:\x05\x66\x61lse\x12\x1a\n\x0bshare_grads\x18\x0b \x01(\x08:\x05\x66\x61lse*+\n\nDeviceType\x12\x07\n\x03\x43PU\x10\x00\x12\x08\n\x04\x43UDA\x10\x01\x12\n\n\x06OPENCL\x10\x02')
) )
_sym_db.RegisterFileDescriptor(DESCRIPTOR) _sym_db.RegisterFileDescriptor(DESCRIPTOR)
...@@ -44,8 +44,8 @@ _DEVICETYPE = _descriptor.EnumDescriptor( ...@@ -44,8 +44,8 @@ _DEVICETYPE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=1386, serialized_start=1442,
serialized_end=1429, serialized_end=1485,
) )
_sym_db.RegisterEnumDescriptor(_DEVICETYPE) _sym_db.RegisterEnumDescriptor(_DEVICETYPE)
...@@ -110,8 +110,8 @@ _TENSORFILLER_VARIANCENORM = _descriptor.EnumDescriptor( ...@@ -110,8 +110,8 @@ _TENSORFILLER_VARIANCENORM = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=1086, serialized_start=1114,
serialized_end=1138, serialized_end=1166,
) )
_sym_db.RegisterEnumDescriptor(_TENSORFILLER_VARIANCENORM) _sym_db.RegisterEnumDescriptor(_TENSORFILLER_VARIANCENORM)
...@@ -412,6 +412,13 @@ _OPERATORDEF = _descriptor.Descriptor( ...@@ -412,6 +412,13 @@ _OPERATORDEF = _descriptor.Descriptor(
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
_descriptor.FieldDescriptor(
name='share_grads', full_name='OperatorDef.share_grads', index=7,
number=8, type=8, cpp_type=7, label=1,
has_default_value=True, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
], ],
extensions=[ extensions=[
], ],
...@@ -424,7 +431,7 @@ _OPERATORDEF = _descriptor.Descriptor( ...@@ -424,7 +431,7 @@ _OPERATORDEF = _descriptor.Descriptor(
oneofs=[ oneofs=[
], ],
serialized_start=558, serialized_start=558,
serialized_end=719, serialized_end=747,
) )
...@@ -467,8 +474,8 @@ _GRADIENTTARGET = _descriptor.Descriptor( ...@@ -467,8 +474,8 @@ _GRADIENTTARGET = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=721, serialized_start=749,
serialized_end=782, serialized_end=810,
) )
...@@ -518,8 +525,8 @@ _UPDATETARGET = _descriptor.Descriptor( ...@@ -518,8 +525,8 @@ _UPDATETARGET = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=784, serialized_start=812,
serialized_end=866, serialized_end=894,
) )
...@@ -605,8 +612,8 @@ _TENSORFILLER = _descriptor.Descriptor( ...@@ -605,8 +612,8 @@ _TENSORFILLER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=869, serialized_start=897,
serialized_end=1138, serialized_end=1166,
) )
...@@ -680,6 +687,13 @@ _GRAPHDEF = _descriptor.Descriptor( ...@@ -680,6 +687,13 @@ _GRAPHDEF = _descriptor.Descriptor(
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
_descriptor.FieldDescriptor(
name='share_grads', full_name='GraphDef.share_grads', index=9,
number=11, type=8, cpp_type=7, label=1,
has_default_value=True, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
], ],
extensions=[ extensions=[
], ],
...@@ -691,8 +705,8 @@ _GRAPHDEF = _descriptor.Descriptor( ...@@ -691,8 +705,8 @@ _GRAPHDEF = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=1141, serialized_start=1169,
serialized_end=1384, serialized_end=1440,
) )
_TENSORPROTO.fields_by_name['data_type'].enum_type = _TENSORPROTO_DATATYPE _TENSORPROTO.fields_by_name['data_type'].enum_type = _TENSORPROTO_DATATYPE
......
...@@ -11,7 +11,7 @@ from .vision import ConvolutionLayer, DeconvolutionLayer, PoolingLayer, \ ...@@ -11,7 +11,7 @@ from .vision import ConvolutionLayer, DeconvolutionLayer, PoolingLayer, \
from .neuron import ReLULayer, DropoutLayer, TanhLayer, PowerLayer from .neuron import ReLULayer, DropoutLayer, TanhLayer, PowerLayer
from .loss import SoftmaxWithLossLayer, SigmoidCrossEntropyLossLayer, \ from .loss import SoftmaxWithLossLayer, SigmoidCrossEntropyLossLayer, \
L2LossLayer, SmoothL1LossLayer L2LossLayer, SmoothL1LossLayer, SoftmaxWithFocalLossLayer
from .mpi import MPIBroadcastLayer, MPIGatherLayer from .mpi import MPIBroadcastLayer, MPIGatherLayer
......
...@@ -93,8 +93,9 @@ class ConcatLayer(Layer): ...@@ -93,8 +93,9 @@ class ConcatLayer(Layer):
class DenseConcatLayer(Layer): class DenseConcatLayer(Layer):
def __init__(self, LayerParameter): def __init__(self, LayerParameter):
super(DenseConcatLayer, self).__init__(LayerParameter) super(DenseConcatLayer, self).__init__(LayerParameter)
param = LayerParameter.concat_param param = LayerParameter.dense_concat_param
self._param = {'axis': param.axis} self._param = {'axis': param.axis,
'growth_rate': param.growth_rate}
def Setup(self, bottom): def Setup(self, bottom):
super(DenseConcatLayer, self).Setup(bottom) super(DenseConcatLayer, self).Setup(bottom)
...@@ -268,7 +269,7 @@ class BNLayer(Layer): ...@@ -268,7 +269,7 @@ class BNLayer(Layer):
if scale_param.HasField('filler'): if scale_param.HasField('filler'):
self.Fill(scale, scale_param, 'filler') self.Fill(scale, scale_param, 'filler')
else: scale.Constant(value=1.0) else: scale.Uniform(low=0.0, high=1.0)
self.Fill(bias, scale_param, 'bias_filler') self.Fill(bias, scale_param, 'bias_filler')
self.norm_blobs = [{'data': mean, 'diff': None}, self.norm_blobs = [{'data': mean, 'diff': None},
{'data': var, 'diff': None}] {'data': var, 'diff': None}]
......
...@@ -19,14 +19,17 @@ class Layer(object): ...@@ -19,14 +19,17 @@ class Layer(object):
self._name = LayerParameter.name self._name = LayerParameter.name
self._blobs = [] self._blobs = []
self._param = {} self._param = {}
self._mpi_param = {} self._common_param = {}
for include in LayerParameter.include: for include in LayerParameter.include:
mpi_rank = [int(rank) for rank in include.mpi_rank] mpi_rank = [int(rank) for rank in include.mpi_rank]
if len(mpi_rank) > 0: self._mpi_param['mpi_rank'] = mpi_rank if len(mpi_rank) > 0: self._common_param['mpi_rank'] = mpi_rank
if LayerParameter.HasField('mirrow_stage'):
self._common_param['mirrow_stage'] = LayerParameter.mirrow_stage
def Setup(self, bottom): def Setup(self, bottom):
self._param = dict(self._param, **self._mpi_param) self._param = dict(self._param, **self._common_param)
def Fill(self, tensor, param, filler): def Fill(self, tensor, param, filler):
""" wrapper for caffe filler """ """ wrapper for caffe filler """
......
...@@ -24,7 +24,7 @@ class SoftmaxWithLossLayer(Layer): ...@@ -24,7 +24,7 @@ class SoftmaxWithLossLayer(Layer):
def Setup(self, bottom): def Setup(self, bottom):
super(SoftmaxWithLossLayer, self).Setup(bottom) super(SoftmaxWithLossLayer, self).Setup(bottom)
return ops.SoftmaxLoss(bottom, **self._param) return ops.SparseSoftmaxCrossEntropy(bottom, **self._param)
class SigmoidCrossEntropyLossLayer(Layer): class SigmoidCrossEntropyLossLayer(Layer):
...@@ -40,7 +40,7 @@ class SigmoidCrossEntropyLossLayer(Layer): ...@@ -40,7 +40,7 @@ class SigmoidCrossEntropyLossLayer(Layer):
def Setup(self, bottom): def Setup(self, bottom):
super(SigmoidCrossEntropyLossLayer, self).Setup(bottom) super(SigmoidCrossEntropyLossLayer, self).Setup(bottom)
return ops.SigmoidCrossEntropyLoss(bottom, **self._param) return ops.SigmoidCrossEntropy(bottom, **self._param)
class L2LossLayer(Layer): class L2LossLayer(Layer):
...@@ -64,3 +64,27 @@ class SmoothL1LossLayer(Layer): ...@@ -64,3 +64,27 @@ class SmoothL1LossLayer(Layer):
def Setup(self, bottom): def Setup(self, bottom):
super(SmoothL1LossLayer, self).Setup(bottom) super(SmoothL1LossLayer, self).Setup(bottom)
return ops.SmoothL1Loss(bottom, **self._param) return ops.SmoothL1Loss(bottom, **self._param)
class SoftmaxWithFocalLossLayer(Layer):
def __init__(self, LayerParameter):
super(SoftmaxWithFocalLossLayer, self).__init__(LayerParameter)
param = LayerParameter.loss_param
softmax_param = LayerParameter.softmax_param
focal_loss_param = LayerParameter.focal_loss_param
norm_mode = {0: 'FULL', 1: 'VALID', 2: 'BATCH_SIZE', 3: 'NONE'}
normalization = 'VALID'
if param.HasField('normalize'):
if not param.normalize: normalization='BATCH_SIZE'
else: normalization = norm_mode[param.normalization]
self._param = {'axis': softmax_param.axis,
'normalization': normalization,
'ignore_labels': [param.ignore_label] if param.HasField('ignore_label') else [],
'alpha': float(focal_loss_param.alpha),
'gamma': float(focal_loss_param.gamma),
'eps': float(focal_loss_param.eps),
'use_pseudo_metric': focal_loss_param.use_pseudo_metric}
def Setup(self, bottom):
super(SoftmaxWithFocalLossLayer, self).Setup(bottom)
return ops.SparseSoftmaxFocalLoss(bottom, **self._param)
...@@ -20,7 +20,6 @@ def make_if_not_exist(path): ...@@ -20,7 +20,6 @@ def make_if_not_exist(path):
os.makedirs(path) os.makedirs(path)
def UnpackVariable(var, num): def UnpackVariable(var, num):
assert len > 0
if type(var) is list and len(var) == num: if type(var) is list and len(var) == num:
return var return var
else: else:
...@@ -277,7 +276,7 @@ def VGGNetBody(net, from_layer, need_fc=True, fully_conv=False, reduced=False, ...@@ -277,7 +276,7 @@ def VGGNetBody(net, from_layer, need_fc=True, fully_conv=False, reduced=False,
dilation = 1 dilation = 1
kernel_size = 3 kernel_size = 3
pad = int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2 pad = int(int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2)
net.conv5_1 = L.Convolution(net[name], num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs) net.conv5_1 = L.Convolution(net[name], num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs)
net.relu5_1 = L.ReLU(net.conv5_1, in_place=True) net.relu5_1 = L.ReLU(net.conv5_1, in_place=True)
net.conv5_2 = L.Convolution(net.relu5_1, num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs) net.conv5_2 = L.Convolution(net.relu5_1, num_output=512, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs)
...@@ -319,7 +318,7 @@ def VGGNetBody(net, from_layer, need_fc=True, fully_conv=False, reduced=False, ...@@ -319,7 +318,7 @@ def VGGNetBody(net, from_layer, need_fc=True, fully_conv=False, reduced=False,
else: else:
kernel_size = 7 kernel_size = 7
num_output = 4096 num_output = 4096
pad = int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2 pad = int(int((kernel_size + (dilation - 1) * (kernel_size - 1)) - 1) / 2)
net.fc6 = L.Convolution(net[name], num_output=num_output, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs) net.fc6 = L.Convolution(net[name], num_output=num_output, pad=pad, kernel_size=kernel_size, dilation=dilation, **kwargs)
net.relu6 = L.ReLU(net.fc6, in_place=True) net.relu6 = L.ReLU(net.fc6, in_place=True)
......
...@@ -318,6 +318,9 @@ message LayerParameter { ...@@ -318,6 +318,9 @@ message LayerParameter {
repeated string bottom = 3; // the name of each bottom blob repeated string bottom = 3; // the name of each bottom blob
repeated string top = 4; // the name of each top blob repeated string top = 4; // the name of each top blob
// The mirrow stage optimization
optional bool mirrow_stage = 162 [default = false];
// The train / test phase for computation. // The train / test phase for computation.
optional Phase phase = 10; optional Phase phase = 10;
...@@ -418,6 +421,8 @@ message LayerParameter { ...@@ -418,6 +421,8 @@ message LayerParameter {
optional ExpandDimsParameter expand_dims_param = 159; optional ExpandDimsParameter expand_dims_param = 159;
optional ProposalParameter proposal_param = 160; optional ProposalParameter proposal_param = 160;
optional BatchRenormParameter batch_renorm_param = 161; optional BatchRenormParameter batch_renorm_param = 161;
optional DenseConcatParameter dense_concat_param = 163;
optional FocalLossParameter focal_loss_param = 164;
} }
// Message that stores parameters used to apply transformation // Message that stores parameters used to apply transformation
...@@ -1494,3 +1499,16 @@ message BatchRenormParameter { ...@@ -1494,3 +1499,16 @@ message BatchRenormParameter {
optional float d_max = 5 [default = 5.0]; optional float d_max = 5 [default = 5.0];
optional float t_delta = 6 [default = 1.0]; optional float t_delta = 6 [default = 1.0];
} }
message DenseConcatParameter {
optional int32 axis = 1 [default = 1];
optional int32 growth_rate = 2 [default = 0];
}
message FocalLossParameter {
optional float alpha = 1 [default = 1.0];
optional float gamma = 2 [default = 0.25];
optional float eps = 3 [default = 1e-10];
optional bool use_pseudo_metric = 4 [default = true];
}
This diff could not be displayed because it is too large.
...@@ -119,7 +119,7 @@ def bias_add(value, bias, data_format='NCHW', name=None): ...@@ -119,7 +119,7 @@ def bias_add(value, bias, data_format='NCHW', name=None):
def sigmoid_cross_entropy_with_logits(logits, targets, name=None): def sigmoid_cross_entropy_with_logits(logits, targets, name=None):
return ops.SigmoidCrossEntropyLoss([logits, targets], normalization='UNIT', name=None) return ops.SigmoidCrossEntropy([logits, targets], normalization='UNIT', name=name)
def softmax_cross_entropy_with_logits(_sentinel=None, def softmax_cross_entropy_with_logits(_sentinel=None,
...@@ -131,13 +131,13 @@ def softmax_cross_entropy_with_logits(_sentinel=None, ...@@ -131,13 +131,13 @@ def softmax_cross_entropy_with_logits(_sentinel=None,
'with named arguments (labels=..., logits=..., ...)') 'with named arguments (labels=..., logits=..., ...)')
if dim == -1: dim = 1 if dim == -1: dim = 1
return ops.SoftmaxCrossEntropyLoss([logits, labels], axis=dim, normalization='UNIT', name=name) return ops.SoftmaxCrossEntropy([logits, labels], axis=dim, normalization='UNIT', name=name)
def sparse_softmax_cross_entropy_with_logits(logits, labels, dim=-1, name=None): def sparse_softmax_cross_entropy_with_logits(logits, labels, dim=-1, name=None):
if dim == -1: dim = 1 if dim == -1: dim = 1
return ops.SoftmaxLoss([logits, labels], axis=dim, normalization='UNIT', name=name) return ops.SparseSoftmaxCrossEntropy([logits, labels], axis=dim, normalization='UNIT', name=name)
def l2_loss(t, name=None): def l2_loss(t, name=None):
......
...@@ -77,10 +77,11 @@ def GraphDef_Update(graph_def, updater): ...@@ -77,10 +77,11 @@ def GraphDef_Update(graph_def, updater):
u_target.arg.add().CopyFrom(MakeArgument(k, v)) u_target.arg.add().CopyFrom(MakeArgument(k, v))
graph_def.u_target.extend([u_target]) graph_def.u_target.extend([u_target])
def GraphDef_Debug(graph_def): def GraphDef_Opt(graph_def):
""" generate debug mode for CC Graph """ """ generate opt options for CC Graph """
from dragon.config import option from dragon.config import option
graph_def.debug_mode = option['debug_mode'] graph_def.debug_mode = option['debug_mode']
graph_def.share_grads = option['share_grads']
def GraphDef_Device(graph_def): def GraphDef_Device(graph_def):
""" generate deivce info for CC Graph """ """ generate deivce info for CC Graph """
...@@ -155,13 +156,13 @@ def function(inputs=[], outputs=[], swaps=None, updater=None): ...@@ -155,13 +156,13 @@ def function(inputs=[], outputs=[], swaps=None, updater=None):
if len(outputs) > 0: if len(outputs) > 0:
GraphDef_Device(graph_def) GraphDef_Device(graph_def)
GraphDef_Debug(graph_def) GraphDef_Opt(graph_def)
GraphDef_Grad(graph_def, outputs) GraphDef_Grad(graph_def, outputs)
GraphDef_Phase(graph_def, outputs) GraphDef_Phase(graph_def, outputs)
elif updater is not None: elif updater is not None:
GraphDef_Device(graph_def) GraphDef_Device(graph_def)
GraphDef_Debug(graph_def) GraphDef_Opt(graph_def)
GraphDef_Update(graph_def, updater) GraphDef_Update(graph_def, updater)
# call c api to create graph # call c api to create graph
......
...@@ -257,6 +257,7 @@ GraphDef Graph::MakeUpdate(const GraphDef& graph_def) { ...@@ -257,6 +257,7 @@ GraphDef Graph::MakeUpdate(const GraphDef& graph_def) {
bool Graph::Create(const GraphDef& graph_def, Workspace* ws) { bool Graph::Create(const GraphDef& graph_def, Workspace* ws) {
bool has_device_option = graph_def.has_device_option(); bool has_device_option = graph_def.has_device_option();
bool has_debug_mode = graph_def.has_debug_mode(); bool has_debug_mode = graph_def.has_debug_mode();
bool has_share_grads = graph_def.has_share_grads();
for (const OperatorDef& plain_op_def: graph_def.op()) { for (const OperatorDef& plain_op_def: graph_def.op()) {
OperatorDef op_def(plain_op_def); OperatorDef op_def(plain_op_def);
LOG(DEBUG) << "Create Operator " << plain_op_def.name() LOG(DEBUG) << "Create Operator " << plain_op_def.name()
...@@ -270,12 +271,83 @@ bool Graph::Create(const GraphDef& graph_def, Workspace* ws) { ...@@ -270,12 +271,83 @@ bool Graph::Create(const GraphDef& graph_def, Workspace* ws) {
if (!op_def.has_debug_mode() && has_debug_mode) if (!op_def.has_debug_mode() && has_debug_mode)
op_def.set_debug_mode(graph_def.debug_mode()); op_def.set_debug_mode(graph_def.debug_mode());
// inherit share_grads if necessary
if (!op_def.has_share_grads() && has_share_grads)
op_def.set_share_grads(graph_def.share_grads());
OperatorBase* op = CreateOperator(op_def, ws); OperatorBase* op = CreateOperator(op_def, ws);
ops_.push_back(op); ops_.push_back(op);
} }
return true; return true;
} }
void Graph::RecomputingAware(const GraphDef& graph_def, Workspace* ws) {
GraphDef fake_graph(graph_def);
Map<string, vector<OperatorBase*> > fake_recompute_map;
Map<string, string> rename_map;
Map<string, Set<string> > hash_map;
Map<string, int> multi_use_count;
// check mirrow stage
for (int i = 0; i < ops_.size(); i++) {
if (ops_[i]->type().find("Gradient") != string::npos) continue;
bool mirrow_stage = ops_[i]->GetSingleArg<bool>("mirrow_stage", false);
for (auto& u : graph_def.op(i).input()) {
bool inplace_flag = false;
for (auto& v : graph_def.op(i).output()) if (u == v) inplace_flag = true;
mirrow_stage &= (!inplace_flag);
if (!inplace_flag) multi_use_count[u]++;
}
if (mirrow_stage) {
// TODO(PhyscalX): we currently assume input(0)->output(0) as a forced in-place pair
OperatorDef* op = fake_graph.mutable_op(i);
if (rename_map.count(op->input(0)))
*op->mutable_input(0) = rename_map[op->input(0)];
rename_map[op->output(0)] = op->input(0);
*op->mutable_output(0) = op->input(0);
ops_[i]->input(0).Corrupt(); // mark the tensor as corrupted
}
}
// sub-graph aware
for (int i = 0; i < ops_.size(); i++) {
if (ops_[i]->type().find("Gradient") != string::npos) continue;
OperatorDef fake_op = fake_graph.op(i);
OperatorDef op = graph_def.op(i);
for (int j = 0; j < op.output_size(); j++) {
string v = op.output(j);
string fake_v = fake_op.output(j);
if (!fake_recompute_map.count(fake_v))
fake_recompute_map[fake_v] = vector<OperatorBase*>();
if (v != fake_v) {
if (multi_use_count[fake_v] >= 2)
fake_recompute_map[fake_v] = ws->GetRecompute(fake_v);
}
fake_recompute_map[fake_v].push_back(ops_[i]);
for (int k = 0; k < fake_recompute_map[fake_v].size(); k++) {
if (!hash_map.count(v)) hash_map[v] = Set<string>();
string op_name = fake_recompute_map[fake_v][k]->name();
if (!hash_map[v].count(op_name)) {
ws->AddRecompute(v, fake_recompute_map[fake_v][k]);
hash_map[v].insert(op_name);
}
}
}
}
// prepare resources
Tensor* head = ws->CreateTensor("_t_mirrow_stage_head");
head->Reshape(vector<TIndex>(1, WORKSPACE_MAX_CORRUPTED_SIZE));
Tensor* recompute_flag = ws->CreateTensor("_t_global_recompute_flag");
recompute_flag->Reshape(vector<TIndex>(1, 1));
recompute_flag->mutable_data<bool, CPUContext>()[0] = false;
for (int i = 0; i < WORKSPACE_MAX_CORRUPTED_SIZE; i++) {
string name = "_t_mirrow_stage_buffer_" + dragon_cast<string, int>(i);
Tensor* buffer = ws->CreateTensor(name); // pre-register the buffer tensor in the workspace
head->mutable_data<string, CPUContext>()[i] = "";
}
}
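In short: each op flagged with mirrow_stage is rewritten to run in place on its input's buffer, chains of such rewrites are collapsed through rename_map, and every tensor records the operator list able to rebuild its value. A much-simplified Python model of that bookkeeping (plain dicts, not the C++ graph):

```python
# Simplified model of the recomputing-aware pass (illustrative only).
ops = [
    {'name': 'conv1', 'inputs': ['data'], 'outputs': ['c1'], 'mirrow_stage': False},
    {'name': 'relu1', 'inputs': ['c1'],   'outputs': ['r1'], 'mirrow_stage': True},
    {'name': 'conv2', 'inputs': ['r1'],   'outputs': ['c2'], 'mirrow_stage': False},
]

rename_map = {}   # tensor -> buffer it was collapsed onto
chain = {}        # buffer -> producer ops accumulated so far
recompute = {}    # tensor -> ops to replay to rebuild its value
for op in ops:
    out = op['outputs'][0]
    buf = out
    if op['mirrow_stage']:
        src = op['inputs'][0]
        buf = rename_map.get(src, src)   # chase chained renames
        rename_map[out] = buf            # `out` now lives in `buf`
    chain.setdefault(buf, []).append(op['name'])
    recompute[out] = list(chain[buf])    # replay the whole chain

print(rename_map)   # {'r1': 'c1'}
print(recompute)    # {'c1': ['conv1'], 'r1': ['conv1', 'relu1'], 'c2': ['conv2']}
```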
Graph::Graph(const GraphDef& graph_def, Workspace* ws) Graph::Graph(const GraphDef& graph_def, Workspace* ws)
: GraphBase(graph_def, ws) { : GraphBase(graph_def, ws) {
GraphDef optimized_graph; GraphDef optimized_graph;
...@@ -297,6 +369,9 @@ Graph::Graph(const GraphDef& graph_def, Workspace* ws) ...@@ -297,6 +369,9 @@ Graph::Graph(const GraphDef& graph_def, Workspace* ws)
// create // create
Create(optimized_graph, ws); Create(optimized_graph, ws);
// recomputing-aware
RecomputingAware(optimized_graph, ws);
} }
bool Graph::Run(const string& include, const string& exclude) { bool Graph::Run(const string& include, const string& exclude) {
......
...@@ -59,6 +59,91 @@ Gradient MakeGradientForOp(const OperatorDef& def, const vector<string>& g_outpu ...@@ -59,6 +59,91 @@ Gradient MakeGradientForOp(const OperatorDef& def, const vector<string>& g_outpu
return grad; return grad;
} }
template <class Context>
void Operator<Context>::ElimateCorruption() {
Set<string> all_heads;
queue<int> safe_heads;
Tensor* head = ws()->GetTensor("_t_mirrow_stage_head");
string* head_data = head->mutable_data<string, CPUContext>();
for (int i = 0; i < head->count(); i++) all_heads.insert(head_data[i]);
// sub-graph run
for (int i = 0; i < InputSize(); i++) {
if (input(i).is_corrupted()) {
if (all_heads.count(input(i).name())) continue;
LOG(DEBUG) << "Tensor(" << input(i).name() << ") is corrupted, recompute... ";
Tensor* recompute_flag = ws()->GetTensor("_t_global_recompute_flag");
vector<OperatorBase*> list = ws()->GetRecompute(input(i).name());
recompute_flag->mutable_data<bool, CPUContext>()[0] = true;
for (int j = 0; j < list.size(); j++) list[j]->Run();
recompute_flag->mutable_data<bool, CPUContext>()[0] = false;
}
}
// check available head
all_heads.clear();
for (int i = 0; i < head->count(); i++) {
bool safe = true;
for (int j = 0; j < InputSize(); j++)
if (head_data[i] == input(j).name()) safe = false;
if (safe) safe_heads.push(i);
all_heads.insert(head_data[i]);
}
// pre-process
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->is_corrupted()) {
bool inplace_flag = false;
for (int j = 0; j < InputSize(); j++)
if (output(i)->name() == input(j).name()) inplace_flag = true;
if (inplace_flag || all_heads.count(output(i)->name())) continue; // no new buffer needed
CHECK(!safe_heads.empty())
<< "\nat most (" << safe_heads.size() << " [safe] / "
<< all_heads.size() << " [total]) buffers can be used for corrupted outputs in "
<< "(" << name() << ", " << type() << "), "
<< "\nincrease WORKSPACE_MAX_CORRUPTED_SIZE for a more powerful mirrow stage?";
int idx = safe_heads.front();
safe_heads.pop();
Tensor* buffer = ws()->GetTensor("_t_mirrow_stage_buffer_" + dragon_cast<string, int>(idx));
output(i)->Move(buffer->memory());
head_data[idx] = output(i)->name();
}
}
}
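At run time this splits into two halves: replay the recorded producer chain for any corrupted input (guarded by the global recompute flag), then move each corrupted output onto a free mirrow-stage head buffer. A toy version of the replay half, under the same assumptions as the sketch above (`recompute` built as shown there, `runners` mapping op names to callables):

```python
# Toy recompute-on-demand driver (illustrative; not Dragon's API).
def eliminate_corruption(op, corrupted, recompute, runners, flag):
    for name in op['inputs']:
        if name in corrupted:
            flag[0] = True                    # '_t_global_recompute_flag'
            for producer in recompute[name]:
                runners[producer]()           # replay the sub-graph
            flag[0] = False
            corrupted.discard(name)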
template <class Context>
void Operator<Context>::ShareGradient() {
// TODO(PhyscalX): we preset input(-1)->output(0) to share
if (output(0)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(0)->Replace(*dX);
}
}
template <class Context>
void Operator<Context>::MakeResource() {
ElimateCorruption();
if (allow_share_grads_) ShareGradient();
}
template <class Context>
void Operator<Context>::CleanResource() {
// post-process for mirrow stage
Map<string, int> head_to_idx;
Tensor* head = ws()->GetTensor("_t_mirrow_stage_head");
string* head_data = head->mutable_data<string, CPUContext>();
for (int i = 0; i < head->count(); i++) head_to_idx[head_data[i]] = i;
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->is_corrupted() && head_to_idx.count(output(i)->name())) {
string used = "_t_mirrow_stage_buffer_" + dragon_cast<string, int>(head_to_idx[output(i)->name()]);
Tensor* buffer = ws()->GetTensor(used);
if (output(i)->memory() != buffer->memory()) buffer->Move(output(i)->memory());
}
}
if (allow_share_grads_) {
// TODO(PhyscalX): we preset input(-1)->output(0) to share
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY, "Grad");
}
}
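Taken together, MakeResource and CleanResource give each *Gradient op a simple buffer lifecycle: adopt a pooled "Grad" buffer for output(0), run, then retire the buffer that held the consumed dY = input(-1). A self-contained sketch of why a whole backward chain then needs only two gradient buffers:

```python
# Sketch of the shared-gradient lifecycle across a backward chain.
class Pool(object):
    def __init__(self):
        self.free, self.made = [], 0

    def get(self):
        if self.free:
            return self.free.pop()
        self.made += 1
        return {}    # a fresh buffer (dict as a stand-in for memory)

    def release(self, buf):
        self.free.append(buf)

pool = Pool()
dy = pool.get()                  # gradient flowing in from the loss
for i in range(5):               # five chained *Gradient ops
    dx = pool.get()              # MakeResource: output(0) adopts a pooled buffer
    dx['value'] = i              # RunOnDevice stand-in
    pool.release(dy)             # CleanResource: the consumed dY returns to the pool
    dy = dx

print(pool.made)                 # 2 -- the chain ping-pongs between two buffers
```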
DEFINE_REGISTRY(CPUOperatorRegistry, OperatorBase,const OperatorDef&, Workspace*); DEFINE_REGISTRY(CPUOperatorRegistry, OperatorBase,const OperatorDef&, Workspace*);
DEFINE_REGISTRY(CUDAOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); DEFINE_REGISTRY(CUDAOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*);
DEFINE_REGISTRY(CUDNNOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); DEFINE_REGISTRY(CUDNNOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*);
...@@ -94,4 +179,13 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints) ...@@ -94,4 +179,13 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints)
INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings) INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings)
#undef INSTANTIATE_GET_REPEATED_ARGUMENT #undef INSTANTIATE_GET_REPEATED_ARGUMENT
template void Operator<CPUContext>::ElimateCorruption();
template void Operator<CUDAContext>::ElimateCorruption();
template void Operator<CPUContext>::ShareGradient();
template void Operator<CUDAContext>::ShareGradient();
template void Operator<CPUContext>::MakeResource();
template void Operator<CUDAContext>::MakeResource();
template void Operator<CPUContext>::CleanResource();
template void Operator<CUDAContext>::CleanResource();
} // namespace dragon
\ No newline at end of file
...@@ -13,4 +13,14 @@ GraphBase* Workspace::CreateGraph(const GraphDef& graph_def) {
return graph_map_[graph_def.name()].get();
}
Workspace::~Workspace() {
for (int i = 0; i < WORKSPACE_MAX_CORRUPTED_SIZE; i++) {
string name = "_t_mirrow_stage_buffer_" + dragon_cast<string, int>(i);
if (HasTensor(name)) {
MixedMemory* mem = GetTensor(name)->memory();
if (mem != nullptr) delete mem;
}
}
}
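The slot names freed here follow the same scheme the staging code builds them with; dragon_cast<string, int> plays the role of std::to_string. A trivial stand-in for the naming loop (the bound is a placeholder for WORKSPACE_MAX_CORRUPTED_SIZE):

#include <iostream>
#include <string>

int main() {
    const int kMaxCorruptedSize = 2;  // placeholder for WORKSPACE_MAX_CORRUPTED_SIZE
    for (int i = 0; i < kMaxCorruptedSize; i++)
        std::cout << "_t_mirrow_stage_buffer_" + std::to_string(i) << "\n";
    return 0;
}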
} // namespace dragon
\ No newline at end of file
...@@ -70,8 +70,9 @@ void DropoutGradientOp<Context>::RunOnDevice() {
}
template <class Context>
void DropoutGradientOp<Context>::CleanResource() {
Operator<Context>::CleanResource();
ws()->ReleaseBuffer(mask, "Common", true);
}
DEPLOY_CPU(DropoutGradient);
...@@ -81,7 +82,7 @@ DEPLOY_CUDA(DropoutGradient);
OPERATOR_SCHEMA(DropoutGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
class GetDropoutGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetDropoutGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...@@ -92,4 +93,3 @@ public:
REGISTER_GRADIENT(Dropout, GetDropoutGradient);
} // namespace dragon
\ No newline at end of file
...@@ -48,7 +48,7 @@ DEPLOY_CUDA(ReluGradient);
OPERATOR_SCHEMA(ReluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }});
class GetReluGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetReluGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -48,7 +48,7 @@ DEPLOY_CUDA(SigmoidGradient);
OPERATOR_SCHEMA(SigmoidGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
class GetSigmoidGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetSigmoidGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -71,7 +71,7 @@ DEPLOY_CUDA(SoftmaxGradient);
OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
class GetSoftmaxGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetSoftmaxGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -48,7 +48,7 @@ DEPLOY_CUDA(TanhGradient);
OPERATOR_SCHEMA(TanhGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
class GetTanhGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetTanhGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...@@ -59,4 +59,3 @@ public:
REGISTER_GRADIENT(Tanh, GetTanhGradient);
} // namespace dragon
\ No newline at end of file
...@@ -160,22 +160,16 @@ void AddGradientOp<Context>::RunOnDevice() {
}
template <class Context>
void AddGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
template <class Context>
void AddGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(AddGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(AddGradient);
...@@ -183,7 +177,7 @@ DEPLOY_CUDA(AddGradient);
OPERATOR_SCHEMA(AddGradient).NumInputs(2).NumOutputs(2);
class GetAddGradient : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetAddGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -95,18 +95,6 @@ void BiasAddGradientOp<Context>::RunOnDevice() {
}
}
template <class Context>
void BiasAddGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void BiasAddGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(BiasAddGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(BiasAddGradient);
...@@ -114,7 +102,7 @@ DEPLOY_CUDA(BiasAddGradient);
OPERATOR_SCHEMA(BiasAddGradient).NumInputs(3).NumOutputs(2);
class GetBiasAddGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetBiasAddGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -45,18 +45,6 @@ void ClipGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void ClipGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void ClipGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(ClipGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ClipGradient);
...@@ -64,7 +52,7 @@ DEPLOY_CUDA(ClipGradient);
OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1);
class GetClipGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetClipGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -191,22 +191,16 @@ void DivGradientOp<Context>::RunOnDevice() {
}
template <class Context>
void DivGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
template <class Context>
void DivGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(DivGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(DivGradient);
...@@ -214,7 +208,7 @@ DEPLOY_CUDA(DivGradient);
OPERATOR_SCHEMA(DivGradient).NumInputs(3).NumOutputs(2);
class GetDivGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetDivGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
#include "operators/arithmetic/dot_op.h" #include "operators/arithmetic/dot_op.h"
#include "core/workspace.h"
#include "utils/math_functions.h" #include "utils/math_functions.h"
namespace dragon { namespace dragon {
...@@ -169,6 +170,17 @@ void DotGradientOp<Context>::RunOnDevice() {
}
}
template <class Context>
void DotGradientOp<Context>::ShareGradient() {
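// NOTE: only the first non-ignored output can alias the shared "Grad" buffer, hence the break below.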
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
DEPLOY_CPU(DotGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(DotGradient);
...@@ -176,7 +188,7 @@ DEPLOY_CUDA(DotGradient);
OPERATOR_SCHEMA(DotGradient).NumInputs(3).NumOutputs(2);
class GetDotGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetDotGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -118,22 +118,16 @@ void EltwiseGradientOp<Context>::RunOnDevice() {
}
template <class Context>
void EltwiseGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
template <class Context>
void EltwiseGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(EltwiseGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(EltwiseGradient);
...@@ -141,7 +135,7 @@ DEPLOY_CUDA(EltwiseGradient);
OPERATOR_SCHEMA(EltwiseGradient).NumInputs(3, INT_MAX).NumOutputs(2, INT_MAX);
class GetEltwiseGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetEltwiseGradient);
vector<OperatorDef> MakeDefs() override {
vector<string> inputs, outputs;
...
...@@ -41,19 +41,6 @@ void ExpGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void ExpGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void ExpGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(ExpGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ExpGradient);
...@@ -61,7 +48,7 @@ DEPLOY_CUDA(ExpGradient);
OPERATOR_SCHEMA(ExpGradient).NumInputs(2).NumOutputs(1);
class GetExpGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetExpGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -61,18 +61,6 @@ void GramMatrixGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void GramMatrixGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void GramMatrixGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(GramMatrixGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(GramMatrixGradient);
...@@ -80,7 +68,7 @@ DEPLOY_CUDA(GramMatrixGradient);
OPERATOR_SCHEMA(GramMatrixGradient).NumInputs(2).NumOutputs(1);
class GetGramMatrixGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetGramMatrixGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -120,20 +120,6 @@ void InnerProductGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void InnerProductGradientOp<Context>::ShareBeforeRun() {
if (output(0)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
}
template <class Context>
void InnerProductGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(InnerProductGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(InnerProductGradient);
...@@ -141,7 +127,7 @@ DEPLOY_CUDA(InnerProductGradient);
OPERATOR_SCHEMA(InnerProductGradient).NumInputs(3).NumOutputs(3);
class GetInnerProductGradient : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetInnerProductGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -41,18 +41,6 @@ void LogGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void LogGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void LogGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(LogGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(LogGradient);
...@@ -60,7 +48,7 @@ DEPLOY_CUDA(LogGradient);
OPERATOR_SCHEMA(LogGradient).NumInputs(2).NumOutputs(1);
class GetLogGradient : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetLogGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -110,22 +110,16 @@ void MatmulGradientOp<Context>::RunOnDevice() {
}
template <class Context>
void MatmulGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
template <class Context>
void MatmulGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(MatmulGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(MatmulGradient);
...@@ -133,7 +127,7 @@ DEPLOY_CUDA(MatmulGradient);
OPERATOR_SCHEMA(MatmulGradient).NumInputs(3).NumOutputs(2);
class GetMatmulGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetMatmulGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -179,22 +179,16 @@ void MulGradientOp<Context>::RunOnDevice() {
}
template <class Context>
void MulGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
template <class Context>
void MulGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(MulGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(MulGradient);
...@@ -202,7 +196,7 @@ DEPLOY_CUDA(MulGradient);
OPERATOR_SCHEMA(MulGradient).NumInputs(3).NumOutputs(2);
class GetMulGradient : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetMulGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -80,18 +80,6 @@ void PowGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void PowGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void PowGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(PowGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(PowGradient);
...@@ -99,7 +87,7 @@ DEPLOY_CUDA(PowGradient);
OPERATOR_SCHEMA(PowGradient).NumInputs(3).NumOutputs(1);
class GetPowGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetPowGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -137,18 +137,6 @@ void ScaleGradientOp<Context>::RunOnDevice() {
}
}
template <class Context>
void ScaleGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void ScaleGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(ScaleGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ScaleGradient);
...@@ -156,7 +144,7 @@ DEPLOY_CUDA(ScaleGradient);
OPERATOR_SCHEMA(ScaleGradient).NumInputs(3).NumOutputs(3);
class GetScaleGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetScaleGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -42,18 +42,6 @@ void SquareGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void SquareGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void SquareGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(SquareGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SquareGradient);
...@@ -61,7 +49,7 @@ DEPLOY_CUDA(SquareGradient);
OPERATOR_SCHEMA(SquareGradient).NumInputs(2).NumOutputs(1);
class GetSquareGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetSquareGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -160,22 +160,16 @@ void SubGradientOp<Context>::RunOnDevice() {
}
template <class Context>
void SubGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
template <class Context>
void SubGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(SubGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SubGradient);
...@@ -183,7 +177,7 @@ DEPLOY_CUDA(SubGradient);
OPERATOR_SCHEMA(SubGradient).NumInputs(3).NumOutputs(2);
class GetSubGradient : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetSubGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -57,18 +57,6 @@ void AtGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void AtGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void AtGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(AtGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(AtGradient);
...@@ -76,7 +64,7 @@ DEPLOY_CUDA(AtGradient);
OPERATOR_SCHEMA(AtGradient).NumInputs(3).NumOutputs(1);
class GetAtGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetAtGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -101,22 +101,16 @@ void ConcatGradientOp<Context>::RunOnDevice() {
}
template <class Context>
void ConcatGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
template <class Context>
void ConcatGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(ConcatGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ConcatGradient);
...@@ -124,7 +118,7 @@ DEPLOY_CUDA(ConcatGradient);
OPERATOR_SCHEMA(ConcatGradient).NumInputs(2, INT_MAX).NumOutputs(1, INT_MAX);
class GetConcatGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetConcatGradient);
vector<OperatorDef> MakeDefs() override {
vector<string> inputs, outputs;
...
...@@ -139,18 +139,6 @@ void CropGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void CropGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void CropGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(CropGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(CropGradient);
...@@ -158,7 +146,7 @@ DEPLOY_CUDA(CropGradient);
OPERATOR_SCHEMA(CropGradient).NumInputs(2).NumOutputs(1);
class GetCropGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetCropGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -30,7 +30,7 @@ DEPLOY_CUDA(ExpandDimsGradient);
OPERATOR_SCHEMA(ExpandDimsGradient).NumInputs(2).NumOutputs(1);
class GetExpandDimsGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetExpandDimsGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -39,7 +39,7 @@ DEPLOY_CUDA(FlattenGradient);
OPERATOR_SCHEMA(FlattenGradient).NumInputs(2).NumOutputs(1);
class GetFlattenGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetFlattenGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -91,7 +91,7 @@ DEPLOY_CUDA(TemplateGradient);
OPERATOR_SCHEMA(TemplateGradient);
class GetTemplateGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetTemplateGradient);
vector<OperatorDef> MakeDefs() override {
vector<string> inputs, outputs;
...@@ -104,5 +104,3 @@ public:
REGISTER_GRADIENT(Template, GetTemplateGradient);
} // namespace dragon
\ No newline at end of file
...@@ -107,18 +107,6 @@ void ReduceGradientOp<Context>::RunOnDevice() {
}
}
template <class Context>
void ReduceGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void ReduceGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(ReduceGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ReduceGradient);
...@@ -126,7 +114,7 @@ DEPLOY_CUDA(ReduceGradient);
OPERATOR_SCHEMA(ReduceGradient).NumInputs(2).NumOutputs(1);
class GetReduceGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetReduceGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -67,7 +67,7 @@ DEPLOY_CUDA(ReshapeGradient);
OPERATOR_SCHEMA(ReshapeGradient).NumInputs(2).NumOutputs(1);
class GetReshapeGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetReshapeGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...@@ -78,4 +78,3 @@ public:
REGISTER_GRADIENT(Reshape, GetReshapeGradient);
} // namespace dragon
\ No newline at end of file
...@@ -214,7 +214,7 @@ DEPLOY_CUDA(ScanGradient);
OPERATOR_SCHEMA(ScanGradient).NumInputs(2, INT_MAX).NumOutputs(1, INT_MAX);
class GetScanGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetScanGradient);
vector<OperatorDef> MakeDefs() override {
vector<string> inputs, outputs;
...
...@@ -81,7 +81,7 @@ DEPLOY_CUDA(SliceGradient);
OPERATOR_SCHEMA(SliceGradient).NumInputs(2, INT_MAX).NumOutputs(1);
class GetSliceGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetSliceGradient);
vector<OperatorDef> MakeDefs() override {
vector<string> inputs(1, I(0));
...
...@@ -114,18 +114,6 @@ void TileGradientOp<Context>::RunOnDevice() {
ws()->ReleaseBuffer(dest);
}
template <class Context>
void TileGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void TileGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(TileGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(TileGradient);
...@@ -133,7 +121,7 @@ DEPLOY_CUDA(TileGradient);
OPERATOR_SCHEMA(TileGradient).NumInputs(1).NumOutputs(1);
class GetTileGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetTileGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -79,18 +79,6 @@ void TransposeGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void TransposeGradientOp<Context>::ShareBeforeRun() {
Tensor* dX = ws()->GetBuffer();
if (dX != nullptr) output(0)->Replace(*dX);
}
template <class Context>
void TransposeGradientOp<Context>::ClearAfterRun() {
Tensor* dY = &input(-1);
ws()->ReleaseBuffer(dY);
}
DEPLOY_CPU(TransposeGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(TransposeGradient);
...@@ -98,7 +86,7 @@ DEPLOY_CUDA(TransposeGradient);
OPERATOR_SCHEMA(TransposeGradient).NumInputs(2).NumOutputs(1);
class GetTransposeGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetTransposeGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -18,8 +18,8 @@ void L1LossOp<Context>::RunWithType() {
auto* Wdata = input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
}
Ydata[0] = math::ASum<T, Context>(diff->count(), diff_data);
T normalizer;
if (normalization == "BATCH_SIZE") normalizer = input(0).dim(0);
else if (normalization == "FULL") normalizer = input(0).count();
...@@ -46,9 +46,11 @@ OPERATOR_SCHEMA(L1Loss).NumInputs(2, 3).NumOutputs(1);
template <class Context> template <typename T>
void L1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = input(-1).template data<T, CPUContext>();
kernel::AbsGrad<T, Context>(diff->count(), diff_data, diff_data);
T alpha = dYdata[0], normalizer;
if (normalization == "BATCH_SIZE") normalizer = input(0).dim(0);
else if (normalization == "FULL") normalizer = input(0).count();
else if (normalization == "NONE") normalizer = 1;
...@@ -59,7 +61,7 @@ void L1LossGradientOp<Context>::RunWithType() {
auto* dXdata = output(i)->template mutable_data<T, Context>();
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(output(i)->count(), alpha, diff_data, 0, dXdata);
}
}
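The gradient op no longer bakes in a fixed coeff: it reads the incoming scalar gradient dY from input(-1) on the CPU and scales by it, so the loss composes correctly with downstream scaling. A hedged reconstruction of the math, writing d for the (optionally weighted) difference and N for the normalizer; the division by N sits in the elided lines:

\ell = \frac{1}{N} \sum_i |d_i|, \qquad
\frac{\partial L}{\partial x_{0,i}} = +\frac{dY}{N}\,\operatorname{sign}(d_i), \qquad
\frac{\partial L}{\partial x_{1,i}} = -\frac{dY}{N}\,\operatorname{sign}(d_i)

kernel::AbsGrad computes sign(d) in place and math::Axpby applies the signed scale alpha.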
...@@ -71,6 +73,17 @@ void L1LossGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void L1LossGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
DEPLOY_CPU(L1LossGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(L1LossGradient);
...@@ -78,7 +91,7 @@ DEPLOY_CUDA(L1LossGradient);
OPERATOR_SCHEMA(L1LossGradient).NumInputs(3).NumOutputs(2);
class GetL1LossGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetL1LossGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
...
...@@ -16,8 +16,8 @@ void L2LossOp<Context>::RunWithType() {
auto* Wdata = input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
}
Ydata[0] = T(0.5) * math::Dot<T, Context>(diff->count(), diff_data, diff_data);
T normalizer;
if (normalization == "BATCH_SIZE") normalizer = input(0).dim(0);
else if (normalization == "FULL") normalizer = input(0).count();
...@@ -44,8 +44,10 @@ OPERATOR_SCHEMA(L2Loss).NumInputs(2, 3).NumOutputs(1);
template <class Context> template <typename T>
void L2LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = input(-1).template data<T, CPUContext>();
T alpha = dYdata[0], normalizer;
if (normalization == "BATCH_SIZE") normalizer = input(0).dim(0);
else if (normalization == "FULL") normalizer = input(0).count();
else if (normalization == "NONE") normalizer = 1;
...@@ -56,7 +58,7 @@ void L2LossGradientOp<Context>::RunWithType() {
auto* dXdata = output(i)->template mutable_data<T, Context>();
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(output(i)->count(), alpha, diff_data, 0, dXdata);
}
}
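L2Loss follows the same scheme; with the 0.5 folded into the forward value, the backward pass is linear in d (again a hedged reconstruction, assuming alpha is divided by the normalizer in the elided lines):

\ell = \frac{1}{2N} \sum_i d_i^2, \qquad
\frac{\partial L}{\partial x_{0,i}} = +\frac{dY}{N}\, d_i, \qquad
\frac{\partial L}{\partial x_{1,i}} = -\frac{dY}{N}\, d_i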
...@@ -68,6 +70,17 @@ void L2LossGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types.";
}
template <class Context>
void L2LossGradientOp<Context>::ShareGradient() {
for (int i = 0; i < OutputSize(); i++) {
if (output(i)->name() != "ignore") {
Tensor* dX = ws()->GetBuffer("Grad");
output(i)->Replace(*dX);
break;
}
}
}
DEPLOY_CPU(L2LossGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(L2LossGradient);
...@@ -75,9 +88,9 @@ DEPLOY_CUDA(L2LossGradient);
OPERATOR_SCHEMA(L2LossGradient).NumInputs(3).NumOutputs(2);
class GetL2LossGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetL2LossGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
vector<string> {I(0), I(1), GO(0)},
vector<string> {GI(0), GI(1)});
...@@ -86,5 +99,3 @@ public:
REGISTER_GRADIENT(L2Loss, GetL2LossGradient);
} // namespace dragon
\ No newline at end of file
#include "operators/loss/sigmoid_cross_entropy_loss_op.h" #include "operators/loss/sigmoid_cross_entropy_op.h"
#include "core/workspace.h" #include "core/workspace.h"
#include "utils/math_functions.h" #include "utils/math_functions.h"
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
namespace dragon { namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void SigmoidCrossEntropyLossOp<Context>::RunWithType() { void SigmoidCrossEntropyOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>(); auto* Xdata = input(0).template data<T, Context>();
auto* Pdata = prob->template mutable_data<T, Context>(); auto* Pdata = prob->template mutable_data<T, Context>();
kernel::Sigmoid<T, Context>(prob->count(), Xdata, Pdata); kernel::Sigmoid<T, Context>(prob->count(), Xdata, Pdata);
...@@ -32,7 +32,7 @@ void SigmoidCrossEntropyLossOp<Context>::RunWithType() { ...@@ -32,7 +32,7 @@ void SigmoidCrossEntropyLossOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void SigmoidCrossEntropyLossOp<Context>::RunOnDevice() { void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
CHECK_EQ(input(0).count(), input(1).count()) CHECK_EQ(input(0).count(), input(1).count())
<< "\nnumber of predictions must match the number of labels."; << "\nnumber of predictions must match the number of labels.";
prob = ws()->CreateTensor("_t_" + anchor() + "_sigmoid_prob"); prob = ws()->CreateTensor("_t_" + anchor() + "_sigmoid_prob");
...@@ -43,14 +43,14 @@ void SigmoidCrossEntropyLossOp<Context>::RunOnDevice() { ...@@ -43,14 +43,14 @@ void SigmoidCrossEntropyLossOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types."; else LOG(FATAL) << "unsupported input types.";
} }
DEPLOY_CPU(SigmoidCrossEntropyLoss); DEPLOY_CPU(SigmoidCrossEntropy);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SigmoidCrossEntropyLoss); DEPLOY_CUDA(SigmoidCrossEntropy);
#endif #endif
OPERATOR_SCHEMA(SigmoidCrossEntropyLoss).NumInputs(2).NumOutputs(1); OPERATOR_SCHEMA(SigmoidCrossEntropy).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T> template <class Context> template <typename T>
void SigmoidCrossEntropyLossGradientOp<Context>::RunWithType() { void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
auto* Pdata = prob->template data<T, Context>(); auto* Pdata = prob->template data<T, Context>();
auto* Tdata = input(1).template data<T, Context>(); auto* Tdata = input(1).template data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>(); auto* dXdata = output(0)->template mutable_data<T, Context>();
...@@ -72,7 +72,7 @@ void SigmoidCrossEntropyLossGradientOp<Context>::RunWithType() { ...@@ -72,7 +72,7 @@ void SigmoidCrossEntropyLossGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void SigmoidCrossEntropyLossGradientOp<Context>::RunOnDevice() { void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
prob = ws()->GetTensor("_t_" + anchor() + "_sigmoid_prob"); prob = ws()->GetTensor("_t_" + anchor() + "_sigmoid_prob");
output(0)->ReshapeLike(input(0)); output(0)->ReshapeLike(input(0));
...@@ -80,21 +80,21 @@ void SigmoidCrossEntropyLossGradientOp<Context>::RunOnDevice() { ...@@ -80,21 +80,21 @@ void SigmoidCrossEntropyLossGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types."; else LOG(FATAL) << "unsupported input types.";
} }
DEPLOY_CPU(SigmoidCrossEntropyLossGradient); DEPLOY_CPU(SigmoidCrossEntropyGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SigmoidCrossEntropyLossGradient); DEPLOY_CUDA(SigmoidCrossEntropyGradient);
#endif #endif
OPERATOR_SCHEMA(SigmoidCrossEntropyLossGradient).NumInputs(3).NumOutputs(1); OPERATOR_SCHEMA(SigmoidCrossEntropyGradient).NumInputs(3).NumOutputs(1);
class GetSigmoidCrossEntropyLossGradient final : public GradientMakerBase { class GetSigmoidCrossEntropyGradient final : public GradientMakerBase {
public: public:
GRADIENT_MAKER_CTOR(GetSigmoidCrossEntropyLossGradient); GRADIENT_MAKER_CTOR(GetSigmoidCrossEntropyGradient);
vector<OperatorDef> MakeDefs() override { vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "", return SingleDef(def.type() + "Gradient", "",
vector<string> {I(0), I(1), GO(0)}, vector<string> {I(0), I(1), GO(0)},
vector<string> {GI(0)}); vector<string> {GI(0)});
} }
}; };
REGISTER_GRADIENT(SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient); REGISTER_GRADIENT(SigmoidCrossEntropy, GetSigmoidCrossEntropyGradient);
} // namespace dragon } // namespace dragon
\ No newline at end of file
...@@ -52,17 +52,18 @@ OPERATOR_SCHEMA(SmoothL1Loss).NumInputs(2, 4).NumOutputs(1);
template <class Context> template <typename T>
void SmoothL1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = input(-1).template data<T, CPUContext>();
kernel::SmoothL1Grad<T, Context>(diff->count(), sigma2, diff_data, diff_data);
for (int i = 0; i < 2; i++) {
if (output(i)->name() == "ignore") continue;
output(i)->ReshapeLike(input(i));
auto* dXdata = output(i)->template mutable_data<T, Context>();
const T sign = (i == 0) ? 1 : -1;
const T coeff = sign / input(i).dim(0) * dYdata[0];
math::Axpby<T, Context>(output(i)->count(), coeff, diff_data, 0, dXdata);
if (InputSize() > 3) {
auto* inside_w_data = input(2).template data<T, Context>();
math::Mul<T, Context>(output(i)->count(), inside_w_data, dXdata, dXdata);
...@@ -89,7 +90,7 @@ DEPLOY_CUDA(SmoothL1LossGradient);
OPERATOR_SCHEMA(SmoothL1LossGradient).NumInputs(3, 5).NumOutputs(2);
class GetSmoothL1LossGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetSmoothL1LossGradient);
vector<OperatorDef> MakeDefs() override {
vector<string> inputs;
...
#include "operators/activation/softmax_op.h" #include "operators/activation/softmax_op.h"
#include "operators/loss/softmax_cross_entropy_loss_op.h" #include "operators/loss/softmax_cross_entropy_op.h"
#include "core/workspace.h" #include "core/workspace.h"
#include "utils/math_functions.h" #include "utils/math_functions.h"
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
namespace dragon { namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void SoftmaxCrossEntropyLossOp<Context>::RunWithType() { void SoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<T, Context>(); auto* Pdata = prob->template data<T, Context>();
auto* Tdata = input(1).template data<T, Context>(); auto* Tdata = input(1).template data<T, Context>();
auto* Ldata = losses.template mutable_data<T, Context>(); auto* Ldata = losses.template mutable_data<T, Context>();
...@@ -36,7 +36,7 @@ void SoftmaxCrossEntropyLossOp<Context>::RunWithType() { ...@@ -36,7 +36,7 @@ void SoftmaxCrossEntropyLossOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void SoftmaxCrossEntropyLossOp<Context>::RunOnDevice() { void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
outer_dim = input(0).count(0, axis); outer_dim = input(0).count(0, axis);
inner_dim = input(0).count(axis + 1); inner_dim = input(0).count(axis + 1);
CHECK_EQ(input(0).count(), input(1).count()) CHECK_EQ(input(0).count(), input(1).count())
...@@ -49,14 +49,14 @@ void SoftmaxCrossEntropyLossOp<Context>::RunOnDevice() { ...@@ -49,14 +49,14 @@ void SoftmaxCrossEntropyLossOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types."; else LOG(FATAL) << "unsupported input types.";
} }
DEPLOY_CPU(SoftmaxCrossEntropyLoss); DEPLOY_CPU(SoftmaxCrossEntropy);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxCrossEntropyLoss); DEPLOY_CUDA(SoftmaxCrossEntropy);
#endif #endif
OPERATOR_SCHEMA(SoftmaxCrossEntropyLoss).NumInputs(2).NumOutputs(1); OPERATOR_SCHEMA(SoftmaxCrossEntropy).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T> template <class Context> template <typename T>
void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() { void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Tdata = input(1).template data<T, Context>(); auto* Tdata = input(1).template data<T, Context>();
auto* Pdata = prob->template mutable_data<T, Context>(); auto* Pdata = prob->template mutable_data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>(); auto* dXdata = output(0)->template mutable_data<T, Context>();
...@@ -75,7 +75,6 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() { ...@@ -75,7 +75,6 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() {
return; return;
} }
// normalize
T normalizer; T normalizer;
if (normalization == "BATCH_SIZE") normalizer = outer_dim; if (normalization == "BATCH_SIZE") normalizer = outer_dim;
else if (normalization == "FULL") normalizer = outer_dim * inner_dim; else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
...@@ -85,7 +84,7 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() { ...@@ -85,7 +84,7 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void SoftmaxCrossEntropyLossGradientOp<Context>::RunOnDevice() { void SoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
prob = ws()->GetTensor("_t_" + anchor() + "_softmax_prob"); prob = ws()->GetTensor("_t_" + anchor() + "_softmax_prob");
outer_dim = prob->count(0, axis); outer_dim = prob->count(0, axis);
inner_dim = prob->count(axis + 1); inner_dim = prob->count(axis + 1);
...@@ -95,21 +94,21 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunOnDevice() { ...@@ -95,21 +94,21 @@ void SoftmaxCrossEntropyLossGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types."; else LOG(FATAL) << "unsupported input types.";
} }
DEPLOY_CPU(SoftmaxCrossEntropyLossGradient); DEPLOY_CPU(SoftmaxCrossEntropyGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxCrossEntropyLossGradient); DEPLOY_CUDA(SoftmaxCrossEntropyGradient);
#endif #endif
OPERATOR_SCHEMA(SoftmaxCrossEntropyLossGradient).NumInputs(3).NumOutputs(1); OPERATOR_SCHEMA(SoftmaxCrossEntropyGradient).NumInputs(3).NumOutputs(1);
class GetSoftmaxCrossEntropyLossGradient final : public GradientMakerBase { class GetSoftmaxCrossEntropyGradient final : public GradientMakerBase {
public: public:
GRADIENT_MAKER_CTOR(GetSoftmaxCrossEntropyLossGradient); GRADIENT_MAKER_CTOR(GetSoftmaxCrossEntropyGradient);
vector<OperatorDef> MakeDefs() override { vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "", return SingleDef(def.type() + "Gradient", "",
vector<string> {I(0), I(1), GO(0)}, vector<string> {I(0), I(1), GO(0)},
vector<string> {GI(0)}); vector<string> {GI(0)});
} }
}; };
REGISTER_GRADIENT(SoftmaxCrossEntropyLoss, GetSoftmaxCrossEntropyLossGradient); REGISTER_GRADIENT(SoftmaxCrossEntropy, GetSoftmaxCrossEntropyGradient);
} // namespace dragon } // namespace dragon
\ No newline at end of file
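Side note on the normalization argument used throughout these loss operators: "UNIT" skips reduction and returns the per-element losses, while the remaining modes divide the summed loss by a fixed count. A minimal standalone sketch of that selection (the helper name is hypothetical; "VALID" only applies to the sparse variants below, which track an ignore-aware valid count):

    #include <string>

    // Hypothetical helper mirroring the normalizer selection in the ops above.
    template <typename T>
    T SelectNormalizer(const std::string& normalization, T valid_count,
                       long long outer_dim, long long inner_dim) {
        if (normalization == "VALID")      return valid_count;               // # non-ignored labels
        if (normalization == "BATCH_SIZE") return static_cast<T>(outer_dim); // per-sample average
        if (normalization == "FULL")       return static_cast<T>(outer_dim * inner_dim);
        return static_cast<T>(1);                                            // "NONE": raw sum
    }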
#include "operators/activation/softmax_op.h" #include "operators/activation/softmax_op.h"
#include "operators/loss/softmax_loss_op.h" #include "operators/loss/sparse_softmax_cross_entropy_op.h"
#include "core/workspace.h" #include "core/workspace.h"
#include "utils/math_functions.h" #include "utils/math_functions.h"
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
...@@ -8,13 +8,21 @@ ...@@ -8,13 +8,21 @@
namespace dragon { namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void SoftmaxLossOp<Context>::RunWithType() { void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* prob_data = prob->template data<T, Context>(); auto* prob_data = prob->template data<T, Context>();
auto* label_data = input(1).template data<T, Context>(); auto* label_data = input(1).template data<T, Context>();
auto* loss_data = losses.template mutable_data<T, Context>(); auto* loss_data = losses.template mutable_data<T, Context>();
auto* valid_data = valid.template mutable_data<T, Context>(); auto* valid_data = valid.template mutable_data<T, Context>();
kernel::SparseSoftmaxCrossEntropy<T, Context>(input(0).count(), input(0).dim(axis),
outer_dim, inner_dim, prob_data, label_data, loss_data, valid_data, &ignore); kernel::SparseSoftmaxCrossEntropy<T, Context>(input(0).count(),
input(0).dim(axis),
outer_dim,
inner_dim,
prob_data,
label_data,
loss_data,
valid_data,
&ignore);
if (normalization == "UNIT") { if (normalization == "UNIT") {
output(0)->ReshapeLike(losses); output(0)->ReshapeLike(losses);
...@@ -35,7 +43,7 @@ void SoftmaxLossOp<Context>::RunWithType() { ...@@ -35,7 +43,7 @@ void SoftmaxLossOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void SoftmaxLossOp<Context>::RunOnDevice() { void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
outer_dim = input(0).count(0, axis); outer_dim = input(0).count(0, axis);
inner_dim = input(0).count(axis + 1); inner_dim = input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, input(1).count()) CHECK_EQ(outer_dim * inner_dim, input(1).count())
...@@ -49,21 +57,29 @@ void SoftmaxLossOp<Context>::RunOnDevice() { ...@@ -49,21 +57,29 @@ void SoftmaxLossOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types."; else LOG(FATAL) << "unsupported input types.";
} }
DEPLOY_CPU(SoftmaxLoss); DEPLOY_CPU(SparseSoftmaxCrossEntropy);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxLoss); DEPLOY_CUDA(SparseSoftmaxCrossEntropy);
#endif #endif
OPERATOR_SCHEMA(SoftmaxLoss).NumInputs(2).NumOutputs(1); OPERATOR_SCHEMA(SparseSoftmaxCrossEntropy).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T> template <class Context> template <typename T>
void SoftmaxLossGradientOp<Context>::RunWithType() { void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* label_data = input(1).template data<T, Context>(); auto* label_data = input(1).template data<T, Context>();
auto* prob_data = prob->template mutable_data<T, Context>(); auto* prob_data = prob->template mutable_data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>(); auto* dXdata = output(0)->template mutable_data<T, Context>();
auto* valid_data = valid.template mutable_data<T, Context>(); auto* valid_data = valid.template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(prob->count(), dXdata, prob_data); ctx().template Copy<T, Context, Context>(prob->count(), dXdata, prob_data);
kernel::SoftmaxLossGrad<T, Context>(output(0)->count(), output(0)->dim(axis),
outer_dim, inner_dim, label_data, prob_data, valid_data, &ignore, dXdata); kernel::SparseSoftmaxCrossEntropyGrad<T, Context>(output(0)->count(),
output(0)->dim(axis),
outer_dim,
inner_dim,
prob_data,
label_data,
valid_data,
&ignore,
dXdata);
if (normalization == "UNIT") { if (normalization == "UNIT") {
auto* dYdata = input(-1).template data<T, Context>(); auto* dYdata = input(-1).template data<T, Context>();
...@@ -83,7 +99,7 @@ void SoftmaxLossGradientOp<Context>::RunWithType() { ...@@ -83,7 +99,7 @@ void SoftmaxLossGradientOp<Context>::RunWithType() {
} }
template <class Context> template <class Context>
void SoftmaxLossGradientOp<Context>::RunOnDevice() { void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
prob = ws()->GetTensor("_t_" + anchor() + "_softmax_prob"); prob = ws()->GetTensor("_t_" + anchor() + "_softmax_prob");
outer_dim = prob->count(0, axis); outer_dim = prob->count(0, axis);
inner_dim = prob->count(axis + 1); inner_dim = prob->count(axis + 1);
...@@ -94,21 +110,21 @@ void SoftmaxLossGradientOp<Context>::RunOnDevice() { ...@@ -94,21 +110,21 @@ void SoftmaxLossGradientOp<Context>::RunOnDevice() {
else LOG(FATAL) << "unsupported input types."; else LOG(FATAL) << "unsupported input types.";
} }
DEPLOY_CPU(SoftmaxLossGradient); DEPLOY_CPU(SparseSoftmaxCrossEntropyGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxLossGradient); DEPLOY_CUDA(SparseSoftmaxCrossEntropyGradient);
#endif #endif
OPERATOR_SCHEMA(SoftmaxLossGradient).NumInputs(3).NumOutputs(1); OPERATOR_SCHEMA(SparseSoftmaxCrossEntropyGradient).NumInputs(3).NumOutputs(1);
class GetSoftmaxLossGradient final : public GradientMakerBase { class GetSparseSoftmaxCrossEntropyGradient final : public GradientMakerBase {
public: public:
GRADIENT_MAKER_CTOR(GetSoftmaxLossGradient); GRADIENT_MAKER_CTOR(GetSparseSoftmaxCrossEntropyGradient);
vector<OperatorDef> MakeDefs() override{ vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "", return SingleDef(def.type() + "Gradient", "",
vector<string> {I(0), I(1), GO(0)}, vector<string> {I(0), I(1), GO(0)},
vector<string> {GI(0)}); vector<string> {GI(0)});
} }
}; };
REGISTER_GRADIENT(SoftmaxLoss, GetSoftmaxLossGradient); REGISTER_GRADIENT(SparseSoftmaxCrossEntropy, GetSparseSoftmaxCrossEntropyGradient);
} // namespace dragon } // namespace dragon
\ No newline at end of file
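Worth noting for both operators above: the forward pass caches its softmax output in the workspace under an anchor-derived key, and the gradient op fetches that tensor back instead of recomputing it. A fragment sketching the convention (workspace calls as used above; not a standalone program):

    // Forward op: publish the intermediate under a key derived from anchor().
    Tensor* prob = ws()->CreateTensor("_t_" + anchor() + "_softmax_prob");

    // Gradient op: resolve the same key; the paired forward/backward defs must
    // agree on the anchor argument for this lookup to find the cached tensor.
    Tensor* cached = ws()->GetTensor("_t_" + anchor() + "_softmax_prob");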
#include "operators/activation/softmax_op.h"
#include "operators/loss/sparse_softmax_focal_loss_op.h"
#include "core/workspace.h"
#include "utils/math_functions.h"
#include "utils/op_kernel.h"
#include "utils/proto_utils.h"
namespace dragon {
template <class Context> template <typename T>
void SparseSoftmaxFocalLossOp<Context>::RunWithType() {
auto* prob_data = this->prob->template data<T, Context>();
auto* label_data = input(1).template data<T, Context>();
auto* loss_data = this->losses.template mutable_data<T, Context>();
auto* valid_data = this->valid.template mutable_data<T, Context>();
auto* scale_data = scale->template mutable_data<T, Context>();
kernel::SparseSoftmaxFocalLoss<T, Context>(input(0).count(),
input(0).dim(axis),
outer_dim,
inner_dim,
alpha,
gamma,
prob_data,
label_data,
scale_data,
loss_data,
valid_data,
&this->ignore);
if (normalization == "UNIT") {
if (use_pseudo_metric) {
math::MulScalar<T, Context>(this->losses.count(),
1.0 / alpha,
loss_data);
}
output(0)->ReshapeLike(this->losses);
output(0)->Share(this->losses);
return;
}
T normalizer;
if (normalization == "VALID")
normalizer = math::ASum<T, Context>(this->valid.count(), valid_data);
else if (normalization == "BATCH_SIZE") normalizer = outer_dim;
else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
else if (normalization == "NONE") normalizer = 1;
T loss = math::ASum<T, Context>(this->losses.count(), loss_data);
loss = use_pseudo_metric ? loss / alpha : loss;
output(0)->Reshape(vector<TIndex>(1, 1));
auto* Ydata = output(0)->template mutable_data<T, CPUContext>();
Ydata[0] = loss / normalizer;
}
template <class Context>
void SparseSoftmaxFocalLossOp<Context>::RunOnDevice() {
outer_dim = input(0).count(0, axis);
inner_dim = input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, input(1).count())
<< "\nnumber of predictions must match the number of labels.";
this->valid.Reshape(vector<TIndex>(1, outer_dim * inner_dim));
this->losses.Reshape(vector<TIndex>(1, outer_dim * inner_dim));
this->softmax_op->Run();
this->prob = ws()->GetTensor("_t_" + anchor() + "_softmax_prob");
scale = ws()->CreateTensor("_t_" + anchor() + "_focal_scale");
scale->ReshapeLike(*this->prob);
if (input(0).template IsType<float>()) RunWithType<float>();
else LOG(FATAL) << "unsupported input types.";
}
DEPLOY_CPU(SparseSoftmaxFocalLoss);
#ifdef WITH_CUDA
DEPLOY_CUDA(SparseSoftmaxFocalLoss);
#endif
OPERATOR_SCHEMA(SparseSoftmaxFocalLoss).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T>
void SparseSoftmaxFocalLossGradientOp<Context>::RunWithType() {
auto* label_data = input(1).template data<T, Context>();
auto* prob_data = this->prob->template mutable_data<T, Context>();
auto* dXdata = output(0)->template mutable_data<T, Context>();
auto* valid_data = this->valid.template mutable_data<T, Context>();
auto* scale_data = scale->template mutable_data<T, Context>();
kernel::SparseSoftmaxFocalLossGrad<T, Context>(output(0)->count(),
output(0)->dim(axis),
outer_dim,
inner_dim,
gamma,
eps,
scale_data,
prob_data,
label_data,
valid_data,
&this->ignore,
dXdata);
if (normalization == "UNIT") {
auto* dYdata = input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(input(0).count() / input(0).dim(axis),
input(0).dim(axis),
inner_dim,
1.0,
dYdata,
prob_data);
math::Mul<T, Context>(output(0)->count(), prob_data, dXdata, dXdata);
return;
}
T normalizer;
if (normalization == "VALID") normalizer = math::ASum<T, Context>(this->valid.count(), valid_data);
else if (normalization == "BATCH_SIZE") normalizer = outer_dim;
else if (normalization == "FULL") normalizer = outer_dim * inner_dim;
else if (normalization == "NONE") normalizer = 1;
auto* dYdata = input(-1).template data<T, CPUContext>();
math::Scal<T, Context>(output(0)->count(), dYdata[0] / normalizer, dXdata);
}
template <class Context>
void SparseSoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
this->prob = ws()->GetTensor("_t_" + anchor() + "_softmax_prob");
scale = ws()->GetTensor("_t_" + anchor() + "_focal_scale");
outer_dim = this->prob->count(0, axis);
inner_dim = this->prob->count(axis + 1);
output(0)->ReshapeLike(input(0));
this->valid.Reshape(vector<TIndex>(1, outer_dim * inner_dim));
if (input(0).template IsType<float>()) RunWithType<float>();
else LOG(FATAL) << "unsupported input types.";
}
DEPLOY_CPU(SparseSoftmaxFocalLossGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SparseSoftmaxFocalLossGradient);
#endif
OPERATOR_SCHEMA(SparseSoftmaxFocalLossGradient).NumInputs(3).NumOutputs(1);
class GetSparseSoftmaxFocalLossGradient final : public GradientMakerBase {
public:
GRADIENT_MAKER_CTOR(GetSparseSoftmaxFocalLossGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
vector<string> {I(0), I(1), GO(0)},
vector<string> {GI(0)});
}
};
REGISTER_GRADIENT(SparseSoftmaxFocalLoss, GetSparseSoftmaxFocalLossGradient);
} // namespace dragon
\ No newline at end of file
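For context, the per-element scale computed by this new operator corresponds to the focal loss of Lin et al., "Focal Loss for Dense Object Detection"; with p_t the predicted probability of the true class:

    FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t)

Setting gamma = 0 and alpha = 1 recovers plain cross-entropy, which is presumably why the use_pseudo_metric path divides the reported loss by alpha: it keeps the logged value on a scale comparable to the unweighted loss.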
@@ -104,7 +104,7 @@ DEPLOY_CUDA(MPIBroadcastGradient);
 OPERATOR_SCHEMA(MPIBroadcastGradient).NumInputs(1).NumOutputs(1);
 class GetMPIBroadcastGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetMPIBroadcastGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -86,7 +86,7 @@ DEPLOY_CUDA(MPIGatherGradient);
 OPERATOR_SCHEMA(MPIGatherGradient).NumInputs(2, INT_MAX).NumOutputs(1);
 class GetMPIGatherGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetMPIGatherGradient);
     vector<OperatorDef> MakeDefs() override {
         vector<string> inputs(1, I(0));
...
@@ -63,7 +63,7 @@ void BatchNormOp<Context>::RunWithType() {
                            1.0,
                            Ydata);
-    if (!use_global_stats) {
+    if (!use_global_stats && !is_recomputing) {
         // Var(X) = E((X - EX) ^ 2)
         math::Square<T, Context>(output(0)->count(), Ydata, Std_data);
         math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
@@ -120,9 +120,12 @@ void BatchNormOp<Context>::RunOnDevice() {
     if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
     else use_global_stats = use_stats == 1 ? true : false;
+    is_recomputing = ws()->GetTensor("_t_global_recompute_flag")
+                         ->template data<bool, CPUContext>()[0];
     // if true, Act/Exp/Pow/Norm Ops can not exist before when train
     if (inplace) output(0)->Share(input(0));
     if (input(0).template IsType<float>()) RunWithType<float>();
     else if (input(0).template IsType<float16>()) RunWithType<float16>();
     else LOG(FATAL) << "unsupported input types.";
@@ -248,18 +251,6 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
     else LOG(FATAL) << "unsupported input types.";
 }
-template <class Context>
-void BatchNormGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void BatchNormGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(BatchNormGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(BatchNormGradient);
@@ -267,7 +258,7 @@ DEPLOY_CUDA(BatchNormGradient);
 OPERATOR_SCHEMA(BatchNormGradient).NumInputs(3).NumOutputs(1);
 class GetBatchNormGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetBatchNormGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
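The new is_recomputing guard is what makes BatchNorm safe to re-execute: when a memory-saving scheduler replays the forward pass to regenerate freed activations, the op must normalize exactly as before but must not advance its running mean/variance a second time. A hedged sketch of the protocol implied by the flag (the driver loop here is hypothetical; the flag tensor name is the one read above):

    // Hypothetical recompute driver: raise the workspace-wide flag, replay the
    // forward ops whose outputs were freed, then lower the flag again.
    Tensor* flag = ws->GetTensor("_t_global_recompute_flag");
    flag->mutable_data<bool, CPUContext>()[0] = true;
    for (OperatorBase* op : ops_to_replay) op->Run();  // stat updates are skipped
    flag->mutable_data<bool, CPUContext>()[0] = false;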
@@ -72,7 +72,7 @@ void BatchRenormOp<Context>::RunWithType() {
                            1.0,
                            Ydata);
-    if (!use_global_stats) {
+    if (!use_global_stats && !is_recomputing) {
         // Var(X) = E((X - EX) ^ 2)
         math::Pow<T, Context>(stddev->count(), 2, Ydata, Std_data);
         math::Gemv<T, Context>(CblasNoTrans, nbychans, spatial_dim,
@@ -97,7 +97,7 @@ void BatchRenormOp<Context>::RunWithType() {
     math::AddScalar<T, Context>(mean.count(), eps, tVar_data);
     math::Pow<T, Context>(mean.count(), 0.5, tVar_data, tVar_data);
-    if (!use_global_stats) {
+    if (!use_global_stats && !is_recomputing) {
         // normalize history var
         math::AddScalar<T, Context>(mean.count(), eps, thVar_data);
         math::Pow<T, Context>(mean.count(), 0.5, thVar_data, thVar_data);
@@ -183,6 +183,8 @@ void BatchRenormOp<Context>::RunOnDevice() {
     if (use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
     else use_global_stats = use_stats == 1 ? true : false;
+    is_recomputing = ws()->GetTensor("_t_global_recompute_flag")
+                         ->template data<bool, CPUContext>()[0];
     // if true, Act/Exp/Pow/Norm Ops can not exist before when train
     if (inplace) output(0)->Share(input(0));
@@ -314,7 +316,7 @@ void BatchRenormGradientOp<Context>::RunWithType() {
     // release buffer
     ws()->ReleaseBuffer(stddev);
-    ws()->ReleaseBuffer(x_norm, true);
+    ws()->ReleaseBuffer(x_norm, "Common", true);
 }
 template <class Context>
@@ -336,18 +338,6 @@ void BatchRenormGradientOp<Context>::RunOnDevice() {
     else LOG(FATAL) << "unsupported input types.";
 }
-template <class Context>
-void BatchRenormGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void BatchRenormGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(BatchRenormGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(BatchRenormGradient);
@@ -355,7 +345,7 @@ DEPLOY_CUDA(BatchRenormGradient);
 OPERATOR_SCHEMA(BatchRenormGradient).NumInputs(3).NumOutputs(1);
 class GetBatchRenormGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetBatchRenormGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -54,7 +54,7 @@ void CuDNNBNOp<Context>::SpatialRunWithType() {
                          bn_desc,
                          Sdata,
                          Bdata,
-                         1.0 - this->momentum,
+                         is_recomputing ? 0.0 : 1.0 - this->momentum,
                          hMean_data,
                          hVar_data,
                          this->eps,
@@ -110,7 +110,7 @@ void CuDNNBNOp<Context>::PerActivationRunWithType() {
                          bn_desc,
                          Sdata,
                          Bdata,
-                         1.0 - this->momentum,
+                         is_recomputing ? 0.0 : 1.0 - this->momentum,
                          hMean_data,
                          hVar_data,
                          this->eps,
@@ -133,6 +133,8 @@ void CuDNNBNOp<Context>::RunOnDevice() {
     if (this->use_stats == -1) use_global_stats = phase() == "TEST" ? true : false;
     else use_global_stats = this->use_stats == 1 ? true : false;
+    is_recomputing = ws()->GetTensor("_t_global_recompute_flag")
+                         ->template data<bool, CPUContext>()[0];
     if (input(0).template IsType<float>()) {
         if (input(0).ndim() == 4) SpatialRunWithType<float>();
@@ -344,28 +346,22 @@ void CuDNNBNGradientOp<Context>::RunOnDevice() {
 }
 template <class Context>
-void BNGradientOp<Context>::ShareBeforeRun() {
+void BNGradientOp<Context>::ShareGradient() {
     if (use_global_stats) {
         if (output(0)->name() != "ignore") {
-            Tensor* dX = ws()->GetBuffer();
-            if (dX != nullptr) output(0)->Replace(*dX);
+            Tensor* dX = ws()->GetBuffer("Grad");
+            output(0)->Replace(*dX);
         }
     } else {
         if (output(0)->name() != "ignore" ||
             output(1)->name() != "ignore" ||
             output(2)->name() != "ignore") {
-            Tensor* dX = ws()->GetBuffer();
-            if (dX != nullptr) output(0)->Replace(*dX);
+            Tensor* dX = ws()->GetBuffer("Grad");
+            output(0)->Replace(*dX);
         }
     }
 }
-template <class Context>
-void BNGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(BNGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(BNGradient);
@@ -374,7 +370,7 @@ OPERATOR_SCHEMA(BNGradient).NumInputs(5).NumOutputs(3);
 DEPLOY_CUDNN(BNGradient);
 class GetBNGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetBNGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
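The cuDNN path expresses the same guard through the exponential-average factor: cuDNN updates the running statistics as

    mu_running <- (1 - f) * mu_running + f * mu_batch    (and likewise for the variance)

so passing f = 0 while recomputing leaves the history untouched, whereas f = 1 - momentum is the normal training update; the batch statistics themselves are still produced for normalization either way.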
@@ -155,18 +155,6 @@ void InstanceNormGradientOp<Context>::RunOnDevice() {
     else LOG(FATAL) << "unsupported input types.";
 }
-template <class Context>
-void InstanceNormGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void InstanceNormGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(InstanceNormGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(InstanceNormGradient);
@@ -174,7 +162,7 @@ DEPLOY_CUDA(InstanceNormGradient);
 OPERATOR_SCHEMA(InstanceNormGradient).NumInputs(3).NumOutputs(1);
 class GetInstanceNormGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetInstanceNormGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -180,18 +180,6 @@ void L2NormGradientOp<Context>::RunOnDevice() {
     else LOG(FATAL) << "unsupported input types.";
 }
-template <class Context>
-void L2NormGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void L2NormGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(L2NormGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(L2NormGradient);
@@ -199,7 +187,7 @@ DEPLOY_CUDA(L2NormGradient);
 OPERATOR_SCHEMA(L2NormGradient).NumInputs(2).NumOutputs(1);
 class GetL2NormGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetL2NormGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -85,7 +85,7 @@ DEPLOY_CUDA(LSTMUnitGradient);
 OPERATOR_SCHEMA(LSTMUnitGradient).NumInputs(5).NumOutputs(2);
 class GetLSTMUnitGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetLSTMUnitGradient);
     vector<OperatorDef> MakeDefs() override{
         return SingleDef(def.type() + "Gradient", "",
...
 #include <algorithm>
-#include "operators/common/utils_op.h"
+#include "operators/utils/accuracy_op.h"
 #include "utils/math_functions.h"
-#include "utils/op_kernel.h"
 namespace dragon {
-template <class Context> template <typename T>
-void CopyOp<Context>::RunWithType() {
-    auto* Xdata = input(0).template data<T, Context>();
-    auto* Ydata = output(0)->template mutable_data<T, Context>();
-    ctx().template Copy<T, Context, Context>(output(0)->count(), Ydata, Xdata);
-}
-template <class Context>
-void CopyOp<Context>::RunOnDevice() {
-    output(0)->ReshapeLike(input(0));
-    if (input(0).template IsType<float>()) RunWithType<float>();
-    else if (input(0).template IsType<float16>()) RunWithType<float16>();
-    else LOG(FATAL) << "unsupported input types.";
-}
-DEPLOY_CPU(Copy);
-#ifdef WITH_CUDA
-DEPLOY_CUDA(Copy);
-#endif
-OPERATOR_SCHEMA(Copy).NumInputs(1).NumOutputs(1);
-NO_GRADIENT(Copy);
 template <class Context> template <typename T>
 void AccuracyOp<Context>::RunWithType() {
     if (OutputSize() > 1) {
@@ -94,35 +70,4 @@ OPERATOR_SCHEMA(Accuracy).NumInputs(2).NumOutputs(1, 2);
 NO_GRADIENT(Accuracy);
-template <class Context> template <typename T>
-void OneHotOp<Context>::RunWithType() {
-    auto* Xdata = input(0).template data<T, Context>();
-    auto* Ydata = output(0)->template mutable_data<T, Context>();
-    math::Set<T, Context>(output(0)->count(),
-                          dragon_cast<T, float>(float(off_value)),
-                          Ydata);
-    kernel::OneHot<T, Context>(input(0).count(), depth, on_value, Xdata, Ydata);
-}
-template <class Context>
-void OneHotOp<Context>::RunOnDevice() {
-    vector<TIndex> dims = input(0).dims();
-    dims.push_back(depth);
-    output(0)->Reshape(dims);
-    if (input(0).template IsType<float>()) RunWithType<float>();
-    else LOG(FATAL) << "unsupported input types.";
-}
-DEPLOY_CPU(OneHot);
-#ifdef WITH_CUDA
-DEPLOY_CUDA(OneHot);
-#endif
-OPERATOR_SCHEMA(OneHot).NumInputs(1).NumOutputs(1);
-NO_GRADIENT(OneHot);
 } // namespace dragon
\ No newline at end of file
#include "operators/utils/copy_op.h"
namespace dragon {
template <class Context> template <typename T>
void CopyOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>();
auto* Ydata = output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(output(0)->count(), Ydata, Xdata);
}
template <class Context>
void CopyOp<Context>::RunOnDevice() {
output(0)->ReshapeLike(input(0));
if (input(0).template IsType<float>()) RunWithType<float>();
else if (input(0).template IsType<float16>()) RunWithType<float16>();
else LOG(FATAL) << "unsupported input types.";
}
DEPLOY_CPU(Copy);
#ifdef WITH_CUDA
DEPLOY_CUDA(Copy);
#endif
OPERATOR_SCHEMA(Copy).NumInputs(1).NumOutputs(1);
NO_GRADIENT(Copy);
} // namespace dragon
\ No newline at end of file
#include "operators/utils/one_hot_op.h"
#include "utils/math_functions.h"
#include "utils/op_kernel.h"
namespace dragon {
template <class Context> template <typename T>
void OneHotOp<Context>::RunWithType() {
auto* Xdata = input(0).template data<T, Context>();
auto* Ydata = output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(output(0)->count(),
dragon_cast<T, float>(float(off_value)),
Ydata);
kernel::OneHot<T, Context>(input(0).count(), depth, on_value, Xdata, Ydata);
}
template <class Context>
void OneHotOp<Context>::RunOnDevice() {
vector<TIndex> dims = input(0).dims();
dims.push_back(depth);
output(0)->Reshape(dims);
if (input(0).template IsType<float>()) RunWithType<float>();
else LOG(FATAL) << "unsupported input types.";
}
DEPLOY_CPU(OneHot);
#ifdef WITH_CUDA
DEPLOY_CUDA(OneHot);
#endif
OPERATOR_SCHEMA(OneHot).NumInputs(1).NumOutputs(1);
NO_GRADIENT(OneHot);
} // namespace dragon
\ No newline at end of file
@@ -96,20 +96,6 @@ void ConvGradientOp<Context>::RunOnDevice() {
     else LOG(FATAL) << "unsupported input types.";
 }
-template <class Context>
-void ConvGradientOp<Context>::ShareBeforeRun() {
-    if (output(0)->name() != "ignore") {
-        Tensor* dX = ws()->GetBuffer();
-        if (dX != nullptr) output(0)->Replace(*dX);
-    }
-}
-template <class Context>
-void ConvGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(ConvGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(ConvGradient);
@@ -117,7 +103,7 @@ DEPLOY_CUDA(ConvGradient);
 OPERATOR_SCHEMA(ConvGradient).NumInputs(3).NumOutputs(3);
 class GetConvGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetConvGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
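The ShareBeforeRun/ClearAfterRun boilerplate deleted here (and from the other gradient ops in this commit) moves into the shared MakeResource/CleanResource path gated by allow_share_grads_. The default implementation lives in operator.cc and is not shown in this excerpt; the following is a sketch inferred from the BNGradientOp::ShareGradient override above, with the buffer-pool signatures treated as assumptions:

    // Sketch only: what a default ShareGradient/CleanResource pair would do.
    template <class Context>
    void Operator<Context>::ShareGradient() {
        if (output(0)->name() != "ignore") {
            Tensor* dX = ws()->GetBuffer("Grad");  // borrow a pooled gradient buffer
            output(0)->Replace(*dX);               // dX is written into the pool slot
        }
    }
    template <class Context>
    void Operator<Context>::CleanResource() {
        ws()->ReleaseBuffer(&input(-1), "Grad");   // assumed signature: return dY to the pool
    }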
@@ -97,20 +97,6 @@ void DeConvGradientOp<Context>::RunOnDevice() {
     else LOG(FATAL) << "unsupported input types.";
 }
-template <class Context>
-void DeConvGradientOp<Context>::ShareBeforeRun() {
-    if (output(0)->name() != "ignore") {
-        Tensor* dX = ws()->GetBuffer();
-        if (dX != nullptr) output(0)->Replace(*dX);
-    }
-}
-template <class Context>
-void DeConvGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(DeConvGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(DeConvGradient);
@@ -118,7 +104,7 @@ DEPLOY_CUDA(DeConvGradient);
 OPERATOR_SCHEMA(DeConvGradient).NumInputs(3).NumOutputs(3);
 class GetDeConvGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetDeConvGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -4,12 +4,6 @@
 namespace dragon {
-template <class Context>
-void DenseConcatOp<Context>::RunOnDevice() {
-    ConcatOp<Context>::RunOnDevice();
-    input(0).Release();    // keep shape, just release mem
-}
 DEPLOY_CPU(DenseConcat);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(DenseConcat);
@@ -17,16 +11,18 @@ DEPLOY_CUDA(DenseConcat);
 OPERATOR_SCHEMA(DenseConcat).NumInputs(2).NumOutputs(1);
 template <class Context> template <typename T>
-void DenseConcatGradientOp<Context>::RunWithType() {
-    // restore X1 from Y
-    auto* Ydata = input(-2).template data<T, Context>();
-    auto* Xdata = input(0).template mutable_data<T, Context>();
-    this->x_concat_dim = input(0).dim(this->axis);
-    TIndex count = input(0).count();
+void DenseConcatGradientOp<Context>::RestoreX1() {
+    CHECK_GT(growth_rate, 0) << "invalid growth rate, please preset it.";
     this->concat_dims = input(-1).dims();
     this->y_concat_dim = this->concat_dims[this->axis];
     this->outer_dim = input(-1).count(0, this->axis);
     this->inner_dim = input(-1).count(this->axis + 1);
+    this->concat_dims[this->axis] -= growth_rate;
+    input(0).Reshape(this->concat_dims);
+    this->x_concat_dim = input(0).dim(this->axis);
+    TIndex count = input(0).count();
+    auto* Ydata = input(-2).template data<T, Context>();
+    auto* Xdata = input(0).template mutable_data<T, Context>();
     kernel::ConcatGrad<T, Context>(count,
                                    this->outer_dim,
                                    this->inner_dim,
@@ -39,26 +35,65 @@ void DenseConcatGradientOp<Context>::RunWithType() {
 }
 template <class Context>
-void DenseConcatGradientOp<Context>::RunOnDevice() {
-    if (input(0).template IsType<float>()) RunWithType<float>();
-    else if (input(0).template IsType<float16>()) RunWithType<float16>();
-    else LOG(FATAL) << "unsupported input types.";
-    ConcatGradientOp<Context>::RunOnDevice();
-}
-template <class Context>
-void DenseConcatGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void DenseConcatGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    Tensor* Y = &input(-2);
-    ws()->ReleaseBuffer(dY);
-    ws()->ReleaseBuffer(Y, true);
+void DenseConcatGradientOp<Context>::ElimateCorruption() {
+    Set<string> all_heads;
+    queue<int> safe_heads;
+    Tensor* head = ws()->GetTensor("_t_mirrow_stage_head");
+    string* head_data = head->mutable_data<string, CPUContext>();
+    for (int i = 0; i < head->count(); i++) all_heads.insert(head_data[i]);
+    // sub-graph run
+    if (input(0).is_corrupted() && !all_heads.count(input(0).name())) {
+        // pre-process
+        LOG(DEBUG) << "Tensor(" << input(0).name() << ") is corrupted, recompute... ";
+        for (int i = 0; i < head->count(); i++) {
+            bool safe = true;
+            for (int j = 0; j < InputSize(); j++)
+                if (head_data[i] == input(j).name()) safe = false;
+            if (safe) safe_heads.push(i);
+        }
+        int idx = safe_heads.front();
+        safe_heads.pop();
+        Tensor* buffer = ws()->GetTensor("_t_mirrow_stage_buffer_" + dragon_cast<string, int>(idx));
+        input(0).Move(buffer->memory());
+        head_data[idx] = input(0).name();
+        if (input(-2).template IsType<float>()) RestoreX1<float>();
+        else if (input(-2).template IsType<float16>()) RestoreX1<float16>();
+        else LOG(FATAL) << "unsupported input types.";
+        // post-process
+        if (input(0).memory() != buffer->memory()) buffer->Move(input(0).memory());
+    }
+    // check available head
+    while (!safe_heads.empty()) safe_heads.pop();
+    all_heads.clear();
+    for (int i = 0; i < head->count(); i++) {
+        bool safe = true;
+        for (int j = 0; j < InputSize(); j++)
+            if (head_data[i] == input(j).name()) safe = false;
+        if (safe) safe_heads.push(i);
+        all_heads.insert(head_data[i]);
+    }
+    // pre-process
+    for (int i = 0; i < OutputSize(); i++) {
+        if (output(i)->is_corrupted()) {
+            bool inplace_flag = false;
+            for (int j = 0; j < InputSize(); j++)
+                if (output(i)->name() == input(j).name()) inplace_flag = true;
+            if (inplace_flag || all_heads.count(output(i)->name())) continue;    // skip to use new buffer
+            CHECK(!safe_heads.empty())
+                << "\nat most (" << safe_heads.size() << " [safe] / "
+                << all_heads.size() << " [total] can be used for corrupted output in "
+                << "(" << name() << ", " << type() << "), "
+                << "\nadd WORKSPACE_MAX_CORRUPTED_SIZE for more powerful mirrow stage ?";
+            int idx = safe_heads.front();
+            safe_heads.pop();
+            Tensor* buffer = ws()->GetTensor("_t_mirrow_stage_buffer_" + dragon_cast<string, int>(idx));
+            output(i)->Move(buffer->memory());
+            head_data[idx] = output(i)->name();
+        }
+    }
 }
 DEPLOY_CPU(DenseConcatGradient);
@@ -68,7 +103,7 @@ DEPLOY_CUDA(DenseConcatGradient);
 OPERATOR_SCHEMA(DenseConcatGradient).NumInputs(4).NumOutputs(2);
 class GetDenseConcatGradient : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetDenseConcatGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
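The bookkeeping above maintains one invariant: head_data[i] names the tensor whose bytes currently occupy the pooled buffer "_t_mirrow_stage_buffer_<i>", and a slot is "safe" only if it backs none of this op's inputs (otherwise rebinding would clobber data still needed). Rebinding a tensor t to a slot is then just a memory move plus a head update, as in this fragment lifted from the loops above:

    int idx = safe_heads.front(); safe_heads.pop();    // claim a safe slot
    Tensor* buffer = ws()->GetTensor("_t_mirrow_stage_buffer_" + dragon_cast<string, int>(idx));
    t->Move(buffer->memory());                         // t's bytes now live in slot idx
    head_data[idx] = t->name();                        // record the new owner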
@@ -247,7 +247,7 @@ DEPLOY_CUDA(LRNGradient);
 OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);
 class GetLRNGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetLRNGradient);
     vector<OperatorDef> MakeDefs() override{
         return SingleDef(def.type() + "Gradient", "",
...
@@ -54,18 +54,6 @@ void NNResizeGradientOp<Context>::RunOnDevice() {
     else LOG(FATAL) << "unsupported input types.";
 }
-template <class Context>
-void NNResizeGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void NNResizeGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(NNResizeGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(NNResizeGradient);
@@ -73,7 +61,7 @@ DEPLOY_CUDA(NNResizeGradient);
 OPERATOR_SCHEMA(NNResizeGradient).NumInputs(2).NumOutputs(1);
 class GetNNResizeGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetNNResizeGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -151,18 +151,6 @@ void PoolingGradientOp<Context>::RunOnDevice() {
     }
 }
-template <class Context>
-void PoolingGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void PoolingGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-}
 DEPLOY_CPU(PoolingGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(PoolingGradient);
@@ -170,7 +158,7 @@ DEPLOY_CUDA(PoolingGradient);
 OPERATOR_SCHEMA(PoolingGradient).NumInputs(3).NumOutputs(1);
 class GetPoolingGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetPoolingGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -57,17 +57,10 @@ void ROIAlignGradientOp<Context>::RunOnDevice() {
 }
 template <class Context>
-void ROIAlignGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void ROIAlignGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-    ws()->ReleaseBuffer(mask_h);
-    ws()->ReleaseBuffer(mask_w);
+void ROIAlignGradientOp<Context>::CleanResource() {
+    Operator<Context>::CleanResource();
+    ws()->ReleaseBuffer(mask_h, "Common", true);
+    ws()->ReleaseBuffer(mask_w, "Common", true);
 }
 DEPLOY_CPU(ROIAlignGradient);
@@ -77,7 +70,7 @@ DEPLOY_CUDA(ROIAlignGradient);
 OPERATOR_SCHEMA(ROIAlignGradient).NumInputs(3).NumOutputs(1);
 class GetROIAlignGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetROIAlignGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -55,16 +55,9 @@ void ROIPoolingGradientOp<Context>::RunOnDevice() {
 }
 template <class Context>
-void ROIPoolingGradientOp<Context>::ShareBeforeRun() {
-    Tensor* dX = ws()->GetBuffer();
-    if (dX != nullptr) output(0)->Replace(*dX);
-}
-template <class Context>
-void ROIPoolingGradientOp<Context>::ClearAfterRun() {
-    Tensor* dY = &input(-1);
-    ws()->ReleaseBuffer(dY);
-    ws()->ReleaseBuffer(mask);
+void ROIPoolingGradientOp<Context>::CleanResource() {
+    Operator<Context>::CleanResource();
+    ws()->ReleaseBuffer(mask, "Common", true);
 }
 DEPLOY_CPU(ROIPoolingGradient);
@@ -74,7 +67,7 @@ DEPLOY_CUDA(ROIPoolingGradient);
 OPERATOR_SCHEMA(ROIPoolingGradient).NumInputs(3).NumOutputs(1);
 class GetROIPoolingGradient final : public GradientMakerBase {
  public:
     GRADIENT_MAKER_CTOR(GetROIPoolingGradient);
     vector<OperatorDef> MakeDefs() override {
         return SingleDef(def.type() + "Gradient", "",
...
@@ -22,5 +22,3 @@ message LayerParameter {
 optional string name = 1; // the layer name
 repeated BlobProto blobs = 7;
 }
@@ -137,7 +137,7 @@ void protobuf_AssignDesc_dragon_2eproto() {
     ::google::protobuf::MessageFactory::generated_factory(),
     sizeof(DeviceOption));
   OperatorDef_descriptor_ = file->message_type(4);
-  static const int OperatorDef_offsets_[7] = {
+  static const int OperatorDef_offsets_[8] = {
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(OperatorDef, input_),
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(OperatorDef, output_),
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(OperatorDef, name_),
@@ -145,6 +145,7 @@ void protobuf_AssignDesc_dragon_2eproto() {
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(OperatorDef, arg_),
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(OperatorDef, device_option_),
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(OperatorDef, debug_mode_),
+    GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(OperatorDef, share_grads_),
   };
   OperatorDef_reflection_ =
     new ::google::protobuf::internal::GeneratedMessageReflection(
@@ -217,7 +218,7 @@ void protobuf_AssignDesc_dragon_2eproto() {
     sizeof(TensorFiller));
   TensorFiller_VarianceNorm_descriptor_ = TensorFiller_descriptor_->enum_type(0);
   GraphDef_descriptor_ = file->message_type(8);
-  static const int GraphDef_offsets_[9] = {
+  static const int GraphDef_offsets_[10] = {
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(GraphDef, name_),
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(GraphDef, op_),
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(GraphDef, graph_type_),
@@ -227,6 +228,7 @@ void protobuf_AssignDesc_dragon_2eproto() {
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(GraphDef, g_target_),
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(GraphDef, u_target_),
     GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(GraphDef, debug_mode_),
+    GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(GraphDef, share_grads_),
   };
   GraphDef_reflection_ =
     new ::google::protobuf::internal::GeneratedMessageReflection(
@@ -316,29 +318,31 @@ void protobuf_AddDesc_dragon_2eproto() {
     "oats\030\005 \003(\002\022\014\n\004ints\030\006 \003(\005\022\017\n\007strings\030\007 \003("
     "\t\"p\n\014DeviceOption\022%\n\013device_type\030\001 \001(\0162\013"
     ".DeviceType:\003CPU\022\021\n\006gpu_id\030\002 \001(\005:\0010\022\026\n\013r"
-    "andom_seed\030\003 \001(\r:\0013\022\016\n\006engine\030\004 \001(\t\"\241\001\n\013"
+    "andom_seed\030\003 \001(\r:\0013\022\016\n\006engine\030\004 \001(\t\"\275\001\n\013"
     "OperatorDef\022\r\n\005input\030\001 \003(\t\022\016\n\006output\030\002 \003"
     "(\t\022\014\n\004name\030\003 \001(\t\022\014\n\004type\030\004 \001(\t\022\026\n\003arg\030\005 "
     "\003(\0132\t.Argument\022$\n\rdevice_option\030\006 \001(\0132\r."
-    "DeviceOption\022\031\n\ndebug_mode\030\007 \001(\010:\005false\""
-    "=\n\016GradientTarget\022\014\n\004cost\030\001 \001(\t\022\013\n\003wrt\030\002"
-    " \001(\t\022\020\n\010external\030\003 \001(\t\"R\n\014UpdateTarget\022\014"
-    "\n\004name\030\001 \001(\t\022\014\n\004type\030\002 \001(\t\022\016\n\006tensor\030\003 \003"
-    "(\t\022\026\n\003arg\030\004 \003(\0132\t.Argument\"\215\002\n\014TensorFil"
-    "ler\022\016\n\006tensor\030\001 \001(\t\022\026\n\004type\030\002 \001(\t:\010const"
-    "ant\022\020\n\005value\030\003 \001(\002:\0010\022\016\n\003low\030\004 \001(\002:\0010\022\017\n"
-    "\004high\030\005 \001(\002:\0011\022\017\n\004mean\030\006 \001(\002:\0010\022\016\n\003std\030\007"
-    " \001(\002:\0011\022\020\n\005scale\030\010 \001(\002:\0013\0229\n\rvariance_no"
-    "rm\030\t \001(\0162\032.TensorFiller.VarianceNorm:\006FA"
-    "N_IN\"4\n\014VarianceNorm\022\n\n\006FAN_IN\020\000\022\013\n\007FAN_"
-    "OUT\020\001\022\013\n\007FAN_AVG\020\002\"\363\001\n\010GraphDef\022\014\n\004name\030"
-    "\001 \001(\t\022\030\n\002op\030\002 \003(\0132\014.OperatorDef\022\022\n\ngraph"
-    "_type\030\003 \001(\t\022$\n\rdevice_option\030\005 \001(\0132\r.Dev"
-    "iceOption\022\026\n\003arg\030\006 \003(\0132\t.Argument\022\016\n\006tar"
-    "get\030\007 \003(\t\022!\n\010g_target\030\010 \003(\0132\017.GradientTa"
-    "rget\022\037\n\010u_target\030\t \003(\0132\r.UpdateTarget\022\031\n"
-    "\ndebug_mode\030\n \001(\010:\005false*+\n\nDeviceType\022\007"
-    "\n\003CPU\020\000\022\010\n\004CUDA\020\001\022\n\n\006OPENCL\020\002", 1429);
+    "DeviceOption\022\031\n\ndebug_mode\030\007 \001(\010:\005false\022"
+    "\032\n\013share_grads\030\010 \001(\010:\005false\"=\n\016GradientT"
+    "arget\022\014\n\004cost\030\001 \001(\t\022\013\n\003wrt\030\002 \001(\t\022\020\n\010exte"
+    "rnal\030\003 \001(\t\"R\n\014UpdateTarget\022\014\n\004name\030\001 \001(\t"
+    "\022\014\n\004type\030\002 \001(\t\022\016\n\006tensor\030\003 \003(\t\022\026\n\003arg\030\004 "
+    "\003(\0132\t.Argument\"\215\002\n\014TensorFiller\022\016\n\006tenso"
+    "r\030\001 \001(\t\022\026\n\004type\030\002 \001(\t:\010constant\022\020\n\005value"
+    "\030\003 \001(\002:\0010\022\016\n\003low\030\004 \001(\002:\0010\022\017\n\004high\030\005 \001(\002:"
+    "\0011\022\017\n\004mean\030\006 \001(\002:\0010\022\016\n\003std\030\007 \001(\002:\0011\022\020\n\005s"
+    "cale\030\010 \001(\002:\0013\0229\n\rvariance_norm\030\t \001(\0162\032.T"
+    "ensorFiller.VarianceNorm:\006FAN_IN\"4\n\014Vari"
+    "anceNorm\022\n\n\006FAN_IN\020\000\022\013\n\007FAN_OUT\020\001\022\013\n\007FAN"
+    "_AVG\020\002\"\217\002\n\010GraphDef\022\014\n\004name\030\001 \001(\t\022\030\n\002op\030"
+    "\002 \003(\0132\014.OperatorDef\022\022\n\ngraph_type\030\003 \001(\t\022"
+    "$\n\rdevice_option\030\005 \001(\0132\r.DeviceOption\022\026\n"
+    "\003arg\030\006 \003(\0132\t.Argument\022\016\n\006target\030\007 \003(\t\022!\n"
+    "\010g_target\030\010 \003(\0132\017.GradientTarget\022\037\n\010u_ta"
+    "rget\030\t \003(\0132\r.UpdateTarget\022\031\n\ndebug_mode\030"
+    "\n \001(\010:\005false\022\032\n\013share_grads\030\013 \001(\010:\005false"
+    "*+\n\nDeviceType\022\007\n\003CPU\020\000\022\010\n\004CUDA\020\001\022\n\n\006OPE"
+    "NCL\020\002", 1485);
   ::google::protobuf::MessageFactory::InternalRegisterGeneratedFile(
     "dragon.proto", &protobuf_RegisterTypes);
   TensorProto::default_instance_ = new TensorProto();
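Decoding the updated descriptor string, the dragon.proto change behind this regenerated file adds one optional flag to each message; the field numbers and defaults below are read directly from the encoded bytes above (\030\010 = field 8, \030\013 = field 11):

    message OperatorDef {
      // ... fields 1-7 unchanged ...
      optional bool share_grads = 8 [default = false];
    }
    message GraphDef {
      // ... fields 1-10 unchanged ...
      optional bool share_grads = 11 [default = false];
    }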
@@ -2171,6 +2175,7 @@ const int OperatorDef::kTypeFieldNumber;
 const int OperatorDef::kArgFieldNumber;
 const int OperatorDef::kDeviceOptionFieldNumber;
 const int OperatorDef::kDebugModeFieldNumber;
+const int OperatorDef::kShareGradsFieldNumber;
 #endif // !_MSC_VER
 OperatorDef::OperatorDef()
@@ -2197,6 +2202,7 @@ void OperatorDef::SharedCtor() {
   type_ = const_cast< ::std::string*>(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
   device_option_ = NULL;
   debug_mode_ = false;
+  share_grads_ = false;
   ::memset(_has_bits_, 0, sizeof(_has_bits_));
 }
@@ -2239,7 +2245,18 @@ OperatorDef* OperatorDef::New() const {
 }
 void OperatorDef::Clear() {
-  if (_has_bits_[0 / 32] & 108) {
+#define OFFSET_OF_FIELD_(f) (reinterpret_cast<char*>( \
+  &reinterpret_cast<OperatorDef*>(16)->f) - \
+  reinterpret_cast<char*>(16))
+#define ZR_(first, last) do { \
+    size_t f = OFFSET_OF_FIELD_(first); \
+    size_t n = OFFSET_OF_FIELD_(last) - f + sizeof(last); \
+    ::memset(&first, 0, n); \
+  } while (0)
+  if (_has_bits_[0 / 32] & 236) {
+    ZR_(debug_mode_, share_grads_);
     if (has_name()) {
       if (name_ != &::google::protobuf::internal::GetEmptyStringAlreadyInited()) {
         name_->clear();
@@ -2253,8 +2270,11 @@ void OperatorDef::Clear() {
     if (has_device_option()) {
       if (device_option_ != NULL) device_option_->::DeviceOption::Clear();
     }
-    debug_mode_ = false;
   }
+#undef OFFSET_OF_FIELD_
+#undef ZR_
   input_.Clear();
   output_.Clear();
   arg_.Clear();
@@ -2382,6 +2402,21 @@ bool OperatorDef::MergePartialFromCodedStream(
         } else {
           goto handle_unusual;
         }
+        if (input->ExpectTag(64)) goto parse_share_grads;
+        break;
+      }
+      // optional bool share_grads = 8 [default = false];
+      case 8: {
+        if (tag == 64) {
+         parse_share_grads:
+          DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
+                   bool, ::google::protobuf::internal::WireFormatLite::TYPE_BOOL>(
+                 input, &share_grads_)));
+          set_has_share_grads();
+        } else {
+          goto handle_unusual;
+        }
         if (input->ExpectAtEnd()) goto success;
         break;
       }
@@ -2468,6 +2503,11 @@ void OperatorDef::SerializeWithCachedSizes(
     ::google::protobuf::internal::WireFormatLite::WriteBool(7, this->debug_mode(), output);
   }
+  // optional bool share_grads = 8 [default = false];
+  if (has_share_grads()) {
+    ::google::protobuf::internal::WireFormatLite::WriteBool(8, this->share_grads(), output);
+  }
   if (!unknown_fields().empty()) {
     ::google::protobuf::internal::WireFormat::SerializeUnknownFields(
         unknown_fields(), output);
@@ -2539,6 +2579,11 @@ void OperatorDef::SerializeWithCachedSizes(
     target = ::google::protobuf::internal::WireFormatLite::WriteBoolToArray(7, this->debug_mode(), target);
   }
+  // optional bool share_grads = 8 [default = false];
+  if (has_share_grads()) {
+    target = ::google::protobuf::internal::WireFormatLite::WriteBoolToArray(8, this->share_grads(), target);
+  }
   if (!unknown_fields().empty()) {
     target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray(
         unknown_fields(), target);
...@@ -2577,6 +2622,11 @@ int OperatorDef::ByteSize() const { ...@@ -2577,6 +2622,11 @@ int OperatorDef::ByteSize() const {
total_size += 1 + 1; total_size += 1 + 1;
} }
// optional bool share_grads = 8 [default = false];
if (has_share_grads()) {
total_size += 1 + 1;
}
} }
// repeated string input = 1; // repeated string input = 1;
total_size += 1 * this->input_size(); total_size += 1 * this->input_size();
...@@ -2641,6 +2691,9 @@ void OperatorDef::MergeFrom(const OperatorDef& from) { ...@@ -2641,6 +2691,9 @@ void OperatorDef::MergeFrom(const OperatorDef& from) {
if (from.has_debug_mode()) { if (from.has_debug_mode()) {
set_debug_mode(from.debug_mode()); set_debug_mode(from.debug_mode());
} }
if (from.has_share_grads()) {
set_share_grads(from.share_grads());
}
} }
mutable_unknown_fields()->MergeFrom(from.unknown_fields()); mutable_unknown_fields()->MergeFrom(from.unknown_fields());
} }
...@@ -2671,6 +2724,7 @@ void OperatorDef::Swap(OperatorDef* other) { ...@@ -2671,6 +2724,7 @@ void OperatorDef::Swap(OperatorDef* other) {
arg_.Swap(&other->arg_); arg_.Swap(&other->arg_);
std::swap(device_option_, other->device_option_); std::swap(device_option_, other->device_option_);
std::swap(debug_mode_, other->debug_mode_); std::swap(debug_mode_, other->debug_mode_);
std::swap(share_grads_, other->share_grads_);
std::swap(_has_bits_[0], other->_has_bits_[0]); std::swap(_has_bits_[0], other->_has_bits_[0]);
_unknown_fields_.Swap(&other->_unknown_fields_); _unknown_fields_.Swap(&other->_unknown_fields_);
std::swap(_cached_size_, other->_cached_size_); std::swap(_cached_size_, other->_cached_size_);
...@@ -4066,6 +4120,7 @@ const int GraphDef::kTargetFieldNumber; ...@@ -4066,6 +4120,7 @@ const int GraphDef::kTargetFieldNumber;
const int GraphDef::kGTargetFieldNumber; const int GraphDef::kGTargetFieldNumber;
const int GraphDef::kUTargetFieldNumber; const int GraphDef::kUTargetFieldNumber;
const int GraphDef::kDebugModeFieldNumber; const int GraphDef::kDebugModeFieldNumber;
const int GraphDef::kShareGradsFieldNumber;
#endif // !_MSC_VER #endif // !_MSC_VER
GraphDef::GraphDef() GraphDef::GraphDef()
...@@ -4092,6 +4147,7 @@ void GraphDef::SharedCtor() { ...@@ -4092,6 +4147,7 @@ void GraphDef::SharedCtor() {
graph_type_ = const_cast< ::std::string*>(&::google::protobuf::internal::GetEmptyStringAlreadyInited()); graph_type_ = const_cast< ::std::string*>(&::google::protobuf::internal::GetEmptyStringAlreadyInited());
device_option_ = NULL; device_option_ = NULL;
debug_mode_ = false; debug_mode_ = false;
share_grads_ = false;
::memset(_has_bits_, 0, sizeof(_has_bits_)); ::memset(_has_bits_, 0, sizeof(_has_bits_));
} }
...@@ -4134,6 +4190,16 @@ GraphDef* GraphDef::New() const { ...@@ -4134,6 +4190,16 @@ GraphDef* GraphDef::New() const {
} }
void GraphDef::Clear() { void GraphDef::Clear() {
#define OFFSET_OF_FIELD_(f) (reinterpret_cast<char*>( \
&reinterpret_cast<GraphDef*>(16)->f) - \
reinterpret_cast<char*>(16))
#define ZR_(first, last) do { \
size_t f = OFFSET_OF_FIELD_(first); \
size_t n = OFFSET_OF_FIELD_(last) - f + sizeof(last); \
::memset(&first, 0, n); \
} while (0)
if (_has_bits_[0 / 32] & 13) { if (_has_bits_[0 / 32] & 13) {
if (has_name()) { if (has_name()) {
if (name_ != &::google::protobuf::internal::GetEmptyStringAlreadyInited()) { if (name_ != &::google::protobuf::internal::GetEmptyStringAlreadyInited()) {
...@@ -4149,7 +4215,11 @@ void GraphDef::Clear() { ...@@ -4149,7 +4215,11 @@ void GraphDef::Clear() {
if (device_option_ != NULL) device_option_->::DeviceOption::Clear(); if (device_option_ != NULL) device_option_->::DeviceOption::Clear();
} }
} }
debug_mode_ = false; ZR_(debug_mode_, share_grads_);
#undef OFFSET_OF_FIELD_
#undef ZR_
op_.Clear(); op_.Clear();
arg_.Clear(); arg_.Clear();
target_.Clear(); target_.Clear();
...@@ -4301,6 +4371,21 @@ bool GraphDef::MergePartialFromCodedStream( ...@@ -4301,6 +4371,21 @@ bool GraphDef::MergePartialFromCodedStream(
} else { } else {
goto handle_unusual; goto handle_unusual;
} }
if (input->ExpectTag(88)) goto parse_share_grads;
break;
}
// optional bool share_grads = 11 [default = false];
case 11: {
if (tag == 88) {
parse_share_grads:
DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive<
bool, ::google::protobuf::internal::WireFormatLite::TYPE_BOOL>(
input, &share_grads_)));
set_has_share_grads();
} else {
goto handle_unusual;
}
if (input->ExpectAtEnd()) goto success; if (input->ExpectAtEnd()) goto success;
break; break;
} }
...@@ -4395,6 +4480,11 @@ void GraphDef::SerializeWithCachedSizes( ...@@ -4395,6 +4480,11 @@ void GraphDef::SerializeWithCachedSizes(
::google::protobuf::internal::WireFormatLite::WriteBool(10, this->debug_mode(), output); ::google::protobuf::internal::WireFormatLite::WriteBool(10, this->debug_mode(), output);
} }
// optional bool share_grads = 11 [default = false];
if (has_share_grads()) {
::google::protobuf::internal::WireFormatLite::WriteBool(11, this->share_grads(), output);
}
if (!unknown_fields().empty()) { if (!unknown_fields().empty()) {
::google::protobuf::internal::WireFormat::SerializeUnknownFields( ::google::protobuf::internal::WireFormat::SerializeUnknownFields(
unknown_fields(), output); unknown_fields(), output);
...@@ -4477,6 +4567,11 @@ void GraphDef::SerializeWithCachedSizes( ...@@ -4477,6 +4567,11 @@ void GraphDef::SerializeWithCachedSizes(
target = ::google::protobuf::internal::WireFormatLite::WriteBoolToArray(10, this->debug_mode(), target); target = ::google::protobuf::internal::WireFormatLite::WriteBoolToArray(10, this->debug_mode(), target);
} }
// optional bool share_grads = 11 [default = false];
if (has_share_grads()) {
target = ::google::protobuf::internal::WireFormatLite::WriteBoolToArray(11, this->share_grads(), target);
}
if (!unknown_fields().empty()) { if (!unknown_fields().empty()) {
target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray(
unknown_fields(), target); unknown_fields(), target);
...@@ -4517,6 +4612,11 @@ int GraphDef::ByteSize() const { ...@@ -4517,6 +4612,11 @@ int GraphDef::ByteSize() const {
total_size += 1 + 1; total_size += 1 + 1;
} }
// optional bool share_grads = 11 [default = false];
if (has_share_grads()) {
total_size += 1 + 1;
}
} }
// repeated .OperatorDef op = 2; // repeated .OperatorDef op = 2;
total_size += 1 * this->op_size(); total_size += 1 * this->op_size();
...@@ -4602,6 +4702,9 @@ void GraphDef::MergeFrom(const GraphDef& from) { ...@@ -4602,6 +4702,9 @@ void GraphDef::MergeFrom(const GraphDef& from) {
if (from.has_debug_mode()) { if (from.has_debug_mode()) {
set_debug_mode(from.debug_mode()); set_debug_mode(from.debug_mode());
} }
if (from.has_share_grads()) {
set_share_grads(from.share_grads());
}
} }
mutable_unknown_fields()->MergeFrom(from.unknown_fields()); mutable_unknown_fields()->MergeFrom(from.unknown_fields());
} }
...@@ -4634,6 +4737,7 @@ void GraphDef::Swap(GraphDef* other) { ...@@ -4634,6 +4737,7 @@ void GraphDef::Swap(GraphDef* other) {
g_target_.Swap(&other->g_target_); g_target_.Swap(&other->g_target_);
u_target_.Swap(&other->u_target_); u_target_.Swap(&other->u_target_);
std::swap(debug_mode_, other->debug_mode_); std::swap(debug_mode_, other->debug_mode_);
std::swap(share_grads_, other->share_grads_);
std::swap(_has_bits_[0], other->_has_bits_[0]); std::swap(_has_bits_[0], other->_has_bits_[0]);
_unknown_fields_.Swap(&other->_unknown_fields_); _unknown_fields_.Swap(&other->_unknown_fields_);
std::swap(_cached_size_, other->_cached_size_); std::swap(_cached_size_, other->_cached_size_);
......
...@@ -815,6 +815,13 @@ class OperatorDef : public ::google::protobuf::Message { ...@@ -815,6 +815,13 @@ class OperatorDef : public ::google::protobuf::Message {
inline bool debug_mode() const; inline bool debug_mode() const;
inline void set_debug_mode(bool value); inline void set_debug_mode(bool value);
// optional bool share_grads = 8 [default = false];
inline bool has_share_grads() const;
inline void clear_share_grads();
static const int kShareGradsFieldNumber = 8;
inline bool share_grads() const;
inline void set_share_grads(bool value);
// @@protoc_insertion_point(class_scope:OperatorDef) // @@protoc_insertion_point(class_scope:OperatorDef)
private: private:
inline void set_has_name(); inline void set_has_name();
...@@ -825,6 +832,8 @@ class OperatorDef : public ::google::protobuf::Message { ...@@ -825,6 +832,8 @@ class OperatorDef : public ::google::protobuf::Message {
inline void clear_has_device_option(); inline void clear_has_device_option();
inline void set_has_debug_mode(); inline void set_has_debug_mode();
inline void clear_has_debug_mode(); inline void clear_has_debug_mode();
inline void set_has_share_grads();
inline void clear_has_share_grads();
::google::protobuf::UnknownFieldSet _unknown_fields_; ::google::protobuf::UnknownFieldSet _unknown_fields_;
...@@ -837,6 +846,7 @@ class OperatorDef : public ::google::protobuf::Message { ...@@ -837,6 +846,7 @@ class OperatorDef : public ::google::protobuf::Message {
::google::protobuf::RepeatedPtrField< ::Argument > arg_; ::google::protobuf::RepeatedPtrField< ::Argument > arg_;
::DeviceOption* device_option_; ::DeviceOption* device_option_;
bool debug_mode_; bool debug_mode_;
bool share_grads_;
friend void protobuf_AddDesc_dragon_2eproto(); friend void protobuf_AddDesc_dragon_2eproto();
friend void protobuf_AssignDesc_dragon_2eproto(); friend void protobuf_AssignDesc_dragon_2eproto();
friend void protobuf_ShutdownFile_dragon_2eproto(); friend void protobuf_ShutdownFile_dragon_2eproto();
...@@ -1441,6 +1451,13 @@ class GraphDef : public ::google::protobuf::Message { ...@@ -1441,6 +1451,13 @@ class GraphDef : public ::google::protobuf::Message {
inline bool debug_mode() const; inline bool debug_mode() const;
inline void set_debug_mode(bool value); inline void set_debug_mode(bool value);
// optional bool share_grads = 11 [default = false];
inline bool has_share_grads() const;
inline void clear_share_grads();
static const int kShareGradsFieldNumber = 11;
inline bool share_grads() const;
inline void set_share_grads(bool value);
// @@protoc_insertion_point(class_scope:GraphDef) // @@protoc_insertion_point(class_scope:GraphDef)
private: private:
inline void set_has_name(); inline void set_has_name();
...@@ -1451,6 +1468,8 @@ class GraphDef : public ::google::protobuf::Message { ...@@ -1451,6 +1468,8 @@ class GraphDef : public ::google::protobuf::Message {
inline void clear_has_device_option(); inline void clear_has_device_option();
inline void set_has_debug_mode(); inline void set_has_debug_mode();
inline void clear_has_debug_mode(); inline void clear_has_debug_mode();
inline void set_has_share_grads();
inline void clear_has_share_grads();
::google::protobuf::UnknownFieldSet _unknown_fields_; ::google::protobuf::UnknownFieldSet _unknown_fields_;
...@@ -1465,6 +1484,7 @@ class GraphDef : public ::google::protobuf::Message { ...@@ -1465,6 +1484,7 @@ class GraphDef : public ::google::protobuf::Message {
::google::protobuf::RepeatedPtrField< ::GradientTarget > g_target_; ::google::protobuf::RepeatedPtrField< ::GradientTarget > g_target_;
::google::protobuf::RepeatedPtrField< ::UpdateTarget > u_target_; ::google::protobuf::RepeatedPtrField< ::UpdateTarget > u_target_;
bool debug_mode_; bool debug_mode_;
bool share_grads_;
friend void protobuf_AddDesc_dragon_2eproto(); friend void protobuf_AddDesc_dragon_2eproto();
friend void protobuf_AssignDesc_dragon_2eproto(); friend void protobuf_AssignDesc_dragon_2eproto();
friend void protobuf_ShutdownFile_dragon_2eproto(); friend void protobuf_ShutdownFile_dragon_2eproto();
...@@ -2712,6 +2732,30 @@ inline void OperatorDef::set_debug_mode(bool value) { ...@@ -2712,6 +2732,30 @@ inline void OperatorDef::set_debug_mode(bool value) {
// @@protoc_insertion_point(field_set:OperatorDef.debug_mode) // @@protoc_insertion_point(field_set:OperatorDef.debug_mode)
} }
// optional bool share_grads = 8 [default = false];
inline bool OperatorDef::has_share_grads() const {
return (_has_bits_[0] & 0x00000080u) != 0;
}
inline void OperatorDef::set_has_share_grads() {
_has_bits_[0] |= 0x00000080u;
}
inline void OperatorDef::clear_has_share_grads() {
_has_bits_[0] &= ~0x00000080u;
}
inline void OperatorDef::clear_share_grads() {
share_grads_ = false;
clear_has_share_grads();
}
inline bool OperatorDef::share_grads() const {
// @@protoc_insertion_point(field_get:OperatorDef.share_grads)
return share_grads_;
}
inline void OperatorDef::set_share_grads(bool value) {
set_has_share_grads();
share_grads_ = value;
// @@protoc_insertion_point(field_set:OperatorDef.share_grads)
}
// ------------------------------------------------------------------- // -------------------------------------------------------------------
// GradientTarget // GradientTarget
...@@ -3904,6 +3948,30 @@ inline void GraphDef::set_debug_mode(bool value) { ...@@ -3904,6 +3948,30 @@ inline void GraphDef::set_debug_mode(bool value) {
// @@protoc_insertion_point(field_set:GraphDef.debug_mode) // @@protoc_insertion_point(field_set:GraphDef.debug_mode)
} }
// optional bool share_grads = 11 [default = false];
inline bool GraphDef::has_share_grads() const {
return (_has_bits_[0] & 0x00000200u) != 0;
}
inline void GraphDef::set_has_share_grads() {
_has_bits_[0] |= 0x00000200u;
}
inline void GraphDef::clear_has_share_grads() {
_has_bits_[0] &= ~0x00000200u;
}
inline void GraphDef::clear_share_grads() {
share_grads_ = false;
clear_has_share_grads();
}
inline bool GraphDef::share_grads() const {
// @@protoc_insertion_point(field_get:GraphDef.share_grads)
return share_grads_;
}
inline void GraphDef::set_share_grads(bool value) {
set_has_share_grads();
share_grads_ = value;
// @@protoc_insertion_point(field_set:GraphDef.share_grads)
}
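A minimal sketch of driving the new flag through the generated accessors above (the graph-construction context is assumed, not part of this commit):

```cpp
#include <string>
#include "dragon.pb.h"  // the generated header shown above

// Turn on gradient sharing graph-wide, and per-op for gradient operators.
void EnableGradSharing(GraphDef* graph) {
    graph->set_share_grads(true);                        // field 11
    for (int i = 0; i < graph->op_size(); ++i) {
        OperatorDef* op = graph->mutable_op(i);
        if (op->type().find("Gradient") != std::string::npos)
            op->set_share_grads(true);                   // field 8
    }
}
```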
// @@protoc_insertion_point(namespace_scope) // @@protoc_insertion_point(namespace_scope)
......
...@@ -50,6 +50,7 @@ message OperatorDef { ...@@ -50,6 +50,7 @@ message OperatorDef {
repeated Argument arg= 5; repeated Argument arg= 5;
optional DeviceOption device_option = 6; optional DeviceOption device_option = 6;
optional bool debug_mode = 7 [default = false]; optional bool debug_mode = 7 [default = false];
optional bool share_grads = 8 [default = false];
} }
message GradientTarget { message GradientTarget {
...@@ -65,7 +66,6 @@ message UpdateTarget { ...@@ -65,7 +66,6 @@ message UpdateTarget {
repeated Argument arg = 4; repeated Argument arg = 4;
} }
// simply copy from caffe1
message TensorFiller { message TensorFiller {
optional string tensor = 1; optional string tensor = 1;
optional string type = 2 [default = 'constant']; optional string type = 2 [default = 'constant'];
...@@ -89,4 +89,5 @@ message GraphDef { ...@@ -89,4 +89,5 @@ message GraphDef {
repeated GradientTarget g_target = 8; repeated GradientTarget g_target = 8;
repeated UpdateTarget u_target = 9; repeated UpdateTarget u_target = 9;
optional bool debug_mode = 10 [default = false]; optional bool debug_mode = 10 [default = false];
optional bool share_grads = 11 [default = false];
} }
\ No newline at end of file
...@@ -650,19 +650,6 @@ template <> void TransposeGrad<float16, CPUContext>(const int count, ...@@ -650,19 +650,6 @@ template <> void TransposeGrad<float16, CPUContext>(const int count,
} }
} }
/******************** common.utils ********************/
template <> void OneHot<float, CPUContext>(const int count,
const int depth,
const int on_value,
const float* x,
float* y) {
for (int i = 0; i < count; ++i) {
const int val = x[i];
y[i * depth + val] = on_value;
}
}
/******************** loss.l1_loss ********************/ /******************** loss.l1_loss ********************/
template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) { template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, float* dx) {
...@@ -673,7 +660,7 @@ template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, flo ...@@ -673,7 +660,7 @@ template<> void AbsGrad<float, CPUContext>(const int count, const float* dy, flo
} }
} }
/******************** loss.sigmoid_cross_entropy_loss ********************/ /******************** loss.sigmoid_cross_entropy ********************/
template <> void SigmoidCrossEntropy<float, CPUContext>(const int count, template <> void SigmoidCrossEntropy<float, CPUContext>(const int count,
const float* x, const float* x,
...@@ -712,7 +699,7 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count, ...@@ -712,7 +699,7 @@ template<> void SmoothL1Grad<float, CPUContext>(const int count,
} }
} }
/******************** loss.softmax_cross_entropy_loss ********************/ /******************** loss.softmax_cross_entropy ********************/
template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count, template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count,
const float* prob, const float* prob,
...@@ -723,7 +710,7 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count, ...@@ -723,7 +710,7 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(const int count,
} }
} }
/******************** loss.softmax_loss ********************/ /******************** loss.sparse_softmax_cross_entropy ********************/
template <> void SparseSoftmaxCrossEntropy<float, CPUContext>(const int count, template <> void SparseSoftmaxCrossEntropy<float, CPUContext>(const int count,
const int classes, const int classes,
...@@ -735,8 +722,7 @@ template <> void SparseSoftmaxCrossEntropy<float, CPUContext>(const int count, ...@@ -735,8 +722,7 @@ template <> void SparseSoftmaxCrossEntropy<float, CPUContext>(const int count,
float* valid, float* valid,
Tensor* ignore) { Tensor* ignore) {
const int* ignores = ignore->count() > 0 ? const int* ignores = ignore->count() > 0 ?
ignore->data<int, CPUContext>() : ignore->data<int, CPUContext>() : nullptr;
nullptr;
const int dim = count / outer_dim; const int dim = count / outer_dim;
for (int i = 0; i < outer_dim; ++i) { for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) { for (int j = 0; j < inner_dim; ++j) {
...@@ -751,27 +737,25 @@ template <> void SparseSoftmaxCrossEntropy<float, CPUContext>(const int count, ...@@ -751,27 +737,25 @@ template <> void SparseSoftmaxCrossEntropy<float, CPUContext>(const int count,
} }
if (k == ignore->count()) { if (k == ignore->count()) {
float labeled_prob = prob[i * dim + label * inner_dim + j]; float labeled_prob = prob[i * dim + label * inner_dim + j];
labeled_prob = std::max(labeled_prob, FLT_MIN); loss[idx] = -std::log(std::max(labeled_prob, FLT_MIN));
loss[idx] = log(labeled_prob);
valid[idx] = 1; valid[idx] = 1;
} }
} }
} }
} }
template<> void SoftmaxLossGrad<float, CPUContext>(const int count, template<> void SparseSoftmaxCrossEntropyGrad<float, CPUContext>(const int count,
const int classes, const int classes,
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const float* labels,
const float* prob, const float* prob,
const float* labels,
float* valid, float* valid,
Tensor* ignore, Tensor* ignore,
float* dXdata) { float* dXdata) {
int dim = count / outer_dim; int dim = count / outer_dim;
const int* ignores = ignore->count() > 0 ? const int* ignores = ignore->count() > 0 ?
ignore->data <int, CPUContext>() : ignore->data <int, CPUContext>() : nullptr;
nullptr;
valid[0] = 0; valid[0] = 0;
for (int i = 0; i < outer_dim; ++i) { for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) { for (int j = 0; j < inner_dim; ++j) {
...@@ -790,6 +774,93 @@ template<> void SoftmaxLossGrad<float, CPUContext>(const int count, ...@@ -790,6 +774,93 @@ template<> void SoftmaxLossGrad<float, CPUContext>(const int count,
} }
} }
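The body of the renamed gradient kernel is elided by this hunk, but judging from its inputs it presumably computes the textbook softmax cross-entropy derivative with respect to the logits $z$, zeroing positions whose label is in the ignore list and counting the rest into `valid`:

$$\frac{\partial \ell}{\partial z_c} \;=\; p_c \;-\; \mathbf{1}[c = t]$$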
/******************** loss.sparse_softmax_focal_loss ********************/
template <> void SparseSoftmaxFocalLoss<float, CPUContext>(const int count,
const int classes,
const int outer_dim,
const int inner_dim,
const float alpha,
const float gamma,
const float* prob,
const float* labels,
float* scale,
float* loss,
float* valid,
Tensor* ignore) {
const int* ignores = ignore->count() > 0 ?
ignore->data<int, CPUContext>() : nullptr;
const int dim = count / outer_dim;
for (int i = 0; i < count; ++i) {
scale[i] = alpha * std::pow((1.0f - prob[i]), gamma);
}
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) {
const int idx = i * inner_dim + j;
const int label = labels[idx];
int k;
for (k = 0; k < ignore->count(); ++k) {
if (label == ignores[k]) {
loss[idx] = valid[idx] = 0;
break;
}
}
if (k == ignore->count()) {
const int t_ = i * dim + label * inner_dim + j;
float labeled_prob = prob[t_];
loss[idx] = -scale[t_] * std::log(std::max(labeled_prob, FLT_MIN));
valid[idx] = 1;
}
}
}
}
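In math form, the kernel above is the focal-loss weighting of softmax cross-entropy: with $p$ the softmax probabilities and $t$ the labeled class at each position,

$$s_i = \alpha\,(1 - p_i)^{\gamma}, \qquad \ell = -\,s_t\,\log\big(\max(p_t,\ \mathrm{FLT\_MIN})\big),$$

and both the loss and the valid flag are zeroed when the label is ignored; setting $\gamma = 0,\ \alpha = 1$ recovers plain cross-entropy.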
template<> void SparseSoftmaxFocalLossGrad<float, CPUContext>(const int count,
const int classes,
const int outer_dim,
const int inner_dim,
const float gamma,
const float eps,
const float* scale,
const float* prob,
const float* labels,
float* valid,
Tensor* ignore,
float* dXdata) {
int dim = count / outer_dim;
const int* ignores = ignore->count() > 0 ?
ignore->data <int, CPUContext>() : nullptr;
valid[0] = 0;
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) {
const int label = labels[i * inner_dim + j];
int k;
for (k = 0; k < ignore->count(); ++k)
if (label == ignores[k]) break;
if (k != ignore->count()) {
for (int c = 0; c < classes; ++c)
dXdata[i * dim + c * inner_dim + j] = 0;
} else {
const int t_ = i * dim + label * inner_dim + j;
float grad = -gamma * (scale[t_] / std::max((1.0f - prob[t_]), eps))
* std::log(std::max(prob[t_], FLT_MIN))
* prob[t_] + scale[t_];
for (int c = 0; c < classes; ++c) {
const int i_ = i * dim + c * inner_dim + j;
if (c == label) {
dXdata[i_] = grad * (prob[t_] - 1);
} else {
dXdata[i_] = grad * prob[i_];
}
}
valid[0]++;
}
}
}
}
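The backward kernel folds the derivative of the focal weight into one scalar and reuses the usual softmax gradient pattern; in the code's notation,

$$g \;=\; -\,\gamma\,\frac{s_t}{\max(1 - p_t,\ \varepsilon)}\,\log\big(\max(p_t,\ \mathrm{FLT\_MIN})\big)\,p_t \;+\; s_t, \qquad \frac{\partial \ell}{\partial z_c} \;=\; g\,\big(p_c - \mathbf{1}[c = t]\big),$$

so $\gamma = 0$ gives $g = s_t = \alpha$ and degenerates to the scaled cross-entropy gradient, while positions with ignored labels receive an all-zero gradient column.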
/******************** recurrent.lstm_uint ********************/ /******************** recurrent.lstm_uint ********************/
template <> void LSTMUnit<float, CPUContext>(const int count, template <> void LSTMUnit<float, CPUContext>(const int count,
...@@ -1013,6 +1084,19 @@ template <> void MemoryData<uint8_t, float16, CPUContext>(const int count, ...@@ -1013,6 +1084,19 @@ template <> void MemoryData<uint8_t, float16, CPUContext>(const int count,
LOG(FATAL) << "unsupport float16 with CPU"; LOG(FATAL) << "unsupport float16 with CPU";
} }
/******************** utils.one_hot ********************/
template <> void OneHot<float, CPUContext>(const int count,
const int depth,
const int on_value,
const float* x,
float* y) {
for (int i = 0; i < count; ++i) {
const int val = x[i];
y[i * depth + val] = on_value;
}
}
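A minimal usage sketch of the relocated CPU kernel (assuming, as the write-only loop implies, that the caller pre-fills `y` with the off value):

```cpp
#include <vector>

void OneHotExample() {
    std::vector<float> x = {2.f, 0.f, 1.f};  // class indices stored as float
    std::vector<float> y(3 * 4, 0.f);        // count=3 rows, depth=4, pre-filled off value
    OneHot<float, CPUContext>(3, /* depth = */ 4, /* on_value = */ 1,
                              x.data(), y.data());
    // y rows become {0,0,1,0}, {1,0,0,0}, {0,1,0,0}
}
```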
/******************** vision.conv ********************/ /******************** vision.conv ********************/
template <> void Im2Col<float, CPUContext>(const int channels, template <> void Im2Col<float, CPUContext>(const int channels,
......
...@@ -1204,34 +1204,6 @@ template <> void TransposeGrad<float16, CUDAContext>(const int count, ...@@ -1204,34 +1204,6 @@ template <> void TransposeGrad<float16, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
/******************** common.utils ********************/
template <typename T>
__global__ void _OneHot(const int count,
const int depth,
const int on_value,
const float* x,
float* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int val = x[idx];
y[idx * depth + val] = on_value;
}
}
template <> void OneHot<float, CUDAContext>(const int count,
const int depth,
const int on_value,
const float* x,
float* y) {
_OneHot<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
depth,
on_value,
x,
y);
CUDA_POST_KERNEL_CHECK;
}
/******************** loss.l1_loss ********************/ /******************** loss.l1_loss ********************/
template <typename T> template <typename T>
...@@ -1248,7 +1220,7 @@ template<> void AbsGrad<float, CUDAContext>(const int count, const float* dy, fl ...@@ -1248,7 +1220,7 @@ template<> void AbsGrad<float, CUDAContext>(const int count, const float* dy, fl
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
/******************** loss.sigmoid_cross_entropy_loss ********************/ /******************** loss.sigmoid_cross_entropy ********************/
template <typename T> template <typename T>
__global__ void _SigmoidCrossEntropy(const int count, __global__ void _SigmoidCrossEntropy(const int count,
...@@ -1311,7 +1283,7 @@ template<> void SmoothL1Grad<float, CUDAContext>(const int count, ...@@ -1311,7 +1283,7 @@ template<> void SmoothL1Grad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
/******************** loss.softmax_cross_entropy_loss ********************/ /******************** loss.softmax_cross_entropy ********************/
template <typename T> template <typename T>
__global__ void _SoftmaxCrossEntropy(const int count, __global__ void _SoftmaxCrossEntropy(const int count,
...@@ -1334,7 +1306,7 @@ template <> void SoftmaxCrossEntropy<float, CUDAContext>(const int count, ...@@ -1334,7 +1306,7 @@ template <> void SoftmaxCrossEntropy<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
/******************** loss.softmax_loss ********************/ /******************** loss.sparse_softmax_cross_entropy ********************/
template <typename T> template <typename T>
__global__ void _SparseSoftmaxCrossEntropy(const int count, __global__ void _SparseSoftmaxCrossEntropy(const int count,
...@@ -1375,8 +1347,7 @@ template <> void SparseSoftmaxCrossEntropy<float, CUDAContext>(const int count, ...@@ -1375,8 +1347,7 @@ template <> void SparseSoftmaxCrossEntropy<float, CUDAContext>(const int count,
float* valid, float* valid,
Tensor* ignore) { Tensor* ignore) {
const int* ignores = ignore->count() > 0 ? const int* ignores = ignore->count() > 0 ?
ignore->data<int, CUDAContext>() : ignore->data<int, CUDAContext>() : nullptr;
nullptr;
const int num_preds = outer_dim * inner_dim; const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy<float> << <GET_BLOCKS(num_preds), CUDA_NUM_THREADS >> >(num_preds, _SparseSoftmaxCrossEntropy<float> << <GET_BLOCKS(num_preds), CUDA_NUM_THREADS >> >(num_preds,
prob, prob,
...@@ -1391,7 +1362,7 @@ template <> void SparseSoftmaxCrossEntropy<float, CUDAContext>(const int count, ...@@ -1391,7 +1362,7 @@ template <> void SparseSoftmaxCrossEntropy<float, CUDAContext>(const int count,
} }
template <typename T> template <typename T>
__global__ void _SoftmaxLossGrad(const int count, __global__ void _SparseSoftmaxCrossEntropyGrad(const int count,
const T* prob, const T* prob,
const T* labels, const T* labels,
T* dx, T* dx,
...@@ -1418,12 +1389,12 @@ __global__ void _SoftmaxLossGrad(const int count, ...@@ -1418,12 +1389,12 @@ __global__ void _SoftmaxLossGrad(const int count,
} }
} }
template<> void SoftmaxLossGrad<float, CUDAContext>(const int count, template<> void SparseSoftmaxCrossEntropyGrad<float, CUDAContext>(const int count,
const int classes, const int classes,
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const float* labels,
const float* prob, const float* prob,
const float* labels,
float* valid, float* valid,
Tensor* ignore, Tensor* ignore,
float* dXdata) { float* dXdata) {
...@@ -1431,7 +1402,155 @@ template<> void SoftmaxLossGrad<float, CUDAContext>(const int count, ...@@ -1431,7 +1402,155 @@ template<> void SoftmaxLossGrad<float, CUDAContext>(const int count,
ignore->data <int, CUDAContext >() : ignore->data <int, CUDAContext >() :
nullptr; nullptr;
const int num_preds = outer_dim * inner_dim; const int num_preds = outer_dim * inner_dim;
_SoftmaxLossGrad<float> << <GET_BLOCKS(num_preds), CUDA_NUM_THREADS >> >(num_preds, _SparseSoftmaxCrossEntropyGrad<float> << <GET_BLOCKS(num_preds), CUDA_NUM_THREADS >> >(num_preds,
prob,
labels,
dXdata,
classes,
inner_dim,
ignores,
ignore->count(),
valid);
CUDA_POST_KERNEL_CHECK;
}
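All of these launchers share the `GET_BLOCKS(n)` / `CUDA_NUM_THREADS` pattern. The helpers are not shown in this diff, but they are presumably the usual Caffe-style ceil-division pair (the constant here is an assumption):

```cpp
// Assumed launch helpers, in the Caffe style (not part of this commit).
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int n) {
    // one thread per element, rounded up to whole blocks
    return (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
```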
/******************** loss.sparse_softmax_focal_loss ********************/
template <typename T>
__global__ void _FocalScale(const int count,
const float alpha,
const float gamma,
const T* prob,
T* scale) {
CUDA_KERNEL_LOOP(idx, count) {
scale[idx] = alpha * std::pow((1.0f - prob[idx]), gamma);
}
}
template <typename T>
__global__ void _SparseSoftmaxFocalLoss(const int count,
const T* scale,
const T* prob,
const T* labels,
T* loss,
const int classes,
const int inner_dim,
const int* ignores,
const int ignore_num,
T* valid) {
CUDA_KERNEL_LOOP(idx, count) {
const int o_idx = idx / inner_dim;
const int i_idx = idx % inner_dim;
const int label = labels[o_idx * inner_dim + i_idx];
int k;
for (k = 0; k < ignore_num; k++) {
if (label == ignores[k]) {
loss[idx] = valid[idx] = 0;
break;
}
}
if (k == ignore_num) {
const int t_ = (o_idx * classes + label) * inner_dim + i_idx;
loss[idx] = -scale[t_] * std::log(max(prob[t_], FLT_MIN));
valid[idx] = 1;
}
}
}
template <> void SparseSoftmaxFocalLoss<float, CUDAContext>(const int count,
const int classes,
const int outer_dim,
const int inner_dim,
const float alpha,
const float gamma,
const float* prob,
const float* labels,
float* scale,
float* loss,
float* valid,
Tensor* ignore) {
const int* ignores = ignore->count() > 0 ?
ignore->data<int, CUDAContext>() : nullptr;
const int num_preds = outer_dim * inner_dim;
_FocalScale<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
alpha,
gamma,
prob,
scale);
_SparseSoftmaxFocalLoss<float> << <GET_BLOCKS(num_preds), CUDA_NUM_THREADS >> >(num_preds,
scale,
prob,
labels,
loss,
classes,
inner_dim,
ignores,
ignore->count(),
valid);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
__global__ void _SparseSoftmaxFocalLossGrad(const int count,
const float gamma,
const float eps,
const T* scale,
const T* prob,
const T* labels,
T* dx,
const int classes,
const int inner_dim,
const int* ignores,
const int ignore_num,
T* valid) {
CUDA_KERNEL_LOOP(idx, count) {
const int o_idx = idx / inner_dim;
const int i_idx = idx % inner_dim;
const int label = labels[o_idx * inner_dim + i_idx];
int k;
for (k = 0; k < ignore_num; k++)
if (label == ignores[k]) break;
if (k != ignore_num) {
for (int c = 0; c < classes; c++)
dx[(o_idx * classes + c) * inner_dim + i_idx] = 0;
valid[idx] = 0;
} else {
const int t_ = (o_idx * classes + label) * inner_dim + i_idx;
T grad = -gamma * (scale[t_] / max((1.0f - prob[t_]), eps))
* std::log(max(prob[t_], FLT_MIN))
* prob[t_] + scale[t_];
for (int c = 0; c < classes; c++) {
const int i_ = (o_idx * classes + c) * inner_dim + i_idx;
if (c == label) {
dx[i_] = grad * (prob[t_] - 1);
} else {
dx[i_] = grad * prob[i_];
}
}
valid[idx] = 1;
}
}
}
template<> void SparseSoftmaxFocalLossGrad<float, CUDAContext>(const int count,
const int classes,
const int outer_dim,
const int inner_dim,
const float gamma,
const float eps,
const float* scale,
const float* prob,
const float* labels,
float* valid,
Tensor* ignore,
float* dXdata) {
const int* ignores = ignore->count() > 0 ?
ignore->data <int, CUDAContext >() : nullptr;
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxFocalLossGrad<float> << <GET_BLOCKS(num_preds), CUDA_NUM_THREADS >> >(num_preds,
gamma,
eps,
scale,
prob, prob,
labels, labels,
dXdata, dXdata,
...@@ -1847,6 +1966,34 @@ template <> void MemoryData<uint8_t, float16, CUDAContext>(const int count, ...@@ -1847,6 +1966,34 @@ template <> void MemoryData<uint8_t, float16, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
/******************** utils.one_hot ********************/
template <typename T>
__global__ void _OneHot(const int count,
const int depth,
const int on_value,
const float* x,
float* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int val = x[idx];
y[idx * depth + val] = on_value;
}
}
template <> void OneHot<float, CUDAContext>(const int count,
const int depth,
const int on_value,
const float* x,
float* y) {
_OneHot<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
depth,
on_value,
x,
y);
CUDA_POST_KERNEL_CHECK;
}
/******************** vision.conv ********************/ /******************** vision.conv ********************/
template<typename T> template<typename T>
......
...@@ -164,19 +164,38 @@ dragon.config.EnableCPU() ...@@ -164,19 +164,38 @@ dragon.config.EnableCPU()
dragon.config.EnableCUDA(device_id, use_cudnn=True) dragon.config.EnableCUDA(device_id, use_cudnn=True)
``` ```
### Automatic Memory Optimization(AMC) ### Memonger
Dragon is an extremely memory-efficient framework.
It can drop intermediate results (the mirror stage) during the forward phase and share gradients during the backward phase,
using roughly 25% and 50% of the memory consumed by Caffe and TensorFlow, respectively.
To use it, just:
 
```Shell ```Shell
import dragon.config import dragon.memonger as opt
dragon.config.SetDebugMode(False) ```
- ShareGrads
```Shell
opt.share_grads()
``` ```
This option makes all gradients share one global tensor (which makes debugging intractable). - Drop
```Shell
import dragon.ops as ops
y = opt.drop(ops.Relu, x)
```
which yields roughly 50% of the original memory usage at the cost of about 15% slower training.
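Both options compose; a typical combined use might look like this (`conv1` is a hypothetical tensor, for illustration only):

```Shell
import dragon.memonger as opt
import dragon.ops as ops

opt.share_grads()              # all gradients reuse one global buffer
h = opt.drop(ops.Relu, conv1)  # activation dropped in forward, recomputed for backward
```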
### Scope ### Scope
As a graph-based framework, Dragon supports various scopes.
- NameScope - NameScope
```Shell ```Shell
......