Commit 96f7277e by Ting PAN

Add Cambricon's CNML Context

1 parent 5cd0761b
Showing with 4958 additions and 1148 deletions
------------------------------------------------------------------------
The list of most significant changes made over time in Dragon.
Dragon 0.2.2.12 (20181120)
DRAGON_VERSION == 2212
Changes (w.r.t. Dragon 0.2.2.11):
Preview Features:
- Added Cambricon's CNML context.
- Added the support for Int8(Char) Tensor.
- Removed the cuda device id query from pointer.
- Added ``DropBlock2dOp``
- Added ``MaximumOp``, ``MinimumOp``, ``NLLLossOp``.
- Added CuDNN support for ``BiasAddOp``.
- Optimized memory usage of ``DropoutOp``.
- Replaced ``thread_local`` with platform TLS solution.
- Changed the default norm eps from 1e-3 to 1e-5,
affected: ``BatchNorm``, ``BatchRenorm``, ``GroupNorm``, ``InstanceNorm``, ``L2Norm``.
- Enforced CUDA FP16 support (i.e. Removed ``WITH_CUDA_FP16``).
- [PyTorch] Added ``torch.one_hot``.
- [PyTorch] Added ``torch.log``, ``Tensor.log``, ``torch.exp`` and ``Tensor.exp``.
- [PyTorch] Added ``torch.minimum``, ``torch.maximum``,
``torch.clamp``, ``Tensor.clamp``, ``Tensor.clamp_``.
- [PyTorch] Added ``nn.ELU`` and ``nn.SELU``.
- [PyTorch] Added ``nn.GroupNorm``.
- [PyTorch] Added ``nn.NLLLoss``, ``nn.BCEWithLogitsLoss``,
``nn.L1Loss``, ``nn.MSELoss``, ``nn.SmoothL1Loss``.
- [PyTorch] Added ``nn.DropBlock2d``.
- [PyTorch] Added ``train`` and ``eval`` mode for Module,
affected: ``nn.BatchNorm``, ``nn.Dropout``.
- [PyTorch] Deprecated the ``size_average`` and ``reduce`` in
``nn.Loss``, added ``reduction`` instead.
- [PyTorch] ``torch.save`` can save both ``torch.Tensor`` and other pickle values.
- [PyCaffe] Added ``DropBlockLayer``.
Bugs fixed:
- Fixed the uncomputed output in ``BiasAddGradientOp``.
- Fixed the incorrect gradients of ``ClipGradientOp``.
- Fixed the wrong results of ``math::Inv`` under ``CPUContext``.
- Fixed the issue that the default device is used on initializing NCCL.
- Removed the strictly shape check in ``SmoothL1Op``.
- Fixed wrong CXX API exporting under Win32.
- [PyTorch] Fixed an issue that multiple ``GradientGather`` are triggered by one Operator.
- [PyTorch] Fixed the schema check for in-place fundamental ops.
- [PyTorch] Fixed the missing shape and dtype after ``Tensor.copy_``.
- [PyTorch] Fixed an issue that ``Tensor.fill_`` and ``Tensor.zero_``
will change the data type of a non-empty Tensor.
- [PyTorch] Fixed the Python2 Int(s) check.
------------------------------------------------------------------------
\ No newline at end of file
...@@ -8,10 +8,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ...@@ -8,10 +8,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
unzip \ unzip \
ssh \ ssh \
vim \ vim \
libtbb-dev \
libsdl2-dev \
libnuma-dev \ libnuma-dev \
libprotobuf-dev \ libprotobuf-dev \
protobuf-compiler \ protobuf-compiler \
libopencv-dev \
libopenblas-dev \ libopenblas-dev \
libboost-all-dev \
python3-pip \ python3-pip \
python3-dev \ python3-dev \
python3-pyqt4 \ python3-pyqt4 \
...@@ -40,3 +44,5 @@ RUN git clone https://github.com/seetaresearch/Dragon.git && \ ...@@ -40,3 +44,5 @@ RUN git clone https://github.com/seetaresearch/Dragon.git && \
wget http://dragon.seetatech.com/download/docker/ubuntu-16.04-cpu-openblas/CMakeLists.txt && \ wget http://dragon.seetatech.com/download/docker/ubuntu-16.04-cpu-openblas/CMakeLists.txt && \
mkdir build && cd build && cmake .. && make install -j8 && cd .. && rm -rf build && \ mkdir build && cd build && cmake .. && make install -j8 && cd .. && rm -rf build && \
cd python && python3 setup.py install cd python && python3 setup.py install
RUN rm /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python && ln -s /usr/bin/pip3 /usr/bin/pip
\ No newline at end of file
...@@ -9,10 +9,14 @@ RUN rm /etc/apt/sources.list.d/cuda.list && rm /etc/apt/sources.list.d/nvidia-ml ...@@ -9,10 +9,14 @@ RUN rm /etc/apt/sources.list.d/cuda.list && rm /etc/apt/sources.list.d/nvidia-ml
unzip \ unzip \
ssh \ ssh \
vim \ vim \
libtbb-dev \
libsdl2-dev \
libnuma-dev \ libnuma-dev \
libprotobuf-dev \ libprotobuf-dev \
protobuf-compiler \ protobuf-compiler \
libopencv-dev \
libopenblas-dev \ libopenblas-dev \
libboost-all-dev \
libnccl2 \ libnccl2 \
libnccl-dev \ libnccl-dev \
python3-pip \ python3-pip \
...@@ -43,3 +47,5 @@ RUN git clone https://github.com/seetaresearch/Dragon.git && \ ...@@ -43,3 +47,5 @@ RUN git clone https://github.com/seetaresearch/Dragon.git && \
wget http://dragon.seetatech.com/download/docker/ubuntu-16.04-cuda9.0-cudnn7/CMakeLists.txt && \ wget http://dragon.seetatech.com/download/docker/ubuntu-16.04-cuda9.0-cudnn7/CMakeLists.txt && \
mkdir build && cd build && cmake .. && make install -j8 && cd .. && rm -rf build && \ mkdir build && cd build && cmake .. && make install -j8 && cd .. && rm -rf build && \
cd python && python3 setup.py install cd python && python3 setup.py install
RUN rm /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python && ln -s /usr/bin/pip3 /usr/bin/pip
\ No newline at end of file
...@@ -17,7 +17,6 @@ option(WITH_SSE "Set ON to use SSE 4.1" ON) ...@@ -17,7 +17,6 @@ option(WITH_SSE "Set ON to use SSE 4.1" ON)
option(WITH_MPI "Set ON to use MPI" OFF) option(WITH_MPI "Set ON to use MPI" OFF)
option(WITH_MPI_CUDA "Set ON to use MPI-CUDA" OFF) option(WITH_MPI_CUDA "Set ON to use MPI-CUDA" OFF)
option(WITH_MPI_NCCL "Set ON to use MPI-NCCL" OFF) option(WITH_MPI_NCCL "Set ON to use MPI-NCCL" OFF)
option(WITH_CUDA_FP16 "Set ON to use FP16" ON)
# Set your 3rdparty # Set your 3rdparty
set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty) set(3RDPARTY_DIR ${PROJECT_SOURCE_DIR}/../3rdparty)
...@@ -163,10 +162,6 @@ if (WITH_MPI_NCCL) ...@@ -163,10 +162,6 @@ if (WITH_MPI_NCCL)
ADD_DEFINITIONS(-DWITH_MPI_NCCL) ADD_DEFINITIONS(-DWITH_MPI_NCCL)
message(STATUS "Use MPI-NCCL [Optional]") message(STATUS "Use MPI-NCCL [Optional]")
endif() endif()
if (WITH_CUDA_FP16)
ADD_DEFINITIONS(-DWITH_CUDA_FP16)
message(STATUS "Use CUDA FP16 [Optional]")
endif()
# ---[ Flags # ---[ Flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#define DRAGON_CORE_COMMON_H_ #define DRAGON_CORE_COMMON_H_
#include <ctime> #include <ctime>
#include <random>
#include <climits> #include <climits>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -49,25 +50,35 @@ using Map = std::unordered_map<Key, Value>; ...@@ -49,25 +50,35 @@ using Map = std::unordered_map<Key, Value>;
template <typename Value> template <typename Value>
using Set = std::unordered_set<Value> ; using Set = std::unordered_set<Value> ;
/* /* * * * * * * * * * * * * * * * * * * * *
* Define the Kernel version. * *
* * Kernel Version *
* | Major(2) | Minor(2) | Patch(11) | * *
*/ * Major(2) | Minor(2) | Patch(12) *
#define DRAGON_VERSION 2211 * *
* * * * * * * * * * * * * * * * * * * * */
#define DRAGON_VERSION 2212
/* * * * * * * * * * * * * * * * * * * * *
* *
* Default Random Seed *
* *
* * * * * * * * * * * * * * * * * * * * */
/*
* Define the default random seed.
*/
#define DEFAULT_RNG_SEED 3 #define DEFAULT_RNG_SEED 3
/* /* * * * * * * * * * * * * * * * * * * * *
* Define the common marcos. * *
*/ * Macros *
#ifdef _MSC_VER * *
#if _MSC_VER < 1900 * * * * * * * * * * * * * * * * * * * * */
#define thread_local __declspec(thread)
#endif // avoid using of "thread_local" for VS2013 or older Xcode
#if defined(__clang__) || defined(__GNUC__)
#define TLS_OBJECT __thread
#else
#define TLS_OBJECT __declspec(thread)
#endif #endif
#define CONCATENATE_IMPL(s1, s2) s1##s2 #define CONCATENATE_IMPL(s1, s2) s1##s2
......
...@@ -12,15 +12,8 @@ ...@@ -12,15 +12,8 @@
#ifndef DRAGON_CORE_CONTEXT_H_ #ifndef DRAGON_CORE_CONTEXT_H_
#define DRAGON_CORE_CONTEXT_H_ #define DRAGON_CORE_CONTEXT_H_
#include <random>
#include <ctime>
#include "core/common.h" #include "core/common.h"
#ifdef WITH_CUDA
#include "utils/cuda_device.h"
#endif
namespace dragon { namespace dragon {
class CPUContext { class CPUContext {
...@@ -45,7 +38,7 @@ class CPUContext { ...@@ -45,7 +38,7 @@ class CPUContext {
#else #else
data = malloc(nbytes); data = malloc(nbytes);
#endif #endif
CHECK(data) << "Malloc mem: " << nbytes << " bytes failed."; CHECK(data) << "\nMalloc mem: " << nbytes << " bytes failed.";
return data; return data;
} }
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_CORE_CONTEXT_CNML_H_
#define DRAGON_CORE_CONTEXT_CNML_H_

/* CAMBRICON's CNRT && CNML Environment */

#include "core/common.h"

// Opaque handles from the CNRT/CNML SDK, forward-declared so that
// including this header does not require the vendor headers.
struct cnrtStream;
struct cnmlCpuTensor;
struct cnmlTensor;
struct cnmlFusionOp;
typedef struct cnrtStream* cnrtStream_t;
typedef struct cnmlCpuTensor* cnmlCpuTensor_t;
typedef struct cnmlTensor* cnmlTensor_t;
typedef struct cnmlFusionOp* cnmlFusionOp_t;

namespace dragon {

class CNRTObject;

/*!
 * Device context for Cambricon's CNML library.
 *
 * Mirrors the CPUContext/CUDAContext interface so operators
 * templated on a Context type can also run on MLU devices.
 */
class CNMLContext {
 public:
    /*! Construct from a DeviceOption, taking its device id and
     *  random seed (or DEFAULT_RNG_SEED if the option has none). */
    CNMLContext(const DeviceOption& option)
        : device_id_(option.device_id()),
          random_seed_(option.has_random_seed() ?
              option.random_seed() : DEFAULT_RNG_SEED) {
        CHECK_EQ(option.device_type(), CNML);
    }

    /*! Construct for a given device id with the default random seed. */
    CNMLContext(const int device_id = 0)
        : device_id_(device_id),
          random_seed_(DEFAULT_RNG_SEED) {}

    /*! Bind this context to its device and select a stream. */
    void SwitchToDevice(int stream_id);

    /*! Stream 0 is reserved for the default stream,
     *  so stream 1 is the first general-purpose stream. */
    inline void SwitchToDevice() { SwitchToDevice(1); }

    /*! Block until queued device work has finished.
     *  (Spelling follows the same-named CUDAContext method.) */
    void FinishDeviceCompution();

    /*! Allocate nbytes of device memory. */
    static void* New(size_t nbytes);

    /*! Zero-fill nbytes at ptr on the device. */
    static void Memset(
        size_t              nbytes,
        void*               ptr);

    /*! Asynchronous variant of Memset; currently synchronous. */
    inline void MemsetAsync(
        size_t              nbytes,
        void*               ptr) {
        Memset(nbytes, ptr);
    }

    /*! Copy nbytes from src to dst across the given contexts. */
    template<class DstContext, class SrcContext>
    static void Memcpy(
        size_t              nbytes,
        void*               dst,
        const void*         src);

    /*! Asynchronous variant of Memcpy; currently synchronous. */
    template<class DstContext, class SrcContext>
    inline void MemcpyAsync(
        size_t              nbytes,
        void*               dst,
        const void*         src) {
        // BUGFIX: forward the arguments in declaration order
        // (nbytes, dst, src); the previous (dst, src, nbytes) order
        // did not match Memcpy's signature and fails to compile
        // when instantiated (void* is not convertible to size_t).
        Memcpy<DstContext, SrcContext>(nbytes, dst, src);
    }

    /*! Free device memory allocated by New(). */
    static void Delete(void* data);

    /*! Return the device id bound to this context. */
    inline int device_id() const { return device_id_; }

    /*! Select the stream used by subsequent calls. */
    inline void set_stream_id(int stream_id) { stream_id_ = stream_id; }

    /*! Return the CNRT stream of this context. */
    inline cnrtStream_t cnrt_stream() {
        return cnrt_stream(device_id_, stream_id_);
    }

    /*! Return the CNRT stream of the given device/stream pair. */
    static cnrtStream_t cnrt_stream(
        int                 device_id,
        int                 stream_id);

    /*! Global mutex guarding lazy creation of shared handles. */
    static std::mutex& mutex() { static std::mutex m; return m; }

    /*! Per-thread CNRT bookkeeping object.
     *  NOTE(review): CUDAContext replaced ``thread_local`` with the
     *  TLS_OBJECT macro in this commit; consider doing the same here
     *  for VS2013/old-Xcode compatibility — confirm with the .cc. */
    static thread_local CNRTObject cnrt_object_;

 private:
    int device_id_, stream_id_ = 1, random_seed_;
    unique_ptr<std::mt19937> rand_generator_;
};

}  // namespace dragon

#endif  // DRAGON_CORE_CONTEXT_CNML_H_
\ No newline at end of file
...@@ -12,8 +12,9 @@ ...@@ -12,8 +12,9 @@
#ifndef DRAGON_CORE_CONTEXT_CUDA_H_ #ifndef DRAGON_CORE_CONTEXT_CUDA_H_
#define DRAGON_CORE_CONTEXT_CUDA_H_ #define DRAGON_CORE_CONTEXT_CUDA_H_
/* NVIDIA's CUDA Environment */
#include "core/common.h" #include "core/common.h"
#include "core/context.h"
#include "utils/cuda_device.h" #include "utils/cuda_device.h"
#include "utils/cudnn_device.h" #include "utils/cudnn_device.h"
...@@ -52,13 +53,13 @@ class CUDAObject { ...@@ -52,13 +53,13 @@ class CUDAObject {
} }
// follow the caffe2, // follow the caffe2,
// each device takes a group of non-bl0cking streams // each device takes a group of non-blocking streams
// the stream 0 is reserved for default stream, // the stream 0 is reserved for default stream,
// as some computations really require it, // as some computations really require it,
// e.g. cublas.asum() and mixed cpu/cuda operations // e.g. cublas.asum() and mixed cpu/cuda operations
// besides, somes calls, such as cudnn.conv() and cudnn.rnn(), // besides, somes calls, such as cudnn.conv() and cudnn.rnn(),
// produce wrong results if running them on non-blocking streams // produce wrong results if running them on non-blocking streams
// note that caffe2 also use default streams (within CuDNNState) // note that caffe2 also uses default streams (within CuDNNState)
cudaStream_t GetStream(int device_id, int stream_id) { cudaStream_t GetStream(int device_id, int stream_id) {
vector<cudaStream_t>& dev_streams = cuda_streams[device_id]; vector<cudaStream_t>& dev_streams = cuda_streams[device_id];
if (dev_streams.size() <= (unsigned)stream_id) if (dev_streams.size() <= (unsigned)stream_id)
...@@ -140,7 +141,7 @@ class CUDAContext { ...@@ -140,7 +141,7 @@ class CUDAContext {
inline static void* New(size_t nbytes) { inline static void* New(size_t nbytes) {
void* data; void* data;
cudaMalloc(&data, nbytes); cudaMalloc(&data, nbytes);
CHECK(data) << "Malloc cuda mem: " CHECK(data) << "\nMalloc cuda mem: "
<< nbytes << " bytes failed."; << nbytes << " bytes failed.";
return data; return data;
} }
...@@ -199,11 +200,11 @@ class CUDAContext { ...@@ -199,11 +200,11 @@ class CUDAContext {
static cudaStream_t cuda_stream( static cudaStream_t cuda_stream(
int device_id, int device_id,
int stream_id) { int stream_id) {
return cuda_object_.GetStream(device_id, stream_id); return cuda_object()->GetStream(device_id, stream_id);
} }
cublasHandle_t cublas_handle() { cublasHandle_t cublas_handle() {
return cuda_object_.GetCuBLASHandle(device_id_, stream_id_); return cuda_object()->GetCuBLASHandle(device_id_, stream_id_);
} }
inline std::mt19937* rand_generator() { inline std::mt19937* rand_generator() {
...@@ -227,13 +228,17 @@ class CUDAContext { ...@@ -227,13 +228,17 @@ class CUDAContext {
#ifdef WITH_CUDNN #ifdef WITH_CUDNN
cudnnHandle_t cudnn_handle() { cudnnHandle_t cudnn_handle() {
return cuda_object_.GetCuDNNHandle(device_id_, stream_id_); return cuda_object()->GetCuDNNHandle(device_id_, stream_id_);
} }
#endif #endif
static std::mutex& mutex() { static std::mutex m; return m; } static std::mutex& mutex() { static std::mutex m; return m; }
static thread_local CUDAObject cuda_object_; static CUDAObject* cuda_object() {
static TLS_OBJECT CUDAObject* cuda_object_;
if (!cuda_object_) cuda_object_ = new CUDAObject();
return cuda_object_;
}
private: private:
int device_id_, stream_id_ = 1, random_seed_; int device_id_, stream_id_ = 1, random_seed_;
......
...@@ -48,10 +48,10 @@ class GraphBase { ...@@ -48,10 +48,10 @@ class GraphBase {
Workspace* ws_; Workspace* ws_;
}; };
class Graph final : public GraphBase { class Graph : public GraphBase {
public: public:
Graph(const GraphDef& meta_graph, Workspace* ws); Graph(const GraphDef& meta_graph, Workspace* ws);
~Graph() { for (auto* op : ops_) delete op; } virtual ~Graph() { for (auto* op : ops_) delete op; }
bool Create( bool Create(
const GraphDef& optimized_graph, const GraphDef& optimized_graph,
...@@ -73,7 +73,7 @@ class Graph final : public GraphBase { ...@@ -73,7 +73,7 @@ class Graph final : public GraphBase {
inline Workspace* ws() const { return ws_; } inline Workspace* ws() const { return ws_; }
private: protected:
void ForwardShareDyeing(string u, string ancestor); void ForwardShareDyeing(string u, string ancestor);
void ForwardPruneDyeing( void ForwardPruneDyeing(
string u, string u,
...@@ -98,6 +98,9 @@ DECLARE_REGISTRY( ...@@ -98,6 +98,9 @@ DECLARE_REGISTRY(
const GraphDef&, const GraphDef&,
Workspace*); Workspace*);
#define REGISTER_GRAPH(name, ...) \
REGISTER_CLASS(GraphRegistry, name, __VA_ARGS__)
} // namespace dragon } // namespace dragon
#endif // DRAGON_CORE_GRAPH_H_ #endif // DRAGON_CORE_GRAPH_H_
\ No newline at end of file
...@@ -12,30 +12,49 @@ ...@@ -12,30 +12,49 @@
#ifndef DRAGON_CORE_MIXEDMEM_H_ #ifndef DRAGON_CORE_MIXEDMEM_H_
#define DRAGON_CORE_MIXEDMEM_H_ #define DRAGON_CORE_MIXEDMEM_H_
#include "context.h" #include "core/context.h"
#include "context_cuda.h" #include "core/context_cuda.h"
#include "core/context_cnml.h"
namespace dragon { namespace dragon {
typedef enum {
NCHW,
NHWC,
} DataOrder;
class MixedMemory { class MixedMemory {
public: public:
enum State { typedef enum {
UNINITIALIZED, UNINITIALIZED,
STATE_AT_CPU, STATE_AT_CPU,
STATE_AT_CUDA, STATE_AT_CUDA,
STATE_AT_CNML,
SWITCHED, SWITCHED,
SYNCED }; SYNCED,
} State;
MixedMemory() : cpu_ptr_(nullptr), cuda_ptr_(nullptr) {} MixedMemory() : cpu_ptr_(nullptr),
cuda_ptr_(nullptr), cnml_ptr_(nullptr) {}
MixedMemory(const TypeMeta& meta, const size_t nbytes) MixedMemory(const TypeMeta& meta, const size_t nbytes)
: meta_(meta), nbytes_(nbytes), : meta_(meta), nbytes_(nbytes), cpu_ptr_(nullptr),
cpu_ptr_(nullptr), cuda_ptr_(nullptr) {} cuda_ptr_(nullptr), cnml_ptr_(nullptr) {}
~MixedMemory(); ~MixedMemory();
const void* cpu_data(); const void* cpu_data();
const void* cuda_data(); const void* cuda_data();
const void* cnml_data();
void* mutable_cpu_data(); void* mutable_cpu_data();
void* mutable_cuda_data(); void* mutable_cuda_data();
void* mutable_cnml_data();
void* malloc_cnml_data();
void fetch_cnml_data(void** data);
cnmlCpuTensor_t& cnml_cpu_tensor();
cnmlTensor_t& cnml_mlu_tensor();
void set_cpu_data(void* cpu_ptr, size_t nbytes); void set_cpu_data(void* cpu_ptr, size_t nbytes);
void SwitchToDevice(); void SwitchToDevice();
...@@ -43,23 +62,35 @@ class MixedMemory { ...@@ -43,23 +62,35 @@ class MixedMemory {
inline size_t nbytes() const { return nbytes_; } inline size_t nbytes() const { return nbytes_; }
inline void* cpu_ptr() { state_ = STATE_AT_CPU; return cpu_ptr_; } inline size_t nchunks() const { return nchunks_; }
inline void* cuda_ptr() { state_ = STATE_AT_CUDA; return cuda_ptr_; } void set_nchunks(size_t nchunks) { nchunks_ = nchunks; }
inline State state() const { return state_; } inline State state() const { return state_; }
inline DataOrder order() const { return order_; }
inline void set_order(DataOrder order) { order_ = order; }
const Map<string, string> info() const; const Map<string, string> info() const;
void ToCUDA();
void ToCPU(); void ToCPU();
void ToCUDA();
private: private:
void* cpu_ptr_, *cuda_ptr_;
bool own_cpu_ptr_ = true;
State state_ = UNINITIALIZED;
size_t nbytes_ = 0;
TypeMeta meta_; TypeMeta meta_;
size_t nbytes_ = 0, nchunks_ = 1;
DataOrder order_ = NCHW;
State state_ = UNINITIALIZED;
void* cpu_ptr_, *cuda_ptr_, *cnml_ptr_;
int own_cpu_ptr_ = 1, ptr_device_ = 0;
/* For CAMBRICON's CNML Environment */
cnmlCpuTensor_t cnml_cpu_tensor_ = nullptr;
cnmlTensor_t cnml_mlu_tensor_ = nullptr;
}; };
} // namespace dragon } // namespace dragon
#endif #endif // DRAGON_CORE_MIXEDMEM_H_
\ No newline at end of file \ No newline at end of file
...@@ -44,7 +44,9 @@ class OperatorBase { ...@@ -44,7 +44,9 @@ class OperatorBase {
const string& anchor); const string& anchor);
inline void SwitchToPhase(const string& phase) { phase_ = phase; } inline void SwitchToPhase(const string& phase) { phase_ = phase; }
virtual void Run(int stream_id = 1) { NOT_IMPLEMENTED; } virtual void Run(int stream_id = 1) { NOT_IMPLEMENTED; }
virtual void Fusion(void* graph) { NOT_IMPLEMENTED; }
inline const string& name() const { return def_.name(); } inline const string& name() const { return def_.name(); }
inline const string& type() const { return def_.type(); } inline const string& type() const { return def_.type(); }
...@@ -186,12 +188,22 @@ DECLARE_REGISTRY( ...@@ -186,12 +188,22 @@ DECLARE_REGISTRY(
const OperatorDef&, const OperatorDef&,
Workspace*); Workspace*);
/* NVIDIA's Accelerated Library - CUDNN */
DECLARE_REGISTRY( DECLARE_REGISTRY(
CUDNNOperatorRegistry, CUDNNOperatorRegistry,
OperatorBase, OperatorBase,
const OperatorDef&, const OperatorDef&,
Workspace*); Workspace*);
/* CAMBRICON's Accelerated Library - CNML */
DECLARE_REGISTRY(
CNMLOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \ #define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \ if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \ CHECK(ws()->GetFiller(tensor.name())) \
...@@ -310,6 +322,9 @@ DECLARE_REGISTRY( ...@@ -310,6 +322,9 @@ DECLARE_REGISTRY(
#define INSTANTIATE_CUDNN_OPERATOR(name) \ #define INSTANTIATE_CUDNN_OPERATOR(name) \
template class CuDNN##name##Op<CUDAContext>; template class CuDNN##name##Op<CUDAContext>;
#define INSTANTIATE_CNML_OPERATOR(name) \
template class CnML##name##Op<CNMLContext>;
#define REGISTER_CPU_OPERATOR(name, ...) \ #define REGISTER_CPU_OPERATOR(name, ...) \
REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__) REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
...@@ -319,6 +334,9 @@ DECLARE_REGISTRY( ...@@ -319,6 +334,9 @@ DECLARE_REGISTRY(
#define REGISTER_CUDNN_OPERATOR(name, ...) \ #define REGISTER_CUDNN_OPERATOR(name, ...) \
REGISTER_CLASS(CUDNNOperatorRegistry, name, __VA_ARGS__) REGISTER_CLASS(CUDNNOperatorRegistry, name, __VA_ARGS__)
#define REGISTER_CNML_OPERATOR(name, ...) \
REGISTER_CLASS(CNMLOperatorRegistry, name, __VA_ARGS__)
#define DEPLOY_CPU(name) \ #define DEPLOY_CPU(name) \
REGISTER_CPU_OPERATOR(name, name##Op<CPUContext>); \ REGISTER_CPU_OPERATOR(name, name##Op<CPUContext>); \
INSTANTIATE_OPERATOR(name, CPUContext); INSTANTIATE_OPERATOR(name, CPUContext);
...@@ -336,6 +354,10 @@ DECLARE_REGISTRY( ...@@ -336,6 +354,10 @@ DECLARE_REGISTRY(
REGISTER_CUDNN_OPERATOR(name, CuDNN##name##Op<CUDAContext>); \ REGISTER_CUDNN_OPERATOR(name, CuDNN##name##Op<CUDAContext>); \
INSTANTIATE_CUDNN_OPERATOR(name); INSTANTIATE_CUDNN_OPERATOR(name);
#define DEPLOY_CNML(name) \
REGISTER_CNML_OPERATOR(name, CnML##name##Op<CNMLContext>); \
INSTANTIATE_CNML_OPERATOR(name);
} // namespace dragon } // namespace dragon
#endif // DRAGON_CORE_OPERATOR_H_ #endif // DRAGON_CORE_OPERATOR_H_
\ No newline at end of file
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
// ------------------------------------------------------------ // ------------------------------------------------------------
#ifndef DRAGON_CORE_TENSOR_H_ #ifndef DRAGON_CORE_TENSOR_H_
#define DRAONG_CORE_TENSOR_H_ #define DRAGON_CORE_TENSOR_H_
#include "core/common.h" #include "core/common.h"
#include "core/mixedmem.h" #include "core/mixedmem.h"
...@@ -103,16 +103,20 @@ class Tensor { ...@@ -103,16 +103,20 @@ class Tensor {
return offset; return offset;
} }
inline string DimString() const { static inline string DimString(
if (ndim() == 0) return "(0,)"; const vector<TIndex>& dims) {
if (dims.size() == 0) return "(0,)";
std::stringstream ss; std::stringstream ss;
ss << "("; ss << "(";
for (int i = 0; i < ndim() - 1; i++) ss << dim(i) << ","; for (int i = 0; i < dims.size() - 1; i++)
if (ndim() == 1) ss << dim(0) << ",)"; ss << dims[i] << ",";
else ss << dim(ndim() - 1) << ")"; if (dims.size() == 1) ss << dims[0] << ",)";
else ss << dims.back() << ")";
return ss.str(); return ss.str();
} }
inline string DimString() const { return DimString(dims_); }
inline bool is_corrupted() const { return is_corrupted_; } inline bool is_corrupted() const { return is_corrupted_; }
inline void Corrupt() { is_corrupted_ = true; } inline void Corrupt() { is_corrupted_ = true; }
...@@ -156,9 +160,12 @@ class Tensor { ...@@ -156,9 +160,12 @@ class Tensor {
} else if (TypeMeta::Id<Context>() == } else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CUDAContext>()) { TypeMeta::Id<CUDAContext>()) {
*data_ptr = mem->mutable_cuda_data(); *data_ptr = mem->mutable_cuda_data();
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CNMLContext>()) {
*data_ptr = mem->mutable_cnml_data();
} else { } else {
LOG(FATAL) << "Unknown memory type. " LOG(FATAL) << "Unknown memory type.\n"
<< "Only CPU or CUDA is supported."; << "Only CPU, CUDA and CNML are supported.";
} }
} }
} }
...@@ -173,9 +180,12 @@ class Tensor { ...@@ -173,9 +180,12 @@ class Tensor {
} else if (TypeMeta::Id<Context>() == } else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CUDAContext>()) { TypeMeta::Id<CUDAContext>()) {
return mem->cuda_data(); return mem->cuda_data();
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CNMLContext>()) {
return mem->cnml_data();
} else { } else {
LOG(FATAL) << "Unknown memory type. " LOG(FATAL) << "Unknown memory type.\n"
<< "Only CPU or CUDA are supported."; << "Only CPU, CUDA, and CNML are supported.";
return nullptr; return nullptr;
} }
} }
...@@ -295,4 +305,4 @@ class Tensor { ...@@ -295,4 +305,4 @@ class Tensor {
} // namespace dragon } // namespace dragon
#endif // DRAONG_CORE_TENSOR_H_ #endif // DRAGON_CORE_TENSOR_H_
\ No newline at end of file \ No newline at end of file
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
namespace dragon { namespace dragon {
typedef char int8;
typedef unsigned char uint8;
#ifdef _MSC_VER #ifdef _MSC_VER
typedef struct __declspec(align(2)) { typedef struct __declspec(align(2)) {
...@@ -49,8 +52,8 @@ inline const TypeMeta& TypeStringToMeta( ...@@ -49,8 +52,8 @@ inline const TypeMeta& TypeStringToMeta(
{ "int64", TypeMeta::Make<int64_t>() }, { "int64", TypeMeta::Make<int64_t>() },
{ "float64", TypeMeta::Make<double>() }, { "float64", TypeMeta::Make<double>() },
{ "float16", TypeMeta::Make<float16>() }, { "float16", TypeMeta::Make<float16>() },
{ "uint8", TypeMeta::Make<uint8_t>() }, { "uint8", TypeMeta::Make<uint8>() },
{ "int8", TypeMeta::Make<char>() }, { "int8", TypeMeta::Make<int8>() },
}; };
static TypeMeta unknown_type; static TypeMeta unknown_type;
return s2m_type_map.count(str_type) ? return s2m_type_map.count(str_type) ?
...@@ -66,8 +69,8 @@ inline const std::string TypeMetaToString( ...@@ -66,8 +69,8 @@ inline const std::string TypeMetaToString(
{ TypeMeta::Id<int64_t>(), "int64" }, { TypeMeta::Id<int64_t>(), "int64" },
{ TypeMeta::Id<double>(), "float64", }, { TypeMeta::Id<double>(), "float64", },
{ TypeMeta::Id<float16>(), "float16" }, { TypeMeta::Id<float16>(), "float16" },
{ TypeMeta::Id<uint8_t>(), "uint8" }, { TypeMeta::Id<uint8>(), "uint8" },
{ TypeMeta::Id<char>(), "int8" } { TypeMeta::Id<int8>(), "int8" }
}; };
return m2s_type_map.count(meta.id()) ? return m2s_type_map.count(meta.id()) ?
m2s_type_map[meta.id()] : "unknown"; m2s_type_map[meta.id()] : "unknown";
......
...@@ -47,8 +47,8 @@ class Workspace { ...@@ -47,8 +47,8 @@ class Workspace {
recompute_flag->Reshape({ 1 }); recompute_flag->Reshape({ 1 });
recompute_flag->mutable_data<bool, CPUContext>()[0] = false; recompute_flag->mutable_data<bool, CPUContext>()[0] = false;
for (int i = 0; i < WORKSPACE_MAX_CORRUPTED_SIZE; i++) { for (int i = 0; i < WORKSPACE_MAX_CORRUPTED_SIZE; i++) {
string name = "/opt/mirror_stage/buffer_" + string name = "/opt/mirror_stage/buffer_"
dragon_cast<string, int>(i); + std::to_string(i);
Tensor* buffer = CreateTensor(name); Tensor* buffer = CreateTensor(name);
head->mutable_data<string, CPUContext>()[i] = ""; head->mutable_data<string, CPUContext>()[i] = "";
} }
...@@ -277,7 +277,8 @@ class Workspace { ...@@ -277,7 +277,8 @@ class Workspace {
inline bool SetProxy( inline bool SetProxy(
const string& key, const string& key,
const string& proxy) { const string& proxy) {
if (proxy_map_.count(key)) if (key == proxy) return false;
if (proxy_map_.count(key) > 0)
return proxy_map_[key] == proxy; return proxy_map_[key] == proxy;
proxy_map_[key] = proxy; proxy_map_[key] = proxy;
return true; return true;
......
...@@ -23,7 +23,7 @@ class DropoutOp final : public Operator<Context> { ...@@ -23,7 +23,7 @@ class DropoutOp final : public Operator<Context> {
DropoutOp(const OperatorDef& def, Workspace* ws) DropoutOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
use_scale(OperatorBase::Arg<bool>("scale", true)) { use_scale(OperatorBase::Arg<bool>("scale", true)) {
GET_ARGUMENT_WITH_DESC(float, prob, 0.5); GET_ARGUMENT_WITH_DESC(float, prob, 0.5f);
SwitchToPhase(OperatorBase::Arg<string>("phase", "")); SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -42,7 +42,7 @@ class DropoutGradientOp final : public Operator<Context> { ...@@ -42,7 +42,7 @@ class DropoutGradientOp final : public Operator<Context> {
DropoutGradientOp(const OperatorDef& def, Workspace* ws) DropoutGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
use_scale(OperatorBase::Arg<bool>("scale", true)) { use_scale(OperatorBase::Arg<bool>("scale", true)) {
GET_ARGUMENT_WITH_DESC(float, prob, 0.5); GET_ARGUMENT_WITH_DESC(float, prob, 0.5f);
SwitchToPhase(OperatorBase::Arg<string>("phase", "")); SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -53,7 +53,6 @@ class DropoutGradientOp final : public Operator<Context> { ...@@ -53,7 +53,6 @@ class DropoutGradientOp final : public Operator<Context> {
protected: protected:
DECLARE_ARGUMENT_WITH_DESC(float, prob); DECLARE_ARGUMENT_WITH_DESC(float, prob);
bool use_scale; bool use_scale;
Tensor* mask;
}; };
DEFINE_ARGUMENT_WITH_DESC(float, DropoutOp, prob); DEFINE_ARGUMENT_WITH_DESC(float, DropoutOp, prob);
...@@ -70,7 +69,7 @@ public: ...@@ -70,7 +69,7 @@ public:
: Operator<Context>(def, ws), states_initialized(false), : Operator<Context>(def, ws), states_initialized(false),
use_scale(OperatorBase::Arg<bool>("scale", true)), use_scale(OperatorBase::Arg<bool>("scale", true)),
random_seed(DEFAULT_RNG_SEED) { random_seed(DEFAULT_RNG_SEED) {
GET_ARGUMENT_WITH_DESC(float, prob, 0.5); GET_ARGUMENT_WITH_DESC(float, prob, 0.5f);
SwitchToPhase(OperatorBase::Arg<string>("phase", "")); SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropout_desc)); CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropout_desc));
...@@ -101,7 +100,7 @@ public: ...@@ -101,7 +100,7 @@ public:
: Operator<Context>(def, ws), states_initialized(false), : Operator<Context>(def, ws), states_initialized(false),
use_scale(OperatorBase::Arg<bool>("scale", true)), use_scale(OperatorBase::Arg<bool>("scale", true)),
random_seed(DEFAULT_RNG_SEED) { random_seed(DEFAULT_RNG_SEED) {
GET_ARGUMENT_WITH_DESC(float, prob, 0.5); GET_ARGUMENT_WITH_DESC(float, prob, 0.5f);
SwitchToPhase(OperatorBase::Arg<string>("phase", "")); SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropout_desc)); CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropout_desc));
......
...@@ -21,7 +21,7 @@ class ReluOp : public Operator<Context> { ...@@ -21,7 +21,7 @@ class ReluOp : public Operator<Context> {
public: public:
ReluOp(const OperatorDef& def, Workspace* ws) ReluOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
slope(OperatorBase::Arg<float>("slope", 0.0)) {} slope(OperatorBase::Arg<float>("slope", 0.f)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
...@@ -36,7 +36,7 @@ class ReluGradientOp : public Operator<Context> { ...@@ -36,7 +36,7 @@ class ReluGradientOp : public Operator<Context> {
public: public:
ReluGradientOp(const OperatorDef& def, Workspace* ws) ReluGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
slope(OperatorBase::Arg<float>("slope", 0.0)) {} slope(OperatorBase::Arg<float>("slope", 0.f)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
......
...@@ -48,8 +48,6 @@ class SoftmaxGradientOp final : public Operator<Context> { ...@@ -48,8 +48,6 @@ class SoftmaxGradientOp final : public Operator<Context> {
#ifdef WITH_CUDNN #ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
template <class Context> template <class Context>
class CuDNNSoftmaxOp final : public Operator<Context> { class CuDNNSoftmaxOp final : public Operator<Context> {
public: public:
...@@ -70,8 +68,7 @@ class CuDNNSoftmaxOp final : public Operator<Context> { ...@@ -70,8 +68,7 @@ class CuDNNSoftmaxOp final : public Operator<Context> {
template <typename T> void RunWithType(); template <typename T> void RunWithType();
protected: protected:
int axis; TIndex axis, outer_dim, inner_dim;
TIndex outer_dim, inner_dim;
cudnnTensorDescriptor_t input_desc, output_desc; cudnnTensorDescriptor_t input_desc, output_desc;
}; };
...@@ -95,8 +92,7 @@ class CuDNNSoftmaxGradientOp final : public Operator<Context> { ...@@ -95,8 +92,7 @@ class CuDNNSoftmaxGradientOp final : public Operator<Context> {
template <typename T> void RunWithType(); template <typename T> void RunWithType();
protected: protected:
int axis; TIndex axis, outer_dim, inner_dim;
TIndex outer_dim, inner_dim;
cudnnTensorDescriptor_t input_desc, output_desc; cudnnTensorDescriptor_t input_desc, output_desc;
}; };
......
...@@ -55,7 +55,7 @@ class AffineGradientOp final : public Operator<Context> { ...@@ -55,7 +55,7 @@ class AffineGradientOp final : public Operator<Context> {
#ifdef WITH_CUDNN #ifdef WITH_CUDNN
#include "utils/cudnn_device.h" #if CUDNN_VERSION_MIN(6, 0, 0)
template <class Context> template <class Context>
class CuDNNAffineOpBase : public Operator<Context> { class CuDNNAffineOpBase : public Operator<Context> {
...@@ -152,6 +152,8 @@ protected: ...@@ -152,6 +152,8 @@ protected:
Tensor sum_result; Tensor sum_result;
}; };
#endif
#endif // WITH_CUDNN #endif // WITH_CUDNN
} // namespace dragon } // namespace dragon
......
...@@ -36,11 +36,17 @@ class ClipOp final : public Operator<Context> { ...@@ -36,11 +36,17 @@ class ClipOp final : public Operator<Context> {
template <class Context> template <class Context>
class ClipGradientOp final : public Operator<Context> { class ClipGradientOp final : public Operator<Context> {
public: public:
USE_SIMPLE_CTOR_DTOR(ClipGradientOp); ClipGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
low(OperatorBase::Arg<float>("low", -FLT_MAX)),
high(OperatorBase::Arg<float>("high", FLT_MAX)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> void RunWithType(); template <typename T> void RunWithType();
protected:
float low, high;
}; };
} // namespace dragon } // namespace dragon
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_ARITHMETIC_MAXIMUM_OP_H_
#define DRAGON_OPERATORS_ARITHMETIC_MAXIMUM_OP_H_
#include "core/operator.h"
namespace dragon {
// Forward operator computing the element-wise maximum of its two inputs.
// EltwiseRunWithType handles same-shape inputs; BroadcastRunWithType handles
// the case where one operand must be broadcast against the other.
// NOTE(review): the exact broadcast rules live in the .cc/.cu implementation,
// which is not visible here — confirm before relying on them.
template <class Context>
class MaximumOp final : public Operator<Context> {
 public:
    // No extra operator arguments are read; the trivial ctor/dtor suffice.
    USE_SIMPLE_CTOR_DTOR(MaximumOp);
    USE_OPERATOR_FUNCTIONS;

    // Dispatches to the typed implementation based on the input dtype.
    void RunOnDevice() override;
    // Same-shape (element-wise) path.
    template <typename T> void EltwiseRunWithType();
    // Broadcasting path.
    template <typename T> void BroadcastRunWithType();
};
// Backward operator for MaximumOp: routes the incoming gradient to whichever
// input supplied the maximum at each position.
// NOTE(review): the per-element routing rule is defined in the .cc/.cu
// implementation, not visible in this header.
template <class Context>
class MaximumGradientOp final : public Operator<Context> {
 public:
    // No extra operator arguments are read; the trivial ctor/dtor suffice.
    USE_SIMPLE_CTOR_DTOR(MaximumGradientOp);
    USE_OPERATOR_FUNCTIONS;

    // Dispatches to the typed implementation based on the input dtype.
    void RunOnDevice() override;
    // Same-shape (element-wise) path.
    template <typename T> void EltwiseRunWithType();
    // Broadcasting path (mirrors the forward broadcast).
    template <typename T> void BroadcastRunWithType();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_MAXIMUM_OP_H_
\ No newline at end of file
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_ARITHMETIC_MINIMUM_OP_H_
#define DRAGON_OPERATORS_ARITHMETIC_MINIMUM_OP_H_
#include "core/operator.h"
namespace dragon {
// Forward operator computing the element-wise minimum of its two inputs.
// EltwiseRunWithType handles same-shape inputs; BroadcastRunWithType handles
// the case where one operand must be broadcast against the other.
// NOTE(review): the exact broadcast rules live in the .cc/.cu implementation,
// which is not visible here — confirm before relying on them.
template <class Context>
class MinimumOp final : public Operator<Context> {
 public:
    // No extra operator arguments are read; the trivial ctor/dtor suffice.
    USE_SIMPLE_CTOR_DTOR(MinimumOp);
    USE_OPERATOR_FUNCTIONS;

    // Dispatches to the typed implementation based on the input dtype.
    void RunOnDevice() override;
    // Same-shape (element-wise) path.
    template <typename T> void EltwiseRunWithType();
    // Broadcasting path.
    template <typename T> void BroadcastRunWithType();
};
// Backward operator for MinimumOp: routes the incoming gradient to whichever
// input supplied the minimum at each position.
// NOTE(review): the per-element routing rule is defined in the .cc/.cu
// implementation, not visible in this header.
template <class Context>
class MinimumGradientOp final : public Operator<Context> {
 public:
    // No extra operator arguments are read; the trivial ctor/dtor suffice.
    USE_SIMPLE_CTOR_DTOR(MinimumGradientOp);
    USE_OPERATOR_FUNCTIONS;

    // Dispatches to the typed implementation based on the input dtype.
    void RunOnDevice() override;
    // Same-shape (element-wise) path.
    template <typename T> void EltwiseRunWithType();
    // Broadcasting path (mirrors the forward broadcast).
    template <typename T> void BroadcastRunWithType();
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_MINIMUM_OP_H_
\ No newline at end of file
...@@ -43,8 +43,6 @@ public: ...@@ -43,8 +43,6 @@ public:
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
#include "utils/cudnn_device.h"
template <class Context> template <class Context>
class CuDNNCTCLossOp final : public Operator<Context> { class CuDNNCTCLossOp final : public Operator<Context> {
public: public:
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_LOSS_NLL_LOSS_OP_H_
#define DRAGON_OPERATORS_LOSS_NLL_LOSS_OP_H_
#include "core/operator.h"
namespace dragon {
// Forward operator for the negative log-likelihood loss.
// Arguments:
//   axis          - class axis of the input (default: 1)
//   normalization - reduction mode applied to the per-element losses
//                   (default: "VALID")
//   ignore_labels - label values excluded from the loss
template <class Context>
class NLLLossOp : public Operator<Context> {
 public:
    NLLLossOp(
        const OperatorDef&              def,
        Workspace*                      ws)
        : Operator<Context>(def, ws),
          axis(OperatorBase::Arg<int>("axis", 1)),
          normalization(OperatorBase::Arg<string>(
              "normalization", "VALID")) {
        // Stage the ignored label values in a CPU-side tensor so the
        // typed kernels can consume them later.
        auto xs = OperatorBase::Args<int>("ignore_labels");
        if (xs.size()) {
            ignores.Reshape({ (TIndex)xs.size() });
            auto* Idata = ignores.mutable_data<int, CPUContext>();
            // size_t index: xs.size() is unsigned, so an int counter
            // would trigger a signed/unsigned comparison.
            for (size_t i = 0; i < xs.size(); i++) Idata[i] = xs[i];
        }
    }
    USE_OPERATOR_FUNCTIONS;

    // Dispatches to RunWithType based on the input/label dtypes.
    void RunOnDevice() override;
    template <typename Tx, typename Ty> void RunWithType();

 protected:
    TIndex axis, outer_dim, inner_dim;
    // losses/flags: per-element scratch buffers; ignores: staged labels.
    Tensor losses, flags, ignores;
    string normalization;
};
// Backward operator for NLLLossOp. Reads the same arguments as the forward
// op so the gradient respects the identical axis, reduction mode, and
// ignored label set.
template <class Context>
class NLLLossGradientOp : public Operator<Context> {
 public:
    NLLLossGradientOp(
        const OperatorDef&              def,
        Workspace*                      ws)
        : Operator<Context>(def, ws),
          axis(OperatorBase::Arg<int>("axis", 1)),
          normalization(OperatorBase::Arg<string>(
              "normalization", "VALID")) {
        // Stage the ignored label values in a CPU-side tensor so the
        // typed kernels can consume them later.
        auto xs = OperatorBase::Args<int>("ignore_labels");
        if (xs.size()) {
            ignores.Reshape({ (TIndex)xs.size() });
            auto* Idata = ignores.mutable_data<int, CPUContext>();
            // size_t index: xs.size() is unsigned, so an int counter
            // would trigger a signed/unsigned comparison.
            for (size_t i = 0; i < xs.size(); i++) Idata[i] = xs[i];
        }
    }
    USE_OPERATOR_FUNCTIONS;

    // Dispatches to RunWithType based on the input/label dtypes.
    void RunOnDevice() override;
    template <typename Tx, typename Ty> void RunWithType();

 protected:
    TIndex axis, outer_dim, inner_dim;
    // flags: per-element scratch buffer; ignores: staged labels.
    Tensor ignores, flags;
    string normalization;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_LOSS_NLL_LOSS_OP_H_
\ No newline at end of file
...@@ -22,7 +22,8 @@ class InitializeOp : public Operator<Context> { ...@@ -22,7 +22,8 @@ class InitializeOp : public Operator<Context> {
public: public:
InitializeOp(const OperatorDef& def, Workspace* ws) InitializeOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
shape_desc(OperatorBase::Arg<string>("shape", "")) { shape_desc(OperatorBase::Arg<string>("shape", "")),
dtype(OperatorBase::Arg<string>("dtype", "float32")) {
GET_ARGUMENTS_WITH_DESC(int, dims); GET_ARGUMENTS_WITH_DESC(int, dims);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -32,19 +33,29 @@ class InitializeOp : public Operator<Context> { ...@@ -32,19 +33,29 @@ class InitializeOp : public Operator<Context> {
protected: protected:
DECLARE_ARGUMENTS_WITH_DESC(int, dims); DECLARE_ARGUMENTS_WITH_DESC(int, dims);
string shape_desc; string shape_desc, dtype;
TensorFiller filler; TensorFiller filler;
}; };
template <class Context> template <class Context>
class FillOp final : public InitializeOp<Context> { class FillOp final : public Operator<Context> {
public: public:
FillOp(const OperatorDef& def, Workspace* ws) FillOp(const OperatorDef& def, Workspace* ws)
: InitializeOp<Context>(def, ws) { : Operator<Context>(def, ws),
this->filler.set_type("constant"); shape_desc(OperatorBase::Arg<string>("shape", "")),
this->filler.set_value(OperatorBase::Arg<float>("value", 0.0)); dtype(OperatorBase::Arg<string>("dtype", "float32")),
value(OperatorBase::Arg<float>("value", 0.0)) {
GET_ARGUMENTS_WITH_DESC(int, dims);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T> void RunWithType();
protected:
DECLARE_ARGUMENTS_WITH_DESC(int, dims);
string shape_desc, dtype;
float value;
}; };
template <class Context> template <class Context>
...@@ -130,6 +141,7 @@ public: ...@@ -130,6 +141,7 @@ public:
}; };
DEFINE_ARGUMENTS_WITH_DESC(int, InitializeOp, dims); DEFINE_ARGUMENTS_WITH_DESC(int, InitializeOp, dims);
DEFINE_ARGUMENTS_WITH_DESC(int, FillOp, dims);
} // namespace } // namespace
......
...@@ -25,7 +25,7 @@ class BatchNormOp final : public Operator<Context> { ...@@ -25,7 +25,7 @@ class BatchNormOp final : public Operator<Context> {
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)), axis(OperatorBase::Arg<int>("axis", -1)),
momentum(OperatorBase::Arg<float>("momentum", 0.9f)), momentum(OperatorBase::Arg<float>("momentum", 0.9f)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)), eps(OperatorBase::Arg<float>("eps", 1e-5f)),
use_stats(OperatorBase::Arg<int>("use_stats", -1)), use_stats(OperatorBase::Arg<int>("use_stats", -1)),
mode(OperatorBase::Arg<string>("mode", "DEFAULT")) { mode(OperatorBase::Arg<string>("mode", "DEFAULT")) {
if (axis != -1) if (axis != -1)
...@@ -81,7 +81,7 @@ class FusedBatchNormOp : public Operator<Context> { ...@@ -81,7 +81,7 @@ class FusedBatchNormOp : public Operator<Context> {
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)), axis(OperatorBase::Arg<int>("axis", -1)),
momentum(OperatorBase::Arg<float>("momentum", 0.9f)), momentum(OperatorBase::Arg<float>("momentum", 0.9f)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)), eps(OperatorBase::Arg<float>("eps", 1e-5f)),
use_stats(OperatorBase::Arg<int>("use_stats", -1)) {} use_stats(OperatorBase::Arg<int>("use_stats", -1)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -105,7 +105,7 @@ class FusedBatchNormGradientOp : public Operator<Context> { ...@@ -105,7 +105,7 @@ class FusedBatchNormGradientOp : public Operator<Context> {
FusedBatchNormGradientOp(const OperatorDef& def, Workspace* ws) FusedBatchNormGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)), axis(OperatorBase::Arg<int>("axis", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)), eps(OperatorBase::Arg<float>("eps", 1e-5f)),
use_stats(OperatorBase::Arg<int>("use_stats", -1)) {} use_stats(OperatorBase::Arg<int>("use_stats", -1)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -127,14 +127,13 @@ class FusedBatchNormGradientOp : public Operator<Context> { ...@@ -127,14 +127,13 @@ class FusedBatchNormGradientOp : public Operator<Context> {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
#include "utils/cudnn_device.h"
template <class Context> template <class Context>
class CuDNNBatchNormOp final : public FusedBatchNormOp<Context> { class CuDNNBatchNormOp final
: public FusedBatchNormOp<Context> {
public: public:
CuDNNBatchNormOp(const OperatorDef& def, Workspace* ws) CuDNNBatchNormOp(const OperatorDef& def, Workspace* ws)
: FusedBatchNormOp<Context>(def, ws), : FusedBatchNormOp<Context>(def, ws),
eps64(OperatorBase::Arg<float>("eps", 1e-3f)) { eps64(OperatorBase::Arg<float>("eps", 1e-5f)) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bn_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&bn_desc));
...@@ -167,11 +166,12 @@ class CuDNNBatchNormOp final : public FusedBatchNormOp<Context> { ...@@ -167,11 +166,12 @@ class CuDNNBatchNormOp final : public FusedBatchNormOp<Context> {
}; };
template <class Context> template <class Context>
class CuDNNBatchNormGradientOp final : public FusedBatchNormGradientOp<Context> { class CuDNNBatchNormGradientOp final
: public FusedBatchNormGradientOp<Context> {
public: public:
CuDNNBatchNormGradientOp(const OperatorDef& def, Workspace* ws) CuDNNBatchNormGradientOp(const OperatorDef& def, Workspace* ws)
: FusedBatchNormGradientOp<Context>(def, ws), : FusedBatchNormGradientOp<Context>(def, ws),
eps64(OperatorBase::Arg<float>("eps", 1e-3f)) { eps64(OperatorBase::Arg<float>("eps", 1e-5f)) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bn_desc)); CUDNN_CHECK(cudnnCreateTensorDescriptor(&bn_desc));
......
...@@ -23,7 +23,7 @@ class BatchRenormOp final : public Operator<Context> { ...@@ -23,7 +23,7 @@ class BatchRenormOp final : public Operator<Context> {
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)), axis(OperatorBase::Arg<int>("axis", -1)),
momentum(OperatorBase::Arg<float>("momentum", 0.9f)), momentum(OperatorBase::Arg<float>("momentum", 0.9f)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)), eps(OperatorBase::Arg<float>("eps", 1e-5f)),
r_max(OperatorBase::Arg<float>("r_max", 3.f)), r_max(OperatorBase::Arg<float>("r_max", 3.f)),
d_max(OperatorBase::Arg<float>("d_max", 5.f)), d_max(OperatorBase::Arg<float>("d_max", 5.f)),
t_delta(OperatorBase::Arg<float>("t_delta", 1.f)), t_delta(OperatorBase::Arg<float>("t_delta", 1.f)),
......
...@@ -23,7 +23,7 @@ class GroupNormOp final : public Operator<Context> { ...@@ -23,7 +23,7 @@ class GroupNormOp final : public Operator<Context> {
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
group(OperatorBase::Arg<int>("group", 32)), group(OperatorBase::Arg<int>("group", 32)),
axis(OperatorBase::Arg<int>("axis", -1)), axis(OperatorBase::Arg<int>("axis", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)) { eps(OperatorBase::Arg<float>("eps", 1e-5f)) {
if (axis != -1) if (axis != -1)
CHECK_EQ(axis, 1) CHECK_EQ(axis, 1)
<< "\nThe axis can only be set to 1."; << "\nThe axis can only be set to 1.";
...@@ -73,7 +73,7 @@ class FusedGroupNormOp final : public Operator<Context> { ...@@ -73,7 +73,7 @@ class FusedGroupNormOp final : public Operator<Context> {
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
group(OperatorBase::Arg<int>("group", 32)), group(OperatorBase::Arg<int>("group", 32)),
axis(OperatorBase::Arg<int>("axis", -1)), axis(OperatorBase::Arg<int>("axis", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)) {} eps(OperatorBase::Arg<float>("eps", 1e-5f)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void Setup(); void Setup();
......
...@@ -22,9 +22,10 @@ class InstanceNormOp final : public Operator<Context> { ...@@ -22,9 +22,10 @@ class InstanceNormOp final : public Operator<Context> {
InstanceNormOp(const OperatorDef& def, Workspace* ws) InstanceNormOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)), axis(OperatorBase::Arg<int>("axis", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)) { eps(OperatorBase::Arg<float>("eps", 1e-5f)) {
if (axis != -1) if (axis != -1)
CHECK_EQ(axis, 1) << "\nThe axis can only be set to 1."; CHECK_EQ(axis, 1)
<< "\nThe axis can only be set to 1.";
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -47,7 +48,8 @@ class InstanceNormGradientOp final : public Operator<Context> { ...@@ -47,7 +48,8 @@ class InstanceNormGradientOp final : public Operator<Context> {
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", -1)) { axis(OperatorBase::Arg<int>("axis", -1)) {
if (axis != -1) if (axis != -1)
CHECK_EQ(axis, 1) << "\nThe axis can only be set to 1."; CHECK_EQ(axis, 1)
<< "\nThe axis can only be set to 1.";
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
......
...@@ -23,7 +23,7 @@ class L2NormOp final : public Operator<Context> { ...@@ -23,7 +23,7 @@ class L2NormOp final : public Operator<Context> {
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
axis(OperatorBase::Arg<int>("axis", 0)), axis(OperatorBase::Arg<int>("axis", 0)),
num_axes(OperatorBase::Arg<int>("num_axes", -1)), num_axes(OperatorBase::Arg<int>("num_axes", -1)),
eps(OperatorBase::Arg<float>("eps", 1e-3f)), eps(OperatorBase::Arg<float>("eps", 1e-5f)),
mode(OperatorBase::Arg<string>("mode", "SUM")) {} mode(OperatorBase::Arg<string>("mode", "SUM")) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
......
...@@ -20,8 +20,6 @@ namespace dragon { ...@@ -20,8 +20,6 @@ namespace dragon {
#if CUDNN_VERSION_MIN(5, 0, 0) #if CUDNN_VERSION_MIN(5, 0, 0)
#include "utils/cudnn_device.h"
class cudnnTensorDescriptors { class cudnnTensorDescriptors {
public: public:
cudnnTensorDescriptors(const int num_descs) { cudnnTensorDescriptors(const int num_descs) {
......
...@@ -21,7 +21,8 @@ class BiasAddOp final : public Operator<Context> { ...@@ -21,7 +21,8 @@ class BiasAddOp final : public Operator<Context> {
public: public:
BiasAddOp(const OperatorDef& def, Workspace* ws) BiasAddOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
data_format(OperatorBase::Arg<string>("data_format", "NCHW")) {} data_format(OperatorBase::Arg<string>(
"data_format", "NCHW")) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
...@@ -37,7 +38,8 @@ class BiasAddGradientOp final : public Operator<Context> { ...@@ -37,7 +38,8 @@ class BiasAddGradientOp final : public Operator<Context> {
public: public:
BiasAddGradientOp(const OperatorDef& def, Workspace* ws) BiasAddGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
data_format(OperatorBase::Arg<string>("data_format", "NCHW")) {} data_format(OperatorBase::Arg<string>(
"data_format", "NCHW")) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
...@@ -48,6 +50,62 @@ class BiasAddGradientOp final : public Operator<Context> { ...@@ -48,6 +50,62 @@ class BiasAddGradientOp final : public Operator<Context> {
string data_format; string data_format;
}; };
#ifdef WITH_CUDNN
// cuDNN-backed forward BiasAdd: adds a per-channel bias to the input.
// The ctor reads "data_format" ("NCHW" default) and creates the two tensor
// descriptors; the dtor destroys them (RAII over the cuDNN handles).
template <class Context>
class CuDNNBiasAddOp final : public Operator<Context> {
 public:
    CuDNNBiasAddOp(const OperatorDef& def, Workspace* ws)
        : Operator<Context>(def, ws),
          data_format(OperatorBase::Arg<string>(
              "data_format", "NCHW")) {
        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
        CUDNN_CHECK(cudnnCreateTensorDescriptor(&output_desc));
    }
    USE_OPERATOR_FUNCTIONS;

    ~CuDNNBiasAddOp() {
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc));
    }

    // Dispatches to the typed implementation based on the input dtype.
    void RunOnDevice() override;
    template <typename T> void RunWithType();

 protected:
    // Dimensions are computed at run time from the input and data_format
    // (done in the .cc/.cu implementation, not visible here).
    TIndex outer_dim, dim, inner_dim;
    string data_format;
    cudnnTensorDescriptor_t bias_desc, output_desc;
};
// cuDNN-backed backward BiasAdd: reduces the output gradient into the bias
// gradient. The ctor reads "data_format" ("NCHW" default) and creates the
// two tensor descriptors; the dtor destroys them (RAII over the handles).
template <class Context>
class CuDNNBiasAddGradientOp final : public Operator<Context> {
 public:
    CuDNNBiasAddGradientOp(const OperatorDef& def, Workspace* ws)
        : Operator<Context>(def, ws),
          data_format(OperatorBase::Arg<string>(
              "data_format", "NCHW")) {
        CUDNN_CHECK(cudnnCreateTensorDescriptor(&input_desc));
        CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
    }
    USE_OPERATOR_FUNCTIONS;

    ~CuDNNBiasAddGradientOp() {
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc));
        CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc));
    }

    // Dispatches to the typed implementation based on the input dtype.
    void RunOnDevice() override;
    template <typename T> void RunWithType();

 protected:
    // Dimensions are computed at run time from the input and data_format
    // (done in the .cc/.cu implementation, not visible here).
    TIndex outer_dim, dim, inner_dim;
    string data_format;
    cudnnTensorDescriptor_t input_desc, bias_desc;
};
#endif // WITH_CUDNN
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_VISION_BIAS_ADD_OP_H_ #endif // DRAGON_OPERATORS_VISION_BIAS_ADD_OP_H_
\ No newline at end of file
...@@ -50,8 +50,6 @@ class Conv2dGradientOp : public Conv2dOp<Context> { ...@@ -50,8 +50,6 @@ class Conv2dGradientOp : public Conv2dOp<Context> {
#ifdef WITH_CUDNN #ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
template <class Context> template <class Context>
class CuDNNConv2dOp final : public Conv2dOp<Context> { class CuDNNConv2dOp final : public Conv2dOp<Context> {
public: public:
...@@ -97,7 +95,7 @@ class CuDNNConv2dOp final : public Conv2dOp<Context> { ...@@ -97,7 +95,7 @@ class CuDNNConv2dOp final : public Conv2dOp<Context> {
cudnnConvolutionDescriptor_t conv_desc; cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc; cudnnFilterDescriptor_t filter_desc;
size_t fwd_data_size; size_t fwd_data_size;
TIndex bias_offset, cudnn_group; TIndex cudnn_group;
vector<TIndex> input_dims; vector<TIndex> input_dims;
bool enable_tensor_core; bool enable_tensor_core;
}; };
...@@ -148,7 +146,7 @@ class CuDNNConv2dGradientOp final : public Conv2dGradientOp<Context> { ...@@ -148,7 +146,7 @@ class CuDNNConv2dGradientOp final : public Conv2dGradientOp<Context> {
cudnnConvolutionDescriptor_t conv_desc; cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc; cudnnFilterDescriptor_t filter_desc;
size_t bwd_filter_size, bwd_data_size; size_t bwd_filter_size, bwd_data_size;
TIndex bias_offset, cudnn_group; TIndex cudnn_group;
vector<TIndex> input_dims; vector<TIndex> input_dims;
bool enable_tensor_core; bool enable_tensor_core;
}; };
......
...@@ -84,6 +84,7 @@ class ConvOpBase : public Operator<Context> { ...@@ -84,6 +84,7 @@ class ConvOpBase : public Operator<Context> {
ctx()); ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet"; } else LOG(FATAL) << "ConvNd has not been implemented yet";
} }
template <typename T> void Col2Im(const T* col, T* im) { template <typename T> void Col2Im(const T* col, T* im) {
if (Input(0).ndim() == 4) { if (Input(0).ndim() == 4) {
kernel::Col2Im2d<T, Context>(conv_in_channels, kernel::Col2Im2d<T, Context>(conv_in_channels,
......
...@@ -54,8 +54,6 @@ class Conv2dTransposeGradientOp : public Conv2dTransposeOp<Context> { ...@@ -54,8 +54,6 @@ class Conv2dTransposeGradientOp : public Conv2dTransposeOp<Context> {
#ifdef WITH_CUDNN #ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
template <class Context> template <class Context>
class CuDNNConv2dTransposeOp final : public Conv2dTransposeOp<Context> { class CuDNNConv2dTransposeOp final : public Conv2dTransposeOp<Context> {
public: public:
...@@ -100,7 +98,7 @@ class CuDNNConv2dTransposeOp final : public Conv2dTransposeOp<Context> { ...@@ -100,7 +98,7 @@ class CuDNNConv2dTransposeOp final : public Conv2dTransposeOp<Context> {
cudnnConvolutionDescriptor_t conv_desc; cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc; cudnnFilterDescriptor_t filter_desc;
size_t fwd_data_size; size_t fwd_data_size;
TIndex bias_offset, cudnn_group; TIndex cudnn_group;
vector<TIndex> input_dims; vector<TIndex> input_dims;
bool enable_tensor_core; bool enable_tensor_core;
}; };
...@@ -150,7 +148,7 @@ public: ...@@ -150,7 +148,7 @@ public:
cudnnConvolutionDescriptor_t conv_desc; cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc; cudnnFilterDescriptor_t filter_desc;
size_t bwd_filter_size, bwd_data_size; size_t bwd_filter_size, bwd_data_size;
TIndex bias_offset, cudnn_group; TIndex cudnn_group;
vector<TIndex> input_dims; vector<TIndex> input_dims;
bool enable_tensor_core; bool enable_tensor_core;
}; };
......
// ------------------------------------------------------------
// Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
//
// Licensed under the BSD 2-Clause License.
// You should have received a copy of the BSD 2-Clause License
// along with the software. If not, See,
//
// <https://opensource.org/licenses/BSD-2-Clause>
//
// ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_VISION_DROP_BLOCK_OP_H_
#define DRAGON_OPERATORS_VISION_DROP_BLOCK_OP_H_
#include "core/operator.h"
#include "utils/math_functions.h"
namespace dragon {
// Forward operator for 2d DropBlock-style regularization (presumably the
// scheme of Ghiasi et al., 2018 — the actual masking math lives in the
// .cc/.cu implementation, not visible here).
// Arguments:
//   block_size  - side length of each dropped block (default: 7)
//   keep_prob   - desc-capable keep probability (default: 0.9)
//   alpha       - scale applied via the arg (default: 1.0); exact use is
//                 defined in the implementation — confirm there
//   decrement   - per-step decrease applied to apply_prob (default: 0),
//                 presumably for scheduled drop rate — TODO confirm
//   data_format - "NCHW" (default) or "NHWC"
//   phase       - selects train/test behavior via SwitchToPhase
template <class Context>
class DropBlock2dOp final : public Operator<Context> {
 public:
    DropBlock2dOp(const OperatorDef& def, Workspace* ws)
        : Operator<Context>(def, ws),
          block_size(OperatorBase::Arg<int>("block_size", 7)),
          alpha(OperatorBase::Arg<float>("alpha", 1.f)),
          decrement(OperatorBase::Arg<float>("decrement", 0.f)),
          data_format(OperatorBase::Arg<string>("data_format", "NCHW")) {
        GET_ARGUMENT_WITH_DESC(float, keep_prob, 0.9f);
        SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
    }
    USE_OPERATOR_FUNCTIONS;

    // Dispatches to the typed implementation based on the input dtype.
    void RunOnDevice() override;
    template <typename T> void RunWithType();

 protected:
    DECLARE_ARGUMENT_WITH_DESC(float, keep_prob);
    // seed_h/seed_w: extent of the region where block centers may be seeded.
    TIndex block_size, seed_h, seed_w;
    // Cached input dimensions (batch, channels, height, width).
    TIndex n, c, h, w;
    // apply_prob starts at 1 and is reduced by decrement over time;
    // gamma is computed per run (uninitialized until then).
    float alpha, decrement, apply_prob = 1., gamma;
    string data_format;
    vector<TIndex> seed_dims;
};
// Backward operator for DropBlock2dOp: applies the forward mask to the
// incoming gradient. Reads only "phase" so the gradient matches the
// forward train/test behavior.
template <class Context>
class DropBlock2dGradientOp final : public Operator<Context> {
 public:
    DropBlock2dGradientOp(const OperatorDef& def, Workspace* ws)
        : Operator<Context>(def, ws) {
        SwitchToPhase(OperatorBase::Arg<string>("phase", ""));
    }
    USE_OPERATOR_FUNCTIONS;

    // Dispatches to the typed implementation based on the input dtype.
    void RunOnDevice() override;
    template <typename T> void RunWithType();
};
DEFINE_ARGUMENT_WITH_DESC(float, DropBlock2dOp, keep_prob);
} // namespace dragon
#endif // DRAGON_OPERATORS_VISION_DROP_BLOCK_OP_H_
\ No newline at end of file
...@@ -16,7 +16,10 @@ ...@@ -16,7 +16,10 @@
namespace dragon { namespace dragon {
enum LRNMode { ACROSS_CHANNELS, WITHIN_CHANNEL }; typedef enum {
ACROSS_CHANNELS,
WITHIN_CHANNEL,
} LRNMode;
template <class Context> template <class Context>
class LRNOp : public Operator<Context> { class LRNOp : public Operator<Context> {
...@@ -82,8 +85,6 @@ class LRNGradientOp : public Operator<Context> { ...@@ -82,8 +85,6 @@ class LRNGradientOp : public Operator<Context> {
#ifdef WITH_CUDNN #ifdef WITH_CUDNN
#include "utils/cudnn_device.h"
template <class Context> template <class Context>
class CuDNNLRNOp final : public LRNOp<Context> { class CuDNNLRNOp final : public LRNOp<Context> {
public: public:
......
...@@ -73,7 +73,7 @@ inline void LoadCaffeModel( ...@@ -73,7 +73,7 @@ inline void LoadCaffeModel(
const string& layer_name = layer.name(); const string& layer_name = layer.name();
string prefix = layer_name + "/param:"; string prefix = layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++) { for (int j = 0; j < layer.blobs_size(); j++) {
string tensor_name = prefix + dragon_cast<string, int>(j); string tensor_name = prefix + std::to_string(j);
if (!ws->HasTensor(tensor_name)) if (!ws->HasTensor(tensor_name))
LOG(WARNING) << "Tensor(" << tensor_name << ") " LOG(WARNING) << "Tensor(" << tensor_name << ") "
<< "does not exist in any Graphs, skip."; << "does not exist in any Graphs, skip.";
...@@ -114,7 +114,7 @@ inline void SavaCaffeModel( ...@@ -114,7 +114,7 @@ inline void SavaCaffeModel(
int layer_idx = -1; int layer_idx = -1;
for (int i = 0; i < tensors.size(); i++) { for (int i = 0; i < tensors.size(); i++) {
if (tensors[i]->count() <= 0) continue; if (tensors[i]->count() <= 0) continue;
vector<string> splits = SplitString( vector<string> splits = str::split(
tensors[i]->name(), "/param:"); tensors[i]->name(), "/param:");
if (layer_hash.count(splits[0]) == 0) { if (layer_hash.count(splits[0]) == 0) {
layer_hash[splits[0]] = ++layer_idx; layer_hash[splits[0]] = ++layer_idx;
......
...@@ -28,6 +28,10 @@ template<> inline int dragon_cast<int, float>(float val) { ...@@ -28,6 +28,10 @@ template<> inline int dragon_cast<int, float>(float val) {
return static_cast<int>(val); return static_cast<int>(val);
} }
template<> inline int64_t dragon_cast<int64_t, float>(float val) {
return static_cast<int64_t>(val);
}
template<> inline float dragon_cast<float, float>(float val) { template<> inline float dragon_cast<float, float>(float val) {
return val; return val;
} }
...@@ -127,7 +131,7 @@ template<> inline float32 dragon_cast<float32, float>(float val) { ...@@ -127,7 +131,7 @@ template<> inline float32 dragon_cast<float32, float>(float val) {
return dragon_cast<float32, float16>(t); return dragon_cast<float32, float16>(t);
} }
#ifdef WITH_CUDA_FP16 #ifdef WITH_CUDA
template<> inline half dragon_cast<half, float>(float val) { template<> inline half dragon_cast<half, float>(float val) {
#if CUDA_VERSION_MIN(9, 0, 0) #if CUDA_VERSION_MIN(9, 0, 0)
...@@ -165,7 +169,7 @@ template<> inline half2 dragon_cast<half2, float16>(float16 val) { ...@@ -165,7 +169,7 @@ template<> inline half2 dragon_cast<half2, float16>(float16 val) {
} }
#endif // WITH_CUDA_FP16 #endif // WITH_CUDA
} // namespace dragon } // namespace dragon
......
...@@ -101,16 +101,10 @@ inline int CUDA_NUM_DEVICES() { ...@@ -101,16 +101,10 @@ inline int CUDA_NUM_DEVICES() {
return count; return count;
} }
inline int CUDA_DEVICE() { inline int CUDA_GET_DEVICE() {
int gpu_id; int device_id;
cudaGetDevice(&gpu_id); cudaGetDevice(&device_id);
return gpu_id; return device_id;
}
inline int CUDA_DEVICE(const void* ptr) {
cudaPointerAttributes attr;
CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
return attr.device;
} }
struct CUDADeviceProps { struct CUDADeviceProps {
...@@ -132,7 +126,7 @@ inline const cudaDeviceProp& GetDeviceProperty( ...@@ -132,7 +126,7 @@ inline const cudaDeviceProp& GetDeviceProperty(
} }
inline bool CUDA_TRUE_FP16_AVAILABLE() { inline bool CUDA_TRUE_FP16_AVAILABLE() {
int device = CUDA_DEVICE(); int device = CUDA_GET_DEVICE();
auto& prop = GetDeviceProperty(device); auto& prop = GetDeviceProperty(device);
return prop.major >= 6; return prop.major >= 6;
} }
...@@ -141,7 +135,7 @@ inline bool TENSOR_CORE_AVAILABLE() { ...@@ -141,7 +135,7 @@ inline bool TENSOR_CORE_AVAILABLE() {
#if CUDA_VERSION < 9000 #if CUDA_VERSION < 9000
return false; return false;
#else #else
int device = CUDA_DEVICE(); int device = CUDA_GET_DEVICE();
auto& prop = GetDeviceProperty(device); auto& prop = GetDeviceProperty(device);
return prop.major >= 7; return prop.major >= 7;
#endif #endif
...@@ -149,23 +143,16 @@ inline bool TENSOR_CORE_AVAILABLE() { ...@@ -149,23 +143,16 @@ inline bool TENSOR_CORE_AVAILABLE() {
class DeviceGuard { class DeviceGuard {
public: public:
DeviceGuard(int newDevice) DeviceGuard(int new_id) : prev_id(CUDA_GET_DEVICE()) {
: previous_(CUDA_DEVICE()) { if (prev_id != new_id) CUDA_CHECK(cudaSetDevice(new_id));
if (previous_ != newDevice)
CUDA_CHECK(cudaSetDevice(newDevice));
} }
~DeviceGuard() { ~DeviceGuard() { CUDA_CHECK(cudaSetDevice(prev_id)); }
CUDA_CHECK(cudaSetDevice(previous_));
}
private: private:
int previous_; int prev_id;
}; };
#define CUDA_FP16_NOT_COMPILED \
LOG(FATAL) << "CUDA-FP16 was not compiled."
#else #else
#define CUDA_NOT_COMPILED \ #define CUDA_NOT_COMPILED \
......
...@@ -55,7 +55,6 @@ template<> class CUDNNType<double> { ...@@ -55,7 +55,6 @@ template<> class CUDNNType<double> {
typedef double BNParamType; typedef double BNParamType;
}; };
#ifdef WITH_CUDA_FP16
template<> class CUDNNType<float16> { template<> class CUDNNType<float16> {
public: public:
static const cudnnDataType_t type = CUDNN_DATA_HALF; static const cudnnDataType_t type = CUDNN_DATA_HALF;
...@@ -63,37 +62,63 @@ template<> class CUDNNType<float16> { ...@@ -63,37 +62,63 @@ template<> class CUDNNType<float16> {
static const void *one, *zero; static const void *one, *zero;
typedef float BNParamType; typedef float BNParamType;
}; };
#endif
template <typename T> template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, Tensor* tensor); void cudnnSetTensorDesc(
cudnnTensorDescriptor_t* desc,
Tensor* tensor);
template <typename T> template <typename T>
void cudnnSetTensor4dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, Tensor* tensor); void cudnnSetTensor4dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
Tensor* tensor);
template <typename T> template <typename T>
void cudnnSetTensor5dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, Tensor* tensor); void cudnnSetTensor5dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
Tensor* tensor);
template <typename T> template <typename T>
void cudnnSetTensor3dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, Tensor* tensor); void cudnnSetTensor3dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
Tensor* tensor);
template <typename T> template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, const std::vector<int64_t>& dims); void cudnnSetTensorDesc(
cudnnTensorDescriptor_t* desc,
const std::vector<int64_t>& dims);
template <typename T> template <typename T>
void cudnnSetTensor4dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, const std::vector<int64_t>& dims); void cudnnSetTensor4dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const std::vector<int64_t>& dims);
template <typename T> template <typename T>
void cudnnSetTensor4dDescWithGroup(cudnnTensorDescriptor_t* desc, const string& data_format, const std::vector<int64_t>& dims, const int64_t group); void cudnnSetTensor4dDescWithGroup(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const std::vector<int64_t>& dims,
const int64_t group);
template <typename T> template <typename T>
void cudnnSetTensor5dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, const std::vector<int64_t>& dims); void cudnnSetTensor5dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const std::vector<int64_t>& dims);
template <typename T> template <typename T>
void cudnnSetTensor3dDesc(cudnnTensorDescriptor_t* desc, const string& data_format, const std::vector<int64_t>& dims); void cudnnSetTensor3dDesc(
cudnnTensorDescriptor_t* desc,
const string& data_format,
const std::vector<int64_t>& dims);
template <typename T> template <typename T>
void cudnnSetTensorDesc(cudnnTensorDescriptor_t* desc, void cudnnSetTensorDesc(
cudnnTensorDescriptor_t* desc,
const std::vector<int64_t>& dims, const std::vector<int64_t>& dims,
const std::vector<int64_t>& strides); const std::vector<int64_t>& strides);
......
...@@ -69,7 +69,7 @@ template <typename T, class Context> ...@@ -69,7 +69,7 @@ template <typename T, class Context>
void RandomBernoulli( void RandomBernoulli(
const int n, const int n,
const float p, const float p,
uint32_t* x, T* x,
Context* ctx); Context* ctx);
/******************** Level-1 ********************/ /******************** Level-1 ********************/
......
...@@ -25,21 +25,21 @@ typedef int64_t TIndex; ...@@ -25,21 +25,21 @@ typedef int64_t TIndex;
template <typename T, class Context> template <typename T, class Context>
void Dropout( void Dropout(
const int count, const int count,
T prob, float prob,
T scale, float scale,
const T* x, const T* x,
uint32_t* mask, uint32_t* mask32,
uint8_t* mask8,
T* y, T* y,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename Tx, typename Tm, class Context>
void DropoutGrad( void ApplyMask(
const int count, const int count,
T prob, const float scale,
T scale, const Tx* x,
const T* dy, const Tm* mask,
const uint32_t* mask, Tx* y,
T* dx,
Context* ctx); Context* ctx);
/******************** activation.elu ********************/ /******************** activation.elu ********************/
...@@ -234,10 +234,95 @@ void Clip( ...@@ -234,10 +234,95 @@ void Clip(
const float low, const float low,
const float high, const float high,
const T* x, const T* x,
T* mask,
T* y, T* y,
Context* ctx); Context* ctx);
template <typename T, class Context>
void ClipGrad(
const int count,
const float low,
const float high,
const T* x,
const T* dy,
T* dx,
Context* ctx);
/******************** arithmetic.maximum ********************/
template <typename T, class Context>
void MaximumE(
const int count,
const T* x1,
const T* x2,
T* y,
Context* ctx);
template <typename T, class Context>
void MaximumB(
const int count,
const T* x1,
const T x2,
T* y,
Context* ctx);
template <typename T, class Context>
void MaximumEGrad(
const int count,
const T* x1,
const T* x2,
const T* dy,
T* dx1,
T* dx2,
Context* ctx);
template <typename T, class Context>
void MaximumBGrad(
const int count,
const T* x1,
const T x2,
const T* dy,
T* dx1,
/* T* dx2, */
Context* ctx);
/******************** arithmetic.minimum ********************/
template <typename T, class Context>
void MinimumE(
const int count,
const T* x1,
const T* x2,
T* y,
Context* ctx);
template <typename T, class Context>
void MinimumB(
const int count,
const T* x1,
const T x2,
T* y,
Context* ctx);
template <typename T, class Context>
void MinimumEGrad(
const int count,
const T* x1,
const T* x2,
const T* dy,
T* dx1,
T* dx2,
Context* ctx);
template <typename T, class Context>
void MinimumBGrad(
const int count,
const T* x1,
const T x2,
const T* dy,
T* dx1,
/* T* dx2, */
Context* ctx);
/******************** control_flow.compare ********************/ /******************** control_flow.compare ********************/
template <typename T, class Context> template <typename T, class Context>
...@@ -257,6 +342,34 @@ void AbsGrad( ...@@ -257,6 +342,34 @@ void AbsGrad(
T* dx, T* dx,
Context* ctx); Context* ctx);
/******************** loss.nll_loss ********************/
template <typename Tx, typename Ty, class Context>
void NLLLoss(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const Tx* log_prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
Context* ctx);
template <typename Tx, typename Ty, class Context>
void NLLLossGrad(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const Tx* prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
Tx* dx,
float* flags,
Context* ctx);
/******************** loss.sigmoid_cross_entropy ********************/ /******************** loss.sigmoid_cross_entropy ********************/
template <typename T, class Context> template <typename T, class Context>
...@@ -902,6 +1015,23 @@ void Col2Im2d( ...@@ -902,6 +1015,23 @@ void Col2Im2d(
T* im, T* im,
Context* ctx); Context* ctx);
/******************** vision.drop_block ********************/
template <class Context>
void DropBlock2d(
const int N,
const int C,
const int H,
const int W,
const int seed_h,
const int seed_w,
const int block_size,
const float gamma,
const string& data_format,
uint32_t* seed,
int* mask,
Context* ctx);
/******************** vision.nn_resize ********************/ /******************** vision.nn_resize ********************/
template <typename T, class Context> template <typename T, class Context>
......
...@@ -111,7 +111,7 @@ void Axpby( ...@@ -111,7 +111,7 @@ void Axpby(
const T beta, const T beta,
T* y); T* y);
} // namespace ssd } // namespace sse
} // namespace dragon } // namespace dragon
......
...@@ -18,11 +18,11 @@ ...@@ -18,11 +18,11 @@
#include <iostream> #include <iostream>
#include <cstdlib> #include <cstdlib>
#include "utils/cast.h"
namespace dragon { namespace dragon {
inline std::vector<std::string> SplitString( namespace str {
inline std::vector<std::string> split(
const std::string& str, const std::string& str,
const std::string& c) { const std::string& c) {
std::vector<std::string> ret; std::vector<std::string> ret;
...@@ -36,17 +36,7 @@ inline std::vector<std::string> SplitString( ...@@ -36,17 +36,7 @@ inline std::vector<std::string> SplitString(
return ret; return ret;
} }
#define DEFINE_NUMBER2STRING(T) \ } // namespace str
template<> inline std::string dragon_cast<std::string, T>(T val) { \
std::stringstream ss; ss << val; return ss.str(); \
}
DEFINE_NUMBER2STRING(int);
DEFINE_NUMBER2STRING(unsigned long long);
template<> inline int dragon_cast<int, std::string>(std::string val) {
return atoi(val.c_str());
}
} // namespace dragon } // namespace dragon
......
...@@ -2,6 +2,7 @@ message(STATUS "Found CXX Module: ${CMAKE_CURRENT_LIST_DIR}") ...@@ -2,6 +2,7 @@ message(STATUS "Found CXX Module: ${CMAKE_CURRENT_LIST_DIR}")
FILE(GLOB_RECURSE MODULE_FILES *.h *.hpp *.c *.cpp *.cu *.cc) FILE(GLOB_RECURSE MODULE_FILES *.h *.hpp *.c *.cpp *.cu *.cc)
FILE(GLOB_RECURSE SRC_FILES ../../src/*.c ../../src/*.cpp ../../src/*.cu ../../src/*.cc) FILE(GLOB_RECURSE SRC_FILES ../../src/*.c ../../src/*.cpp ../../src/*.cu ../../src/*.cc)
LIST(REMOVE_ITEM SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/../../src/operators/misc/python_op.cc)
# ---[ Target # ---[ Target
if (WITH_CUDA) if (WITH_CUDA)
...@@ -36,7 +37,9 @@ if(WIN32) ...@@ -36,7 +37,9 @@ if(WIN32)
TARGET_LINK_LIBRARIES(${PROJECT_NAME}_cxx shlwapi.lib) TARGET_LINK_LIBRARIES(${PROJECT_NAME}_cxx shlwapi.lib)
endif() endif()
SET_TARGET_PROPERTIES(${PROJECT_NAME}_cxx PROPERTIES OUTPUT_NAME dragon_cxx) SET_TARGET_PROPERTIES(${PROJECT_NAME}_cxx PROPERTIES OUTPUT_NAME dragon)
SET_TARGET_PROPERTIES(${PROJECT_NAME}_cxx PROPERTIES DEFINE_SYMBOL DRAGON_CXX_EXPORTS)
# ---[ Install # ---[ Install
INSTALL(TARGETS ${PROJECT_NAME}_cxx DESTINATION ${PROJECT_BINARY_DIR}/../lib) INSTALL(TARGETS ${PROJECT_NAME}_cxx DESTINATION ${PROJECT_BINARY_DIR}/../api/lib)
\ No newline at end of file FILE(INSTALL dragon.h DESTINATION ${PROJECT_BINARY_DIR}/../api/include)
\ No newline at end of file
#include "dragon.h" #include "dragon.h"
#include "core/common.h" #include "utils/logging.h"
namespace dragon { namespace dragon {
......
...@@ -12,6 +12,12 @@ ...@@ -12,6 +12,12 @@
namespace dragon { namespace dragon {
/* * * * * * * * * * * * * * * * * * * * *
* *
* Workspace *
* *
* * * * * * * * * * * * * * * * * * * * */
Map<string, unique_ptr < Workspace > > g_workspaces; Map<string, unique_ptr < Workspace > > g_workspaces;
Map<string, vector<string> > sub_workspaces; Map<string, vector<string> > sub_workspaces;
std::mutex g_mutex; std::mutex g_mutex;
...@@ -29,7 +35,8 @@ Workspace* CreateWorkspace(const std::string& name){ ...@@ -29,7 +35,8 @@ Workspace* CreateWorkspace(const std::string& name){
Workspace* ResetWorkspace(const std::string& name) { Workspace* ResetWorkspace(const std::string& name) {
std::unique_lock<std::mutex> lock(g_mutex); std::unique_lock<std::mutex> lock(g_mutex);
CHECK(g_workspaces.count(name)) CHECK(g_workspaces.count(name))
<< "\nWorkspace(" << name << ") does not exist, can not be reset."; << "\nWorkspace(" << name << ") does not exist."
<< "\nCan not be reset.";
LOG(INFO) << "Reset the Workspace(" << name << ")."; LOG(INFO) << "Reset the Workspace(" << name << ").";
g_workspaces[name].reset(new Workspace(name)); g_workspaces[name].reset(new Workspace(name));
for (auto& sub_workspace : sub_workspaces[name]) { for (auto& sub_workspace : sub_workspaces[name]) {
...@@ -43,7 +50,8 @@ Workspace* ResetWorkspace(const std::string& name) { ...@@ -43,7 +50,8 @@ Workspace* ResetWorkspace(const std::string& name) {
void ReleaseWorkspace(const std::string& name) { void ReleaseWorkspace(const std::string& name) {
std::unique_lock<std::mutex> lock(g_mutex); std::unique_lock<std::mutex> lock(g_mutex);
CHECK(g_workspaces.count(name)) CHECK(g_workspaces.count(name))
<< "\nWorkspace(" << name << ") does not exist, can not be released."; << "\nWorkspace(" << name << ") does not exist."
<< "\nCan not be released.";
LOG(INFO) << "Release the Workspace(" << name << ")."; LOG(INFO) << "Release the Workspace(" << name << ").";
g_workspaces[name].reset(); g_workspaces[name].reset();
g_workspaces.erase(name); g_workspaces.erase(name);
...@@ -61,6 +69,12 @@ void MoveWorkspace( ...@@ -61,6 +69,12 @@ void MoveWorkspace(
<< "into the Workspace(" << target_ws->name() << ")."; << "into the Workspace(" << target_ws->name() << ").";
} }
/* * * * * * * * * * * * * * * * * * * * *
* *
* Graph *
* *
* * * * * * * * * * * * * * * * * * * * */
std::string CreateGraph( std::string CreateGraph(
const std::string& graph_file, const std::string& graph_file,
Workspace* ws) { Workspace* ws) {
...@@ -102,6 +116,19 @@ std::string CreateGraph( ...@@ -102,6 +116,19 @@ std::string CreateGraph(
return meta_graph.name(); return meta_graph.name();
} }
void RunGraph(
const std::string& graph_name,
Workspace* ws,
const int stream_id) {
ws->RunGraph(graph_name, "", "", stream_id);
}
/* * * * * * * * * * * * * * * * * * * * *
* *
* Tensor *
* *
* * * * * * * * * * * * * * * * * * * * */
void CreateTensor( void CreateTensor(
const std::string& name, const std::string& name,
Workspace* ws) { Workspace* ws) {
...@@ -109,6 +136,32 @@ void CreateTensor( ...@@ -109,6 +136,32 @@ void CreateTensor(
} }
template <typename T> template <typename T>
T* FetchTensor(
const std::string& name,
vector<TIndex>& shape,
Workspace* ws){
if (!ws->HasTensor(name)){
LOG(FATAL) << "Tensor(" << name << ")"
<< " doesn't exist, try create it before.";
}
Tensor* tensor = ws->GetTensor(name);
if (tensor->meta().id() == 0){
LOG(FATAL) << "Tensor(" << name << ")"
<< " has not been computed yet";
}
shape = tensor->dims();
void* data = malloc(tensor->nbytes());
if (tensor->memory_state() == MixedMemory::STATE_AT_CUDA) {
CUDAContext::Memcpy<CPUContext, CUDAContext>(
tensor->nbytes(), data, tensor->raw_data<CUDAContext>());
} else {
CPUContext::Memcpy<CPUContext, CPUContext>(
tensor->nbytes(), data, tensor->raw_data<CPUContext>());
}
return static_cast<T*>(data);
}
template <typename T>
void FeedTensor( void FeedTensor(
const std::string& name, const std::string& name,
const vector<TIndex>& shape, const vector<TIndex>& shape,
...@@ -135,6 +188,12 @@ void FeedTensor( ...@@ -135,6 +188,12 @@ void FeedTensor(
} }
} }
/* * * * * * * * * * * * * * * * * * * * *
* *
* I / O *
* *
* * * * * * * * * * * * * * * * * * * * */
void TransplantCaffeModel( void TransplantCaffeModel(
const std::string& input_model, const std::string& input_model,
const std::string& output_model) { const std::string& output_model) {
...@@ -146,7 +205,7 @@ void TransplantCaffeModel( ...@@ -146,7 +205,7 @@ void TransplantCaffeModel(
const string& layer_name = layer.name(); const string& layer_name = layer.name();
string prefix = layer_name + "/param:"; string prefix = layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++) { for (int j = 0; j < layer.blobs_size(); j++) {
string tensor_name = prefix + dragon_cast<string, int>(j); string tensor_name = prefix + std::to_string(j);
BlobProto blob = layer.blobs(j); BlobProto blob = layer.blobs(j);
TensorProto* proto = protos.add_protos(); TensorProto* proto = protos.add_protos();
proto->set_data_type(TensorProto_DataType_FLOAT); proto->set_data_type(TensorProto_DataType_FLOAT);
...@@ -218,7 +277,7 @@ void LoadCaffemodel( ...@@ -218,7 +277,7 @@ void LoadCaffemodel(
const string& layer_name = layer.name(); const string& layer_name = layer.name();
string prefix = scope + layer_name + "/param:"; string prefix = scope + layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++){ for (int j = 0; j < layer.blobs_size(); j++){
string tensor_name = prefix + dragon_cast<string, int>(j); string tensor_name = prefix + std::to_string(j);
if (!ws->HasTensor(tensor_name)) if (!ws->HasTensor(tensor_name))
ws->CreateTensor(tensor_name); ws->CreateTensor(tensor_name);
BlobProto blob = layer.blobs(j); BlobProto blob = layer.blobs(j);
...@@ -248,63 +307,54 @@ void LoadCaffemodel( ...@@ -248,63 +307,54 @@ void LoadCaffemodel(
} }
} }
void RunGraph( /* * * * * * * * * * * * * * * * * * * * *
const std::string& graph_name, * *
Workspace* ws, * Config *
const int stream_id) { * *
ws->RunGraph(graph_name, "", "", stream_id); * * * * * * * * * * * * * * * * * * * * */
}
template <typename T>
T* FetchTensor(
const std::string& name,
vector<TIndex>& shape,
Workspace* ws){
if (!ws->HasTensor(name)){
LOG(FATAL) << "Tensor(" << name << ")"
<< " doesn't exist, try create it before.";
}
Tensor* tensor = ws->GetTensor(name);
if (tensor->meta().id() == 0){
LOG(FATAL) << "Tensor(" << name << ")"
<< " has not been computed yet";
}
shape = tensor->dims();
void* data = malloc(tensor->nbytes());
if (tensor->memory_state() == MixedMemory::STATE_AT_CUDA) {
CUDAContext::Memcpy<CPUContext, CUDAContext>(
tensor->nbytes(), data, tensor->raw_data<CUDAContext>());
} else {
CPUContext::Memcpy<CPUContext, CPUContext>(
tensor->nbytes(), data, tensor->raw_data<CPUContext>());
}
return static_cast<T*>(data);
}
void SetLogLevel(const std::string& level) { void SetLogLevel(const std::string& level) {
SetLogDestination(StrToLogSeverity(level)); SetLogDestination(StrToLogSeverity(level));
} }
template float* FetchTensor<float>( /* * * * * * * * * * * * * * * * * * * * *
* *
* Template *
* *
* * * * * * * * * * * * * * * * * * * * */
template DRAGON_API float* FetchTensor<float>(
const std::string&, const std::string&,
std::vector<TIndex>&, std::vector<TIndex>&,
Workspace*); Workspace*);
template void FeedTensor<float>( template DRAGON_API float16* FetchTensor<float16>(
const std::string&,
std::vector<TIndex>&,
Workspace*);
template DRAGON_API void FeedTensor<float>(
const std::string&, const std::string&,
const std::vector<TIndex>&, const std::vector<TIndex>&,
const float*, const float*,
const Device&, const Device&,
Workspace*); Workspace*);
template void FeedTensor<int>( template DRAGON_API void FeedTensor<float16>(
const std::string&,
const std::vector<TIndex>&,
const float16*,
const Device&,
Workspace*);
template DRAGON_API void FeedTensor<int>(
const std::string&, const std::string&,
const std::vector<TIndex>&, const std::vector<TIndex>&,
const int*, const int*,
const Device&, const Device&,
Workspace*); Workspace*);
template void FeedTensor<uint8_t>( template DRAGON_API void FeedTensor<uint8_t>(
const std::string&, const std::string&,
const std::vector<TIndex>&, const std::vector<TIndex>&,
const uint8_t*, const uint8_t*,
......
...@@ -16,10 +16,28 @@ ...@@ -16,10 +16,28 @@
#include <cstdint> #include <cstdint>
#include <vector> #include <vector>
#ifdef WIN32 #ifdef _MSC_VER
#define EXPORT __declspec(dllexport) #ifdef DRAGON_CXX_EXPORTS
#define DRAGON_API __declspec(dllexport)
#else
#define DRAGON_API __declspec(dllimport)
#endif
#else #else
#define EXPORT #define DRAGON_API
#endif
/* * * * * * * * * * * * * * * * * * * * *
* *
* Internal Headers *
* *
* * * * * * * * * * * * * * * * * * * * */
#ifdef DRAGON_CXX_EXPORTS
#include "core/types.h"
#else
namespace dragon {
struct float16;
}
#endif #endif
namespace dragon { namespace dragon {
...@@ -28,72 +46,102 @@ typedef int64_t TIndex; ...@@ -28,72 +46,102 @@ typedef int64_t TIndex;
class Workspace; class Workspace;
class Device { class DRAGON_API Device {
public: public:
EXPORT Device(); Device();
EXPORT explicit Device(std::string device_type); explicit Device(std::string device_type);
EXPORT Device(std::string device_type, int device_id); Device(std::string device_type, int device_id);
EXPORT const int& device_type() const { return device_type_; } const int& device_type() const { return device_type_; }
EXPORT const int device_id() const { return device_id_; } const int device_id() const { return device_id_; }
private: private:
int device_type_, device_id_; int device_type_, device_id_;
}; };
EXPORT Workspace* CreateWorkspace(const std::string& name); /* * * * * * * * * * * * * * * * * * * * *
* *
* Workspace *
* *
* * * * * * * * * * * * * * * * * * * * */
EXPORT Workspace* ResetWorkspace(const std::string& name); DRAGON_API Workspace* CreateWorkspace(const std::string& name);
EXPORT void ReleaseWorkspace(const std::string& name); DRAGON_API Workspace* ResetWorkspace(const std::string& name);
EXPORT void MoveWorkspace(Workspace* main, Workspace* sub); DRAGON_API void ReleaseWorkspace(const std::string& name);
EXPORT std::string CreateGraph( DRAGON_API void MoveWorkspace(Workspace* main, Workspace* sub);
/* * * * * * * * * * * * * * * * * * * * *
* *
* Graph *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API std::string CreateGraph(
const std::string& graph_file, const std::string& graph_file,
Workspace* ws); Workspace* ws);
EXPORT std::string CreateGraph( DRAGON_API std::string CreateGraph(
const std::string& graph_file, const std::string& graph_file,
const Device& device, const Device& device,
Workspace* ws); Workspace* ws);
EXPORT void RunGraph( DRAGON_API void RunGraph(
const std::string& graph_name, const std::string& graph_name,
Workspace* ws, Workspace* ws,
const int stream_id = 1); const int stream_id = 1);
EXPORT void CreateTensor( /* * * * * * * * * * * * * * * * * * * * *
* *
* Tensor *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API void CreateTensor(
const std::string& name, const std::string& name,
Workspace* ws); Workspace* ws);
template <typename T> template <typename T>
EXPORT void FeedTensor( DRAGON_API T* FetchTensor(
const std::string& name, const std::string& name,
const std::vector<TIndex>& shape, std::vector<TIndex>& shape,
const T* data,
const Device& device,
Workspace* ws); Workspace* ws);
template <typename T> template <typename T>
EXPORT T* FetchTensor( DRAGON_API void FeedTensor(
const std::string& name, const std::string& name,
std::vector<TIndex>& shape, const std::vector<TIndex>& shape,
const T* data,
const Device& device,
Workspace* ws); Workspace* ws);
EXPORT void LoadCaffemodel( /* * * * * * * * * * * * * * * * * * * * *
* *
* I / O *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API void LoadCaffemodel(
const std::string& model_file, const std::string& model_file,
Workspace* ws); Workspace* ws);
EXPORT void TransplantCaffeModel( DRAGON_API void TransplantCaffeModel(
const std::string& input_model, const std::string& input_model,
const std::string& output_model); const std::string& output_model);
EXPORT void LoadDragonmodel( DRAGON_API void LoadDragonmodel(
const std::string& model_file, const std::string& model_file,
Workspace* ws); Workspace* ws);
EXPORT void SetLogLevel(const std::string& level); /* * * * * * * * * * * * * * * * * * * * *
* *
* Config *
* *
* * * * * * * * * * * * * * * * * * * * */
DRAGON_API void SetLogLevel(const std::string& level);
} // namespace dragon } // namespace dragon
......
...@@ -19,7 +19,8 @@ Workspace* ws() { return g_workspace; } ...@@ -19,7 +19,8 @@ Workspace* ws() { return g_workspace; }
TypeId CTypeToFetcher(TypeId type) { TypeId CTypeToFetcher(TypeId type) {
static Map<TypeId,TypeId> c_type_map { static Map<TypeId,TypeId> c_type_map {
{ TypeMeta::Id<uint8_t>(), TypeMeta::Id<NumpyFetcher>() }, { TypeMeta::Id<int8>(), TypeMeta::Id<NumpyFetcher>() },
{ TypeMeta::Id<uint8>(), TypeMeta::Id<NumpyFetcher>() },
{ TypeMeta::Id<int>(), TypeMeta::Id<NumpyFetcher>() }, { TypeMeta::Id<int>(), TypeMeta::Id<NumpyFetcher>() },
{ TypeMeta::Id<int64_t>(), TypeMeta::Id<NumpyFetcher>() }, { TypeMeta::Id<int64_t>(), TypeMeta::Id<NumpyFetcher>() },
{ TypeMeta::Id<float>(), TypeMeta::Id<NumpyFetcher>() }, { TypeMeta::Id<float>(), TypeMeta::Id<NumpyFetcher>() },
...@@ -197,6 +198,11 @@ inline PyObject* FeedTensorCC(PyObject* self, PyObject* args) { ...@@ -197,6 +198,11 @@ inline PyObject* FeedTensorCC(PyObject* self, PyObject* args) {
} }
} }
inline PyObject* OnModuleExitCC(PyObject* self, PyObject* args) {
g_workspaces.clear();
Py_RETURN_TRUE;
}
#define PYFUNC(name) {#name, name, METH_VARARGS, ""} #define PYFUNC(name) {#name, name, METH_VARARGS, ""}
#define PYENDFUNC {nullptr, nullptr, 0, nullptr} #define PYENDFUNC {nullptr, nullptr, 0, nullptr}
...@@ -255,6 +261,7 @@ PyMethodDef* GetAllMethods() { ...@@ -255,6 +261,7 @@ PyMethodDef* GetAllMethods() {
PYFUNC(SnapshotCC), PYFUNC(SnapshotCC),
/**** Config ****/ /**** Config ****/
PYFUNC(SetLogLevelCC), PYFUNC(SetLogLevelCC),
PYFUNC(OnModuleExitCC),
PYENDFUNC, PYENDFUNC,
}; };
return g_python_methods; return g_python_methods;
...@@ -272,9 +279,11 @@ void common_init() { ...@@ -272,9 +279,11 @@ void common_init() {
} }
#ifdef WITH_PYTHON3 #ifdef WITH_PYTHON3
static struct PyModuleDef libdragon = { PyModuleDef_HEAD_INIT, static struct PyModuleDef libdragon = {
PyModuleDef_HEAD_INIT,
"libdragon", "", -1, "libdragon", "", -1,
GetAllMethods() }; GetAllMethods()
};
PyMODINIT_FUNC PyInit_libdragon(void) { PyMODINIT_FUNC PyInit_libdragon(void) {
PyObject* module = PyModule_Create(&libdragon); PyObject* module = PyModule_Create(&libdragon);
...@@ -285,7 +294,8 @@ PyMODINIT_FUNC PyInit_libdragon(void) { ...@@ -285,7 +294,8 @@ PyMODINIT_FUNC PyInit_libdragon(void) {
#else // WITH_PYTHON2 #else // WITH_PYTHON2
PyMODINIT_FUNC initlibdragon(void) { PyMODINIT_FUNC initlibdragon(void) {
PyObject* moudle = Py_InitModule("libdragon", GetAllMethods()); PyObject* moudle = Py_InitModule(
"libdragon", GetAllMethods());
if (moudle == nullptr) return; if (moudle == nullptr) return;
common_init(); common_init();
} }
......
...@@ -31,7 +31,8 @@ class TensorFetcherBase { ...@@ -31,7 +31,8 @@ class TensorFetcherBase {
class TensorFeederBase { class TensorFeederBase {
public: public:
virtual ~TensorFeederBase() {} virtual ~TensorFeederBase() {}
virtual PyObject* Feed(const DeviceOption& option, virtual PyObject* Feed(
const DeviceOption& option,
PyArrayObject* array, PyArrayObject* array,
Tensor* tensor) = 0; Tensor* tensor) = 0;
}; };
...@@ -61,7 +62,7 @@ class NumpyFetcher : public TensorFetcherBase { ...@@ -61,7 +62,7 @@ class NumpyFetcher : public TensorFetcherBase {
PyErr_SetString(PyExc_RuntimeError, s.c_str()); PyErr_SetString(PyExc_RuntimeError, s.c_str());
return nullptr; return nullptr;
} }
// create a empty array with r shape // create a empty array with the same shape
PyObject* array = PyArray_SimpleNew( PyObject* array = PyArray_SimpleNew(
tensor.ndim(), npy_dims.data(), npy_type); tensor.ndim(), npy_dims.data(), npy_type);
// copy the tensor data to the numpy array // copy the tensor data to the numpy array
...@@ -88,7 +89,8 @@ class StringFetcher : public TensorFetcherBase { ...@@ -88,7 +89,8 @@ class StringFetcher : public TensorFetcherBase {
class NumpyFeeder : public TensorFeederBase { class NumpyFeeder : public TensorFeederBase {
public: public:
PyObject* Feed(const DeviceOption& option, PyObject* Feed(
const DeviceOption& option,
PyArrayObject* original_array, PyArrayObject* original_array,
Tensor* tensor) override { Tensor* tensor) override {
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array); PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
...@@ -100,7 +102,6 @@ class NumpyFeeder : public TensorFeederBase { ...@@ -100,7 +102,6 @@ class NumpyFeeder : public TensorFeederBase {
if (meta.id() != tensor->meta().id() && tensor->meta().id() != 0) if (meta.id() != tensor->meta().id() && tensor->meta().id() != 0)
LOG(WARNING) << "Feed Tensor(" << tensor->name() << ")" LOG(WARNING) << "Feed Tensor(" << tensor->name() << ")"
<< " with different data type from original one."; << " with different data type from original one.";
tensor->SetMeta(meta);
int ndim = PyArray_NDIM(array); int ndim = PyArray_NDIM(array);
npy_intp* npy_dims = PyArray_DIMS(array); npy_intp* npy_dims = PyArray_DIMS(array);
vector<TIndex> dims; vector<TIndex> dims;
...@@ -110,16 +111,16 @@ class NumpyFeeder : public TensorFeederBase { ...@@ -110,16 +111,16 @@ class NumpyFeeder : public TensorFeederBase {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CUDAContext context(option); CUDAContext context(option);
context.SwitchToDevice(); context.SwitchToDevice();
auto* data = tensor->raw_mutable_data<CUDAContext>(meta);
context.Memcpy<CUDAContext, CPUContext>(tensor->nbytes(), context.Memcpy<CUDAContext, CPUContext>(tensor->nbytes(),
tensor->raw_mutable_data<CUDAContext>(), data, static_cast<void*>(PyArray_DATA(array)));
static_cast<void*>(PyArray_DATA(array)));
#else #else
LOG(FATAL) << "CUDA was not compiled."; LOG(FATAL) << "CUDA was not compiled.";
#endif #endif
} else { } else {
auto* data = tensor->raw_mutable_data<CPUContext>(meta);
CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(), CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
tensor->raw_mutable_data<CPUContext>(), data, static_cast<void*>(PyArray_DATA(array)));
static_cast<void*>(PyArray_DATA(array)));
} }
Py_XDECREF(array); Py_XDECREF(array);
Py_RETURN_TRUE; Py_RETURN_TRUE;
......
...@@ -25,4 +25,4 @@ inline PyObject* IsCUDADriverSufficientCC(PyObject* self, PyObject* args) { ...@@ -25,4 +25,4 @@ inline PyObject* IsCUDADriverSufficientCC(PyObject* self, PyObject* args) {
#endif #endif
} }
#endif // DRAGON_PYTHON_PY_MPI_H_ #endif // DRAGON_PYTHON_PY_CUDA_H_
\ No newline at end of file \ No newline at end of file
...@@ -94,7 +94,6 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) { ...@@ -94,7 +94,6 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
if (meta.id() != tensor->meta().id() && tensor->meta().id() != 0) if (meta.id() != tensor->meta().id() && tensor->meta().id() != 0)
LOG(WARNING) << "Set Tensor(" << tensor->name() << ")" LOG(WARNING) << "Set Tensor(" << tensor->name() << ")"
<< " with different data type from original one."; << " with different data type from original one.";
tensor->SetMeta(meta);
int ndim = PyList_Size(shape); int ndim = PyList_Size(shape);
CHECK_GT(ndim, 0) CHECK_GT(ndim, 0)
<< "\nThe len of shape should be greater than 1. Got " << ndim << "."; << "\nThe len of shape should be greater than 1. Got " << ndim << ".";
...@@ -112,9 +111,9 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) { ...@@ -112,9 +111,9 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
if (dev_opt.device_type() == CUDA) { if (dev_opt.device_type() == CUDA) {
CUDAContext ctx(dev_opt); CUDAContext ctx(dev_opt);
ctx.SwitchToDevice(); ctx.SwitchToDevice();
tensor->raw_mutable_data<CUDAContext>(); tensor->raw_mutable_data<CUDAContext>(meta);
} else { } else {
tensor->raw_mutable_data<CPUContext>(); tensor->raw_mutable_data<CPUContext>(meta);
} }
Py_RETURN_TRUE; Py_RETURN_TRUE;
} }
...@@ -173,19 +172,19 @@ PyObject* TensorFromTensorCC(PyObject* self, PyObject* args) { ...@@ -173,19 +172,19 @@ PyObject* TensorFromTensorCC(PyObject* self, PyObject* args) {
Tensor* srcT = ws()->GetTensor(src_name); Tensor* srcT = ws()->GetTensor(src_name);
Tensor* dstT = ws()->CreateTensor(dst_name); Tensor* dstT = ws()->CreateTensor(dst_name);
dstT->ReshapeLike(*srcT); dstT->ReshapeLike(*srcT);
dstT->SetMeta(srcT->meta()); const TypeMeta& meta = srcT->meta();
if (dst_ctx.device_type() == DeviceType::CUDA) { if (dst_ctx.device_type() == DeviceType::CUDA) {
if (src_ctx.device_type() == DeviceType::CUDA) { if (src_ctx.device_type() == DeviceType::CUDA) {
// CUDA <- CUDA // CUDA <- CUDA
CUDAContext::Memcpy<CUDAContext, CUDAContext>( CUDAContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(), srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(), dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CUDAContext>()); srcT->raw_data<CUDAContext>());
} else { } else {
// CUDA <- CPU // CUDA <- CPU
CUDAContext::Memcpy<CUDAContext, CUDAContext>( CUDAContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(), srcT->nbytes(),
dstT->raw_mutable_data<CUDAContext>(), dstT->raw_mutable_data<CUDAContext>(meta),
srcT->raw_data<CPUContext>()); srcT->raw_data<CPUContext>());
} }
} else { } else {
...@@ -193,13 +192,13 @@ PyObject* TensorFromTensorCC(PyObject* self, PyObject* args) { ...@@ -193,13 +192,13 @@ PyObject* TensorFromTensorCC(PyObject* self, PyObject* args) {
// CPU <- CUDA // CPU <- CUDA
CUDAContext::Memcpy<CUDAContext, CUDAContext>( CUDAContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(), srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(), dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CUDAContext>()); srcT->raw_data<CUDAContext>());
} else { } else {
// CPU <- CPU // CPU <- CPU
CUDAContext::Memcpy<CUDAContext, CUDAContext>( CUDAContext::Memcpy<CUDAContext, CUDAContext>(
srcT->nbytes(), srcT->nbytes(),
dstT->raw_mutable_data<CPUContext>(), dstT->raw_mutable_data<CPUContext>(meta),
srcT->raw_data<CPUContext>()); srcT->raw_data<CPUContext>());
} }
} }
......
...@@ -23,8 +23,8 @@ inline const int TypeMetaToNPY(const TypeMeta& meta) { ...@@ -23,8 +23,8 @@ inline const int TypeMetaToNPY(const TypeMeta& meta) {
{ TypeMeta::Id<int64_t>(), NPY_INT64 }, { TypeMeta::Id<int64_t>(), NPY_INT64 },
{ TypeMeta::Id<double>(), NPY_FLOAT64 }, { TypeMeta::Id<double>(), NPY_FLOAT64 },
{ TypeMeta::Id<float16>(), NPY_FLOAT16 }, { TypeMeta::Id<float16>(), NPY_FLOAT16 },
{ TypeMeta::Id<uint8_t>(), NPY_UINT8 }, { TypeMeta::Id<uint8>(), NPY_UINT8 },
{ TypeMeta::Id<char>(), NPY_INT8 } { TypeMeta::Id<int8>(), NPY_INT8 }
}; };
return m2npy_type_map.count(meta.id()) ? m2npy_type_map[meta.id()] : -1; return m2npy_type_map.count(meta.id()) ? m2npy_type_map[meta.id()] : -1;
} }
...@@ -36,11 +36,12 @@ inline const TypeMeta& TypeNPYToMeta(int npy_type) { ...@@ -36,11 +36,12 @@ inline const TypeMeta& TypeNPYToMeta(int npy_type) {
{ NPY_INT64, TypeMeta::Make<int64_t>() }, { NPY_INT64, TypeMeta::Make<int64_t>() },
{ NPY_FLOAT64, TypeMeta::Make<double>() }, { NPY_FLOAT64, TypeMeta::Make<double>() },
{ NPY_FLOAT16, TypeMeta::Make<float16>() }, { NPY_FLOAT16, TypeMeta::Make<float16>() },
{ NPY_UINT8, TypeMeta::Make<uint8_t>() }, { NPY_UINT8, TypeMeta::Make<uint8>() },
{ NPY_INT8, TypeMeta::Make<char>() }, { NPY_INT8, TypeMeta::Make<int8>() },
}; };
static TypeMeta unknown_type; static TypeMeta unknown_type;
return npy2m_type_map.count(npy_type) ? npy2m_type_map[npy_type] : unknown_type; return npy2m_type_map.count(npy_type) ?
npy2m_type_map[npy_type] : unknown_type;
} }
#endif // DRAGON_PYTHON_PY_TYPES_H_ #endif // DRAGON_PYTHON_PY_TYPES_H_
\ No newline at end of file
...@@ -26,11 +26,11 @@ option = {} ...@@ -26,11 +26,11 @@ option = {}
REGISTERED_OPERATORS = set(s for s in RegisteredOperatorsCC()) REGISTERED_OPERATORS = set(s for s in RegisteredOperatorsCC())
NO_GRADIENT_OPERATORS = set(s for s in NoGradientOperatorsCC()) NO_GRADIENT_OPERATORS = set(s for s in NoGradientOperatorsCC())
# The current device, 'CPU' or 'CUDA' # The current device, 'CPU', 'CUDA' or 'CNML'
option['device'] = 'CPU' option['device'] = 'CPU'
# The device id # The device id
option['gpu_id'] = 0 option['device_id'] = 0
# Whether to use cuDNN if possible # Whether to use cuDNN if possible
option['use_cudnn'] = False option['use_cudnn'] = False
...@@ -44,6 +44,9 @@ option['debug_mode'] = False ...@@ -44,6 +44,9 @@ option['debug_mode'] = False
# Whether to share grads # Whether to share grads
option['share_grads'] = True option['share_grads'] = True
# Optional graph type
option['graph_type'] = ''
# Whether to log the meta graphs # Whether to log the meta graphs
option['log_meta_graph'] = False option['log_meta_graph'] = False
...@@ -84,7 +87,7 @@ def IsCUDADriverSufficient(): ...@@ -84,7 +87,7 @@ def IsCUDADriverSufficient():
def EnableCUDA(gpu_id=0, use_cudnn=True): def EnableCUDA(gpu_id=0, use_cudnn=True):
"""Enable CUDA mode globally. """Enable NVIDIA's CUDA mode globally.
Parameters Parameters
---------- ----------
...@@ -100,9 +103,28 @@ def EnableCUDA(gpu_id=0, use_cudnn=True): ...@@ -100,9 +103,28 @@ def EnableCUDA(gpu_id=0, use_cudnn=True):
""" """
global option global option
option['device'] = 'CUDA' option['device'] = 'CUDA'
option['gpu_id'] = gpu_id option['device_id'] = gpu_id
option['use_cudnn'] = use_cudnn option['use_cudnn'] = use_cudnn
def EnableCNML(mlu_id=0):
"""Enable Cambricon's CNML mode globally.
Parameters
----------
device_id : int
The id of MLU to use.
Returns
-------
None
"""
global option
option['device'] = 'CNML'
option['device_id'] = mlu_id
# TODO(PhyscalX): please not use @setter # TODO(PhyscalX): please not use @setter
# TODO(PhyscalX): seems that it can't change the global value # TODO(PhyscalX): seems that it can't change the global value
...@@ -133,7 +155,6 @@ def GetRandomSeed(): ...@@ -133,7 +155,6 @@ def GetRandomSeed():
The global random seed. The global random seed.
""" """
global option
return option['random_seed'] return option['random_seed']
...@@ -151,7 +172,7 @@ def SetGPU(id): ...@@ -151,7 +172,7 @@ def SetGPU(id):
""" """
global option global option
option['gpu_id'] = id option['device_id'] = id
def GetGPU(): def GetGPU():
...@@ -163,8 +184,7 @@ def GetGPU(): ...@@ -163,8 +184,7 @@ def GetGPU():
The global id of GPU. The global id of GPU.
""" """
global option return option['device_id']
return option['gpu_id']
def SetDebugMode(enabled=True): def SetDebugMode(enabled=True):
...@@ -186,6 +206,25 @@ def SetDebugMode(enabled=True): ...@@ -186,6 +206,25 @@ def SetDebugMode(enabled=True):
option['debug_mode'] = enabled option['debug_mode'] = enabled
def SetGraphType(graph_type=''):
"""Set the graph type.
If empty, the default DAG graph will be used.
Parameters
----------
graph_type : str
The graph type.
Returns
-------
None
"""
global option
option['graph_type'] = graph_type
def LogMetaGraph(enabled=True): def LogMetaGraph(enabled=True):
"""Enable to log meta graph globally. """Enable to log meta graph globally.
......
...@@ -737,7 +737,7 @@ class Tensor(object): ...@@ -737,7 +737,7 @@ class Tensor(object):
Parameters Parameters
---------- ----------
new_value : basic type, list or numpy.ndarray new_value : number, list or numpy.ndarray
The values to set. The values to set.
Returns Returns
......
...@@ -325,5 +325,7 @@ def GetTensorInfo(tensor, stream=1): ...@@ -325,5 +325,7 @@ def GetTensorInfo(tensor, stream=1):
info['mem'].append('CPU'); info['device_id'] = 0 info['mem'].append('CPU'); info['device_id'] = 0
if 'CUDA' in info: if 'CUDA' in info:
info['mem'].append('CUDA'); info['device_id'] = int(info['CUDA']) info['mem'].append('CUDA'); info['device_id'] = int(info['CUDA'])
if 'CNML' in info:
info['mem'].append('CNML'); info['device_id'] = int(info['CNML'])
info['init'] = len(info['mem']) > 0 info['init'] = len(info['mem']) > 0
return info return info
\ No newline at end of file
...@@ -439,7 +439,7 @@ def FetchTensor(tensor): ...@@ -439,7 +439,7 @@ def FetchTensor(tensor):
Returns Returns
------- -------
numpy.ndarray ndarray
The values copied from the backend. The values copied from the backend.
References References
...@@ -457,7 +457,7 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None): ...@@ -457,7 +457,7 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
---------- ----------
tensor : Tensor or str tensor : Tensor or str
The tensor to feed. The tensor to feed.
ndarray : basic type, list or numpy.ndarray ndarray : number, list or ndarray
The values to feed. The values to feed.
force_cpu : boolean force_cpu : boolean
Whether force to feed to cpu context. Whether force to feed to cpu context.
...@@ -488,25 +488,23 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None): ...@@ -488,25 +488,23 @@ def FeedTensor(tensor, array, force_cpu=False, dtype=None):
""" """
name = tensor.name if hasattr(tensor, 'name') else str(tensor) name = tensor.name if hasattr(tensor, 'name') else str(tensor)
dev = None if force_cpu is True:
if force_cpu is True: dev = utils.MakeDeviceOption(0, 0) dev = utils.MakeDeviceOption(0, 0)
else: else:
from dragon.core.scope import _DEVICE_SCOPE from dragon.core.scope import _DEVICE_SCOPE
if _DEVICE_SCOPE != '': if _DEVICE_SCOPE != '':
supports = {'/cpu': 0, '/gpu': 1} supports = {'/cpu': 0, '/gpu': 1, '/mlu': 2}
dev = pb.DeviceOption() dev = pb.DeviceOption()
dev.device_type = supports[_DEVICE_SCOPE.split(':')[0]] dev.device_type = supports[_DEVICE_SCOPE.split(':')[0]]
dev.gpu_id = int(_DEVICE_SCOPE.split(':')[1]) dev.device_id = int(_DEVICE_SCOPE.split(':')[1])
else: else:
from dragon.config import option from dragon.config import option
if option['device'] == 'CUDA': if option['device'] == 'CUDA':
dev = utils.MakeDeviceOption(1, option['gpu_id']) dev = utils.MakeDeviceOption(1, option['device_id'])
elif option['device'] == 'CPU': else:
dev = utils.MakeDeviceOption(0, 0) dev = utils.MakeDeviceOption(0, 0)
if not isinstance(array, np.ndarray): if not isinstance(array, np.ndarray):
if not isinstance(array, list):
array = [array]
auto_data_type = np.float32 if dtype is None else dtype auto_data_type = np.float32 if dtype is None else dtype
else: else:
auto_data_type = array.dtype if dtype is None else dtype auto_data_type = array.dtype if dtype is None else dtype
...@@ -573,8 +571,8 @@ def RunGraph(graph_name, inputs=(), outputs=[], stage=None, return_outputs=True) ...@@ -573,8 +571,8 @@ def RunGraph(graph_name, inputs=(), outputs=[], stage=None, return_outputs=True)
Returns Returns
------- -------
None, numpy.ndarray or list of numpy.ndarray None, ndarray or list of ndarray
The outputs, format as numpy.ndarray. The outputs, format as ndarray.
See Also See Also
-------- --------
......
...@@ -42,6 +42,7 @@ List Brief ...@@ -42,6 +42,7 @@ List Brief
`BilinearResize`_ Resize the image with Bi-linear method. `BilinearResize`_ Resize the image with Bi-linear method.
`BiasAdd`_ Add the bias across channels to a ``NCHW`` or ``NHWC`` input. `BiasAdd`_ Add the bias across channels to a ``NCHW`` or ``NHWC`` input.
`DenseConcat`_ Memory-efficient concatenation for DenseNet. `[Huang et.al, 2017] <http://arxiv.org/abs/1608.06993>`_. `DenseConcat`_ Memory-efficient concatenation for DenseNet. `[Huang et.al, 2017] <http://arxiv.org/abs/1608.06993>`_.
`DropBlock2d`_ Randomly drop the outputs according to the spatial blocks. `[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_.
=================== ====================================================================== =================== ======================================================================
Recurrent Recurrent
...@@ -76,6 +77,7 @@ Loss ...@@ -76,6 +77,7 @@ Loss
============================= ====================================================================== ============================= ======================================================================
List Brief List Brief
============================= ====================================================================== ============================= ======================================================================
`NLLLoss`_ Negative likelihood loss with sparse labels.
`SparseSoftmaxCrossEntropy`_ SoftmaxCrossEntropy with sparse labels. `SparseSoftmaxCrossEntropy`_ SoftmaxCrossEntropy with sparse labels.
`SigmoidCrossEntropy`_ SigmoidCrossEntropy. `SigmoidCrossEntropy`_ SigmoidCrossEntropy.
`SoftmaxCrossEntropy`_ SoftmaxCrossEntropy with dense(one-hot) labels. `SoftmaxCrossEntropy`_ SoftmaxCrossEntropy with dense(one-hot) labels.
...@@ -102,6 +104,8 @@ List Brief ...@@ -102,6 +104,8 @@ List Brief
`Exp`_ Calculate the exponential of input. `Exp`_ Calculate the exponential of input.
`Square`_ Calculate the square of input. `Square`_ Calculate the square of input.
`Sqrt`_ Calculate the sqrt of input. `Sqrt`_ Calculate the sqrt of input.
`Maximum`_ Return the max value of given two inputs.
`Minimum`_ Return the min value of given two inputs.
`Clip`_ Clip the input to be between lower and higher bounds. `Clip`_ Clip the input to be between lower and higher bounds.
`Matmul`_ Matrix Multiplication. `Matmul`_ Matrix Multiplication.
`InnerProduct`_ InnerProduct Function. `InnerProduct`_ InnerProduct Function.
...@@ -215,6 +219,7 @@ List Brief ...@@ -215,6 +219,7 @@ List Brief
.. _BilinearResize: operators/vision.html#dragon.operators.vision.BilinearResize .. _BilinearResize: operators/vision.html#dragon.operators.vision.BilinearResize
.. _BiasAdd: operators/vision.html#dragon.operators.vision.BiasAdd .. _BiasAdd: operators/vision.html#dragon.operators.vision.BiasAdd
.. _DenseConcat: operators/vision.html#dragon.operators.vision.DenseConcat .. _DenseConcat: operators/vision.html#dragon.operators.vision.DenseConcat
.. _DropBlock2d: operators/vision.html#dragon.operators.vision.DropBlock2d
.. _RNN: operators/recurrent.html#dragon.operators.recurrent.RNN .. _RNN: operators/recurrent.html#dragon.operators.recurrent.RNN
.. _LSTM: operators/recurrent.html#dragon.operators.recurrent.LSTM .. _LSTM: operators/recurrent.html#dragon.operators.recurrent.LSTM
...@@ -231,6 +236,7 @@ List Brief ...@@ -231,6 +236,7 @@ List Brief
.. _Softmax: operators/activation.html#dragon.operators.activation.Softmax .. _Softmax: operators/activation.html#dragon.operators.activation.Softmax
.. _Dropout: operators/activation.html#dragon.operators.activation.Dropout .. _Dropout: operators/activation.html#dragon.operators.activation.Dropout
.. _NLLLoss: operators/loss.html#dragon.operators.loss.NLLLoss
.. _SparseSoftmaxCrossEntropy: operators/loss.html#dragon.operators.loss.SparseSoftmaxCrossEntropy .. _SparseSoftmaxCrossEntropy: operators/loss.html#dragon.operators.loss.SparseSoftmaxCrossEntropy
.. _SigmoidCrossEntropy: operators/loss.html#dragon.operators.loss.SigmoidCrossEntropy .. _SigmoidCrossEntropy: operators/loss.html#dragon.operators.loss.SigmoidCrossEntropy
.. _SoftmaxCrossEntropy: operators/loss.html#dragon.operators.loss.SoftmaxCrossEntropy .. _SoftmaxCrossEntropy: operators/loss.html#dragon.operators.loss.SoftmaxCrossEntropy
...@@ -246,6 +252,8 @@ List Brief ...@@ -246,6 +252,8 @@ List Brief
.. _Mul: operators/arithmetic.html#dragon.operators.arithmetic.Mul .. _Mul: operators/arithmetic.html#dragon.operators.arithmetic.Mul
.. _Div: operators/arithmetic.html#dragon.operators.arithmetic.Div .. _Div: operators/arithmetic.html#dragon.operators.arithmetic.Div
.. _Clip: operators/arithmetic.html#dragon.operators.arithmetic.Clip .. _Clip: operators/arithmetic.html#dragon.operators.arithmetic.Clip
.. _Maximum: operators/arithmetic.html#dragon.operators.arithmetic.Maximum
.. _Minimum: operators/arithmetic.html#dragon.operators.arithmetic.Minimum
.. _Pow: operators/arithmetic.html#dragon.operators.arithmetic.Pow .. _Pow: operators/arithmetic.html#dragon.operators.arithmetic.Pow
.. _Log: operators/arithmetic.html#dragon.operators.arithmetic.Log .. _Log: operators/arithmetic.html#dragon.operators.arithmetic.Log
.. _Exp: operators/arithmetic.html#dragon.operators.arithmetic.Exp .. _Exp: operators/arithmetic.html#dragon.operators.arithmetic.Exp
......
...@@ -32,6 +32,7 @@ List Brief ...@@ -32,6 +32,7 @@ List Brief
`LRNLayer`_ The implementation of ``LRNLayer``. `LRNLayer`_ The implementation of ``LRNLayer``.
`NNResizeLayer`_ The implementation of ``NNResizeLayer``. `NNResizeLayer`_ The implementation of ``NNResizeLayer``.
`BilinearResizeLayer`_ The implementation of ``BilinearResizeLayer``. `BilinearResizeLayer`_ The implementation of ``BilinearResizeLayer``.
`DropBlockLayer`_ The implementation of ``DropBlockLayer``.
====================== ============================================================================= ====================== =============================================================================
...@@ -160,6 +161,7 @@ API Reference ...@@ -160,6 +161,7 @@ API Reference
.. _LRNLayer: #dragon.vm.caffe.layers.vision.LRNLayer .. _LRNLayer: #dragon.vm.caffe.layers.vision.LRNLayer
.. _NNResizeLayer: #dragon.vm.caffe.layers.vision.NNResizeLayer .. _NNResizeLayer: #dragon.vm.caffe.layers.vision.NNResizeLayer
.. _BilinearResizeLayer: #dragon.vm.caffe.layers.vision.BilinearResizeLayer .. _BilinearResizeLayer: #dragon.vm.caffe.layers.vision.BilinearResizeLayer
.. _DropBlockLayer: #dragon.vm.caffe.layers.vision.DropBlockLayer
.. _ReLULayer: #dragon.vm.caffe.layers.neuron.ReLULayer .. _ReLULayer: #dragon.vm.caffe.layers.neuron.ReLULayer
.. _PReLULayer: #dragon.vm.caffe.layers.neuron.PReLULayer .. _PReLULayer: #dragon.vm.caffe.layers.neuron.PReLULayer
......
...@@ -15,6 +15,7 @@ from __future__ import print_function ...@@ -15,6 +15,7 @@ from __future__ import print_function
import sys import sys
import logging import logging
import atexit
try: try:
from dragon.libdragon import * from dragon.libdragon import *
...@@ -22,3 +23,5 @@ except ImportError as e: ...@@ -22,3 +23,5 @@ except ImportError as e:
logging.critical( logging.critical(
'Cannot import dragon. Error: {0}'.format(str(e))) 'Cannot import dragon. Error: {0}'.format(str(e)))
sys.exit(1) sys.exit(1)
atexit.register(OnModuleExitCC)
\ No newline at end of file
...@@ -101,7 +101,8 @@ class DataTransformer(Process): ...@@ -101,7 +101,8 @@ class DataTransformer(Process):
im = im.reshape((datum.height, datum.width, datum.channels)) im = im.reshape((datum.height, datum.width, datum.channels))
# random scale # random scale
random_scale = npr.uniform() * (self._max_random_scale - self._min_random_scale) \ random_scale = npr.uniform() * (
self._max_random_scale - self._min_random_scale) \
+ self._min_random_scale + self._min_random_scale
if random_scale != 1.0: if random_scale != 1.0:
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
...@@ -110,7 +111,9 @@ class DataTransformer(Process): ...@@ -110,7 +111,9 @@ class DataTransformer(Process):
else: else:
# Fuck Fuck Fuck opencv-python2, it always has a BUG # Fuck Fuck Fuck opencv-python2, it always has a BUG
# that leads to duplicate cuDA handles created at gpu:0 # that leads to duplicate cuDA handles created at gpu:0
new_shape = (int(im.shape[1] * random_scale), int(im.shape[0] * random_scale)) new_shape = (
int(np.ceil(im.shape[1] * random_scale)),
int(np.ceil(im.shape[0] * random_scale)))
im = PIL.Image.fromarray(im) im = PIL.Image.fromarray(im)
im = im.resize(new_shape, PIL.Image.BILINEAR) im = im.resize(new_shape, PIL.Image.BILINEAR)
im = np.array(im) im = np.array(im)
......
...@@ -9,10 +9,12 @@ ...@@ -9,10 +9,12 @@
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
import numpy as np
from dragon.core.tensor import Tensor from dragon.core.tensor import Tensor
INT_MAX = 2147483647 INT_MAX = 2147483647
def CheckInputs(inputs, *args): def CheckInputs(inputs, *args):
def Verify(inputs, min_num, max_num): def Verify(inputs, min_num, max_num):
# type checking # type checking
...@@ -44,6 +46,17 @@ def ParseArguments(locals): ...@@ -44,6 +46,17 @@ def ParseArguments(locals):
return dict(__all__, **kwargs) return dict(__all__, **kwargs)
def WrapConstants(constants, dtype='float32'):
if not isinstance(constants, Tensor):
if not isinstance(constants, np.ndarray):
constants = np.array(constants, dtype=dtype)
tensor = Tensor()
tensor.set_value(constants)
tensor.shape = constants.shape
constants = tensor
return constants
def AddArgumentWithDesc(arguments, property, name, as_target=True): def AddArgumentWithDesc(arguments, property, name, as_target=True):
if isinstance(property, Tensor): if isinstance(property, Tensor):
if as_target: if as_target:
......
...@@ -115,6 +115,70 @@ def Div(inputs, **kwargs): ...@@ -115,6 +115,70 @@ def Div(inputs, **kwargs):
return output return output
def Maximum(inputs, **kwargs):
"""Return the max value of given two inputs.
Parameters
----------
inputs : list
The input tensors, A and B.
Returns
-------
Tensor
The output tensor.
"""
inputs[0] = WrapConstants(inputs[0], dtype='float32')
inputs[1] = WrapConstants(inputs[1], dtype='float32')
CheckInputs(inputs, 2)
arguments = ParseArguments(locals())
output = Tensor.CreateOperator(nout=1, op_type='Maximum', **arguments)
if inputs[0].shape is not None and \
inputs[1].shape is not None:
output.shape = inputs[0].shape[:]
if output.shape != inputs[1].shape and \
len(output.shape) < len(inputs[1].shape):
output.shape = inputs[1].shape
return output
def Minimum(inputs, **kwargs):
"""Return the min value of given two inputs.
Parameters
----------
inputs : list
The input tensors, A and B.
Returns
-------
Tensor
The output tensor.
"""
inputs[0] = WrapConstants(inputs[0], dtype='float32')
inputs[1] = WrapConstants(inputs[1], dtype='float32')
CheckInputs(inputs, 2)
arguments = ParseArguments(locals())
output = Tensor.CreateOperator(nout=1, op_type='Minimum', **arguments)
if inputs[0].shape is not None and \
inputs[1].shape is not None:
output.shape = inputs[0].shape[:]
if output.shape != inputs[1].shape and \
len(output.shape) < len(inputs[1].shape):
output.shape = inputs[1].shape
return output
def Clip(inputs, low=None, high=None, **kwargs): def Clip(inputs, low=None, high=None, **kwargs):
"""Clip the input to be between lower and higher bounds. """Clip the input to be between lower and higher bounds.
......
...@@ -36,15 +36,19 @@ def _wrap_output_shape(output, shape): ...@@ -36,15 +36,19 @@ def _wrap_output_shape(output, shape):
return output return output
def Fill(shape, value=0, **kwargs): def Fill(shape, value=0, dtype='float32', **kwargs):
"""Return a Tensor with specific value filled. """Return a Tensor with specific value filled.
If ``dtype`` is None, tensor
Parameters Parameters
---------- ----------
shape : list, tuple or Tensor shape : list, tuple or Tensor
The output shape. The output shape.
value : basic numerical type value : basic numerical type
The value to fill. The value to fill.
dtype : str
The optional data type.
Returns Returns
------- -------
......
...@@ -19,6 +19,46 @@ from . import * ...@@ -19,6 +19,46 @@ from . import *
from .activation import Softmax from .activation import Softmax
def NLLLoss(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwargs):
"""Negative likelihood loss with sparse labels.
Parameters
----------
inputs : list of Tensor
The inputs, represent [input, sparse_labels].
axis : int
The axis of softmax function.
normalization : str
The normalization, ``UNIT``, ``FULL``, ``VALID``, ``BATCH_SIZE`` or ``NONE``.
ignore_label : tuple or list
The label id to ignore. Default is ``empty``.
Returns
-------
Tensor
The loss.
Notes
-----
Set the normalization to ``UNIT`` will return unreduced losses.
"""
CheckInputs(inputs, 2)
arguments = ParseArguments(locals())
output = Tensor.CreateOperator(nout=1, op_type='NLLLoss', **arguments)
if inputs[0].shape is not None:
if normalization != 'UNIT': output.shape = [1]
elif all(dim is not None for dim in inputs[0].shape):
outer_dim = int(np.prod(inputs[0].shape[0 : axis]))
inner_dim = int(np.prod(inputs[0].shape[axis + 1 :]))
output.shape = [outer_dim * inner_dim]
else: output.shape = [None]
return output
def SparseSoftmaxCrossEntropy(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwargs): def SparseSoftmaxCrossEntropy(inputs, axis=1, normalization='VALID', ignore_labels=(), **kwargs):
"""SoftmaxCrossEntropy with sparse labels. """SoftmaxCrossEntropy with sparse labels.
......
...@@ -16,8 +16,10 @@ from __future__ import print_function ...@@ -16,8 +16,10 @@ from __future__ import print_function
from . import * from . import *
def BatchNorm(inputs, axis=-1, momentum=0.9, eps=1e-3, def BatchNorm(
use_stats=-1, mode='DEFAULT', **kwargs): inputs, axis=-1, momentum=0.9, eps=1e-5,
use_stats=-1, mode='DEFAULT', **kwargs
):
"""Batch Normalization. `[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_. """Batch Normalization. `[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
It follows the implementation of `Caffe`_, that scale procedure is moved to `ops.Scale(*args, **kwargs)`_. It follows the implementation of `Caffe`_, that scale procedure is moved to `ops.Scale(*args, **kwargs)`_.
...@@ -70,9 +72,11 @@ def BatchNorm(inputs, axis=-1, momentum=0.9, eps=1e-3, ...@@ -70,9 +72,11 @@ def BatchNorm(inputs, axis=-1, momentum=0.9, eps=1e-3,
return output return output
def BatchRenorm(inputs, axis=-1, momentum=0.9, eps=1e-3, def BatchRenorm(
inputs, axis=-1, momentum=0.9, eps=1e-5,
r_max=3.0, d_max=5.0, t_delta=0.001, r_max=3.0, d_max=5.0, t_delta=0.001,
use_stats=-1, mode='DEFAULT', **kwargs): use_stats=-1, mode='DEFAULT', **kwargs
):
"""Batch Renormalization. `[Ioffe, 2017] <https://arxiv.org/abs/1702.03275>`_. """Batch Renormalization. `[Ioffe, 2017] <https://arxiv.org/abs/1702.03275>`_.
It follows the implementation of `Caffe`_, that scale procedure is moved to `ops.Scale(*args, **kwargs)`_. It follows the implementation of `Caffe`_, that scale procedure is moved to `ops.Scale(*args, **kwargs)`_.
...@@ -131,7 +135,10 @@ def BatchRenorm(inputs, axis=-1, momentum=0.9, eps=1e-3, ...@@ -131,7 +135,10 @@ def BatchRenorm(inputs, axis=-1, momentum=0.9, eps=1e-3,
return output return output
def FusedBatchNorm(inputs, axis=-1, momentum=0.9, eps=1e-3, use_stats=-1, **kwargs): def FusedBatchNorm(
inputs, axis=-1, momentum=0.9, eps=1e-5,
use_stats=-1, **kwargs
):
"""Batch Normalization, with scale procedure after normalization. """Batch Normalization, with scale procedure after normalization.
Parameters Parameters
...@@ -170,7 +177,7 @@ def FusedBatchNorm(inputs, axis=-1, momentum=0.9, eps=1e-3, use_stats=-1, **kwar ...@@ -170,7 +177,7 @@ def FusedBatchNorm(inputs, axis=-1, momentum=0.9, eps=1e-3, use_stats=-1, **kwar
return output return output
def GroupNorm(inputs, group=32, axis=-1, eps=1e-3, **kwargs): def GroupNorm(inputs, group=32, axis=-1, eps=1e-5, **kwargs):
"""Group Normalization. `[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_. """Group Normalization. `[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
Parameters Parameters
...@@ -203,7 +210,7 @@ def GroupNorm(inputs, group=32, axis=-1, eps=1e-3, **kwargs): ...@@ -203,7 +210,7 @@ def GroupNorm(inputs, group=32, axis=-1, eps=1e-3, **kwargs):
return output return output
def FusedGroupNorm(inputs, group=32, axis=-1, eps=1e-3, **kwargs): def FusedGroupNorm(inputs, group=32, axis=-1, eps=1e-5, **kwargs):
"""Group Normalization, with scale procedure after normalization. """Group Normalization, with scale procedure after normalization.
Parameters Parameters
...@@ -236,7 +243,7 @@ def FusedGroupNorm(inputs, group=32, axis=-1, eps=1e-3, **kwargs): ...@@ -236,7 +243,7 @@ def FusedGroupNorm(inputs, group=32, axis=-1, eps=1e-3, **kwargs):
return output return output
def InstanceNorm(inputs, axis=-1, eps=1e-3, **kwargs): def InstanceNorm(inputs, axis=-1, eps=1e-5, **kwargs):
"""Instance Normalization. `[Ulyanov et.al, 2016] <https://arxiv.org/abs/1607.08022>`_ """Instance Normalization. `[Ulyanov et.al, 2016] <https://arxiv.org/abs/1607.08022>`_
Parameters Parameters
......
...@@ -630,3 +630,44 @@ def DenseConcat(inputs, growth_rate=0, axis=1, **kwargs): ...@@ -630,3 +630,44 @@ def DenseConcat(inputs, growth_rate=0, axis=1, **kwargs):
output.shape[axis] += inputs[i].shape[axis] output.shape[axis] += inputs[i].shape[axis]
return output return output
def DropBlock2d(inputs, block_size=7, keep_prob=0.9,
alpha=1., decrement=0., data_format='NCHW', **kwargs):
"""Randomly drop the outputs according to the spatial blocks. `[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_.
Set the ``decrement`` to schedule ``keep_prob`` for each iteration.
Set the ``alpha`` to decrease ``gamma`` for different stages.
Parameters
----------
inputs : Tensor
The input tensor.
block_size : int
The size of dropping block.
keep_prob : float or Tensor
The prob of keeping. Default is ``0.9``.
alpha : float
The scale factor to gamma.
decrement : float
The decrement to keep prob.
data_format : str
The data format, ``NCHW`` or ``NHWC``.
Returns
-------
Tensor
The output tensor.
"""
CheckInputs(inputs, 1)
arguments = ParseArguments(locals())
arguments = AddArgumentWithDesc(arguments, keep_prob, 'keep_prob', as_target=False)
output = Tensor.CreateOperator(nout=1, op_type='DropBlock2d', **arguments)
if inputs.shape is not None:
output.shape = inputs.shape[:]
return output
\ No newline at end of file
...@@ -51,6 +51,7 @@ NNResize = vision.NNResize ...@@ -51,6 +51,7 @@ NNResize = vision.NNResize
BilinearResize = vision.BilinearResize BilinearResize = vision.BilinearResize
BiasAdd = vision.BiasAdd BiasAdd = vision.BiasAdd
DenseConcat = vision.DenseConcat DenseConcat = vision.DenseConcat
DropBlock2d = vision.DropBlock2d
# recurrent # recurrent
LSTMCell = recurrent.LSTMCell LSTMCell = recurrent.LSTMCell
...@@ -70,6 +71,7 @@ Softmax = act.Softmax ...@@ -70,6 +71,7 @@ Softmax = act.Softmax
Dropout = act.Dropout Dropout = act.Dropout
# loss # loss
NLLLoss = loss.NLLLoss
SparseSoftmaxCrossEntropy = loss.SparseSoftmaxCrossEntropy SparseSoftmaxCrossEntropy = loss.SparseSoftmaxCrossEntropy
SigmoidCrossEntropy = loss.SigmoidCrossEntropy SigmoidCrossEntropy = loss.SigmoidCrossEntropy
SoftmaxCrossEntropy = loss.SoftmaxCrossEntropy SoftmaxCrossEntropy = loss.SoftmaxCrossEntropy
...@@ -85,6 +87,8 @@ Add = math.Add ...@@ -85,6 +87,8 @@ Add = math.Add
Sub = math.Sub Sub = math.Sub
Mul = math.Mul Mul = math.Mul
Div = math.Div Div = math.Div
Maximum = math.Maximum
Minimum = math.Minimum
Clip = math.Clip Clip = math.Clip
Matmul = math.Matmul Matmul = math.Matmul
Pow = math.Pow Pow = math.Pow
......
...@@ -35,7 +35,11 @@ message Argument { ...@@ -35,7 +35,11 @@ message Argument {
repeated string strings=7; repeated string strings=7;
} }
enum DeviceType { CPU = 0; CUDA = 1; OPENCL = 2; } enum DeviceType {
CPU = 0;
CUDA = 1;
CNML = 2;
}
message DeviceOption { message DeviceOption {
optional DeviceType device_type = 1 [default = CPU]; optional DeviceType device_type = 1 [default = CPU];
......
...@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default() ...@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor( DESCRIPTOR = _descriptor.FileDescriptor(
name='dragon.proto', name='dragon.proto',
package='dragon', package='dragon',
serialized_pb=_b('\n\x0c\x64ragon.proto\x12\x06\x64ragon\"\xfe\x01\n\x0bTensorProto\x12\x0c\n\x04\x64ims\x18\x01 \x03(\x05\x12\x36\n\tdata_type\x18\x02 \x01(\x0e\x32\x1c.dragon.TensorProto.DataType:\x05\x46LOAT\x12\x16\n\nfloat_data\x18\x03 \x03(\x02\x42\x02\x10\x01\x12\x16\n\nint32_data\x18\x04 \x03(\x05\x42\x02\x10\x01\x12\x11\n\tbyte_data\x18\x05 \x01(\x0c\x12\x13\n\x0bstring_data\x18\x06 \x03(\x0c\x12\x0c\n\x04name\x18\x07 \x01(\t\"C\n\x08\x44\x61taType\x12\t\n\x05\x46LOAT\x10\x01\x12\t\n\x05INT32\x10\x02\x12\x08\n\x04\x42YTE\x10\x03\x12\n\n\x06STRING\x10\x04\x12\x0b\n\x07\x46LOAT16\x10\x0c\"3\n\x0cTensorProtos\x12#\n\x06protos\x18\x01 \x03(\x0b\x32\x13.dragon.TensorProto\"\x80\x01\n\x08\x41rgument\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\t\n\x01\x66\x18\x02 \x01(\x02\x12\t\n\x01i\x18\x03 \x01(\x05\x12\x0b\n\x03i64\x18\t \x01(\x03\x12\t\n\x01s\x18\x04 \x01(\t\x12\t\n\x01\x62\x18\x08 \x01(\x08\x12\x0e\n\x06\x66loats\x18\x05 \x03(\x02\x12\x0c\n\x04ints\x18\x06 \x03(\x05\x12\x0f\n\x07strings\x18\x07 \x03(\t\"z\n\x0c\x44\x65viceOption\x12,\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\x12.dragon.DeviceType:\x03\x43PU\x12\x14\n\tdevice_id\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0brandom_seed\x18\x03 \x01(\r:\x01\x33\x12\x0e\n\x06\x65ngine\x18\x04 \x01(\t\"\x94\x01\n\x0bOperatorDef\x12\r\n\x05input\x18\x01 \x03(\t\x12\x0e\n\x06output\x18\x02 \x03(\t\x12\x0c\n\x04name\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x1d\n\x03\x61rg\x18\x05 \x03(\x0b\x32\x10.dragon.Argument\x12+\n\rdevice_option\x18\x06 \x01(\x0b\x32\x14.dragon.DeviceOption\"=\n\x0eGradientTarget\x12\x0c\n\x04\x63ost\x18\x01 \x01(\t\x12\x0b\n\x03wrt\x18\x02 \x01(\t\x12\x10\n\x08\x65xternal\x18\x03 \x01(\t\"Y\n\x0cUpdateTarget\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06tensor\x18\x03 \x03(\t\x12\x1d\n\x03\x61rg\x18\x04 \x03(\x0b\x32\x10.dragon.Argument\"\x94\x02\n\x0cTensorFiller\x12\x0e\n\x06tensor\x18\x01 \x01(\t\x12\x16\n\x04type\x18\x02 
\x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03low\x18\x04 \x01(\x02:\x01\x30\x12\x0f\n\x04high\x18\x05 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x06 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x07 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x08 \x01(\x02:\x01\x33\x12@\n\rvariance_norm\x18\t \x01(\x0e\x32!.dragon.TensorFiller.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x46\x41N_AVG\x10\x02\"\xfb\x01\n\x08GraphDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1f\n\x02op\x18\x02 \x03(\x0b\x32\x13.dragon.OperatorDef\x12\x12\n\ngraph_type\x18\x03 \x01(\t\x12+\n\rdevice_option\x18\x05 \x01(\x0b\x32\x14.dragon.DeviceOption\x12\x1d\n\x03\x61rg\x18\x06 \x03(\x0b\x32\x10.dragon.Argument\x12\x0e\n\x06target\x18\x07 \x03(\t\x12(\n\x08g_target\x18\x08 \x03(\x0b\x32\x16.dragon.GradientTarget\x12&\n\x08u_target\x18\t \x03(\x0b\x32\x14.dragon.UpdateTarget*+\n\nDeviceType\x12\x07\n\x03\x43PU\x10\x00\x12\x08\n\x04\x43UDA\x10\x01\x12\n\n\x06OPENCL\x10\x02') serialized_pb=_b('\n\x0c\x64ragon.proto\x12\x06\x64ragon\"\xfe\x01\n\x0bTensorProto\x12\x0c\n\x04\x64ims\x18\x01 \x03(\x05\x12\x36\n\tdata_type\x18\x02 \x01(\x0e\x32\x1c.dragon.TensorProto.DataType:\x05\x46LOAT\x12\x16\n\nfloat_data\x18\x03 \x03(\x02\x42\x02\x10\x01\x12\x16\n\nint32_data\x18\x04 \x03(\x05\x42\x02\x10\x01\x12\x11\n\tbyte_data\x18\x05 \x01(\x0c\x12\x13\n\x0bstring_data\x18\x06 \x03(\x0c\x12\x0c\n\x04name\x18\x07 \x01(\t\"C\n\x08\x44\x61taType\x12\t\n\x05\x46LOAT\x10\x01\x12\t\n\x05INT32\x10\x02\x12\x08\n\x04\x42YTE\x10\x03\x12\n\n\x06STRING\x10\x04\x12\x0b\n\x07\x46LOAT16\x10\x0c\"3\n\x0cTensorProtos\x12#\n\x06protos\x18\x01 \x03(\x0b\x32\x13.dragon.TensorProto\"\x80\x01\n\x08\x41rgument\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\t\n\x01\x66\x18\x02 \x01(\x02\x12\t\n\x01i\x18\x03 \x01(\x05\x12\x0b\n\x03i64\x18\t \x01(\x03\x12\t\n\x01s\x18\x04 \x01(\t\x12\t\n\x01\x62\x18\x08 
\x01(\x08\x12\x0e\n\x06\x66loats\x18\x05 \x03(\x02\x12\x0c\n\x04ints\x18\x06 \x03(\x05\x12\x0f\n\x07strings\x18\x07 \x03(\t\"z\n\x0c\x44\x65viceOption\x12,\n\x0b\x64\x65vice_type\x18\x01 \x01(\x0e\x32\x12.dragon.DeviceType:\x03\x43PU\x12\x14\n\tdevice_id\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0brandom_seed\x18\x03 \x01(\r:\x01\x33\x12\x0e\n\x06\x65ngine\x18\x04 \x01(\t\"\x94\x01\n\x0bOperatorDef\x12\r\n\x05input\x18\x01 \x03(\t\x12\x0e\n\x06output\x18\x02 \x03(\t\x12\x0c\n\x04name\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x1d\n\x03\x61rg\x18\x05 \x03(\x0b\x32\x10.dragon.Argument\x12+\n\rdevice_option\x18\x06 \x01(\x0b\x32\x14.dragon.DeviceOption\"=\n\x0eGradientTarget\x12\x0c\n\x04\x63ost\x18\x01 \x01(\t\x12\x0b\n\x03wrt\x18\x02 \x01(\t\x12\x10\n\x08\x65xternal\x18\x03 \x01(\t\"Y\n\x0cUpdateTarget\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06tensor\x18\x03 \x03(\t\x12\x1d\n\x03\x61rg\x18\x04 \x03(\x0b\x32\x10.dragon.Argument\"\x94\x02\n\x0cTensorFiller\x12\x0e\n\x06tensor\x18\x01 \x01(\t\x12\x16\n\x04type\x18\x02 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03low\x18\x04 \x01(\x02:\x01\x30\x12\x0f\n\x04high\x18\x05 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x06 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x07 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x08 \x01(\x02:\x01\x33\x12@\n\rvariance_norm\x18\t \x01(\x0e\x32!.dragon.TensorFiller.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x46\x41N_AVG\x10\x02\"\xfb\x01\n\x08GraphDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1f\n\x02op\x18\x02 \x03(\x0b\x32\x13.dragon.OperatorDef\x12\x12\n\ngraph_type\x18\x03 \x01(\t\x12+\n\rdevice_option\x18\x05 \x01(\x0b\x32\x14.dragon.DeviceOption\x12\x1d\n\x03\x61rg\x18\x06 \x03(\x0b\x32\x10.dragon.Argument\x12\x0e\n\x06target\x18\x07 \x03(\t\x12(\n\x08g_target\x18\x08 
\x03(\x0b\x32\x16.dragon.GradientTarget\x12&\n\x08u_target\x18\t \x03(\x0b\x32\x14.dragon.UpdateTarget*)\n\nDeviceType\x12\x07\n\x03\x43PU\x10\x00\x12\x08\n\x04\x43UDA\x10\x01\x12\x08\n\x04\x43NML\x10\x02')
) )
_sym_db.RegisterFileDescriptor(DESCRIPTOR) _sym_db.RegisterFileDescriptor(DESCRIPTOR)
...@@ -38,21 +38,21 @@ _DEVICETYPE = _descriptor.EnumDescriptor( ...@@ -38,21 +38,21 @@ _DEVICETYPE = _descriptor.EnumDescriptor(
options=None, options=None,
type=None), type=None),
_descriptor.EnumValueDescriptor( _descriptor.EnumValueDescriptor(
name='OPENCL', index=2, number=2, name='CNML', index=2, number=2,
options=None, options=None,
type=None), type=None),
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=1427, serialized_start=1427,
serialized_end=1470, serialized_end=1468,
) )
_sym_db.RegisterEnumDescriptor(_DEVICETYPE) _sym_db.RegisterEnumDescriptor(_DEVICETYPE)
DeviceType = enum_type_wrapper.EnumTypeWrapper(_DEVICETYPE) DeviceType = enum_type_wrapper.EnumTypeWrapper(_DEVICETYPE)
CPU = 0 CPU = 0
CUDA = 1 CUDA = 1
OPENCL = 2 CNML = 2
_TENSORPROTO_DATATYPE = _descriptor.EnumDescriptor( _TENSORPROTO_DATATYPE = _descriptor.EnumDescriptor(
......
...@@ -14,7 +14,6 @@ from __future__ import division ...@@ -14,7 +14,6 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import pprint import pprint
import numpy as np
import dragon.core.workspace as ws import dragon.core.workspace as ws
from dragon.core.tensor import Tensor from dragon.core.tensor import Tensor
...@@ -43,7 +42,7 @@ class BaseUpdater(object): ...@@ -43,7 +42,7 @@ class BaseUpdater(object):
self._defaults = { self._defaults = {
'scale_gradient': scale_gradient, 'scale_gradient': scale_gradient,
'clip_gradient': clip_gradient, 'clip_gradient': clip_gradient,
'l2_decay': l2_decay 'l2_decay': l2_decay,
} }
self._param_group = [] self._param_group = []
self._slot = slot self._slot = slot
...@@ -77,7 +76,7 @@ class BaseUpdater(object): ...@@ -77,7 +76,7 @@ class BaseUpdater(object):
defaults = self.__dict__.get('_defaults') defaults = self.__dict__.get('_defaults')
if item in defaults: if item in defaults:
if self._registered: if self._registered:
return ws.FetchTensor(self._slot + '/' + item)[0] return ws.FetchTensor(self._slot + '/' + item)
else: return defaults[item] else: return defaults[item]
return self.__dict__[item] return self.__dict__[item]
...@@ -85,9 +84,8 @@ class BaseUpdater(object): ...@@ -85,9 +84,8 @@ class BaseUpdater(object):
defaults = self.__dict__.get('_defaults') defaults = self.__dict__.get('_defaults')
if defaults is not None and key in defaults: if defaults is not None and key in defaults:
if self._registered: if self._registered:
# convert all defaults as float32 for convenience ws.FeedTensor(self._slot + '/' + key, value,
ws.FeedTensor(self._slot + '/' + key, dtype='float32', force_cpu=True)
np.array([value], dtype=np.float32))
else: else:
self._defaults[key] = value self._defaults[key] = value
else: else:
...@@ -96,8 +94,8 @@ class BaseUpdater(object): ...@@ -96,8 +94,8 @@ class BaseUpdater(object):
def register_in_workspace(self): def register_in_workspace(self):
if not self._registered: if not self._registered:
for k, v in self._defaults.items(): for k, v in self._defaults.items():
# convert all defaults as float32 for convenience ws.FeedTensor(self._slot + "/" + k, v,
ws.FeedTensor(self._slot + "/" + k, np.array([v], dtype=np.float32)) dtype='float32', force_cpu=True)
self._registered = True self._registered = True
if self._verbose: if self._verbose:
from dragon.config import logger from dragon.config import logger
......
...@@ -14,7 +14,7 @@ from __future__ import division ...@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
version = '0.2.2' version = '0.2.2'
full_version = '0.2.2.11' full_version = '0.2.2.13'
release = False release = False
if not release: if not release:
......
...@@ -19,7 +19,8 @@ from .vision import ConvolutionLayer, \ ...@@ -19,7 +19,8 @@ from .vision import ConvolutionLayer, \
ROIPoolingLayer, \ ROIPoolingLayer, \
ROIAlignLayer, \ ROIAlignLayer, \
NNResizeLayer, \ NNResizeLayer, \
BilinearResizeLayer BilinearResizeLayer, \
DropBlockLayer
from .neuron import ReLULayer, \ from .neuron import ReLULayer, \
PReLULayer, \ PReLULayer, \
......
...@@ -446,10 +446,13 @@ class InstanceNormLayer(Layer): ...@@ -446,10 +446,13 @@ class InstanceNormLayer(Layer):
The implementation of ``InstanceNormLayer``. The implementation of ``InstanceNormLayer``.
Introduced by `[Ulyanov et.al, 2016] <https://arxiv.org/abs/1607.08022>`_ Introduced by `[Ulyanov et.al, 2016] <https://arxiv.org/abs/1607.08022>`_
""" """
def __init__(self, LayerParameter): def __init__(self, LayerParameter):
super(InstanceNormLayer, self).__init__(LayerParameter) super(InstanceNormLayer, self).__init__(LayerParameter)
self._param = {'axis': 1} param = LayerParameter.instance_norm_param
self._param = {'eps': param.eps,
'axis': 1}
def Setup(self, bottom): def Setup(self, bottom):
super(InstanceNormLayer, self).Setup(bottom) super(InstanceNormLayer, self).Setup(bottom)
......
...@@ -250,7 +250,7 @@ class NNResizeLayer(Layer): ...@@ -250,7 +250,7 @@ class NNResizeLayer(Layer):
Parameters Parameters
---------- ----------
shape : caffe_pb2. BlobShape shape : caffe_pb2.BlobShape
The output shape. Refer `ResizeParameter.shape`_. The output shape. Refer `ResizeParameter.shape`_.
fx : float fx : float
The scale factor of height. Refer `ResizeParameter.fx`_. The scale factor of height. Refer `ResizeParameter.fx`_.
...@@ -283,7 +283,7 @@ class BilinearResizeLayer(Layer): ...@@ -283,7 +283,7 @@ class BilinearResizeLayer(Layer):
Parameters Parameters
---------- ----------
shape : caffe_pb2. BlobShape shape : caffe_pb2.BlobShape
The output shape. Refer `ResizeParameter.shape`_. The output shape. Refer `ResizeParameter.shape`_.
fx : float fx : float
The scale factor of height. Refer `ResizeParameter.fx`_. The scale factor of height. Refer `ResizeParameter.fx`_.
...@@ -309,3 +309,33 @@ class BilinearResizeLayer(Layer): ...@@ -309,3 +309,33 @@ class BilinearResizeLayer(Layer):
raise ValueError('The second bottom should be provided to determine the shape.') raise ValueError('The second bottom should be provided to determine the shape.')
self._param['shape_like'] = bottom[1] self._param['shape_like'] = bottom[1]
return ops.BilinearResize(input, **self._param) return ops.BilinearResize(input, **self._param)
class DropBlockLayer(Layer):
"""The implementation of ``DropBlock2dLayer``.
Parameters
----------
block_size : int
The size of dropping block. Refer ``DropBlockParameter.block_size``.
keep_prob : float
The prob of keeping. Refer ``DropBlockParameter.keep_prob``.
alpha : float
The scale factor to gamma. Refer ``DropBlockParameter.alpha``.
decrement : float
The decrement to keep prob. Refer ``DropBlockParameter.decrement``.
"""
def __init__(self, LayerParameter):
super(DropBlockLayer, self).__init__(LayerParameter)
param = LayerParameter.drop_block_param
self._param = {'block_size': param.block_size,
'keep_prob': param.keep_prob,
'alpha': param.alpha,
'decrement': param.decrement,
'data_format': 'NCHW'}
def Setup(self, bottom):
super(DropBlockLayer, self).Setup(bottom)
input = bottom[0] if isinstance(bottom, list) else bottom
return ops.DropBlock2d(input, **self._param)
\ No newline at end of file
...@@ -424,7 +424,9 @@ message LayerParameter { ...@@ -424,7 +424,9 @@ message LayerParameter {
optional DenseConcatParameter dense_concat_param = 163; optional DenseConcatParameter dense_concat_param = 163;
optional FocalLossParameter focal_loss_param = 164; optional FocalLossParameter focal_loss_param = 164;
optional GatherParameter gather_param = 165; optional GatherParameter gather_param = 165;
optional GroupNormParameter group_norm_param = 166; optional InstanceNormParameter instance_norm_param = 166;
optional GroupNormParameter group_norm_param = 167;
optional DropBlockParameter drop_block_param = 168;
} }
// Message that stores parameters used to apply transformation // Message that stores parameters used to apply transformation
...@@ -537,7 +539,7 @@ message BatchNormParameter { ...@@ -537,7 +539,7 @@ message BatchNormParameter {
optional float moving_average_fraction = 2 [default = 0.9]; optional float moving_average_fraction = 2 [default = 0.9];
// Small value to add to the variance estimate so that we don't divide by // Small value to add to the variance estimate so that we don't divide by
// zero. // zero.
optional float eps = 3 [default = 1e-3]; optional float eps = 3 [default = 1e-5];
} }
message BiasParameter { message BiasParameter {
...@@ -595,7 +597,7 @@ message ConvolutionParameter { ...@@ -595,7 +597,7 @@ message ConvolutionParameter {
repeated uint32 stride = 6; // The stride; defaults to 1 repeated uint32 stride = 6; // The stride; defaults to 1
// Factor used to dilate the kernel, (implicitly) zero-filling the resulting // Factor used to dilate the kernel, (implicitly) zero-filling the resulting
// holes. (Kernel dilation is sometimes referred to by its use in the // holes. (Kernel dilation is sometimes referred to by its use in the
// algorithme à trous from Holschneider et al. 1987.) // algorithme ¨¤ trous from Holschneider et al. 1987.)
repeated uint32 dilation = 18; // The dilation; defaults to 1 repeated uint32 dilation = 18; // The dilation; defaults to 1
// For 2D convolution only, the *_h and *_w versions may also be used to // For 2D convolution only, the *_h and *_w versions may also be used to
...@@ -1456,7 +1458,7 @@ message NormalizeParameter { ...@@ -1456,7 +1458,7 @@ message NormalizeParameter {
// Whether or not scale parameters are shared across channels. // Whether or not scale parameters are shared across channels.
optional bool channel_shared = 3 [default = true]; optional bool channel_shared = 3 [default = true];
// Epsilon for not dividing by zero while normalizing variance // Epsilon for not dividing by zero while normalizing variance
optional float eps = 4 [default = 1e-3]; optional float eps = 4 [default = 1e-5];
} }
message ParallelParameter { message ParallelParameter {
...@@ -1492,7 +1494,7 @@ message ProposalParameter { ...@@ -1492,7 +1494,7 @@ message ProposalParameter {
message BatchRenormParameter { message BatchRenormParameter {
optional bool use_global_stats = 1; optional bool use_global_stats = 1;
optional float moving_average_fraction = 2 [default = 0.9]; optional float moving_average_fraction = 2 [default = 0.9];
optional float eps = 3 [default = 1e-3]; optional float eps = 3 [default = 1e-5];
optional float r_max = 4 [default = 3.0]; optional float r_max = 4 [default = 3.0];
optional float d_max = 5 [default = 5.0]; optional float d_max = 5 [default = 5.0];
optional float t_delta = 6 [default = 0.001]; optional float t_delta = 6 [default = 0.001];
...@@ -1513,17 +1515,18 @@ message GatherParameter { ...@@ -1513,17 +1515,18 @@ message GatherParameter {
optional int32 axis = 1 [default = 0]; optional int32 axis = 1 [default = 0];
} }
message GroupNormParameter { message InstanceNormParameter {
// If false, accumulate global mean/variance values via a moving average. If optional float eps = 1 [default = 1e-5];
// true, use those accumulated values instead of computing mean/variance
// across the batch.
optional bool use_global_stats = 1;
// How much does the moving average decay each iteration?
optional float moving_average_fraction = 2 [default = 0.9];
// Small value to add to the variance estimate so that we don't divide by
// zero.
optional float eps = 3 [default = 1e-3];
optional uint32 group = 5 [default = 32]; // The group size
} }
message GroupNormParameter {
optional float eps = 1 [default = 1e-5];
optional int32 group = 2 [default = 32]; // The group size
}
message DropBlockParameter {
optional int32 block_size = 1 [default = 7];
optional float keep_prob = 2 [default = 0.9];
optional float alpha = 3 [default = 1.0];
optional float decrement = 4 [default = 0.0];
}
...@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default() ...@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor( DESCRIPTOR = _descriptor.FileDescriptor(
name='caffe.proto', name='caffe.proto',
package='caffe', package='caffe',
serialized_pb=_b('\n\x0b\x63\x61\x66\x66\x65.proto\x12\x05\x63\x61\x66\x66\x65\"\x1c\n\tBlobShape\x12\x0f\n\x03\x64im\x18\x01 \x03(\x03\x42\x02\x10\x01\"\xcc\x01\n\tBlobProto\x12\x1f\n\x05shape\x18\x07 \x01(\x0b\x32\x10.caffe.BlobShape\x12\x10\n\x04\x64\x61ta\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04\x64iff\x18\x06 \x03(\x02\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_data\x18\x08 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_diff\x18\t \x03(\x01\x42\x02\x10\x01\x12\x0e\n\x03num\x18\x01 \x01(\x05:\x01\x30\x12\x13\n\x08\x63hannels\x18\x02 \x01(\x05:\x01\x30\x12\x11\n\x06height\x18\x03 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\x05:\x01\x30\"2\n\x0f\x42lobProtoVector\x12\x1f\n\x05\x62lobs\x18\x01 \x03(\x0b\x32\x10.caffe.BlobProto\"\x91\x01\n\x05\x44\x61tum\x12\x10\n\x08\x63hannels\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x04 \x01(\x0c\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x12\n\nfloat_data\x18\x06 \x03(\x02\x12\x16\n\x07\x65ncoded\x18\x07 \x01(\x08:\x05\x66\x61lse\x12\x0e\n\x06labels\x18\x08 \x03(\x05\"\x8a\x02\n\x0f\x46illerParameter\x12\x16\n\x04type\x18\x01 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x02 \x01(\x02:\x01\x30\x12\x0e\n\x03min\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03max\x18\x04 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x05 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x06 \x01(\x02:\x01\x31\x12\x12\n\x06sparse\x18\x07 \x01(\x05:\x02-1\x12\x42\n\rvariance_norm\x18\x08 \x01(\x0e\x32#.caffe.FillerParameter.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x41VERAGE\x10\x02\"\x8e\x02\n\x0cNetParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12%\n\x0binput_shape\x18\x08 \x03(\x0b\x32\x10.caffe.BlobShape\x12\x11\n\tinput_dim\x18\x04 \x03(\x05\x12\x1d\n\x0e\x66orce_backward\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x1e\n\x05state\x18\x06 
\x01(\x0b\x32\x0f.caffe.NetState\x12\x19\n\ndebug_info\x18\x07 \x01(\x08:\x05\x66\x61lse\x12$\n\x05layer\x18\x64 \x03(\x0b\x32\x15.caffe.LayerParameter\x12\'\n\x06layers\x18\x02 \x03(\x0b\x32\x17.caffe.V1LayerParameter\"\xc9\n\n\x0fSolverParameter\x12\x0b\n\x03net\x18\x18 \x01(\t\x12&\n\tnet_param\x18\x19 \x01(\x0b\x32\x13.caffe.NetParameter\x12\x11\n\ttrain_net\x18\x01 \x01(\t\x12\x10\n\x08test_net\x18\x02 \x03(\t\x12,\n\x0ftrain_net_param\x18\x15 \x01(\x0b\x32\x13.caffe.NetParameter\x12+\n\x0etest_net_param\x18\x16 \x03(\x0b\x32\x13.caffe.NetParameter\x12$\n\x0btrain_state\x18\x1a \x01(\x0b\x32\x0f.caffe.NetState\x12#\n\ntest_state\x18\x1b \x03(\x0b\x32\x0f.caffe.NetState\x12\x11\n\ttest_iter\x18\x03 \x03(\x05\x12\x18\n\rtest_interval\x18\x04 \x01(\x05:\x01\x30\x12 \n\x11test_compute_loss\x18\x13 \x01(\x08:\x05\x66\x61lse\x12!\n\x13test_initialization\x18 \x01(\x08:\x04true\x12\x0f\n\x07\x62\x61se_lr\x18\x05 \x01(\x02\x12\x10\n\x08stage_lr\x18\x32 \x03(\x02\x12\x12\n\nstage_iter\x18\x33 \x03(\x05\x12\x0f\n\x07\x64isplay\x18\x06 \x01(\x05\x12\x17\n\x0c\x61verage_loss\x18! 
\x01(\x05:\x01\x31\x12\x10\n\x08max_iter\x18\x07 \x01(\x05\x12\x14\n\titer_size\x18$ \x01(\x05:\x01\x31\x12\x11\n\tlr_policy\x18\x08 \x01(\t\x12\r\n\x05gamma\x18\t \x01(\x02\x12\r\n\x05power\x18\n \x01(\x02\x12\x10\n\x08momentum\x18\x0b \x01(\x02\x12\x14\n\x0cweight_decay\x18\x0c \x01(\x02\x12\x1f\n\x13regularization_type\x18\x1d \x01(\t:\x02L2\x12\x10\n\x08stepsize\x18\r \x01(\x05\x12\x11\n\tstepvalue\x18\" \x03(\x05\x12\x1a\n\x0e\x63lip_gradients\x18# \x01(\x02:\x02-1\x12\x13\n\x08snapshot\x18\x0e \x01(\x05:\x01\x30\x12\x17\n\x0fsnapshot_prefix\x18\x0f \x01(\t\x12\x1c\n\rsnapshot_diff\x18\x10 \x01(\x08:\x05\x66\x61lse\x12K\n\x0fsnapshot_format\x18% \x01(\x0e\x32%.caffe.SolverParameter.SnapshotFormat:\x0b\x42INARYPROTO\x12;\n\x0bsolver_mode\x18\x11 \x01(\x0e\x32!.caffe.SolverParameter.SolverMode:\x03GPU\x12\x14\n\tdevice_id\x18\x12 \x01(\x05:\x01\x30\x12\x17\n\x0brandom_seed\x18\x14 \x01(\x03:\x02-1\x12\x11\n\x04type\x18( \x01(\t:\x03SGD\x12\x15\n\x05\x64\x65lta\x18\x1f \x01(\x02:\x06\x31\x65-008\x12\x18\n\tmomentum2\x18\' \x01(\x02:\x05\x30.999\x12\x17\n\trms_decay\x18& \x01(\x02:\x04\x30.99\x12\x19\n\ndebug_info\x18\x17 \x01(\x08:\x05\x66\x61lse\x12\"\n\x14snapshot_after_train\x18\x1c \x01(\x08:\x04true\x12;\n\x0bsolver_type\x18\x1e \x01(\x0e\x32!.caffe.SolverParameter.SolverType:\x03SGD\"+\n\x0eSnapshotFormat\x12\x08\n\x04HDF5\x10\x00\x12\x0f\n\x0b\x42INARYPROTO\x10\x01\"\x1e\n\nSolverMode\x12\x07\n\x03\x43PU\x10\x00\x12\x07\n\x03GPU\x10\x01\"U\n\nSolverType\x12\x07\n\x03SGD\x10\x00\x12\x0c\n\x08NESTEROV\x10\x01\x12\x0b\n\x07\x41\x44\x41GRAD\x10\x02\x12\x0b\n\x07RMSPROP\x10\x03\x12\x0c\n\x08\x41\x44\x41\x44\x45LTA\x10\x04\x12\x08\n\x04\x41\x44\x41M\x10\x05\"l\n\x0bSolverState\x12\x0c\n\x04iter\x18\x01 \x01(\x05\x12\x13\n\x0blearned_net\x18\x02 \x01(\t\x12!\n\x07history\x18\x03 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x17\n\x0c\x63urrent_step\x18\x04 \x01(\x05:\x01\x30\"N\n\x08NetState\x12!\n\x05phase\x18\x01 
\x01(\x0e\x32\x0c.caffe.Phase:\x04TEST\x12\x10\n\x05level\x18\x02 \x01(\x05:\x01\x30\x12\r\n\x05stage\x18\x03 \x03(\t\"\x85\x01\n\x0cNetStateRule\x12\x1b\n\x05phase\x18\x01 \x01(\x0e\x32\x0c.caffe.Phase\x12\x11\n\tmin_level\x18\x02 \x01(\x05\x12\x11\n\tmax_level\x18\x03 \x01(\x05\x12\r\n\x05stage\x18\x04 \x03(\t\x12\x11\n\tnot_stage\x18\x05 \x03(\t\x12\x10\n\x08mpi_rank\x18\x06 \x03(\r\"\xa3\x01\n\tParamSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x31\n\nshare_mode\x18\x02 \x01(\x0e\x32\x1d.caffe.ParamSpec.DimCheckMode\x12\x12\n\x07lr_mult\x18\x03 \x01(\x02:\x01\x31\x12\x15\n\ndecay_mult\x18\x04 \x01(\x02:\x01\x31\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xcb\x19\n\x0eLayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06\x62ottom\x18\x03 \x03(\t\x12\x0b\n\x03top\x18\x04 \x03(\t\x12\x1c\n\x0cmirror_stage\x18\xa2\x01 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x05phase\x18\n \x01(\x0e\x32\x0c.caffe.Phase\x12\x13\n\x0bloss_weight\x18\x05 \x03(\x02\x12\x1f\n\x05param\x18\x06 \x03(\x0b\x32\x10.caffe.ParamSpec\x12\x1f\n\x05\x62lobs\x18\x07 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x16\n\x0epropagate_down\x18\x0b \x03(\x08\x12$\n\x07include\x18\x08 \x03(\x0b\x32\x13.caffe.NetStateRule\x12$\n\x07\x65xclude\x18\t \x03(\x0b\x32\x13.caffe.NetStateRule\x12\x37\n\x0ftransform_param\x18\x64 \x01(\x0b\x32\x1e.caffe.TransformationParameter\x12(\n\nloss_param\x18\x65 \x01(\x0b\x32\x14.caffe.LossParameter\x12\x30\n\x0e\x61\x63\x63uracy_param\x18\x66 \x01(\x0b\x32\x18.caffe.AccuracyParameter\x12,\n\x0c\x61rgmax_param\x18g \x01(\x0b\x32\x16.caffe.ArgMaxParameter\x12\x34\n\x10\x62\x61tch_norm_param\x18\x8b\x01 \x01(\x0b\x32\x19.caffe.BatchNormParameter\x12)\n\nbias_param\x18\x8d\x01 \x01(\x0b\x32\x14.caffe.BiasParameter\x12,\n\x0c\x63oncat_param\x18h \x01(\x0b\x32\x16.caffe.ConcatParameter\x12?\n\x16\x63ontrastive_loss_param\x18i \x01(\x0b\x32\x1f.caffe.ContrastiveLossParameter\x12\x36\n\x11\x63onvolution_param\x18j 
\x01(\x0b\x32\x1b.caffe.ConvolutionParameter\x12)\n\ncrop_param\x18\x90\x01 \x01(\x0b\x32\x14.caffe.CropParameter\x12(\n\ndata_param\x18k \x01(\x0b\x32\x14.caffe.DataParameter\x12.\n\rdropout_param\x18l \x01(\x0b\x32\x17.caffe.DropoutParameter\x12\x33\n\x10\x64ummy_data_param\x18m \x01(\x0b\x32\x19.caffe.DummyDataParameter\x12.\n\reltwise_param\x18n \x01(\x0b\x32\x17.caffe.EltwiseParameter\x12\'\n\telu_param\x18\x8c\x01 \x01(\x0b\x32\x13.caffe.ELUParameter\x12+\n\x0b\x65mbed_param\x18\x89\x01 \x01(\x0b\x32\x15.caffe.EmbedParameter\x12&\n\texp_param\x18o \x01(\x0b\x32\x13.caffe.ExpParameter\x12/\n\rflatten_param\x18\x87\x01 \x01(\x0b\x32\x17.caffe.FlattenParameter\x12\x31\n\x0fhdf5_data_param\x18p \x01(\x0b\x32\x18.caffe.HDF5DataParameter\x12\x35\n\x11hdf5_output_param\x18q \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\x12\x33\n\x10hinge_loss_param\x18r \x01(\x0b\x32\x19.caffe.HingeLossParameter\x12\x33\n\x10image_data_param\x18s \x01(\x0b\x32\x19.caffe.ImageDataParameter\x12\x39\n\x13infogain_loss_param\x18t \x01(\x0b\x32\x1c.caffe.InfogainLossParameter\x12\x39\n\x13inner_product_param\x18u \x01(\x0b\x32\x1c.caffe.InnerProductParameter\x12+\n\x0binput_param\x18\x8f\x01 \x01(\x0b\x32\x15.caffe.InputParameter\x12\'\n\tlog_param\x18\x86\x01 \x01(\x0b\x32\x13.caffe.LogParameter\x12&\n\tlrn_param\x18v \x01(\x0b\x32\x13.caffe.LRNParameter\x12\x35\n\x11memory_data_param\x18w \x01(\x0b\x32\x1a.caffe.MemoryDataParameter\x12&\n\tmvn_param\x18x \x01(\x0b\x32\x13.caffe.MVNParameter\x12\x33\n\x0fparameter_param\x18\x91\x01 \x01(\x0b\x32\x19.caffe.ParameterParameter\x12.\n\rpooling_param\x18y \x01(\x0b\x32\x17.caffe.PoolingParameter\x12*\n\x0bpower_param\x18z \x01(\x0b\x32\x15.caffe.PowerParameter\x12+\n\x0bprelu_param\x18\x83\x01 \x01(\x0b\x32\x15.caffe.PReLUParameter\x12-\n\x0cpython_param\x18\x82\x01 \x01(\x0b\x32\x16.caffe.PythonParameter\x12\x33\n\x0freduction_param\x18\x88\x01 \x01(\x0b\x32\x19.caffe.ReductionParameter\x12(\n\nrelu_param\x18{ 
\x01(\x0b\x32\x14.caffe.ReLUParameter\x12/\n\rreshape_param\x18\x85\x01 \x01(\x0b\x32\x17.caffe.ReshapeParameter\x12+\n\x0bscale_param\x18\x8e\x01 \x01(\x0b\x32\x15.caffe.ScaleParameter\x12.\n\rsigmoid_param\x18| \x01(\x0b\x32\x17.caffe.SigmoidParameter\x12.\n\rsoftmax_param\x18} \x01(\x0b\x32\x17.caffe.SoftmaxParameter\x12\'\n\tspp_param\x18\x84\x01 \x01(\x0b\x32\x13.caffe.SPPParameter\x12*\n\x0bslice_param\x18~ \x01(\x0b\x32\x15.caffe.SliceParameter\x12(\n\ntanh_param\x18\x7f \x01(\x0b\x32\x14.caffe.TanHParameter\x12\x33\n\x0fthreshold_param\x18\x80\x01 \x01(\x0b\x32\x19.caffe.ThresholdParameter\x12)\n\ntile_param\x18\x8a\x01 \x01(\x0b\x32\x14.caffe.TileParameter\x12\x36\n\x11window_data_param\x18\x81\x01 \x01(\x0b\x32\x1a.caffe.WindowDataParameter\x12\x36\n\x11roi_pooling_param\x18\x97\x01 \x01(\x0b\x32\x1a.caffe.ROIPoolingParameter\x12;\n\x14smooth_l1_loss_param\x18\x98\x01 \x01(\x0b\x32\x1c.caffe.SmoothL1LossParameter\x12\'\n\tmpi_param\x18\x99\x01 \x01(\x0b\x32\x13.caffe.MPIParameter\x12/\n\rpermute_param\x18\x9a\x01 \x01(\x0b\x32\x17.caffe.PermuteParameter\x12\x33\n\x0fnormalize_param\x18\x9b\x01 \x01(\x0b\x32\x19.caffe.NormalizeParameter\x12\x31\n\x0eparallel_param\x18\x9d\x01 \x01(\x0b\x32\x18.caffe.ParallelParameter\x12-\n\x0cresize_param\x18\x9e\x01 \x01(\x0b\x32\x16.caffe.ResizeParameter\x12\x36\n\x11\x65xpand_dims_param\x18\x9f\x01 \x01(\x0b\x32\x1a.caffe.ExpandDimsParameter\x12\x31\n\x0eproposal_param\x18\xa0\x01 \x01(\x0b\x32\x18.caffe.ProposalParameter\x12\x38\n\x12\x62\x61tch_renorm_param\x18\xa1\x01 \x01(\x0b\x32\x1b.caffe.BatchRenormParameter\x12\x38\n\x12\x64\x65nse_concat_param\x18\xa3\x01 \x01(\x0b\x32\x1b.caffe.DenseConcatParameter\x12\x34\n\x10\x66ocal_loss_param\x18\xa4\x01 \x01(\x0b\x32\x19.caffe.FocalLossParameter\x12-\n\x0cgather_param\x18\xa5\x01 \x01(\x0b\x32\x16.caffe.GatherParameter\x12\x34\n\x10group_norm_param\x18\xa6\x01 \x01(\x0b\x32\x19.caffe.GroupNormParameter\"\xa7\x02\n\x17TransformationParameter\x12\x10\n\x05scale\x18\x01 
\x01(\x02:\x01\x31\x12\x15\n\x06mirror\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x14\n\tcrop_size\x18\x03 \x01(\r:\x01\x30\x12\x12\n\x07padding\x18\x0b \x01(\r:\x01\x30\x12\x11\n\tmean_file\x18\x04 \x01(\t\x12\x12\n\nmean_value\x18\x05 \x03(\x02\x12\x1a\n\x0b\x66orce_color\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\nforce_gray\x18\x07 \x01(\x08:\x05\x66\x61lse\x12!\n\x12\x63olor_augmentation\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x10min_random_scale\x18\t \x01(\x02:\x01\x31\x12\x1b\n\x10max_random_scale\x18\n \x01(\x02:\x01\x31\"\xf5\x01\n\rLossParameter\x12\x14\n\x0cignore_label\x18\x01 \x01(\x05\x12\x44\n\rnormalization\x18\x03 \x01(\x0e\x32&.caffe.LossParameter.NormalizationMode:\x05VALID\x12\x11\n\tnormalize\x18\x02 \x01(\x08\x1a\'\n\x13\x45xpandDimsParameter\x12\x10\n\x04\x61xis\x18\x01 \x01(\x05:\x02-1\"L\n\x11NormalizationMode\x12\x08\n\x04\x46ULL\x10\x00\x12\t\n\x05VALID\x10\x01\x12\x0e\n\nBATCH_SIZE\x10\x02\x12\x08\n\x04NONE\x10\x03\x12\x08\n\x04UNIT\x10\x04\"L\n\x11\x41\x63\x63uracyParameter\x12\x10\n\x05top_k\x18\x01 \x01(\r:\x01\x31\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x14\n\x0cignore_label\x18\x03 \x01(\x05\"M\n\x0f\x41rgMaxParameter\x12\x1a\n\x0bout_max_val\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x05top_k\x18\x02 \x01(\r:\x01\x31\x12\x0c\n\x04\x61xis\x18\x03 \x01(\x05\"9\n\x0f\x43oncatParameter\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x15\n\nconcat_dim\x18\x01 \x01(\r:\x01\x31\"h\n\x12\x42\x61tchNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\"]\n\rBiasParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12&\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\"L\n\x18\x43ontrastiveLossParameter\x12\x11\n\x06margin\x18\x01 \x01(\x02:\x01\x31\x12\x1d\n\x0elegacy_version\x18\x02 
\x01(\x08:\x05\x66\x61lse\"\xfc\x03\n\x14\x43onvolutionParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x0b\n\x03pad\x18\x03 \x03(\r\x12\x13\n\x0bkernel_size\x18\x04 \x03(\r\x12\x0e\n\x06stride\x18\x06 \x03(\r\x12\x10\n\x08\x64ilation\x18\x12 \x03(\r\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x10\n\x08kernel_h\x18\x0b \x01(\r\x12\x10\n\x08kernel_w\x18\x0c \x01(\r\x12\x10\n\x08stride_h\x18\r \x01(\r\x12\x10\n\x08stride_w\x18\x0e \x01(\r\x12\x10\n\x05group\x18\x05 \x01(\r:\x01\x31\x12-\n\rweight_filler\x18\x07 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x08 \x01(\x0b\x32\x16.caffe.FillerParameter\x12;\n\x06\x65ngine\x18\x0f \x01(\x0e\x32\".caffe.ConvolutionParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x10 \x01(\x05:\x01\x31\x12\x1e\n\x0f\x66orce_nd_im2col\x18\x11 \x01(\x08:\x05\x66\x61lse\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"0\n\rCropParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x32\x12\x0e\n\x06offset\x18\x02 \x03(\r\"\xa4\x02\n\rDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x31\n\x07\x62\x61\x63kend\x18\x08 \x01(\x0e\x32\x17.caffe.DataParameter.DB:\x07LEVELDB\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\"\n\x13\x66orce_encoded_color\x18\t \x01(\x08:\x05\x66\x61lse\x12\x13\n\x08prefetch\x18\n \x01(\r:\x01\x35\"\x1b\n\x02\x44\x42\x12\x0b\n\x07LEVELDB\x10\x00\x12\x08\n\x04LMDB\x10\x01\"I\n\x10\x44ropoutParameter\x12\x1a\n\rdropout_ratio\x18\x01 \x01(\x02:\x03\x30.5\x12\x19\n\x0bscale_train\x18\x02 \x01(\x08:\x04true\"\xa0\x01\n\x12\x44ummyDataParameter\x12+\n\x0b\x64\x61ta_filler\x18\x01 
\x03(\x0b\x32\x16.caffe.FillerParameter\x12\x1f\n\x05shape\x18\x06 \x03(\x0b\x32\x10.caffe.BlobShape\x12\x0b\n\x03num\x18\x02 \x03(\r\x12\x10\n\x08\x63hannels\x18\x03 \x03(\r\x12\x0e\n\x06height\x18\x04 \x03(\r\x12\r\n\x05width\x18\x05 \x03(\r\"\xa5\x01\n\x10\x45ltwiseParameter\x12\x39\n\toperation\x18\x01 \x01(\x0e\x32!.caffe.EltwiseParameter.EltwiseOp:\x03SUM\x12\r\n\x05\x63oeff\x18\x02 \x03(\x02\x12\x1e\n\x10stable_prod_grad\x18\x03 \x01(\x08:\x04true\"\'\n\tEltwiseOp\x12\x08\n\x04PROD\x10\x00\x12\x07\n\x03SUM\x10\x01\x12\x07\n\x03MAX\x10\x02\" \n\x0c\x45LUParameter\x12\x10\n\x05\x61lpha\x18\x01 \x01(\x02:\x01\x31\"\xac\x01\n\x0e\x45mbedParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x11\n\tinput_dim\x18\x02 \x01(\r\x12\x17\n\tbias_term\x18\x03 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x04 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\"D\n\x0c\x45xpParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"9\n\x10\x46lattenParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x14\n\x08\x65nd_axis\x18\x02 \x01(\x05:\x02-1\"O\n\x11HDF5DataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x16\n\x07shuffle\x18\x03 \x01(\x08:\x05\x66\x61lse\"(\n\x13HDF5OutputParameter\x12\x11\n\tfile_name\x18\x01 \x01(\t\"^\n\x12HingeLossParameter\x12\x30\n\x04norm\x18\x01 \x01(\x0e\x32\x1e.caffe.HingeLossParameter.Norm:\x02L1\"\x16\n\x04Norm\x12\x06\n\x02L1\x10\x01\x12\x06\n\x02L2\x10\x02\"\x97\x02\n\x12ImageDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\nbatch_size\x18\x04 \x01(\r:\x01\x31\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x16\n\x07shuffle\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnew_height\x18\t \x01(\r:\x01\x30\x12\x14\n\tnew_width\x18\n \x01(\r:\x01\x30\x12\x16\n\x08is_color\x18\x0b \x01(\x08:\x04true\x12\x10\n\x05scale\x18\x02 
\x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\x0c \x01(\t:\x00\"\'\n\x15InfogainLossParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\"\xcb\x01\n\x15InnerProductParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x04 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x0f\n\x04\x61xis\x18\x05 \x01(\x05:\x01\x31\x12\x18\n\ttranspose\x18\x06 \x01(\x08:\x05\x66\x61lse\"1\n\x0eInputParameter\x12\x1f\n\x05shape\x18\x01 \x03(\x0b\x32\x10.caffe.BlobShape\"D\n\x0cLogParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xb8\x02\n\x0cLRNParameter\x12\x15\n\nlocal_size\x18\x01 \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x02 \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x03 \x01(\x02:\x04\x30.75\x12\x44\n\x0bnorm_region\x18\x04 \x01(\x0e\x32\x1e.caffe.LRNParameter.NormRegion:\x0f\x41\x43ROSS_CHANNELS\x12\x0c\n\x01k\x18\x05 \x01(\x02:\x01\x31\x12\x33\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1a.caffe.LRNParameter.Engine:\x07\x44\x45\x46\x41ULT\"5\n\nNormRegion\x12\x13\n\x0f\x41\x43ROSS_CHANNELS\x10\x00\x12\x12\n\x0eWITHIN_CHANNEL\x10\x01\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xbd\x01\n\x13MemoryDataParameter\x12\x12\n\nbatch_size\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\x12\x0e\n\x06height\x18\x03 \x01(\r\x12\r\n\x05width\x18\x04 \x01(\r\x12;\n\x05\x64type\x18\x05 \x01(\x0e\x32#.caffe.MemoryDataParameter.DataType:\x07\x46LOAT32\"$\n\x08\x44\x61taType\x12\x0b\n\x07\x46LOAT32\x10\x00\x12\x0b\n\x07\x46LOAT16\x10\x01\"e\n\x0cMVNParameter\x12 \n\x12normalize_variance\x18\x01 
\x01(\x08:\x04true\x12\x1e\n\x0f\x61\x63ross_channels\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x13\n\x03\x65ps\x18\x03 \x01(\x02:\x06\x31\x65-009\"5\n\x12ParameterParameter\x12\x1f\n\x05shape\x18\x01 \x01(\x0b\x32\x10.caffe.BlobShape\"\xa2\x03\n\x10PoolingParameter\x12\x35\n\x04pool\x18\x01 \x01(\x0e\x32\".caffe.PoolingParameter.PoolMethod:\x03MAX\x12\x0e\n\x03pad\x18\x04 \x01(\r:\x01\x30\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x02 \x01(\r\x12\x10\n\x08kernel_h\x18\x05 \x01(\r\x12\x10\n\x08kernel_w\x18\x06 \x01(\r\x12\x11\n\x06stride\x18\x03 \x01(\r:\x01\x31\x12\x10\n\x08stride_h\x18\x07 \x01(\r\x12\x10\n\x08stride_w\x18\x08 \x01(\r\x12\x37\n\x06\x65ngine\x18\x0b \x01(\x0e\x32\x1e.caffe.PoolingParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x1d\n\x0eglobal_pooling\x18\x0c \x01(\x08:\x05\x66\x61lse\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"Y\n\x13ROIPoolingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"F\n\x0ePowerParameter\x12\x10\n\x05power\x18\x01 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"g\n\x0fPythonParameter\x12\x0e\n\x06module\x18\x01 \x01(\t\x12\r\n\x05layer\x18\x02 \x01(\t\x12\x13\n\tparam_str\x18\x03 \x01(\t:\x00\x12 \n\x11share_in_parallel\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xad\x01\n\x12ReductionParameter\x12=\n\toperation\x18\x01 \x01(\x0e\x32%.caffe.ReductionParameter.ReductionOp:\x03SUM\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x10\n\x05\x63oeff\x18\x03 
\x01(\x02:\x01\x31\"5\n\x0bReductionOp\x12\x07\n\x03SUM\x10\x01\x12\x08\n\x04\x41SUM\x10\x02\x12\t\n\x05SUMSQ\x10\x03\x12\x08\n\x04MEAN\x10\x04\"\x8d\x01\n\rReLUParameter\x12\x19\n\x0enegative_slope\x18\x01 \x01(\x02:\x01\x30\x12\x34\n\x06\x65ngine\x18\x02 \x01(\x0e\x32\x1b.caffe.ReLUParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"Z\n\x10ReshapeParameter\x12\x1f\n\x05shape\x18\x01 \x01(\x0b\x32\x10.caffe.BlobShape\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\x08num_axes\x18\x03 \x01(\x05:\x02-1\"\xa5\x01\n\x0eScaleParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12&\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x18\n\tbias_term\x18\x04 \x01(\x08:\x05\x66\x61lse\x12+\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\"x\n\x10SigmoidParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.caffe.SigmoidParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"L\n\x0eSliceParameter\x12\x0f\n\x04\x61xis\x18\x03 \x01(\x05:\x01\x31\x12\x13\n\x0bslice_point\x18\x02 \x03(\r\x12\x14\n\tslice_dim\x18\x01 \x01(\r:\x01\x31\"\x89\x01\n\x10SoftmaxParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.caffe.SoftmaxParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"r\n\rTanHParameter\x12\x34\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1b.caffe.TanHParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"T\n\rTileParameter\x12\x0f\n\x04\x61xis\x18\x01 
\x01(\x05:\x01\x31\x12\r\n\x05tiles\x18\x02 \x01(\x05\x12#\n\tmultiples\x18\x03 \x01(\x0b\x32\x10.caffe.BlobShape\"*\n\x12ThresholdParameter\x12\x14\n\tthreshold\x18\x01 \x01(\x02:\x01\x30\"\xc1\x02\n\x13WindowDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0c\x66g_threshold\x18\x07 \x01(\x02:\x03\x30.5\x12\x19\n\x0c\x62g_threshold\x18\x08 \x01(\x02:\x03\x30.5\x12\x19\n\x0b\x66g_fraction\x18\t \x01(\x02:\x04\x30.25\x12\x16\n\x0b\x63ontext_pad\x18\n \x01(\r:\x01\x30\x12\x17\n\tcrop_mode\x18\x0b \x01(\t:\x04warp\x12\x1b\n\x0c\x63\x61\x63he_images\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\r \x01(\t:\x00\"\xeb\x01\n\x0cSPPParameter\x12\x16\n\x0epyramid_height\x18\x01 \x01(\r\x12\x31\n\x04pool\x18\x02 \x01(\x0e\x32\x1e.caffe.SPPParameter.PoolMethod:\x03MAX\x12\x33\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1a.caffe.SPPParameter.Engine:\x07\x44\x45\x46\x41ULT\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xe0\x13\n\x10V1LayerParameter\x12\x0e\n\x06\x62ottom\x18\x02 \x03(\t\x12\x0b\n\x03top\x18\x03 \x03(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12$\n\x07include\x18 \x03(\x0b\x32\x13.caffe.NetStateRule\x12$\n\x07\x65xclude\x18! 
\x03(\x0b\x32\x13.caffe.NetStateRule\x12/\n\x04type\x18\x05 \x01(\x0e\x32!.caffe.V1LayerParameter.LayerType\x12\x1f\n\x05\x62lobs\x18\x06 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x0e\n\x05param\x18\xe9\x07 \x03(\t\x12>\n\x0f\x62lob_share_mode\x18\xea\x07 \x03(\x0e\x32$.caffe.V1LayerParameter.DimCheckMode\x12\x10\n\x08\x62lobs_lr\x18\x07 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x08 \x03(\x02\x12\x13\n\x0bloss_weight\x18# \x03(\x02\x12\x30\n\x0e\x61\x63\x63uracy_param\x18\x1b \x01(\x0b\x32\x18.caffe.AccuracyParameter\x12,\n\x0c\x61rgmax_param\x18\x17 \x01(\x0b\x32\x16.caffe.ArgMaxParameter\x12,\n\x0c\x63oncat_param\x18\t \x01(\x0b\x32\x16.caffe.ConcatParameter\x12?\n\x16\x63ontrastive_loss_param\x18( \x01(\x0b\x32\x1f.caffe.ContrastiveLossParameter\x12\x36\n\x11\x63onvolution_param\x18\n \x01(\x0b\x32\x1b.caffe.ConvolutionParameter\x12(\n\ndata_param\x18\x0b \x01(\x0b\x32\x14.caffe.DataParameter\x12.\n\rdropout_param\x18\x0c \x01(\x0b\x32\x17.caffe.DropoutParameter\x12\x33\n\x10\x64ummy_data_param\x18\x1a \x01(\x0b\x32\x19.caffe.DummyDataParameter\x12.\n\reltwise_param\x18\x18 \x01(\x0b\x32\x17.caffe.EltwiseParameter\x12&\n\texp_param\x18) \x01(\x0b\x32\x13.caffe.ExpParameter\x12\x31\n\x0fhdf5_data_param\x18\r \x01(\x0b\x32\x18.caffe.HDF5DataParameter\x12\x35\n\x11hdf5_output_param\x18\x0e \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\x12\x33\n\x10hinge_loss_param\x18\x1d \x01(\x0b\x32\x19.caffe.HingeLossParameter\x12\x33\n\x10image_data_param\x18\x0f \x01(\x0b\x32\x19.caffe.ImageDataParameter\x12\x39\n\x13infogain_loss_param\x18\x10 \x01(\x0b\x32\x1c.caffe.InfogainLossParameter\x12\x39\n\x13inner_product_param\x18\x11 \x01(\x0b\x32\x1c.caffe.InnerProductParameter\x12&\n\tlrn_param\x18\x12 \x01(\x0b\x32\x13.caffe.LRNParameter\x12\x35\n\x11memory_data_param\x18\x16 \x01(\x0b\x32\x1a.caffe.MemoryDataParameter\x12&\n\tmvn_param\x18\" \x01(\x0b\x32\x13.caffe.MVNParameter\x12.\n\rpooling_param\x18\x13 \x01(\x0b\x32\x17.caffe.PoolingParameter\x12*\n\x0bpower_param\x18\x15 
\x01(\x0b\x32\x15.caffe.PowerParameter\x12(\n\nrelu_param\x18\x1e \x01(\x0b\x32\x14.caffe.ReLUParameter\x12.\n\rsigmoid_param\x18& \x01(\x0b\x32\x17.caffe.SigmoidParameter\x12.\n\rsoftmax_param\x18\' \x01(\x0b\x32\x17.caffe.SoftmaxParameter\x12*\n\x0bslice_param\x18\x1f \x01(\x0b\x32\x15.caffe.SliceParameter\x12(\n\ntanh_param\x18% \x01(\x0b\x32\x14.caffe.TanHParameter\x12\x32\n\x0fthreshold_param\x18\x19 \x01(\x0b\x32\x19.caffe.ThresholdParameter\x12\x35\n\x11window_data_param\x18\x14 \x01(\x0b\x32\x1a.caffe.WindowDataParameter\x12\x37\n\x0ftransform_param\x18$ \x01(\x0b\x32\x1e.caffe.TransformationParameter\x12(\n\nloss_param\x18* \x01(\x0b\x32\x14.caffe.LossParameter\x12&\n\x05layer\x18\x01 \x01(\x0b\x32\x17.caffe.V0LayerParameter\"\xd8\x04\n\tLayerType\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06\x41\x42SVAL\x10#\x12\x0c\n\x08\x41\x43\x43URACY\x10\x01\x12\n\n\x06\x41RGMAX\x10\x1e\x12\x08\n\x04\x42NLL\x10\x02\x12\n\n\x06\x43ONCAT\x10\x03\x12\x14\n\x10\x43ONTRASTIVE_LOSS\x10%\x12\x0f\n\x0b\x43ONVOLUTION\x10\x04\x12\x08\n\x04\x44\x41TA\x10\x05\x12\x11\n\rDECONVOLUTION\x10\'\x12\x0b\n\x07\x44ROPOUT\x10\x06\x12\x0e\n\nDUMMY_DATA\x10 
\x12\x12\n\x0e\x45UCLIDEAN_LOSS\x10\x07\x12\x0b\n\x07\x45LTWISE\x10\x19\x12\x07\n\x03\x45XP\x10&\x12\x0b\n\x07\x46LATTEN\x10\x08\x12\r\n\tHDF5_DATA\x10\t\x12\x0f\n\x0bHDF5_OUTPUT\x10\n\x12\x0e\n\nHINGE_LOSS\x10\x1c\x12\n\n\x06IM2COL\x10\x0b\x12\x0e\n\nIMAGE_DATA\x10\x0c\x12\x11\n\rINFOGAIN_LOSS\x10\r\x12\x11\n\rINNER_PRODUCT\x10\x0e\x12\x07\n\x03LRN\x10\x0f\x12\x0f\n\x0bMEMORY_DATA\x10\x1d\x12\x1d\n\x19MULTINOMIAL_LOGISTIC_LOSS\x10\x10\x12\x07\n\x03MVN\x10\"\x12\x0b\n\x07POOLING\x10\x11\x12\t\n\x05POWER\x10\x1a\x12\x08\n\x04RELU\x10\x12\x12\x0b\n\x07SIGMOID\x10\x13\x12\x1e\n\x1aSIGMOID_CROSS_ENTROPY_LOSS\x10\x1b\x12\x0b\n\x07SILENCE\x10$\x12\x0b\n\x07SOFTMAX\x10\x14\x12\x10\n\x0cSOFTMAX_LOSS\x10\x15\x12\t\n\x05SPLIT\x10\x16\x12\t\n\x05SLICE\x10!\x12\x08\n\x04TANH\x10\x17\x12\x0f\n\x0bWINDOW_DATA\x10\x18\x12\r\n\tTHRESHOLD\x10\x1f\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xfd\x07\n\x10V0LayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x12\n\nnum_output\x18\x03 \x01(\r\x12\x16\n\x08\x62iasterm\x18\x04 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x06 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x0e\n\x03pad\x18\x07 \x01(\r:\x01\x30\x12\x12\n\nkernelsize\x18\x08 \x01(\r\x12\x10\n\x05group\x18\t \x01(\r:\x01\x31\x12\x11\n\x06stride\x18\n \x01(\r:\x01\x31\x12\x35\n\x04pool\x18\x0b \x01(\x0e\x32\".caffe.V0LayerParameter.PoolMethod:\x03MAX\x12\x1a\n\rdropout_ratio\x18\x0c \x01(\x02:\x03\x30.5\x12\x15\n\nlocal_size\x18\r \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x0e \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x0f \x01(\x02:\x04\x30.75\x12\x0c\n\x01k\x18\x16 \x01(\x02:\x01\x31\x12\x0e\n\x06source\x18\x10 \x01(\t\x12\x10\n\x05scale\x18\x11 \x01(\x02:\x01\x31\x12\x10\n\x08meanfile\x18\x12 \x01(\t\x12\x11\n\tbatchsize\x18\x13 \x01(\r\x12\x13\n\x08\x63ropsize\x18\x14 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x15 
\x01(\x08:\x05\x66\x61lse\x12\x1f\n\x05\x62lobs\x18\x32 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x10\n\x08\x62lobs_lr\x18\x33 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x34 \x03(\x02\x12\x14\n\trand_skip\x18\x35 \x01(\r:\x01\x30\x12\x1d\n\x10\x64\x65t_fg_threshold\x18\x36 \x01(\x02:\x03\x30.5\x12\x1d\n\x10\x64\x65t_bg_threshold\x18\x37 \x01(\x02:\x03\x30.5\x12\x1d\n\x0f\x64\x65t_fg_fraction\x18\x38 \x01(\x02:\x04\x30.25\x12\x1a\n\x0f\x64\x65t_context_pad\x18: \x01(\r:\x01\x30\x12\x1b\n\rdet_crop_mode\x18; \x01(\t:\x04warp\x12\x12\n\x07new_num\x18< \x01(\x05:\x01\x30\x12\x17\n\x0cnew_channels\x18= \x01(\x05:\x01\x30\x12\x15\n\nnew_height\x18> \x01(\x05:\x01\x30\x12\x14\n\tnew_width\x18? \x01(\x05:\x01\x30\x12\x1d\n\x0eshuffle_images\x18@ \x01(\x08:\x05\x66\x61lse\x12\x15\n\nconcat_dim\x18\x41 \x01(\r:\x01\x31\x12\x36\n\x11hdf5_output_param\x18\xe9\x07 \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"W\n\x0ePReLUParameter\x12&\n\x06\x66iller\x18\x01 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x1d\n\x0e\x63hannel_shared\x18\x02 \x01(\x08:\x05\x66\x61lse\")\n\x15SmoothL1LossParameter\x12\x10\n\x05sigma\x18\x01 \x01(\x02:\x01\x31\"H\n\x0cMPIParameter\x12\x0f\n\x04root\x18\x01 \x01(\r:\x01\x30\x12\x12\n\x07\x63omm_id\x18\x02 \x01(\x04:\x01\x30\x12\x13\n\x08group_id\x18\x03 \x01(\x04:\x01\x30\"!\n\x10PermuteParameter\x12\r\n\x05order\x18\x01 \x03(\r\"\x92\x01\n\x12NormalizeParameter\x12\x1c\n\x0e\x61\x63ross_spatial\x18\x01 \x01(\x08:\x04true\x12,\n\x0cscale_filler\x18\x02 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x1c\n\x0e\x63hannel_shared\x18\x03 \x01(\x08:\x04true\x12\x12\n\x03\x65ps\x18\x04 \x01(\x02:\x05\x30.001\"d\n\x11ParallelParameter\x12\x1d\n\x0emultiple_nodes\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x16\n\x07shuffle\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x18\n\tpartition\x18\x03 \x01(\x08:\x05\x66\x61lse\"R\n\x0fResizeParameter\x12\x1f\n\x05shape\x18\x01 
\x01(\x0b\x32\x10.caffe.BlobShape\x12\x0e\n\x02\x66x\x18\x02 \x01(\x02:\x02-1\x12\x0e\n\x02\x66y\x18\x03 \x01(\x02:\x02-1\"\'\n\x13\x45xpandDimsParameter\x12\x10\n\x04\x61xis\x18\x01 \x01(\x05:\x02-1\"\x90\x02\n\x11ProposalParameter\x12\x0e\n\x06stride\x18\x01 \x03(\x05\x12\r\n\x05ratio\x18\x02 \x03(\x02\x12\r\n\x05scale\x18\x03 \x03(\x02\x12\x1b\n\rpre_nms_top_n\x18\x04 \x01(\r:\x04\x36\x30\x30\x30\x12\x1b\n\x0epost_nms_top_n\x18\x05 \x01(\r:\x03\x33\x30\x30\x12\x17\n\nnms_thresh\x18\x06 \x01(\x02:\x03\x30.7\x12\x14\n\x08min_size\x18\x07 \x01(\r:\x02\x31\x36\x12\x14\n\tmin_level\x18\x08 \x01(\x05:\x01\x32\x12\x14\n\tmax_level\x18\t \x01(\x05:\x01\x35\x12\x1c\n\x0f\x63\x61nonical_scale\x18\n \x01(\x05:\x03\x32\x32\x34\x12\x1a\n\x0f\x63\x61nonical_level\x18\x0b \x01(\x05:\x01\x34\"\xa6\x01\n\x14\x42\x61tchRenormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\x12\x10\n\x05r_max\x18\x04 \x01(\x02:\x01\x33\x12\x10\n\x05\x64_max\x18\x05 \x01(\x02:\x01\x35\x12\x16\n\x07t_delta\x18\x06 \x01(\x02:\x05\x30.001\"?\n\x14\x44\x65nseConcatParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x16\n\x0bgrowth_rate\x18\x02 \x01(\x05:\x01\x30\"N\n\x12\x46ocalLossParameter\x12\x13\n\x05\x61lpha\x18\x01 \x01(\x02:\x04\x30.25\x12\x10\n\x05gamma\x18\x02 \x01(\x02:\x01\x32\x12\x11\n\x06neg_id\x18\x03 \x01(\x05:\x01\x30\"\"\n\x0fGatherParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x30\"{\n\x12GroupNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x30.001\x12\x11\n\x05group\x18\x05 \x01(\r:\x02\x33\x32*\x1c\n\x05Phase\x12\t\n\x05TRAIN\x10\x00\x12\x08\n\x04TEST\x10\x01') serialized_pb=_b('\n\x0b\x63\x61\x66\x66\x65.proto\x12\x05\x63\x61\x66\x66\x65\"\x1c\n\tBlobShape\x12\x0f\n\x03\x64im\x18\x01 
\x03(\x03\x42\x02\x10\x01\"\xcc\x01\n\tBlobProto\x12\x1f\n\x05shape\x18\x07 \x01(\x0b\x32\x10.caffe.BlobShape\x12\x10\n\x04\x64\x61ta\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04\x64iff\x18\x06 \x03(\x02\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_data\x18\x08 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_diff\x18\t \x03(\x01\x42\x02\x10\x01\x12\x0e\n\x03num\x18\x01 \x01(\x05:\x01\x30\x12\x13\n\x08\x63hannels\x18\x02 \x01(\x05:\x01\x30\x12\x11\n\x06height\x18\x03 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\x05:\x01\x30\"2\n\x0f\x42lobProtoVector\x12\x1f\n\x05\x62lobs\x18\x01 \x03(\x0b\x32\x10.caffe.BlobProto\"\x91\x01\n\x05\x44\x61tum\x12\x10\n\x08\x63hannels\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x04 \x01(\x0c\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x12\n\nfloat_data\x18\x06 \x03(\x02\x12\x16\n\x07\x65ncoded\x18\x07 \x01(\x08:\x05\x66\x61lse\x12\x0e\n\x06labels\x18\x08 \x03(\x05\"\x8a\x02\n\x0f\x46illerParameter\x12\x16\n\x04type\x18\x01 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x02 \x01(\x02:\x01\x30\x12\x0e\n\x03min\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03max\x18\x04 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x05 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x06 \x01(\x02:\x01\x31\x12\x12\n\x06sparse\x18\x07 \x01(\x05:\x02-1\x12\x42\n\rvariance_norm\x18\x08 \x01(\x0e\x32#.caffe.FillerParameter.VarianceNorm:\x06\x46\x41N_IN\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x41VERAGE\x10\x02\"\x8e\x02\n\x0cNetParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12%\n\x0binput_shape\x18\x08 \x03(\x0b\x32\x10.caffe.BlobShape\x12\x11\n\tinput_dim\x18\x04 \x03(\x05\x12\x1d\n\x0e\x66orce_backward\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x1e\n\x05state\x18\x06 \x01(\x0b\x32\x0f.caffe.NetState\x12\x19\n\ndebug_info\x18\x07 \x01(\x08:\x05\x66\x61lse\x12$\n\x05layer\x18\x64 
\x03(\x0b\x32\x15.caffe.LayerParameter\x12\'\n\x06layers\x18\x02 \x03(\x0b\x32\x17.caffe.V1LayerParameter\"\xc9\n\n\x0fSolverParameter\x12\x0b\n\x03net\x18\x18 \x01(\t\x12&\n\tnet_param\x18\x19 \x01(\x0b\x32\x13.caffe.NetParameter\x12\x11\n\ttrain_net\x18\x01 \x01(\t\x12\x10\n\x08test_net\x18\x02 \x03(\t\x12,\n\x0ftrain_net_param\x18\x15 \x01(\x0b\x32\x13.caffe.NetParameter\x12+\n\x0etest_net_param\x18\x16 \x03(\x0b\x32\x13.caffe.NetParameter\x12$\n\x0btrain_state\x18\x1a \x01(\x0b\x32\x0f.caffe.NetState\x12#\n\ntest_state\x18\x1b \x03(\x0b\x32\x0f.caffe.NetState\x12\x11\n\ttest_iter\x18\x03 \x03(\x05\x12\x18\n\rtest_interval\x18\x04 \x01(\x05:\x01\x30\x12 \n\x11test_compute_loss\x18\x13 \x01(\x08:\x05\x66\x61lse\x12!\n\x13test_initialization\x18 \x01(\x08:\x04true\x12\x0f\n\x07\x62\x61se_lr\x18\x05 \x01(\x02\x12\x10\n\x08stage_lr\x18\x32 \x03(\x02\x12\x12\n\nstage_iter\x18\x33 \x03(\x05\x12\x0f\n\x07\x64isplay\x18\x06 \x01(\x05\x12\x17\n\x0c\x61verage_loss\x18! \x01(\x05:\x01\x31\x12\x10\n\x08max_iter\x18\x07 \x01(\x05\x12\x14\n\titer_size\x18$ \x01(\x05:\x01\x31\x12\x11\n\tlr_policy\x18\x08 \x01(\t\x12\r\n\x05gamma\x18\t \x01(\x02\x12\r\n\x05power\x18\n \x01(\x02\x12\x10\n\x08momentum\x18\x0b \x01(\x02\x12\x14\n\x0cweight_decay\x18\x0c \x01(\x02\x12\x1f\n\x13regularization_type\x18\x1d \x01(\t:\x02L2\x12\x10\n\x08stepsize\x18\r \x01(\x05\x12\x11\n\tstepvalue\x18\" \x03(\x05\x12\x1a\n\x0e\x63lip_gradients\x18# \x01(\x02:\x02-1\x12\x13\n\x08snapshot\x18\x0e \x01(\x05:\x01\x30\x12\x17\n\x0fsnapshot_prefix\x18\x0f \x01(\t\x12\x1c\n\rsnapshot_diff\x18\x10 \x01(\x08:\x05\x66\x61lse\x12K\n\x0fsnapshot_format\x18% \x01(\x0e\x32%.caffe.SolverParameter.SnapshotFormat:\x0b\x42INARYPROTO\x12;\n\x0bsolver_mode\x18\x11 \x01(\x0e\x32!.caffe.SolverParameter.SolverMode:\x03GPU\x12\x14\n\tdevice_id\x18\x12 \x01(\x05:\x01\x30\x12\x17\n\x0brandom_seed\x18\x14 \x01(\x03:\x02-1\x12\x11\n\x04type\x18( \x01(\t:\x03SGD\x12\x15\n\x05\x64\x65lta\x18\x1f 
\x01(\x02:\x06\x31\x65-008\x12\x18\n\tmomentum2\x18\' \x01(\x02:\x05\x30.999\x12\x17\n\trms_decay\x18& \x01(\x02:\x04\x30.99\x12\x19\n\ndebug_info\x18\x17 \x01(\x08:\x05\x66\x61lse\x12\"\n\x14snapshot_after_train\x18\x1c \x01(\x08:\x04true\x12;\n\x0bsolver_type\x18\x1e \x01(\x0e\x32!.caffe.SolverParameter.SolverType:\x03SGD\"+\n\x0eSnapshotFormat\x12\x08\n\x04HDF5\x10\x00\x12\x0f\n\x0b\x42INARYPROTO\x10\x01\"\x1e\n\nSolverMode\x12\x07\n\x03\x43PU\x10\x00\x12\x07\n\x03GPU\x10\x01\"U\n\nSolverType\x12\x07\n\x03SGD\x10\x00\x12\x0c\n\x08NESTEROV\x10\x01\x12\x0b\n\x07\x41\x44\x41GRAD\x10\x02\x12\x0b\n\x07RMSPROP\x10\x03\x12\x0c\n\x08\x41\x44\x41\x44\x45LTA\x10\x04\x12\x08\n\x04\x41\x44\x41M\x10\x05\"l\n\x0bSolverState\x12\x0c\n\x04iter\x18\x01 \x01(\x05\x12\x13\n\x0blearned_net\x18\x02 \x01(\t\x12!\n\x07history\x18\x03 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x17\n\x0c\x63urrent_step\x18\x04 \x01(\x05:\x01\x30\"N\n\x08NetState\x12!\n\x05phase\x18\x01 \x01(\x0e\x32\x0c.caffe.Phase:\x04TEST\x12\x10\n\x05level\x18\x02 \x01(\x05:\x01\x30\x12\r\n\x05stage\x18\x03 \x03(\t\"\x85\x01\n\x0cNetStateRule\x12\x1b\n\x05phase\x18\x01 \x01(\x0e\x32\x0c.caffe.Phase\x12\x11\n\tmin_level\x18\x02 \x01(\x05\x12\x11\n\tmax_level\x18\x03 \x01(\x05\x12\r\n\x05stage\x18\x04 \x03(\t\x12\x11\n\tnot_stage\x18\x05 \x03(\t\x12\x10\n\x08mpi_rank\x18\x06 \x03(\r\"\xa3\x01\n\tParamSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x31\n\nshare_mode\x18\x02 \x01(\x0e\x32\x1d.caffe.ParamSpec.DimCheckMode\x12\x12\n\x07lr_mult\x18\x03 \x01(\x02:\x01\x31\x12\x15\n\ndecay_mult\x18\x04 \x01(\x02:\x01\x31\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xbd\x1a\n\x0eLayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06\x62ottom\x18\x03 \x03(\t\x12\x0b\n\x03top\x18\x04 \x03(\t\x12\x1c\n\x0cmirror_stage\x18\xa2\x01 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x05phase\x18\n \x01(\x0e\x32\x0c.caffe.Phase\x12\x13\n\x0bloss_weight\x18\x05 
\x03(\x02\x12\x1f\n\x05param\x18\x06 \x03(\x0b\x32\x10.caffe.ParamSpec\x12\x1f\n\x05\x62lobs\x18\x07 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x16\n\x0epropagate_down\x18\x0b \x03(\x08\x12$\n\x07include\x18\x08 \x03(\x0b\x32\x13.caffe.NetStateRule\x12$\n\x07\x65xclude\x18\t \x03(\x0b\x32\x13.caffe.NetStateRule\x12\x37\n\x0ftransform_param\x18\x64 \x01(\x0b\x32\x1e.caffe.TransformationParameter\x12(\n\nloss_param\x18\x65 \x01(\x0b\x32\x14.caffe.LossParameter\x12\x30\n\x0e\x61\x63\x63uracy_param\x18\x66 \x01(\x0b\x32\x18.caffe.AccuracyParameter\x12,\n\x0c\x61rgmax_param\x18g \x01(\x0b\x32\x16.caffe.ArgMaxParameter\x12\x34\n\x10\x62\x61tch_norm_param\x18\x8b\x01 \x01(\x0b\x32\x19.caffe.BatchNormParameter\x12)\n\nbias_param\x18\x8d\x01 \x01(\x0b\x32\x14.caffe.BiasParameter\x12,\n\x0c\x63oncat_param\x18h \x01(\x0b\x32\x16.caffe.ConcatParameter\x12?\n\x16\x63ontrastive_loss_param\x18i \x01(\x0b\x32\x1f.caffe.ContrastiveLossParameter\x12\x36\n\x11\x63onvolution_param\x18j \x01(\x0b\x32\x1b.caffe.ConvolutionParameter\x12)\n\ncrop_param\x18\x90\x01 \x01(\x0b\x32\x14.caffe.CropParameter\x12(\n\ndata_param\x18k \x01(\x0b\x32\x14.caffe.DataParameter\x12.\n\rdropout_param\x18l \x01(\x0b\x32\x17.caffe.DropoutParameter\x12\x33\n\x10\x64ummy_data_param\x18m \x01(\x0b\x32\x19.caffe.DummyDataParameter\x12.\n\reltwise_param\x18n \x01(\x0b\x32\x17.caffe.EltwiseParameter\x12\'\n\telu_param\x18\x8c\x01 \x01(\x0b\x32\x13.caffe.ELUParameter\x12+\n\x0b\x65mbed_param\x18\x89\x01 \x01(\x0b\x32\x15.caffe.EmbedParameter\x12&\n\texp_param\x18o \x01(\x0b\x32\x13.caffe.ExpParameter\x12/\n\rflatten_param\x18\x87\x01 \x01(\x0b\x32\x17.caffe.FlattenParameter\x12\x31\n\x0fhdf5_data_param\x18p \x01(\x0b\x32\x18.caffe.HDF5DataParameter\x12\x35\n\x11hdf5_output_param\x18q \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\x12\x33\n\x10hinge_loss_param\x18r \x01(\x0b\x32\x19.caffe.HingeLossParameter\x12\x33\n\x10image_data_param\x18s \x01(\x0b\x32\x19.caffe.ImageDataParameter\x12\x39\n\x13infogain_loss_param\x18t 
\x01(\x0b\x32\x1c.caffe.InfogainLossParameter\x12\x39\n\x13inner_product_param\x18u \x01(\x0b\x32\x1c.caffe.InnerProductParameter\x12+\n\x0binput_param\x18\x8f\x01 \x01(\x0b\x32\x15.caffe.InputParameter\x12\'\n\tlog_param\x18\x86\x01 \x01(\x0b\x32\x13.caffe.LogParameter\x12&\n\tlrn_param\x18v \x01(\x0b\x32\x13.caffe.LRNParameter\x12\x35\n\x11memory_data_param\x18w \x01(\x0b\x32\x1a.caffe.MemoryDataParameter\x12&\n\tmvn_param\x18x \x01(\x0b\x32\x13.caffe.MVNParameter\x12\x33\n\x0fparameter_param\x18\x91\x01 \x01(\x0b\x32\x19.caffe.ParameterParameter\x12.\n\rpooling_param\x18y \x01(\x0b\x32\x17.caffe.PoolingParameter\x12*\n\x0bpower_param\x18z \x01(\x0b\x32\x15.caffe.PowerParameter\x12+\n\x0bprelu_param\x18\x83\x01 \x01(\x0b\x32\x15.caffe.PReLUParameter\x12-\n\x0cpython_param\x18\x82\x01 \x01(\x0b\x32\x16.caffe.PythonParameter\x12\x33\n\x0freduction_param\x18\x88\x01 \x01(\x0b\x32\x19.caffe.ReductionParameter\x12(\n\nrelu_param\x18{ \x01(\x0b\x32\x14.caffe.ReLUParameter\x12/\n\rreshape_param\x18\x85\x01 \x01(\x0b\x32\x17.caffe.ReshapeParameter\x12+\n\x0bscale_param\x18\x8e\x01 \x01(\x0b\x32\x15.caffe.ScaleParameter\x12.\n\rsigmoid_param\x18| \x01(\x0b\x32\x17.caffe.SigmoidParameter\x12.\n\rsoftmax_param\x18} \x01(\x0b\x32\x17.caffe.SoftmaxParameter\x12\'\n\tspp_param\x18\x84\x01 \x01(\x0b\x32\x13.caffe.SPPParameter\x12*\n\x0bslice_param\x18~ \x01(\x0b\x32\x15.caffe.SliceParameter\x12(\n\ntanh_param\x18\x7f \x01(\x0b\x32\x14.caffe.TanHParameter\x12\x33\n\x0fthreshold_param\x18\x80\x01 \x01(\x0b\x32\x19.caffe.ThresholdParameter\x12)\n\ntile_param\x18\x8a\x01 \x01(\x0b\x32\x14.caffe.TileParameter\x12\x36\n\x11window_data_param\x18\x81\x01 \x01(\x0b\x32\x1a.caffe.WindowDataParameter\x12\x36\n\x11roi_pooling_param\x18\x97\x01 \x01(\x0b\x32\x1a.caffe.ROIPoolingParameter\x12;\n\x14smooth_l1_loss_param\x18\x98\x01 \x01(\x0b\x32\x1c.caffe.SmoothL1LossParameter\x12\'\n\tmpi_param\x18\x99\x01 \x01(\x0b\x32\x13.caffe.MPIParameter\x12/\n\rpermute_param\x18\x9a\x01 
\x01(\x0b\x32\x17.caffe.PermuteParameter\x12\x33\n\x0fnormalize_param\x18\x9b\x01 \x01(\x0b\x32\x19.caffe.NormalizeParameter\x12\x31\n\x0eparallel_param\x18\x9d\x01 \x01(\x0b\x32\x18.caffe.ParallelParameter\x12-\n\x0cresize_param\x18\x9e\x01 \x01(\x0b\x32\x16.caffe.ResizeParameter\x12\x36\n\x11\x65xpand_dims_param\x18\x9f\x01 \x01(\x0b\x32\x1a.caffe.ExpandDimsParameter\x12\x31\n\x0eproposal_param\x18\xa0\x01 \x01(\x0b\x32\x18.caffe.ProposalParameter\x12\x38\n\x12\x62\x61tch_renorm_param\x18\xa1\x01 \x01(\x0b\x32\x1b.caffe.BatchRenormParameter\x12\x38\n\x12\x64\x65nse_concat_param\x18\xa3\x01 \x01(\x0b\x32\x1b.caffe.DenseConcatParameter\x12\x34\n\x10\x66ocal_loss_param\x18\xa4\x01 \x01(\x0b\x32\x19.caffe.FocalLossParameter\x12-\n\x0cgather_param\x18\xa5\x01 \x01(\x0b\x32\x16.caffe.GatherParameter\x12:\n\x13instance_norm_param\x18\xa6\x01 \x01(\x0b\x32\x1c.caffe.InstanceNormParameter\x12\x34\n\x10group_norm_param\x18\xa7\x01 \x01(\x0b\x32\x19.caffe.GroupNormParameter\x12\x34\n\x10\x64rop_block_param\x18\xa8\x01 \x01(\x0b\x32\x19.caffe.DropBlockParameter\"\xa7\x02\n\x17TransformationParameter\x12\x10\n\x05scale\x18\x01 \x01(\x02:\x01\x31\x12\x15\n\x06mirror\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x14\n\tcrop_size\x18\x03 \x01(\r:\x01\x30\x12\x12\n\x07padding\x18\x0b \x01(\r:\x01\x30\x12\x11\n\tmean_file\x18\x04 \x01(\t\x12\x12\n\nmean_value\x18\x05 \x03(\x02\x12\x1a\n\x0b\x66orce_color\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\nforce_gray\x18\x07 \x01(\x08:\x05\x66\x61lse\x12!\n\x12\x63olor_augmentation\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x10min_random_scale\x18\t \x01(\x02:\x01\x31\x12\x1b\n\x10max_random_scale\x18\n \x01(\x02:\x01\x31\"\xf5\x01\n\rLossParameter\x12\x14\n\x0cignore_label\x18\x01 \x01(\x05\x12\x44\n\rnormalization\x18\x03 \x01(\x0e\x32&.caffe.LossParameter.NormalizationMode:\x05VALID\x12\x11\n\tnormalize\x18\x02 \x01(\x08\x1a\'\n\x13\x45xpandDimsParameter\x12\x10\n\x04\x61xis\x18\x01 
\x01(\x05:\x02-1\"L\n\x11NormalizationMode\x12\x08\n\x04\x46ULL\x10\x00\x12\t\n\x05VALID\x10\x01\x12\x0e\n\nBATCH_SIZE\x10\x02\x12\x08\n\x04NONE\x10\x03\x12\x08\n\x04UNIT\x10\x04\"L\n\x11\x41\x63\x63uracyParameter\x12\x10\n\x05top_k\x18\x01 \x01(\r:\x01\x31\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x14\n\x0cignore_label\x18\x03 \x01(\x05\"M\n\x0f\x41rgMaxParameter\x12\x1a\n\x0bout_max_val\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x05top_k\x18\x02 \x01(\r:\x01\x31\x12\x0c\n\x04\x61xis\x18\x03 \x01(\x05\"9\n\x0f\x43oncatParameter\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x15\n\nconcat_dim\x18\x01 \x01(\r:\x01\x31\"i\n\x12\x42\x61tchNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x13\n\x03\x65ps\x18\x03 \x01(\x02:\x06\x31\x65-005\"]\n\rBiasParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12&\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\"L\n\x18\x43ontrastiveLossParameter\x12\x11\n\x06margin\x18\x01 \x01(\x02:\x01\x31\x12\x1d\n\x0elegacy_version\x18\x02 \x01(\x08:\x05\x66\x61lse\"\xfc\x03\n\x14\x43onvolutionParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x0b\n\x03pad\x18\x03 \x03(\r\x12\x13\n\x0bkernel_size\x18\x04 \x03(\r\x12\x0e\n\x06stride\x18\x06 \x03(\r\x12\x10\n\x08\x64ilation\x18\x12 \x03(\r\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x10\n\x08kernel_h\x18\x0b \x01(\r\x12\x10\n\x08kernel_w\x18\x0c \x01(\r\x12\x10\n\x08stride_h\x18\r \x01(\r\x12\x10\n\x08stride_w\x18\x0e \x01(\r\x12\x10\n\x05group\x18\x05 \x01(\r:\x01\x31\x12-\n\rweight_filler\x18\x07 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x08 \x01(\x0b\x32\x16.caffe.FillerParameter\x12;\n\x06\x65ngine\x18\x0f \x01(\x0e\x32\".caffe.ConvolutionParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x10 
\x01(\x05:\x01\x31\x12\x1e\n\x0f\x66orce_nd_im2col\x18\x11 \x01(\x08:\x05\x66\x61lse\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"0\n\rCropParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x32\x12\x0e\n\x06offset\x18\x02 \x03(\r\"\xa4\x02\n\rDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x31\n\x07\x62\x61\x63kend\x18\x08 \x01(\x0e\x32\x17.caffe.DataParameter.DB:\x07LEVELDB\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\"\n\x13\x66orce_encoded_color\x18\t \x01(\x08:\x05\x66\x61lse\x12\x13\n\x08prefetch\x18\n \x01(\r:\x01\x35\"\x1b\n\x02\x44\x42\x12\x0b\n\x07LEVELDB\x10\x00\x12\x08\n\x04LMDB\x10\x01\"I\n\x10\x44ropoutParameter\x12\x1a\n\rdropout_ratio\x18\x01 \x01(\x02:\x03\x30.5\x12\x19\n\x0bscale_train\x18\x02 \x01(\x08:\x04true\"\xa0\x01\n\x12\x44ummyDataParameter\x12+\n\x0b\x64\x61ta_filler\x18\x01 \x03(\x0b\x32\x16.caffe.FillerParameter\x12\x1f\n\x05shape\x18\x06 \x03(\x0b\x32\x10.caffe.BlobShape\x12\x0b\n\x03num\x18\x02 \x03(\r\x12\x10\n\x08\x63hannels\x18\x03 \x03(\r\x12\x0e\n\x06height\x18\x04 \x03(\r\x12\r\n\x05width\x18\x05 \x03(\r\"\xa5\x01\n\x10\x45ltwiseParameter\x12\x39\n\toperation\x18\x01 \x01(\x0e\x32!.caffe.EltwiseParameter.EltwiseOp:\x03SUM\x12\r\n\x05\x63oeff\x18\x02 \x03(\x02\x12\x1e\n\x10stable_prod_grad\x18\x03 \x01(\x08:\x04true\"\'\n\tEltwiseOp\x12\x08\n\x04PROD\x10\x00\x12\x07\n\x03SUM\x10\x01\x12\x07\n\x03MAX\x10\x02\" \n\x0c\x45LUParameter\x12\x10\n\x05\x61lpha\x18\x01 \x01(\x02:\x01\x31\"\xac\x01\n\x0e\x45mbedParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x11\n\tinput_dim\x18\x02 \x01(\r\x12\x17\n\tbias_term\x18\x03 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x04 
\x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\"D\n\x0c\x45xpParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"9\n\x10\x46lattenParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x14\n\x08\x65nd_axis\x18\x02 \x01(\x05:\x02-1\"O\n\x11HDF5DataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x16\n\x07shuffle\x18\x03 \x01(\x08:\x05\x66\x61lse\"(\n\x13HDF5OutputParameter\x12\x11\n\tfile_name\x18\x01 \x01(\t\"^\n\x12HingeLossParameter\x12\x30\n\x04norm\x18\x01 \x01(\x0e\x32\x1e.caffe.HingeLossParameter.Norm:\x02L1\"\x16\n\x04Norm\x12\x06\n\x02L1\x10\x01\x12\x06\n\x02L2\x10\x02\"\x97\x02\n\x12ImageDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\nbatch_size\x18\x04 \x01(\r:\x01\x31\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x16\n\x07shuffle\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnew_height\x18\t \x01(\r:\x01\x30\x12\x14\n\tnew_width\x18\n \x01(\r:\x01\x30\x12\x16\n\x08is_color\x18\x0b \x01(\x08:\x04true\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\x0c \x01(\t:\x00\"\'\n\x15InfogainLossParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\"\xcb\x01\n\x15InnerProductParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x04 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x0f\n\x04\x61xis\x18\x05 \x01(\x05:\x01\x31\x12\x18\n\ttranspose\x18\x06 \x01(\x08:\x05\x66\x61lse\"1\n\x0eInputParameter\x12\x1f\n\x05shape\x18\x01 \x03(\x0b\x32\x10.caffe.BlobShape\"D\n\x0cLogParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 
\x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xb8\x02\n\x0cLRNParameter\x12\x15\n\nlocal_size\x18\x01 \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x02 \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x03 \x01(\x02:\x04\x30.75\x12\x44\n\x0bnorm_region\x18\x04 \x01(\x0e\x32\x1e.caffe.LRNParameter.NormRegion:\x0f\x41\x43ROSS_CHANNELS\x12\x0c\n\x01k\x18\x05 \x01(\x02:\x01\x31\x12\x33\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1a.caffe.LRNParameter.Engine:\x07\x44\x45\x46\x41ULT\"5\n\nNormRegion\x12\x13\n\x0f\x41\x43ROSS_CHANNELS\x10\x00\x12\x12\n\x0eWITHIN_CHANNEL\x10\x01\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xbd\x01\n\x13MemoryDataParameter\x12\x12\n\nbatch_size\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\x12\x0e\n\x06height\x18\x03 \x01(\r\x12\r\n\x05width\x18\x04 \x01(\r\x12;\n\x05\x64type\x18\x05 \x01(\x0e\x32#.caffe.MemoryDataParameter.DataType:\x07\x46LOAT32\"$\n\x08\x44\x61taType\x12\x0b\n\x07\x46LOAT32\x10\x00\x12\x0b\n\x07\x46LOAT16\x10\x01\"e\n\x0cMVNParameter\x12 \n\x12normalize_variance\x18\x01 \x01(\x08:\x04true\x12\x1e\n\x0f\x61\x63ross_channels\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x13\n\x03\x65ps\x18\x03 \x01(\x02:\x06\x31\x65-009\"5\n\x12ParameterParameter\x12\x1f\n\x05shape\x18\x01 \x01(\x0b\x32\x10.caffe.BlobShape\"\xa2\x03\n\x10PoolingParameter\x12\x35\n\x04pool\x18\x01 \x01(\x0e\x32\".caffe.PoolingParameter.PoolMethod:\x03MAX\x12\x0e\n\x03pad\x18\x04 \x01(\r:\x01\x30\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x02 \x01(\r\x12\x10\n\x08kernel_h\x18\x05 \x01(\r\x12\x10\n\x08kernel_w\x18\x06 \x01(\r\x12\x11\n\x06stride\x18\x03 \x01(\r:\x01\x31\x12\x10\n\x08stride_h\x18\x07 \x01(\r\x12\x10\n\x08stride_w\x18\x08 \x01(\r\x12\x37\n\x06\x65ngine\x18\x0b \x01(\x0e\x32\x1e.caffe.PoolingParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x1d\n\x0eglobal_pooling\x18\x0c 
\x01(\x08:\x05\x66\x61lse\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"Y\n\x13ROIPoolingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"F\n\x0ePowerParameter\x12\x10\n\x05power\x18\x01 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"g\n\x0fPythonParameter\x12\x0e\n\x06module\x18\x01 \x01(\t\x12\r\n\x05layer\x18\x02 \x01(\t\x12\x13\n\tparam_str\x18\x03 \x01(\t:\x00\x12 \n\x11share_in_parallel\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xad\x01\n\x12ReductionParameter\x12=\n\toperation\x18\x01 \x01(\x0e\x32%.caffe.ReductionParameter.ReductionOp:\x03SUM\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x10\n\x05\x63oeff\x18\x03 \x01(\x02:\x01\x31\"5\n\x0bReductionOp\x12\x07\n\x03SUM\x10\x01\x12\x08\n\x04\x41SUM\x10\x02\x12\t\n\x05SUMSQ\x10\x03\x12\x08\n\x04MEAN\x10\x04\"\x8d\x01\n\rReLUParameter\x12\x19\n\x0enegative_slope\x18\x01 \x01(\x02:\x01\x30\x12\x34\n\x06\x65ngine\x18\x02 \x01(\x0e\x32\x1b.caffe.ReLUParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"Z\n\x10ReshapeParameter\x12\x1f\n\x05shape\x18\x01 \x01(\x0b\x32\x10.caffe.BlobShape\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\x08num_axes\x18\x03 \x01(\x05:\x02-1\"\xa5\x01\n\x0eScaleParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12&\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x18\n\tbias_term\x18\x04 \x01(\x08:\x05\x66\x61lse\x12+\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\"x\n\x10SigmoidParameter\x12\x37\n\x06\x65ngine\x18\x01 
\x01(\x0e\x32\x1e.caffe.SigmoidParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"L\n\x0eSliceParameter\x12\x0f\n\x04\x61xis\x18\x03 \x01(\x05:\x01\x31\x12\x13\n\x0bslice_point\x18\x02 \x03(\r\x12\x14\n\tslice_dim\x18\x01 \x01(\r:\x01\x31\"\x89\x01\n\x10SoftmaxParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.caffe.SoftmaxParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"r\n\rTanHParameter\x12\x34\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1b.caffe.TanHParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"T\n\rTileParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05tiles\x18\x02 \x01(\x05\x12#\n\tmultiples\x18\x03 \x01(\x0b\x32\x10.caffe.BlobShape\"*\n\x12ThresholdParameter\x12\x14\n\tthreshold\x18\x01 \x01(\x02:\x01\x30\"\xc1\x02\n\x13WindowDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0c\x66g_threshold\x18\x07 \x01(\x02:\x03\x30.5\x12\x19\n\x0c\x62g_threshold\x18\x08 \x01(\x02:\x03\x30.5\x12\x19\n\x0b\x66g_fraction\x18\t \x01(\x02:\x04\x30.25\x12\x16\n\x0b\x63ontext_pad\x18\n \x01(\r:\x01\x30\x12\x17\n\tcrop_mode\x18\x0b \x01(\t:\x04warp\x12\x1b\n\x0c\x63\x61\x63he_images\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\r \x01(\t:\x00\"\xeb\x01\n\x0cSPPParameter\x12\x16\n\x0epyramid_height\x18\x01 \x01(\r\x12\x31\n\x04pool\x18\x02 
\x01(\x0e\x32\x1e.caffe.SPPParameter.PoolMethod:\x03MAX\x12\x33\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1a.caffe.SPPParameter.Engine:\x07\x44\x45\x46\x41ULT\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xe0\x13\n\x10V1LayerParameter\x12\x0e\n\x06\x62ottom\x18\x02 \x03(\t\x12\x0b\n\x03top\x18\x03 \x03(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12$\n\x07include\x18 \x03(\x0b\x32\x13.caffe.NetStateRule\x12$\n\x07\x65xclude\x18! \x03(\x0b\x32\x13.caffe.NetStateRule\x12/\n\x04type\x18\x05 \x01(\x0e\x32!.caffe.V1LayerParameter.LayerType\x12\x1f\n\x05\x62lobs\x18\x06 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x0e\n\x05param\x18\xe9\x07 \x03(\t\x12>\n\x0f\x62lob_share_mode\x18\xea\x07 \x03(\x0e\x32$.caffe.V1LayerParameter.DimCheckMode\x12\x10\n\x08\x62lobs_lr\x18\x07 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x08 \x03(\x02\x12\x13\n\x0bloss_weight\x18# \x03(\x02\x12\x30\n\x0e\x61\x63\x63uracy_param\x18\x1b \x01(\x0b\x32\x18.caffe.AccuracyParameter\x12,\n\x0c\x61rgmax_param\x18\x17 \x01(\x0b\x32\x16.caffe.ArgMaxParameter\x12,\n\x0c\x63oncat_param\x18\t \x01(\x0b\x32\x16.caffe.ConcatParameter\x12?\n\x16\x63ontrastive_loss_param\x18( \x01(\x0b\x32\x1f.caffe.ContrastiveLossParameter\x12\x36\n\x11\x63onvolution_param\x18\n \x01(\x0b\x32\x1b.caffe.ConvolutionParameter\x12(\n\ndata_param\x18\x0b \x01(\x0b\x32\x14.caffe.DataParameter\x12.\n\rdropout_param\x18\x0c \x01(\x0b\x32\x17.caffe.DropoutParameter\x12\x33\n\x10\x64ummy_data_param\x18\x1a \x01(\x0b\x32\x19.caffe.DummyDataParameter\x12.\n\reltwise_param\x18\x18 \x01(\x0b\x32\x17.caffe.EltwiseParameter\x12&\n\texp_param\x18) \x01(\x0b\x32\x13.caffe.ExpParameter\x12\x31\n\x0fhdf5_data_param\x18\r \x01(\x0b\x32\x18.caffe.HDF5DataParameter\x12\x35\n\x11hdf5_output_param\x18\x0e \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\x12\x33\n\x10hinge_loss_param\x18\x1d 
\x01(\x0b\x32\x19.caffe.HingeLossParameter\x12\x33\n\x10image_data_param\x18\x0f \x01(\x0b\x32\x19.caffe.ImageDataParameter\x12\x39\n\x13infogain_loss_param\x18\x10 \x01(\x0b\x32\x1c.caffe.InfogainLossParameter\x12\x39\n\x13inner_product_param\x18\x11 \x01(\x0b\x32\x1c.caffe.InnerProductParameter\x12&\n\tlrn_param\x18\x12 \x01(\x0b\x32\x13.caffe.LRNParameter\x12\x35\n\x11memory_data_param\x18\x16 \x01(\x0b\x32\x1a.caffe.MemoryDataParameter\x12&\n\tmvn_param\x18\" \x01(\x0b\x32\x13.caffe.MVNParameter\x12.\n\rpooling_param\x18\x13 \x01(\x0b\x32\x17.caffe.PoolingParameter\x12*\n\x0bpower_param\x18\x15 \x01(\x0b\x32\x15.caffe.PowerParameter\x12(\n\nrelu_param\x18\x1e \x01(\x0b\x32\x14.caffe.ReLUParameter\x12.\n\rsigmoid_param\x18& \x01(\x0b\x32\x17.caffe.SigmoidParameter\x12.\n\rsoftmax_param\x18\' \x01(\x0b\x32\x17.caffe.SoftmaxParameter\x12*\n\x0bslice_param\x18\x1f \x01(\x0b\x32\x15.caffe.SliceParameter\x12(\n\ntanh_param\x18% \x01(\x0b\x32\x14.caffe.TanHParameter\x12\x32\n\x0fthreshold_param\x18\x19 \x01(\x0b\x32\x19.caffe.ThresholdParameter\x12\x35\n\x11window_data_param\x18\x14 \x01(\x0b\x32\x1a.caffe.WindowDataParameter\x12\x37\n\x0ftransform_param\x18$ \x01(\x0b\x32\x1e.caffe.TransformationParameter\x12(\n\nloss_param\x18* \x01(\x0b\x32\x14.caffe.LossParameter\x12&\n\x05layer\x18\x01 \x01(\x0b\x32\x17.caffe.V0LayerParameter\"\xd8\x04\n\tLayerType\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06\x41\x42SVAL\x10#\x12\x0c\n\x08\x41\x43\x43URACY\x10\x01\x12\n\n\x06\x41RGMAX\x10\x1e\x12\x08\n\x04\x42NLL\x10\x02\x12\n\n\x06\x43ONCAT\x10\x03\x12\x14\n\x10\x43ONTRASTIVE_LOSS\x10%\x12\x0f\n\x0b\x43ONVOLUTION\x10\x04\x12\x08\n\x04\x44\x41TA\x10\x05\x12\x11\n\rDECONVOLUTION\x10\'\x12\x0b\n\x07\x44ROPOUT\x10\x06\x12\x0e\n\nDUMMY_DATA\x10 
\x12\x12\n\x0e\x45UCLIDEAN_LOSS\x10\x07\x12\x0b\n\x07\x45LTWISE\x10\x19\x12\x07\n\x03\x45XP\x10&\x12\x0b\n\x07\x46LATTEN\x10\x08\x12\r\n\tHDF5_DATA\x10\t\x12\x0f\n\x0bHDF5_OUTPUT\x10\n\x12\x0e\n\nHINGE_LOSS\x10\x1c\x12\n\n\x06IM2COL\x10\x0b\x12\x0e\n\nIMAGE_DATA\x10\x0c\x12\x11\n\rINFOGAIN_LOSS\x10\r\x12\x11\n\rINNER_PRODUCT\x10\x0e\x12\x07\n\x03LRN\x10\x0f\x12\x0f\n\x0bMEMORY_DATA\x10\x1d\x12\x1d\n\x19MULTINOMIAL_LOGISTIC_LOSS\x10\x10\x12\x07\n\x03MVN\x10\"\x12\x0b\n\x07POOLING\x10\x11\x12\t\n\x05POWER\x10\x1a\x12\x08\n\x04RELU\x10\x12\x12\x0b\n\x07SIGMOID\x10\x13\x12\x1e\n\x1aSIGMOID_CROSS_ENTROPY_LOSS\x10\x1b\x12\x0b\n\x07SILENCE\x10$\x12\x0b\n\x07SOFTMAX\x10\x14\x12\x10\n\x0cSOFTMAX_LOSS\x10\x15\x12\t\n\x05SPLIT\x10\x16\x12\t\n\x05SLICE\x10!\x12\x08\n\x04TANH\x10\x17\x12\x0f\n\x0bWINDOW_DATA\x10\x18\x12\r\n\tTHRESHOLD\x10\x1f\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xfd\x07\n\x10V0LayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x12\n\nnum_output\x18\x03 \x01(\r\x12\x16\n\x08\x62iasterm\x18\x04 \x01(\x08:\x04true\x12-\n\rweight_filler\x18\x05 \x01(\x0b\x32\x16.caffe.FillerParameter\x12+\n\x0b\x62ias_filler\x18\x06 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x0e\n\x03pad\x18\x07 \x01(\r:\x01\x30\x12\x12\n\nkernelsize\x18\x08 \x01(\r\x12\x10\n\x05group\x18\t \x01(\r:\x01\x31\x12\x11\n\x06stride\x18\n \x01(\r:\x01\x31\x12\x35\n\x04pool\x18\x0b \x01(\x0e\x32\".caffe.V0LayerParameter.PoolMethod:\x03MAX\x12\x1a\n\rdropout_ratio\x18\x0c \x01(\x02:\x03\x30.5\x12\x15\n\nlocal_size\x18\r \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x0e \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x0f \x01(\x02:\x04\x30.75\x12\x0c\n\x01k\x18\x16 \x01(\x02:\x01\x31\x12\x0e\n\x06source\x18\x10 \x01(\t\x12\x10\n\x05scale\x18\x11 \x01(\x02:\x01\x31\x12\x10\n\x08meanfile\x18\x12 \x01(\t\x12\x11\n\tbatchsize\x18\x13 \x01(\r\x12\x13\n\x08\x63ropsize\x18\x14 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x15 
\x01(\x08:\x05\x66\x61lse\x12\x1f\n\x05\x62lobs\x18\x32 \x03(\x0b\x32\x10.caffe.BlobProto\x12\x10\n\x08\x62lobs_lr\x18\x33 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x34 \x03(\x02\x12\x14\n\trand_skip\x18\x35 \x01(\r:\x01\x30\x12\x1d\n\x10\x64\x65t_fg_threshold\x18\x36 \x01(\x02:\x03\x30.5\x12\x1d\n\x10\x64\x65t_bg_threshold\x18\x37 \x01(\x02:\x03\x30.5\x12\x1d\n\x0f\x64\x65t_fg_fraction\x18\x38 \x01(\x02:\x04\x30.25\x12\x1a\n\x0f\x64\x65t_context_pad\x18: \x01(\r:\x01\x30\x12\x1b\n\rdet_crop_mode\x18; \x01(\t:\x04warp\x12\x12\n\x07new_num\x18< \x01(\x05:\x01\x30\x12\x17\n\x0cnew_channels\x18= \x01(\x05:\x01\x30\x12\x15\n\nnew_height\x18> \x01(\x05:\x01\x30\x12\x14\n\tnew_width\x18? \x01(\x05:\x01\x30\x12\x1d\n\x0eshuffle_images\x18@ \x01(\x08:\x05\x66\x61lse\x12\x15\n\nconcat_dim\x18\x41 \x01(\r:\x01\x31\x12\x36\n\x11hdf5_output_param\x18\xe9\x07 \x01(\x0b\x32\x1a.caffe.HDF5OutputParameter\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"W\n\x0ePReLUParameter\x12&\n\x06\x66iller\x18\x01 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x1d\n\x0e\x63hannel_shared\x18\x02 \x01(\x08:\x05\x66\x61lse\")\n\x15SmoothL1LossParameter\x12\x10\n\x05sigma\x18\x01 \x01(\x02:\x01\x31\"H\n\x0cMPIParameter\x12\x0f\n\x04root\x18\x01 \x01(\r:\x01\x30\x12\x12\n\x07\x63omm_id\x18\x02 \x01(\x04:\x01\x30\x12\x13\n\x08group_id\x18\x03 \x01(\x04:\x01\x30\"!\n\x10PermuteParameter\x12\r\n\x05order\x18\x01 \x03(\r\"\x93\x01\n\x12NormalizeParameter\x12\x1c\n\x0e\x61\x63ross_spatial\x18\x01 \x01(\x08:\x04true\x12,\n\x0cscale_filler\x18\x02 \x01(\x0b\x32\x16.caffe.FillerParameter\x12\x1c\n\x0e\x63hannel_shared\x18\x03 \x01(\x08:\x04true\x12\x13\n\x03\x65ps\x18\x04 \x01(\x02:\x06\x31\x65-005\"d\n\x11ParallelParameter\x12\x1d\n\x0emultiple_nodes\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x16\n\x07shuffle\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x18\n\tpartition\x18\x03 \x01(\x08:\x05\x66\x61lse\"R\n\x0fResizeParameter\x12\x1f\n\x05shape\x18\x01 
\x01(\x0b\x32\x10.caffe.BlobShape\x12\x0e\n\x02\x66x\x18\x02 \x01(\x02:\x02-1\x12\x0e\n\x02\x66y\x18\x03 \x01(\x02:\x02-1\"\'\n\x13\x45xpandDimsParameter\x12\x10\n\x04\x61xis\x18\x01 \x01(\x05:\x02-1\"\x90\x02\n\x11ProposalParameter\x12\x0e\n\x06stride\x18\x01 \x03(\x05\x12\r\n\x05ratio\x18\x02 \x03(\x02\x12\r\n\x05scale\x18\x03 \x03(\x02\x12\x1b\n\rpre_nms_top_n\x18\x04 \x01(\r:\x04\x36\x30\x30\x30\x12\x1b\n\x0epost_nms_top_n\x18\x05 \x01(\r:\x03\x33\x30\x30\x12\x17\n\nnms_thresh\x18\x06 \x01(\x02:\x03\x30.7\x12\x14\n\x08min_size\x18\x07 \x01(\r:\x02\x31\x36\x12\x14\n\tmin_level\x18\x08 \x01(\x05:\x01\x32\x12\x14\n\tmax_level\x18\t \x01(\x05:\x01\x35\x12\x1c\n\x0f\x63\x61nonical_scale\x18\n \x01(\x05:\x03\x32\x32\x34\x12\x1a\n\x0f\x63\x61nonical_level\x18\x0b \x01(\x05:\x01\x34\"\xa7\x01\n\x14\x42\x61tchRenormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12$\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x03\x30.9\x12\x13\n\x03\x65ps\x18\x03 \x01(\x02:\x06\x31\x65-005\x12\x10\n\x05r_max\x18\x04 \x01(\x02:\x01\x33\x12\x10\n\x05\x64_max\x18\x05 \x01(\x02:\x01\x35\x12\x16\n\x07t_delta\x18\x06 \x01(\x02:\x05\x30.001\"?\n\x14\x44\x65nseConcatParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x16\n\x0bgrowth_rate\x18\x02 \x01(\x05:\x01\x30\"N\n\x12\x46ocalLossParameter\x12\x13\n\x05\x61lpha\x18\x01 \x01(\x02:\x04\x30.25\x12\x10\n\x05gamma\x18\x02 \x01(\x02:\x01\x32\x12\x11\n\x06neg_id\x18\x03 \x01(\x05:\x01\x30\"\"\n\x0fGatherParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x30\",\n\x15InstanceNormParameter\x12\x13\n\x03\x65ps\x18\x01 \x01(\x02:\x06\x31\x65-005\"<\n\x12GroupNormParameter\x12\x13\n\x03\x65ps\x18\x01 \x01(\x02:\x06\x31\x65-005\x12\x11\n\x05group\x18\x02 \x01(\x05:\x02\x33\x32\"k\n\x12\x44ropBlockParameter\x12\x15\n\nblock_size\x18\x01 \x01(\x05:\x01\x37\x12\x16\n\tkeep_prob\x18\x02 \x01(\x02:\x03\x30.9\x12\x10\n\x05\x61lpha\x18\x03 \x01(\x02:\x01\x31\x12\x14\n\tdecrement\x18\x04 
\x01(\x02:\x01\x30*\x1c\n\x05Phase\x12\t\n\x05TRAIN\x10\x00\x12\x08\n\x04TEST\x10\x01')
) )
_sym_db.RegisterFileDescriptor(DESCRIPTOR) _sym_db.RegisterFileDescriptor(DESCRIPTOR)
...@@ -40,8 +40,8 @@ _PHASE = _descriptor.EnumDescriptor( ...@@ -40,8 +40,8 @@ _PHASE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=17641, serialized_start=17850,
serialized_end=17669, serialized_end=17878,
) )
_sym_db.RegisterEnumDescriptor(_PHASE) _sym_db.RegisterEnumDescriptor(_PHASE)
...@@ -209,8 +209,8 @@ _LOSSPARAMETER_NORMALIZATIONMODE = _descriptor.EnumDescriptor( ...@@ -209,8 +209,8 @@ _LOSSPARAMETER_NORMALIZATIONMODE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=6595, serialized_start=6709,
serialized_end=6671, serialized_end=6785,
) )
_sym_db.RegisterEnumDescriptor(_LOSSPARAMETER_NORMALIZATIONMODE) _sym_db.RegisterEnumDescriptor(_LOSSPARAMETER_NORMALIZATIONMODE)
...@@ -235,8 +235,8 @@ _CONVOLUTIONPARAMETER_ENGINE = _descriptor.EnumDescriptor( ...@@ -235,8 +235,8 @@ _CONVOLUTIONPARAMETER_ENGINE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7634, serialized_start=7749,
serialized_end=7677, serialized_end=7792,
) )
_sym_db.RegisterEnumDescriptor(_CONVOLUTIONPARAMETER_ENGINE) _sym_db.RegisterEnumDescriptor(_CONVOLUTIONPARAMETER_ENGINE)
...@@ -257,8 +257,8 @@ _DATAPARAMETER_DB = _descriptor.EnumDescriptor( ...@@ -257,8 +257,8 @@ _DATAPARAMETER_DB = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7995, serialized_start=8110,
serialized_end=8022, serialized_end=8137,
) )
_sym_db.RegisterEnumDescriptor(_DATAPARAMETER_DB) _sym_db.RegisterEnumDescriptor(_DATAPARAMETER_DB)
...@@ -283,8 +283,8 @@ _ELTWISEPARAMETER_ELTWISEOP = _descriptor.EnumDescriptor( ...@@ -283,8 +283,8 @@ _ELTWISEPARAMETER_ELTWISEOP = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=8389, serialized_start=8504,
serialized_end=8428, serialized_end=8543,
) )
_sym_db.RegisterEnumDescriptor(_ELTWISEPARAMETER_ELTWISEOP) _sym_db.RegisterEnumDescriptor(_ELTWISEPARAMETER_ELTWISEOP)
...@@ -305,8 +305,8 @@ _HINGELOSSPARAMETER_NORM = _descriptor.EnumDescriptor( ...@@ -305,8 +305,8 @@ _HINGELOSSPARAMETER_NORM = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=8963, serialized_start=9078,
serialized_end=8985, serialized_end=9100,
) )
_sym_db.RegisterEnumDescriptor(_HINGELOSSPARAMETER_NORM) _sym_db.RegisterEnumDescriptor(_HINGELOSSPARAMETER_NORM)
...@@ -327,8 +327,8 @@ _LRNPARAMETER_NORMREGION = _descriptor.EnumDescriptor( ...@@ -327,8 +327,8 @@ _LRNPARAMETER_NORMREGION = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=9852, serialized_start=9967,
serialized_end=9905, serialized_end=10020,
) )
_sym_db.RegisterEnumDescriptor(_LRNPARAMETER_NORMREGION) _sym_db.RegisterEnumDescriptor(_LRNPARAMETER_NORMREGION)
...@@ -353,8 +353,8 @@ _LRNPARAMETER_ENGINE = _descriptor.EnumDescriptor( ...@@ -353,8 +353,8 @@ _LRNPARAMETER_ENGINE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7634, serialized_start=7749,
serialized_end=7677, serialized_end=7792,
) )
_sym_db.RegisterEnumDescriptor(_LRNPARAMETER_ENGINE) _sym_db.RegisterEnumDescriptor(_LRNPARAMETER_ENGINE)
...@@ -375,8 +375,8 @@ _MEMORYDATAPARAMETER_DATATYPE = _descriptor.EnumDescriptor( ...@@ -375,8 +375,8 @@ _MEMORYDATAPARAMETER_DATATYPE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=10106, serialized_start=10221,
serialized_end=10142, serialized_end=10257,
) )
_sym_db.RegisterEnumDescriptor(_MEMORYDATAPARAMETER_DATATYPE) _sym_db.RegisterEnumDescriptor(_MEMORYDATAPARAMETER_DATATYPE)
...@@ -401,8 +401,8 @@ _POOLINGPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor( ...@@ -401,8 +401,8 @@ _POOLINGPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=10630, serialized_start=10745,
serialized_end=10676, serialized_end=10791,
) )
_sym_db.RegisterEnumDescriptor(_POOLINGPARAMETER_POOLMETHOD) _sym_db.RegisterEnumDescriptor(_POOLINGPARAMETER_POOLMETHOD)
...@@ -427,8 +427,8 @@ _POOLINGPARAMETER_ENGINE = _descriptor.EnumDescriptor( ...@@ -427,8 +427,8 @@ _POOLINGPARAMETER_ENGINE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7634, serialized_start=7749,
serialized_end=7677, serialized_end=7792,
) )
_sym_db.RegisterEnumDescriptor(_POOLINGPARAMETER_ENGINE) _sym_db.RegisterEnumDescriptor(_POOLINGPARAMETER_ENGINE)
...@@ -457,8 +457,8 @@ _REDUCTIONPARAMETER_REDUCTIONOP = _descriptor.EnumDescriptor( ...@@ -457,8 +457,8 @@ _REDUCTIONPARAMETER_REDUCTIONOP = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=11112, serialized_start=11227,
serialized_end=11165, serialized_end=11280,
) )
_sym_db.RegisterEnumDescriptor(_REDUCTIONPARAMETER_REDUCTIONOP) _sym_db.RegisterEnumDescriptor(_REDUCTIONPARAMETER_REDUCTIONOP)
...@@ -483,8 +483,8 @@ _RELUPARAMETER_ENGINE = _descriptor.EnumDescriptor( ...@@ -483,8 +483,8 @@ _RELUPARAMETER_ENGINE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7634, serialized_start=7749,
serialized_end=7677, serialized_end=7792,
) )
_sym_db.RegisterEnumDescriptor(_RELUPARAMETER_ENGINE) _sym_db.RegisterEnumDescriptor(_RELUPARAMETER_ENGINE)
...@@ -509,8 +509,8 @@ _SIGMOIDPARAMETER_ENGINE = _descriptor.EnumDescriptor( ...@@ -509,8 +509,8 @@ _SIGMOIDPARAMETER_ENGINE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7634, serialized_start=7749,
serialized_end=7677, serialized_end=7792,
) )
_sym_db.RegisterEnumDescriptor(_SIGMOIDPARAMETER_ENGINE) _sym_db.RegisterEnumDescriptor(_SIGMOIDPARAMETER_ENGINE)
...@@ -535,8 +535,8 @@ _SOFTMAXPARAMETER_ENGINE = _descriptor.EnumDescriptor( ...@@ -535,8 +535,8 @@ _SOFTMAXPARAMETER_ENGINE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7634, serialized_start=7749,
serialized_end=7677, serialized_end=7792,
) )
_sym_db.RegisterEnumDescriptor(_SOFTMAXPARAMETER_ENGINE) _sym_db.RegisterEnumDescriptor(_SOFTMAXPARAMETER_ENGINE)
...@@ -561,8 +561,8 @@ _TANHPARAMETER_ENGINE = _descriptor.EnumDescriptor( ...@@ -561,8 +561,8 @@ _TANHPARAMETER_ENGINE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7634, serialized_start=7749,
serialized_end=7677, serialized_end=7792,
) )
_sym_db.RegisterEnumDescriptor(_TANHPARAMETER_ENGINE) _sym_db.RegisterEnumDescriptor(_TANHPARAMETER_ENGINE)
...@@ -587,8 +587,8 @@ _SPPPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor( ...@@ -587,8 +587,8 @@ _SPPPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=10630, serialized_start=10745,
serialized_end=10676, serialized_end=10791,
) )
_sym_db.RegisterEnumDescriptor(_SPPPARAMETER_POOLMETHOD) _sym_db.RegisterEnumDescriptor(_SPPPARAMETER_POOLMETHOD)
...@@ -613,8 +613,8 @@ _SPPPARAMETER_ENGINE = _descriptor.EnumDescriptor( ...@@ -613,8 +613,8 @@ _SPPPARAMETER_ENGINE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=7634, serialized_start=7749,
serialized_end=7677, serialized_end=7792,
) )
_sym_db.RegisterEnumDescriptor(_SPPPARAMETER_ENGINE) _sym_db.RegisterEnumDescriptor(_SPPPARAMETER_ENGINE)
...@@ -787,8 +787,8 @@ _V1LAYERPARAMETER_LAYERTYPE = _descriptor.EnumDescriptor( ...@@ -787,8 +787,8 @@ _V1LAYERPARAMETER_LAYERTYPE = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=14604, serialized_start=14719,
serialized_end=15204, serialized_end=15319,
) )
_sym_db.RegisterEnumDescriptor(_V1LAYERPARAMETER_LAYERTYPE) _sym_db.RegisterEnumDescriptor(_V1LAYERPARAMETER_LAYERTYPE)
...@@ -835,8 +835,8 @@ _V0LAYERPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor( ...@@ -835,8 +835,8 @@ _V0LAYERPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor(
], ],
containing_type=None, containing_type=None,
options=None, options=None,
serialized_start=10630, serialized_start=10745,
serialized_end=10676, serialized_end=10791,
) )
_sym_db.RegisterEnumDescriptor(_V0LAYERPARAMETER_POOLMETHOD) _sym_db.RegisterEnumDescriptor(_V0LAYERPARAMETER_POOLMETHOD)
...@@ -2269,12 +2269,26 @@ _LAYERPARAMETER = _descriptor.Descriptor( ...@@ -2269,12 +2269,26 @@ _LAYERPARAMETER = _descriptor.Descriptor(
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name='group_norm_param', full_name='caffe.LayerParameter.group_norm_param', index=71, name='instance_norm_param', full_name='caffe.LayerParameter.instance_norm_param', index=71,
number=166, type=11, cpp_type=10, label=1, number=166, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None, has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
_descriptor.FieldDescriptor(
name='group_norm_param', full_name='caffe.LayerParameter.group_norm_param', index=72,
number=167, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='drop_block_param', full_name='caffe.LayerParameter.drop_block_param', index=73,
number=168, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
], ],
extensions=[ extensions=[
], ],
...@@ -2287,7 +2301,7 @@ _LAYERPARAMETER = _descriptor.Descriptor( ...@@ -2287,7 +2301,7 @@ _LAYERPARAMETER = _descriptor.Descriptor(
oneofs=[ oneofs=[
], ],
serialized_start=2850, serialized_start=2850,
serialized_end=6125, serialized_end=6239,
) )
...@@ -2386,8 +2400,8 @@ _TRANSFORMATIONPARAMETER = _descriptor.Descriptor( ...@@ -2386,8 +2400,8 @@ _TRANSFORMATIONPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6128, serialized_start=6242,
serialized_end=6423, serialized_end=6537,
) )
...@@ -2416,8 +2430,8 @@ _LOSSPARAMETER_EXPANDDIMSPARAMETER = _descriptor.Descriptor( ...@@ -2416,8 +2430,8 @@ _LOSSPARAMETER_EXPANDDIMSPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6554, serialized_start=6668,
serialized_end=6593, serialized_end=6707,
) )
_LOSSPARAMETER = _descriptor.Descriptor( _LOSSPARAMETER = _descriptor.Descriptor(
...@@ -2460,8 +2474,8 @@ _LOSSPARAMETER = _descriptor.Descriptor( ...@@ -2460,8 +2474,8 @@ _LOSSPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6426, serialized_start=6540,
serialized_end=6671, serialized_end=6785,
) )
...@@ -2504,8 +2518,8 @@ _ACCURACYPARAMETER = _descriptor.Descriptor( ...@@ -2504,8 +2518,8 @@ _ACCURACYPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6673, serialized_start=6787,
serialized_end=6749, serialized_end=6863,
) )
...@@ -2548,8 +2562,8 @@ _ARGMAXPARAMETER = _descriptor.Descriptor( ...@@ -2548,8 +2562,8 @@ _ARGMAXPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6751, serialized_start=6865,
serialized_end=6828, serialized_end=6942,
) )
...@@ -2585,8 +2599,8 @@ _CONCATPARAMETER = _descriptor.Descriptor( ...@@ -2585,8 +2599,8 @@ _CONCATPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6830, serialized_start=6944,
serialized_end=6887, serialized_end=7001,
) )
...@@ -2614,7 +2628,7 @@ _BATCHNORMPARAMETER = _descriptor.Descriptor( ...@@ -2614,7 +2628,7 @@ _BATCHNORMPARAMETER = _descriptor.Descriptor(
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name='eps', full_name='caffe.BatchNormParameter.eps', index=2, name='eps', full_name='caffe.BatchNormParameter.eps', index=2,
number=3, type=2, cpp_type=6, label=1, number=3, type=2, cpp_type=6, label=1,
has_default_value=True, default_value=0.001, has_default_value=True, default_value=1e-005,
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
...@@ -2629,8 +2643,8 @@ _BATCHNORMPARAMETER = _descriptor.Descriptor( ...@@ -2629,8 +2643,8 @@ _BATCHNORMPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6889, serialized_start=7003,
serialized_end=6993, serialized_end=7108,
) )
...@@ -2673,8 +2687,8 @@ _BIASPARAMETER = _descriptor.Descriptor( ...@@ -2673,8 +2687,8 @@ _BIASPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6995, serialized_start=7110,
serialized_end=7088, serialized_end=7203,
) )
...@@ -2710,8 +2724,8 @@ _CONTRASTIVELOSSPARAMETER = _descriptor.Descriptor( ...@@ -2710,8 +2724,8 @@ _CONTRASTIVELOSSPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=7090, serialized_start=7205,
serialized_end=7166, serialized_end=7281,
) )
...@@ -2860,8 +2874,8 @@ _CONVOLUTIONPARAMETER = _descriptor.Descriptor( ...@@ -2860,8 +2874,8 @@ _CONVOLUTIONPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=7169, serialized_start=7284,
serialized_end=7677, serialized_end=7792,
) )
...@@ -2897,8 +2911,8 @@ _CROPPARAMETER = _descriptor.Descriptor( ...@@ -2897,8 +2911,8 @@ _CROPPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=7679, serialized_start=7794,
serialized_end=7727, serialized_end=7842,
) )
...@@ -2991,8 +3005,8 @@ _DATAPARAMETER = _descriptor.Descriptor( ...@@ -2991,8 +3005,8 @@ _DATAPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=7730, serialized_start=7845,
serialized_end=8022, serialized_end=8137,
) )
...@@ -3028,8 +3042,8 @@ _DROPOUTPARAMETER = _descriptor.Descriptor( ...@@ -3028,8 +3042,8 @@ _DROPOUTPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8024, serialized_start=8139,
serialized_end=8097, serialized_end=8212,
) )
...@@ -3093,8 +3107,8 @@ _DUMMYDATAPARAMETER = _descriptor.Descriptor( ...@@ -3093,8 +3107,8 @@ _DUMMYDATAPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8100, serialized_start=8215,
serialized_end=8260, serialized_end=8375,
) )
...@@ -3138,8 +3152,8 @@ _ELTWISEPARAMETER = _descriptor.Descriptor( ...@@ -3138,8 +3152,8 @@ _ELTWISEPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8263, serialized_start=8378,
serialized_end=8428, serialized_end=8543,
) )
...@@ -3168,8 +3182,8 @@ _ELUPARAMETER = _descriptor.Descriptor( ...@@ -3168,8 +3182,8 @@ _ELUPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8430, serialized_start=8545,
serialized_end=8462, serialized_end=8577,
) )
...@@ -3226,8 +3240,8 @@ _EMBEDPARAMETER = _descriptor.Descriptor( ...@@ -3226,8 +3240,8 @@ _EMBEDPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8465, serialized_start=8580,
serialized_end=8637, serialized_end=8752,
) )
...@@ -3270,8 +3284,8 @@ _EXPPARAMETER = _descriptor.Descriptor( ...@@ -3270,8 +3284,8 @@ _EXPPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8639, serialized_start=8754,
serialized_end=8707, serialized_end=8822,
) )
...@@ -3307,8 +3321,8 @@ _FLATTENPARAMETER = _descriptor.Descriptor( ...@@ -3307,8 +3321,8 @@ _FLATTENPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8709, serialized_start=8824,
serialized_end=8766, serialized_end=8881,
) )
...@@ -3351,8 +3365,8 @@ _HDF5DATAPARAMETER = _descriptor.Descriptor( ...@@ -3351,8 +3365,8 @@ _HDF5DATAPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8768, serialized_start=8883,
serialized_end=8847, serialized_end=8962,
) )
...@@ -3381,8 +3395,8 @@ _HDF5OUTPUTPARAMETER = _descriptor.Descriptor( ...@@ -3381,8 +3395,8 @@ _HDF5OUTPUTPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8849, serialized_start=8964,
serialized_end=8889, serialized_end=9004,
) )
...@@ -3412,8 +3426,8 @@ _HINGELOSSPARAMETER = _descriptor.Descriptor( ...@@ -3412,8 +3426,8 @@ _HINGELOSSPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8891, serialized_start=9006,
serialized_end=8985, serialized_end=9100,
) )
...@@ -3519,8 +3533,8 @@ _IMAGEDATAPARAMETER = _descriptor.Descriptor( ...@@ -3519,8 +3533,8 @@ _IMAGEDATAPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=8988, serialized_start=9103,
serialized_end=9267, serialized_end=9382,
) )
...@@ -3549,8 +3563,8 @@ _INFOGAINLOSSPARAMETER = _descriptor.Descriptor( ...@@ -3549,8 +3563,8 @@ _INFOGAINLOSSPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=9269, serialized_start=9384,
serialized_end=9308, serialized_end=9423,
) )
...@@ -3614,8 +3628,8 @@ _INNERPRODUCTPARAMETER = _descriptor.Descriptor( ...@@ -3614,8 +3628,8 @@ _INNERPRODUCTPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=9311, serialized_start=9426,
serialized_end=9514, serialized_end=9629,
) )
...@@ -3644,8 +3658,8 @@ _INPUTPARAMETER = _descriptor.Descriptor( ...@@ -3644,8 +3658,8 @@ _INPUTPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=9516, serialized_start=9631,
serialized_end=9565, serialized_end=9680,
) )
...@@ -3688,8 +3702,8 @@ _LOGPARAMETER = _descriptor.Descriptor( ...@@ -3688,8 +3702,8 @@ _LOGPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=9567, serialized_start=9682,
serialized_end=9635, serialized_end=9750,
) )
...@@ -3755,8 +3769,8 @@ _LRNPARAMETER = _descriptor.Descriptor( ...@@ -3755,8 +3769,8 @@ _LRNPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=9638, serialized_start=9753,
serialized_end=9950, serialized_end=10065,
) )
...@@ -3814,8 +3828,8 @@ _MEMORYDATAPARAMETER = _descriptor.Descriptor( ...@@ -3814,8 +3828,8 @@ _MEMORYDATAPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=9953, serialized_start=10068,
serialized_end=10142, serialized_end=10257,
) )
...@@ -3858,8 +3872,8 @@ _MVNPARAMETER = _descriptor.Descriptor( ...@@ -3858,8 +3872,8 @@ _MVNPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=10144, serialized_start=10259,
serialized_end=10245, serialized_end=10360,
) )
...@@ -3888,8 +3902,8 @@ _PARAMETERPARAMETER = _descriptor.Descriptor( ...@@ -3888,8 +3902,8 @@ _PARAMETERPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=10247, serialized_start=10362,
serialized_end=10300, serialized_end=10415,
) )
...@@ -3997,8 +4011,8 @@ _POOLINGPARAMETER = _descriptor.Descriptor( ...@@ -3997,8 +4011,8 @@ _POOLINGPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=10303, serialized_start=10418,
serialized_end=10721, serialized_end=10836,
) )
...@@ -4041,8 +4055,8 @@ _ROIPOOLINGPARAMETER = _descriptor.Descriptor( ...@@ -4041,8 +4055,8 @@ _ROIPOOLINGPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=10723, serialized_start=10838,
serialized_end=10812, serialized_end=10927,
) )
...@@ -4085,8 +4099,8 @@ _POWERPARAMETER = _descriptor.Descriptor( ...@@ -4085,8 +4099,8 @@ _POWERPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=10814, serialized_start=10929,
serialized_end=10884, serialized_end=10999,
) )
...@@ -4136,8 +4150,8 @@ _PYTHONPARAMETER = _descriptor.Descriptor( ...@@ -4136,8 +4150,8 @@ _PYTHONPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=10886, serialized_start=11001,
serialized_end=10989, serialized_end=11104,
) )
...@@ -4181,8 +4195,8 @@ _REDUCTIONPARAMETER = _descriptor.Descriptor( ...@@ -4181,8 +4195,8 @@ _REDUCTIONPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=10992, serialized_start=11107,
serialized_end=11165, serialized_end=11280,
) )
...@@ -4219,8 +4233,8 @@ _RELUPARAMETER = _descriptor.Descriptor( ...@@ -4219,8 +4233,8 @@ _RELUPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=11168, serialized_start=11283,
serialized_end=11309, serialized_end=11424,
) )
...@@ -4263,8 +4277,8 @@ _RESHAPEPARAMETER = _descriptor.Descriptor( ...@@ -4263,8 +4277,8 @@ _RESHAPEPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=11311, serialized_start=11426,
serialized_end=11401, serialized_end=11516,
) )
...@@ -4321,8 +4335,8 @@ _SCALEPARAMETER = _descriptor.Descriptor( ...@@ -4321,8 +4335,8 @@ _SCALEPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=11404, serialized_start=11519,
serialized_end=11569, serialized_end=11684,
) )
...@@ -4352,8 +4366,8 @@ _SIGMOIDPARAMETER = _descriptor.Descriptor( ...@@ -4352,8 +4366,8 @@ _SIGMOIDPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=11571, serialized_start=11686,
serialized_end=11691, serialized_end=11806,
) )
...@@ -4396,8 +4410,8 @@ _SLICEPARAMETER = _descriptor.Descriptor( ...@@ -4396,8 +4410,8 @@ _SLICEPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=11693, serialized_start=11808,
serialized_end=11769, serialized_end=11884,
) )
...@@ -4434,8 +4448,8 @@ _SOFTMAXPARAMETER = _descriptor.Descriptor( ...@@ -4434,8 +4448,8 @@ _SOFTMAXPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=11772, serialized_start=11887,
serialized_end=11909, serialized_end=12024,
) )
...@@ -4465,8 +4479,8 @@ _TANHPARAMETER = _descriptor.Descriptor( ...@@ -4465,8 +4479,8 @@ _TANHPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=11911, serialized_start=12026,
serialized_end=12025, serialized_end=12140,
) )
...@@ -4509,8 +4523,8 @@ _TILEPARAMETER = _descriptor.Descriptor( ...@@ -4509,8 +4523,8 @@ _TILEPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=12027, serialized_start=12142,
serialized_end=12111, serialized_end=12226,
) )
...@@ -4539,8 +4553,8 @@ _THRESHOLDPARAMETER = _descriptor.Descriptor( ...@@ -4539,8 +4553,8 @@ _THRESHOLDPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=12113, serialized_start=12228,
serialized_end=12155, serialized_end=12270,
) )
...@@ -4653,8 +4667,8 @@ _WINDOWDATAPARAMETER = _descriptor.Descriptor( ...@@ -4653,8 +4667,8 @@ _WINDOWDATAPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=12158, serialized_start=12273,
serialized_end=12479, serialized_end=12594,
) )
...@@ -4699,8 +4713,8 @@ _SPPPARAMETER = _descriptor.Descriptor( ...@@ -4699,8 +4713,8 @@ _SPPPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=12482, serialized_start=12597,
serialized_end=12717, serialized_end=12832,
) )
...@@ -5025,8 +5039,8 @@ _V1LAYERPARAMETER = _descriptor.Descriptor( ...@@ -5025,8 +5039,8 @@ _V1LAYERPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=12720, serialized_start=12835,
serialized_end=15248, serialized_end=15363,
) )
...@@ -5315,8 +5329,8 @@ _V0LAYERPARAMETER = _descriptor.Descriptor( ...@@ -5315,8 +5329,8 @@ _V0LAYERPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=15251, serialized_start=15366,
serialized_end=16272, serialized_end=16387,
) )
...@@ -5352,8 +5366,8 @@ _PRELUPARAMETER = _descriptor.Descriptor( ...@@ -5352,8 +5366,8 @@ _PRELUPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=16274, serialized_start=16389,
serialized_end=16361, serialized_end=16476,
) )
...@@ -5382,8 +5396,8 @@ _SMOOTHL1LOSSPARAMETER = _descriptor.Descriptor( ...@@ -5382,8 +5396,8 @@ _SMOOTHL1LOSSPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=16363, serialized_start=16478,
serialized_end=16404, serialized_end=16519,
) )
...@@ -5426,8 +5440,8 @@ _MPIPARAMETER = _descriptor.Descriptor( ...@@ -5426,8 +5440,8 @@ _MPIPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=16406, serialized_start=16521,
serialized_end=16478, serialized_end=16593,
) )
...@@ -5456,8 +5470,8 @@ _PERMUTEPARAMETER = _descriptor.Descriptor( ...@@ -5456,8 +5470,8 @@ _PERMUTEPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=16480, serialized_start=16595,
serialized_end=16513, serialized_end=16628,
) )
...@@ -5492,7 +5506,7 @@ _NORMALIZEPARAMETER = _descriptor.Descriptor( ...@@ -5492,7 +5506,7 @@ _NORMALIZEPARAMETER = _descriptor.Descriptor(
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name='eps', full_name='caffe.NormalizeParameter.eps', index=3, name='eps', full_name='caffe.NormalizeParameter.eps', index=3,
number=4, type=2, cpp_type=6, label=1, number=4, type=2, cpp_type=6, label=1,
has_default_value=True, default_value=0.001, has_default_value=True, default_value=1e-005,
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
...@@ -5507,8 +5521,8 @@ _NORMALIZEPARAMETER = _descriptor.Descriptor( ...@@ -5507,8 +5521,8 @@ _NORMALIZEPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=16516, serialized_start=16631,
serialized_end=16662, serialized_end=16778,
) )
...@@ -5551,8 +5565,8 @@ _PARALLELPARAMETER = _descriptor.Descriptor( ...@@ -5551,8 +5565,8 @@ _PARALLELPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=16664, serialized_start=16780,
serialized_end=16764, serialized_end=16880,
) )
...@@ -5595,8 +5609,8 @@ _RESIZEPARAMETER = _descriptor.Descriptor( ...@@ -5595,8 +5609,8 @@ _RESIZEPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=16766, serialized_start=16882,
serialized_end=16848, serialized_end=16964,
) )
...@@ -5625,8 +5639,8 @@ _EXPANDDIMSPARAMETER = _descriptor.Descriptor( ...@@ -5625,8 +5639,8 @@ _EXPANDDIMSPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=6554, serialized_start=6668,
serialized_end=6593, serialized_end=6707,
) )
...@@ -5725,8 +5739,8 @@ _PROPOSALPARAMETER = _descriptor.Descriptor( ...@@ -5725,8 +5739,8 @@ _PROPOSALPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=16892, serialized_start=17008,
serialized_end=17164, serialized_end=17280,
) )
...@@ -5754,7 +5768,7 @@ _BATCHRENORMPARAMETER = _descriptor.Descriptor( ...@@ -5754,7 +5768,7 @@ _BATCHRENORMPARAMETER = _descriptor.Descriptor(
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name='eps', full_name='caffe.BatchRenormParameter.eps', index=2, name='eps', full_name='caffe.BatchRenormParameter.eps', index=2,
number=3, type=2, cpp_type=6, label=1, number=3, type=2, cpp_type=6, label=1,
has_default_value=True, default_value=0.001, has_default_value=True, default_value=1e-005,
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
...@@ -5790,8 +5804,8 @@ _BATCHRENORMPARAMETER = _descriptor.Descriptor( ...@@ -5790,8 +5804,8 @@ _BATCHRENORMPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=17167, serialized_start=17283,
serialized_end=17333, serialized_end=17450,
) )
...@@ -5827,8 +5841,8 @@ _DENSECONCATPARAMETER = _descriptor.Descriptor( ...@@ -5827,8 +5841,8 @@ _DENSECONCATPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=17335, serialized_start=17452,
serialized_end=17398, serialized_end=17515,
) )
...@@ -5871,8 +5885,8 @@ _FOCALLOSSPARAMETER = _descriptor.Descriptor( ...@@ -5871,8 +5885,8 @@ _FOCALLOSSPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=17400, serialized_start=17517,
serialized_end=17478, serialized_end=17595,
) )
...@@ -5901,8 +5915,38 @@ _GATHERPARAMETER = _descriptor.Descriptor( ...@@ -5901,8 +5915,38 @@ _GATHERPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=17480, serialized_start=17597,
serialized_end=17514, serialized_end=17631,
)
_INSTANCENORMPARAMETER = _descriptor.Descriptor(
name='InstanceNormParameter',
full_name='caffe.InstanceNormParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='eps', full_name='caffe.InstanceNormParameter.eps', index=0,
number=1, type=2, cpp_type=6, label=1,
has_default_value=True, default_value=1e-005,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
extension_ranges=[],
oneofs=[
],
serialized_start=17633,
serialized_end=17677,
) )
...@@ -5914,30 +5958,67 @@ _GROUPNORMPARAMETER = _descriptor.Descriptor( ...@@ -5914,30 +5958,67 @@ _GROUPNORMPARAMETER = _descriptor.Descriptor(
containing_type=None, containing_type=None,
fields=[ fields=[
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name='use_global_stats', full_name='caffe.GroupNormParameter.use_global_stats', index=0, name='eps', full_name='caffe.GroupNormParameter.eps', index=0,
number=1, type=8, cpp_type=7, label=1, number=1, type=2, cpp_type=6, label=1,
has_default_value=False, default_value=False, has_default_value=True, default_value=1e-005,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='group', full_name='caffe.GroupNormParameter.group', index=1,
number=2, type=5, cpp_type=1, label=1,
has_default_value=True, default_value=32,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
extension_ranges=[],
oneofs=[
],
serialized_start=17679,
serialized_end=17739,
)
_DROPBLOCKPARAMETER = _descriptor.Descriptor(
name='DropBlockParameter',
full_name='caffe.DropBlockParameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='block_size', full_name='caffe.DropBlockParameter.block_size', index=0,
number=1, type=5, cpp_type=1, label=1,
has_default_value=True, default_value=7,
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name='moving_average_fraction', full_name='caffe.GroupNormParameter.moving_average_fraction', index=1, name='keep_prob', full_name='caffe.DropBlockParameter.keep_prob', index=1,
number=2, type=2, cpp_type=6, label=1, number=2, type=2, cpp_type=6, label=1,
has_default_value=True, default_value=0.9, has_default_value=True, default_value=0.9,
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name='eps', full_name='caffe.GroupNormParameter.eps', index=2, name='alpha', full_name='caffe.DropBlockParameter.alpha', index=2,
number=3, type=2, cpp_type=6, label=1, number=3, type=2, cpp_type=6, label=1,
has_default_value=True, default_value=0.001, has_default_value=True, default_value=1,
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
_descriptor.FieldDescriptor( _descriptor.FieldDescriptor(
name='group', full_name='caffe.GroupNormParameter.group', index=3, name='decrement', full_name='caffe.DropBlockParameter.decrement', index=3,
number=5, type=13, cpp_type=3, label=1, number=4, type=2, cpp_type=6, label=1,
has_default_value=True, default_value=32, has_default_value=True, default_value=0,
message_type=None, enum_type=None, containing_type=None, message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None, is_extension=False, extension_scope=None,
options=None), options=None),
...@@ -5952,8 +6033,8 @@ _GROUPNORMPARAMETER = _descriptor.Descriptor( ...@@ -5952,8 +6033,8 @@ _GROUPNORMPARAMETER = _descriptor.Descriptor(
extension_ranges=[], extension_ranges=[],
oneofs=[ oneofs=[
], ],
serialized_start=17516, serialized_start=17741,
serialized_end=17639, serialized_end=17848,
) )
_BLOBPROTO.fields_by_name['shape'].message_type = _BLOBSHAPE _BLOBPROTO.fields_by_name['shape'].message_type = _BLOBSHAPE
...@@ -6044,7 +6125,9 @@ _LAYERPARAMETER.fields_by_name['batch_renorm_param'].message_type = _BATCHRENORM ...@@ -6044,7 +6125,9 @@ _LAYERPARAMETER.fields_by_name['batch_renorm_param'].message_type = _BATCHRENORM
_LAYERPARAMETER.fields_by_name['dense_concat_param'].message_type = _DENSECONCATPARAMETER _LAYERPARAMETER.fields_by_name['dense_concat_param'].message_type = _DENSECONCATPARAMETER
_LAYERPARAMETER.fields_by_name['focal_loss_param'].message_type = _FOCALLOSSPARAMETER _LAYERPARAMETER.fields_by_name['focal_loss_param'].message_type = _FOCALLOSSPARAMETER
_LAYERPARAMETER.fields_by_name['gather_param'].message_type = _GATHERPARAMETER _LAYERPARAMETER.fields_by_name['gather_param'].message_type = _GATHERPARAMETER
_LAYERPARAMETER.fields_by_name['instance_norm_param'].message_type = _INSTANCENORMPARAMETER
_LAYERPARAMETER.fields_by_name['group_norm_param'].message_type = _GROUPNORMPARAMETER _LAYERPARAMETER.fields_by_name['group_norm_param'].message_type = _GROUPNORMPARAMETER
_LAYERPARAMETER.fields_by_name['drop_block_param'].message_type = _DROPBLOCKPARAMETER
_LOSSPARAMETER_EXPANDDIMSPARAMETER.containing_type = _LOSSPARAMETER _LOSSPARAMETER_EXPANDDIMSPARAMETER.containing_type = _LOSSPARAMETER
_LOSSPARAMETER.fields_by_name['normalization'].enum_type = _LOSSPARAMETER_NORMALIZATIONMODE _LOSSPARAMETER.fields_by_name['normalization'].enum_type = _LOSSPARAMETER_NORMALIZATIONMODE
_LOSSPARAMETER_NORMALIZATIONMODE.containing_type = _LOSSPARAMETER _LOSSPARAMETER_NORMALIZATIONMODE.containing_type = _LOSSPARAMETER
...@@ -6215,7 +6298,9 @@ DESCRIPTOR.message_types_by_name['BatchRenormParameter'] = _BATCHRENORMPARAMETER ...@@ -6215,7 +6298,9 @@ DESCRIPTOR.message_types_by_name['BatchRenormParameter'] = _BATCHRENORMPARAMETER
DESCRIPTOR.message_types_by_name['DenseConcatParameter'] = _DENSECONCATPARAMETER DESCRIPTOR.message_types_by_name['DenseConcatParameter'] = _DENSECONCATPARAMETER
DESCRIPTOR.message_types_by_name['FocalLossParameter'] = _FOCALLOSSPARAMETER DESCRIPTOR.message_types_by_name['FocalLossParameter'] = _FOCALLOSSPARAMETER
DESCRIPTOR.message_types_by_name['GatherParameter'] = _GATHERPARAMETER DESCRIPTOR.message_types_by_name['GatherParameter'] = _GATHERPARAMETER
DESCRIPTOR.message_types_by_name['InstanceNormParameter'] = _INSTANCENORMPARAMETER
DESCRIPTOR.message_types_by_name['GroupNormParameter'] = _GROUPNORMPARAMETER DESCRIPTOR.message_types_by_name['GroupNormParameter'] = _GROUPNORMPARAMETER
DESCRIPTOR.message_types_by_name['DropBlockParameter'] = _DROPBLOCKPARAMETER
DESCRIPTOR.enum_types_by_name['Phase'] = _PHASE DESCRIPTOR.enum_types_by_name['Phase'] = _PHASE
BlobShape = _reflection.GeneratedProtocolMessageType('BlobShape', (_message.Message,), dict( BlobShape = _reflection.GeneratedProtocolMessageType('BlobShape', (_message.Message,), dict(
...@@ -6737,6 +6822,13 @@ GatherParameter = _reflection.GeneratedProtocolMessageType('GatherParameter', (_ ...@@ -6737,6 +6822,13 @@ GatherParameter = _reflection.GeneratedProtocolMessageType('GatherParameter', (_
)) ))
_sym_db.RegisterMessage(GatherParameter) _sym_db.RegisterMessage(GatherParameter)
InstanceNormParameter = _reflection.GeneratedProtocolMessageType('InstanceNormParameter', (_message.Message,), dict(
DESCRIPTOR = _INSTANCENORMPARAMETER,
__module__ = 'caffe_pb2'
# @@protoc_insertion_point(class_scope:caffe.InstanceNormParameter)
))
_sym_db.RegisterMessage(InstanceNormParameter)
GroupNormParameter = _reflection.GeneratedProtocolMessageType('GroupNormParameter', (_message.Message,), dict( GroupNormParameter = _reflection.GeneratedProtocolMessageType('GroupNormParameter', (_message.Message,), dict(
DESCRIPTOR = _GROUPNORMPARAMETER, DESCRIPTOR = _GROUPNORMPARAMETER,
__module__ = 'caffe_pb2' __module__ = 'caffe_pb2'
...@@ -6744,6 +6836,13 @@ GroupNormParameter = _reflection.GeneratedProtocolMessageType('GroupNormParamete ...@@ -6744,6 +6836,13 @@ GroupNormParameter = _reflection.GeneratedProtocolMessageType('GroupNormParamete
)) ))
_sym_db.RegisterMessage(GroupNormParameter) _sym_db.RegisterMessage(GroupNormParameter)
DropBlockParameter = _reflection.GeneratedProtocolMessageType('DropBlockParameter', (_message.Message,), dict(
DESCRIPTOR = _DROPBLOCKPARAMETER,
__module__ = 'caffe_pb2'
# @@protoc_insertion_point(class_scope:caffe.DropBlockParameter)
))
_sym_db.RegisterMessage(DropBlockParameter)
_BLOBSHAPE.fields_by_name['dim'].has_options = True _BLOBSHAPE.fields_by_name['dim'].has_options = True
_BLOBSHAPE.fields_by_name['dim']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001')) _BLOBSHAPE.fields_by_name['dim']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))
......
...@@ -24,7 +24,7 @@ def convert_to_tensor(value, dtype=None, name=None, **kwargs): ...@@ -24,7 +24,7 @@ def convert_to_tensor(value, dtype=None, name=None, **kwargs):
Parameters Parameters
---------- ----------
value : basic type, list or numpy.ndarray value : number, list or numpy.ndarray
The value to convert. The value to convert.
dtype : Dtype or None dtype : Dtype or None
The data type. If ``None``, inferred from the type of `value`. The data type. If ``None``, inferred from the type of `value`.
......
...@@ -15,6 +15,7 @@ import numpy as np ...@@ -15,6 +15,7 @@ import numpy as np
import dragon.core.mpi as mpi import dragon.core.mpi as mpi
import dragon.core.workspace as ws import dragon.core.workspace as ws
import dragon.protos.dragon_pb2 as pb import dragon.protos.dragon_pb2 as pb
from dragon.core.utils import MakeArgument from dragon.core.utils import MakeArgument
from dragon.core.gradient_maker import GraphGradientMaker from dragon.core.gradient_maker import GraphGradientMaker
from dragon.core.scope import GetOperatorName, GetTensorName from dragon.core.scope import GetOperatorName, GetTensorName
...@@ -156,6 +157,7 @@ def GraphDef_Opt(meta_graph): ...@@ -156,6 +157,7 @@ def GraphDef_Opt(meta_graph):
OX = 3 if option['share_grads'] else 2 OX = 3 if option['share_grads'] else 2
if option['debug_mode']: OX = 1 if option['debug_mode']: OX = 1
meta_graph.arg.add().CopyFrom(MakeArgument('optimization_level', OX)) meta_graph.arg.add().CopyFrom(MakeArgument('optimization_level', OX))
meta_graph.graph_type = option['graph_type']
def GraphDef_Device(meta_graph): def GraphDef_Device(meta_graph):
...@@ -181,11 +183,12 @@ def GraphDef_Device(meta_graph): ...@@ -181,11 +183,12 @@ def GraphDef_Device(meta_graph):
""" """
from dragon.config import option from dragon.config import option
if option['device'] is not 'None': if option['device'] is not 'None':
supports = {'CPU': 0, 'CUDA': 1} supports = {'CPU': 0, 'CUDA': 1, 'CNML': 2}
device_option = pb.DeviceOption() device_option = pb.DeviceOption()
device_option.device_type = supports[option['device']] device_option.device_type = supports[option['device']]
device_option.device_id = option['gpu_id'] device_option.device_id = option['device_id']
device_option.random_seed = option['random_seed'] device_option.random_seed = option['random_seed']
if option['device'] == 'CUDA':
if option['use_cudnn']: device_option.engine = 'CUDNN' if option['use_cudnn']: device_option.engine = 'CUDNN'
meta_graph.device_option.CopyFrom(device_option) meta_graph.device_option.CopyFrom(device_option)
...@@ -217,16 +220,16 @@ def function(inputs=None, outputs=None, givens=None, updater=None): ...@@ -217,16 +220,16 @@ def function(inputs=None, outputs=None, givens=None, updater=None):
Examples Examples
-------- --------
>>> x = Tensor('x').Variable() >>> x = Tensor('x', dtype='float32').Variable()
>>> y = x * 2 >>> y = x * 2
>>> f = theano.function(outputs=y) >>> f = function(outputs=y)
>>> x.set_value(np.ones((2, 3), dtype=np.float32)) >>> x.set_value(np.ones((2, 3)))
>>> print(f()) >>> print(f())
>>> [[ 2. 2. 2.] >>> [[ 2. 2. 2.]
[ 2. 2. 2.]] [ 2. 2. 2.]]
>>> f = theano.function(inputs=x, outputs=y) >>> f = function(inputs=x, outputs=y)
>>> print(f(np.ones((2, 3), dtype=np.float32))) >>> print(f(np.ones((2, 3)))
>>> [[ 2. 2. 2.] >>> [[ 2. 2. 2.]
[ 2. 2. 2.]] [ 2. 2. 2.]]
...@@ -339,13 +342,15 @@ def eval(self, feed_dict=None): ...@@ -339,13 +342,15 @@ def eval(self, feed_dict=None):
raise TypeError('The key of feed_dict key should be a Tensor.') raise TypeError('The key of feed_dict key should be a Tensor.')
if key.shape is not None: if key.shape is not None:
if len(key.shape) != len(value.shape): if len(key.shape) != len(value.shape):
raise RuntimeError('The Tensor({}) was limited to {} dimensions, \ raise RuntimeError(
while feed a value with {} dimensions.'. 'The Tensor({}) was limited to {} dimensions, \
format(key.name, len(key.shape), len(value.shape))) while feed a value with {} dimensions.'.format(
key.name, len(key.shape), len(value.shape)))
for i in range(len(key.shape)): for i in range(len(key.shape)):
if key.shape[i] is None: continue if key.shape[i] is None: continue
if key.shape[i] != value.shape[i]: if key.shape[i] != value.shape[i]:
raise RuntimeError('The shape of Tensor({}) was limited as ('.format(key.name) + raise RuntimeError(
'The shape of Tensor({}) was limited as ('.format(key.name) +
','.join([str(dim) for dim in key.shape]) + '), ' + ','.join([str(dim) for dim in key.shape]) + '), ' +
'while feed a value with (' + ','.join([str(dim) for dim in value.shape]) + ').') 'while feed a value with (' + ','.join([str(dim) for dim in value.shape]) + ').')
return self._eval_func(*feed_dict.values()) return self._eval_func(*feed_dict.values())
......
...@@ -20,7 +20,7 @@ def shared(value, name=None, **kwargs): ...@@ -20,7 +20,7 @@ def shared(value, name=None, **kwargs):
Parameters Parameters
---------- ----------
value : basic type, list or numpy.ndarray value : number, list or numpy.ndarray
The numerical values. The numerical values.
name : str name : str
The name of tensor. The name of tensor.
......
...@@ -42,6 +42,7 @@ class Module(object): ...@@ -42,6 +42,7 @@ class Module(object):
self._buffers = OrderedDict() self._buffers = OrderedDict()
self._persistent_key = self._op = None self._persistent_key = self._op = None
self._ctx = ('CPU', 0) self._ctx = ('CPU', 0)
self.training = True
def __getattr__(self, item): def __getattr__(self, item):
if '_parameters' in self.__dict__: if '_parameters' in self.__dict__:
...@@ -363,3 +364,12 @@ class Module(object): ...@@ -363,3 +364,12 @@ class Module(object):
def run(self, inputs, outputs, auto_grad=True): def run(self, inputs, outputs, auto_grad=True):
meta = ('PERSISTENT', self.persistent_key, self.op) meta = ('PERSISTENT', self.persistent_key, self.op)
return RunOperator(inputs, outputs, meta, auto_grad=auto_grad) return RunOperator(inputs, outputs, meta, auto_grad=auto_grad)
def train(self, mode=True):
self.training = mode
for module in self.children():
module.train(mode)
return self
def eval(self):
return self.train(False)
\ No newline at end of file
...@@ -10,20 +10,35 @@ ...@@ -10,20 +10,35 @@
# ------------------------------------------------------------ # ------------------------------------------------------------
"""We move the Module & Parameter to ``torch`` instead of ``torch.nn``, """We move the Module & Parameter to ``torch`` instead of ``torch.nn``,
as it will be reused by the ``torch.ops``. as it will be reused by the ``torch.ops``.
""" """
from dragon.vm.torch.module import Module from dragon.vm.torch.module import Module
from dragon.vm.torch.tensor import Parameter from dragon.vm.torch.tensor import Parameter
from .modules.conv import Conv2d, ConvTranspose2d from .modules.conv import Conv2d, ConvTranspose2d
from .modules.pooling import MaxPool2d, AvgPool2d from .modules.pooling import MaxPool2d, AvgPool2d
from .modules.activation import ReLU, LeakyReLU, Sigmoid, Softmax
from .modules.activation import (
ReLU, LeakyReLU, ELU, SELU,
Sigmoid, Softmax,
)
from .modules.linear import Linear from .modules.linear import Linear
from .modules.loss import CrossEntropyLoss
from .modules.loss import (
BCEWithLogitsLoss,
NLLLoss, CrossEntropyLoss,
L1Loss, MSELoss, SmoothL1Loss,
)
from .modules.container import Container, Sequential, ModuleList from .modules.container import Container, Sequential, ModuleList
from .modules.batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d from .modules.batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d
from .modules.groupnorm import GroupNorm1d, GroupNorm2d, GroupNorm3d
from .modules.affine import Affine from .modules.affine import Affine
from .modules.dropout import Dropout, Dropout2d, Dropout3d from .modules.dropout import Dropout, Dropout2d, Dropout3d
from .modules.dropblock import DropBlock2d
from .modules.rnn import RNNBase, RNN, LSTM, GRU from .modules.rnn import RNNBase, RNN, LSTM, GRU
from . import init from . import init
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# Codes are based on:
#
# <https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py>
#
# ------------------------------------------------------------
import warnings
class _Reduction:
@staticmethod
def get_enum(reduction):
if reduction == 'none':
return 0
if reduction == 'elementwise_mean':
return 1
if reduction == 'sum':
return 2
raise ValueError(reduction + " is not a valid value for reduction")
# In order to support previous versions, accept boolean size_average and reduce
# and convert them into the new constants for now
# We use these functions in torch/legacy as well, in which case we'll silence the warning
@staticmethod
def legacy_get_string(size_average, reduce, emit_warning=True):
warning = "size_average and reduce args will be deprecated, please use reduction='{}' instead."
if size_average is None:
size_average = True
if reduce is None:
reduce = True
if size_average and reduce:
ret = 'elementwise_mean'
elif reduce:
ret = 'sum'
else:
ret = 'none'
if emit_warning:
warnings.warn(warning.format(ret))
return ret
@staticmethod
def legacy_get_enum(size_average, reduce, emit_warning=True):
return _Reduction.get_enum(_Reduction.legacy_get_string(size_average, reduce, emit_warning))
\ No newline at end of file
...@@ -55,6 +55,47 @@ class LeakyReLU(Module): ...@@ -55,6 +55,47 @@ class LeakyReLU(Module):
return self.run(inputs, outputs) return self.run(inputs, outputs)
class ELU(Module):
    """Module wrapper registering the backend 'Elu' operator.

    Parameters
    ----------
    alpha : float
        The alpha argument forwarded to the backend operator.
    inplace : boolean
        Whether to write the result back into the input tensor.
    """

    def __init__(self, alpha=1.0, inplace=False):
        super(ELU, self).__init__()
        self.alpha = alpha
        self._inplace = inplace
        self.register_op()

    def register_op(self):
        # Operator meta is built once and reused on every forward call.
        self.op_meta = {
            'op_type': 'Elu',
            'n_inputs': 1, 'n_outputs': 1,
            'arguments': {
                'alpha': self.alpha,
            }
        }

    def forward(self, x):
        inputs = [x]
        self.unify_devices(inputs)
        if self._inplace:
            outputs = [x]
        else:
            outputs = [self.register_output(x.dtype)]
        return self.run(inputs, outputs)
class SELU(Module):
    """Module wrapper registering the backend 'SElu' operator.

    Parameters
    ----------
    inplace : boolean
        Whether to write the result back into the input tensor.
    """

    def __init__(self, inplace=False):
        super(SELU, self).__init__()
        self._inplace = inplace
        self.register_op()

    def register_op(self):
        # 'SElu' takes no extra arguments; the meta dict is still required.
        self.op_meta = {
            'op_type': 'SElu',
            'n_inputs': 1, 'n_outputs': 1,
            'arguments': {}
        }

    def forward(self, x):
        inputs = [x]
        self.unify_devices(inputs)
        if self._inplace:
            outputs = [x]
        else:
            outputs = [self.register_output(x.dtype)]
        return self.run(inputs, outputs)
class Sigmoid(Module): class Sigmoid(Module):
def __init__(self, inplace=False): def __init__(self, inplace=False):
super(Sigmoid, self).__init__() super(Sigmoid, self).__init__()
......
...@@ -102,7 +102,7 @@ class _BatchNorm(Module): ...@@ -102,7 +102,7 @@ class _BatchNorm(Module):
inputs = [input] + self.inputs inputs = [input] + self.inputs
self.unify_devices(inputs) self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)] outputs = [self.register_output(input.dtype)]
phase = 'TRAIN' if input.requires_grad else 'TEST' phase = 'TRAIN' if self.training else 'TEST'
# Normalize the input by using batch stats ALWAYS # Normalize the input by using batch stats ALWAYS
# Note that the update of moving average is meaningless( # Note that the update of moving average is meaningless(
# Because we can not remove it. Why? Ask nvidia and cuDNN -:) # Because we can not remove it. Why? Ask nvidia and cuDNN -:)
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.nn import Module
class DropBlock2d(Module):
    """Module wrapper registering the backend 'DropBlock2d' operator.

    Acts as an identity function when ``self.training`` is False;
    otherwise runs the backend operator over NCHW inputs.

    Parameters
    ----------
    block_size : int
        The size of the dropped blocks (backend 'block_size' argument).
    kp : float
        The keep probability (backend 'keep_prob' argument).
    alpha : float
        Forwarded to the backend 'alpha' argument.
    decrement : float
        Forwarded to the backend 'decrement' argument.
    inplace : boolean
        Whether to write the result back into the input tensor.
    """

    def __init__(self, block_size=7, kp=0.9,
                 alpha=1., decrement=0., inplace=False):
        super(DropBlock2d, self).__init__()
        self.kp = kp
        self.block_size = block_size
        self.alpha = alpha
        self.decrement = decrement
        self.inplace = inplace
        self.register_op()

    def register_op(self):
        # The phase is fixed to 'TRAIN' because eval mode short-circuits
        # in forward() and the operator is never run.
        self.op_meta = {
            'op_type': 'DropBlock2d',
            'n_inputs': 1, 'n_outputs': 1,
            'arguments': {
                'block_size': self.block_size,
                'keep_prob': self.kp,
                'alpha': self.alpha,
                'decrement': self.decrement,
                'data_format': 'NCHW',
                'phase': 'TRAIN',
            }
        }

    def forward(self, input):
        # Identity in eval mode: dropout-style ops only apply while training.
        if not self.training:
            return input
        inputs = [input]
        self.unify_devices(inputs)
        if self.inplace:
            outputs = [input]
        else:
            outputs = [self.register_output(input.dtype)]
        return self.run(inputs, outputs)
\ No newline at end of file
...@@ -34,7 +34,7 @@ class Dropout(Module): ...@@ -34,7 +34,7 @@ class Dropout(Module):
} }
def forward(self, input): def forward(self, input):
if not input.requires_grad: return input if not self.training: return input
inputs = [input] inputs = [input]
self.unify_devices(inputs) self.unify_devices(inputs)
outputs = [input if self.inplace else self.register_output(input.dtype)] outputs = [input if self.inplace else self.register_output(input.dtype)]
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.nn import Module, Parameter
from dragon.vm.torch.ops.creation import zeros, ones
from dragon.vm.torch.module import RunOperator
class _GroupNorm(Module):
    """Shared implementation behind GroupNorm1d/2d/3d.

    Registers either the 'FusedGroupNorm' operator (with learnable scale
    and shift) or the plain 'GroupNorm' operator, normalizing over channel
    axis 1 (NCHW layout).

    Parameters
    ----------
    num_features : int
        The number of channels; sizes the affine parameters.
    group : int
        The number of groups forwarded to the backend.
    eps : float
        The epsilon forwarded to the backend.
    affine : boolean
        Whether to allocate learnable weight/bias parameters.
    """

    def __init__(self, num_features, group=32,
                 eps=1e-5, affine=True):
        super(_GroupNorm, self).__init__()
        self.num_features = num_features
        self.group = group
        self.eps = eps
        self.affine = affine
        if self.affine:
            # One learnable scale and shift value per channel.
            self.weight = Parameter(Tensor(num_features))
            self.bias = Parameter(Tensor(num_features))
            self.inputs = [self.weight, self.bias]
        else:
            self.weight = self.bias = None
            self.inputs = []
        self.reset_parameters()
        self.register_op()

    def reset_parameters(self):
        # Matches the original initialization: uniform weight, zero bias.
        if self.affine:
            self.weight.data.uniform_()
            self.bias.data.zero_()

    def register_op(self):
        fused = self.affine
        self.op_meta = {
            'op_type': 'FusedGroupNorm' if fused else 'GroupNorm',
            'n_inputs': 3 if fused else 1, 'n_outputs': 1,
            'arguments': {
                'group': self.group,
                'axis': 1,  # Data format: NCHW
                'eps': self.eps,
            }
        }

    def forward(self, input):
        inputs = [input] + self.inputs
        self.unify_devices(inputs)
        outputs = [self.register_output(input.dtype)]
        return self.run(inputs, outputs)
class GroupNorm1d(_GroupNorm):
    """Dragon does not use separate backend functions.

    Behaves exactly like ``_GroupNorm``; presumably exists only to mirror
    the dimension-suffixed ``torch.nn`` naming — TODO confirm.
    """
    pass
class GroupNorm2d(_GroupNorm):
    """Dragon does not use separate backend functions.

    Behaves exactly like ``_GroupNorm``; presumably exists only to mirror
    the dimension-suffixed ``torch.nn`` naming — TODO confirm.
    """
    pass
class GroupNorm3d(_GroupNorm):
    """Dragon does not use separate backend functions.

    Behaves exactly like ``_GroupNorm``; presumably exists only to mirror
    the dimension-suffixed ``torch.nn`` naming — TODO confirm.
    """
    pass
\ No newline at end of file
...@@ -18,50 +18,176 @@ from __future__ import division ...@@ -18,50 +18,176 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.vm.torch.nn import Module from dragon.vm.torch.nn import Module
from dragon.vm.torch.nn.functional import _Reduction
def _assert_no_grad(variable):
assert not variable.requires_grad, \
"nn criterions don't compute the gradient w.r.t. targets - please " \
"mark these variables as not requiring gradients"
class _Loss(Module): class _Loss(Module):
def __init__(self, size_average=True): def __init__(self, size_average=None, reduce=None, reduction='elementwise_mean'):
super(_Loss, self).__init__() super(_Loss, self).__init__()
self.size_average = size_average if size_average is not None or reduce is not None:
self.reduction = _Reduction.legacy_get_string(size_average, reduce)
else:
self.reduction = reduction
class _WeightedLoss(_Loss): class _WeightedLoss(_Loss):
def __init__(self, weight=None, size_average=True): def __init__(self, weight=None, size_average=None, reduce=None, reduction='elementwise_mean'):
super(_WeightedLoss, self).__init__(size_average) super(_WeightedLoss, self).__init__(size_average, reduce, reduction)
self.weight = weight self.weight = weight
# TODO(PhyscalX): Dragon will support it later :).
if weight is not None: if weight is not None:
raise NotImplementedError('WeightedLoss has been not implemented yet.') raise NotImplementedError('WeightedLoss has been not implemented yet.')
class NLLLoss(_WeightedLoss):
    """Module wrapper registering the backend 'NLLLoss' operator.

    Parameters
    ----------
    weight : Tensor or None
        Per-class weights. Not implemented yet (rejected by _WeightedLoss).
    size_average : boolean or None
        Deprecated; folded into ``reduction`` via _Reduction.
    ignore_index : int
        A negative value disables ignoring; a non-negative value is passed
        to the backend as the single ignored label.
    reduce : boolean or None
        Deprecated; folded into ``reduction`` via _Reduction.
    reduction : str
        One of 'elementwise_mean', 'sum', 'none'.
    """

    def __init__(self, weight=None, size_average=None, ignore_index=-100,
                 reduce=None, reduction='elementwise_mean'):
        super(NLLLoss, self).__init__(weight, size_average, reduce, reduction)
        self.ignore_index = ignore_index
        # Map the reduction string onto the backend normalization mode.
        self.normalization = {
            'elementwise_mean': 'VALID',
            'sum': 'None',
            'none': 'UNIT'}[self.reduction]
        self.register_op()

    def register_op(self):
        self.op_meta = {
            'op_type': 'NLLLoss',
            'n_inputs': 2, 'n_outputs': 1,
            'arguments': {
                'axis': 1,
                'normalization': self.normalization,
                # BUG FIX: ``(x)`` is just ``x`` — a one-element tuple needs
                # a trailing comma. Without it a bare int was passed where
                # a sequence of labels is expected (cf. the ``()`` default).
                'ignore_labels': () if self.ignore_index < 0 else (self.ignore_index,),
            }
        }

    def forward(self, input, target):
        inputs = [input, target]; self.unify_devices(inputs)
        outputs = [self.register_output(input.dtype)]
        return self.run(inputs, outputs)
class BCEWithLogitsLoss(_WeightedLoss):
    """Module wrapper registering the backend 'SigmoidCrossEntropy' operator.

    ``weight`` and ``pos_weight`` are accepted for API compatibility but
    both raise NotImplementedError when given.
    """

    def __init__(self, weight=None, size_average=None, reduce=None,
                 reduction='elementwise_mean', pos_weight=None):
        super(BCEWithLogitsLoss, self).__init__(weight, size_average, reduce, reduction)
        if pos_weight is not None:
            raise NotImplementedError('Positive weight has been not implemented yet.')
        # NOTE(review): 'None' (not 'NONE') matches the sibling losses in
        # this file — confirm the backend accepts this spelling.
        reduction_to_normalization = {
            'elementwise_mean': 'VALID',
            'sum': 'None',
            'none': 'UNIT'}
        self.normalization = reduction_to_normalization[self.reduction]
        self.register_op()

    def register_op(self):
        self.op_meta = {
            'op_type': 'SigmoidCrossEntropy',
            'n_inputs': 2, 'n_outputs': 1,
            'arguments': {
                'normalization': self.normalization,
            }
        }

    def forward(self, input, target):
        inputs = [input, target]
        self.unify_devices(inputs)
        outputs = [self.register_output(input.dtype)]
        return self.run(inputs, outputs)
class CrossEntropyLoss(_WeightedLoss): class CrossEntropyLoss(_WeightedLoss):
def __init__(self, weight=None, size_average=True, ignore_index=-100, reduce=True): def __init__(self, weight=None, size_average=None, ignore_index=-100,
super(CrossEntropyLoss, self).__init__(weight, size_average) reduce=None, reduction='elementwise_mean'):
super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
self.ignore_index = ignore_index self.ignore_index = ignore_index
self.reduce = reduce self.normalization = {
'elementwise_mean': 'VALID',
'sum': 'None',
'none': 'UNIT'}[self.reduction]
self.register_op() self.register_op()
def register_op(self): def register_op(self):
self.op_meta = { self.op_meta = {
'op_type': 'SparseSoftmaxCrossEntropy' if self.reduce else 'SoftmaxCrossEntropy', 'op_type': 'SparseSoftmaxCrossEntropy',
'n_inputs': 2, 'n_outputs': 1, 'n_inputs': 2, 'n_outputs': 1,
'arguments': { 'arguments': {
'axis': 1, 'axis': 1,
'normalization': 'VALID' if self.size_average else 'NONE', 'normalization': self.normalization,
'ignore_labels': () if self.ignore_index < 0 else (self.ignore_index), 'ignore_labels': () if self.ignore_index < 0 else (self.ignore_index),
} }
} }
def forward(self, input, target): def forward(self, input, target):
_assert_no_grad(target) inputs = [input, target]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
class L1Loss(_Loss):
    """Module wrapper registering the backend 'L1Loss' operator.

    Only 'elementwise_mean' and 'sum' reductions are supported; any other
    reduction raises KeyError from the mapping below.
    """

    def __init__(self, size_average=None, reduce=None, reduction='elementwise_mean'):
        super(L1Loss, self).__init__(size_average, reduce, reduction)
        # Map the reduction string onto the backend normalization mode.
        reduction_to_normalization = {
            'elementwise_mean': 'BATCH_SIZE',
            'sum': 'None'}
        self.normalization = reduction_to_normalization[self.reduction]
        self.register_op()

    def register_op(self):
        self.op_meta = {
            'op_type': 'L1Loss',
            'n_inputs': 2, 'n_outputs': 1,
            'arguments': {
                'normalization': self.normalization,
            }
        }

    def forward(self, input, target):
        inputs = [input, target]
        self.unify_devices(inputs)
        outputs = [self.register_output(input.dtype)]
        return self.run(inputs, outputs)
class MSELoss(_Loss):
    """Module wrapper registering the backend 'L2Loss' operator.

    Only 'elementwise_mean' and 'sum' reductions are supported; any other
    reduction raises KeyError from the mapping below.
    """

    def __init__(self, size_average=None, reduce=None, reduction='elementwise_mean'):
        super(MSELoss, self).__init__(size_average, reduce, reduction)
        # Map the reduction string onto the backend normalization mode.
        reduction_to_normalization = {
            'elementwise_mean': 'BATCH_SIZE',
            'sum': 'None'}
        self.normalization = reduction_to_normalization[self.reduction]
        self.register_op()

    def register_op(self):
        self.op_meta = {
            'op_type': 'L2Loss',
            'n_inputs': 2, 'n_outputs': 1,
            'arguments': {
                'normalization': self.normalization,
            }
        }

    def forward(self, input, target):
        inputs = [input, target]
        self.unify_devices(inputs)
        outputs = [self.register_output(input.dtype)]
        return self.run(inputs, outputs)
class SmoothL1Loss(_Loss):
    """The smooth-L1 (Huber-style) loss module."""

    def __init__(self, size_average=None, beta=1.0,
                 reduce=None, reduction='elementwise_mean'):
        super(SmoothL1Loss, self).__init__(size_average, reduce, reduction)
        # Translate the torch-style reduction into Dragon's normalization flag.
        reduction_to_norm = {
            'elementwise_mean': 'BATCH_SIZE',
            'sum': 'None',
        }
        self.normalization = reduction_to_norm[self.reduction]
        # Threshold between the quadratic and linear regions.
        self.beta = beta
        self.register_op()

    def register_op(self):
        # Describe the backing Dragon operator for this module.
        self.op_meta = {
            'op_type': 'SmoothL1Loss',
            'n_inputs': 2, 'n_outputs': 1,
            'arguments': {
                'beta': self.beta,
                'normalization': self.normalization,
            }
        }

    def forward(self, input, target):
        """Compute the smooth-L1 loss between ``input`` and ``target``."""
        inputs = [input, target]
        self.unify_devices(inputs)
        outputs = [self.register_output(input.dtype)]
        return self.run(inputs, outputs)
...@@ -11,11 +11,12 @@ ...@@ -11,11 +11,12 @@
from .creation import ( from .creation import (
zeros, zeros_like, ones, ones_like, zeros, zeros_like, ones, ones_like,
rand, randn one_hot, rand, randn,
) )
from .arithmetic import ( from .arithmetic import (
add, sub, mul, div, add, sub, mul, div, log, exp,
maximum, minimum, clamp,
) )
from .ndarray import ( from .ndarray import (
......
...@@ -16,14 +16,15 @@ from __future__ import print_function ...@@ -16,14 +16,15 @@ from __future__ import print_function
from dragon.vm.torch.tensor import Tensor from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar
from dragon.vm.torch.ops.factory import get_module from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.arithmetic import Fundamental
from dragon.vm.torch.ops.modules.arithmetic import (
Fundamental, Log, Exp,
Maximum, Minimum, Clamp,
)
def _fundamental(input, value, op='Add', out=None): def _fundamental(input, value, op='Add', out=None):
if not isinstance(value, Tensor): if not isinstance(value, Tensor):
if not isinstance(value, (int, float)):
raise TypeError('Type of value should be numerical, got {}.'
.format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx) value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value]) ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1]) key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
...@@ -33,17 +34,63 @@ def _fundamental(input, value, op='Add', out=None): ...@@ -33,17 +34,63 @@ def _fundamental(input, value, op='Add', out=None):
def _rfundamental(input, value, op='RAdd', out=None): def _rfundamental(input, value, op='RAdd', out=None):
if not isinstance(value, Tensor): if not isinstance(value, Tensor):
if not isinstance(value, (int, float)):
raise TypeError('Type of value should be numerical, got {}.'
.format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx) value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value]) ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1]) key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
module = get_module(Fundamental, key, ctx, op_type=op) module = get_module(Fundamental, key, ctx, op_type=op)
return module.forward(value, input, out) return module.forward(value, input, out)
def _maximum(input, other, out=None):
    """Dispatch the element-wise ``Maximum`` op; a scalar operand is wrapped."""
    if not isinstance(input, Tensor):
        # Scalar lhs: materialize it on the rhs tensor's device.
        input = WrapScalar(input, 'float32', other._ctx)
        dtype = other._dtype
    else:
        if not isinstance(other, Tensor):
            # Scalar rhs: materialize it on the lhs tensor's device.
            other = WrapScalar(other, 'float32', input._ctx)
        dtype = input._dtype
    ctx = MakeContext(inputs=[input])
    key = 'torch/ops/maximum/{}:{}'.format(ctx[0].lower(), ctx[1])
    module = get_module(Maximum, key, ctx)
    return module.forward(input, other, out, dtype)
def _minimum(input, other, out=None):
    """Dispatch the element-wise ``Minimum`` op; a scalar operand is wrapped."""
    if not isinstance(input, Tensor):
        # Scalar lhs: materialize it on the rhs tensor's device.
        input = WrapScalar(input, 'float32', other._ctx)
        dtype = other._dtype
    else:
        if not isinstance(other, Tensor):
            # Scalar rhs: materialize it on the lhs tensor's device.
            other = WrapScalar(other, 'float32', input._ctx)
        dtype = input._dtype
    ctx = MakeContext(inputs=[input])
    key = 'torch/ops/minimum/{}:{}'.format(ctx[0].lower(), ctx[1])
    module = get_module(Minimum, key, ctx)
    return module.forward(input, other, out, dtype)
def _clamp(input, min=None, max=None, out=None):
    """Dispatch the clamp (Dragon ``Clip``) op over [min, max]."""
    ctx = MakeContext(inputs=[input])
    dev_type, dev_id = ctx[0].lower(), ctx[1]
    # Bounds participate in the cache key so each (min, max) pair
    # gets its own module instance.
    key = 'torch/ops/clamp/{}:{}/min:{}/max:{}'.format(
        dev_type, dev_id, min, max)
    module = get_module(Clamp, key, ctx, min=min, max=max)
    return module.forward(input, out)
def _exp(input, out=None):
    """Dispatch the element-wise ``Exp`` op."""
    ctx = MakeContext(inputs=[input])
    dev_type, dev_id = ctx[0].lower(), ctx[1]
    key = 'torch/ops/exp/{}:{}'.format(dev_type, dev_id)
    module = get_module(Exp, key, ctx)
    return module.forward(input, out)
def _log(input, out=None):
    """Dispatch the element-wise ``Log`` op."""
    ctx = MakeContext(inputs=[input])
    dev_type, dev_id = ctx[0].lower(), ctx[1]
    key = 'torch/ops/log/{}:{}'.format(dev_type, dev_id)
    module = get_module(Log, key, ctx)
    return module.forward(input, out)
def add(input, value, out=None): def add(input, value, out=None):
"""Add the ``input`` and ``value`` into the output tensor. """Add the ``input`` and ``value`` into the output tensor.
...@@ -126,3 +173,106 @@ def div(input, value, out=None): ...@@ -126,3 +173,106 @@ def div(input, value, out=None):
""" """
return _fundamental(input, value, out=out, op='Div') return _fundamental(input, value, out=out, op='Div')
def maximum(input, other, out=None):
    """Compute the element-wise maximum of the two inputs.

    Either operand may be a scalar; it is then wrapped on the
    other operand's device before dispatch.

    Parameters
    ----------
    input : vm.torch.Tensor
        The first input tensor.
    other : vm.torch.Tensor
        The second input tensor.
    out : vm.torch.Tensor or None
        The optional output tensor.

    Returns
    -------
    vm.torch.Tensor
        The output tensor.

    """
    return _maximum(input, other, out=out)
def minimum(input, other, out=None):
    """Compute the element-wise minimum of the two inputs.

    Either operand may be a scalar; it is then wrapped on the
    other operand's device before dispatch.

    Parameters
    ----------
    input : vm.torch.Tensor
        The first input tensor.
    other : vm.torch.Tensor
        The second input tensor.
    out : vm.torch.Tensor or None
        The optional output tensor.

    Returns
    -------
    vm.torch.Tensor
        The output tensor.

    """
    return _minimum(input, other, out=out)
def clamp(input, min=None, max=None, out=None):
    """Clamp all elements of ``input`` into the range [min, max].

    Parameters
    ----------
    input : vm.torch.Tensor
        The input tensor.
    min : numerical or None
        The lower bound, or None to leave it unbounded below.
    max : numerical or None
        The upper bound, or None to leave it unbounded above.
    out : vm.torch.Tensor or None
        The optional output tensor.

    Returns
    -------
    vm.torch.Tensor
        The output tensor.

    """
    return _clamp(input, min=min, max=max, out=out)
def log(input, out=None):
    """Compute the element-wise natural logarithm of ``input``.

    Parameters
    ----------
    input : vm.torch.Tensor
        The input tensor.
    out : vm.torch.Tensor or None
        The optional output tensor.

    Returns
    -------
    vm.torch.Tensor
        The output tensor.

    """
    return _log(input, out=out)
def exp(input, out=None):
    """Compute the element-wise exponential of ``input``.

    Parameters
    ----------
    input : vm.torch.Tensor
        The input tensor.
    out : vm.torch.Tensor or None
        The optional output tensor.

    Returns
    -------
    vm.torch.Tensor
        The output tensor.

    """
    return _exp(input, out=out)
...@@ -21,12 +21,18 @@ from dragon.vm.torch.execute_engine import RunOperator ...@@ -21,12 +21,18 @@ from dragon.vm.torch.execute_engine import RunOperator
from dragon.vm.torch.ops.factory import get_module from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.autograd.grad_mode import no_grad from dragon.vm.torch.autograd.grad_mode import no_grad
from dragon.vm.torch.ops.primitive import MakeContext from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.arithmetic import _fundamental, _rfundamental
from dragon.vm.torch.ops.arithmetic import (
_fundamental, _rfundamental, _log, _exp,
_clamp,
)
from dragon.vm.torch.ops.ndarray import ( from dragon.vm.torch.ops.ndarray import (
reshape, squeeze, unsqueeze, reshape, squeeze, unsqueeze,
_permute, _repeat, _crop, _permute, _repeat, _crop,
_fill, _reduce, _arg_reduce, _fill, _reduce, _arg_reduce,
) )
from dragon.vm.torch.ops.modules.dtype import AsType from dragon.vm.torch.ops.modules.dtype import AsType
...@@ -53,9 +59,14 @@ def copy_(self, src, non_blocking=False): ...@@ -53,9 +59,14 @@ def copy_(self, src, non_blocking=False):
The ``self`` tensor. The ``self`` tensor.
""" """
# Copy memory
FromTensor( FromTensor(
src, CTX_TO_DEVICE_OPTION[tuple(src._ctx)], src, CTX_TO_DEVICE_OPTION[tuple(src._ctx)],
self.name, CTX_TO_DEVICE_OPTION[tuple(self._ctx)]) self.name, CTX_TO_DEVICE_OPTION[tuple(self._ctx)])
self._dtype = src._dtype
# Transfer the static shape if necessary
self._static_shape = src.size() \
if self._static_shape else None
return self return self
...@@ -295,6 +306,76 @@ def rdiv(self, value): ...@@ -295,6 +306,76 @@ def rdiv(self, value):
return _rfundamental(self, value, op='RDiv') return _rfundamental(self, value, op='RDiv')
def clamp(self, min=None, max=None):
    """Return a new tensor with all elements clamped into [min, max].

    Parameters
    ----------
    min : numerical or None
        The lower bound, or None to leave it unbounded below.
    max : numerical or None
        The upper bound, or None to leave it unbounded above.

    Returns
    -------
    vm.torch.Tensor
        The output tensor.

    """
    return _clamp(self, min, max)
def clamp_(self, min=None, max=None):
    """Clamp all elements of this tensor in place into [min, max].

    Parameters
    ----------
    min : numerical or None
        The lower bound, or None to leave it unbounded below.
    max : numerical or None
        The upper bound, or None to leave it unbounded above.

    Returns
    -------
    vm.torch.Tensor
        The output tensor (this tensor, written in place).

    """
    # Pass ``self`` as the output to make the op in-place.
    return _clamp(self, min, max, self)
def log(self):
    """Compute the element-wise natural logarithm of this tensor.

    Returns
    -------
    vm.torch.Tensor
        The log tensor.

    """
    return _log(self)
def exp(self):
    """Compute the element-wise exponential of this tensor.

    Returns
    -------
    vm.torch.Tensor
        The exp tensor.

    """
    return _exp(self)
Tensor.add = add Tensor.add = add
Tensor.add_ = add_ Tensor.add_ = add_
Tensor.__radd__ = radd Tensor.__radd__ = radd
...@@ -308,6 +389,10 @@ Tensor.div = div ...@@ -308,6 +389,10 @@ Tensor.div = div
Tensor.div_ = div_ Tensor.div_ = div_
Tensor.__rdiv__ = rdiv Tensor.__rdiv__ = rdiv
Tensor.__rtruediv__ = rdiv Tensor.__rtruediv__ = rdiv
Tensor.clamp = clamp
Tensor.clamp_ = clamp_
Tensor.log = log
Tensor.exp = exp
############################################## ##############################################
...@@ -387,16 +472,12 @@ def _unsqueeze_(self, dim=None): ...@@ -387,16 +472,12 @@ def _unsqueeze_(self, dim=None):
def view(self, *args): def view(self, *args):
if self._static_shape:
raise RuntimeError('Can not view a leaf variable, it owns the static sizes.')
return reshape(self, shape=args) return reshape(self, shape=args)
def view_as(self, other): def view_as(self, other):
if not isinstance(other, Tensor): if not isinstance(other, Tensor):
raise ValueError('The other should be a torch tensor.') raise ValueError('The other should be a torch tensor.')
if self._static_shape:
raise RuntimeError('Can not view a leaf variable, it owns the static sizes.')
return reshape(self, shape=None, shape_like=other) return reshape(self, shape=None, shape_like=other)
......
...@@ -13,14 +13,20 @@ from __future__ import absolute_import ...@@ -13,14 +13,20 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.vm.torch.ops.primitive import MakeContext, CanonicalAxis
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.tensor import LeafTensor from dragon.vm.torch.tensor import LeafTensor
from dragon.vm.torch.execute_engine import RunOperator from dragon.vm.torch.execute_engine import RunOperator
from dragon.vm.torch.ops.primitive import MakeContext from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.creation import OneHot
__all__= [ __all__= [
'zeros', 'zeros_like', 'ones', 'ones_like', 'zeros', 'zeros_like', 'ones', 'ones_like',
'rand', 'randn', 'one_hot', 'rand', 'randn',
] ]
...@@ -180,3 +186,26 @@ def randn(*sizes, **kwargs): ...@@ -180,3 +186,26 @@ def randn(*sizes, **kwargs):
inputs = []; outputs = [out]; ctx = MakeContext(inputs, outputs) inputs = []; outputs = [out]; ctx = MakeContext(inputs, outputs)
meta = ('ONCE', 'RandomNormal', ctx) meta = ('ONCE', 'RandomNormal', ctx)
return RunOperator(inputs, outputs, meta, **arguments) return RunOperator(inputs, outputs, meta, **arguments)
def one_hot(input, depth):
    """Return a one-hot tensor according to the given input.

    Parameters
    ----------
    input : vm.torch.Tensor
        The input tensor.
    depth : int
        The depth of channels.

    Returns
    -------
    vm.torch.FloatTensor
        The output tensor.

    """
    ctx = MakeContext(inputs=[input])
    # The depth participates in the cache key so each depth gets
    # its own module instance.
    key = 'torch/ops/one_hot/{}:{}/depth:{}'.format(
        ctx[0].lower(), ctx[1], depth)
    module = get_module(OneHot, key, ctx, depth=depth)
    return module.forward(input)
\ No newline at end of file
...@@ -38,3 +38,125 @@ class Fundamental(BaseModule): ...@@ -38,3 +38,125 @@ class Fundamental(BaseModule):
inputs = [x1, x2]; self.unify_devices(inputs) inputs = [x1, x2]; self.unify_devices(inputs)
outputs = [y] if y else [self.register_output(x1.dtype)] outputs = [y] if y else [self.register_output(x1.dtype)]
return self.run(inputs, outputs) return self.run(inputs, outputs)
class Maximum(BaseModule):
    """Module wrapper for the element-wise ``Maximum`` operator."""
    def __init__(self, key, ctx, **kwargs):
        super(Maximum, self).__init__(key, ctx, **kwargs)
        self.register_arguments()
        self.register_op()
    def register_arguments(self):
        """No arguments for maximum op."""
        pass
    def register_op(self):
        # Describe the backing Dragon operator for this module.
        self.op_meta = {
            'op_type': 'Maximum',
            'n_inputs': 2, 'n_outputs': 1,
            'arguments': {}
        }
    def forward(self, x1, x2, y, dtype):
        # Write into ``y`` when provided, otherwise allocate a new
        # output of the caller-chosen ``dtype``.
        inputs = [x1, x2]; self.unify_devices(inputs)
        outputs = [y] if y else [self.register_output(dtype)]
        return self.run(inputs, outputs)
class Minimum(BaseModule):
    """Module wrapper for the element-wise ``Minimum`` operator."""

    def __init__(self, key, ctx, **kwargs):
        super(Minimum, self).__init__(key, ctx, **kwargs)
        self.register_arguments()
        self.register_op()

    def register_arguments(self):
        """No arguments for minimum op."""
        pass

    def register_op(self):
        # Describe the backing Dragon operator for this module.
        self.op_meta = {
            'op_type': 'Minimum',
            'n_inputs': 2, 'n_outputs': 1,
            'arguments': {}
        }

    def forward(self, x1, x2, y, dtype):
        inputs = [x1, x2]
        self.unify_devices(inputs)
        # Write into ``y`` when provided, otherwise allocate a new
        # output of the caller-chosen ``dtype``.
        if y:
            outputs = [y]
        else:
            outputs = [self.register_output(dtype)]
        return self.run(inputs, outputs)
class Clamp(BaseModule):
    """Module wrapper mapping torch ``clamp`` onto Dragon's ``Clip`` op."""

    def __init__(self, key, ctx, **kwargs):
        super(Clamp, self).__init__(key, ctx, **kwargs)
        # Bounds may be None (unbounded); Clip expects float bounds.
        low = kwargs.get('min', None)
        high = kwargs.get('max', None)
        self.min = float(low) if low is not None else None
        self.max = float(high) if high is not None else None
        self.register_arguments()
        self.register_op()

    def register_arguments(self):
        """No arguments for clamp op."""
        pass

    def register_op(self):
        # Describe the backing Dragon operator for this module.
        self.op_meta = {
            'op_type': 'Clip',
            'n_inputs': 1, 'n_outputs': 1,
            'arguments': {
                'low': self.min,
                'high': self.max,
            }
        }

    def forward(self, x, y):
        inputs = [x]
        self.unify_devices(inputs)
        # Write into ``y`` when provided, otherwise allocate a new output.
        if y:
            outputs = [y]
        else:
            outputs = [self.register_output(x.dtype)]
        return self.run(inputs, outputs)
class Log(BaseModule):
    """Module wrapper for the element-wise ``Log`` operator."""

    def __init__(self, key, ctx, **kwargs):
        super(Log, self).__init__(key, ctx, **kwargs)
        self.register_arguments()
        self.register_op()

    def register_arguments(self):
        """No arguments for Log op."""
        pass

    def register_op(self):
        # Describe the backing Dragon operator for this module.
        self.op_meta = {
            'op_type': 'Log',
            'n_inputs': 1, 'n_outputs': 1,
            'arguments': {}
        }

    def forward(self, x, y):
        inputs = [x]
        self.unify_devices(inputs)
        # Write into ``y`` when provided, otherwise allocate a new output.
        if y:
            outputs = [y]
        else:
            outputs = [self.register_output(x.dtype)]
        return self.run(inputs, outputs)
class Exp(BaseModule):
    """Module wrapper for the element-wise ``Exp`` operator."""
    def __init__(self, key, ctx, **kwargs):
        super(Exp, self).__init__(key, ctx, **kwargs)
        self.register_arguments()
        self.register_op()
    def register_arguments(self):
        """No arguments for Exp op."""
        pass
    def register_op(self):
        # Describe the backing Dragon operator for this module.
        self.op_meta = {
            'op_type': 'Exp',
            'n_inputs': 1, 'n_outputs': 1,
            'arguments': {}
        }
    def forward(self, x, y):
        # Write into ``y`` when provided, otherwise allocate a new output.
        inputs = [x]; self.unify_devices(inputs)
        outputs = [y] if y else [self.register_output(x.dtype)]
        return self.run(inputs, outputs)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.modules.base import BaseModule
class OneHot(BaseModule):
    """Module wrapper for the ``OneHot`` operator."""
    def __init__(self, key, ctx, **kwargs):
        super(OneHot, self).__init__(key, ctx, **kwargs)
        # Number of channels in the one-hot dimension.
        self.depth = kwargs.get('depth', 1)
        self.register_arguments()
        self.register_op()
    def register_arguments(self):
        """No arguments for one-hot op."""
        pass
    def register_op(self):
        # Describe the backing Dragon operator for this module.
        self.op_meta = {
            'op_type': 'OneHot',
            'n_inputs': 1, 'n_outputs': 1,
            'arguments': {
                'depth': self.depth,
            }
        }
    def forward(self, x):
        # The output keeps the input's dtype.
        inputs = [x]; self.unify_devices(inputs)
        outputs = [self.register_output(x.dtype)]
        return self.run(inputs, outputs)
\ No newline at end of file
...@@ -22,6 +22,7 @@ class Fill(BaseModule): ...@@ -22,6 +22,7 @@ class Fill(BaseModule):
super(Fill, self).__init__(key, ctx, **kwargs) super(Fill, self).__init__(key, ctx, **kwargs)
self.len_shape = kwargs.get('len_shape', 0) self.len_shape = kwargs.get('len_shape', 0)
self.value = kwargs.get('value', 0.0) self.value = kwargs.get('value', 0.0)
self.dtype = kwargs.get('dtype', 'float32')
self.register_arguments() self.register_arguments()
self.register_op() self.register_op()
...@@ -34,6 +35,7 @@ class Fill(BaseModule): ...@@ -34,6 +35,7 @@ class Fill(BaseModule):
'op_type': 'Fill', 'op_type': 'Fill',
'n_inputs': 0, 'n_outputs': 1, 'n_inputs': 0, 'n_outputs': 1,
'arguments': { 'arguments': {
'dtype': self.dtype,
'value': float(self.value), 'value': float(self.value),
'dims_desc': [d for d in self.shape] if len(self.shape) > 0 else None, 'dims_desc': [d for d in self.shape] if len(self.shape) > 0 else None,
} }
......
...@@ -62,9 +62,10 @@ def _repeat(input, times): ...@@ -62,9 +62,10 @@ def _repeat(input, times):
def _fill(input, shape, value): def _fill(input, shape, value):
ctx = MakeContext(inputs=[input]); len_shape = len(shape) ctx = MakeContext(inputs=[input]); len_shape = len(shape)
key = 'torch/ops/fill/{}:{}/ndims:#{}/value:{}'.format( key = 'torch/ops/fill/{}:{}/dtype:{}/ndims:#{}/value:{}'.format(
ctx[0].lower(), ctx[1], len_shape, value) ctx[0].lower(), ctx[1], input._dtype, len_shape, value)
module = get_module(Fill, key, ctx, len_shape=len_shape, value=value) module = get_module(Fill, key, ctx, len_shape=len_shape,
value=value, dtype=input._dtype)
return module.forward(input, shape) return module.forward(input, shape)
......
...@@ -35,7 +35,7 @@ def _update(param, grad, op_type, slot, ...@@ -35,7 +35,7 @@ def _update(param, grad, op_type, slot,
lr_mult=1.0, decay_mult=1.0): lr_mult=1.0, decay_mult=1.0):
ctx = MakeContext(inputs=[param]) ctx = MakeContext(inputs=[param])
key = 'torch/ops/{}/{}:{}/{}/{}'.format(op_type.lower(), key = 'torch/ops/{}/{}:{}/{}/{}'.format(op_type.lower(),
ctx[0].lower(),ctx[1], slot, param.name) ctx[0].lower(), ctx[1], slot, param.name)
module = get_module(Update, key, ctx, op_type=op_type, module = get_module(Update, key, ctx, op_type=op_type,
lr_mult=lr_mult, decay_mult=decay_mult, slot=slot) lr_mult=lr_mult, decay_mult=decay_mult, slot=slot)
return module.forward(param, grad) return module.forward(param, grad)
\ No newline at end of file
...@@ -72,10 +72,9 @@ class Optimizer(object): ...@@ -72,10 +72,9 @@ class Optimizer(object):
param_temp = group['slot'] + '/{}' param_temp = group['slot'] + '/{}'
for k, v in group.items(): for k, v in group.items():
if k in self._mutable_parameters: if k in self._mutable_parameters:
# convert all defaults as float32 for convenience
dg.workspace.FeedTensor(param_temp.format( dg.workspace.FeedTensor(param_temp.format(
self._mutable_parameters[k]), self._mutable_parameters[k]), v,
np.array([v], dtype=np.float32)) dtype='float32', force_cpu=True)
def _run_update_ops(self, group): def _run_update_ops(self, group):
"""Generate & Run UpdateOps. """Generate & Run UpdateOps.
...@@ -107,10 +106,12 @@ class Optimizer(object): ...@@ -107,10 +106,12 @@ class Optimizer(object):
# Run regular update ops # Run regular update ops
for p, g in zip(params, grads): for p, g in zip(params, grads):
_update(p, g, op_type=self._update_type, _update(p, g,
op_type=self._update_type,
slot=group['slot'], slot=group['slot'],
lr_mult=group.get('lr_mult', 1.0), lr_mult=group.get('lr_mult', 1.0),
decay_mult=group.get('decay_mult', 1.0)) decay_mult=group.get('decay_mult', 1.0)
)
def zero_grad(self): def zero_grad(self):
"""Set all gradients to zeros. """Set all gradients to zeros.
......
...@@ -17,9 +17,8 @@ from __future__ import absolute_import ...@@ -17,9 +17,8 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import os import os, sys, io
import sys from dragon.core.tensor_utils import ToPyArrayEx
import io
if sys.version_info[0] == 2: if sys.version_info[0] == 2:
import cPickle as pickle import cPickle as pickle
...@@ -61,17 +60,27 @@ def _with_file_like(f, mode, body): ...@@ -61,17 +60,27 @@ def _with_file_like(f, mode, body):
f.close() f.close()
def _save(obj, f, pickle_module, pickle_protocol): def _save_dict(obj):
"""Pickle the object into binary file. """Recursively save the dict."""
if not isinstance(obj, dict):
raise ValueError('Currently only the state dict can be saved.')
py_dict = type(obj)()
for k, v in obj.items():
if isinstance(v, dict): py_dict[k] = _save_dict(v)
elif hasattr(v, 'name'): py_dict[k] = ToPyArrayEx(v)
else: py_dict[k] = v
return py_dict
"""
def _save(obj, f, pickle_module, pickle_protocol):
"""Pickle the object into binary file."""
if not isinstance(obj, dict): if not isinstance(obj, dict):
raise ValueError('Currently only the state dict can be saved.') raise ValueError('Currently only the state dict can be saved.')
from collections import OrderedDict py_dict = type(obj)()
from dragon.core.tensor_utils import ToPyArrayEx
py_dict = OrderedDict()
for k, v in obj.items(): for k, v in obj.items():
py_dict[k] = ToPyArrayEx(v) if isinstance(v, dict): py_dict[k] = _save_dict(v)
elif hasattr(v, 'name'): py_dict[k] = ToPyArrayEx(v)
else: py_dict[k] = v
pickle_module.dump(py_dict, f, pickle_protocol) pickle_module.dump(py_dict, f, pickle_protocol)
......
...@@ -13,8 +13,7 @@ from __future__ import absolute_import ...@@ -13,8 +13,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import sys import six
import copy
import numpy as np import numpy as np
import dragon as dg import dragon as dg
import dragon.core.tensor_utils as tensor_utils import dragon.core.tensor_utils as tensor_utils
...@@ -73,12 +72,12 @@ class Tensor(object): ...@@ -73,12 +72,12 @@ class Tensor(object):
self._init_from_numpy(args[0]) self._init_from_numpy(args[0])
else: else:
# + class torch.Tensor(size) # + class torch.Tensor(size)
if not isinstance(args[0], int): if not isinstance(args[0], six.integer_types):
raise ValueError('Excepted integer as size.') raise ValueError('Excepted integer as size.')
self._init_from_shape(args[0]) self._init_from_shape(args[0])
else: else:
# + torch.Tensor(*sizes) # + torch.Tensor(*sizes)
if not all(type(arg) is int for arg in args): if not all(isinstance(arg, six.integer_types) for arg in args):
raise ValueError('Excepted integer(s) as sizes.') raise ValueError('Excepted integer(s) as sizes.')
self._init_from_shape(shape=args) self._init_from_shape(shape=args)
...@@ -90,7 +89,7 @@ class Tensor(object): ...@@ -90,7 +89,7 @@ class Tensor(object):
self._ignored_grads = {self.name + '_grad'} if not self._requires_grad else None self._ignored_grads = {self.name + '_grad'} if not self._requires_grad else None
def _init_from_shape(self, shape): def _init_from_shape(self, shape):
if isinstance(shape, int): shape = [shape] if isinstance(shape, six.integer_types): shape = [shape]
self._static_shape = Size(shape) self._static_shape = Size(shape)
self._dg_tensor = tensor_utils.FromShape(shape, self._dtype, self._dg_tensor = tensor_utils.FromShape(shape, self._dtype,
ctx=CTX_TO_DEVICE_OPTION[tuple(self._ctx)], name=TPool.get('leaf')) ctx=CTX_TO_DEVICE_OPTION[tuple(self._ctx)], name=TPool.get('leaf'))
...@@ -904,6 +903,72 @@ class Tensor(object): ...@@ -904,6 +903,72 @@ class Tensor(object):
""" """
raise NotImplementedError('Refer torch.ops.builtin.div_') raise NotImplementedError('Refer torch.ops.builtin.div_')
def clamp(self, min=None, max=None):
    """Return a tensor that all elements are clamped into the range [min, max].

    Interface stub: the real implementation is bound onto ``Tensor``
    at import time by ``torch.ops.builtin``.

    Parameters
    ----------
    min : numerical or None
        The min value.
    max : numerical or None
        The max value.

    Returns
    -------
    vm.torch.Tensor
        The output tensor.

    """
    raise NotImplementedError('Refer torch.ops.builtin.clamp')
def clamp_(self, min=None, max=None):
    """Clamp all elements into the range [min, max], in place.

    Interface stub: the real implementation is bound onto ``Tensor``
    at import time by ``torch.ops.builtin``.

    Parameters
    ----------
    min : numerical or None
        The min value.
    max : numerical or None
        The max value.

    Returns
    -------
    vm.torch.Tensor
        The output tensor.

    """
    raise NotImplementedError('Refer torch.ops.builtin.clamp_')
def log(self):
    """Compute the natural logarithm of this tensor.

    Interface stub: the real implementation is bound onto ``Tensor``
    at import time by ``torch.ops.builtin``.

    Returns
    -------
    vm.torch.Tensor
        The log tensor.

    """
    raise NotImplementedError('Refer torch.ops.builtin.log')
def exp(self):
    """Compute the exponential of this tensor.

    Interface stub: the real implementation is bound onto ``Tensor``
    at import time by ``torch.ops.builtin``.

    Returns
    -------
    vm.torch.Tensor
        The exp tensor.

    """
    raise NotImplementedError('Refer torch.ops.builtin.exp')
def mean(self, dim=None, keepdim=False): def mean(self, dim=None, keepdim=False):
"""Returns the mean of all elements or elements along the given dim. """Returns the mean of all elements or elements along the given dim.
......
...@@ -42,7 +42,7 @@ find_modules() ...@@ -42,7 +42,7 @@ find_modules()
setup(name = 'dragon', setup(name = 'dragon',
version='0.2.2.11', version='0.2.2.12',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework', description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon', url='https://github.com/seetaresearch/Dragon',
author='Ting Pan', author='Ting Pan',
......
...@@ -3,10 +3,6 @@ ...@@ -3,10 +3,6 @@
namespace dragon { namespace dragon {
#ifdef WITH_CUDA
thread_local CUDAObject CUDAContext::cuda_object_;
#endif // WITH_CUDA
// cpu <- gpu // cpu <- gpu
template<> void CPUContext::Memcpy<CPUContext, CUDAContext>( template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
size_t nbytes, size_t nbytes,
......
...@@ -246,6 +246,9 @@ GraphDef Graph::Share(const GraphDef& optimized_graph) { ...@@ -246,6 +246,9 @@ GraphDef Graph::Share(const GraphDef& optimized_graph) {
*g.mutable_op(i)->mutable_input(j) *g.mutable_op(i)->mutable_input(j)
= renamed_[op.input(j)]; = renamed_[op.input(j)];
} }
// handle handcraft cases
if (op.type() == "BiasAddGradient")
renamed_[op.output(0)] = g.op(i).input(2);
for (int j = 0; j < op.output_size(); j++) { for (int j = 0; j < op.output_size(); j++) {
if (whitelist.count(op.output(j)) == 0 && if (whitelist.count(op.output(j)) == 0 &&
renamed_.count(op.output(j)) && renamed_.count(op.output(j)) &&
...@@ -443,9 +446,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws) ...@@ -443,9 +446,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws)
} }
// store the final graph as a tensor for visualization // store the final graph as a tensor for visualization
Tensor* string_tensor = ws_->CreateTensor("GraphDef_" + optimized_graph.name()); Tensor* graphT = ws_->CreateTensor(
string_tensor->Reshape({ 1 }); "GraphDef_" + optimized_graph.name());
string* data = string_tensor->mutable_data<string, CPUContext>(); graphT->Reshape({ 1 });
auto* data = graphT->mutable_data<string, CPUContext>();
data[0] = optimized_graph.SerializeAsString(); data[0] = optimized_graph.SerializeAsString();
// create // create
...@@ -473,11 +477,22 @@ bool Graph::Run( ...@@ -473,11 +477,22 @@ bool Graph::Run(
return true; return true;
} }
DEFINE_REGISTRY(GraphRegistry, GraphBase, const GraphDef&, Workspace*); GraphBase* NewGraph(
const GraphDef& meta_graph,
Workspace* ws) {
if (!meta_graph.has_graph_type() ||
meta_graph.graph_type().empty())
return new Graph(meta_graph, ws);
GraphBase* NewGraph(const GraphDef& meta_graph, Workspace* ws) { return GraphRegistry()->Create(
if (!meta_graph.has_graph_type()) return new Graph(meta_graph, ws); meta_graph.graph_type(), meta_graph, ws);
return GraphRegistry()->Create(meta_graph.graph_type(), meta_graph, ws);
} }
DEFINE_REGISTRY(
GraphRegistry,
GraphBase,
const GraphDef&,
Workspace*
);
} // namespace dragon } // namespace dragon
\ No newline at end of file
...@@ -4,10 +4,10 @@ ...@@ -4,10 +4,10 @@
namespace dragon { namespace dragon {
#define str dragon_cast<std::string, int> bool GraphGradientMaker::CheckGrad(
const OperatorDef& forward_op,
bool GraphGradientMaker::CheckGrad(const OperatorDef& forward_op, const Set<string>& targets,
const Set<string>& targets, vector< pair<string, int> >& gen_grads) { vector< pair<string, int> >& gen_grads) {
if (NoGradientRegistry()->Has(forward_op.type())) { if (NoGradientRegistry()->Has(forward_op.type())) {
for (auto& input : forward_op.input()) for (auto& input : forward_op.input())
blacklist_set_.insert(input); blacklist_set_.insert(input);
...@@ -41,10 +41,11 @@ bool GraphGradientMaker::CheckGrad(const OperatorDef& forward_op, ...@@ -41,10 +41,11 @@ bool GraphGradientMaker::CheckGrad(const OperatorDef& forward_op,
string GraphGradientMaker::GetOperatorName() { string GraphGradientMaker::GetOperatorName() {
if (op_prefix_.empty()) return "runtime"; if (op_prefix_.empty()) return "runtime";
return op_prefix_ + str(cur_op_idx_++) + op_suffix_; return op_prefix_ + std::to_string(cur_op_idx_++) + op_suffix_;
} }
void GraphGradientMaker::Make(const GraphDef& forward_def, void GraphGradientMaker::Make(
const GraphDef& forward_def,
const vector<string>& targets, const vector<string>& targets,
GraphDef& new_def) { GraphDef& new_def) {
Map<string, int> inputs_count, grads_count; Map<string, int> inputs_count, grads_count;
...@@ -61,9 +62,10 @@ void GraphGradientMaker::Make(const GraphDef& forward_def, ...@@ -61,9 +62,10 @@ void GraphGradientMaker::Make(const GraphDef& forward_def,
} }
} }
for (auto& t : targets) targets_set.insert(t); for (auto& t : targets) targets_set.insert(t);
// PLAY for the backward // PLAY for the backward
for (int i = forward_def.op_size() - 1; i >= 0; i--) { for (int i = forward_def.op_size() - 1; i >= 0; i--) {
// collect inputs & outputs, generate grad // collect inputs & outputs, generate RAW grad ops
const OperatorDef& op = forward_def.op(i); const OperatorDef& op = forward_def.op(i);
vector< pair<string, int> > gen_grads; vector< pair<string, int> > gen_grads;
bool is_skip = CheckGrad(op, targets_set, gen_grads); bool is_skip = CheckGrad(op, targets_set, gen_grads);
...@@ -76,8 +78,9 @@ void GraphGradientMaker::Make(const GraphDef& forward_def, ...@@ -76,8 +78,9 @@ void GraphGradientMaker::Make(const GraphDef& forward_def,
g_outputs.emplace_back(g_output); g_outputs.emplace_back(g_output);
} }
Gradient grad = MakeGradientForOp(op, g_outputs); Gradient grad = MakeGradientForOp(op, g_outputs);
// post-process grad ops
unique_ptr<OperatorDef> gather_op; // process the RAW grad ops
vector<OperatorDef> gather_ops;
for (auto& g_op : grad.ops) { for (auto& g_op : grad.ops) {
// set op name // set op name
g_op.set_name(GetOperatorName()); g_op.set_name(GetOperatorName());
...@@ -112,27 +115,32 @@ void GraphGradientMaker::Make(const GraphDef& forward_def, ...@@ -112,27 +115,32 @@ void GraphGradientMaker::Make(const GraphDef& forward_def,
string original_name = op.input(original_idx); string original_name = op.input(original_idx);
if (inputs_count[original_name] > 1) { if (inputs_count[original_name] > 1) {
// split // split
string split_name = *output + "_autosplit_" + str(grads_count[*output]++); string split_name = *output + "_autosplit_"
+ std::to_string(grads_count[*output]++);
if (!is_skip) all_split_grads.insert(split_name); if (!is_skip) all_split_grads.insert(split_name);
// gather // gather
if (grads_count[*output] == inputs_count[original_name]) { if (grads_count[*output] == inputs_count[original_name]) {
gather_op.reset(new OperatorDef()); OperatorDef gather_op;
gather_op->set_name(GetOperatorName()); gather_op.set_name(GetOperatorName());
gather_op->set_type("GradientGather"); gather_op.set_type("GradientGather");
gather_op->add_output(*output); gather_op.add_output(*output);
if (g_op.has_device_option()) if (g_op.has_device_option())
gather_op->mutable_device_option()->CopyFrom(g_op.device_option()); gather_op.mutable_device_option()
->CopyFrom(g_op.device_option());
for (int j = 0; j < grads_count[*output]; j++) { for (int j = 0; j < grads_count[*output]; j++) {
string key = *output + "_autosplit_" + str(j); string key = *output + "_autosplit_" + std::to_string(j);
if (all_split_grads.count(key)) gather_op->add_input(key); if (all_split_grads.count(key)) gather_op.add_input(key);
} }
gather_ops.emplace_back(gather_op);
} }
*output = split_name; *output = split_name;
} }
} }
} }
// append ops
// now, append the required ops
if (!is_skip) { if (!is_skip) {
// 1) GradientGenerateOp
if (gen_grads.size() > 0) { if (gen_grads.size() > 0) {
vector<string> op_inputs, op_outputs; vector<string> op_inputs, op_outputs;
Argument arg_defaults; arg_defaults.set_name("defaults"); Argument arg_defaults; arg_defaults.set_name("defaults");
...@@ -143,21 +151,24 @@ void GraphGradientMaker::Make(const GraphDef& forward_def, ...@@ -143,21 +151,24 @@ void GraphGradientMaker::Make(const GraphDef& forward_def,
op_outputs.emplace_back(output); op_outputs.emplace_back(output);
arg_defaults.add_floats(grad.defaults[gen_grad.second]); arg_defaults.add_floats(grad.defaults[gen_grad.second]);
} }
OperatorDef generate_op = MakeOperatorDef("GradientGenerate", OperatorDef generate_op = MakeOperatorDef(
GetOperatorName(), "GradientGenerate", GetOperatorName(),
op_inputs, op_inputs, op_outputs,
op_outputs,
vector<Argument>(1, arg_defaults)); vector<Argument>(1, arg_defaults));
if (op.has_device_option()) if (op.has_device_option())
generate_op.mutable_device_option()->CopyFrom(op.device_option()); generate_op.mutable_device_option()
->CopyFrom(op.device_option());
new_def.add_op()->CopyFrom(generate_op); new_def.add_op()->CopyFrom(generate_op);
} }
for (auto& g_op : grad.ops) { // 2) GradientOp
for (auto& g_op : grad.ops)
new_def.add_op()->CopyFrom(g_op); new_def.add_op()->CopyFrom(g_op);
} }
} // 3) GradientGatherOp
if (gather_op) new_def.add_op()->CopyFrom(*gather_op); for (auto& gather_op : gather_ops)
// done new_def.add_op()->CopyFrom(gather_op);
// done!
if (!is_skip) { if (!is_skip) {
for (int i = 0; i < op.input_size(); i++) { for (int i = 0; i < op.input_size(); i++) {
if (!grad.g_inputs[i].empty()) if (!grad.g_inputs[i].empty())
...@@ -189,7 +200,9 @@ void GraphGradientMaker::Make(const GraphDef& forward_def, ...@@ -189,7 +200,9 @@ void GraphGradientMaker::Make(const GraphDef& forward_def,
} \ } \
*op->mutable_output(ix) = temp_grad;} *op->mutable_output(ix) = temp_grad;}
void GraphGradientMaker::Share(const string& grads_prefix, GraphDef& graph) { void GraphGradientMaker::Share(
const string& grads_prefix,
GraphDef& graph) {
Map<string, int> ref_count; Map<string, int> ref_count;
// count the refs for detecting leaf nodes // count the refs for detecting leaf nodes
for (auto& op : graph.op()) { for (auto& op : graph.op()) {
...@@ -205,8 +218,7 @@ void GraphGradientMaker::Share(const string& grads_prefix, GraphDef& graph) { ...@@ -205,8 +218,7 @@ void GraphGradientMaker::Share(const string& grads_prefix, GraphDef& graph) {
Map<string, string> temporary_grads; Map<string, string> temporary_grads;
std::deque<string> grads_pool; std::deque<string> grads_pool;
for (int i = 0; i < TEMPORARY_GRADS_LIMITS; i++) for (int i = 0; i < TEMPORARY_GRADS_LIMITS; i++)
grads_pool.push_back(grads_prefix + ":" + grads_pool.push_back(grads_prefix + ":" + std::to_string(i));
dragon_cast<string, int>(i));
for (int i = 0; i < graph.op_size(); i++) { for (int i = 0; i < graph.op_size(); i++) {
OperatorDef* op = graph.mutable_op(i); OperatorDef* op = graph.mutable_op(i);
......
...@@ -31,12 +31,14 @@ void MixedMemory::ToCUDA() { ...@@ -31,12 +31,14 @@ void MixedMemory::ToCUDA() {
switch (state_) { switch (state_) {
case UNINITIALIZED: case UNINITIALIZED:
cuda_ptr_ = CUDAContext::New(nbytes_); cuda_ptr_ = CUDAContext::New(nbytes_);
ptr_device_ = CUDA_GET_DEVICE();
state_ = STATE_AT_CUDA; state_ = STATE_AT_CUDA;
break; break;
case STATE_AT_CPU: case STATE_AT_CPU:
if (cuda_ptr_ == nullptr) if (cuda_ptr_ == nullptr) {
cuda_ptr_ = CUDAContext::New(nbytes_); cuda_ptr_ = CUDAContext::New(nbytes_);
CUDAContext::Memcpy<CUDAContext, CPUContext>( ptr_device_ = CUDA_GET_DEVICE();
} CUDAContext::Memcpy<CUDAContext, CPUContext>(
nbytes_, cuda_ptr_, cpu_ptr_); nbytes_, cuda_ptr_, cpu_ptr_);
state_ = SYNCED; state_ = SYNCED;
break; break;
...@@ -66,6 +68,10 @@ const void* MixedMemory::cuda_data() { ...@@ -66,6 +68,10 @@ const void* MixedMemory::cuda_data() {
return (const void*)cuda_ptr_; return (const void*)cuda_ptr_;
} }
const void* MixedMemory::cnml_data() {
return (const void*)cnml_ptr_;
}
void* MixedMemory::mutable_cpu_data() { void* MixedMemory::mutable_cpu_data() {
ToCPU(); ToCPU();
state_ = STATE_AT_CPU; state_ = STATE_AT_CPU;
...@@ -78,6 +84,11 @@ void* MixedMemory::mutable_cuda_data() { ...@@ -78,6 +84,11 @@ void* MixedMemory::mutable_cuda_data() {
return cuda_ptr_; return cuda_ptr_;
} }
void* MixedMemory::mutable_cnml_data() {
state_ = STATE_AT_CNML;
return cnml_ptr_;
}
void MixedMemory::set_cpu_data(void* cpu_ptr, size_t nbytes) { void MixedMemory::set_cpu_data(void* cpu_ptr, size_t nbytes) {
bool use_cudahost_mem = false; bool use_cudahost_mem = false;
#ifdef WITH_CUDA_HOST_MEM #ifdef WITH_CUDA_HOST_MEM
...@@ -123,9 +134,11 @@ MixedMemory::~MixedMemory() { ...@@ -123,9 +134,11 @@ MixedMemory::~MixedMemory() {
void MixedMemory::SwitchToDevice() { void MixedMemory::SwitchToDevice() {
if (cuda_ptr_) { if (cuda_ptr_) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
int ptr_device = CUDA_DEVICE(cuda_ptr_); int cur_device = CUDA_GET_DEVICE();
int cur_device = CUDA_DEVICE(); if (cur_device != ptr_device_) {
if (ptr_device != cur_device) state_ = SWITCHED; state_ = SWITCHED;
ptr_device_ = cur_device;
}
#endif #endif
} }
} }
...@@ -134,12 +147,12 @@ void MixedMemory::SwitchToCUDADevice(int device_id) { ...@@ -134,12 +147,12 @@ void MixedMemory::SwitchToCUDADevice(int device_id) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
DeviceGuard gurad(device_id); DeviceGuard gurad(device_id);
if (cuda_ptr_) { if (cuda_ptr_) {
int ptr_device = CUDA_DEVICE(cuda_ptr_); if (device_id != ptr_device_) {
if (ptr_device != device_id) state_ = SWITCHED; state_ = SWITCHED;
ptr_device_ = device_id;
}
} }
ToCUDA(); ToCUDA();
#else
CUDA_NOT_COMPILED;
#endif #endif
} }
...@@ -148,6 +161,7 @@ const Map<string, string> MixedMemory::info() const { ...@@ -148,6 +161,7 @@ const Map<string, string> MixedMemory::info() const {
{ UNINITIALIZED, "UNINITIALIZED" }, { UNINITIALIZED, "UNINITIALIZED" },
{ STATE_AT_CPU, "CPU" }, { STATE_AT_CPU, "CPU" },
{ STATE_AT_CUDA, "CUDA" }, { STATE_AT_CUDA, "CUDA" },
{ STATE_AT_CNML, "CNML" },
{ SYNCED, "DEVICE" }, { SYNCED, "DEVICE" },
{ SWITCHED, "DEVICE" }, { SWITCHED, "DEVICE" },
}; };
...@@ -155,15 +169,14 @@ const Map<string, string> MixedMemory::info() const { ...@@ -155,15 +169,14 @@ const Map<string, string> MixedMemory::info() const {
string _state_ = STATE_TO_STRING[state_]; string _state_ = STATE_TO_STRING[state_];
if (_state_ == "DEVICE") { if (_state_ == "DEVICE") {
if (cuda_ptr_) _state_ = "CUDA"; if (cuda_ptr_) _state_ = "CUDA";
else if (cnml_ptr_) _state_ = "CNML";
else LOG(FATAL) << "Device activated, " else LOG(FATAL) << "Device activated, "
<< "but got invalid mem pointer."; << "but got invalid mem pointer.";
} }
s2s["mem_at"] = _state_; s2s["mem_at"] = _state_;
if (cpu_ptr_) s2s["CPU"] = "0"; if (cpu_ptr_) s2s["CPU"] = "0";
#ifdef WITH_CUDA if (cuda_ptr_) s2s["CUDA"] = std::to_string(ptr_device_);
if (cuda_ptr_) s2s["CUDA"] = else if (cnml_ptr_) s2s["CNML"] = std::to_string(ptr_device_);
dragon_cast<string, int>(CUDA_DEVICE(cuda_ptr_));
#endif
return s2s; return s2s;
} }
......
...@@ -5,7 +5,8 @@ ...@@ -5,7 +5,8 @@
namespace dragon { namespace dragon {
OperatorBase::OperatorBase( OperatorBase::OperatorBase(
const OperatorDef& def, Workspace* ws) const OperatorDef& def,
Workspace* ws)
: def_(def), ws_(ws), anchor_(def.name()) { : def_(def), ws_(ws), anchor_(def.name()) {
for (auto& arg : def_.arg()) { for (auto& arg : def_.arg()) {
CHECK_GT(arg.name().size(), 0); CHECK_GT(arg.name().size(), 0);
...@@ -73,6 +74,8 @@ OperatorBase* TryCreateOperator( ...@@ -73,6 +74,8 @@ OperatorBase* TryCreateOperator(
CUDNNOperatorRegistry()->Has(key)) CUDNNOperatorRegistry()->Has(key))
return CUDNNOperatorRegistry()->Create(key, def, ws); return CUDNNOperatorRegistry()->Create(key, def, ws);
return CUDAOperatorRegistry()->Create(key, def, ws); return CUDAOperatorRegistry()->Create(key, def, ws);
case CNML:
return CNMLOperatorRegistry()->Create(key, def, ws);
default: default:
LOG(FATAL) << "Unknown device type: " LOG(FATAL) << "Unknown device type: "
<< def.device_option().device_type(); << def.device_option().device_type();
...@@ -198,7 +201,8 @@ void Operator<Context>::ElimateCorruption() { ...@@ -198,7 +201,8 @@ void Operator<Context>::ElimateCorruption() {
int idx = safe_heads.front(); int idx = safe_heads.front();
safe_heads.pop(); safe_heads.pop();
Tensor* buffer = ws()->GetTensor( Tensor* buffer = ws()->GetTensor(
"/opt/mirror_stage/buffer_" + dragon_cast<string, int>(idx)); "/opt/mirror_stage/buffer_"
+ std::to_string(idx));
Output(i)->Move(buffer->memory()); Output(i)->Move(buffer->memory());
head_data[idx] = Output(i)->name(); head_data[idx] = Output(i)->name();
} }
...@@ -220,8 +224,8 @@ void Operator<Context>::CleanResource() { ...@@ -220,8 +224,8 @@ void Operator<Context>::CleanResource() {
for (int i = 0; i < OutputSize(); i++) { for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->is_corrupted() && if (Output(i)->is_corrupted() &&
head_to_idx.count(Output(i)->name())) { head_to_idx.count(Output(i)->name())) {
string used = "/opt/mirror_stage/buffer_" + string used = "/opt/mirror_stage/buffer_"
dragon_cast<string, int>(head_to_idx[Output(i)->name()]); + std::to_string(head_to_idx[Output(i)->name()]);
Tensor* buffer = ws()->GetTensor(used); Tensor* buffer = ws()->GetTensor(used);
if (Output(i)->memory() != buffer->memory()) if (Output(i)->memory() != buffer->memory())
buffer->Move(Output(i)->memory()); buffer->Move(Output(i)->memory());
...@@ -248,6 +252,12 @@ DEFINE_REGISTRY( ...@@ -248,6 +252,12 @@ DEFINE_REGISTRY(
Workspace*); Workspace*);
DEFINE_REGISTRY( DEFINE_REGISTRY(
CNMLOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
DEFINE_REGISTRY(
GradientRegistry, GradientRegistry,
GradientMakerBase, GradientMakerBase,
const OperatorDef&, const OperatorDef&,
...@@ -291,9 +301,12 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings) ...@@ -291,9 +301,12 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings)
template void Operator<CPUContext>::ElimateCorruption(); template void Operator<CPUContext>::ElimateCorruption();
template void Operator<CUDAContext>::ElimateCorruption(); template void Operator<CUDAContext>::ElimateCorruption();
template void Operator<CNMLContext>::ElimateCorruption();
template void Operator<CPUContext>::MakeResource(); template void Operator<CPUContext>::MakeResource();
template void Operator<CUDAContext>::MakeResource(); template void Operator<CUDAContext>::MakeResource();
template void Operator<CNMLContext>::MakeResource();
template void Operator<CPUContext>::CleanResource(); template void Operator<CPUContext>::CleanResource();
template void Operator<CUDAContext>::CleanResource(); template void Operator<CUDAContext>::CleanResource();
template void Operator<CNMLContext>::CleanResource();
} // namespace dragon } // namespace dragon
\ No newline at end of file
...@@ -16,7 +16,8 @@ GraphBase* Workspace::CreateGraph(const GraphDef& meta_graph) { ...@@ -16,7 +16,8 @@ GraphBase* Workspace::CreateGraph(const GraphDef& meta_graph) {
Workspace::~Workspace() { Workspace::~Workspace() {
for (int i = 0; i < WORKSPACE_MAX_CORRUPTED_SIZE; i++) { for (int i = 0; i < WORKSPACE_MAX_CORRUPTED_SIZE; i++) {
string name = "/opt/mirror_stage/buffer_" + dragon_cast<string, int>(i); string name = "/opt/mirror_stage/buffer_"
+ std::to_string(i);
if (tensor_map_.count(name) > 0) { if (tensor_map_.count(name) > 0) {
MixedMemory* mem = tensor_map_[name]->memory(); MixedMemory* mem = tensor_map_[name]->memory();
if (mem != nullptr) delete mem; if (mem != nullptr) delete mem;
......
...@@ -32,8 +32,8 @@ void CuDNNDropoutOp<Context>::RunWithType() { ...@@ -32,8 +32,8 @@ void CuDNNDropoutOp<Context>::RunWithType() {
ctx()->cudnn_handle(), &states_size)); ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex()); std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor( Tensor* states = ws()->CreateTensor(
"/share/cudnn/dropout:" + dragon_cast<string, "/share/cudnn/dropout:" + std::to_string(
unsigned long long>(random_seed) + "/states"); random_seed) + "/states");
if (states->count() > 0) { if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
...@@ -67,9 +67,7 @@ void CuDNNDropoutOp<Context>::RunOnDevice() { ...@@ -67,9 +67,7 @@ void CuDNNDropoutOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -89,14 +87,16 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() { ...@@ -89,14 +87,16 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
ctx()->cudnn_handle(), &states_size)); ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex()); std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor( Tensor* states = ws()->CreateTensor(
"/share/cudnn/dropout:" + dragon_cast<string, "/share/cudnn/dropout:" + std::to_string(
unsigned long long>(random_seed) + "/states"); random_seed) + "/states");
if (states->count() > 0) { if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx()->cudnn_handle(), prob(), dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed)); Sdata, states_size, random_seed));
} else { LOG(FATAL) << "Missing states with seed: " << random_seed; } } else {
LOG(FATAL) << "Missing states with seed: " << random_seed;
}
} }
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
...@@ -119,9 +119,7 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() { ...@@ -119,9 +119,7 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -24,9 +24,7 @@ void CuDNNEluOp<Context>::RunOnDevice() { ...@@ -24,9 +24,7 @@ void CuDNNEluOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -52,9 +50,7 @@ void CuDNNEluGradientOp<Context>::RunOnDevice() { ...@@ -52,9 +50,7 @@ void CuDNNEluGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -31,9 +31,7 @@ void CuDNNReluOp<Context>::RunOnDevice() { ...@@ -31,9 +31,7 @@ void CuDNNReluOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -69,9 +67,7 @@ void CuDNNReluGradientOp<Context>::RunOnDevice() { ...@@ -69,9 +67,7 @@ void CuDNNReluGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -29,9 +29,7 @@ void CuDNNSigmoidOp<Context>::RunOnDevice() { ...@@ -29,9 +29,7 @@ void CuDNNSigmoidOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -65,9 +63,7 @@ void CuDNNSigmoidGradientOp<Context>::RunOnDevice() { ...@@ -65,9 +63,7 @@ void CuDNNSigmoidGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -14,8 +14,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() { ...@@ -14,8 +14,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxForward( CUDNN_CHECK(cudnnSoftmaxForward(ctx()->cudnn_handle(),
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Xdata, CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata)); CUDNNType<T>::zero, output_desc, Ydata));
...@@ -23,15 +22,13 @@ void CuDNNSoftmaxOp<Context>::RunWithType() { ...@@ -23,15 +22,13 @@ void CuDNNSoftmaxOp<Context>::RunWithType() {
template <class Context> template <class Context>
void CuDNNSoftmaxOp<Context>::RunOnDevice() { void CuDNNSoftmaxOp<Context>::RunOnDevice() {
if (axis == -1) axis = (int)Input(0).ndim() - 1; if (axis == -1) axis = (TIndex)Input(0).ndim() - 1;
outer_dim = Input(0).count(0, axis); outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1); inner_dim = Input(0).count(axis + 1);
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -47,8 +44,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() { ...@@ -47,8 +44,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* Ydata = Input(0).template data<T, Context>(); auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxBackward( CUDNN_CHECK(cudnnSoftmaxBackward(ctx()->cudnn_handle(),
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata, CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata,
CUDNNType<T>::zero, output_desc, dXdata)); CUDNNType<T>::zero, output_desc, dXdata));
...@@ -56,15 +52,13 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() { ...@@ -56,15 +52,13 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
template <class Context> template <class Context>
void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() { void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() {
if (axis == -1) axis = (int)Input(0).ndim() - 1; if (axis == -1) axis = (TIndex)Input(0).ndim() - 1;
outer_dim = Input(0).count(0, axis); outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1); inner_dim = Input(0).count(axis + 1);
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -29,9 +29,7 @@ void CuDNNTanhOp<Context>::RunOnDevice() { ...@@ -29,9 +29,7 @@ void CuDNNTanhOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -65,9 +63,7 @@ void CuDNNTanhGradientOp<Context>::RunOnDevice() { ...@@ -65,9 +63,7 @@ void CuDNNTanhGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -8,22 +8,31 @@ template <class Context> template <typename T> ...@@ -8,22 +8,31 @@ template <class Context> template <typename T>
void DropoutOp<Context>::RunWithType() { void DropoutOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0; float scale = use_scale ? 1.f / (1.f - prob()) : 1.f;
if (phase() == "TEST") { if (phase() == "TEST") {
if (Output(0) != &Input(0)) { if (Output(0) != &Input(0)) {
ctx()->template Copy<T, Context, Context>( ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata); Output(0)->count(), Ydata, Xdata);
if (scale == 1.0) math::Scal<T, Context>( }
Output(0)->count(), 1.0 - prob(), Ydata, ctx()); if (!use_scale) {
math::Scal<T, Context>(Output(0)->count(),
1.0 - prob(), Ydata, ctx());
} }
} else if (phase() == "TRAIN") { } else if (phase() == "TRAIN") {
Tensor* mask = ws()->CreateTensor( Tensor* mask = ws()->CreateTensor(
"/mnt/" + anchor() + "/dropout/mask"); "/mnt/" + anchor() + "/dropout/mask");
mask->ReshapeLike(Input(0)); mask->ReshapeLike(Input(0));
uint32_t* Mdata = mask->template mutable_data<uint32_t, Context>();
auto WSdata = ws()->template caches<Context>({
mask->count() * sizeof(uint32_t) });
auto* Mdata = mask->template mutable_data<uint8_t, Context>();
kernel::Dropout<T, Context>( kernel::Dropout<T, Context>(
Output(0)->count(), prob(), scale, Output(0)->count(), prob(), scale,
Xdata, Mdata, Ydata, ctx()); Xdata, (uint32_t*)WSdata[0],
Mdata, Ydata, ctx());
} else LOG(FATAL) << "Incorrect Op phase: " << phase(); } else LOG(FATAL) << "Incorrect Op phase: " << phase();
} }
...@@ -32,6 +41,7 @@ void DropoutOp<Context>::RunOnDevice() { ...@@ -32,6 +41,7 @@ void DropoutOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
} }
...@@ -39,22 +49,25 @@ DEPLOY_CPU(Dropout); ...@@ -39,22 +49,25 @@ DEPLOY_CPU(Dropout);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Dropout); DEPLOY_CUDA(Dropout);
#endif #endif
OPERATOR_SCHEMA(Dropout).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(Dropout)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void DropoutGradientOp<Context>::RunWithType() { void DropoutGradientOp<Context>::RunWithType() {
mask = ws()->GetTensor("/mnt/" + anchor() + "/dropout/mask"); auto* mask = ws()->GetTensor(
"/mnt/" + anchor() + "/dropout/mask");
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template data<uint32_t, Context>(); auto* Mdata = mask->template data<uint8_t, Context>();
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
float scale = use_scale ? 1.f / (1.f - prob()) : 1.f;
if (phase() == "TEST") { NOT_IMPLEMENTED; } if (phase() == "TEST") { NOT_IMPLEMENTED; }
else if (phase() == "TRAIN") { else if (phase() == "TRAIN") {
kernel::DropoutGrad<T, Context>( kernel::ApplyMask<T, uint8_t, Context>(mask->count(),
Output(0)->count(), prob(), scale, scale, dYdata, Mdata, dXdata, ctx());
dYdata, Mdata, dXdata, ctx());
ctx()->FinishDeviceCompution();
mask->Reset();
} else LOG(FATAL) << "Incorrect Op phase: " << phase(); } else LOG(FATAL) << "Incorrect Op phase: " << phase();
} }
...@@ -63,14 +76,17 @@ void DropoutGradientOp<Context>::RunOnDevice() { ...@@ -63,14 +76,17 @@ void DropoutGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" }); else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
DEPLOY_CPU(DropoutGradient); DEPLOY_CPU(DropoutGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(DropoutGradient); DEPLOY_CUDA(DropoutGradient);
#endif #endif
OPERATOR_SCHEMA(DropoutGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } }); OPERATOR_SCHEMA(DropoutGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetDropoutGradient final : public GradientMakerBase { class GetDropoutGradient final : public GradientMakerBase {
public: public:
......
...@@ -148,7 +148,9 @@ DEPLOY_CPU(AddGradient); ...@@ -148,7 +148,9 @@ DEPLOY_CPU(AddGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(AddGradient); DEPLOY_CUDA(AddGradient);
#endif #endif
OPERATOR_SCHEMA(AddGradient).NumInputs(1).NumOutputs(2); OPERATOR_SCHEMA(AddGradient)
.NumInputs(1).NumOutputs(2)
.Inplace({ { 0, 0 } });
class GetAddGradient : public GradientMakerBase { class GetAddGradient : public GradientMakerBase {
public: public:
......
...@@ -7,15 +7,11 @@ namespace dragon { ...@@ -7,15 +7,11 @@ namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void ClipOp<Context>::RunWithType() { void ClipOp<Context>::RunWithType() {
Tensor* mask = ws()->CreateTensor(
"/mnt/" + anchor() + "/clip/mask");
mask->ReshapeLike(Input(0));
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template mutable_data<T, Context>();
kernel::Clip<T, Context>(Output(0)->count(), kernel::Clip<T, Context>(Output(0)->count(),
low, high, Xdata, Mdata, Ydata, ctx()); low, high, Xdata, Ydata, ctx());
} }
template <class Context> template <class Context>
...@@ -30,19 +26,16 @@ DEPLOY_CPU(Clip); ...@@ -30,19 +26,16 @@ DEPLOY_CPU(Clip);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Clip); DEPLOY_CUDA(Clip);
#endif #endif
OPERATOR_SCHEMA(Clip) OPERATOR_SCHEMA(Clip).NumInputs(1).NumOutputs(1);
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void ClipGradientOp<Context>::RunWithType() { void ClipGradientOp<Context>::RunWithType() {
Tensor* mask = ws()->GetTensor( auto* Xdata = Input(0).template data<T, Context>();
"/mnt/" + anchor() + "/clip/mask"); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), kernel::ClipGrad<T, Context>(Output(0)->count(),
dXdata, Mdata, dXdata, ctx()); low, high, Xdata, dYdata, dXdata, ctx());
} }
template <class Context> template <class Context>
...@@ -57,16 +50,14 @@ DEPLOY_CPU(ClipGradient); ...@@ -57,16 +50,14 @@ DEPLOY_CPU(ClipGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ClipGradient); DEPLOY_CUDA(ClipGradient);
#endif #endif
OPERATOR_SCHEMA(ClipGradient) OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1);
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetClipGradient final : public GradientMakerBase { class GetClipGradient final : public GradientMakerBase {
public: public:
GRADIENT_MAKER_CTOR(GetClipGradient); GRADIENT_MAKER_CTOR(GetClipGradient);
vector<OperatorDef> MakeDefs() override { vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "", return SingleDef(def.type() + "Gradient", "",
vector<string> {O(0), GO(0)}, vector<string> {I(0), GO(0)},
vector<string> {GI(0)}); vector<string> {GI(0)});
} }
}; };
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#include "utils/math_functions.h" #include "utils/math_functions.h"
#include "operators/arithmetic/affine_op.h" #include "operators/arithmetic/affine_op.h"
#if CUDNN_VERSION_MIN(6, 0, 0)
namespace dragon { namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -48,9 +50,7 @@ void CuDNNAffineOp<Context>::RunOnDevice() { ...@@ -48,9 +50,7 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -213,4 +213,6 @@ DEPLOY_CUDNN(AffineGradient); ...@@ -213,4 +213,6 @@ DEPLOY_CUDNN(AffineGradient);
} // namespace dragon } // namespace dragon
#endif
#endif // WITH_CUDNN #endif // WITH_CUDNN
\ No newline at end of file
...@@ -146,6 +146,7 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -146,6 +146,7 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
CHECK(dy != dx1) << "\nCan't set inplace if X2 was broadcast.";
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
...@@ -185,7 +186,9 @@ DEPLOY_CPU(DivGradient); ...@@ -185,7 +186,9 @@ DEPLOY_CPU(DivGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(DivGradient); DEPLOY_CUDA(DivGradient);
#endif #endif
OPERATOR_SCHEMA(DivGradient).NumInputs(3).NumOutputs(2); OPERATOR_SCHEMA(DivGradient)
.NumInputs(3).NumOutputs(2)
.Inplace({ { 2, 0 } });
class GetDivGradient final : public GradientMakerBase { class GetDivGradient final : public GradientMakerBase {
public: public:
......
...@@ -29,7 +29,8 @@ void LogGradientOp<Context>::RunWithType() { ...@@ -29,7 +29,8 @@ void LogGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>(); auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>(); auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx()); math::Div<T, Context>(Output(0)->count(),
dYdata, Xdata, dXdata, ctx());
} }
template <class Context> template <class Context>
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/arithmetic/maximum_op.h"
namespace dragon {
template <class Context> template <typename T>
void MaximumOp<Context>::EltwiseRunWithType() {
    /*! Element-wise maximum of two same-sized inputs: Y = max(X1, X2). */
    auto* a = Input(0).template data<T, Context>();
    auto* b = Input(1).template data<T, Context>();
    auto* y = Output(0)->template mutable_data<T, Context>();
    kernel::MaximumE<T, Context>(
        Output(0)->count(), a, b, y, ctx());
}
template <class Context> template <typename T>
void MaximumOp<Context>::BroadcastRunWithType() {
    /*! Broadcast path: one of the two inputs must be a single-element
     *  (scalar) tensor; Y = max(X, scalar) over the other input. */
    T max_val; float x2_val; const T* Xdata; T* Ydata;
    if (Input(0).count() == 1) {
        // Input(0) is the scalar; output takes the shape of Input(1).
        Output(0)->ReshapeLike(Input(1));
        // NOTE(review): the scalar is read as float32 on the CPU regardless
        // of T — presumably the scalar tensor is always fp32; verify.
        x2_val = Input(0).template data<float, CPUContext>()[0];
        max_val = dragon_cast<T, float>(x2_val);
        Xdata = Input(1).template data<T, Context>();
        Ydata = Output(0)->template mutable_data<T, Context>();
    } else if (Input(1).count() == 1) {
        // Input(1) is the scalar; output takes the shape of Input(0).
        Output(0)->ReshapeLike(Input(0));
        x2_val = Input(1).template data<float, CPUContext>()[0];
        max_val = dragon_cast<T, float>(x2_val);
        Xdata = Input(0).template data<T, Context>();
        Ydata = Output(0)->template mutable_data<T, Context>();
    } else { LOG(FATAL) << "Either Input(0) or Input(1) should be a scalar."; }
    kernel::MaximumB<T, Context>(Output(0)->count(),
        Xdata, max_val, Ydata, ctx());
}
template <class Context>
void MaximumOp<Context>::RunOnDevice() {
    /*! Identical shapes take the element-wise path; otherwise one input
     *  must be a scalar and the broadcast path reshapes the output itself. */
    const bool eltwise = Input(0).dims() == Input(1).dims();
    if (eltwise) Output(0)->ReshapeLike(Input(0));
    if (!XIsType(Input(0), float)) {
        LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
    } else if (eltwise) {
        EltwiseRunWithType<float>();
    } else {
        BroadcastRunWithType<float>();
    }
}
// Register the CPU (and, when built with CUDA, the GPU) implementations.
DEPLOY_CPU(Maximum);
#ifdef WITH_CUDA
DEPLOY_CUDA(Maximum);
#endif
// Maximum: two inputs (X1, X2), one output (Y).
OPERATOR_SCHEMA(Maximum).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T>
void MaximumGradientOp<Context>::EltwiseRunWithType() {
    /*! Element-wise backward: the kernel distributes dY between dX1 and
     *  dX2 based on which input supplied each output element. */
    auto* a = Input(0).template data<T, Context>();
    auto* b = Input(1).template data<T, Context>();
    auto* dy = Input(-1).template data<T, Context>();
    auto* da = Output(0)->template mutable_data<T, Context>();
    auto* db = Output(1)->template mutable_data<T, Context>();
    kernel::MaximumEGrad<T, Context>(
        Output(0)->count(), a, b, dy, da, db, ctx());
}
template <class Context> template <typename T>
void MaximumGradientOp<Context>::BroadcastRunWithType() {
    /*! Broadcast backward: one input is a scalar. The non-scalar input's
     *  gradient is computed by the kernel; the scalar's gradient is zeroed. */
    T max_val; float x2_val;
    const T* Xdata; T* dX1data; float* dX2data;
    auto* dYdata = Input(-1).template data<T, Context>();
    if (Input(0).count() == 1) {
        // Input(0) is the scalar: dX1 -> Output(1), dX2 (scalar) -> Output(0).
        // NOTE(review): scalar is read as float32 on CPU regardless of T,
        // and its grad buffer is float* — assumes the scalar is fp32; verify.
        x2_val = Input(0).template data<float, CPUContext>()[0];
        max_val = dragon_cast<T, float>(x2_val);
        Xdata = Input(1).template data<T, Context>();
        dX1data = Output(1)->template mutable_data<T, Context>();
        dX2data = Output(0)->template mutable_data<float, Context>();
        kernel::MaximumBGrad<T, Context>(Output(1)->count(),
            Xdata, max_val, dYdata, dX1data, ctx());
    } else if (Input(1).count() == 1) {
        // Input(1) is the scalar: dX1 -> Output(0), dX2 (scalar) -> Output(1).
        x2_val = Input(1).template data<float, CPUContext>()[0];
        max_val = dragon_cast<T, float>(x2_val);
        Xdata = Input(0).template data<T, Context>();
        dX1data = Output(0)->template mutable_data<T, Context>();
        dX2data = Output(1)->template mutable_data<float, Context>();
        kernel::MaximumBGrad<T, Context>(Output(0)->count(),
            Xdata, max_val, dYdata, dX1data, ctx());
    } else { LOG(FATAL) << "Either Input(0) or Input(1) should be a scalar."; }
    // we simply zero the grad of scalar
    math::Set<float, Context>(1, 0, dX2data, ctx());
}
template <class Context>
void MaximumGradientOp<Context>::RunOnDevice() {
    /*! Gradients always mirror the shapes of the forward inputs. */
    Output(0)->ReshapeLike(Input(0));
    Output(1)->ReshapeLike(Input(1));
    if (!XIsType(Input(0), float)) {
        LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
    } else if (Input(0).dims() == Input(1).dims()) {
        EltwiseRunWithType<float>();
    } else {
        BroadcastRunWithType<float>();
    }
}
// Register the CPU (and, when built with CUDA, the GPU) implementations.
DEPLOY_CPU(MaximumGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(MaximumGradient);
#endif
// MaximumGradient: inputs (X1, X2, dY), outputs (dX1, dX2).
OPERATOR_SCHEMA(MaximumGradient).NumInputs(3).NumOutputs(2);
/*! Gradient maker: the backward op consumes both forward inputs plus dY
 *  and produces gradients for both inputs. */
class GetMaximumGradient final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetMaximumGradient);
    vector<OperatorDef> MakeDefs() override {
        const vector<string> grad_inputs({ I(0), I(1), GO(0) });
        const vector<string> grad_outputs({ GI(0), GI(1) });
        return SingleDef(def.type() + "Gradient", "",
            grad_inputs, grad_outputs);
    }
};

REGISTER_GRADIENT(Maximum, GetMaximumGradient);
} // namespace dragon
\ No newline at end of file
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/arithmetic/minimum_op.h"
namespace dragon {
template <class Context> template <typename T>
void MinimumOp<Context>::EltwiseRunWithType() {
    /*! Element-wise minimum of two same-sized inputs: Y = min(X1, X2). */
    auto* a = Input(0).template data<T, Context>();
    auto* b = Input(1).template data<T, Context>();
    auto* y = Output(0)->template mutable_data<T, Context>();
    kernel::MinimumE<T, Context>(
        Output(0)->count(), a, b, y, ctx());
}
template <class Context> template <typename T>
void MinimumOp<Context>::BroadcastRunWithType() {
    /*! Broadcast path: one of the two inputs must be a single-element
     *  (scalar) tensor; Y = min(X, scalar) over the other input. */
    T min_val; float x2_val; const T* Xdata; T* Ydata;
    if (Input(0).count() == 1) {
        // Input(0) is the scalar; output takes the shape of Input(1).
        Output(0)->ReshapeLike(Input(1));
        // NOTE(review): the scalar is read as float32 on the CPU regardless
        // of T — presumably the scalar tensor is always fp32; verify.
        x2_val = Input(0).template data<float, CPUContext>()[0];
        min_val = dragon_cast<T, float>(x2_val);
        Xdata = Input(1).template data<T, Context>();
        Ydata = Output(0)->template mutable_data<T, Context>();
    } else if (Input(1).count() == 1) {
        // Input(1) is the scalar; output takes the shape of Input(0).
        Output(0)->ReshapeLike(Input(0));
        x2_val = Input(1).template data<float, CPUContext>()[0];
        min_val = dragon_cast<T, float>(x2_val);
        Xdata = Input(0).template data<T, Context>();
        Ydata = Output(0)->template mutable_data<T, Context>();
    } else { LOG(FATAL) << "Either Input(0) or Input(1) should be a scalar."; }
    kernel::MinimumB<T, Context>(Output(0)->count(),
        Xdata, min_val, Ydata, ctx());
}
template <class Context>
void MinimumOp<Context>::RunOnDevice() {
    /*! Identical shapes take the element-wise path; otherwise one input
     *  must be a scalar and the broadcast path reshapes the output itself. */
    const bool eltwise = Input(0).dims() == Input(1).dims();
    if (eltwise) Output(0)->ReshapeLike(Input(0));
    if (!XIsType(Input(0), float)) {
        LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
    } else if (eltwise) {
        EltwiseRunWithType<float>();
    } else {
        BroadcastRunWithType<float>();
    }
}
// Register the CPU (and, when built with CUDA, the GPU) implementations.
DEPLOY_CPU(Minimum);
#ifdef WITH_CUDA
DEPLOY_CUDA(Minimum);
#endif
// Minimum: two inputs (X1, X2), one output (Y).
OPERATOR_SCHEMA(Minimum).NumInputs(2).NumOutputs(1);
template <class Context> template <typename T>
void MinimumGradientOp<Context>::EltwiseRunWithType() {
    /*! Element-wise backward: the kernel distributes dY between dX1 and
     *  dX2 based on which input supplied each output element. */
    auto* a = Input(0).template data<T, Context>();
    auto* b = Input(1).template data<T, Context>();
    auto* dy = Input(-1).template data<T, Context>();
    auto* da = Output(0)->template mutable_data<T, Context>();
    auto* db = Output(1)->template mutable_data<T, Context>();
    kernel::MinimumEGrad<T, Context>(
        Output(0)->count(), a, b, dy, da, db, ctx());
}
template <class Context> template <typename T>
void MinimumGradientOp<Context>::BroadcastRunWithType() {
    /*! Broadcast backward: one input is a scalar. The non-scalar input's
     *  gradient is computed by the kernel; the scalar's gradient is zeroed. */
    T min_val; float x2_val;
    const T* Xdata; T* dX1data; float* dX2data;
    auto* dYdata = Input(-1).template data<T, Context>();
    if (Input(0).count() == 1) {
        // Input(0) is the scalar: dX1 -> Output(1), dX2 (scalar) -> Output(0).
        // NOTE(review): scalar is read as float32 on CPU regardless of T,
        // and its grad buffer is float* — assumes the scalar is fp32; verify.
        x2_val = Input(0).template data<float, CPUContext>()[0];
        min_val = dragon_cast<T, float>(x2_val);
        Xdata = Input(1).template data<T, Context>();
        dX1data = Output(1)->template mutable_data<T, Context>();
        dX2data = Output(0)->template mutable_data<float, Context>();
        kernel::MinimumBGrad<T, Context>(Output(1)->count(),
            Xdata, min_val, dYdata, dX1data, ctx());
    } else if (Input(1).count() == 1) {
        // Input(1) is the scalar: dX1 -> Output(0), dX2 (scalar) -> Output(1).
        x2_val = Input(1).template data<float, CPUContext>()[0];
        min_val = dragon_cast<T, float>(x2_val);
        Xdata = Input(0).template data<T, Context>();
        dX1data = Output(0)->template mutable_data<T, Context>();
        dX2data = Output(1)->template mutable_data<float, Context>();
        kernel::MinimumBGrad<T, Context>(Output(0)->count(),
            Xdata, min_val, dYdata, dX1data, ctx());
    } else { LOG(FATAL) << "Either Input(0) or Input(1) should be a scalar."; }
    // we simply zero the grad of scalar
    math::Set<float, Context>(1, 0, dX2data, ctx());
}
template <class Context>
void MinimumGradientOp<Context>::RunOnDevice() {
    /*! Gradients always mirror the shapes of the forward inputs. */
    Output(0)->ReshapeLike(Input(0));
    Output(1)->ReshapeLike(Input(1));
    if (!XIsType(Input(0), float)) {
        LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
    } else if (Input(0).dims() == Input(1).dims()) {
        EltwiseRunWithType<float>();
    } else {
        BroadcastRunWithType<float>();
    }
}
// Register the CPU (and, when built with CUDA, the GPU) implementations.
DEPLOY_CPU(MinimumGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(MinimumGradient);
#endif
// MinimumGradient: inputs (X1, X2, dY), outputs (dX1, dX2).
OPERATOR_SCHEMA(MinimumGradient).NumInputs(3).NumOutputs(2);
/*! Gradient maker: the backward op consumes both forward inputs plus dY
 *  and produces gradients for both inputs. */
class GetMinimumGradient final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetMinimumGradient);
    vector<OperatorDef> MakeDefs() override {
        const vector<string> grad_inputs({ I(0), I(1), GO(0) });
        const vector<string> grad_outputs({ GI(0), GI(1) });
        return SingleDef(def.type() + "Gradient", "",
            grad_inputs, grad_outputs);
    }
};

REGISTER_GRADIENT(Minimum, GetMinimumGradient);
} // namespace dragon
\ No newline at end of file
...@@ -134,6 +134,7 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) { ...@@ -134,6 +134,7 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
if (Output(0)->name() != "ignore") { if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>(); auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>(); auto* dx1 = Output(0)->template mutable_data<T, Context>();
CHECK(dy != dx1) << "\nCan't set inplace if X2 was broadcast.";
if (type == 0 || type == 1) { if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim); DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>( math::Gemm<T, Context>(
...@@ -173,7 +174,9 @@ DEPLOY_CPU(MulGradient); ...@@ -173,7 +174,9 @@ DEPLOY_CPU(MulGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(MulGradient); DEPLOY_CUDA(MulGradient);
#endif #endif
OPERATOR_SCHEMA(MulGradient).NumInputs(3).NumOutputs(2); OPERATOR_SCHEMA(MulGradient)
.NumInputs(3).NumOutputs(2)
.Inplace({ { 2, 0 } });
class GetMulGradient : public GradientMakerBase { class GetMulGradient : public GradientMakerBase {
public: public:
......
...@@ -15,6 +15,7 @@ void PowOp<Context>::RunWithType() { ...@@ -15,6 +15,7 @@ void PowOp<Context>::RunWithType() {
dragon_cast<T, float>(value), Ydata, ctx()); dragon_cast<T, float>(value), Ydata, ctx());
return; return;
} }
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
ctx()->template Copy<T, Context, Context>(count, Ydata, Xdata); ctx()->template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != 1.f) math::Scal<T, Context>(count, scale, Ydata, ctx()); if (scale != 1.f) math::Scal<T, Context>(count, scale, Ydata, ctx());
......
...@@ -150,7 +150,9 @@ DEPLOY_CPU(SubGradient); ...@@ -150,7 +150,9 @@ DEPLOY_CPU(SubGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(SubGradient); DEPLOY_CUDA(SubGradient);
#endif #endif
OPERATOR_SCHEMA(SubGradient).NumInputs(1).NumOutputs(2); OPERATOR_SCHEMA(SubGradient)
.NumInputs(1).NumOutputs(2)
.Inplace({ { 0, 0 } });
class GetSubGradient : public GradientMakerBase { class GetSubGradient : public GradientMakerBase {
public: public:
......
...@@ -7,8 +7,6 @@ ...@@ -7,8 +7,6 @@
#include "operators/control_flow/scan_op.h" #include "operators/control_flow/scan_op.h"
#include "operators/ndarray/slice_op.h" #include "operators/ndarray/slice_op.h"
#define str dragon_cast<string, int>
namespace dragon { namespace dragon {
template <class Context> template <class Context>
...@@ -29,14 +27,14 @@ void ScanOp<Context>::InitTemplate() { ...@@ -29,14 +27,14 @@ void ScanOp<Context>::InitTemplate() {
for (int i = 0; i < nseqs; i++) { for (int i = 0; i < nseqs; i++) {
OperatorDef* op = template_def.add_op(); OperatorDef* op = template_def.add_op();
op->CopyFrom(slice_def); op->CopyFrom(slice_def);
op->set_name(name() + "(BodyOp." + str(i) + ")"); op->set_name(name() + "(BodyOp." + std::to_string(i) + ")");
op->add_input(Input(i).name()); op->add_input(Input(i).name());
terms[Input(i).name()] = Input(i).name() + "@1"; terms[Input(i).name()] = Input(i).name() + "@1";
} }
for (int i = 0; i < nrepeats; i++) { for (int i = 0; i < nrepeats; i++) {
OperatorDef* op = template_def.add_op(); OperatorDef* op = template_def.add_op();
op->CopyFrom(func_def.op(i)); op->CopyFrom(func_def.op(i));
op->set_name(name() + "(BodyOp." + str(i + nseqs) + ")@1"); op->set_name(name() + "(BodyOp." + std::to_string(i + nseqs) + ")@1");
// replace inputs term // replace inputs term
for (int j = 0; j < op->input_size(); j++) { for (int j = 0; j < op->input_size(); j++) {
string* input = op->mutable_input(j); string* input = op->mutable_input(j);
...@@ -61,8 +59,8 @@ void ScanOp<Context>::UpdateTerms(int cur_step) { ...@@ -61,8 +59,8 @@ void ScanOp<Context>::UpdateTerms(int cur_step) {
string prev, now; string prev, now;
// update sequences term // update sequences term
for (int i = 0; i < nseqs; i++) { for (int i = 0; i < nseqs; i++) {
prev = Input(i).name() + "@" + str(cur_step - 1); prev = Input(i).name() + "@" + std::to_string(cur_step - 1);
now = Input(i).name() + "@" + str(cur_step); now = Input(i).name() + "@" + std::to_string(cur_step);
terms[prev] = now; terms[prev] = now;
} }
if (cur_step < 3) return; if (cur_step < 3) return;
...@@ -70,8 +68,8 @@ void ScanOp<Context>::UpdateTerms(int cur_step) { ...@@ -70,8 +68,8 @@ void ScanOp<Context>::UpdateTerms(int cur_step) {
// only support the latest one-step (as Theano's done) // only support the latest one-step (as Theano's done)
for (int i = 0; i < nout; i++) { for (int i = 0; i < nout; i++) {
if (default_outputs[i].empty()) continue; if (default_outputs[i].empty()) continue;
prev = Output(i)->name() + "@" + str(cur_step - 2); prev = Output(i)->name() + "@" + std::to_string(cur_step - 2);
now = Output(i)->name() + "@" + str(cur_step - 1); now = Output(i)->name() + "@" + std::to_string(cur_step - 1);
terms[prev] = now; terms[prev] = now;
} }
} }
...@@ -90,7 +88,7 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -90,7 +88,7 @@ void ScanOp<Context>::UnrollTemplate() {
if (graphs.count(nsteps)) return; if (graphs.count(nsteps)) return;
new_def.CopyFrom(template_def); new_def.CopyFrom(template_def);
new_def.set_name(name() + "(ScanLen." + str(nsteps) + ")"); new_def.set_name(name() + "(ScanLen." + std::to_string(nsteps) + ")");
Argument phase; phase.set_name("phase"); Argument phase; phase.set_name("phase");
phase.set_s(this->phase()); new_def.add_arg()->CopyFrom(phase); phase.set_s(this->phase()); new_def.add_arg()->CopyFrom(phase);
for (int idx = 0; idx < nseqs; idx++) { for (int idx = 0; idx < nseqs; idx++) {
...@@ -100,7 +98,7 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -100,7 +98,7 @@ void ScanOp<Context>::UnrollTemplate() {
op->mutable_arg(1)->set_i(nslices); op->mutable_arg(1)->set_i(nslices);
// add slices as outputs // add slices as outputs
for (int t = 1; t <= nslices; t++) { for (int t = 1; t <= nslices; t++) {
string slice = op->input(0) + "@" + str(t); string slice = op->input(0) + "@" + std::to_string(t);
op->add_output(slice); op->add_output(slice);
} }
} }
...@@ -111,7 +109,8 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -111,7 +109,8 @@ void ScanOp<Context>::UnrollTemplate() {
for (int idx = copy_l; idx < copy_r; idx++) { for (int idx = copy_l; idx < copy_r; idx++) {
OperatorDef* op = new_def.add_op(); OperatorDef* op = new_def.add_op();
op->CopyFrom(new_def.op(idx)); op->CopyFrom(new_def.op(idx));
op->set_name(SplitString(op->name(), "@")[0] + "@" + str(t)); op->set_name(str::split(op->name(), "@")[0]
+ "@" + std::to_string(t));
// replace inputs // replace inputs
for (int j = 0; j < op->input_size(); j++) { for (int j = 0; j < op->input_size(); j++) {
string* input = op->mutable_input(j); string* input = op->mutable_input(j);
...@@ -120,18 +119,19 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -120,18 +119,19 @@ void ScanOp<Context>::UnrollTemplate() {
// replace outputs // replace outputs
for (int j = 0; j < op->output_size(); j++) { for (int j = 0; j < op->output_size(); j++) {
string* output = op->mutable_output(j); string* output = op->mutable_output(j);
terms[*output] = SplitString(*output, "@")[0] + "@" + str(t); terms[*output] = str::split(*output, "@")[0]
+ "@" + std::to_string(t);
*output = terms[*output]; *output = terms[*output];
} }
} }
} }
for (int i = 0; i < nout; i++) { for (int i = 0; i < nout; i++) {
// solve the last step only // solve the last step only
new_def.add_target(func_def.target(i) + "@" + str(nsteps)); new_def.add_target(func_def.target(i) + "@" + std::to_string(nsteps));
// concat all steps if necessary // concat all steps if necessary
if (Output(i)->name() == "ignore") continue; if (Output(i)->name() == "ignore") continue;
OperatorDef* op = new_def.add_op(); OperatorDef* op = new_def.add_op();
op->set_name(name() + "(BodyOp." + str(nseqs + nrepeats + i) + ")"); op->set_name(name() + "(BodyOp." + std::to_string(nseqs + nrepeats + i) + ")");
op->set_type("Concat"); op->set_type("Concat");
Argument arg_axis, arg_nin; Argument arg_axis, arg_nin;
arg_axis.set_name("axis"); arg_axis.set_i(axis); arg_axis.set_name("axis"); arg_axis.set_i(axis);
...@@ -139,7 +139,7 @@ void ScanOp<Context>::UnrollTemplate() { ...@@ -139,7 +139,7 @@ void ScanOp<Context>::UnrollTemplate() {
op->add_arg()->CopyFrom(arg_axis); op->add_arg()->CopyFrom(arg_axis);
op->add_arg()->CopyFrom(arg_nin); op->add_arg()->CopyFrom(arg_nin);
for (int t = 1; t <= nsteps; t++) for (int t = 1; t <= nsteps; t++)
op->add_input(Output(i)->name() + "@" + str(t)); op->add_input(Output(i)->name() + "@" + std::to_string(t));
op->add_output(Output(i)->name()); op->add_output(Output(i)->name());
// solve all the all steps // solve all the all steps
new_def.add_target(Output(i)->name()); new_def.add_target(Output(i)->name());
...@@ -195,7 +195,7 @@ void ScanGradientOp<Context>::MakeOps(const GraphDef& forward_def, ...@@ -195,7 +195,7 @@ void ScanGradientOp<Context>::MakeOps(const GraphDef& forward_def,
maker.Make(forward_def, targets, new_def); maker.Make(forward_def, targets, new_def);
// post-process // post-process
new_def.set_name(name() + "(ScanLen." + str(nsteps) + ")"); new_def.set_name(name() + "(ScanLen." + std::to_string(nsteps) + ")");
for (auto& target : targets) { for (auto& target : targets) {
for (int i = 0; i < OutputSize(); i++) { for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->name() == "ignore") continue; if (Output(i)->name() == "ignore") continue;
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/loss/nll_loss_op.h"
namespace dragon {
template <class Context> template <typename Tx, typename Ty>
void NLLLossOp<Context>::RunWithType() {
    /*! Compute per-element NLL losses, then reduce them according to the
     *  "normalization" mode. Tx: prediction dtype, Ty: target dtype. */
    // LPdata: predictions (presumably log-probabilities — TODO confirm
    // against kernel::NLLLoss); Tdata: target class indices.
    auto* LPdata = Input(0).template data<Tx, Context>();
    auto* Tdata = Input(1).template data<Ty, Context>();
    // Idata: optional ignored-label values; nullptr when none are set.
    auto* Idata = !ignores.count() ? nullptr :
        ignores.template data<int, Context>();
    auto* Ldata = losses.template mutable_data<float, Context>();
    auto* Fdata = flags.template mutable_data<float, Context>();
    kernel::NLLLoss<Tx, Ty, Context>(
        outer_dim, Input(0).dim(axis), inner_dim,
            LPdata, Tdata, Idata, ignores.count(),
                Ldata, Fdata, ctx());
    if (normalization == "UNIT") {
        // Unreduced mode: return per-unit losses with the class axis removed.
        vector<TIndex> output_dims = Input(0).dims();
        output_dims.erase(output_dims.begin() + axis);
        Output(0)->Reshape(output_dims);
        Output(0)->template CopyFrom<Context>(losses, ctx());
        return;
    }
    float normalizer = 1;
    if (normalization == "VALID") {
        // Average over the non-ignored elements (clamped to at least one).
        normalizer = std::max(
            math::ASum<float, Context>(
                flags.count(), Fdata), 1.f);
    } else if (normalization == "BATCH_SIZE") {
        normalizer = Input(0).dim(0);
    } else if (normalization == "FULL") {
        normalizer = outer_dim * inner_dim;
    }
    // Reduced modes emit a single scalar: sum(losses) / normalizer.
    float loss = math::ASum<float, Context>(losses.count(), Ldata);
    Output(0)->Reshape({ 1 });
    auto* Ydata = Output(0)->template mutable_data<float, Context>();
    math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void NLLLossOp<Context>::RunOnDevice() {
    ctx()->set_stream_id(0);  // enforce default stream
    // Treat Input(0) as [outer_dim, C, inner_dim] with C = dim(axis);
    // Input(1) supplies one label per (outer, inner) position.
    outer_dim = Input(0).count(0, axis);
    inner_dim = Input(0).count(axis + 1);
    CHECK_EQ(outer_dim * inner_dim, Input(1).count())
        << "\nNumber of predictions must match the number of labels.";
    // Scratch tensors: one loss value and one valid-flag per prediction.
    losses.Reshape({ outer_dim * inner_dim });
    flags.Reshape({ outer_dim * inner_dim });
    // Dispatch on (prediction dtype, target dtype).
    if (XIsType(Input(0), float)) {
        if (XIsType(Input(1), float)) RunWithType<float, float>();
        else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
        else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
    } else if (XIsType(Input(0), float16)) {
        if (XIsType(Input(1), float)) RunWithType<float16, float>();
        else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
        else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
// Register the CPU (and, when built with CUDA, the GPU) implementations.
DEPLOY_CPU(NLLLoss);
#ifdef WITH_CUDA
DEPLOY_CUDA(NLLLoss);
#endif
// NLLLoss: inputs (Predictions, Targets), one output (Loss).
OPERATOR_SCHEMA(NLLLoss).NumInputs(2).NumOutputs(1);
template <class Context> template <typename Tx, typename Ty>
void NLLLossGradientOp<Context>::RunWithType() {
    /*! Backward pass: fill dX from the targets, then scale it according
     *  to the "normalization" mode used in the forward pass. */
    auto* LPdata = Input(0).template data<Tx, Context>();
    auto* Tdata = Input(1).template data<Ty, Context>();
    // Idata: optional ignored-label values; nullptr when none are set.
    auto* Idata = !ignores.count() ? nullptr :
        ignores.template data<int, Context>();
    auto* dXdata = Output(0)->template mutable_data<Tx, Context>();
    auto* Fdata = flags.template mutable_data<float, Context>();
    // Zero dX first: the kernel only writes at the target positions.
    math::Set<Tx, Context>(Output(0)->count(),
        dragon_cast<Tx, float>(0.) , dXdata, ctx());
    kernel::NLLLossGrad<Tx, Ty, Context>(
        outer_dim, Output(0)->dim(axis), inner_dim,
            LPdata, Tdata, Idata, ignores.count(),
                dXdata, Fdata, ctx());
    if (normalization == "UNIT") {
        // Unreduced mode: dY has one value per unit. Broadcast each dY
        // value over the class axis (SumGrad), convert float -> Tx, then
        // scale dX element-wise. Two workspace caches hold the temporaries.
        auto* dYdata = Input(-1).template data<float, Context>();
        vector<void*> WSdata = ws()->template caches<Context>(
            { Input(0).count() * sizeof(float),
                    Input(0).count() * sizeof(Tx) });
        kernel::SumGrad<float, Context>(
            Input(0).count() / Input(0).dim(axis),
                Input(0).dim(axis), inner_dim,
                    1.0, dYdata, (float*)WSdata[0], ctx());
        kernel::TypeA2B<float, Tx, Context>(Input(0).count(),
            (const float*)WSdata[0], (Tx*)WSdata[1], ctx());
        math::Mul<Tx, Context>(Output(0)->count(),
            (Tx*)WSdata[1], dXdata, dXdata, ctx());
        return;
    }
    float normalizer = 1;
    if (normalization == "VALID") {
        // Match the forward pass: average over non-ignored elements.
        normalizer = std::max(
            math::ASum<float, Context>(
                flags.count(), Fdata), 1.f);
    } else if (normalization == "BATCH_SIZE") {
        normalizer = Input(0).dim(0);
    } else if (normalization == "FULL") {
        normalizer = outer_dim * inner_dim;
    }
    // Reduced modes: dY is a scalar; fetch it to the host and scale dX.
    auto* dYdata = Input(-1).template data<float, Context>();
    float dYdata_host; ctx()->template Copy<float, CPUContext, Context>(
        1, &dYdata_host, dYdata);
    math::Scal<Tx, Context>(Output(0)->count(),
        dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void NLLLossGradientOp<Context>::RunOnDevice() {
    ctx()->set_stream_id(0);  // enforce default stream
    // Same [outer_dim, C, inner_dim] factorization as the forward op.
    outer_dim = Input(0).count(0, axis);
    inner_dim = Input(0).count(axis + 1);
    Output(0)->ReshapeLike(Input(0));
    // Valid-flags scratch, one entry per prediction.
    flags.Reshape({ outer_dim * inner_dim });
    // Dispatch on (prediction dtype, target dtype).
    if (XIsType(Input(0), float)) {
        if (XIsType(Input(1), float)) RunWithType<float, float>();
        else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
        else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
    } else if (XIsType(Input(0), float16)) {
        if (XIsType(Input(1), float)) RunWithType<float16, float>();
        else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
        else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
    } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
// Register the CPU (and, when built with CUDA, the GPU) implementations.
DEPLOY_CPU(NLLLossGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(NLLLossGradient);
#endif
// NLLLossGradient: inputs (Predictions, Targets, dY), one output (dX).
OPERATOR_SCHEMA(NLLLossGradient).NumInputs(3).NumOutputs(1);
/*! Gradient maker: the backward op consumes both forward inputs plus dY
 *  and produces the gradient of the predictions only. */
class GetNLLLossGradient final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GetNLLLossGradient);
    vector<OperatorDef> MakeDefs() override {
        const vector<string> grad_inputs({ I(0), I(1), GO(0) });
        const vector<string> grad_outputs({ GI(0) });
        return SingleDef(def.type() + "Gradient", "",
            grad_inputs, grad_outputs);
    }
};

REGISTER_GRADIENT(NLLLoss, GetNLLLossGradient);
} // namespace dragon
\ No newline at end of file
...@@ -18,7 +18,9 @@ void SigmoidFocalLossOp<Context>::RunWithType() { ...@@ -18,7 +18,9 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
Xdata, Tdata, Ldata, Fdata, ctx()); Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); vector<TIndex> output_dims = Input(0).dims();
output_dims.erase(output_dims.begin() + axis);
Output(0)->Reshape(output_dims);
Output(0)->template CopyFrom<Context>(losses, ctx()); Output(0)->template CopyFrom<Context>(losses, ctx());
return; return;
} }
......
...@@ -43,9 +43,9 @@ template <class Context> ...@@ -43,9 +43,9 @@ template <class Context>
void SmoothL1LossOp<Context>::RunOnDevice() { void SmoothL1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream ctx()->set_stream_id(0); // enforce default stream
CHECK(Input(0).dims() == Input(1).dims()); CHECK(Input(0).count() == Input(1).count());
if (InputSize() > 2) CHECK(Input(0).dims() == Input(2).dims()); if (InputSize() > 2) CHECK(Input(0).count() == Input(2).count());
if (InputSize() > 3) CHECK(Input(0).dims() == Input(3).dims()); if (InputSize() > 3) CHECK(Input(0).count() == Input(3).count());
Output(0)->Reshape({ 1 }); Output(0)->Reshape({ 1 });
diff = ws()->CreateTensor("/mnt/" + anchor() + "/smoothl1_loss/diff"); diff = ws()->CreateTensor("/mnt/" + anchor() + "/smoothl1_loss/diff");
......
...@@ -36,7 +36,9 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() { ...@@ -36,7 +36,9 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
Ldata, Fdata, ctx()); Ldata, Fdata, ctx());
if (normalization == "UNIT") { if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses); vector<TIndex> output_dims = Input(0).dims();
output_dims.erase(output_dims.begin() + axis);
Output(0)->Reshape(output_dims);
Output(0)->template CopyFrom<Context>(losses, ctx()); Output(0)->template CopyFrom<Context>(losses, ctx());
return; return;
} }
......
...@@ -28,6 +28,36 @@ void InitializeOp<Context>::RunOnDevice() { ...@@ -28,6 +28,36 @@ void InitializeOp<Context>::RunOnDevice() {
RunWithType<float>(); RunWithType<float>();
} }
template <class Context> template <typename T>
void FillOp<Context>::RunWithType() {
    /*! Fill the entire output with the scalar "value", cast to T. */
    const auto count = Output(0)->count();
    auto* Ydata = Output(0)->template mutable_data<T, Context>();
    math::Set<T, Context>(
        count, dragon_cast<T, float>(value), Ydata, ctx());
}
template <class Context>
void FillOp<Context>::RunOnDevice() {
    /*! Resolve the output shape — either from the dimension arguments or
     *  from an int32 shape tensor — then fill it per the "dtype" argument. */
    vector<TIndex> output_shape;
    if (shape_desc.empty()) {
        // determine the shape from dimensions
        int ndims = (int)std::max(dims_value.size(), dims_desc.size());
        for (int i = 0; i < ndims; i++) output_shape.push_back(dims(i));
    } else {
        // determine the shape from the given shape tensor
        Tensor* shape = ws()->GetTensor(shape_desc);
        CHECK(shape->IsType<int>()) << "\nThe type of shape should be int32.";
        auto* shape_data = shape->template data<int, CPUContext>();
        for (int i = 0; i < shape->count(); i++)
            output_shape.push_back(shape_data[i]);
    }
    Output(0)->Reshape(output_shape);
    if (dtype == "float32") RunWithType<float>();
    // BUGFIX: this branch previously compared against "float32" again,
    // making the float16 path unreachable (it always fell through to float).
    else if (dtype == "float16") RunWithType<float16>();
    else if (dtype == "int32") RunWithType<int>();
    else if (dtype == "int64") RunWithType<int64_t>();
    else LOG(FATAL) << DTypeHelper(dtype,
        { "float32", "float16", "int32", "int64" });
}
// constant // constant
DEPLOY_CPU(Fill); DEPLOY_CPU(Fill);
#ifdef WITH_CUDA #ifdef WITH_CUDA
......
...@@ -3,14 +3,6 @@ ...@@ -3,14 +3,6 @@
namespace dragon { namespace dragon {
string DimString(const vector<TIndex>& shape) {
std::stringstream ss;
ss << "(";
for (int i = 0; i < shape.size() - 1; i++) ss << shape[i] << ",";
ss << shape[shape.size() - 1] << ")";
return ss.str();
}
template <class Context> template <class Context>
void ReshapeOp<Context>::RunOnDevice() { void ReshapeOp<Context>::RunOnDevice() {
if (shape_desc.size() > 0 || shape_value.size() > 0) { if (shape_desc.size() > 0 || shape_value.size() > 0) {
...@@ -56,7 +48,7 @@ void ReshapeOp<Context>::RunOnDevice() { ...@@ -56,7 +48,7 @@ void ReshapeOp<Context>::RunOnDevice() {
CHECK_EQ(Input(0).count() % total_count, 0) CHECK_EQ(Input(0).count() % total_count, 0)
<< "\nCan not change the total size: " << "\nCan not change the total size: "
<< Input(0).DimString() << Input(0).DimString()
<< " -> " << DimString(new_shape); << " -> " << Tensor::DimString(new_shape);
new_shape[i] = Input(0).count() / total_count; new_shape[i] = Input(0).count() / total_count;
total_count *= new_shape[i]; total_count *= new_shape[i];
break; break;
...@@ -66,7 +58,7 @@ void ReshapeOp<Context>::RunOnDevice() { ...@@ -66,7 +58,7 @@ void ReshapeOp<Context>::RunOnDevice() {
CHECK_EQ(total_count, Input(0).count()) CHECK_EQ(total_count, Input(0).count())
<< "\nCan not change the total size." << "\nCan not change the total size."
<< Input(0).DimString() << Input(0).DimString()
<< " -> " << DimString(new_shape); << " -> " << Tensor::DimString(new_shape);
Output(0)->Reshape(new_shape); Output(0)->Reshape(new_shape);
Output(0)->SetMeta(Input(0).meta()); Output(0)->SetMeta(Input(0).meta());
Output(0)->Share(Input(0).memory()); Output(0)->Share(Input(0).memory());
......
...@@ -123,14 +123,9 @@ template <class Context> ...@@ -123,14 +123,9 @@ template <class Context>
void CuDNNBatchNormOp<Context>::RunOnDevice() { void CuDNNBatchNormOp<Context>::RunOnDevice() {
Setup(); Setup();
#ifdef WITH_CUDA_FP16
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
#else
if (XIsType(Input(0), float)) RunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
#endif
} }
REGISTER_CUDNN_OPERATOR( REGISTER_CUDNN_OPERATOR(
...@@ -317,7 +312,6 @@ template <class Context> ...@@ -317,7 +312,6 @@ template <class Context>
void CuDNNBatchNormGradientOp<Context>::RunOnDevice() { void CuDNNBatchNormGradientOp<Context>::RunOnDevice() {
Setup(); Setup();
#ifdef WITH_CUDA_FP16
if (XIsType(Input(0), float)) { if (XIsType(Input(0), float)) {
if (this->use_global_stats) InferenceRunWithType<float>(); if (this->use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>(); else TrainingRunWithType<float>();
...@@ -327,12 +321,6 @@ void CuDNNBatchNormGradientOp<Context>::RunOnDevice() { ...@@ -327,12 +321,6 @@ void CuDNNBatchNormGradientOp<Context>::RunOnDevice() {
LOG(FATAL) << DTypeHelper(Input(0), { "float32" }); LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
} else TrainingRunWithType<float16>(); } else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
#else
if (XIsType(Input(0), float)) {
if (this->use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
#endif
} }
REGISTER_CUDNN_OPERATOR( REGISTER_CUDNN_OPERATOR(
......
...@@ -25,8 +25,8 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() { ...@@ -25,8 +25,8 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
CUDNN_CHECK(cudnnDropoutGetStatesSize( CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx()->cudnn_handle(), &states_size)); ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex()); std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" + Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:"
dragon_cast<string, unsigned long long>(random_seed) + "/states"); + std::to_string(random_seed) + "/states");
if (states->count() > 0) { if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>(); auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
...@@ -160,9 +160,7 @@ void CuDNNRecurrentOp<Context>::RunOnDevice() { ...@@ -160,9 +160,7 @@ void CuDNNRecurrentOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -240,9 +238,7 @@ void CuDNNRecurrentGradientOp<Context>::RunOnDevice() { ...@@ -240,9 +238,7 @@ void CuDNNRecurrentGradientOp<Context>::RunOnDevice() {
Output(3)->ReshapeLike(Input(3)); // dCx Output(3)->ReshapeLike(Input(3)); // dCx
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -31,8 +31,9 @@ void CollectiveUpdateOp<Context>::InitNCCL() { ...@@ -31,8 +31,9 @@ void CollectiveUpdateOp<Context>::InitNCCL() {
ncclUniqueId id; ncclUniqueId id;
if (comm_rank == comm_root) NCCL_CHECK(ncclGetUniqueId(&id)); if (comm_rank == comm_root) NCCL_CHECK(ncclGetUniqueId(&id));
MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, comm_root, comm); MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, comm_root, comm);
NCCL_CHECK(ncclCommInitRank(&nccl_comm, comm_size, id, comm_rank)); ctx()->SwitchToDevice();
closure = CUDAClosure<Context>(ctx()); closure = CUDAClosure<Context>(ctx());
NCCL_CHECK(ncclCommInitRank(&nccl_comm, comm_size, id, comm_rank));
#else #else
LOG(FATAL) << "NCCL was not compiled."; LOG(FATAL) << "NCCL was not compiled.";
#endif #endif
......
...@@ -29,6 +29,7 @@ void BiasAddOp<Context>::RunOnDevice() { ...@@ -29,6 +29,7 @@ void BiasAddOp<Context>::RunOnDevice() {
dim = Input(0).dim(-1); dim = Input(0).dim(-1);
inner_dim = Input(0).count(1) / dim; inner_dim = Input(0).count(1) / dim;
} else LOG(FATAL) << "Unknown data format: " << data_format; } else LOG(FATAL) << "Unknown data format: " << data_format;
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
...@@ -38,7 +39,9 @@ DEPLOY_CPU(BiasAdd); ...@@ -38,7 +39,9 @@ DEPLOY_CPU(BiasAdd);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(BiasAdd); DEPLOY_CUDA(BiasAdd);
#endif #endif
OPERATOR_SCHEMA(BiasAdd).NumInputs(2).NumOutputs(1).Inplace({ { 0, 0 } }); OPERATOR_SCHEMA(BiasAdd)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T> template <class Context> template <typename T>
void BiasAddGradientOp<Context>::RunWithType() { void BiasAddGradientOp<Context>::RunWithType() {
...@@ -62,6 +65,12 @@ void BiasAddGradientOp<Context>::RunWithType() { ...@@ -62,6 +65,12 @@ void BiasAddGradientOp<Context>::RunWithType() {
dYdata += y_offset; dYdata += y_offset;
} }
} }
if (Output(0)->name() != "ignore" &&
Output(0)->name() != Input(-1).name()) {
Output(0)->ReshapeLike(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
}
} }
template <class Context> template <class Context>
...@@ -85,7 +94,9 @@ DEPLOY_CPU(BiasAddGradient); ...@@ -85,7 +94,9 @@ DEPLOY_CPU(BiasAddGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(BiasAddGradient); DEPLOY_CUDA(BiasAddGradient);
#endif #endif
OPERATOR_SCHEMA(BiasAddGradient).NumInputs(3).NumOutputs(2); OPERATOR_SCHEMA(BiasAddGradient)
.NumInputs(3).NumOutputs(2)
.Inplace({ { 2, 0 } });
class GetBiasAddGradient final : public GradientMakerBase { class GetBiasAddGradient final : public GradientMakerBase {
public: public:
......
...@@ -13,12 +13,12 @@ void Conv2dOp<Context>::RunWithType() { ...@@ -13,12 +13,12 @@ void Conv2dOp<Context>::RunWithType() {
auto* Wdata = Input(1).template data<T, Context>(); auto* Wdata = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>(); auto* Ydata = Output(0)->template mutable_data<T, Context>();
for (int n = 0; n < Input(0).dim(0); n++) { for (int n = 0; n < Input(0).dim(0); n++)
Wx(Xdata + n * x_offset, Wdata, Ydata + n * y_offset); Wx(Xdata + n * x_offset, Wdata, Ydata + n * y_offset);
if (HasBias()) { if (HasBias()) {
auto* Bdata = Input(2).template data<T, Context>(); auto* Bdata = Input(2).template data<T, Context>();
Pb(Bdata, Ydata + n * y_offset); Pb(Bdata, Ydata);
}
} }
} }
......
...@@ -94,19 +94,9 @@ void ConvOpBase<Context>::Wx( ...@@ -94,19 +94,9 @@ void ConvOpBase<Context>::Wx(
template <class Context> template <typename T> template <class Context> template <typename T>
void ConvOpBase<Context>::Pb(const T* bias, T* y) { void ConvOpBase<Context>::Pb(const T* bias, T* y) {
DECLARE_MULTIPLIER(multiplier, out_spatial_dim); DECLARE_MULTIPLIER(multiplier, out_spatial_dim);
if (data_format == "NCHW") { kernel::BiasAdd<T, Context>(Output(0)->count(),
math::Gemm<T, Context>( Input(0).dim(0), num_output, out_spatial_dim,
CblasNoTrans, CblasNoTrans, data_format, bias, multiplier, y, ctx());
num_output, out_spatial_dim, 1,
1.0, bias, multiplier,
1.0, y, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
out_spatial_dim, num_output, 1,
1.0, multiplier, bias,
1.0, y, ctx());
}
} }
template <class Context> template <typename T> template <class Context> template <typename T>
...@@ -117,8 +107,7 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) { ...@@ -117,8 +107,7 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) {
if (data_format == "NCHW") { if (data_format == "NCHW") {
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasTrans, CblasNoTrans, CblasTrans, CblasNoTrans,
kernel_dim, kernel_dim, conv_out_spatial_dim,
conv_out_spatial_dim,
conv_out_channels / group, conv_out_channels / group,
1.0, weights + weight_offset * g, 1.0, weights + weight_offset * g,
dy + output_offset * g, dy + output_offset * g,
...@@ -126,8 +115,7 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) { ...@@ -126,8 +115,7 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) {
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
math::Gemm<T, Context>( math::Gemm<T, Context>(
CblasNoTrans, CblasTrans, CblasNoTrans, CblasTrans,
conv_out_spatial_dim, conv_out_spatial_dim, kernel_dim,
kernel_dim,
conv_out_channels / group, conv_out_channels / group,
1.0, dy + output_offset * g, 1.0, dy + output_offset * g,
weights + weight_offset * g, weights + weight_offset * g,
......
#ifdef WITH_CUDNN
#include "core/workspace.h"
#include "utils/filler.h"
#include "operators/vision/bias_add_op.h"
namespace dragon {
template <class Context> template <typename T>
void CuDNNBiasAddOp<Context>::RunWithType() {
    // Add the bias (Input(1), `dim` channels) onto the output tensor via
    // cudnnAddTensor. Fill the bias tensor first if it is still empty.
    TENSOR_FILL(Input(1), vector<TIndex>(1, dim));
    // Describe both tensors as 4D in the requested layout; the spatial
    // extent is folded into `inner_dim` (NCHW: trailing dim, NHWC: dim 2).
    if (data_format == "NCHW") {
        cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
            vector<TIndex>({ 1, dim, 1, 1 }));
        cudnnSetTensor4dDesc<T>(&output_desc, data_format,
            vector<TIndex>({ outer_dim, dim, 1, inner_dim }));
    } else if (data_format == "NHWC") {
        cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
            vector<TIndex>({ 1, 1, 1, dim }));
        cudnnSetTensor4dDesc<T>(&output_desc, data_format,
            vector<TIndex>({ outer_dim, 1, inner_dim, dim }));
    }
    auto* Bdata = Input(1).template data<T, Context>();
    auto* Ydata = Output(0)->template mutable_data<T, Context>();
    // beta == one: the bias is ACCUMULATED into Ydata rather than written.
    // NOTE(review): this assumes Output(0) already contains the input data
    // (BiasAdd is registered with Inplace {0, 0}) — confirm for the
    // non-inplace call path.
    CUDNN_CHECK(cudnnAddTensor(ctx()->cudnn_handle(),
        CUDNNType<T>::one, bias_desc, Bdata,
        CUDNNType<T>::one, output_desc, Ydata));
}
template <class Context>
void CuDNNBiasAddOp<Context>::RunOnDevice() {
    // Decompose the input into (outer, channel, inner) extents per layout.
    auto& X = Input(0);
    if (data_format == "NCHW") {
        outer_dim = X.dim(0);
        dim = X.dim(1);
        inner_dim = X.count(2);
    } else if (data_format == "NHWC") {
        outer_dim = X.dim(0);
        dim = X.dim(-1);
        inner_dim = X.count(1) / dim;
    } else {
        LOG(FATAL) << "Unknown data format: " << data_format;
    }
    Output(0)->ReshapeLike(X);
    // Dispatch on the element type of the input.
    if (XIsType(X, float)) {
        RunWithType<float>();
    } else if (XIsType(X, float16)) {
        RunWithType<float16>();
    } else {
        LOG(FATAL) << DTypeHelper(X, { "float32", "float16" });
    }
}
DEPLOY_CUDNN(BiasAdd);
template <class Context> template <typename T>
void CuDNNBiasAddGradientOp<Context>::RunWithType() {
    // Reduce dY over all non-channel axes to form the bias gradient, using
    // cudnnConvolutionBackwardBias; spatial extent is folded into inner_dim.
    if (data_format == "NCHW") {
        cudnnSetTensor4dDesc<T>(&input_desc, data_format,
            vector<TIndex>({ outer_dim, dim, 1, inner_dim }));
        cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
            vector<TIndex>({ 1, dim, 1, 1 }));
    } else if (data_format == "NHWC") {
        cudnnSetTensor4dDesc<T>(&input_desc, data_format,
            vector<TIndex>({ outer_dim, 1, inner_dim, dim }));
        cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
            vector<TIndex>({ 1, 1, 1, dim }));
    }
    auto* dYdata = Input(-1).template data<T, Context>();
    // mutable_data is requested with ctx() here (unlike elsewhere).
    // NOTE(review): beta == one means the result is ACCUMULATED into dBdata;
    // presumably the ctx()-taking overload zero-fills the buffer first —
    // confirm, otherwise stale gradients would leak in.
    T* dBdata = Output(1)->template mutable_data<T, Context>(ctx());
    CUDNN_CHECK(cudnnConvolutionBackwardBias(ctx()->cudnn_handle(),
        CUDNNType<T>::one, input_desc, dYdata,
        CUDNNType<T>::one, bias_desc, dBdata));
    // dX of a bias add is dY itself: copy it through unless the caller
    // ignores dX or the op runs in-place (Output(0) aliases Input(-1)).
    if (Output(0)->name() != "ignore" &&
        Output(0)->name() != Input(-1).name()) {
        Output(0)->ReshapeLike(Input(-1));
        Output(0)->template CopyFrom<Context>(Input(-1), ctx());
    }
}
template <class Context>
void CuDNNBiasAddGradientOp<Context>::RunOnDevice() {
    // Decompose the input into (outer, channel, inner) extents per layout.
    auto& X = Input(0);
    if (data_format == "NCHW") {
        outer_dim = X.dim(0);
        dim = X.dim(1);
        inner_dim = X.count(2);
    } else if (data_format == "NHWC") {
        outer_dim = X.dim(0);
        dim = X.dim(-1);
        inner_dim = X.count(1) / dim;
    } else {
        LOG(FATAL) << "Unknown data format: " << data_format;
    }
    // The bias gradient takes the shape of the bias input.
    Output(1)->ReshapeLike(Input(1));
    // Dispatch on the element type of the input.
    if (XIsType(X, float)) {
        RunWithType<float>();
    } else if (XIsType(X, float16)) {
        RunWithType<float16>();
    } else {
        LOG(FATAL) << DTypeHelper(X, { "float32", "float16" });
    }
}
DEPLOY_CUDNN(BiasAddGradient);
} // namespace dragon
#endif // WITH_CUDNN
\ No newline at end of file
...@@ -32,15 +32,12 @@ void CuDNNConv2dOp<Context>::ResetDesc() { ...@@ -32,15 +32,12 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
// determine the bias shape // determine the bias shape
if (HasBias()) { if (HasBias()) {
bias_offset = num_output / cudnn_group;
if (data_format == "NCHW") { if (data_format == "NCHW") {
cudnnSetTensor4dDesc<T>( cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
&bias_desc, data_format, vector<TIndex>({ 1, num_output, 1, 1 }));
vector<TIndex>({ 1, bias_offset, 1, 1 }));
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
cudnnSetTensor4dDesc<T>( cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
&bias_desc, data_format, vector<TIndex>({ 1, 1, 1, num_output }));
vector<TIndex>({ 1, 1, 1, bias_offset }));
} }
} }
...@@ -86,12 +83,13 @@ void CuDNNConv2dOp<Context>::RunWithType() { ...@@ -86,12 +83,13 @@ void CuDNNConv2dOp<Context>::RunWithType() {
filter_desc, Wdata + weight_offset * g, filter_desc, Wdata + weight_offset * g,
conv_desc, fwd_algo, WSdata, fwd_data_size, conv_desc, fwd_algo, WSdata, fwd_data_size,
CUDNNType<T>::zero, output_desc, Ydata + y_offset * g)); CUDNNType<T>::zero, output_desc, Ydata + y_offset * g));
}
if (HasBias()) { if (HasBias()) {
auto* bias = Input(2).template data<T, Context>(); auto* Bdata = Input(2).template data<T, Context>();
CUDNN_CHECK(cudnnAddTensor(cudnn_handle, CUDNN_CHECK(cudnnAddTensor(cudnn_handle,
CUDNNType<T>::one, bias_desc, bias + bias_offset * g, CUDNNType<T>::one, bias_desc, Bdata,
CUDNNType<T>::one, output_desc, Ydata + y_offset * g)); CUDNNType<T>::one, output_desc, Ydata));
}
} }
} }
...@@ -128,7 +126,6 @@ void CuDNNConv2dOp<Context>::RunOnDevice() { ...@@ -128,7 +126,6 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
#endif #endif
RunWithType<float>(); RunWithType<float>();
} else if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
#ifdef WITH_CUDA_FP16
#if CUDNN_VERSION_MIN(6, 0, 0) #if CUDNN_VERSION_MIN(6, 0, 0)
compute_type = CUDNN_DATA_FLOAT; compute_type = CUDNN_DATA_FLOAT;
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc, CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
...@@ -150,7 +147,6 @@ void CuDNNConv2dOp<Context>::RunOnDevice() { ...@@ -150,7 +147,6 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
conv_desc, CUDNN_TENSOR_OP_MATH)); conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float16>(); RunWithType<float16>();
#endif // WITH_CUDA_FP16
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -179,15 +175,12 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() { ...@@ -179,15 +175,12 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
// determine the bias shape // determine the bias shape
if (HasBias()) { if (HasBias()) {
bias_offset = num_output / cudnn_group;
if (data_format == "NCHW") { if (data_format == "NCHW") {
cudnnSetTensor4dDesc<T>( cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
&bias_desc, data_format, vector<TIndex>({ 1, num_output, 1, 1 }));
vector<TIndex>({ 1, bias_offset, 1, 1 }));
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
cudnnSetTensor4dDesc<T>(& cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
bias_desc, data_format, vector<TIndex>({ 1, 1, 1, num_output }));
vector<TIndex>({ 1, 1, 1, bias_offset }));
} }
} }
...@@ -234,13 +227,14 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() { ...@@ -234,13 +227,14 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {
auto cudnn_handle = ctx()->cudnn_handle(); auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
if (Output(2)->name() != "ignore") { if (Output(2)->name() != "ignore") {
T* dBdata = Output(2)->template mutable_data<T, Context>(ctx()); T* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle, CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g, CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g)); CUDNNType<T>::one, bias_desc, dBdata));
} }
for (int g = 0; g < cudnn_group; g++) {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx()); auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
...@@ -295,7 +289,6 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() { ...@@ -295,7 +289,6 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
#endif #endif
RunWithType<float>(); RunWithType<float>();
} else if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
#ifdef WITH_CUDA_FP16
#if CUDNN_VERSION_MIN(6, 0, 0) #if CUDNN_VERSION_MIN(6, 0, 0)
compute_type = CUDNN_DATA_FLOAT; compute_type = CUDNN_DATA_FLOAT;
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc, CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
...@@ -317,7 +310,6 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() { ...@@ -317,7 +310,6 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
conv_desc, CUDNN_TENSOR_OP_MATH)); conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float16>(); RunWithType<float16>();
#endif // WITH_CUDA_FP16
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -32,15 +32,12 @@ void CuDNNConv2dTransposeOp<Context>::ResetDesc() { ...@@ -32,15 +32,12 @@ void CuDNNConv2dTransposeOp<Context>::ResetDesc() {
// determine the bias shape // determine the bias shape
if (HasBias()) { if (HasBias()) {
bias_offset = num_output / cudnn_group;
if (data_format == "NCHW") { if (data_format == "NCHW") {
cudnnSetTensor4dDesc<T>( cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
&bias_desc, data_format, vector<TIndex>({ 1, num_output, 1, 1 }));
vector<TIndex>({ 1, bias_offset, 1, 1 }));
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
cudnnSetTensor4dDesc<T>( cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
&bias_desc, data_format, vector<TIndex>({ 1, 1, 1, num_output }));
vector<TIndex>({ 1, 1, 1, bias_offset }));
} }
} }
...@@ -86,12 +83,13 @@ void CuDNNConv2dTransposeOp<Context>::RunWithType() { ...@@ -86,12 +83,13 @@ void CuDNNConv2dTransposeOp<Context>::RunWithType() {
input_desc, Xdata + x_offset * g, input_desc, Xdata + x_offset * g,
conv_desc, fwd_algo, WSdata, fwd_data_size, conv_desc, fwd_algo, WSdata, fwd_data_size,
CUDNNType<T>::zero, output_desc, Ydata + y_offset * g)); CUDNNType<T>::zero, output_desc, Ydata + y_offset * g));
}
if (HasBias()) { if (HasBias()) {
auto* bias = Input(2).template data<T, Context>(); auto* Bdata = Input(2).template data<T, Context>();
CUDNN_CHECK(cudnnAddTensor(cudnn_handle, CUDNN_CHECK(cudnnAddTensor(cudnn_handle,
CUDNNType<T>::one, bias_desc, bias + bias_offset * g, CUDNNType<T>::one, bias_desc, Bdata,
CUDNNType<T>::one, output_desc, Ydata + y_offset * g)); CUDNNType<T>::one, output_desc, Ydata));
}
} }
} }
...@@ -128,7 +126,6 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() { ...@@ -128,7 +126,6 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
#endif #endif
RunWithType<float>(); RunWithType<float>();
} else if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
#ifdef WITH_CUDA_FP16
#if CUDNN_VERSION_MIN(6, 0, 0) #if CUDNN_VERSION_MIN(6, 0, 0)
compute_type = CUDNN_DATA_FLOAT; compute_type = CUDNN_DATA_FLOAT;
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc, CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
...@@ -150,7 +147,6 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() { ...@@ -150,7 +147,6 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
conv_desc, CUDNN_TENSOR_OP_MATH)); conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float16>(); RunWithType<float16>();
#endif // WITH_CUDA_FP16
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
...@@ -179,15 +175,12 @@ void CuDNNConv2dTransposeGradientOp<Context>::ResetDesc() { ...@@ -179,15 +175,12 @@ void CuDNNConv2dTransposeGradientOp<Context>::ResetDesc() {
// determine the bias shape // determine the bias shape
if (HasBias()) { if (HasBias()) {
bias_offset = num_output / cudnn_group;
if (data_format == "NCHW") { if (data_format == "NCHW") {
cudnnSetTensor4dDesc<T>( cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
&bias_desc, data_format, vector<TIndex>({ 1, num_output, 1, 1 }));
vector<TIndex>({ 1, bias_offset, 1, 1 }));
} else if (data_format == "NHWC") { } else if (data_format == "NHWC") {
cudnnSetTensor4dDesc<T>( cudnnSetTensor4dDesc<T>(&bias_desc, data_format,
&bias_desc, data_format, vector<TIndex>({ 1, 1, 1, num_output }));
vector<TIndex>({ 1, 1, 1, bias_offset }));
} }
} }
...@@ -234,13 +227,14 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunWithType() { ...@@ -234,13 +227,14 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunWithType() {
auto cudnn_handle = ctx()->cudnn_handle(); auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
if (Output(2)->name() != "ignore") { if (Output(2)->name() != "ignore") {
T* dBdata = Output(2)->template mutable_data<T, Context>(ctx()); T* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle, CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g, CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g)); CUDNNType<T>::one, bias_desc, dBdata));
} }
for (int g = 0; g < cudnn_group; g++) {
if (Output(1)->name() != "ignore") { if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>(); auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx()); auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
...@@ -295,7 +289,6 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() { ...@@ -295,7 +289,6 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
#endif #endif
RunWithType<float>(); RunWithType<float>();
} else if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
#ifdef WITH_CUDA_FP16
#if CUDNN_VERSION_MIN(6, 0, 0) #if CUDNN_VERSION_MIN(6, 0, 0)
compute_type = CUDNN_DATA_FLOAT; compute_type = CUDNN_DATA_FLOAT;
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc, CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
...@@ -317,7 +310,6 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() { ...@@ -317,7 +310,6 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
conv_desc, CUDNN_TENSOR_OP_MATH)); conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float16>(); RunWithType<float16>();
#endif // WITH_CUDA_FP16
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); } else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
......
...@@ -26,14 +26,9 @@ void CuDNNLRNOp<Context>::RunOnDevice() { ...@@ -26,14 +26,9 @@ void CuDNNLRNOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (this->mode == "ACROSS_CHANNELS") { if (this->mode == "ACROSS_CHANNELS") {
#ifdef WITH_CUDA_FP16
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
#else
if (XIsType(Input(0), float)) RunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
#endif
} else if (this->mode == "WITHIN_CHANNEL") { } else if (this->mode == "WITHIN_CHANNEL") {
LRNOp<Context>::RunOnDevice(); LRNOp<Context>::RunOnDevice();
} else { } else {
...@@ -69,14 +64,9 @@ void CuDNNLRNGradientOp<Context>::RunOnDevice() { ...@@ -69,14 +64,9 @@ void CuDNNLRNGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (this->mode == "ACROSS_CHANNELS") { if (this->mode == "ACROSS_CHANNELS") {
#ifdef WITH_CUDA_FP16
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
#else
if (XIsType(Input(0), float)) RunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
#endif
} else if (this->mode == "WITHIN_CHANNEL") { } else if (this->mode == "WITHIN_CHANNEL") {
LRNGradientOp<Context>::RunOnDevice(); LRNGradientOp<Context>::RunOnDevice();
} else { } else {
......
...@@ -34,14 +34,9 @@ template <class Context> ...@@ -34,14 +34,9 @@ template <class Context>
void CuDNNPooling2dOp<Context>::RunOnDevice() { void CuDNNPooling2dOp<Context>::RunOnDevice() {
Pooling2dOp<Context>::Reshape(); Pooling2dOp<Context>::Reshape();
#ifdef WITH_CUDA_FP16
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
#else
if (XIsType(Input(0), float)) RunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
#endif
} }
DEPLOY_CUDNN(Pooling2d); DEPLOY_CUDNN(Pooling2d);
...@@ -79,14 +74,9 @@ template <class Context> ...@@ -79,14 +74,9 @@ template <class Context>
void CuDNNPooling2dGradientOp<Context>::RunOnDevice() { void CuDNNPooling2dGradientOp<Context>::RunOnDevice() {
Pooling2dGradientOp<Context>::Reshape(); Pooling2dGradientOp<Context>::Reshape();
#ifdef WITH_CUDA_FP16
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>(); else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
#else
if (XIsType(Input(0), float)) RunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
#endif
} }
DEPLOY_CUDNN(Pooling2dGradient); DEPLOY_CUDNN(Pooling2dGradient);
......
...@@ -51,7 +51,8 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() { ...@@ -51,7 +51,8 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() {
} }
int idx = safe_heads.front(); int idx = safe_heads.front();
safe_heads.pop(); safe_heads.pop();
Tensor* buffer = ws()->GetTensor("/opt/mirror_stage/buffer_" + dragon_cast<string, int>(idx)); Tensor* buffer = ws()->GetTensor(
"/opt/mirror_stage/buffer_" + std::to_string(idx));
Input(0).Move(buffer->memory()); Input(0).Move(buffer->memory());
head_data[idx] = Input(0).name(); head_data[idx] = Input(0).name();
if (XIsType(Input(-2), float)) RestoreX1<float>(); if (XIsType(Input(-2), float)) RestoreX1<float>();
...@@ -86,7 +87,8 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() { ...@@ -86,7 +87,8 @@ void DenseConcatGradientOp<Context>::ElimateCorruption() {
<< "\nadd WORKSPACE_MAX_CORRUPTED_SIZE for more powerful mirror stage ?"; << "\nadd WORKSPACE_MAX_CORRUPTED_SIZE for more powerful mirror stage ?";
int idx = safe_heads.front(); int idx = safe_heads.front();
safe_heads.pop(); safe_heads.pop();
Tensor* buffer = ws()->GetTensor("/opt/mirror_stage/buffer_" + dragon_cast<string, int>(idx)); Tensor* buffer = ws()->GetTensor(
"/opt/mirror_stage/buffer_" + std::to_string(idx));
Output(i)->Move(buffer->memory()); Output(i)->Move(buffer->memory());
head_data[idx] = Output(i)->name(); head_data[idx] = Output(i)->name();
} }
......
#include "core/workspace.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/vision/drop_block_op.h"
namespace dragon {
template <class Context> template <typename T>
void DropBlock2dOp<Context>::RunWithType() {
    // DropBlock forward pass: TEST is a pass-through copy; TRAIN builds a
    // block mask, normalizes by the kept fraction, and applies it to X.
    auto* Xdata = Input(0).template data<T, Context>();
    auto* Ydata = Output(0)->template mutable_data<T, Context>();
    if (phase() == "TEST") {
        // Identity at inference; skip the copy when running in-place.
        if (Output(0) != &Input(0)) {
            ctx()->template Copy<T, Context, Context>(
                Output(0)->count(), Ydata, Xdata);
        }
    } else if (phase() == "TRAIN") {
        // Persist mask and normalizer so the gradient op can reuse them
        // (looked up again under the same "/mnt/<anchor>/drop_block/" keys).
        auto* mask = ws()->CreateTensor(
            "/mnt/" + anchor() + "/drop_block/mask");
        auto* norm = ws()->CreateTensor(
            "/mnt/" + anchor() + "/drop_block/norm");
        mask->ReshapeLike(Input(0));
        norm->Reshape(vector<TIndex>({ 1 }));
        // Scratch buffers: [0] random seeds over the seed region,
        // [1] int mask, [2] float copy of the mask for summation.
        auto WSdata = ws()->template caches<Context>({
            n * c * seed_h * seed_w * sizeof(uint32_t),
            mask->count() * sizeof(int),
            mask->count() * sizeof(float)});
        auto* Mdata = mask->template mutable_data<uint8_t, Context>();
        auto* Ndata = norm->template mutable_data<float, CPUContext>();
        // fill the mask with ones
        math::Set<int, Context>(mask->count(),
            1, (int*)WSdata[1], ctx());
        // generate 2d mask from seed region
        kernel::DropBlock2d<Context>(n, c, h, w,
            seed_h, seed_w, block_size, gamma, data_format,
            (uint32_t*)WSdata[0], (int*)WSdata[1], ctx());
        // convert to float mask for counting
        kernel::TypeA2B<int, float, Context>(mask->count(),
            (int*)WSdata[1], (float*)WSdata[2], ctx());
        // convert to uint8 mask for applying
        kernel::TypeA2B<int, uint8_t, Context>(mask->count(),
            (int*)WSdata[1], Mdata, ctx());
        // count && apply
        float normalizer = math::ASum<float, Context>(
            mask->count(), (float*)WSdata[2]);
        // Guard against a fully-dropped mask (division by zero).
        normalizer = std::max(normalizer, 1.f);
        // Scale surviving units by count/kept so activations keep their
        // expected magnitude; stash it for the backward pass.
        Ndata[0] = normalizer = mask->count() / normalizer;
        kernel::ApplyMask<T, uint8_t, Context>(mask->count(),
            normalizer, Xdata, Mdata, Ydata, ctx());
    } else LOG(FATAL) << "Incorrect Op phase: " << phase();
}
// Shape inference + hyper-parameter scheduling, then type dispatch.
// Reads N/C/H/W according to data_format, anneals apply_prob toward
// keep_prob, and derives the per-unit drop rate gamma.
template <class Context>
void DropBlock2dOp<Context>::RunOnDevice() {
    ctx()->set_stream_id(0);  // enforce default stream
    if (data_format == "NCHW") {
        n = Input(0).dim(0), c = Input(0).dim(1);
        h = Input(0).dim(2), w = Input(0).dim(3);
    } else if (data_format == "NHWC") {
        n = Input(0).dim(0), c = Input(0).dim(-1);
        h = Input(0).dim(1), w = Input(0).dim(2);
    } else {
        // Previously an unknown format fell through silently,
        // leaving n/c/h/w stale from a prior run.
        LOG(FATAL) << "Unknown data format: " << data_format;
    }
    // Seeds may be placed anywhere a full block still fits.
    seed_h = h - block_size + 1;
    seed_w = w - block_size + 1;
    CHECK(seed_h > 0 && seed_w > 0)
        << "\nExpected block_size <= feat_size.";
    Output(0)->ReshapeLike(Input(0));
    // Linearly anneal the applied drop probability down to keep_prob.
    if (decrement > 0 && apply_prob > keep_prob()) {
        apply_prob -= decrement;
    } else { apply_prob = keep_prob(); }
    // gamma: expected per-seed drop rate, corrected for block area
    // and for the valid seed region (see the DropBlock paper, Eq. 1).
    gamma = (1.f - apply_prob) / (block_size * block_size);
    gamma *= (alpha * (h * w) / (seed_h * seed_w));
    if (XIsType(Input(0), float)) RunWithType<float>();
    else if (XIsType(Input(0), float16)) RunWithType<float16>();
    else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
// Register device implementations for the forward op.
DEPLOY_CPU(DropBlock2d);
#ifdef WITH_CUDA
DEPLOY_CUDA(DropBlock2d);
#endif
// One input (X), one output (Y); Y may alias X's buffer in place.
OPERATOR_SCHEMA(DropBlock2d)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
// Backward pass of DropBlock2d: reuse the mask and normalizer that
// the forward pass stashed in the workspace, and route the incoming
// gradient through the same masked/scaled path.
template <class Context> template <typename T>
void DropBlock2dGradientOp<Context>::RunWithType() {
    auto* mask = ws()->GetTensor(
        "/mnt/" + anchor() + "/drop_block/mask");
    auto* norm = ws()->GetTensor(
        "/mnt/" + anchor() + "/drop_block/norm");
    auto* grad_in = Input(-1).template data<T, Context>();
    auto* grad_out = Output(0)->template mutable_data<T, Context>();
    auto* mask_bytes = mask->template data<uint8_t, Context>();
    auto* norm_host = norm->template mutable_data<float, CPUContext>();
    if (phase() == "TRAIN") {
        // dX = dY * mask * normalizer
        kernel::ApplyMask<T, uint8_t, Context>(mask->count(),
            norm_host[0], grad_in, mask_bytes, grad_out, ctx());
    } else if (phase() == "TEST") {
        NOT_IMPLEMENTED;
    } else LOG(FATAL) << "Incorrect Op phase: " << phase();
}
// Shape the gradient like the input, then dispatch on data type.
template <class Context>
void DropBlock2dGradientOp<Context>::RunOnDevice() {
    Output(0)->ReshapeLike(Input(0));
    if (XIsType(Input(0), float)) {
        RunWithType<float>();
    } else if (XIsType(Input(0), float16)) {
        RunWithType<float16>();
    } else {
        LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
    }
}
// Register device implementations for the gradient op.
DEPLOY_CPU(DropBlock2dGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(DropBlock2dGradient);
#endif
// Inputs: {Y, dY}; output: {dX}; dX may alias dY's buffer in place.
OPERATOR_SCHEMA(DropBlock2dGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
// Gradient maker: emits a DropBlock2dGradient op consuming the
// forward output O(0) and its gradient GO(0), producing GI(0).
class GetDropBlock2dGradient final : public GradientMakerBase {
 public:
GRADIENT_MAKER_CTOR(GetDropBlock2dGradient);
vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "",
vector<string> {O(0), GO(0)},
vector<string> {GI(0)});
}
};
REGISTER_GRADIENT(DropBlock2d, GetDropBlock2dGradient);
} // namespace dragon
\ No newline at end of file
...@@ -35,7 +35,11 @@ message Argument { ...@@ -35,7 +35,11 @@ message Argument {
repeated string strings=7; repeated string strings=7;
} }
enum DeviceType { CPU = 0; CUDA = 1; OPENCL = 2; } enum DeviceType {
CPU = 0;
CUDA = 1;
CNML = 2;
}
message DeviceOption { message DeviceOption {
optional DeviceType device_type = 1 [default = CPU]; optional DeviceType device_type = 1 [default = CPU];
......
...@@ -20,8 +20,6 @@ static_cast<void *>(&CUDNNType<double>::oneval); ...@@ -20,8 +20,6 @@ static_cast<void *>(&CUDNNType<double>::oneval);
const void* CUDNNType<double>::zero = const void* CUDNNType<double>::zero =
static_cast<void *>(&CUDNNType<double>::zeroval); static_cast<void *>(&CUDNNType<double>::zeroval);
#ifdef WITH_CUDA_FP16
float CUDNNType<float16>::oneval = 1.0; float CUDNNType<float16>::oneval = 1.0;
float CUDNNType<float16>::zeroval = 0.0; float CUDNNType<float16>::zeroval = 0.0;
const void* CUDNNType<float16>::one = const void* CUDNNType<float16>::one =
...@@ -29,8 +27,6 @@ static_cast<void*>(&CUDNNType<float16>::oneval); ...@@ -29,8 +27,6 @@ static_cast<void*>(&CUDNNType<float16>::oneval);
const void* CUDNNType<float16>::zero = const void* CUDNNType<float16>::zero =
static_cast<void*>(&CUDNNType<float16>::zeroval); static_cast<void*>(&CUDNNType<float16>::zeroval);
#endif
template <typename T> template <typename T>
void cudnnSetTensorDesc( void cudnnSetTensorDesc(
cudnnTensorDescriptor_t* desc, cudnnTensorDescriptor_t* desc,
...@@ -173,8 +169,7 @@ void cudnnSetTensor4dDesc( ...@@ -173,8 +169,7 @@ void cudnnSetTensor4dDesc(
<< "\nThe num of dimensions of Tensor(" << "\nThe num of dimensions of Tensor("
<< tensor->name() << ") " << tensor->name() << ") "
<< "should be 4, but got " << tensor->ndim() << "."; << "should be 4, but got " << tensor->ndim() << ".";
cudnnSetTensor4dDesc<T>( cudnnSetTensor4dDesc<T>(desc, data_format, tensor->dims());
desc, data_format, tensor->dims());
} }
template <typename T> template <typename T>
...@@ -186,8 +181,7 @@ void cudnnSetTensor5dDesc( ...@@ -186,8 +181,7 @@ void cudnnSetTensor5dDesc(
<< "\nThe num of dimensions of Tensor(" << "\nThe num of dimensions of Tensor("
<< tensor->name() << ") " << tensor->name() << ") "
<< "should be 5, but got " << tensor->ndim() << "."; << "should be 5, but got " << tensor->ndim() << ".";
cudnnSetTensor5dDesc<T>( cudnnSetTensor5dDesc<T>(desc, data_format, tensor->dims());
desc, data_format, tensor->dims());
} }
template <typename T> template <typename T>
...@@ -300,9 +294,6 @@ template void cudnnSetTensorDesc<double>( ...@@ -300,9 +294,6 @@ template void cudnnSetTensorDesc<double>(
const vector<TIndex>&, const vector<TIndex>&,
const vector<TIndex>&); const vector<TIndex>&);
#ifdef WITH_CUDA_FP16
template void cudnnSetTensorDesc<float16>( template void cudnnSetTensorDesc<float16>(
cudnnTensorDescriptor_t*, cudnnTensorDescriptor_t*,
Tensor*); Tensor*);
...@@ -352,8 +343,6 @@ template void cudnnSetTensorDesc<float16>( ...@@ -352,8 +343,6 @@ template void cudnnSetTensorDesc<float16>(
const vector<TIndex>&, const vector<TIndex>&,
const vector<TIndex>&); const vector<TIndex>&);
#endif // WITH_CUDA_FP16
} // namespace dragon } // namespace dragon
#endif // WITH_CUDNN #endif // WITH_CUDNN
\ No newline at end of file
...@@ -35,8 +35,7 @@ LogSeverity StrToLogSeverity(std::string level) { ...@@ -35,8 +35,7 @@ LogSeverity StrToLogSeverity(std::string level) {
} }
std::string GenLogHashKey(const char* file, int line) { std::string GenLogHashKey(const char* file, int line) {
return std::string(file) + return std::string(file) + std::to_string(line);
dragon_cast<std::string, int>(line);
} }
int EveryNRegister( int EveryNRegister(
......
...@@ -49,6 +49,21 @@ template <> void Set<int, CPUContext>( ...@@ -49,6 +49,21 @@ template <> void Set<int, CPUContext>(
#endif // WITH_SSE #endif // WITH_SSE
} }
// Fill the first n elements of x with alpha.
// Zero-fill delegates to memset, which is typically faster than a loop.
template <> void Set<int64_t, CPUContext>(
    const int n,
    const int64_t alpha,
    int64_t* x,
    CPUContext* ctx) {
    if (alpha != 0) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
        for (int idx = 0; idx < n; ++idx) x[idx] = alpha;
    } else {
        memset(x, 0, sizeof(int64_t) * n);
    }
}
template <> void Set<float16, CPUContext>( template <> void Set<float16, CPUContext>(
const int n, const int n,
const float16 alpha, const float16 alpha,
...@@ -148,19 +163,36 @@ template <> void RandomTruncatedNormal<float16, CPUContext>( ...@@ -148,19 +163,36 @@ template <> void RandomTruncatedNormal<float16, CPUContext>(
NOT_IMPLEMENTED; NOT_IMPLEMENTED;
} }
template <> void RandomBernoulli<float, CPUContext>( template <typename T>
void _RandomBernoulli(
const int n, const int n,
const float p, const float p,
uint32_t* x, T* x,
CPUContext* ctx) { CPUContext* ctx) {
std::bernoulli_distribution distribution(p); std::bernoulli_distribution distribution(p);
auto* rng = ctx->rand_generator(); auto* rng = ctx->rand_generator();
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n)) #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif #endif
for (int i = 0; i < n; ++i) x[i] = distribution(*rng); for (int i = 0; i < n; ++i) x[i] = distribution(*rng);
} }
// Draw n Bernoulli(p) samples into x as 0/1 bytes, using the
// context's RNG. Thin wrapper over the _RandomBernoulli helper above.
template <> void RandomBernoulli<uint8_t, CPUContext>(
const int n,
const float p,
uint8_t* x,
CPUContext* ctx) {
_RandomBernoulli<uint8_t>(n, p, x, ctx);
}
// Draw n Bernoulli(p) samples into x as 0/1 words, using the
// context's RNG. Thin wrapper over the _RandomBernoulli helper above.
template <> void RandomBernoulli<uint32_t, CPUContext>(
const int n,
const float p,
uint32_t* x,
CPUContext* ctx) {
_RandomBernoulli<uint32_t>(n, p, x, ctx);
}
/******************** Level-1 ********************/ /******************** Level-1 ********************/
template <> void Add<float, CPUContext>( template <> void Add<float, CPUContext>(
...@@ -311,6 +343,14 @@ template <> void Log<float, CPUContext>( ...@@ -311,6 +343,14 @@ template <> void Log<float, CPUContext>(
for (int i = 0; i < n; ++i) y[i] = std::log(x[i]); for (int i = 0; i < n; ++i) y[i] = std::log(x[i]);
} }
// float16 Log has no CPU implementation; this stub aborts via
// CPU_FP16_NOT_SUPPORTED (fp16 math is CUDA-only in this codebase).
template <> void Log<float16, CPUContext>(
int n,
const float16* x,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void Square<float, CPUContext>( template <> void Square<float, CPUContext>(
int n, int n,
const float* x, const float* x,
...@@ -379,7 +419,7 @@ template <> void Inv<float, CPUContext>( ...@@ -379,7 +419,7 @@ template <> void Inv<float, CPUContext>(
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n)) #pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif #endif
for (int i = 0; i < n; ++i) y[i] = numerator / y[i]; for (int i = 0; i < n; ++i) y[i] = numerator / x[i];
} }
template <> void Inv<float16, CPUContext>( template <> void Inv<float16, CPUContext>(
......
...@@ -53,6 +53,22 @@ template <> void Set<int, CUDAContext>( ...@@ -53,6 +53,22 @@ template <> void Set<int, CUDAContext>(
} }
} }
// Fill the first n elements of x with alpha on the context's stream.
// Zero-fill uses the async memset; otherwise launch the _Set kernel.
template <> void Set<int64_t, CUDAContext>(
const int n,
const int64_t alpha,
int64_t* x,
CUDAContext* ctx) {
if (alpha == 0) {
CUDA_CHECK(cudaMemsetAsync(x, 0,
sizeof(int64_t) * n, ctx->cuda_stream()));
}
else {
_Set<int64_t>
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x);
}
}
template <> void RandomUniform<uint32_t, CUDAContext>( template <> void RandomUniform<uint32_t, CUDAContext>(
const int n, const int n,
const float low, const float low,
...@@ -75,16 +91,6 @@ template <> void RandomNormal<float, CUDAContext>( ...@@ -75,16 +91,6 @@ template <> void RandomNormal<float, CUDAContext>(
CURAND_CHECK(curandGenerateNormal(rng, x, n, mu, sigma)); CURAND_CHECK(curandGenerateNormal(rng, x, n, mu, sigma));
} }
template <> void RandomBernoulli<float, CUDAContext>(
const int n,
const float p,
unsigned int* x,
CUDAContext* ctx) {
// curand could not generate bernoulli distribution
// we recommend implement it within specfic case, e.g. Dropout
NOT_IMPLEMENTED;
}
/******************** Level-1 ********************/ /******************** Level-1 ********************/
template <typename T> template <typename T>
......
...@@ -28,7 +28,6 @@ template <> void Set<float16, CUDAContext>( ...@@ -28,7 +28,6 @@ template <> void Set<float16, CUDAContext>(
const float16 alpha, const float16 alpha,
float16* x, float16* x,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_SetHalf<half2> _SetHalf<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -40,12 +39,8 @@ template <> void Set<float16, CUDAContext>( ...@@ -40,12 +39,8 @@ template <> void Set<float16, CUDAContext>(
<< < CUDA_BLOCKS(n), CUDA_THREADS, << < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x); 0, ctx->cuda_stream() >> >(n, alpha, x);
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
__global__ void _TypeFloat2Half( __global__ void _TypeFloat2Half(
const int n, const int n,
const float* a, const float* a,
...@@ -54,7 +49,6 @@ __global__ void _TypeFloat2Half( ...@@ -54,7 +49,6 @@ __global__ void _TypeFloat2Half(
b[idx] = __float2half(a[idx]); b[idx] = __float2half(a[idx]);
} }
} }
#endif
template <> void RandomNormal<float16, CUDAContext>( template <> void RandomNormal<float16, CUDAContext>(
const int n, const int n,
...@@ -62,7 +56,6 @@ template <> void RandomNormal<float16, CUDAContext>( ...@@ -62,7 +56,6 @@ template <> void RandomNormal<float16, CUDAContext>(
const float sigma, const float sigma,
float16* x, float16* x,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
float* xf32 = (float*)CUDAContext::New(n * sizeof(float)); float* xf32 = (float*)CUDAContext::New(n * sizeof(float));
CURAND_CHECK(curandGenerateNormal( CURAND_CHECK(curandGenerateNormal(
ctx->curand_generator(), xf32, n, mu, sigma)); ctx->curand_generator(), xf32, n, mu, sigma));
...@@ -71,14 +64,10 @@ template <> void RandomNormal<float16, CUDAContext>( ...@@ -71,14 +64,10 @@ template <> void RandomNormal<float16, CUDAContext>(
0, ctx->cuda_stream() >> >(n, 0, ctx->cuda_stream() >> >(n,
xf32, reinterpret_cast<half*>(x)); xf32, reinterpret_cast<half*>(x));
CUDAContext::Delete(xf32); CUDAContext::Delete(xf32);
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** Level-1 ********************/ /******************** Level-1 ********************/
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _AddHalf( __global__ void _AddHalf(
const int n, const int n,
...@@ -104,7 +93,6 @@ __global__ void _AddHalf2( ...@@ -104,7 +93,6 @@ __global__ void _AddHalf2(
#endif #endif
} }
} }
#endif
template <> void Add<float16, CUDAContext>( template <> void Add<float16, CUDAContext>(
int n, int n,
...@@ -112,7 +100,6 @@ template <> void Add<float16, CUDAContext>( ...@@ -112,7 +100,6 @@ template <> void Add<float16, CUDAContext>(
const float16* b, const float16* b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_AddHalf2<half2> _AddHalf2<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -128,12 +115,8 @@ template <> void Add<float16, CUDAContext>( ...@@ -128,12 +115,8 @@ template <> void Add<float16, CUDAContext>(
reinterpret_cast<const half*>(b), reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _SubHalf( __global__ void _SubHalf(
const int n, const int n,
...@@ -159,7 +142,6 @@ __global__ void _SubHalf2( ...@@ -159,7 +142,6 @@ __global__ void _SubHalf2(
#endif #endif
} }
} }
#endif
template <> void Sub<float16, CUDAContext>( template <> void Sub<float16, CUDAContext>(
int n, int n,
...@@ -167,7 +149,6 @@ template <> void Sub<float16, CUDAContext>( ...@@ -167,7 +149,6 @@ template <> void Sub<float16, CUDAContext>(
const float16* b, const float16* b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_SubHalf2<half2> _SubHalf2<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -183,12 +164,8 @@ template <> void Sub<float16, CUDAContext>( ...@@ -183,12 +164,8 @@ template <> void Sub<float16, CUDAContext>(
reinterpret_cast<const half*>(b), reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _MulHalf( __global__ void _MulHalf(
const int n, const int n,
...@@ -214,7 +191,6 @@ __global__ void _MulHalf2( ...@@ -214,7 +191,6 @@ __global__ void _MulHalf2(
#endif #endif
} }
} }
#endif
template <> void Mul<float16, CUDAContext>( template <> void Mul<float16, CUDAContext>(
int n, int n,
...@@ -222,7 +198,6 @@ template <> void Mul<float16, CUDAContext>( ...@@ -222,7 +198,6 @@ template <> void Mul<float16, CUDAContext>(
const float16* b, const float16* b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_MulHalf2<half2> _MulHalf2<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -238,12 +213,8 @@ template <> void Mul<float16, CUDAContext>( ...@@ -238,12 +213,8 @@ template <> void Mul<float16, CUDAContext>(
reinterpret_cast<const half*>(b), reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _DivHalf( __global__ void _DivHalf(
const int n, const int n,
...@@ -256,7 +227,6 @@ __global__ void _DivHalf( ...@@ -256,7 +227,6 @@ __global__ void _DivHalf(
#endif #endif
} }
} }
#endif
template <> void Div<float16, CUDAContext>( template <> void Div<float16, CUDAContext>(
int n, int n,
...@@ -264,19 +234,59 @@ template <> void Div<float16, CUDAContext>( ...@@ -264,19 +234,59 @@ template <> void Div<float16, CUDAContext>(
const float16* b, const float16* b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_DivHalf<half> _DivHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS, << < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, 0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a), reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b), reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
#else }
CUDA_FP16_NOT_COMPILED;
// Element-wise natural log for half data (one lane per element).
// hlog requires sm_53+; on older architectures the body compiles to
// a no-op and y is left untouched (matches the other half kernels).
template <typename T>
__global__ void _LogHalf(
const int n,
const T* a,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = hlog(a[idx]);
#endif
}
}
// Element-wise natural log for half2 data (two lanes per element).
// h2log requires sm_53+; on older architectures the body compiles to
// a no-op and y is left untouched (matches the other half kernels).
template <typename T>
__global__ void _LogHalf2(
const int n,
const T* a,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = h2log(a[idx]);
#endif
}
}
// Compute y = log(x) element-wise for float16 on CUDA.
// Even-length inputs are vectorized two lanes at a time via half2;
// odd lengths fall back to the scalar half kernel.
template <> void Log<float16, CUDAContext>(
    int n,
    const float16* x,
    float16* y,
    CUDAContext* ctx) {
    if (n & 1) {
        _LogHalf<half>
            << < CUDA_BLOCKS(n), CUDA_THREADS,
                0, ctx->cuda_stream() >> >(n,
                    reinterpret_cast<const half*>(x),
                    reinterpret_cast<half*>(y));
    } else {
        const int n2 = n >> 1;
        _LogHalf2<half2>
            << < CUDA_BLOCKS(n2), CUDA_THREADS,
                0, ctx->cuda_stream() >> >(n2,
                    reinterpret_cast<const half2*>(x),
                    reinterpret_cast<half2*>(y));
    }
}
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _SquareHalf( __global__ void _SquareHalf(
const int n, const int n,
...@@ -300,14 +310,12 @@ __global__ void _SquareHalf2( ...@@ -300,14 +310,12 @@ __global__ void _SquareHalf2(
#endif #endif
} }
} }
#endif
template <> void Square<float16, CUDAContext>( template <> void Square<float16, CUDAContext>(
int n, int n,
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_SquareHalf2<half2> _SquareHalf2<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -321,12 +329,8 @@ template <> void Square<float16, CUDAContext>( ...@@ -321,12 +329,8 @@ template <> void Square<float16, CUDAContext>(
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _SqrtHalf( __global__ void _SqrtHalf(
int n, int n,
...@@ -350,14 +354,12 @@ __global__ void _SqrtHalf2( ...@@ -350,14 +354,12 @@ __global__ void _SqrtHalf2(
#endif #endif
} }
} }
#endif
template <> void Sqrt<float16, CUDAContext>( template <> void Sqrt<float16, CUDAContext>(
int n, int n,
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_SqrtHalf2<half2> _SqrtHalf2<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -371,12 +373,8 @@ template <> void Sqrt<float16, CUDAContext>( ...@@ -371,12 +373,8 @@ template <> void Sqrt<float16, CUDAContext>(
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _PowHalf( __global__ void _PowHalf(
const int n, const int n,
...@@ -402,7 +400,6 @@ __global__ void _PowHalf2( ...@@ -402,7 +400,6 @@ __global__ void _PowHalf2(
#endif #endif
} }
} }
#endif
template <> void Pow<float16, CUDAContext>( template <> void Pow<float16, CUDAContext>(
int n, int n,
...@@ -410,7 +407,6 @@ template <> void Pow<float16, CUDAContext>( ...@@ -410,7 +407,6 @@ template <> void Pow<float16, CUDAContext>(
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
CHECK(alpha == float(2)) << "fp16 only support the power of 2"; CHECK(alpha == float(2)) << "fp16 only support the power of 2";
if ((n & 1) == 0) { if ((n & 1) == 0) {
_PowHalf2<half2> _PowHalf2<half2>
...@@ -425,12 +421,8 @@ template <> void Pow<float16, CUDAContext>( ...@@ -425,12 +421,8 @@ template <> void Pow<float16, CUDAContext>(
alpha, reinterpret_cast<const half*>(x), alpha, reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _InvHalf( __global__ void _InvHalf(
const int n, const int n,
...@@ -456,7 +448,6 @@ __global__ void _InvHalf2( ...@@ -456,7 +448,6 @@ __global__ void _InvHalf2(
#endif #endif
} }
} }
#endif
template <> void Inv<float16, CUDAContext>( template <> void Inv<float16, CUDAContext>(
const int n, const int n,
...@@ -464,7 +455,6 @@ template <> void Inv<float16, CUDAContext>( ...@@ -464,7 +455,6 @@ template <> void Inv<float16, CUDAContext>(
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_InvHalf2<half2> _InvHalf2<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -480,9 +470,6 @@ template <> void Inv<float16, CUDAContext>( ...@@ -480,9 +470,6 @@ template <> void Inv<float16, CUDAContext>(
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** Level-2 ********************/ /******************** Level-2 ********************/
...@@ -492,15 +479,11 @@ template <> void Scal<float16, CUDAContext>( ...@@ -492,15 +479,11 @@ template <> void Scal<float16, CUDAContext>(
const float alpha, const float alpha,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasScalEx( CUBLAS_CHECK(cublasScalEx(
ctx->cublas_handle(), n, ctx->cublas_handle(), n,
&alpha, CUDA_R_32F, &alpha, CUDA_R_32F,
y, CUDA_R_16F, 1, y, CUDA_R_16F, 1,
CUDA_R_32F)); CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
template <> void Scale<float16, CUDAContext>( template <> void Scale<float16, CUDAContext>(
...@@ -519,7 +502,6 @@ template <> void Dot<float16, CUDAContext>( ...@@ -519,7 +502,6 @@ template <> void Dot<float16, CUDAContext>(
const float16* b, const float16* b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasDotEx( CUBLAS_CHECK(cublasDotEx(
ctx->cublas_handle(), n, ctx->cublas_handle(), n,
a, CUDA_R_16F, 1, a, CUDA_R_16F, 1,
...@@ -527,12 +509,8 @@ template <> void Dot<float16, CUDAContext>( ...@@ -527,12 +509,8 @@ template <> void Dot<float16, CUDAContext>(
y, CUDA_R_16F, y, CUDA_R_16F,
CUDA_R_32F)); CUDA_R_32F));
ctx->FinishDeviceCompution(); ctx->FinishDeviceCompution();
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _AddScalarHalf( __global__ void _AddScalarHalf(
const int n, const int n,
...@@ -556,14 +534,12 @@ __global__ void _AddScalarHalf2( ...@@ -556,14 +534,12 @@ __global__ void _AddScalarHalf2(
#endif #endif
} }
} }
#endif
template <> void AddScalar<float16, CUDAContext>( template <> void AddScalar<float16, CUDAContext>(
const int n, const int n,
const float alpha, const float alpha,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_AddScalarHalf2<half2> _AddScalarHalf2<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -577,12 +553,8 @@ template <> void AddScalar<float16, CUDAContext>( ...@@ -577,12 +553,8 @@ template <> void AddScalar<float16, CUDAContext>(
dragon_cast<half, float>(alpha), dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _MulScalarHalf( __global__ void _MulScalarHalf(
const int n, const int n,
...@@ -606,14 +578,12 @@ __global__ void _MulScalarHalf2( ...@@ -606,14 +578,12 @@ __global__ void _MulScalarHalf2(
#endif #endif
} }
} }
#endif
template <> void MulScalar<float16, CUDAContext>( template <> void MulScalar<float16, CUDAContext>(
const int n, const int n,
const float alpha, const float alpha,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if ((n & 1) == 0) { if ((n & 1) == 0) {
_MulScalarHalf2<half2> _MulScalarHalf2<half2>
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS, << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
...@@ -627,9 +597,6 @@ template <> void MulScalar<float16, CUDAContext>( ...@@ -627,9 +597,6 @@ template <> void MulScalar<float16, CUDAContext>(
dragon_cast<half, float>(alpha), dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
template <> void Axpy<float16, CUDAContext>( template <> void Axpy<float16, CUDAContext>(
...@@ -638,16 +605,12 @@ template <> void Axpy<float16, CUDAContext>( ...@@ -638,16 +605,12 @@ template <> void Axpy<float16, CUDAContext>(
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasAxpyEx( CUBLAS_CHECK(cublasAxpyEx(
ctx->cublas_handle(), n, ctx->cublas_handle(), n,
&alpha, CUDA_R_32F, &alpha, CUDA_R_32F,
x, CUDA_R_16F, 1, x, CUDA_R_16F, 1,
y, CUDA_R_16F, 1, y, CUDA_R_16F, 1,
CUDA_R_32F)); CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
template <> void Axpby<float16, CUDAContext>( template <> void Axpby<float16, CUDAContext>(
...@@ -667,7 +630,6 @@ template <> void RandomUniform<float16, CUDAContext>( ...@@ -667,7 +630,6 @@ template <> void RandomUniform<float16, CUDAContext>(
const float high, const float high,
float16* x, float16* x,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
float* xf32 = (float*)ctx->New(n * sizeof(float)); float* xf32 = (float*)ctx->New(n * sizeof(float));
CURAND_CHECK(curandGenerateUniform( CURAND_CHECK(curandGenerateUniform(
ctx->curand_generator(), xf32, n)); ctx->curand_generator(), xf32, n));
...@@ -679,9 +641,6 @@ template <> void RandomUniform<float16, CUDAContext>( ...@@ -679,9 +641,6 @@ template <> void RandomUniform<float16, CUDAContext>(
if (range != 1.f) Scal<float16, CUDAContext>(n, range, x, ctx); if (range != 1.f) Scal<float16, CUDAContext>(n, range, x, ctx);
if (low != 0.f) AddScalar<float16, CUDAContext>(n, low, x, ctx); if (low != 0.f) AddScalar<float16, CUDAContext>(n, low, x, ctx);
ctx->Delete(xf32); ctx->Delete(xf32);
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** Level-3 ********************/ /******************** Level-3 ********************/
...@@ -699,7 +658,6 @@ template <> void Gemm<float16, CUDAContext>( ...@@ -699,7 +658,6 @@ template <> void Gemm<float16, CUDAContext>(
float16* C, float16* C,
CUDAContext* ctx, CUDAContext* ctx,
TensorProto_DataType math_type) { TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
int lda = (TransA == CblasNoTrans) ? K : M; int lda = (TransA == CblasNoTrans) ? K : M;
int ldb = (TransB == CblasNoTrans) ? N : K; int ldb = (TransB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? cublasOperation_t cuTransA = (TransA == CblasNoTrans) ?
...@@ -782,9 +740,6 @@ template <> void Gemm<float16, CUDAContext>( ...@@ -782,9 +740,6 @@ template <> void Gemm<float16, CUDAContext>(
} else { } else {
LOG(FATAL) << "Unsupported math type"; LOG(FATAL) << "Unsupported math type";
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
template <> void Gemv<float16, CUDAContext>( template <> void Gemv<float16, CUDAContext>(
...@@ -798,7 +753,6 @@ template <> void Gemv<float16, CUDAContext>( ...@@ -798,7 +753,6 @@ template <> void Gemv<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx, CUDAContext* ctx,
TensorProto_DataType math_type) { TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? cublasOperation_t cuTransA = (TransA == CblasNoTrans) ?
CUBLAS_OP_T : CUBLAS_OP_N; CUBLAS_OP_T : CUBLAS_OP_N;
int m = (cuTransA == CUBLAS_OP_N) ? N : M; int m = (cuTransA == CUBLAS_OP_N) ? N : M;
...@@ -881,9 +835,6 @@ template <> void Gemv<float16, CUDAContext>( ...@@ -881,9 +835,6 @@ template <> void Gemv<float16, CUDAContext>(
} else { } else {
LOG(FATAL) << "Unsupported math type"; LOG(FATAL) << "Unsupported math type";
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
} // namespace math } // namespace math
......
...@@ -21,30 +21,65 @@ template<> void Dropout<float, CPUContext>( ...@@ -21,30 +21,65 @@ template<> void Dropout<float, CPUContext>(
float prob, float prob,
float scale, float scale,
const float* x, const float* x,
uint32_t* mask, uint32_t* mask32,
uint8_t* mask8,
float* y, float* y,
CPUContext* ctx) { CPUContext* ctx) {
uint32_t thresh = static_cast<uint32_t>(UINT_MAX * prob); math::RandomBernoulli<uint8_t, CPUContext>(
math::RandomBernoulli<float, CPUContext>(count, 1 - prob, mask, ctx); count, 1 - prob, mask8, ctx);
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count)) #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) y[i] = x[i] * mask[i] * scale; for (int i = 0; i < count; ++i) {
y[i] = x[i] * mask8[i] * scale;
}
} }
template<> void DropoutGrad<float, CPUContext>( template<> void Dropout<float16, CPUContext>(
const int count, const int count,
float prob, float prob,
float scale, float scale,
const float* dy, const float16* x,
const uint32_t* mask, uint32_t* mask32,
float* dx, uint8_t* mask8,
float16* y,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <typename Tx, typename Tm>
void _ApplyMask(
const int count,
const float scale,
const Tx* x,
const Tm* mask,
Tx* y) {
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count)) #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) for (int i = 0; i < count; ++i) {
dx[i] = dy[i] * mask[i] * scale; y[i] = x[i] * mask[i] * scale;
}
}
template <> void ApplyMask<float, uint8_t, CPUContext>(
const int count,
const float scale,
const float* x,
const uint8_t* mask,
float* y,
CPUContext* ctx) {
_ApplyMask<float, uint8_t>(count, scale, x, mask, y);
}
template <> void ApplyMask<float16, uint8_t, CPUContext>(
const int count,
const float scale,
const float16* x,
const uint8_t* mask,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
} }
/******************** activation.elu ********************/ /******************** activation.elu ********************/
...@@ -479,19 +514,161 @@ template <> void Clip<float, CPUContext>( ...@@ -479,19 +514,161 @@ template <> void Clip<float, CPUContext>(
const float low, const float low,
const float high, const float high,
const float* x, const float* x,
float* mask,
float* y, float* y,
CPUContext* ctx) { CPUContext* ctx) {
#ifdef WITH_OMP #ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count)) #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
mask[i] = 1.0;
if (x[i] < low || x[i] > high) mask[i] = 0.0;
y[i] = std::max(low, std::min(x[i], high)); y[i] = std::max(low, std::min(x[i], high));
} }
} }
/*! Gradient of Clip: pass dy through where x stayed inside [low, high];
    elements that were clipped contribute zero gradient. */
template <> void ClipGrad<float, CPUContext>(
    const int count,
    const float low,
    const float high,
    const float* x,
    const float* dy,
    float* dx,
    CPUContext* ctx) {
#ifdef WITH_OMP
    #pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
    for (int i = 0; i < count; ++i) {
        const bool clipped = (x[i] < low) || (x[i] > high);
        dx[i] = clipped ? 0 : dy[i];
    }
}
/******************** arithmetic.maximum ********************/
/*! Element-wise maximum of two arrays: y[i] = max(x1[i], x2[i]). */
template <> void MaximumE<float, CPUContext>(
const int count,
const float* x1,
const float* x2,
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::max(x1[i], x2[i]);
}
}
/*! Broadcast maximum against a scalar: y[i] = max(x1[i], x2). */
template <> void MaximumB<float, CPUContext>(
const int count,
const float* x1,
const float x2,
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::max(x1[i], x2);
}
}
/*! Gradient of element-wise maximum: dy flows to whichever input was larger.
    On ties (x1[i] == x2[i]) the gradient goes to x2. */
template <> void MaximumEGrad<float, CPUContext>(
const int count,
const float* x1,
const float* x2,
const float* dy,
float* dx1,
float* dx2,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const bool dy_to_dx1 = x1[i] > x2[i];
dx1[i] = dy_to_dx1 ? dy[i] : 0;
dx2[i] = dy_to_dx1 ? 0 : dy[i];
}
}
/*! Gradient of broadcast maximum; the scalar operand gets no gradient
    (its slot is intentionally commented out in the signature). */
template <> void MaximumBGrad<float, CPUContext>(
const int count,
const float* x1,
const float x2,
const float* dy,
float* dx1,
/* float* dx2, */
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx1[i] = (x1[i] > x2) ? dy[i] : 0;
}
}
/******************** arithmetic.minimum ********************/
/*! Element-wise minimum of two arrays: y[i] = min(x1[i], x2[i]). */
template <> void MinimumE<float, CPUContext>(
const int count,
const float* x1,
const float* x2,
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::min(x1[i], x2[i]);
}
}
/*! Broadcast minimum against a scalar: y[i] = min(x1[i], x2). */
template <> void MinimumB<float, CPUContext>(
const int count,
const float* x1,
const float x2,
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = std::min(x1[i], x2);
}
}
/*! Gradient of element-wise minimum: dy flows to whichever input was smaller.
    On ties (x1[i] == x2[i]) the gradient goes to x2. */
template <> void MinimumEGrad<float, CPUContext>(
const int count,
const float* x1,
const float* x2,
const float* dy,
float* dx1,
float* dx2,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
const bool dy_to_dx1 = x1[i] < x2[i];
dx1[i] = dy_to_dx1 ? dy[i] : 0;
dx2[i] = dy_to_dx1 ? 0 : dy[i];
}
}
/*! Gradient of broadcast minimum; the scalar operand gets no gradient. */
template <> void MinimumBGrad<float, CPUContext>(
const int count,
const float* x1,
const float x2,
const float* dy,
float* dx1,
/* float* dx2, */
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx1[i] = (x1[i] < x2) ? dy[i] : 0;
}
}
/******************** control_flow.compare ********************/ /******************** control_flow.compare ********************/
template <> void Equal<float, CPUContext>( template <> void Equal<float, CPUContext>(
...@@ -524,6 +701,189 @@ template<> void AbsGrad<float, CPUContext>( ...@@ -524,6 +701,189 @@ template<> void AbsGrad<float, CPUContext>(
} }
} }
/******************** loss.nll_loss ********************/
/*! NLLLoss forward (CPU): for each of outer_dim * inner_dim predictions,
    losses[i] = -log_prob at the target class, flags[i] = 1; predictions
    whose label appears in `ignores` get losses[i] = flags[i] = 0. */
template <typename Tx, typename Ty>
void _NLLLoss(
    const int outer_dim,
    const int axis_dim,
    const int inner_dim,
    const Tx* log_prob,
    const Ty* labels,
    const int* ignores,
    const int num_ignores,
    Tx* losses,
    Tx* flags) {
    const int num_preds = outer_dim * inner_dim;
    for (int idx = 0; idx < num_preds; ++idx) {
        const int oix = idx / inner_dim;
        const int iix = idx % inner_dim;
        const int label = labels[idx];
        // Linear scan of the (typically tiny) ignore list.
        bool ignored = false;
        for (int k = 0; k < num_ignores && !ignored; ++k)
            ignored = (label == ignores[k]);
        if (ignored) {
            losses[idx] = flags[idx] = 0;
        } else {
            losses[idx] = -log_prob[
                (oix * axis_dim + label) * inner_dim + iix];
            flags[idx] = 1;
        }
    }
}
/*! NLLLoss, float log-probs with float labels. */
template <> void NLLLoss<float, float, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float* log_prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CPUContext* ctx) {
_NLLLoss<float, float>(
outer_dim, axis_dim, inner_dim,
log_prob, labels, ignores,
num_ignores, losses, flags);
}
/*! fp16 log-probs are not supported on CPU; aborts at runtime. */
template <> void NLLLoss<float16, float, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* log_prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
/*! NLLLoss, float log-probs with int64 labels. */
template <> void NLLLoss<float, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float* log_prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CPUContext* ctx) {
_NLLLoss<float, int64_t>(
outer_dim, axis_dim, inner_dim,
log_prob, labels, ignores,
num_ignores, losses, flags);
}
/*! fp16 log-probs are not supported on CPU; aborts at runtime. */
template <> void NLLLoss<float16, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* log_prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
/*! NLLLoss backward (CPU): writes -1 into dx at each non-ignored target
    position and counts the valid targets into flags[0].
    NOTE(review): dx is not zeroed here — presumably the caller clears it
    before this runs; confirm at call sites. */
template <typename Tx, typename Ty>
void _NLLLossGrad(
    const int outer_dim,
    const int axis_dim,
    const int inner_dim,
    const Tx* log_prob,
    const Ty* labels,
    const int* ignores,
    const int num_ignores,
    Tx* dx,
    Tx* flags) {
    flags[0] = 0;  // accumulates the number of contributing targets
    const int num_preds = outer_dim * inner_dim;
    for (int idx = 0; idx < num_preds; ++idx) {
        const int oix = idx / inner_dim;
        const int iix = idx % inner_dim;
        const int label = labels[idx];
        bool ignored = false;
        for (int k = 0; k < num_ignores && !ignored; ++k)
            ignored = (label == ignores[k]);
        if (!ignored) {
            dx[(oix * axis_dim + label) * inner_dim + iix] = -1;
            flags[0]++;
        }
    }
}
/*! NLLLossGrad, float log-probs with float labels. */
template<> void NLLLossGrad<float, float, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float* log_prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* dx,
float* flags,
CPUContext* ctx) {
_NLLLossGrad<float, float>(
outer_dim, axis_dim, inner_dim,
log_prob, labels, ignores,
num_ignores, dx, flags);
}
/*! fp16 gradients are not supported on CPU; aborts at runtime. */
template<> void NLLLossGrad<float16, float, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* log_prob,
const float* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
/*! NLLLossGrad, float log-probs with int64 labels. */
template<> void NLLLossGrad<float, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float* log_prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* dx,
float* flags,
CPUContext* ctx) {
_NLLLossGrad<float, int64_t>(
outer_dim, axis_dim, inner_dim,
log_prob, labels, ignores,
num_ignores, dx, flags);
}
/*! fp16 gradients are not supported on CPU; aborts at runtime. */
template<> void NLLLossGrad<float16, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* log_prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
/******************** loss.sigmoid_cross_entropy ********************/ /******************** loss.sigmoid_cross_entropy ********************/
template <> void SigmoidCrossEntropy<float, CPUContext>( template <> void SigmoidCrossEntropy<float, CPUContext>(
...@@ -2706,6 +3066,94 @@ template<> void Col2Im2d<float, CPUContext>( ...@@ -2706,6 +3066,94 @@ template<> void Col2Im2d<float, CPUContext>(
} else LOG(FATAL) << "Unknown data format: " << data_format; } else LOG(FATAL) << "Unknown data format: " << data_format;
} }
/******************** vision.drop_block ********************/
/*! Clear block_size x block_size squares of an NCHW int mask wherever the
    corresponding seed element is nonzero. One seed per (n, c, y, x) over a
    seed_h x seed_w grid; seed_idx walks that grid in order.
    NOTE(review): assumes seed_h/seed_w leave room so that y + block_size <= H
    and x + block_size <= W — confirm at call sites. */
void _DropBlock2d_NCHW(
const int N,
const int C,
const int H,
const int W,
const int seed_h,
const int seed_w,
const int block_size,
const uint32_t* seed,
int* mask) {
TIndex seed_idx = 0;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
const int nc = (n * C + c) * H;
for (int y = 0; y < seed_h; ++y) {
for (int x = 0; x < seed_w; ++x) {
if (seed[seed_idx] > 0) {
for (int i = 0; i < block_size; ++i) {
const int nch = (nc + y + i) * W;
for (int j = 0; j < block_size; ++j) {
// &= 0 zeroes the element while leaving already-cleared bits cleared.
mask[nch + x + j] &= 0;
} // end j
} // end i
}
seed_idx++;
} // end x
} // end y
} // end c
} // end n
}
/*! Same as _DropBlock2d_NCHW but indexes the mask with NHWC layout. */
void _DropBlock2d_NHWC(
const int N,
const int C,
const int H,
const int W,
const int seed_h,
const int seed_w,
const int block_size,
const uint32_t* seed,
int* mask) {
TIndex seed_idx = 0;
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
for (int y = 0; y < seed_h; ++y) {
for (int x = 0; x < seed_w; ++x) {
if (seed[seed_idx] > 0) {
for (int i = 0; i < block_size; ++i) {
const int nh = (n * H + y + i) * W;
for (int j = 0; j < block_size; ++j) {
mask[(nh + x + j) * C + c] &= 0;
} // end j
} // end i
}
seed_idx++;
} // end x
} // end y
} // end c
} // end n
}
/*! DropBlock2d (CPU): draw a Bernoulli(gamma) seed per (n, c, y, x) over the
    seed_h x seed_w grid, then zero a block_size x block_size square of `mask`
    around each active seed, honoring the NCHW/NHWC layout. */
template <> void DropBlock2d<CPUContext>(
const int N,
const int C,
const int H,
const int W,
const int seed_h,
const int seed_w,
const int block_size,
const float gamma,
const string& data_format,
uint32_t* seed,
int* mask,
CPUContext* ctx) {
const int count = N * C * seed_h * seed_w;
math::RandomBernoulli<uint32_t, CPUContext>(
count, gamma, seed, ctx);
if (data_format == "NCHW") {
_DropBlock2d_NCHW(N, C, H, W,
seed_h, seed_w, block_size, seed, mask);
} else if (data_format == "NHWC") {
_DropBlock2d_NHWC(N, C, H, W,
seed_h, seed_w, block_size, seed, mask);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
/******************** vision.nn_resize ********************/ /******************** vision.nn_resize ********************/
template <typename T> template <typename T>
......
...@@ -19,12 +19,14 @@ template<typename T> ...@@ -19,12 +19,14 @@ template<typename T>
__global__ void _Dropout( __global__ void _Dropout(
const int count, const int count,
const uint32_t thresh, const uint32_t thresh,
const T scale, const float scale,
const T* x, const T* x,
const uint32_t* mask, const uint32_t* mask32,
uint8_t* mask8,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) { CUDA_1D_KERNEL_LOOP(idx, count) {
y[idx] = x[idx] * (mask[idx] > thresh) * scale; mask8[idx] = (mask32[idx] > thresh);
y[idx] = x[idx] * mask8[idx] * scale;
} }
} }
...@@ -33,44 +35,42 @@ template<> void Dropout<float, CUDAContext>( ...@@ -33,44 +35,42 @@ template<> void Dropout<float, CUDAContext>(
float prob, float prob,
float scale, float scale,
const float* x, const float* x,
uint32_t* mask, uint32_t* mask32,
uint8_t* mask8,
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
uint32_t thresh = static_cast<uint32_t>(UINT_MAX * prob);
math::RandomUniform<uint32_t, CUDAContext>( math::RandomUniform<uint32_t, CUDAContext>(
count, float(0), float(UINT_MAX), mask, ctx); count, float(0), float(UINT_MAX), mask32, ctx);
auto thresh = static_cast<uint32_t>(UINT_MAX * prob);
_Dropout<float> _Dropout<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >> >(count,
count, thresh, scale, x, mask, y); thresh, scale, x, mask32, mask8, y);
} }
template <typename T> template <typename Tx, typename Tm>
__global__ void _DropoutGrad( __global__ void _ApplyMask(
const int count, const int count,
const uint32_t thresh, const float scale,
const T scale, const Tx* x,
const T* dy, const Tm* mask,
const uint32_t* mask, Tx* y) {
T* dx) {
CUDA_1D_KERNEL_LOOP(idx, count) { CUDA_1D_KERNEL_LOOP(idx, count) {
dx[idx] = dy[idx] * (mask[idx] > thresh) * scale; y[idx] = x[idx] * mask[idx] * scale;
} }
} }
template<> void DropoutGrad<float, CUDAContext>( template <> void ApplyMask<float, uint8_t, CUDAContext>(
const int count, const int count,
float prob, const float scale,
float scale, const float* x,
const float* dy, const uint8_t* mask,
const uint32_t* mask, float* y,
float* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
uint32_t thresh = static_cast<uint32_t>(UINT_MAX * prob); _ApplyMask<float, uint8_t>
_DropoutGrad<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >( 0, ctx->cuda_stream() >> >(count,
count, thresh, scale, dy, mask, dx); scale, x, mask, y);
} }
/******************** activation.prelu ********************/ /******************** activation.prelu ********************/
...@@ -753,13 +753,9 @@ __global__ void _Clip( ...@@ -753,13 +753,9 @@ __global__ void _Clip(
const T low, const T low,
const T high, const T high,
const T* x, const T* x,
T* mask,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) { CUDA_1D_KERNEL_LOOP(idx, count) {
mask[idx] = 1.0; y[idx] = max(low, min(x[idx], high));
if (x[idx] > high || x[idx] < low) mask[idx] = 0.0;
y[idx] = x[idx] > high ? high : x[idx];
y[idx] = x[idx] < low ? low : x[idx];
} }
} }
...@@ -768,13 +764,237 @@ template <> void Clip<float, CUDAContext>( ...@@ -768,13 +764,237 @@ template <> void Clip<float, CUDAContext>(
const float low, const float low,
const float high, const float high,
const float* x, const float* x,
float* mask,
float* y, float* y,
CUDAContext* ctx) { CUDAContext* ctx) {
_Clip<float> _Clip<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, low, high, x, y);
}
/*! CUDA kernel for ClipGrad: dx = dy where x stayed inside [low, high],
    zero where the forward pass clipped. */
template <typename T>
__global__ void _ClipGrad(
const int count,
const T low,
const T high,
const T* x,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const T xi = x[idx];
dx[idx] = (xi < low || xi > high) ? 0 : dy[idx];
}
}
/*! ClipGrad<float, CUDAContext>: launch _ClipGrad over `count` elements on
    the context's stream. (Lines previously carried fused old-column diff
    text — `low, high, x, mask, y);` residue — removed here.) */
template <> void ClipGrad<float, CUDAContext>(
    const int count,
    const float low,
    const float high,
    const float* x,
    const float* dy,
    float* dx,
    CUDAContext* ctx) {
    _ClipGrad<float>
        << < CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >> >(count,
                low, high, x, dy, dx);
}
/******************** arithmetic.maximum ********************/
/*! Element-wise maximum kernel: y[idx] = max(x1[idx], x2[idx]). */
template <typename T>
__global__ void _MaximumE(
const int count,
const T* x1,
const T* x2,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
y[idx] = max(x1[idx], x2[idx]);
}
}
template <> void MaximumE<float, CUDAContext>(
const int count,
const float* x1,
const float* x2,
float* y,
CUDAContext* ctx) {
_MaximumE<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, x1, x2, y);
}
/*! Broadcast maximum kernel against a scalar x2. */
template <typename T>
__global__ void _MaximumB(
const int count,
const T* x1,
const T x2,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
y[idx] = max(x1[idx], x2);
}
}
template <> void MaximumB<float, CUDAContext>(
const int count,
const float* x1,
const float x2,
float* y,
CUDAContext* ctx) {
_MaximumB<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, x1, x2, y);
}
/*! Gradient kernel: dy flows to the larger input; ties favor x2. */
template <typename T>
__global__ void _MaximumEGrad(
const int count,
const T* x1,
const T* x2,
const T* dy,
T* dx1,
T* dx2) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const bool dy_to_dx1 = x1[idx] > x2[idx];
dx1[idx] = dy_to_dx1 ? dy[idx] : 0;
dx2[idx] = dy_to_dx1 ? 0 : dy[idx];
}
}
template <> void MaximumEGrad<float, CUDAContext>(
const int count,
const float* x1,
const float* x2,
const float* dy,
float* dx1,
float* dx2,
CUDAContext* ctx) {
_MaximumEGrad<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, x1, x2, dy, dx1, dx2);
}
/*! Broadcast gradient kernel; the scalar operand receives no gradient. */
template <typename T>
__global__ void _MaximumBGrad(
const int count,
const T* x1,
const T x2,
const T* dy,
T* dx1) {
CUDA_1D_KERNEL_LOOP(idx, count) {
dx1[idx] = (x1[idx] > x2) ? dy[idx] : 0;
}
}
template <> void MaximumBGrad<float, CUDAContext>(
const int count,
const float* x1,
const float x2,
const float* dy,
float* dx1,
/* float* dx2, */
CUDAContext* ctx) {
_MaximumBGrad<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, x1, x2, dy, dx1);
}
/******************** arithmetic.minimum ********************/
/*! Element-wise minimum kernel: y[idx] = min(x1[idx], x2[idx]). */
template <typename T>
__global__ void _MinimumE(
const int count,
const T* x1,
const T* x2,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
y[idx] = min(x1[idx], x2[idx]);
}
}
template <> void MinimumE<float, CUDAContext>(
const int count,
const float* x1,
const float* x2,
float* y,
CUDAContext* ctx) {
_MinimumE<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, x1, x2, y);
}
/*! Broadcast minimum kernel against a scalar x2. */
template <typename T>
__global__ void _MinimumB(
const int count,
const T* x1,
const T x2,
T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
y[idx] = min(x1[idx], x2);
}
}
template <> void MinimumB<float, CUDAContext>(
const int count,
const float* x1,
const float x2,
float* y,
CUDAContext* ctx) {
_MinimumB<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, x1, x2, y);
}
/*! Gradient kernel: dy flows to the smaller input; ties favor x2. */
template <typename T>
__global__ void _MinimumEGrad(
const int count,
const T* x1,
const T* x2,
const T* dy,
T* dx1,
T* dx2) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const bool dy_to_dx1 = x1[idx] < x2[idx];
dx1[idx] = dy_to_dx1 ? dy[idx] : 0;
dx2[idx] = dy_to_dx1 ? 0 : dy[idx];
}
}
template <> void MinimumEGrad<float, CUDAContext>(
const int count,
const float* x1,
const float* x2,
const float* dy,
float* dx1,
float* dx2,
CUDAContext* ctx) {
_MinimumEGrad<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, x1, x2, dy, dx1, dx2);
}
/*! Broadcast gradient kernel; the scalar operand receives no gradient. */
template <typename T>
__global__ void _MinimumBGrad(
const int count,
const T* x1,
const T x2,
const T* dy,
T* dx1) {
CUDA_1D_KERNEL_LOOP(idx, count) {
dx1[idx] = (x1[idx] < x2) ? dy[idx] : 0;
}
}
/*! MinimumBGrad<float, CUDAContext>: launch _MinimumBGrad on the context's
    stream. (Removed a stray old-column closing brace left by the diff
    rendering on the final line.) */
template <> void MinimumBGrad<float, CUDAContext>(
    const int count,
    const float* x1,
    const float x2,
    const float* dy,
    float* dx1,
    /* float* dx2, */
    CUDAContext* ctx) {
    _MinimumBGrad<float>
        << < CUDA_BLOCKS(count), CUDA_THREADS,
            0, ctx->cuda_stream() >> >(count, x1, x2, dy, dx1);
}
/******************** control_flow.compare ********************/ /******************** control_flow.compare ********************/
...@@ -825,6 +1045,145 @@ template<> void AbsGrad<float, CUDAContext>( ...@@ -825,6 +1045,145 @@ template<> void AbsGrad<float, CUDAContext>(
0, ctx->cuda_stream() >> >(count, dy, dx); 0, ctx->cuda_stream() >> >(count, dy, dx);
} }
/******************** loss.nll_loss ********************/
/*! NLLLoss forward kernel: one thread per prediction (outer * inner);
    ignored labels produce losses = flags = 0, others losses = -log_prob
    at the target class and flags = 1. */
template <typename Tx, typename Ty>
__global__ void _NLLLoss(
const int count,
const int axis_dim,
const int inner_dim,
const Tx* log_prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
Tx* losses,
Tx* flags) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int oix = idx / inner_dim;
const int iix = idx % inner_dim;
const int label = labels[oix * inner_dim + iix];
int k;
for (k = 0; k < num_ignores; k++) {
if (label == ignores[k]) {
losses[idx] = flags[idx] = 0;
break;
}
}
if (k == num_ignores) {
losses[idx] = -log_prob[
(oix * axis_dim + label) * inner_dim + iix];
flags[idx] = 1;
}
}
}
template <> void NLLLoss<float, float, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float* log_prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_NLLLoss<float, float>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
log_prob, labels, ignores,
num_ignores, losses, flags);
}
template <> void NLLLoss<float, int64_t, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float* log_prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_NLLLoss<float, int64_t>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
log_prob, labels, ignores,
num_ignores, losses, flags);
}
/*! NLLLoss backward kernel: writes -1 at each non-ignored target position.
    Unlike the CPU path (which accumulates a single count in flags[0]),
    this kernel sets a per-prediction flag of 0/1 — presumably reduced by
    the caller; confirm against the operator code.
    NOTE(review): dx is not zeroed here — assumed pre-cleared by the caller. */
template <typename Tx, typename Ty>
__global__ void _NLLLossGrad(
const int count,
const int axis_dim,
const int inner_dim,
const Tx* log_prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
Tx* dx,
Tx* flags) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int oix = idx / inner_dim;
const int iix = idx % inner_dim;
const int label = labels[oix * inner_dim + iix];
int k;
for (k = 0; k < num_ignores; k++)
if (label == ignores[k]) break;
if (k != num_ignores) {
flags[idx] = 0;
} else {
dx[(oix * axis_dim + label) * inner_dim + iix] = -1;
flags[idx] = 1;
}
}
}
template<> void NLLLossGrad<float, float, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float* log_prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* dx,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_NLLLossGrad<float, float>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
log_prob, labels, ignores,
num_ignores, dx, flags);
}
template<> void NLLLossGrad<float, int64_t, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float* log_prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* dx,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_NLLLossGrad<float, int64_t>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
log_prob, labels, ignores,
num_ignores, dx, flags);
}
/******************** loss.sigmoid_cross_entropy ********************/ /******************** loss.sigmoid_cross_entropy ********************/
template <typename T> template <typename T>
...@@ -2856,8 +3215,7 @@ __global__ void _BiasAdd_NCHW( ...@@ -2856,8 +3215,7 @@ __global__ void _BiasAdd_NCHW(
const T* bias, const T* bias,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(idx, count) { CUDA_1D_KERNEL_LOOP(idx, count) {
const int bias_idx = (idx / inner_dim) % dim; y[idx] += bias[(idx / inner_dim) % dim];
y[idx] += bias[bias_idx];
} }
} }
...@@ -3395,6 +3753,95 @@ template <> void Col2Im2d<float, CUDAContext>( ...@@ -3395,6 +3753,95 @@ template <> void Col2Im2d<float, CUDAContext>(
} else LOG(FATAL) << "Unknown data format: " << data_format; } else LOG(FATAL) << "Unknown data format: " << data_format;
} }
/******************** vision.drop_block ********************/
/*! One thread per seed; a seed below `thresh` clears its
    block_size x block_size square of the NCHW mask via atomicAnd
    (blocks from neighboring seeds may overlap).
    NOTE(review): template parameter T is unused — mask is hard-typed int. */
template <typename T>
__global__ void _DropBlock2d_NCHW(
const int count,
const int C,
const int H,
const int W,
const int seed_h,
const int seed_w,
const int block_size,
const uint32_t thresh,
const uint32_t* seed,
int* mask) {
CUDA_1D_KERNEL_LOOP(idx, count) {
if (seed[idx] < thresh) {
const int x = idx % seed_w;
const int y = (idx / seed_w) % seed_h;
const int c = (idx / seed_w / seed_h) % C;
const int n = (idx / seed_w / seed_h) / C;
const int nc = (n * C + c) * H;
for (int i = 0; i < block_size; ++i) {
const int nch = (nc + y + i) * W;
for (int j = 0; j < block_size; ++j)
atomicAnd(&mask[nch + x + j], 0);
}
}
}
}
/*! Same as above but indexes the mask with NHWC layout. */
template <typename T>
__global__ void _DropBlock2d_NHWC(
const int count,
const int C,
const int H,
const int W,
const int seed_h,
const int seed_w,
const int block_size,
const uint32_t thresh,
const uint32_t* seed,
int* mask) {
CUDA_1D_KERNEL_LOOP(idx, count) {
if (seed[idx] < thresh) {
const int x = idx % seed_w;
const int y = (idx / seed_w) % seed_h;
const int c = (idx / seed_w / seed_h) % C;
const int n = (idx / seed_w / seed_h) / C;
for (int i = 0; i < block_size; ++i) {
const int nh = (n * H + y + i) * W;
for (int j = 0; j < block_size; ++j)
atomicAnd(&mask[(nh + x + j) * C + c], 0);
}
}
}
}
/*! DropBlock2d (CUDA): draw uniform seeds in [0, UINT_MAX]; a seed fires
    with probability gamma (seed < UINT_MAX * gamma), mirroring the CPU
    Bernoulli(gamma) path. */
template <> void DropBlock2d<CUDAContext>(
const int N,
const int C,
const int H,
const int W,
const int seed_h,
const int seed_w,
const int block_size,
const float gamma,
const string& data_format,
uint32_t* seed,
int* mask,
CUDAContext* ctx) {
const int count = N * C * seed_h * seed_w;
math::RandomUniform<uint32_t, CUDAContext>(
count, 0.f, float(UINT_MAX), seed, ctx);
auto thresh = static_cast<uint32_t>(UINT_MAX * gamma);
if (data_format == "NCHW") {
_DropBlock2d_NCHW<int>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
C, H, W, seed_h, seed_w, block_size,
thresh, seed, mask);
} else if(data_format == "NHWC") {
_DropBlock2d_NHWC<int>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
C, H, W, seed_h, seed_w, block_size,
thresh, seed, mask);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
/******************** vision.nn_resize ********************/ /******************** vision.nn_resize ********************/
template <typename T> template <typename T>
......
...@@ -13,9 +13,77 @@ namespace dragon { ...@@ -13,9 +13,77 @@ namespace dragon {
namespace kernel { namespace kernel {
/******************** activation.dropout ********************/
/*! fp16 dropout kernel: derive the uint8 keep-mask from the random uint32
    draw (> thresh keeps), then y = x * mask * scale in half precision.
    NOTE(review): the body is compiled out below sm_53, leaving mask8/y
    unwritten on such devices. */
__global__ void _DropoutHalf(
const int count,
const uint32_t thresh,
const half scale,
const half* x,
const uint32_t* mask32,
uint8_t* mask8,
half* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
mask8[idx] = (mask32[idx] > thresh);
y[idx] = __hmul(__hmul(x[idx], scale),
__float2half((float)mask8[idx]));
#endif
}
}
/*! Dropout<float16>: fill mask32 with uniform draws in [0, UINT_MAX],
    then launch _DropoutHalf with thresh = UINT_MAX * prob. */
template<> void Dropout<float16, CUDAContext>(
const int count,
float prob,
float scale,
const float16* x,
uint32_t* mask32,
uint8_t* mask8,
float16* y,
CUDAContext* ctx) {
math::RandomUniform<uint32_t, CUDAContext>(
count, float(0), float(UINT_MAX), mask32, ctx);
auto thresh = static_cast<uint32_t>(UINT_MAX * prob);
_DropoutHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
thresh, dragon_cast<half, float>(scale),
reinterpret_cast<const half*>(x),
mask32, mask8, reinterpret_cast<half*>(y));
}
/*! fp16 mask application kernel: y = x * mask * scale (half math). */
template <typename Tm>
__global__ void _ApplyMaskHalf(
const int count,
const half scale,
const half* x,
const Tm* mask,
half* y) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(__hmul(x[idx], scale),
__float2half((float)mask[idx]));
#endif
}
}
/*! ApplyMask<float16>: re-apply a saved uint8 mask (dropout backward). */
template <> void ApplyMask<float16, uint8_t, CUDAContext>(
const int count,
const float scale,
const float16* x,
const uint8_t* mask,
float16* y,
CUDAContext* ctx) {
_ApplyMaskHalf<uint8_t>
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(scale),
reinterpret_cast<const half*>(x),
mask, reinterpret_cast<half*>(y));
}
/******************** activation.relu ********************/ /******************** activation.relu ********************/
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _ReluHalf( __global__ void _ReluHalf(
const int count, const int count,
...@@ -45,7 +113,6 @@ __global__ void _ReluHalf2( ...@@ -45,7 +113,6 @@ __global__ void _ReluHalf2(
#endif #endif
} }
} }
#endif
template<> void Relu<float16, CUDAContext>( template<> void Relu<float16, CUDAContext>(
const int count, const int count,
...@@ -53,8 +120,7 @@ template<> void Relu<float16, CUDAContext>( ...@@ -53,8 +120,7 @@ template<> void Relu<float16, CUDAContext>(
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16 if ((count & 1) == 0) {
if ((count & 1) == 0 == 0) {
_ReluHalf2<half2> _ReluHalf2<half2>
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS, << < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> > (count >> 1, 0, ctx->cuda_stream() >> > (count >> 1,
...@@ -69,14 +135,10 @@ template<> void Relu<float16, CUDAContext>( ...@@ -69,14 +135,10 @@ template<> void Relu<float16, CUDAContext>(
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** arithmetic.affine ********************/ /******************** arithmetic.affine ********************/
#ifdef WITH_CUDA_FP16
template <typename T> template <typename T>
__global__ void _AffineWithOBiasHalf( __global__ void _AffineWithOBiasHalf(
const int count, const int count,
...@@ -112,7 +174,6 @@ __global__ void _AffineWithBiasHalf( ...@@ -112,7 +174,6 @@ __global__ void _AffineWithBiasHalf(
#endif #endif
} }
} }
#endif
template<> void Affine<float16, CUDAContext>( template<> void Affine<float16, CUDAContext>(
const int count, const int count,
...@@ -125,7 +186,6 @@ template<> void Affine<float16, CUDAContext>( ...@@ -125,7 +186,6 @@ template<> void Affine<float16, CUDAContext>(
const float16* beta_multiplier, const float16* beta_multiplier,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (beta != nullptr) { if (beta != nullptr) {
_AffineWithBiasHalf<float> _AffineWithBiasHalf<float>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
...@@ -144,9 +204,151 @@ template<> void Affine<float16, CUDAContext>( ...@@ -144,9 +204,151 @@ template<> void Affine<float16, CUDAContext>(
reinterpret_cast<const half*>(alpha), reinterpret_cast<const half*>(alpha),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
#else }
CUDA_FP16_NOT_COMPILED;
/******************** loss.nll_loss ********************/
/*! fp16 NLLLoss forward kernel: same logic as _NLLLoss, with the lookup
    negated in half precision and widened to float for the losses output.
    NOTE(review): body compiled out below sm_53 — outputs stay unwritten. */
template <typename Ty>
__global__ void _NLLLossHalf(
const int count,
const int axis_dim,
const int inner_dim,
const half* log_prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int oix = idx / inner_dim;
const int iix = idx % inner_dim;
const int label = labels[oix * inner_dim + iix];
int k;
for (k = 0; k < num_ignores; k++) {
if (label == ignores[k]) {
losses[idx] = flags[idx] = 0;
break;
}
}
if (k == num_ignores) {
losses[idx] = __half2float(__hneg(
log_prob[(oix * axis_dim + label) * inner_dim + iix]));
flags[idx] = 1;
}
#endif
}
}
template <> void NLLLoss<float16, float, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* log_prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_NLLLossHalf<float>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(log_prob), labels,
ignores, num_ignores, losses, flags);
}
template <> void NLLLoss<float16, int64_t, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* log_prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_NLLLossHalf<int64_t>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(log_prob), labels,
ignores, num_ignores, losses, flags);
}
/*! fp16 NLLLoss backward kernel: -1 (as half) at each non-ignored target,
    per-prediction 0/1 flags. NOTE(review): dx assumed pre-cleared; body
    compiled out below sm_53. */
template <typename Ty>
__global__ void _NLLLossGradHalf(
const int count,
const int axis_dim,
const int inner_dim,
const half* log_prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
half* dx,
float* flags) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int oix = idx / inner_dim;
const int iix = idx % inner_dim;
const int label = labels[oix * inner_dim + iix];
int k;
for (k = 0; k < num_ignores; k++)
if (label == ignores[k]) break;
if (k != num_ignores) {
flags[idx] = 0;
} else {
dx[(oix * axis_dim + label) * inner_dim + iix] = __float2half(-1.);
flags[idx] = 1;
}
#endif
}
}
template<> void NLLLossGrad<float16, float, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* log_prob,
const float* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_NLLLossGradHalf<float>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(log_prob), labels,
ignores, num_ignores,
reinterpret_cast<half*>(dx), flags);
}
/*! NLLLossGrad specialization: fp16 gradients with int64 labels.
 *  Launches the half-precision gradient kernel with one thread per
 *  (outer, inner) prediction; mirrors the float-label variant.
 *  Fix: dropped the duplicated closing brace that trailed this
 *  definition — the function closes with a single brace, matching
 *  every sibling specialization above. */
template<> void NLLLossGrad<float16, int64_t, CUDAContext>(
    const int               outer_dim,
    const int               axis_dim,
    const int               inner_dim,
    const float16*          log_prob,
    const int64_t*          labels,
    const int*              ignores,
    const int               num_ignores,
    float16*                dx,
    float*                  flags,
    CUDAContext*            ctx) {
    // One kernel thread per prediction.
    const int num_preds = outer_dim * inner_dim;
    _NLLLossGradHalf<int64_t>
        << < CUDA_BLOCKS(num_preds), CUDA_THREADS,
             0, ctx->cuda_stream() >> >(
            num_preds, axis_dim, inner_dim,
                reinterpret_cast<const half*>(log_prob), labels,
                    ignores, num_ignores,
                        reinterpret_cast<half*>(dx), flags);
}
/******************** loss.sparse_softmax_cross_entropy ********************/ /******************** loss.sparse_softmax_cross_entropy ********************/
...@@ -304,11 +506,11 @@ template<> void SparseSoftmaxCrossEntropyGrad<float16, int64_t, CUDAContext>( ...@@ -304,11 +506,11 @@ template<> void SparseSoftmaxCrossEntropyGrad<float16, int64_t, CUDAContext>(
reinterpret_cast<const half*>(prob), labels, reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores, ignores, num_ignores,
reinterpret_cast<half*>(dx), flags); reinterpret_cast<half*>(dx), flags);
} }
/******************** misc.astype ********************/ /******************** misc.astype ********************/
#ifdef WITH_CUDA_FP16
__global__ void _TypeHalf2Float( __global__ void _TypeHalf2Float(
const int count, const int count,
const half* a, const half* a,
...@@ -334,7 +536,6 @@ __global__ void _TypeHalf2Half( ...@@ -334,7 +536,6 @@ __global__ void _TypeHalf2Half(
b[idx] = a[idx]; b[idx] = a[idx];
} }
} }
#endif
#define DEFINE_TYPE_DISABLE_FP16(type) \ #define DEFINE_TYPE_DISABLE_FP16(type) \
template <> void TypeA2B<float16, type, CUDAContext>( \ template <> void TypeA2B<float16, type, CUDAContext>( \
...@@ -376,7 +577,6 @@ __global__ void _TypeHalf2Half( ...@@ -376,7 +577,6 @@ __global__ void _TypeHalf2Half(
a, reinterpret_cast<half*>(b)); \ a, reinterpret_cast<half*>(b)); \
} }
#ifdef WITH_CUDA_FP16
template <> void TypeA2B<float16, float16, CUDAContext>( template <> void TypeA2B<float16, float16, CUDAContext>(
const int count, const int count,
const float16* a, const float16* a,
...@@ -388,29 +588,15 @@ template <> void TypeA2B<float16, float16, CUDAContext>( ...@@ -388,29 +588,15 @@ template <> void TypeA2B<float16, float16, CUDAContext>(
reinterpret_cast<const half*>(a), reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b)); reinterpret_cast<half*>(b));
} }
DEFINE_TYPE_ENABLE_FP16_FP32; DEFINE_TYPE_ENABLE_FP16_FP32;
DEFINE_TYPE_DISABLE_FP16(double); DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int); DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t); DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t); DEFINE_TYPE_DISABLE_FP16(uint8_t);
#else
template <> void TypeA2B<float16, float16, CUDAContext>(
const int count,
const float16* a,
float16* b,
CUDAContext* ctx) {
LOG(FATAL) << "CUDAContext has not implemented: float16 -> float16";
}
DEFINE_TYPE_DISABLE_FP16(float);
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#endif
/******************** misc.image_data ********************/ /******************** misc.image_data ********************/
#ifdef WITH_CUDA_FP16
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NCHW( __global__ void _ImageDataHalf_NCHW(
const int count, const int count,
...@@ -453,7 +639,6 @@ __global__ void _ImageDataHalf_NHWC( ...@@ -453,7 +639,6 @@ __global__ void _ImageDataHalf_NHWC(
y[idx] = __float2half(raw_value); y[idx] = __float2half(raw_value);
} }
} }
#endif
template <> void ImageData<float, float16, CUDAContext>( template <> void ImageData<float, float16, CUDAContext>(
const int count, const int count,
...@@ -467,7 +652,6 @@ template <> void ImageData<float, float16, CUDAContext>( ...@@ -467,7 +652,6 @@ template <> void ImageData<float, float16, CUDAContext>(
const float* x, const float* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") { if (data_format == "NCHW") {
_ImageDataHalf_NCHW<float, half> _ImageDataHalf_NCHW<float, half>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
...@@ -481,9 +665,6 @@ template <> void ImageData<float, float16, CUDAContext>( ...@@ -481,9 +665,6 @@ template <> void ImageData<float, float16, CUDAContext>(
N, C, H, W, mean_values, std_values, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y)); x, reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format; } else LOG(FATAL) << "Unknown data format: " << data_format;
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
template <> void ImageData<uint8_t, float16, CUDAContext>( template <> void ImageData<uint8_t, float16, CUDAContext>(
...@@ -498,7 +679,6 @@ template <> void ImageData<uint8_t, float16, CUDAContext>( ...@@ -498,7 +679,6 @@ template <> void ImageData<uint8_t, float16, CUDAContext>(
const uint8_t* x, const uint8_t* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") { if (data_format == "NCHW") {
_ImageDataHalf_NCHW<uint8_t, half> _ImageDataHalf_NCHW<uint8_t, half>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
...@@ -512,9 +692,6 @@ template <> void ImageData<uint8_t, float16, CUDAContext>( ...@@ -512,9 +692,6 @@ template <> void ImageData<uint8_t, float16, CUDAContext>(
N, C, H, W, mean_values, std_values, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y)); x, reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format; } else LOG(FATAL) << "Unknown data format: " << data_format;
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** ndarray.concat ********************/ /******************** ndarray.concat ********************/
...@@ -549,7 +726,6 @@ template <> void Concat<float16, CUDAContext>( ...@@ -549,7 +726,6 @@ template <> void Concat<float16, CUDAContext>(
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_ConcatHalf<half> _ConcatHalf<half>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, 0, ctx->cuda_stream() >> >(count,
...@@ -557,9 +733,6 @@ template <> void Concat<float16, CUDAContext>( ...@@ -557,9 +733,6 @@ template <> void Concat<float16, CUDAContext>(
x_concat_dim, y_concat_dim, concat_offset, x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
template <typename T> template <typename T>
...@@ -592,7 +765,6 @@ template <> void ConcatGrad<float16, CUDAContext>( ...@@ -592,7 +765,6 @@ template <> void ConcatGrad<float16, CUDAContext>(
const float16* dy, const float16* dy,
float16* dx, float16* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_ConcatGradHalf<half> _ConcatGradHalf<half>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, 0, ctx->cuda_stream() >> >(count,
...@@ -600,9 +772,6 @@ template <> void ConcatGrad<float16, CUDAContext>( ...@@ -600,9 +772,6 @@ template <> void ConcatGrad<float16, CUDAContext>(
x_concat_dim, y_concat_dim, concat_offset, x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx)); reinterpret_cast<half*>(dx));
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** ndarray.transpose ********************/ /******************** ndarray.transpose ********************/
...@@ -636,16 +805,12 @@ template <> void Transpose<float16, CUDAContext>( ...@@ -636,16 +805,12 @@ template <> void Transpose<float16, CUDAContext>(
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_TransposeHalf<half> _TransposeHalf<half>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, 0, ctx->cuda_stream() >> >(count,
ndim, order, old_steps, new_steps, ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
template <typename T> template <typename T>
...@@ -677,21 +842,16 @@ template <> void TransposeGrad<float16, CUDAContext>( ...@@ -677,21 +842,16 @@ template <> void TransposeGrad<float16, CUDAContext>(
const float16* dy, const float16* dy,
float16* dx, float16* dx,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_TransposeGradHalf<half> _TransposeGradHalf<half>
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, 0, ctx->cuda_stream() >> >(count,
ndim, order, old_steps, new_steps, ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx)); reinterpret_cast<half*>(dx));
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** update.adam_update ********************/ /******************** update.adam_update ********************/
#ifdef WITH_CUDA_FP16
__global__ void _AdamUpdateHalf( __global__ void _AdamUpdateHalf(
const int count, const int count,
const half lr, const half lr,
...@@ -720,7 +880,6 @@ __global__ void _AdamUpdateHalf( ...@@ -720,7 +880,6 @@ __global__ void _AdamUpdateHalf(
#endif #endif
} }
} }
#endif
template <> void AdamUpdate<float16, CUDAContext>( template <> void AdamUpdate<float16, CUDAContext>(
const int count, const int count,
...@@ -732,7 +891,6 @@ template <> void AdamUpdate<float16, CUDAContext>( ...@@ -732,7 +891,6 @@ template <> void AdamUpdate<float16, CUDAContext>(
float16* m, float16* m,
float16* v, float16* v,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_AdamUpdateHalf _AdamUpdateHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, 0, ctx->cuda_stream() >> >(count,
...@@ -743,14 +901,10 @@ template <> void AdamUpdate<float16, CUDAContext>( ...@@ -743,14 +901,10 @@ template <> void AdamUpdate<float16, CUDAContext>(
reinterpret_cast<half*>(g), reinterpret_cast<half*>(g),
reinterpret_cast<half*>(m), reinterpret_cast<half*>(m),
reinterpret_cast<half*>(v)); reinterpret_cast<half*>(v));
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** update.nesterov_update ********************/ /******************** update.nesterov_update ********************/
#ifdef WITH_CUDA_FP16
__global__ void _NesterovUpdateHalf( __global__ void _NesterovUpdateHalf(
const int count, const int count,
const half lr, const half lr,
...@@ -794,7 +948,6 @@ __global__ void _NesterovUpdateHalf2( ...@@ -794,7 +948,6 @@ __global__ void _NesterovUpdateHalf2(
#endif #endif
} }
} }
#endif
template <> void NesterovUpdate<float16, CUDAContext>( template <> void NesterovUpdate<float16, CUDAContext>(
const int count, const int count,
...@@ -803,8 +956,7 @@ template <> void NesterovUpdate<float16, CUDAContext>( ...@@ -803,8 +956,7 @@ template <> void NesterovUpdate<float16, CUDAContext>(
float16* g, float16* g,
float16* h, float16* h,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16 if ((count & 1) == 0) {
if ((count & 1) == 0 == 0) {
_NesterovUpdateHalf2 _NesterovUpdateHalf2
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS, << < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count >> 1, 0, ctx->cuda_stream() >> >(count >> 1,
...@@ -821,14 +973,10 @@ template <> void NesterovUpdate<float16, CUDAContext>( ...@@ -821,14 +973,10 @@ template <> void NesterovUpdate<float16, CUDAContext>(
reinterpret_cast<half*>(g), reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h)); reinterpret_cast<half*>(h));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** update.rmsprop_update ********************/ /******************** update.rmsprop_update ********************/
#ifdef WITH_CUDA_FP16
__global__ void _RMSPropUpdateHalf( __global__ void _RMSPropUpdateHalf(
const int count, const int count,
const half lr, const half lr,
...@@ -851,7 +999,6 @@ __global__ void _RMSPropUpdateHalf( ...@@ -851,7 +999,6 @@ __global__ void _RMSPropUpdateHalf(
#endif #endif
} }
} }
#endif
template <> void RMSPropUpdate<float16, CUDAContext>( template <> void RMSPropUpdate<float16, CUDAContext>(
const int count, const int count,
...@@ -861,7 +1008,6 @@ template <> void RMSPropUpdate<float16, CUDAContext>( ...@@ -861,7 +1008,6 @@ template <> void RMSPropUpdate<float16, CUDAContext>(
float16* g, float16* g,
float16* h, float16* h,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_RMSPropUpdateHalf _RMSPropUpdateHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS, << < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count, 0, ctx->cuda_stream() >> >(count,
...@@ -870,14 +1016,10 @@ template <> void RMSPropUpdate<float16, CUDAContext>( ...@@ -870,14 +1016,10 @@ template <> void RMSPropUpdate<float16, CUDAContext>(
dragon_cast<half, float>(eps), dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g), reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h)); reinterpret_cast<half*>(h));
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
/******************** update.sgd_update ********************/ /******************** update.sgd_update ********************/
#ifdef WITH_CUDA_FP16
__global__ void _SGDUpdateHalf( __global__ void _SGDUpdateHalf(
const int count, const int count,
const half lr, const half lr,
...@@ -911,7 +1053,6 @@ __global__ void _SGDUpdateHalf2( ...@@ -911,7 +1053,6 @@ __global__ void _SGDUpdateHalf2(
#endif #endif
} }
} }
#endif
template <> void SGDUpdate<float16, CUDAContext>( template <> void SGDUpdate<float16, CUDAContext>(
const int count, const int count,
...@@ -920,8 +1061,7 @@ template <> void SGDUpdate<float16, CUDAContext>( ...@@ -920,8 +1061,7 @@ template <> void SGDUpdate<float16, CUDAContext>(
float16* g, float16* g,
float16* h, float16* h,
CUDAContext* ctx) { CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16 if ((count & 1) == 0) {
if ((count & 1) == 0 == 0) {
_SGDUpdateHalf2 _SGDUpdateHalf2
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS, << < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count >> 1, 0, ctx->cuda_stream() >> >(count >> 1,
...@@ -938,9 +1078,6 @@ template <> void SGDUpdate<float16, CUDAContext>( ...@@ -938,9 +1078,6 @@ template <> void SGDUpdate<float16, CUDAContext>(
reinterpret_cast<half*>(g), reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h)); reinterpret_cast<half*>(h));
} }
#else
CUDA_FP16_NOT_COMPILED;
#endif
} }
} // namespace kernel } // namespace kernel
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!