Commit 5cd0761b by Ting PAN

Unlock CUDA Async Streams

1 parent 3b990761
Showing with 3672 additions and 2829 deletions
......@@ -52,9 +52,9 @@ using Set = std::unordered_set<Value> ;
/*
* Define the Kernel version.
*
* | Major(2) | Minor(2) | Patch(10) |
* | Major(2) | Minor(2) | Patch(11) |
*/
#define DRAGON_VERSION 2210
#define DRAGON_VERSION 2211
/*
* Define the default random seed.
......
......@@ -34,6 +34,8 @@ class CPUContext {
virtual ~CPUContext() {}
inline void SwitchToDevice() {}
inline void SwitchToDevice(int stream_id) {}
inline void FinishDeviceCompution() {}
inline static void* New(size_t nbytes) {
......@@ -47,7 +49,15 @@ class CPUContext {
return data;
}
inline static void Memset(size_t nbytes, void* ptr) {
inline static void Memset(
size_t nbytes,
void* ptr) {
memset(ptr, 0, nbytes);
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
memset(ptr, 0, nbytes);
}
......@@ -59,18 +69,16 @@ class CPUContext {
memcpy(dst, src, nbytes);
}
inline static void Delete(void* data) { free(data); }
template<class DstContext, class SrcContext>
inline static void MemcpyAsync(
inline void MemcpyAsync(
size_t nbytes,
void* dst,
const void* src) {
NOT_IMPLEMENTED;
memcpy(dst, src, nbytes);
}
template<typename T, class DstContext, class SrcContext>
inline static void Copy(
inline void Copy(
int n,
T* dst,
const T* src) {
......@@ -82,7 +90,10 @@ class CPUContext {
else for (int i = 0; i < n; i++) dst[i] = src[i];
}
inline static void Delete(void* data) { free(data); }
inline int device_id() const { return 0; }
inline void set_stream_id(int stream_id) {}
inline std::mt19937* rand_generator() {
if (!rand_generator_.get())
......
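The CPUContext hunks above add instance-level MemsetAsync, MemcpyAsync, Copy, and set_stream_id so that CPU and CUDA contexts expose the same surface; on the CPU the "async" calls simply fall back to synchronous memset/memcpy. A minimal sketch of that idea, using toy names rather than Dragon's actual headers:

```cpp
#include <cstring>
#include <iostream>

// Toy stand-in for a CPU context: the "async" calls are synchronous,
// which keeps call sites identical to the CUDA path.
struct ToyCPUContext {
    void SwitchToDevice(int /*stream_id*/) {}
    void MemsetAsync(size_t nbytes, void* ptr) { std::memset(ptr, 0, nbytes); }
    void MemcpyAsync(size_t nbytes, void* dst, const void* src) {
        std::memcpy(dst, src, nbytes);
    }
    void FinishDeviceCompution() {}  // nothing to wait for on the CPU
};

// Device-generic helper: a CUDA-like context that maps these calls onto
// cudaMemsetAsync / cudaMemcpyAsync could be dropped in unchanged.
template <class Context>
void ZeroThenCopy(Context* ctx, float* dst, const float* src, int n) {
    ctx->MemsetAsync(n * sizeof(float), dst);
    ctx->MemcpyAsync(n * sizeof(float), dst, src);
    ctx->FinishDeviceCompution();
}

int main() {
    float src[4] = {1, 2, 3, 4}, dst[4];
    ToyCPUContext ctx;
    ZeroThenCopy(&ctx, dst, src, 4);
    std::cout << dst[0] << " " << dst[3] << std::endl;  // 1 4
}
```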
......@@ -23,8 +23,7 @@ namespace dragon {
class CUDAObject {
public:
CUDAObject(int default_stream = 1)
: default_stream(default_stream) {
CUDAObject() {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
cuda_streams[i] = vector<cudaStream_t>();
cublas_handles[i] = vector<cublasHandle_t>();
......@@ -38,7 +37,7 @@ class CUDAObject {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
for (int j = 0; j < cuda_streams[i].size(); j++) {
auto& stream = cuda_streams[i][j];
// follow caffe2, do not check the stream destroying
// follow caffe2: do not check errors when destroying the stream
// Error code 29 (driver shutting down) is inevitable
// TODO(PhyscalX): Can someone solve this issue?
if (stream) cudaStreamDestroy(stream);
......@@ -52,19 +51,21 @@ class CUDAObject {
}
}
/**
* Each device takes a group of streams.
*
* The stream 0 is reserved for default stream,
* stream 1 or higher is created as ``cudaStreamNonBlocking``.
*/
// follow caffe2,
// each device takes a group of non-blocking streams;
// stream 0 is reserved for the default stream,
// as some computations really require it,
// e.g. cublas.asum() and mixed cpu/cuda operations
// besides, some calls, such as cudnn.conv() and cudnn.rnn(),
// produce wrong results if running them on non-blocking streams
// note that caffe2 also uses default streams (within CuDNNState)
cudaStream_t GetStream(int device_id, int stream_id) {
vector<cudaStream_t>& dev_streams = cuda_streams[device_id];
if (dev_streams.size() <= (unsigned)stream_id)
dev_streams.resize(stream_id + 1, nullptr);
if (!dev_streams[stream_id]) {
DeviceGuard guard(device_id);
unsigned int flags = !stream_id && default_stream ?
unsigned int flags = !stream_id ?
cudaStreamDefault : cudaStreamNonBlocking;
CUDA_CHECK(cudaStreamCreateWithFlags(
&dev_streams[stream_id], flags));
......@@ -102,8 +103,6 @@ class CUDAObject {
}
#endif
int default_stream;
vector<cudaStream_t> cuda_streams[CUDA_MAX_DEVICES];
vector<cublasHandle_t> cublas_handles[CUDA_MAX_DEVICES];
#ifdef WITH_CUDNN
......@@ -129,11 +128,10 @@ class CUDAContext {
stream_id_ = stream_id;
}
inline void SwitchToDevice() { SwitchToDevice(0); }
inline void SwitchToDevice() { SwitchToDevice(1); }
inline void FinishDeviceCompution() {
cudaStreamSynchronize(cuda_object_
.GetStream(device_id_, stream_id_));
cudaStreamSynchronize(cuda_stream());
cudaError_t error = cudaGetLastError();
CHECK_EQ(error, cudaSuccess)
<< "\nCUDA Error: " << cudaGetErrorString(error);
......@@ -147,8 +145,17 @@ class CUDAContext {
return data;
}
inline static void Memset(size_t nbytes, void* ptr) {
cudaMemset(ptr, 0, nbytes);
inline static void Memset(
size_t nbytes,
void* ptr) {
CUDA_CHECK(cudaMemset(ptr, 0, nbytes));
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
CUDA_CHECK(cudaMemsetAsync(ptr, 0,
nbytes, cuda_stream()));
}
template<class DstContext, class SrcContext>
......@@ -169,20 +176,22 @@ class CUDAContext {
cudaMemcpyDefault, cuda_stream()));
}
inline static void Delete(void* data) { cudaFree(data); }
template<typename T, class DstContext, class SrcContext>
static void Copy(
inline void Copy(
int n,
T* dst,
const T* src) {
if (dst == src) return;
Memcpy<SrcContext, DstContext>(
MemcpyAsync<SrcContext, DstContext>(
n * sizeof(T), (void*)dst, (const void*)src);
}
inline static void Delete(void* data) { cudaFree(data); }
inline int device_id() const { return device_id_; }
inline void set_stream_id(int stream_id) { stream_id_ = stream_id; }
inline cudaStream_t cuda_stream() {
return cuda_stream(device_id_, stream_id_);
}
......@@ -227,7 +236,7 @@ class CUDAContext {
static thread_local CUDAObject cuda_object_;
private:
int device_id_, stream_id_ = 0, random_seed_;
int device_id_, stream_id_ = 1, random_seed_;
unique_ptr<std::mt19937> rand_generator_;
curandGenerator_t curand_generator_ = nullptr;
};
......@@ -271,7 +280,7 @@ class CUDAClosure {
protected:
Context* ctx_;
CUDAObject cuda_object_ = 0;
CUDAObject cuda_object_;
vector<int> active_streams_;
};
......@@ -283,8 +292,22 @@ class CUDAContext {
CUDAContext(const int device_id = 0) { CUDA_NOT_COMPILED; }
inline void SwitchToDevice() { CUDA_NOT_COMPILED; }
inline void SwitchToDevice(int stream_id) { CUDA_NOT_COMPILED; }
inline void FinishDeviceCompution() { CUDA_NOT_COMPILED; }
inline static void Memset(
size_t nbytes,
void* ptr) {
CUDA_NOT_COMPILED;
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
CUDA_NOT_COMPILED;
}
template<class DstContext, class SrcContext>
inline static void Memcpy(
size_t nbytes,
......@@ -302,6 +325,7 @@ class CUDAContext {
}
inline int device_id() const { return 0; }
inline void set_stream_id(int stream_id) {}
};
#endif // WITH_CUDA
......
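The CUDAObject changes drop the old default_stream switch: stream 0 of every device is created with cudaStreamDefault, stream 1 and above are created lazily with cudaStreamNonBlocking, and contexts now default to stream_id_ = 1. A self-contained sketch of that lazy per-device stream pool using only the CUDA runtime API (error handling simplified; names are illustrative):

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

#define CHECK_CUDA(expr)                                       \
    do {                                                       \
        cudaError_t e = (expr);                                \
        if (e != cudaSuccess)                                  \
            printf("CUDA error: %s\n", cudaGetErrorString(e)); \
    } while (0)

constexpr int kMaxDevices = 16;

// Lazily created streams, grouped per device.
// Index 0 uses cudaStreamDefault (blocking); 1+ use cudaStreamNonBlocking.
std::vector<cudaStream_t> g_streams[kMaxDevices];

cudaStream_t GetStream(int device_id, int stream_id) {
    auto& dev_streams = g_streams[device_id];
    if (dev_streams.size() <= (size_t)stream_id)
        dev_streams.resize(stream_id + 1, nullptr);
    if (!dev_streams[stream_id]) {
        int prev = 0;
        CHECK_CUDA(cudaGetDevice(&prev));
        CHECK_CUDA(cudaSetDevice(device_id));
        unsigned int flags =
            (stream_id == 0) ? cudaStreamDefault : cudaStreamNonBlocking;
        CHECK_CUDA(cudaStreamCreateWithFlags(&dev_streams[stream_id], flags));
        CHECK_CUDA(cudaSetDevice(prev));
    }
    return dev_streams[stream_id];
}

int main() {
    // Stream 1 is the new per-context default: work queued on it does not
    // implicitly synchronize with the legacy default stream.
    cudaStream_t s = GetStream(0, 1);
    CHECK_CUDA(cudaStreamSynchronize(s));
    return 0;
}
```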
......@@ -37,7 +37,8 @@ class GraphBase {
virtual bool Run(
const string& include,
const string& exclude) = 0;
const string& exclude,
const int stream_id = 1) = 0;
inline string name() const { return name_; }
......@@ -58,7 +59,8 @@ class Graph final : public GraphBase {
bool Run(
const string& include,
const string& exclude) override;
const string& exclude,
const int stream_id = 1) override;
GraphDef Prune(const GraphDef& meta_graph);
GraphDef MakeUpdate(const GraphDef& meta_graph);
......
......@@ -44,7 +44,7 @@ class OperatorBase {
const string& anchor);
inline void SwitchToPhase(const string& phase) { phase_ = phase; }
virtual void Run() { NOT_IMPLEMENTED; }
virtual void Run(int stream_id = 1) { NOT_IMPLEMENTED; }
inline const string& name() const { return def_.name(); }
inline const string& type() const { return def_.type(); }
......@@ -100,13 +100,13 @@ class Operator : public OperatorBase {
Output(0)->name() == "ignore"));
}
virtual void Run() final {
void Run(int stream_id = 1) final {
if (!allow_run_) return;
if (allow_recompute_) MakeResource();
ctx().SwitchToDevice();
ctx()->SwitchToDevice(stream_id);
MemorySwitch();
RunOnDevice();
if (do_sync_) ctx().FinishDeviceCompution();
if (do_sync_) ctx()->FinishDeviceCompution();
if (allow_recompute_) CleanResource();
}
......@@ -123,7 +123,7 @@ class Operator : public OperatorBase {
virtual void RunOnDevice() = 0;
inline Context& ctx() { return ctx_; }
inline Context* ctx() { return &ctx_; }
inline bool AllowRun() { return allow_run_; }
protected:
......@@ -192,6 +192,27 @@ DECLARE_REGISTRY(
const OperatorDef&,
Workspace*);
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
<< "\nTensor(" << tensor.name() << ") is empty. \n" \
<< "may be specify a filler for it ?"; \
tensor.Reshape(shape); \
unique_ptr< Filler<type, Context> > filler( \
CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, ctx()); \
ctx()->FinishDeviceCompution(); \
} else { \
TIndex count = 1; \
for(int i = 0; i < shape.size(); i++) count *= shape[i]; \
CHECK_EQ(count, tensor.count()) \
<< "\nModel request " << "Tensor(" << tensor.name() << ")'s " \
<< "size is " << count << ", \n" \
<< "but now is " << tensor.count() << ", " \
<< "did you feed the incorrect Tensor before ?"; \
tensor.Reshape(shape); \
}
#define TENSOR_FILL(tensor, shape) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
......@@ -200,7 +221,8 @@ DECLARE_REGISTRY(
tensor.Reshape(shape); \
unique_ptr< Filler<T, Context> > filler( \
CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, &ctx()); \
filler->Fill(&tensor, ctx()); \
ctx()->FinishDeviceCompution(); \
} else { \
TIndex count = 1; \
for(int i = 0; i < shape.size(); i++) count *= shape[i]; \
......@@ -217,7 +239,7 @@ DECLARE_REGISTRY(
if (size > ptr_tensor->count()) { \
ptr_tensor->Reshape({ size }); \
math::Set<T, Context>(size, dragon_cast<T, float>(1.f), \
ptr_tensor->template mutable_data<T, Context>()); \
ptr_tensor->template mutable_data<T, Context>(), ctx()); \
} \
}
......
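Operator::Run now accepts a stream id (default 1), binds the context to that stream before RunOnDevice, and synchronizes only when do_sync_ is set. A stripped-down sketch of that control flow with a toy context, just to show where the stream id and the optional sync sit (class and member names here are illustrative):

```cpp
#include <iostream>

struct ToyContext {
    int stream_id = 1;
    void SwitchToDevice(int sid) { stream_id = sid; }
    // On CUDA this would be cudaStreamSynchronize(cuda_stream()).
    void FinishDeviceCompution() { std::cout << "sync stream " << stream_id << "\n"; }
};

class ToyOperator {
 public:
    explicit ToyOperator(bool do_sync) : do_sync_(do_sync) {}
    virtual ~ToyOperator() = default;

    // Mirrors the structure of Operator<Context>::Run(int stream_id):
    // bind the stream -> enqueue the kernels -> optionally wait for them.
    void Run(int stream_id = 1) {
        ctx_.SwitchToDevice(stream_id);
        RunOnDevice();
        if (do_sync_) ctx_.FinishDeviceCompution();
    }

 protected:
    virtual void RunOnDevice() {
        std::cout << "enqueue work on stream " << ctx_.stream_id << "\n";
    }
    ToyContext ctx_;
    bool do_sync_;
};

int main() {
    ToyOperator op(/*do_sync=*/false);
    op.Run();    // async on stream 1
    op.Run(0);   // explicitly use the default stream
}
```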
......@@ -74,7 +74,9 @@ class Tensor {
for (TIndex i = start; i < end; i++) ret *= dim(i);
return ret;
}
inline TIndex count() const { return size_; }
inline TIndex count(const TIndex start) const {
return count(start, ndim());
}
......@@ -115,14 +117,14 @@ class Tensor {
inline void Corrupt() { is_corrupted_ = true; }
inline bool has_memory() const {
return memory_ || ex_memory_ != nullptr;
return memory_ || ex_memory_ != nullptr;
}
MixedMemory* memory() const {
return own_mem_ ? memory_.get() : ex_memory_;
}
void set_memory(MixedMemory* mem) {
void set_memory(MixedMemory* mem) {
memory_.reset(mem); capacity_ = mem->nbytes();
}
......@@ -197,7 +199,7 @@ class Tensor {
mutable_data_ptr<Context>(&data_ptr);
// call the constructors
if (meta.ctor()) meta_.ctor()(data_ptr, size_);
capacity_ = size_ * meta.itemsize();
capacity_ = size_ * meta.itemsize(), require_init_ = true;
return data_ptr;
}
......@@ -225,6 +227,15 @@ class Tensor {
}
template <typename T, class Context>
T* mutable_data(Context* ctx) {
auto* data = mutable_data<T, Context>();
if (!require_init_) return data;
ctx->MemsetAsync(nbytes(), (void*)data);
require_init_ = false;
return data;
}
template <typename T, class Context>
const T* data() const {
CHECK(meta_ == TypeMeta::Make<T>())
<< "\nThe DType of Tensor(" << name() << ") is "
......@@ -234,27 +245,31 @@ class Tensor {
}
template <class Context>
inline void CopyFrom(const Tensor& other) {
inline void CopyFrom(const Tensor& other, Context* ctx) {
if ((void*)&other == (void*)this) return;
CHECK_EQ(size_, other.size_);
auto* src = other.template raw_data<Context>();
auto* dst = raw_mutable_data<Context>(other.meta_);
if (dst == src) return;
if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CPUContext>()) {
CPUContext::Memcpy<Context, Context>(nbytes(), dst, src);
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CUDAContext>()) {
CUDAContext::Memcpy<Context, Context>(nbytes(), dst, src);
}
ctx->template MemcpyAsync<Context, Context>(
nbytes(), dst, src);
require_init_ = false;
}
inline void Move(MixedMemory* mem) {
if (mem != nullptr) ex_memory_ = mem;
else ex_memory_ = new MixedMemory(TypeMeta::Make<float>(), 4);
own_mem_ = false;
if (mem != nullptr) {
ex_memory_ = mem;
require_init_ = false;
} else {
ex_memory_ = new MixedMemory(
TypeMeta::Make<float>(), 4);
require_init_ = true;
} own_mem_ = false;
}
inline void Share(MixedMemory* mem) { Move(mem); is_shared_ = true; }
inline void Share(MixedMemory* mem) {
Move(mem); is_shared_ = true;
require_init_ = false;
}
inline void Reset() {
size_ = capacity_ = 0;
......@@ -275,7 +290,7 @@ class Tensor {
shared_ptr<MixedMemory> memory_;
MixedMemory* ex_memory_ = nullptr;
bool is_corrupted_ = false, is_shared_ = false;
bool own_mem_ = true;
bool own_mem_ = true, require_init_ = true;
};
} // namespace dragon
......
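The new require_init_ flag appears to defer zero-initialization of freshly allocated storage to the first mutable_data(ctx) call, where it runs as a MemsetAsync on the caller's context; CopyFrom, Move, and Share clear or set the flag so fully overwritten or borrowed memory is never memset. A toy version of the same bookkeeping (names are illustrative):

```cpp
#include <cstdlib>
#include <cstring>
#include <iostream>

struct ToyCtx {
    // A CUDA context would forward to cudaMemsetAsync(ptr, 0, n, stream).
    void MemsetAsync(size_t nbytes, void* ptr) { std::memset(ptr, 0, nbytes); }
};

class ToyTensor {
 public:
    float* mutable_data(ToyCtx* ctx) {
        if (!data_) {                       // first allocation
            data_ = (float*)std::malloc(nbytes_);
            require_init_ = true;           // contents are garbage
        }
        if (require_init_) {                // zero once, on the caller's context
            ctx->MemsetAsync(nbytes_, data_);
            require_init_ = false;
        }
        return data_;
    }
    void CopyFrom(const ToyTensor& other, ToyCtx* /*ctx*/) {
        if (!data_) data_ = (float*)std::malloc(nbytes_);
        std::memcpy(data_, other.data_, nbytes_);  // a ctx->MemcpyAsync in Dragon
        require_init_ = false;  // fully overwritten, the lazy memset is skipped
    }
    ~ToyTensor() { std::free(data_); }

 private:
    size_t nbytes_ = 4 * sizeof(float);
    float* data_ = nullptr;
    bool require_init_ = true;
};

int main() {
    ToyCtx ctx;
    ToyTensor a, b;
    a.mutable_data(&ctx)[0] = 3.f;  // lazily zero-filled, then written
    b.CopyFrom(a, &ctx);            // the copy never pays for the zero fill
    std::cout << b.mutable_data(&ctx)[0] << std::endl;  // 3
}
```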
......@@ -179,29 +179,28 @@ class Workspace {
template <class Context>
inline vector<void*> caches(
const vector<size_t>& segments) {
TIndex total_size = 0;
for (auto& segment : segments) total_size += (TIndex)segment;
Tensor* cacheT = CreateTensor("/share/cache");
cacheT->Reshape({ total_size });
vector<void*> caches(segments.size());
caches[0] = cacheT->template mutable_data<uint8_t, Context>();
TIndex nbytes = 0;
for (auto& segment : segments) nbytes += (TIndex)segment;
Tensor* cache_t = CreateTensor("/share/cache");
cache_t->Reshape({ nbytes });
vector<void*> Bcaches(segments.size());
Bcaches[0] = cache_t->template mutable_data<uint8_t, Context>();
for (int i = 1; i < segments.size(); i++)
caches[i] = (uint8_t*)caches[i - 1] + segments[i - 1];
return caches;
Bcaches[i] = (uint8_t*)Bcaches[i - 1] + segments[i - 1];
return Bcaches;
}
template <typename T, class Context>
inline vector<T*> caches(
const vector<TIndex>& segments) {
TIndex total_count = 0;
for (auto& segment : segments) total_count += segment;
Tensor* cacheT = CreateTensor("/share/cache");
cacheT->Reshape({ total_count });
vector<T*> caches(segments.size());
caches[0] = cacheT->template mutable_data<T, Context>();
for (int i = 1; i < segments.size(); i++)
caches[i] = caches[i - 1] + segments[i - 1];
return caches;
vector<size_t> Tsegments;
for (auto& segment : segments)
Tsegments.emplace_back(segment * sizeof(T));
vector<void*> Bcaches = caches<Context>(Tsegments);
vector<T*> Tcaches(segments.size());
for (int i = 0; i < segments.size(); i++)
Tcaches[i] = (T*)Bcaches[i];
return Tcaches;
}
/******************** Operator ********************/
......@@ -259,11 +258,12 @@ class Workspace {
void RunGraph(
const string& graph_name,
const string& include,
const string& exclude) {
const string& exclude,
const int stream_id = 1) {
if (!graph_map_.count(graph_name))
LOG(FATAL) << "Graph(" << graph_name
<< ") does not exist.";
graph_map_[graph_name]->Run(include, exclude);
graph_map_[graph_name]->Run(include, exclude, stream_id);
}
vector<string> GetGraphs() {
......
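Workspace::caches reshapes a single "/share/cache" tensor to the total byte count and then carves it into segments by pointer arithmetic; the typed overload converts element counts into byte sizes and reuses the byte-level version. The carving is plain pointer math, sketched below with a raw buffer standing in for the shared cache tensor:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Split one contiguous byte buffer into consecutive segments.
// Mirrors the logic of Workspace::caches<Context>(const vector<size_t>&).
std::vector<void*> CarveSegments(uint8_t* base,
                                 const std::vector<size_t>& segments) {
    std::vector<void*> ptrs(segments.size());
    if (segments.empty()) return ptrs;
    ptrs[0] = base;
    for (size_t i = 1; i < segments.size(); ++i)
        ptrs[i] = (uint8_t*)ptrs[i - 1] + segments[i - 1];
    return ptrs;
}

int main() {
    // Typed version: element counts {2, 3} of float become byte sizes first.
    std::vector<size_t> counts = {2, 3};
    std::vector<size_t> bytes;
    size_t total = 0;
    for (size_t c : counts) { bytes.push_back(c * sizeof(float)); total += bytes.back(); }

    std::vector<uint8_t> cache(total);  // stands in for the "/share/cache" tensor
    auto ptrs = CarveSegments(cache.data(), bytes);
    printf("offset of segment 1: %ld bytes\n",
           (long)((uint8_t*)ptrs[1] - (uint8_t*)ptrs[0]));  // 8
}
```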
......@@ -36,7 +36,6 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
USE_OPERATOR_FUNCTIONS;
void SoftmaxRun();
void SoftmaxRunFP16();
void RunOnDevice() override;
template <typename Tx, typename Ty> void RunWithType();
......
......@@ -42,7 +42,7 @@ public:
// simply copy the dY to dX
Output(0)->ReshapeLike(Input(0));
if (Output(0)->name() != Input(-1).name())
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
}
};
......
......@@ -34,7 +34,6 @@ class L2NormOp final : public Operator<Context> {
TIndex axis, num_axes, end_axis;
float eps;
string mode;
bool across_inner;
Tensor* norm, buffer;
TIndex outer_dim, dim, inner_dim, spatial_dim;
};
......@@ -55,7 +54,6 @@ class L2NormGradientOp final : public Operator<Context> {
protected:
TIndex axis, num_axes, end_axis;
string mode;
bool across_inner;
Tensor* norm, buffer, buffer_inner;
TIndex outer_dim, dim, inner_dim;
};
......
......@@ -24,7 +24,7 @@ class AdamUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -43,10 +43,26 @@ class CollectiveUpdateOp final : public Operator<Context> {
void InitNCCL();
void RunOnDevice() override;
void MPIAllReduceWithFloat();
void NCCLAllReduceWithFloat();
void MPIBcastWithFloat();
void NCCLBcastWithFloat();
template <typename T> void MPIAllReduce(
Tensor* tensor,
MPI_Datatype dtype);
template <typename T> void MPIBcast(
Tensor* tensor,
MPI_Datatype dtype);
#ifdef WITH_MPI_NCCL
template <typename T> void NCCLAllReduce(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream);
template <typename T> void NCCLBcast(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream);
#endif
protected:
int comm_size, comm_rank, comm_root;
......
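CollectiveUpdateOp replaces its float-only helpers with templates that take the tensor plus an explicit MPI_Datatype (and, for NCCL, a ncclDataType_t and the CUDA stream). A hedged sketch of what such a templated in-place all-reduce can look like on the MPI side; the buffer, count, and communicator here are placeholders, not Dragon's members:

```cpp
#include <mpi.h>

// In-place sum all-reduce over a typed buffer; T and dtype must agree
// (e.g. float with MPI_FLOAT, double with MPI_DOUBLE).
template <typename T>
void AllReduceSum(T* data, int count, MPI_Datatype dtype, MPI_Comm comm) {
    MPI_Allreduce(MPI_IN_PLACE, data, count, dtype, MPI_SUM, comm);
}

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    float grad[4] = {1.f, 2.f, 3.f, 4.f};
    AllReduceSum(grad, 4, MPI_FLOAT, MPI_COMM_WORLD);  // sums across ranks
    MPI_Finalize();
    return 0;
}
```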
......@@ -24,7 +24,7 @@ class NesterovUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -24,7 +24,7 @@ class RMSPropUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -25,7 +25,7 @@ class SGDUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -35,13 +35,11 @@ class UpdateOpBase : public Operator<Context> {
void RunOnDevice() override;
template <typename T> void PreprocessRunWithType();
virtual void ComputeRunWithFloat() = 0;
virtual void ComputeRunWithFloat32() = 0;
virtual void ComputeRunWithFloat16() = 0;
virtual void ComputeRunWithFloat16() {
LOG(FATAL) << "This Updater does not support FP16.";
}
template <typename T> void UpdateRunWithType();
void UpdateRunWithFloat32();
void UpdateRunWithFloat16();
protected:
float lr_mult, decay_mult;
......
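UpdateOpBase renames ComputeRunWithFloat to ComputeRunWithFloat32 and turns ComputeRunWithFloat16 into a virtual with a fatal default, so updaters without an FP16 kernel no longer need their own stub. The pattern is ordinary virtual dispatch with a failing default, sketched here with toy classes:

```cpp
#include <cstdlib>
#include <iostream>

class ToyUpdateOpBase {
 public:
    virtual ~ToyUpdateOpBase() = default;
    virtual void ComputeRunWithFloat32() = 0;
    // Updaters that lack an FP16 kernel simply inherit this failure.
    virtual void ComputeRunWithFloat16() {
        std::cerr << "This Updater does not support FP16." << std::endl;
        std::abort();
    }
};

class ToySGDUpdateOp final : public ToyUpdateOpBase {
 public:
    void ComputeRunWithFloat32() override { std::cout << "sgd fp32 step\n"; }
    void ComputeRunWithFloat16() override { std::cout << "sgd fp16 step\n"; }
};

int main() {
    ToySGDUpdateOp op;
    ToyUpdateOpBase* base = &op;
    base->ComputeRunWithFloat32();
    base->ComputeRunWithFloat16();
}
```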
......@@ -80,7 +80,8 @@ class ConvOpBase : public Operator<Context> {
dilation[0], dilation[1],
data_format,
im,
col);
col,
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet";
}
template <typename T> void Col2Im(const T* col, T* im) {
......@@ -94,7 +95,8 @@ class ConvOpBase : public Operator<Context> {
dilation[0], dilation[1],
data_format,
col,
im);
im,
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet";
}
};
......
......@@ -19,6 +19,8 @@
namespace dragon {
#define HFLT_MIN 6.10e-5F
template <typename DestType, typename SrcType>
DestType dragon_cast(SrcType val);
......
......@@ -29,9 +29,17 @@ namespace dragon {
#ifdef WITH_CUDA
static const int CUDA_THREADS = 1024;
// We do have a server with 10 GPUs :-)
#define CUDA_MAX_DEVICES 10
// The number of cuda threads to use. We set it to
// 1024, which works for compute capability 2.x;
// set it to 512 if using compute capability 1.x.
const int CUDA_THREADS = 1024;
// The maximum number of blocks to use in the default kernel call. We set it to
// 65535, which would work for compute capability 2.x (where 65535 is the limit)
const int CUDA_MAX_BLOCKS = 65535;
// You really need an NVIDIA DGX-2 !!! :-)
#define CUDA_MAX_DEVICES 16
#define CUDA_VERSION_MIN(major, minor, patch) \
(CUDA_VERSION >= (major * 1000 + minor * 100 + patch))
......@@ -67,12 +75,16 @@ static const int CUDA_THREADS = 1024;
} while (0)
#endif // WITH_MPI_NCCL
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
i < n; i += blockDim.x * gridDim.x)
inline int CUDA_BLOCKS(const int N) {
return (N + CUDA_THREADS - 1) / CUDA_THREADS;
return std::max(
std::min(
(N + CUDA_THREADS - 1) / CUDA_THREADS,
CUDA_MAX_BLOCKS
), 1);
}
#if CUDA_VERSION_MAX(9, 0, 0)
......
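CUDA_BLOCKS now clamps the grid size to [1, CUDA_MAX_BLOCKS] instead of growing without bound, which is what makes the size_t-indexed, grid-stride CUDA_1D_KERNEL_LOOP safe for element counts larger than 65535 * 1024; the kernel launches elsewhere in this commit also pass an explicit shared-memory size (0) and the context's stream. A small sketch of the clamping arithmetic, with illustrative constant names:

```cpp
#include <algorithm>
#include <cstdio>

constexpr int kCudaThreads = 1024;
constexpr int kCudaMaxBlocks = 65535;

// Mirrors the updated CUDA_BLOCKS(): never more than kCudaMaxBlocks,
// never fewer than 1; a grid-stride kernel loop covers any remainder.
inline int NumBlocks(const int n) {
    return std::max(std::min((n + kCudaThreads - 1) / kCudaThreads,
                             kCudaMaxBlocks), 1);
}

int main() {
    printf("%d\n", NumBlocks(1));        // 1     (clamped up)
    printf("%d\n", NumBlocks(1 << 20));  // 1024
    printf("%d\n", NumBlocks(1 << 30));  // 65535 (clamped down; the size_t
                                         // grid-stride loop covers the rest)
    return 0;
}
```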
......@@ -44,6 +44,7 @@ template<> class CUDNNType<float> {
static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
static float oneval, zeroval;
static const void *one, *zero;
typedef float BNParamType;
};
template<> class CUDNNType<double> {
......@@ -51,6 +52,7 @@ template<> class CUDNNType<double> {
static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
static double oneval, zeroval;
static const void *one, *zero;
typedef double BNParamType;
};
#ifdef WITH_CUDA_FP16
......@@ -59,6 +61,7 @@ template<> class CUDNNType<float16> {
static const cudnnDataType_t type = CUDNN_DATA_HALF;
static float oneval, zeroval;
static const void *one, *zero;
typedef float BNParamType;
};
#endif
......
......@@ -40,7 +40,7 @@ class ConstantFiller final : public Filler<T, Context> {
void Fill(Tensor* tensor, Context* ctx) override {
math::Set<T, Context>(tensor->count(),
dragon_cast<T, float>(filler().value()),
tensor->mutable_data<T, Context>());
tensor->mutable_data<T, Context>(), ctx);
}
protected:
......@@ -71,11 +71,11 @@ class TruncatedNormalFiller final : public Filler<T, Context> {
void Fill(Tensor* tensor, Context* ctx) override {
// implementing this on the gpu is difficult
static CPUContext cpu_ctx;
static CPUContext cctx;
math::RandomTruncatedNormal<T, CPUContext>(tensor->count(),
filler().mean(), filler().std(),
filler().low(), filler().high(),
tensor->mutable_data<T, CPUContext>(), &cpu_ctx);
tensor->mutable_data<T, CPUContext>(), &cctx);
}
protected:
......
......@@ -36,7 +36,8 @@ template <typename T, class Context>
void Set(
const int n,
const T alpha,
T* x);
T* x,
Context* ctx);
template <typename T, class Context>
void RandomUniform(
......@@ -78,73 +79,84 @@ void Add(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Sub(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Mul(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Div(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Clip(
const int n,
const float low,
const float high,
T* x);
T* x,
Context* ctx);
template <typename T, class Context>
void Exp(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Log(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Square(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Sqrt(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Pow(
const int n,
const float alpha,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Inv(
const int n,
const float numerator,
const T* x,
T* y);
T* y,
Context* ctx);
/******************** Level-2 ********************/
......@@ -164,19 +176,21 @@ void Scale(
Context* ctx);
template <typename T, class Context>
T StridedDot(
void StridedDot(
const int n,
const T* a,
const int incx,
const T* b,
const int incy,
T* y,
Context* ctx);
template <typename T, class Context>
float Dot(
void Dot(
const int n,
const T* a,
const T* b,
T* y,
Context* ctx);
template<typename T, class Context>
......@@ -188,13 +202,15 @@ template<typename T, class Context>
void AddScalar(
const int n,
const float alpha,
T* y);
T* y,
Context* ctx);
template<typename T, class Context>
void MulScalar(
const int n,
const float alpha,
T* y);
T* y,
Context* ctx);
template<typename T, class Context>
void Axpy(
......
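Every math:: routine now threads a Context* through, and reductions such as Dot and StridedDot change from returning a host scalar to writing into an output pointer, presumably so the result can stay on the context's stream instead of forcing an implicit synchronization. A hedged CPU-side sketch of the new-style signature (types and names are illustrative, not Dragon's implementation):

```cpp
#include <cstdio>

struct ToyCPUContext {};  // stands in for the Context* threaded through math::

// Old style:  float Dot(n, a, b)          -> value must be ready on return.
// New style:  void  Dot(n, a, b, y, ctx)  -> result lands in *y, possibly async.
template <typename T>
void Dot(int n, const T* a, const T* b, T* y, ToyCPUContext* /*ctx*/) {
    T acc = T(0);
    for (int i = 0; i < n; ++i) acc += a[i] * b[i];
    *y = acc;  // a GPU version could write into a device pointer on ctx's stream
}

int main() {
    float a[3] = {1, 2, 3}, b[3] = {4, 5, 6}, y = 0;
    ToyCPUContext ctx;
    Dot(3, a, b, &y, &ctx);
    printf("%g\n", y);  // 32
}
```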
......@@ -49,7 +49,8 @@ void Elu(
const int count,
const float alpha,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void EluGrad(
......@@ -57,7 +58,8 @@ void EluGrad(
const float alpha,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** activation.prelu ********************/
......@@ -70,7 +72,8 @@ void PRelu(
const string& data_format,
const T* x,
const T* w,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void PReluGrad(
......@@ -82,7 +85,8 @@ void PReluGrad(
const T* dy,
const T* x,
const T* w,
T* dx);
T* dx,
Context* ctx);
template <typename T, class Context>
void PReluWGrad(
......@@ -106,7 +110,8 @@ void Relu(
const int count,
const float slope,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ReluGrad(
......@@ -114,7 +119,8 @@ void ReluGrad(
const float slope,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** activation.selu ********************/
......@@ -122,14 +128,16 @@ template <typename T, class Context>
void SElu(
const int count,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SEluGrad(
const int count,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** activation.sigmoid ********************/
......@@ -137,14 +145,16 @@ template <typename T, class Context>
void Sigmoid(
const int count,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SigmoidGrad(
const int count,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** activation.softmax ********************/
......@@ -179,14 +189,16 @@ template <typename T, class Context>
void Tanh(
const int count,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void TanhGrad(
const int count,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** arithmetic.affine ********************/
......@@ -223,7 +235,8 @@ void Clip(
const float high,
const T* x,
T* mask,
T* y);
T* y,
Context* ctx);
/******************** control_flow.compare ********************/
......@@ -232,7 +245,8 @@ void Equal(
const int count,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
/******************** loss.l1_loss ********************/
......@@ -240,7 +254,8 @@ template <typename T, class Context>
void AbsGrad(
const int count,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** loss.sigmoid_cross_entropy ********************/
......@@ -301,14 +316,16 @@ void SmoothL1(
const int count,
const float beta,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SmoothL1Grad(
const int count,
const float beta,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** loss.softmax_cross_entropy ********************/
......@@ -317,7 +334,8 @@ void SoftmaxCrossEntropy(
const int count,
const T* prob,
const T* target,
T* loss);
T* loss,
Context* ctx);
/******************** loss.softmax_focal_loss ********************/
......@@ -366,8 +384,8 @@ void SparseSoftmaxCrossEntropy(
const Ty* labels,
const int* ignores,
const int num_ignores,
Tx* losses,
Tx* flags,
float* losses,
float* flags,
Context* ctx);
template <typename Tx, typename Ty, class Context>
......@@ -380,7 +398,7 @@ void SparseSoftmaxCrossEntropyGrad(
const int* ignores,
const int num_ignores,
Tx* dx,
Tx* flags,
float* flags,
Context* ctx);
/******************** misc.astype ********************/
......@@ -389,7 +407,8 @@ template <typename Ta, typename Tb, class Context>
void TypeA2B(
const int count,
const Ta* a,
Tb* b);
Tb* b,
Context* ctx);
/******************** misc.image_data ********************/
......@@ -404,7 +423,8 @@ void ImageData(
const float* std_values,
const string& data_format,
const Tx* x,
Ty* y);
Ty* y,
Context* ctx);
/******************** ndarray.arange ********************/
......@@ -413,7 +433,8 @@ void Arange(
const int count,
const int start,
const int step,
T* y);
T* y,
Context* ctx);
/******************** ndarray.argreduce ********************/
......@@ -425,7 +446,8 @@ void Argmax(
const int top_k,
const T* x,
int64_t* indices,
T* values);
T* values,
Context* ctx);
template <typename T, class Context>
void Argmin(
......@@ -435,7 +457,8 @@ void Argmin(
const int top_k,
const T* x,
int64_t* indices,
T* values);
T* values,
Context* ctx);
/******************** ndarray.gather ********************/
......@@ -443,7 +466,8 @@ template <typename T, class Context>
void CanonicalAxis(
const int count,
const int dim,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Gather(
......@@ -454,7 +478,8 @@ void Gather(
const int y_slice_dim,
const int* indices,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void GatherGrad(
......@@ -465,7 +490,8 @@ void GatherGrad(
const int y_slice_dim,
const int* indices,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.concat ********************/
......@@ -478,7 +504,8 @@ void Concat(
const int y_concat_dim,
const int concat_offset,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ConcatGrad(
......@@ -489,7 +516,8 @@ void ConcatGrad(
const int y_concat_dim,
const int concat_offset,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.crop ********************/
......@@ -501,7 +529,8 @@ void Crop1D(
const int inner_dim,
const int start,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Crop1DGrad(
......@@ -512,7 +541,8 @@ void Crop1DGrad(
const int start,
const int end,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.pad ********************/
......@@ -525,7 +555,8 @@ void ConstPad1D(
const int pad_l,
const float value,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ReflectPad1D(
......@@ -535,7 +566,8 @@ void ReflectPad1D(
const int inner_dim,
const int pad_l,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void EdgePad1D(
......@@ -545,7 +577,8 @@ void EdgePad1D(
const int inner_dim,
const int pad_l,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ConstPad1DGrad(
......@@ -555,7 +588,8 @@ void ConstPad1DGrad(
const int inner_dim,
const int pad_l,
const T* dy,
T* dx);
T* dx,
Context* ctx);
template <typename T, class Context>
void ReflectPad1DGrad(
......@@ -565,7 +599,8 @@ void ReflectPad1DGrad(
const int inner_dim,
const int pad_l,
const T* dy,
T* dx);
T* dx,
Context* ctx);
template <typename T, class Context>
void EdgePad1DGrad(
......@@ -575,7 +610,8 @@ void EdgePad1DGrad(
const int inner_dim,
const int pad_l,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.one_hot ********************/
......@@ -585,7 +621,8 @@ void OneHot(
const int depth,
const int on_value,
const T* x,
T* y);
T* y,
Context* ctx);
/******************** ndarray.reduce ********************/
......@@ -595,7 +632,8 @@ void Sum(
const int axis_dim,
const int inner_dim,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SumGrad(
......@@ -604,7 +642,8 @@ void SumGrad(
const int inner_dim,
const T coeff,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.repeat ********************/
......@@ -616,7 +655,8 @@ void Repeat(
const int inner_dim,
const int repeats,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void RepeatGrad(
......@@ -640,7 +680,8 @@ void Slice(
const int y_slice_dim,
const int slice_offset,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SliceGrad(
......@@ -651,7 +692,8 @@ void SliceGrad(
const int y_slice_dim,
const int slice_offset,
const T* dy,
T* x);
T* x,
Context* ctx);
/******************** ndarray.tile ********************/
......@@ -662,7 +704,8 @@ void Tile(
const int ex_inner_dim,
const int multiple,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void TileGrad(
......@@ -684,7 +727,8 @@ void Transpose(
const int* old_steps,
const int* new_steps,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void TransposeGrad(
......@@ -694,7 +738,8 @@ void TransposeGrad(
const int* old_steps,
const int* new_steps,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** recurrent.lstm_cell ********************/
......@@ -706,7 +751,8 @@ void LSTMCell(
const T* cx,
T* xact,
T* c,
T* h);
T* h,
Context* ctx);
template <typename T, class Context>
void LSTMCellGrad(
......@@ -719,7 +765,8 @@ void LSTMCellGrad(
const T* dc,
const T* dh,
T* dcx,
T* dx);
T* dx,
Context* ctx);
/******************** update.adam_update ********************/
......@@ -732,7 +779,8 @@ void AdamUpdate(
const float eps,
T* g,
T* m,
T* v);
T* v,
Context* ctx);
/******************** update.nesterov_update ********************/
......@@ -742,7 +790,8 @@ void NesterovUpdate(
const float lr,
const float momentum,
T* g,
T* h);
T* h,
Context* ctx);
/******************** update.rmsprop_update ********************/
......@@ -753,7 +802,8 @@ void RMSPropUpdate(
const float decay,
const float eps,
T* g,
T* h);
T* h,
Context* ctx);
/******************** update.sgd_update ********************/
......@@ -763,7 +813,8 @@ void SGDUpdate(
const float lr,
const float momentum,
T* g,
T* h);
T* h,
Context* ctx);
/******************** vision.bias_add ********************/
......@@ -792,7 +843,8 @@ void BilinearResize(
const int out_w,
const string& data_format,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void BilinearResizeGrad(
......@@ -805,7 +857,8 @@ void BilinearResizeGrad(
const int out_w,
const string& data_format,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** vision.conv ********************/
......@@ -826,7 +879,8 @@ void Im2Col2d(
const int dilation_w,
const string& data_format,
const T* im,
T* col);
T* col,
Context* ctx);
template <typename T, class Context>
void Col2Im2d(
......@@ -845,7 +899,8 @@ void Col2Im2d(
const int dilation_w,
const string& data_format,
const T* col,
T* im);
T* im,
Context* ctx);
/******************** vision.nn_resize ********************/
......@@ -860,7 +915,8 @@ void NNResize(
const int out_w,
const string& data_format,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void NNResizeGrad(
......@@ -873,7 +929,8 @@ void NNResizeGrad(
const int out_w,
const string& data_format,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** vision.pooling ********************/
......@@ -895,7 +952,8 @@ void MAXPooling2d(
const string& data_format,
const T* x,
int* mask,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void AVGPooling2d(
......@@ -914,7 +972,8 @@ void AVGPooling2d(
const int pad_w,
const string& data_format,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void MAXPooling2dGrad(
......@@ -934,7 +993,8 @@ void MAXPooling2dGrad(
const string& data_format,
const T* dy,
const int* mask,
T* dx);
T* dx,
Context* ctx);
template <typename T, class Context>
void AVGPooling2dGrad(
......@@ -953,7 +1013,8 @@ void AVGPooling2dGrad(
const int pad_w,
const string& data_format,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** vision.roi_pooling ********************/
......@@ -971,7 +1032,8 @@ void ROIPooling(
const T* x,
const T* rois,
int* mask,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ROIPoolingGrad(
......@@ -987,7 +1049,8 @@ void ROIPoolingGrad(
const T* dy,
const T* rois,
const int* mask,
T* dx);
T* dx,
Context* ctx);
/******************** vision.roi_align ********************/
......@@ -1005,7 +1068,8 @@ void ROIAlign(
const int sampling_ratio,
const T* x,
const T* rois,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ROIAlignGrad(
......@@ -1021,7 +1085,8 @@ void ROIAlignGrad(
const int sampling_ratio,
const float* dy,
const float* rois,
float* dx);
float* dx,
Context* ctx);
} // namespace kernel
......
......@@ -80,7 +80,7 @@ T Dot(
const T* b);
template<typename T>
T ASum(
T Sum(
const int n,
const T* x);
......
......@@ -15,6 +15,7 @@
#ifdef WITH_SSE
#include <immintrin.h>
#include <tmmintrin.h>
#include <cstdint>
namespace dragon {
......
......@@ -250,8 +250,9 @@ void LoadCaffemodel(
void RunGraph(
const std::string& graph_name,
Workspace* ws) {
ws->RunGraph(graph_name, "", "");
Workspace* ws,
const int stream_id) {
ws->RunGraph(graph_name, "", "", stream_id);
}
template <typename T>
......
......@@ -38,8 +38,7 @@ class Device {
EXPORT const int device_id() const { return device_id_; }
private:
int device_type_;
int device_id_;
int device_type_, device_id_;
};
EXPORT Workspace* CreateWorkspace(const std::string& name);
......@@ -61,7 +60,8 @@ EXPORT std::string CreateGraph(
EXPORT void RunGraph(
const std::string& graph_name,
Workspace* ws);
Workspace* ws,
const int stream_id = 1);
EXPORT void CreateTensor(
const std::string& name,
......
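The C++ front end gains a stream_id argument on RunGraph, defaulting to 1 so existing callers pick up the non-blocking stream automatically. A hedged usage sketch against the exported functions shown above; the header path, namespace qualification, and graph name are assumptions made for illustration:

```cpp
#include <string>
#include "dragon.h"  // assumed include path for the exported API above

int main() {
    // Assumes a graph named "GraphDef_1" was created earlier via CreateGraph(...)
    // and that the exported functions live in namespace dragon.
    dragon::Workspace* ws = dragon::CreateWorkspace("default");
    dragon::RunGraph("GraphDef_1", ws);     // stream_id defaults to 1 (non-blocking)
    dragon::RunGraph("GraphDef_1", ws, 0);  // explicitly run on the default stream
    return 0;
}
```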
......@@ -116,7 +116,7 @@ class NumpyFeeder : public TensorFeederBase {
#else
LOG(FATAL) << "CUDA was not compiled.";
#endif
} else{
} else {
CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
tensor->raw_mutable_data<CPUContext>(),
static_cast<void*>(PyArray_DATA(array)));
......
......@@ -18,18 +18,22 @@
PyObject* CreateGradientDefsCC(PyObject* self, PyObject* args) {
PyObject* def_string = nullptr;
PyObject* py_g_outputs = nullptr;
if (!PyArg_ParseTuple(args, "SO!", &def_string, &PyList_Type, &py_g_outputs)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef "
"and a list containing outputs of this GradientOp.");
if (!PyArg_ParseTuple(args, "SO!",
&def_string, &PyList_Type, &py_g_outputs)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef "
"and a list containing outputs of this GradientOp.");
return nullptr;
}
OperatorDef def;
if (!def.ParseFromString(PyBytes_AsStringEx(def_string))) {
PyErr_SetString(PyExc_ValueError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Failed to parse the OperatorDef.");
return nullptr;
}
if (!GradientRegistry()->Has(def.type())) {
PyErr_SetString(PyExc_KeyError, "This Operator does not register GradientOp.");
PyErr_SetString(PyExc_KeyError,
"This Operator does not register GradientOp.");
return nullptr;
}
vector<string> g_outputs;
......@@ -61,9 +65,10 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
PyObject* py_fp_ops, *py_targets;
PyObject* py_input_grads, *py_ignore_grads;
PyObject* py_share_grads, *py_export_graph;
if (!PyArg_ParseTuple(args, "OOOOOO", &py_fp_ops, &py_targets,
&py_input_grads, &py_ignore_grads,
&py_share_grads, &py_export_graph)) {
if (!PyArg_ParseTuple(args, "OOOOOO",
&py_fp_ops, &py_targets,
&py_input_grads, &py_ignore_grads,
&py_share_grads, &py_export_graph)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a list of serialized input ops, targets, "
"input grads, ignore grads and whehter to share grads or log graph.");
......@@ -84,8 +89,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
for (auto& grad : input_grads) maker.AddExternalGrad(grad);
for (auto& grad : ignore_grads) maker.AddIgnoreGrad(grad);
maker.Make(fp_ops, targets, bp_ops);
bool share_grads = (bool)PyObject_IsTrue(py_share_grads);
bool export_graph = (bool)PyObject_IsTrue(py_export_graph);
bool share_grads = PyObject_IsTrue(py_share_grads) ? true : false;
bool export_graph = PyObject_IsTrue(py_export_graph) ? true : false;
if (share_grads) maker.Share("/share/buffer/grads", bp_ops);
if (export_graph) {
Tensor* t = ws()->CreateTensor("/export/dynamic_graph/gradient_flow");
......
......@@ -17,7 +17,8 @@
inline PyObject* SetLogLevelCC(PyObject* self, PyObject* args) {
char* cname;
if (!PyArg_ParseTuple(args, "s", &cname)) {
PyErr_SetString(PyExc_ValueError, "Excepted the logging level.");
PyErr_SetString(PyExc_ValueError,
"Excepted the logging level.");
return nullptr;
}
SetLogDestination(StrToLogSeverity(string(cname)));
......
......@@ -17,16 +17,19 @@
inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
PyObject* graph_str;
if (!PyArg_ParseTuple(args, "S", &graph_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of GraphDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of GraphDef.");
return nullptr;
}
GraphDef graph_def;
if (!graph_def.ParseFromString(PyBytes_AsStringEx(graph_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the GraphDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the GraphDef.");
return nullptr;
}
if (!ws()->CreateGraph(graph_def)) {
PyErr_SetString(PyExc_RuntimeError, "Failed to create the Graph.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to create the Graph.");
return nullptr;
}
Py_RETURN_TRUE;
......@@ -34,11 +37,17 @@ inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
inline PyObject* RunGraphCC(PyObject* self, PyObject* args) {
char* cname, *include, *exclude;
if (!PyArg_ParseTuple(args, "sss", &cname, &include, &exclude)) {
PyErr_SetString(PyExc_ValueError, "Excepted the graph name, include and exclude rules.");
if (!PyArg_ParseTuple(args, "sss",
&cname, &include, &exclude)) {
PyErr_SetString(PyExc_ValueError,
"Excepted the graph name, include and exclude rules.");
return nullptr;
}
ws()->RunGraph(string(cname), string(include), string(exclude));
ws()->RunGraph(
string(cname),
string(include),
string(exclude)
);
Py_RETURN_TRUE;
}
......
......@@ -19,13 +19,13 @@ inline PyObject* SnapshotCC(PyObject* self, PyObject* args) {
char* path; int format;
PyObject* names; vector<Tensor*> tensors;
if (!PyArg_ParseTuple(args, "sOi", &path, &names, &format)) {
PyErr_SetString(PyExc_ValueError,
PyErr_SetString(PyExc_ValueError,
"Excepted the model path, tensors, and data format.");
return nullptr;
}
switch (format) {
case 0: // Pickle
PyErr_SetString(PyExc_NotImplementedError,
PyErr_SetString(PyExc_NotImplementedError,
"Format depends on Pickle. Can't be used in C++.");
break;
case 1: // CaffeModel
......@@ -42,13 +42,13 @@ inline PyObject* SnapshotCC(PyObject* self, PyObject* args) {
inline PyObject* RestoreCC(PyObject* self, PyObject* args) {
char* path; int format;
if (!PyArg_ParseTuple(args, "si", &path, &format)) {
PyErr_SetString(PyExc_ValueError,
PyErr_SetString(PyExc_ValueError,
"Excepted the model path and data format.");
return nullptr;
}
switch (format) {
case 0: // Pickle
PyErr_SetString(PyExc_NotImplementedError,
PyErr_SetString(PyExc_NotImplementedError,
"Format depends on Pickle. Can't be used in C++.");
break;
case 1: // CaffeModel
......
......@@ -46,7 +46,8 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
PyObject *incl, *excl, *ret;
int local_root, world_size;
if (!PyArg_ParseTuple(args, "iOO", &local_root, &incl, &excl)) {
PyErr_SetString(PyExc_ValueError, "Excepted the local root, include and exclued list.");
PyErr_SetString(PyExc_ValueError,
"Excepted the local root, include and exclued list.");
return nullptr;
}
MPI_Group world_group, local_group;
......
......@@ -37,12 +37,14 @@ inline PyObject* NoGradientOperatorsCC(PyObject* self, PyObject* args) {
inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
PyObject* op_str;
if (!PyArg_ParseTuple(args, "S", &op_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the OperatorDef.");
return nullptr;
}
ws()->RunOperator(op_def);
......@@ -52,7 +54,8 @@ inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
PyObject* py_ops;
if (!PyArg_ParseTuple(args, "O", &py_ops)) {
PyErr_SetString(PyExc_ValueError, "Excepted a list of serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a list of serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
......@@ -67,12 +70,14 @@ inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
PyObject* op_str;
if (!PyArg_ParseTuple(args, "S", &op_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the OperatorDef.");
return nullptr;
}
ws()->CreatePersistentOp(op_def);
......@@ -82,9 +87,11 @@ inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
inline PyObject* RunPersistentOpCC(PyObject* self, PyObject* args) {
char* key, *anchor;
PyObject* py_inputs, *py_outputs;
if (!PyArg_ParseTuple(args, "ssOO", &key, &anchor, &py_inputs, &py_outputs)) {
PyErr_SetString(PyExc_ValueError, "Excepted a persistent key, anchor, "
"list of inputs and outputs.");
if (!PyArg_ParseTuple(args, "ssOO",
&key, &anchor, &py_inputs, &py_outputs)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a persistent key, anchor, "
"list of inputs and outputs.");
return nullptr;
}
vector<string> inputs, outputs;
......
......@@ -39,12 +39,14 @@ inline PyObject* CreateTensorCC(PyObject* self, PyObject* args) {
inline PyObject* CreateFillerCC(PyObject* self, PyObject* args) {
PyObject* filler_string;
if (!PyArg_ParseTuple(args, "S", &filler_string)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of TensorFiller.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of TensorFiller.");
return nullptr;
}
TensorFiller filler_def;
if (!filler_def.ParseFromString(PyBytes_AsStringEx(filler_string))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the TensorFiller.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the TensorFiller.");
return nullptr;
}
ws()->CreateFiller(filler_def);
......@@ -60,7 +62,8 @@ inline PyObject* GetFillerTypeCC(PyObject* self, PyObject* args) {
inline PyObject* RenameTensorCC(PyObject* self, PyObject* args) {
char* ori_name, *tar_name;
if (!PyArg_ParseTuple(args, "ss", &ori_name, &tar_name)) {
PyErr_SetString(PyExc_ValueError, "Excepted the original and target name.");
PyErr_SetString(PyExc_ValueError,
"Excepted the original and target name.");
return nullptr;
}
if (!ws()->HasTensor(tar_name)) {
......@@ -77,7 +80,8 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
char* cname, *dtype;
PyObject* shape, *device_option = nullptr;
if (!PyArg_ParseTuple(args, "sOs|O", &cname, &shape, &dtype, &device_option)) {
PyErr_SetString(PyExc_ValueError, "Excepted the name, shape, dtype and optional device option.");
PyErr_SetString(PyExc_ValueError,
"Excepted the name, shape, dtype and optional device option.");
return nullptr;
}
const TypeMeta& meta = TypeStringToMeta(dtype);
......@@ -119,7 +123,8 @@ PyObject* TensorFromPyArrayCC(PyObject* self, PyObject* args) {
char* cname;
PyArrayObject* original_array = nullptr;
if (!PyArg_ParseTuple(args, "sO", &cname, &original_array)) {
PyErr_SetString(PyExc_ValueError, "Failed to create tensor from numpy.ndarray.\n"
PyErr_SetString(PyExc_ValueError,
"Failed to create tensor from numpy.ndarray.\n"
"Excepted the name and numpy.ndarray both.");
return nullptr;
}
......@@ -214,7 +219,8 @@ inline PyObject* TensorToPyArrayCC(PyObject* self, PyObject* args) {
return nullptr;
}
auto* data = tensor->raw_mutable_data<CPUContext>();
PyObject* array = PyArray_SimpleNewFromData(tensor->ndim(), dims.data(), npy_type, data);
PyObject* array = PyArray_SimpleNewFromData(
(int)tensor->ndim(), dims.data(), npy_type, data);
Py_XINCREF(array);
return array;
}
......
......@@ -30,6 +30,8 @@ class BlobFetcher(Process):
----------
batch_size : int
The size of a training batch.
dtype : str
The data type of the batch. Default is ``float32``.
partition : boolean
Whether to partition batch. Default is ``False``.
prefetch : int
......@@ -42,6 +44,7 @@ class BlobFetcher(Process):
"""
super(BlobFetcher, self).__init__()
self._batch_size = kwargs.get('batch_size', 100)
self._dtype = kwargs.get('dtype', 'float32')
self._partition = kwargs.get('partition', False)
self._mean_values = kwargs.get('mean_values', [])
self._scale = kwargs.get('scale', 1.0)
......@@ -68,7 +71,7 @@ class BlobFetcher(Process):
if ix != self._batch_size - 1: im, labels = self.Q_in.get()
# mean subtraction & numerical scale
im_blob = im_blob.astype(np.float32)
im_blob = im_blob.astype(self._dtype)
if len(self._mean_values) > 0:
im_blob -= self._mean_values
if self._scale != 1.0:
......
......@@ -70,6 +70,8 @@ class DataBatch(object):
The phase of this operator, ``TRAIN`` or ``TEST``. Default is ``TRAIN``.
batch_size : int
The size of a training batch.
dtype : str
The data type of the batch. Default is ``float32``.
partition : boolean
Whether to partition batch. Default is ``False``.
prefetch : int
......
......@@ -49,16 +49,14 @@ class DataReader(Process):
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1
self._part_idx = 0
self._part_idx, self._num_parts = 0, 1
self._cur_idx, self._cur_chunk_idx = 0, 0
self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None
self.daemon = True
......@@ -167,12 +165,13 @@ class DataReader(Process):
self._db.open(self._source)
self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1)
self._epoch_size = int(self._num_entries/ self._num_parts + 1)
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else:
if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size by chunks [For Chunk Shuffle]
......@@ -183,6 +182,11 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
limit = (self._num_parts - 0.5) * self._num_shuffle_parts * self._chunk_size
if self._num_entries <= limit:
# roll back to fully shuffle
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
......
......@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function
version = '0.2.2'
full_version = '0.2.2.10'
full_version = '0.2.2.11'
release = False
if not release:
......
......@@ -364,7 +364,7 @@ class BatchNormLayer(Layer):
var = Tensor(scope + '/param:1').Constant(value=0.0)
factor = Tensor(scope + '/param:2').Constant(value=0.0)
# in dragon, setting diff to None will skip computing its grad automatically
# but in bvlc-caffe1, you must set lr_mult = 0 manually
# but in bvlc-caffe, you must set lr_mult = 0 manually
self._blobs.append({'data': mean, 'diff': None})
self._blobs.append({'data': var, 'diff': None})
self._blobs.append({'data': factor, 'diff': None})
......
......@@ -20,7 +20,7 @@ from .arithmetic import (
from .ndarray import (
squeeze, unsqueeze,
sum, mean, argmin, argmax, max, topk,
sum, mean, argmin, argmax, max, min, topk,
cat, gather,
)
......
......@@ -13,7 +13,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar
from dragon.vm.torch.ops.factory import get_module
......@@ -26,7 +25,6 @@ def _fundamental(input, value, op='Add', out=None):
raise TypeError('Type of value should be numerical, got {}.'
.format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
module = get_module(Fundamental, key, ctx, op_type=op)
......
......@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.utils.data.io.data_reader import DataReader
from dragon.io.data_reader import DataReader
from dragon.vm.torch.utils.data.io.data_transformer import DataTransformer
......
......@@ -19,7 +19,7 @@ from multiprocessing import Queue
import dragon.core.mpi as mpi
from .data_reader import DataReader
from dragon.io.data_reader import DataReader
from .data_transformer import DataTransformer
from .blob_fetcher import BlobFetcher
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import numpy.random as npr
from multiprocessing import Process
import dragon.config as config
from dragon.tools.db import LMDB
class DataReader(Process):
"""DataReader is deployed to queue encoded str from `LMDB`_.
It adaptively partitions and shuffles records over all distributed nodes.
"""
def __init__(self, **kwargs):
"""Construct a ``DataReader``.
Parameters
----------
source : str
The path of database.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean
Whether to shuffle the data. Default is ``False``.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
The size(MB) of each chunk. Default is -1 (Refer ``num_chunks``).
"""
super(DataReader, self).__init__()
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1
self._part_idx = 0
self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None
self.daemon = True
def element(self):
"""Get the value of current record.
Returns
-------
str
The encoded str.
"""
return self._db.value()
def redirect(self, target_idx):
"""Redirect to the target position.
Parameters
----------
target_idx : int
The key of instance in ``LMDB``.
Returns
-------
None
Notes
-----
The redirection reopens the ``LMDB``.
You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``.
This helps avoid getting stuck when ``Database Size`` >> ``RAM Size``.
"""
self._db.close()
self._db.open(self._source)
self._cur_idx = target_idx
self._db.set(str(self._cur_idx).zfill(self._zfill))
def reset(self):
"""Reset the cursor and environment.
Returns
-------
None
"""
if self._multiple_nodes or self._use_shuffle:
if self._use_shuffle: self._perm = npr.permutation(self._num_shuffle_parts)
self._cur_chunk_idx = 0
self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx])
self._start_idx = int(self._start_idx * self._chunk_size)
if self._start_idx >= self._num_entries: self.next_chunk()
self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._num_entries, self._end_idx)
else:
self._start_idx = 0
self._end_idx = self._num_entries
self.redirect(self._start_idx)
def next_record(self):
"""Step the cursor of records.
Returns
-------
None
"""
self._cur_idx += 1
self._db.next()
def next_chunk(self):
"""Step the cursor of shuffling chunks.
Returns
-------
None
"""
self._cur_chunk_idx += 1
if self._cur_chunk_idx >= self._num_shuffle_parts: self.reset()
else:
self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx]
self._start_idx = self._start_idx * self._chunk_size
if self._start_idx >= self._num_entries: self.next_chunk()
else:
self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._num_entries, self._end_idx)
self.redirect(self._start_idx)
def run(self):
"""Start the process.
Returns
-------
None
"""
# fix seed
npr.seed(self._random_seed)
# init db
self._db = LMDB()
self._db.open(self._source)
self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1)
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
else:
if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size by chunks [For Chunk Shuffle]
max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
self._chunk_size = min_chunk_size
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._num_entries / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts)
# init env
self.reset()
# run
while True:
self.Q_out.put(self.element())
self.next_record()
if self._cur_idx >= self._end_idx:
if self._multiple_nodes or \
self._use_shuffle: self.next_chunk()
else: self.reset()
\ No newline at end of file
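For context, a minimal sketch of how this reader is typically driven, assuming ``DataReader`` subclasses ``multiprocessing.Process`` (as the ``daemon`` flag and ``run()`` override suggest); the LMDB path, queue size, and record count below are illustrative, not part of this change:

import multiprocessing as mp

# hypothetical wiring; only DataReader itself comes from the module patched above
reader = DataReader(source='/data/train_lmdb', shuffle=True, num_chunks=2048)
reader.Q_out = mp.Queue(maxsize=1024)  # run() put()s one encoded record per step
reader.start()                         # child process: seeds numpy, opens the LMDB, then loops

records = [reader.Q_out.get() for _ in range(64)]  # consume serialized records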
......@@ -42,7 +42,7 @@ find_modules()
setup(name = 'dragon',
version='0.2.2.10',
version='0.2.2.11',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon',
author='Ting Pan',
......
......@@ -19,7 +19,8 @@ template <> void GenerateProposals<float, CPUContext>(
const float* scores,
const float* bbox_deltas,
const float* anchors,
float* proposals) {
float* proposals,
CPUContext* ctx) {
float* proposal = proposals;
const int K = feat_h * feat_w;
for (int h = 0; h < feat_h; ++h) {
......@@ -57,7 +58,8 @@ template <> void GenerateProposals_v2<float, CPUContext>(
const float min_box_w,
const float* scores,
const float* bbox_deltas,
float* proposals) {
float* proposals,
CPUContext* ctx) {
float* proposal = proposals;
for (int i = 0; i < total_anchors; ++i) {
// bbox_deltas: [1, 4, total_anchors]
......@@ -98,7 +100,8 @@ template <> void ApplyNMS<float, CPUContext>(
const float thresh,
const float* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CPUContext* ctx) {
int count = 0;
std::vector<char> is_dead(num_boxes);
for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
......
......@@ -62,7 +62,7 @@ __global__ void _GenerateProposals(
const T* bbox_deltas,
const T* anchors,
T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) {
CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const int h = idx / A / feat_w;
const int w = (idx / A) % feat_w;
const int a = idx % A;
......@@ -99,13 +99,15 @@ template <> void GenerateProposals<float, CUDAContext>(
const float* scores,
const float* bbox_deltas,
const float* anchors,
float* proposals) {
float* proposals,
CUDAContext* ctx) {
const int num_proposals = A * feat_h * feat_w;
_GenerateProposals<float>
<< <CUDA_BLOCKS(num_proposals), CUDA_THREADS >> >(
num_proposals, A, feat_h, feat_w, stride,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, anchors, proposals);
<< < CUDA_BLOCKS(num_proposals), CUDA_THREADS,
0, ctx->cuda_stream() >> >(num_proposals,
A, feat_h, feat_w, stride,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, anchors, proposals);
}
template <typename T>
......@@ -118,7 +120,7 @@ __global__ void _GenerateProposals_v2(
const T* scores,
const T* bbox_deltas,
T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) {
CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const float dx = bbox_deltas[idx];
const float dy = bbox_deltas[nthreads + idx];
const float d_log_w = bbox_deltas[2 * nthreads + idx];
......@@ -139,11 +141,13 @@ template <> void GenerateProposals_v2<float, CUDAContext>(
const float min_box_w,
const float* scores,
const float* bbox_deltas,
float* proposals) {
float* proposals,
CUDAContext* ctx) {
_GenerateProposals_v2<float>
<< <CUDA_BLOCKS(total_anchors), CUDA_THREADS >> >(
total_anchors, im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, proposals);
<< < CUDA_BLOCKS(total_anchors), CUDA_THREADS,
0, ctx->cuda_stream() >> >(total_anchors,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, proposals);
}
/******************** NMS ********************/
......@@ -170,7 +174,7 @@ __global__ void nms_mask(
const int num_boxes,
const T nms_thresh,
const T* boxes,
unsigned long long* mask) {
uint64_t* mask) {
const int i_start = blockIdx.x * NMS_BLOCK_SIZE;
const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE);
const int j_start = blockIdx.y * NMS_BLOCK_SIZE;
......@@ -209,25 +213,30 @@ void _ApplyNMS(
const float thresh,
const T* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CUDAContext* ctx) {
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const dim3 blocks(num_blocks, num_blocks);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(unsigned long long);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(uint64_t);
size_t boxes_nbytes = num_boxes * 5 * sizeof(T);
void* boxes_dev, *mask_dev;
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes));
CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes, boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T> << <blocks, NMS_BLOCK_SIZE >> > (
num_boxes, thresh, (T*)boxes_dev, (unsigned long long*)mask_dev);
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T>
<< < blocks, NMS_BLOCK_SIZE,
0, ctx->cuda_stream() >> > (num_boxes,
thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
CUDA_CHECK(cudaPeekAtLastError());
std::vector<unsigned long long> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<uint64_t> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<unsigned long long> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(unsigned long long) * num_blocks);
std::vector<uint64_t> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
int num_selected = 0;
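// greedy host-side scan: keep box i unless a previously kept box suppressed it,
// then fold its 64-bit mask row into dead_bit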
for (int i = 0; i < num_boxes; ++i) {
......@@ -235,7 +244,7 @@ void _ApplyNMS(
const int inblock = i % NMS_BLOCK_SIZE;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
unsigned long long* mask_i = &mask_host[0] + i * num_blocks;
uint64_t* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
......@@ -251,9 +260,10 @@ template <> void ApplyNMS<float, CUDAContext>(
const float thresh,
const float* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CUDAContext* ctx) {
_ApplyNMS<float>(num_boxes, max_keeps, thresh,
boxes, keep_indices, num_keep);
boxes, keep_indices, num_keep, ctx);
}
} // namespace rcnn
......
......@@ -126,7 +126,8 @@ void GenerateProposals(
const T* scores,
const T* bbox_deltas,
const T* anchors,
T* proposals);
T* proposals,
Context* ctx);
template <typename T, class Context>
void GenerateProposals_v2(
......@@ -137,7 +138,8 @@ void GenerateProposals_v2(
const float min_box_w,
const T* scores,
const T* bbox_deltas,
T* proposals);
T* proposals,
Context* ctx);
template <typename T>
inline void SortProposals(
......@@ -246,7 +248,8 @@ void ApplyNMS(
const T thresh,
const T* boxes,
int* keep_indices,
int& num_keep);
int& num_keep,
Context* ctx);
} // namespace rcnn
......
......@@ -37,7 +37,7 @@ void ProposalOp<Context>::RunWithType() {
Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(),
anchors_.template mutable_data<T, Context>(),
proposals_.template mutable_data<T, Context>());
proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, num_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>());
......@@ -45,7 +45,8 @@ void ProposalOp<Context>::RunWithType() {
rcnn::ApplyNMS<T, Context>(
pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois);
roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(),
......@@ -95,14 +96,15 @@ void ProposalOp<Context>::RunWithType() {
im_height, im_width, min_box_h, min_box_w,
Input(-3).template data<T, Context>(),
Input(-2).template data<T, Context>(),
proposals_.template mutable_data<T, Context>());
proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, total_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>());
rcnn::ApplyNMS<T, Context>(pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois);
roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(),
......@@ -128,7 +130,7 @@ void ProposalOp<Context>::RunWithType() {
collective_rois.ReshapeLike(*Output(0));
auto* rois = collective_rois.template mutable_data<T, CPUContext>();
CPUContext::template Copy<T, CPUContext, CPUContext>(
ctx()->template Copy<T, CPUContext, CPUContext>(
collective_rois.count(), rois,
Output(0)->template data<T, CPUContext>());
......@@ -147,6 +149,8 @@ void ProposalOp<Context>::RunWithType() {
template <class Context>
void ProposalOp<Context>::RunOnDevice() {
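// this op interleaves CPU routines (SortProposals, RetrieveRoIs) with CUDA kernels,
// so it is pinned to the synchronizing default stream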
ctx()->set_stream_id(0); // enforce default stream
num_images = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images)
<< "\nExcepted " << num_images << " groups image info, "
......
......@@ -455,7 +455,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws)
RecomputingAware(optimized_graph, ws);
}
bool Graph::Run(const string& include, const string& exclude) {
bool Graph::Run(
const string& include,
const string& exclude,
const int stream_id) {
LOG(DEBUG) << "Run Graph: " << name();
for (auto op : ops_) {
if (!include.empty())
......@@ -464,7 +467,7 @@ bool Graph::Run(const string& include, const string& exclude) {
if (op->type().find(exclude) != string::npos) continue;
op->SwitchToPhase(this->args_["phase"].s());
LOG(DEBUG) << "$ Before Operator: " << op->name();
op->Run();
op->Run(stream_id);
LOG(DEBUG) << "$ After Operator: " << op->name();
}
return true;
......
......@@ -8,7 +8,6 @@ void MixedMemory::ToCPU() {
switch (state_) {
case UNINITIALIZED:
cpu_ptr_ = CPUContext::New(nbytes_);
CPUContext::Memset(nbytes_, cpu_ptr_);
state_ = STATE_AT_CPU;
break;
case STATE_AT_CUDA:
......@@ -32,7 +31,6 @@ void MixedMemory::ToCUDA() {
switch (state_) {
case UNINITIALIZED:
cuda_ptr_ = CUDAContext::New(nbytes_);
CUDAContext::Memset(nbytes_, cuda_ptr_);
state_ = STATE_AT_CUDA;
break;
case STATE_AT_CPU:
......
......@@ -15,33 +15,35 @@ void CuDNNDropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") {
if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
if (scale == 1.0)
math::Scal<T, Context>(Output(0)->count(),
1.0 - prob(), Ydata, &ctx());
1.0 - prob(), Ydata, ctx());
}
} else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->CreateTensor("/mnt/" + anchor() + "/dropout/mask");
Tensor* mask = ws()->CreateTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states
if (!states_initialized) {
states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size));
ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
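// the dropout RNG states are cached workspace-wide, keyed by the random seed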
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" +
dragon_cast<string, unsigned long long>(random_seed) + "/states");
Tensor* states = ws()->CreateTensor(
"/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
} else {
states->Reshape({ (TIndex)states_size });
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnSetDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
}
}
......@@ -53,7 +55,7 @@ void CuDNNDropoutOp<Context>::RunWithType() {
mask->Reshape({ (TIndex)reserve_space_size });
auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutForward(
ctx().cudnn_handle(), dropout_desc,
ctx()->cudnn_handle(), dropout_desc,
input_desc, Xdata,
input_desc, Ydata,
Rdata, reserve_space_size));
......@@ -65,7 +67,9 @@ void CuDNNDropoutOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......@@ -76,19 +80,21 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
if (phase() == "TEST") { NOT_IMPLEMENTED; }
else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->GetTensor("/mnt/" + anchor() + "/dropout/mask");
Tensor* mask = ws()->GetTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states
if (!states_initialized) {
states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size));
ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" +
dragon_cast<string, unsigned long long>(random_seed) + "/states");
Tensor* states = ws()->CreateTensor(
"/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
} else { LOG(FATAL) << "Missing states with seed: " << random_seed; }
}
......@@ -101,7 +107,7 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
input_desc, &reserve_space_size));
auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutBackward(
ctx().cudnn_handle(), dropout_desc,
ctx()->cudnn_handle(), dropout_desc,
input_desc, dYdata,
input_desc, dXdata,
Rdata, reserve_space_size));
......@@ -113,7 +119,9 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -14,7 +14,7 @@ void CuDNNEluOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
}
......@@ -41,7 +41,7 @@ void CuDNNEluGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,7 +13,7 @@ void CuDNNReluOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
......@@ -49,7 +49,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,12 +13,12 @@ void CuDNNSigmoidOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif
......@@ -47,13 +47,13 @@ void CuDNNSigmoidGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -7,8 +7,7 @@ namespace dragon {
template <class Context> template <typename T>
void CuDNNSoftmaxOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim })
);
{ outer_dim, Input(0).dim(axis), inner_dim }));
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
......@@ -16,7 +15,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxForward(
ctx().cudnn_handle(),
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
......@@ -41,8 +40,7 @@ DEPLOY_CUDNN(Softmax);
template <class Context> template <typename T>
void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim })
);
{ outer_dim, Input(0).dim(axis), inner_dim }));
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
......@@ -50,7 +48,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxBackward(
ctx().cudnn_handle(),
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,12 +13,12 @@ void CuDNNTanhOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif
......@@ -47,13 +47,13 @@ void CuDNNTanhGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -11,10 +11,10 @@ void DropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") {
if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
if (scale == 1.0) math::Scal<T, Context>(
Output(0)->count(), 1.0 - prob(), Ydata, &ctx());
Output(0)->count(), 1.0 - prob(), Ydata, ctx());
}
} else if (phase() == "TRAIN") {
Tensor* mask = ws()->CreateTensor(
......@@ -23,7 +23,7 @@ void DropoutOp<Context>::RunWithType() {
uint32_t* Mdata = mask->template mutable_data<uint32_t, Context>();
kernel::Dropout<T, Context>(
Output(0)->count(), prob(), scale,
Xdata, Mdata, Ydata, &ctx());
Xdata, Mdata, Ydata, ctx());
} else LOG(FATAL) << "Incorrect Op phase: " << phase();
}
......@@ -52,7 +52,8 @@ void DropoutGradientOp<Context>::RunWithType() {
else if (phase() == "TRAIN") {
kernel::DropoutGrad<T, Context>(
Output(0)->count(), prob(), scale,
dYdata, Mdata, dXdata, &ctx());
dYdata, Mdata, dXdata, ctx());
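// the kernel above reads the mask asynchronously; synchronize before releasing it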
ctx()->FinishDeviceCompution();
mask->Reset();
} else LOG(FATAL) << "Incorrect Op phase: " << phase();
}
......
......@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void EluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Elu<T, Context>(Output(0)->count(), alpha, Xdata, Ydata);
kernel::Elu<T, Context>(Output(0)->count(),
alpha, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -30,8 +31,8 @@ void EluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::EluGrad<T, Context>(
Output(0)->count(), alpha, dYdata, Ydata, dXdata);
kernel::EluGrad<T, Context>(Output(0)->count(),
alpha, dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -18,7 +18,7 @@ void PReluOp<Context>::RunWithType() {
kernel::PRelu<T, Context>(
Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format,
Xdata, Wdata, Ydata);
Xdata, Wdata, Ydata, ctx());
}
template <class Context>
......@@ -49,12 +49,12 @@ void PReluGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, channels * dim);
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
auto* dWBdata = ws()->template caches<T, Context>({ channels * dim })[0];
kernel::PReluWGrad<T, Context>(
Input(0).dim(0), Input(0).count(1), channels, dim,
channel_shared ? true : false, data_format,
dYdata, Xdata, multiplier, dWBdata, dWdata, &ctx());
dYdata, Xdata, multiplier, dWBdata, dWdata, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -63,7 +63,7 @@ void PReluGradientOp<Context>::RunWithType() {
kernel::PReluGrad<T, Context>(
Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format,
dYdata, Xdata, Wdata, dXdata);
dYdata, Xdata, Wdata, dXdata, ctx());
}
}
......
......@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void ReluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Relu<T, Context>(Output(0)->count(), slope, Xdata, Ydata);
kernel::Relu<T, Context>(Output(0)->count(),
slope, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -24,15 +25,17 @@ DEPLOY_CPU(Relu);
#ifdef WITH_CUDA
DEPLOY_CUDA(Relu);
#endif
OPERATOR_SCHEMA(Relu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Relu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void ReluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::ReluGrad<T, Context>(
Output(0)->count(), slope, dYdata, Ydata, dXdata);
kernel::ReluGrad<T, Context>(Output(0)->count(),
slope, dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......@@ -47,7 +50,9 @@ DEPLOY_CPU(ReluGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ReluGradient);
#endif
OPERATOR_SCHEMA(ReluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }});
OPERATOR_SCHEMA(ReluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 }});
class GetReluGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SEluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -23,15 +23,17 @@ DEPLOY_CPU(SElu);
#ifdef WITH_CUDA
DEPLOY_CUDA(SElu);
#endif
OPERATOR_SCHEMA(SElu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(SElu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void SEluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SEluGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::SEluGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......@@ -46,7 +48,9 @@ DEPLOY_CPU(SEluGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SEluGradient);
#endif
OPERATOR_SCHEMA(SEluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }});
OPERATOR_SCHEMA(SEluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 }});
class GetSEluGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SigmoidOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -30,8 +30,8 @@ void SigmoidGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SigmoidGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::SigmoidGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -12,13 +12,13 @@ void SoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Input(0).count(), Ydata, Xdata);
kernel::Softmax<T, Context>(
Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier,
Xdata, WSdata, Ydata, &ctx());
Xdata, WSdata, Ydata, ctx());
}
template <class Context>
......@@ -36,7 +36,9 @@ DEPLOY_CPU(Softmax);
#ifdef WITH_CUDA
DEPLOY_CUDA(Softmax);
#endif
OPERATOR_SCHEMA(Softmax).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Softmax)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void SoftmaxGradientOp<Context>::RunWithType() {
......@@ -44,15 +46,16 @@ void SoftmaxGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
auto* WSdata = ws()->template caches<T, Context>(
{ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Input(0).count(), dXdata, dYdata);
kernel::SoftmaxGrad<T, Context>(
Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier,
dYdata, Ydata, WSdata, dXdata, &ctx());
dYdata, Ydata, WSdata, dXdata, ctx());
}
template <class Context>
......@@ -70,7 +73,9 @@ DEPLOY_CPU(SoftmaxGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxGradient);
#endif
OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
OPERATOR_SCHEMA(SoftmaxGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetSoftmaxGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void TanhOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -30,8 +30,8 @@ void TanhGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::TanhGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::TanhGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -9,7 +9,7 @@ void AddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y);
math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +19,24 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
x2 = Input(1).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
1.0, y, ctx());
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
1.0, y, &ctx());
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
......@@ -44,7 +45,7 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -77,13 +78,13 @@ void AddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy);
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +109,7 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
outer_dim = X1->dim(0);
inner_dim = X1->count(1);
......@@ -116,13 +117,13 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy);
}
}
......
......@@ -34,7 +34,7 @@ void AffineOp<Context>::RunWithType() {
kernel::Affine<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim,
Xdata, Adata, Bdata, bias_multiplier, Ydata, &ctx());
Xdata, Adata, Bdata, bias_multiplier, Ydata, ctx());
}
template <class Context>
......@@ -58,13 +58,13 @@ void AffineGradientOp<Context>::BiasRunWithType() {
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* dYdata = Input(-1).template data<T, Context>();
auto* dBias = Output(2)->template mutable_data<T, Context>();
auto* dBias = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < outer_dim; n++) {
math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim,
1.0, dYdata, multiplier,
1.0, dBias, &ctx());
1.0, dBias, ctx());
dYdata += dim;
}
}
......@@ -79,45 +79,36 @@ void AffineGradientOp<Context>::ScaleRunWithType() {
bool is_eltwise = (Input(-1).count() == Input(1).count());
auto* dYdata = Input(-1).template data<T, Context>();
auto* Xdata = Input(0).template data<T, Context>();
auto* dScale = Output(1)->template mutable_data<T, Context>();
auto* dScale = Output(1)->template mutable_data<T, Context>(ctx());
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYxX = dXdata;
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX);
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX, ctx());
if (!is_eltwise) {
T* SRes_data = nullptr;
// reduce inner dimensions
if (inner_dim == 1) {
SRes_data = dYxX;
} else if (sum_result.count() == 1) { // handle inner only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dScale += result;
} else {
SRes_data = (outer_dim == 1) ? // handle scale only
SRes_data = (outer_dim == 1) ?
dScale : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier,
SRes_data == dScale ? 1.0 : 0.0, SRes_data, &ctx());
SRes_data == dScale ? 1.0 : 0.0,
SRes_data, ctx());
}
// reduce outer dimensions
if (outer_dim != 1) {
if (scale_dim == 1) { // handle outer only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
outer_dim, multiplier, SRes_data, &ctx());
*dScale += result;
} else {
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dScale, &ctx());
}
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dScale, ctx());
}
} else {
math::Axpy<T, Context>(Output(1)->count(),
1.f, dYxX, dScale, &ctx());
1.f, dYxX, dScale, ctx());
}
}
......@@ -131,7 +122,7 @@ void AffineGradientOp<Context>::RunWithType() {
kernel::AffineGrad<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim,
dYdata, Adata, dXdata, &ctx());
dYdata, Adata, dXdata, ctx());
}
template <class Context>
......
......@@ -15,7 +15,7 @@ void ClipOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template mutable_data<T, Context>();
kernel::Clip<T, Context>(Output(0)->count(),
low, high, Xdata, Mdata, Ydata);
low, high, Xdata, Mdata, Ydata, ctx());
}
template <class Context>
......@@ -30,7 +30,9 @@ DEPLOY_CPU(Clip);
#ifdef WITH_CUDA
DEPLOY_CUDA(Clip);
#endif
OPERATOR_SCHEMA(Clip).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Clip)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void ClipGradientOp<Context>::RunWithType() {
......@@ -39,7 +41,8 @@ void ClipGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dXdata, Mdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dXdata, Mdata, dXdata, ctx());
}
template <class Context>
......@@ -54,7 +57,9 @@ DEPLOY_CPU(ClipGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ClipGradient);
#endif
OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
OPERATOR_SCHEMA(ClipGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetClipGradient final : public GradientMakerBase {
public:
......
......@@ -23,7 +23,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
mul_desc, CUDNN_OP_TENSOR_MUL,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, Ydata));
......@@ -36,7 +36,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
add_desc, CUDNN_OP_TENSOR_ADD,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), add_desc,
ctx()->cudnn_handle(), add_desc,
CUDNNType<T>::one, input_desc, Ydata,
CUDNNType<T>::one, param_desc, Bdata,
CUDNNType<T>::zero, input_desc, Ydata));
......@@ -48,7 +48,9 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......@@ -76,17 +78,17 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1));
auto* Xdata = Input(0).template data<T, Context>();
auto* dAdata = Output(1)->template mutable_data<T, Context>();
auto* dAdata = Output(1)->template mutable_data<T, Context>(ctx());
// eltwise
if (Input(0).count() == Input(1).count()) {
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, dAdata));
} else {
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::zero, input_desc, dXdata));
......@@ -97,11 +99,11 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// db = dy
if (Output(2)->name() != "ignore") {
Output(2)->ReshapeLike(Input(1));
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
// eltwise
if (Input(-1).count() == Input(1).count()) {
math::Axpy<T, Context>(Output(2)->count(),
1.f, dYdata, dBdata, &ctx());
1.f, dYdata, dBdata, ctx());
} else {
ComputeBiasGradient_v2<T>(dYdata, dBdata);
}
......@@ -109,7 +111,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// dx = alpha * dy
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, dXdata));
......@@ -126,11 +128,11 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];;
CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dYxX,
CUDNNType<T>::one, param_desc, dA));
......@@ -145,32 +147,23 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient_v2(
sum_result.Reshape({ outer_dim * scale_dim });
T* SRes_data = nullptr;
if (inner_dim == 1) SRes_data = dYxX;
else if (sum_result.count() == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dAC += result;
// reduce inner dimensions
if (inner_dim == 1) {
SRes_data = dYxX;
} else {
SRes_data = (outer_dim == 1) ?
dA : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier,
SRes_data == dA ? 1.0 : 0.0, SRes_data, &ctx());
SRes_data == dA ? 1.0 : 0.0, SRes_data, ctx());
}
// reduce outer dimensions
if (outer_dim != 1) {
if (scale_dim == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
outer_dim, multiplier, SRes_data, &ctx());
*dAC += result;
} else {
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dA, &ctx());
}
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dA, ctx());
}
}
......@@ -185,11 +178,11 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dY,
CUDNNType<T>::one, param_desc, dB));
......@@ -205,7 +198,7 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient_v2(
math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim,
1.0, dY, multiplier,
1.0, dB, &ctx());
1.0, dB, ctx());
dY += dim;
}
}
......
......@@ -9,7 +9,7 @@ void DivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y);
math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,40 @@ void DivOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
}
if (type == 0) {
x2 = Input(1).template data<T, CPUContext>();
float inverse_x2 = 1.f / dragon_cast<float, T>(x2[0]);
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
math::MulScalar<T, Context>(
Output(0)->count(), inverse_x2, y, ctx());
} else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
}
}
......@@ -82,16 +88,16 @@ void DivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T,Context>(X1->count(), dy, x1, c); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2);
math::Mul<T,Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1);
math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
}
}
......@@ -118,23 +124,23 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto cs = ws()->template caches<T, Context>(
{ X1->count(), X2->count() });
math::Mul<T, Context>(X1->count(), dy, x1, cs[0]); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1.0, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X1->count(), dy, x1, cs[0], ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier,
0.0, cs[1], &ctx());
0.0, cs[1], ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier,
0.0, cs[1], &ctx());
0.0, cs[1], ctx());
}
math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2);
math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -146,16 +152,16 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
math::Div<T, Context>(X1->count(), dy, dx1, dx1);
math::Div<T, Context>(X1->count(), dy, dx1, dx1, ctx());
}
}
......
......@@ -7,9 +7,13 @@ template <class Context> template <typename T>
void DotOp<Context>::DotRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, CPUContext>();
Ydata[0] = math::Dot<T, Context>(
Input(0).count(), X1data, X2data, &ctx());
auto* Ydata = Output(0)->template mutable_data<T, Context>();
T result_host;
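// math::Dot writes the scalar to the host; copy it into Output(0) through the context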
math::Dot<T, Context>(Input(0).count(),
X1data, X2data, &result_host, ctx());
ctx()->template Copy<T, Context, CPUContext>(
1, Ydata, &result_host);
}
template <class Context> template <typename T>
......@@ -22,7 +26,7 @@ void DotOp<Context>::GemmRunWithType() {
TransB ? CblasTrans : CblasNoTrans,
M, N1, K1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -33,7 +37,7 @@ void DotOp<Context>::GemvRunWithType() {
math::Gemv<T, Context>(
TransA ? CblasTrans : CblasNoTrans, M, N1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
}
template <class Context>
......@@ -98,12 +102,14 @@ void DotGradientOp<Context>::DotRunWithType() {
auto* dYdata = Input(2).template data<T, CPUContext>();
auto* dX1data = Output(0)->template mutable_data<T, Context>();
auto* dX2data = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dX1data, X2data);
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dX2data, X1data);
math::MulScalar<T, Context>(Output(0)->count(), dYdata[0], dX1data);
math::MulScalar<T, Context>(Output(1)->count(), dYdata[0], dX2data);
math::MulScalar<T, Context>(
Output(0)->count(), dYdata[0], dX1data, ctx());
math::MulScalar<T, Context>(
Output(1)->count(), dYdata[0], dX2data, ctx());
}
template <class Context> template <typename T>
......@@ -118,13 +124,13 @@ void DotGradientOp<Context>::GemmRunWithType() {
TransB ? CblasNoTrans : CblasTrans,
M, K1, N1,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K1, N1, M,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
}
template <class Context> template <typename T>
......@@ -138,11 +144,11 @@ void DotGradientOp<Context>::GemvRunWithType() {
CblasNoTrans, CblasNoTrans,
M, N1, 1,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemv<T, Context>(
TransA ? CblasNoTrans : CblasTrans, M, N1,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
}
template <class Context>
......
......@@ -7,10 +7,11 @@ template <class Context> template <typename T>
void EltwiseOp<Context>::SumRunWithType() {
TIndex count = Output(0)->count();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(count, dragon_cast<T, float>(0), Ydata);
math::Set<T, Context>(count,
dragon_cast<T, float>(0), Ydata, ctx());
for (int i = 0; i < InputSize(); ++i) {
math::Axpy<T, Context>(count, coeffs[i],
Input(i).template data<T, Context>(), Ydata, &ctx());
Input(i).template data<T, Context>(), Ydata, ctx());
}
}
......@@ -21,19 +22,24 @@ void EltwiseOp<Context>::ProdRunWithType() {
math::Mul<T, Context>(count,
Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(),
Ydata);
Ydata, ctx());
for (int i = 2; i < InputSize(); i++) {
math::Mul<T, Context>(count,
Ydata,
Input(i).template data<T, Context>(),
Ydata);
Ydata, ctx());
}
}
template <class Context>
void EltwiseOp<Context>::RunOnDevice() {
for (int i = 1; i < InputSize(); i++)
CHECK(Input(i).dims() == Input(0).dims());
for (int i = 1; i < InputSize(); i++) {
CHECK(Input(i).dims() == Input(0).dims())
<< "\nExcepted Input(" << i << ")'s dims as "
<< Input(0).DimString() << ",\n but got "
<< Input(1).DimString() << ".";
}
Output(0)->ReshapeLike(Input(0));
if (operation == "SUM") {
......@@ -65,12 +71,12 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->name() == "ignore") continue;
auto* dXdata = Output(i)->template mutable_data<T, Context>();
if (coeffs[i] == float(1)) {
ctx().template Copy<T, Context, Context>(
if (coeffs[i] == 1.f) {
ctx()->template Copy<T, Context, Context>(
count, dXdata, dYdata);
} else {
math::Scale<T, Context>(count,
coeffs[i], dYdata, dXdata, &ctx());
coeffs[i], dYdata, dXdata, ctx());
}
}
}
......@@ -88,11 +94,11 @@ void EltwiseGradientOp<Context>::ProdRunWithType() {
if (i == j) continue;
auto* Xdata = Input(j).template data<T, Context>();
if (!initialized) {
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata);
ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
initialized = true;
} else math::Mul<T, Context>(count, Xdata, dXdata, dXdata);
} else math::Mul<T, Context>(count, Xdata, dXdata, dXdata, ctx());
}
math::Mul<T, Context>(count, dYdata, dXdata, dXdata);
math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
}
}
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void ExpOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata);
math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -30,7 +30,8 @@ void ExpGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context >();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Ydata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -12,7 +12,7 @@ void GramMatrixOp<Context>::RunWithType() {
CblasNoTrans, CblasTrans,
dim, dim, inner_dim,
1.0, Xdata, Xdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
Xdata += x_offset;
Ydata += y_offset;
}
......@@ -47,7 +47,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
dim, inner_dim, dim,
2.0, dYdata, Xdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
dYdata += y_offset;
dXdata += x_offset;
}
......
......@@ -23,7 +23,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasTrans,
M, num_output, K,
1.0, Xdata, Wdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M);
......@@ -32,7 +32,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, 1,
1.0, multiplier, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -55,7 +55,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, K,
1.0, Xdata, Wdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M);
......@@ -64,7 +64,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, 1,
1.0, multiplier, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -102,30 +102,30 @@ void InnerProductGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1));
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
if (TransW) {
math::Gemm<T, Context>(
CblasTrans, CblasNoTrans,
num_output, K, M,
1.0, dYdata, Xdata,
1.0, dWdata, &ctx());
1.0, dWdata, ctx());
} else {
math::Gemm<T, Context>(
CblasTrans, CblasNoTrans,
K, num_output, M,
1.0, Xdata, dYdata,
1.0, dWdata, &ctx());
1.0, dWdata, ctx());
}
}
if (Output(2)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, M);
Output(2)->Reshape({ num_output });
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
math::Gemv<T, Context>(
CblasTrans, M, num_output,
1.0, dYdata, multiplier,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -136,13 +136,13 @@ void InnerProductGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
M, K, num_output,
1.0, dYdata, Wdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else {
math::Gemm<T, Context>(
CblasNoTrans, CblasTrans,
M, K, num_output,
1.0, dYdata, Wdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
}
}
......
......@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void LogOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Log<T, Context>(Output(0)->count(), Xdata, Ydata);
math::Log<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -29,7 +29,7 @@ void LogGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata);
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
}
template <class Context>
......
......@@ -16,7 +16,7 @@ void MatmulOp<Context>::RunWithType() {
TransB ? CblasTrans : CblasNoTrans,
M, N, K1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
X1data += x1_offset;
X2data += x2_offset;
Ydata += y_offset;
......@@ -76,13 +76,13 @@ void MatmulGradientOp<Context>::RunWithType() {
TransB ? CblasNoTrans : CblasTrans,
M, K1, N,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K1, N, M,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
X1data += x1_offset;
X2data += x2_offset;
dX1data += x1_offset;
......
......@@ -9,7 +9,7 @@ void MulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,39 @@ void MulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
}
if (type == 0) {
x2 = Input(1).template data<T, CPUContext>();
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
math::MulScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
}
}
......@@ -79,13 +84,13 @@ void MulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
}
}
......@@ -110,19 +115,19 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c);
math::Mul<T, Context>(X1->count(), dy, x1, c, ctx());
if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
......@@ -135,16 +140,16 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
math::Mul<T, Context>(X1->count(), dy, dx1, dx1);
math::Mul<T, Context>(X1->count(), dy, dx1, dx1, ctx());
}
}
......
......@@ -9,16 +9,17 @@ void PowOp<Context>::RunWithType() {
TIndex count = Input(0).count();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0)) {
float value = (power == float(0)) ? float(1) : pow(shift, power);
math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata);
if (power_scale == 0.f) {
float value = (power == 0.f) ? 1.f : pow(shift, power);
math::Set<T, Context>(count,
dragon_cast<T, float>(value), Ydata, ctx());
return;
}
auto* Xdata = Input(0).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != float(1)) math::Scal<T, Context>(count, scale, Ydata, &ctx());
if (shift != float(0)) math::AddScalar<T, Context>(count, shift, Ydata);
if (power != float(1)) math::Pow<T, Context>(count, power, Ydata, Ydata);
ctx()->template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != 1.f) math::Scal<T, Context>(count, scale, Ydata, ctx());
if (shift != 0.f) math::AddScalar<T, Context>(count, shift, Ydata, ctx());
if (power != 1.f) math::Pow<T, Context>(count, power, Ydata, Ydata, ctx());
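    // overall: Y = (shift + scale * X)^power; power_scale is presumably power * scale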
}
template <class Context>
......@@ -42,35 +43,36 @@ void PowGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0) || power == float(1)) {
if (power_scale == 0.f || power == 1.f) {
const T value = dragon_cast<T, float>(power_scale);
math::Set<T, Context>(count, value, dXdata);
math::Set<T, Context>(count, value, dXdata, ctx());
} else {
auto* Xdata = Input(0).template data<T, Context>();
if (power == float(2)) {
if (power == 2.f) {
math::Axpby<T, Context>(count,
power_scale * scale, Xdata,
0, dXdata, &ctx());
if (shift != float(0))
math::AddScalar<T, Context>(count, power_scale * shift, dXdata);
} else if (shift == float(0)) {
0, dXdata, ctx());
if (shift != 0.f)
math::AddScalar<T, Context>(count,
power_scale * shift, dXdata, ctx());
} else if (shift == 0.f) {
auto* Ydata = Input(1).template data<T, Context>();
math::Div<T, Context>(count, Ydata, Xdata, dXdata);
math::Scal<T, Context>(count, power, dXdata, &ctx());
math::Div<T, Context>(count, Ydata, Xdata, dXdata, ctx());
math::Scal<T, Context>(count, power, dXdata, ctx());
} else {
auto* Ydata = Input(1).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata);
if (scale != float(1))
math::Scal<T, Context>(count, scale, dXdata, &ctx());
if (shift != float(0))
math::AddScalar<T, Context>(count, shift, dXdata);
math::Div<T, Context>(count, Ydata, dXdata, dXdata);
if (power_scale != float(1))
math::Scal<T, Context>(count, power_scale, dXdata, &ctx());
ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
if (scale != 1.f)
math::Scal<T, Context>(count, scale, dXdata, ctx());
if (shift != 0.f)
math::AddScalar<T, Context>(count, shift, dXdata, ctx());
math::Div<T, Context>(count, Ydata, dXdata, dXdata, ctx());
if (power_scale != 1.f)
math::Scal<T, Context>(count, power_scale, dXdata, ctx());
}
}
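    // chain rule: dX so far holds power_scale * (shift + scale * X)^(power - 1),
    // which is multiplied by dY below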
if (power_scale != float(0))
math::Mul<T, Context>(count, dYdata, dXdata, dXdata);
if (power_scale != 0.f)
math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
}
template <class Context>
......
......@@ -9,7 +9,7 @@ void RAddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y);
math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +19,24 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(1).count();
inner_dim = 1;
x1 = Input(0).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x1[0]), y, ctx());
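            // X1 is a single scalar in this case, so a plain AddScalar on y is sufficient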
} else {
outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
1.0, y, ctx());
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
1.0, y, &ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,7 +45,7 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -77,13 +78,13 @@ void RAddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy);
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +109,7 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
outer_dim = X2->dim(0);
inner_dim = X2->count(1);
......@@ -116,13 +117,13 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X2->count(), dx2, dy);
}
}
......
......@@ -9,7 +9,7 @@ void RDivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y);
math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -34,8 +34,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,8 +44,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
}
}
......@@ -82,16 +82,16 @@ void RDivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2);
math::Mul<T, Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
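        // altogether: dX2 = dY * X1 * (-1 / X2^2), the gradient of Y = X1 / X2 w.r.t. X2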
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1);
math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
}
}
......@@ -116,19 +116,19 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
math::Div<T, Context>(X2->count(), dy, x2, c);
math::Div<T, Context>(X2->count(), dy, x2, c, ctx());
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
......@@ -142,18 +142,18 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, multiplier, x1,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, x1, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
math::Div<T, Context>(X2->count(), dx2, x2, dx2);
math::Div<T, Context>(X2->count(), dx2, x2, dx2);
math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
}
}
......
......@@ -9,7 +9,7 @@ void RMulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,39 @@ void RMulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(1).count();
inner_dim = 1;
} else {
outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1);
}
if (type == 0) {
x1 = Input(0).template data<T, CPUContext>();
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
math::MulScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x1[0]), y, ctx());
} else if (type == 1) {
outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), c, x2, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), c, x2, y, ctx());
}
}
......@@ -79,13 +84,13 @@ void RMulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
}
}
......@@ -110,19 +115,19 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
math::Mul<T, Context>(X2->count(), dy, x2, c);
math::Mul<T, Context>(X2->count(), dy, x2, c, ctx());
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
......@@ -135,16 +140,16 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
}
}
......
......@@ -9,7 +9,7 @@ void RSubOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Output(0)->count(), x1, x2, y);
math::Sub<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,7 +19,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
if (type == 0 || type == 1) {
......@@ -35,7 +35,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
-1.0, y, &ctx());
-1.0, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,7 +44,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
-1.0, y, &ctx());
-1.0, y, ctx());
}
}
......@@ -78,12 +78,12 @@ void RSubGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(
Output(1)->count(), -1, dy, dx2, &ctx());
Output(1)->count(), -1, dy, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +108,7 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
outer_dim = X2->dim(0);
inner_dim = X2->count(1);
......@@ -116,14 +116,14 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(
X2->count(), -1, dy, dx2, &ctx());
X2->count(), -1, dy, dx2, ctx());
}
}
......
......@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void SquareOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata);
math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -29,8 +29,8 @@ void SquareGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata);
math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, &ctx());
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, ctx());
}
template <class Context>
......
......@@ -9,7 +9,8 @@ void SubOp<Context>::EltwiseRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
math::Sub<T, Context>(Output(0)->count(),
X1data, X2data, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +20,24 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
x2 = Input(1).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
-dragon_cast<float, T>(x2[0]), y, ctx());
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, multiplier, x2,
1.0, y, ctx());
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, multiplier, x2,
1.0, y, &ctx());
}
else if (type == 2) {
outer_dim = Input(0).dim(0);
......@@ -45,7 +47,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, x2, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -79,12 +81,12 @@ void SubGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(Output(1)->count(),
-1.0, dy, dx2, &ctx());
-1.0, dy, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -109,7 +111,7 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
-1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
outer_dim = X1->dim(0);
inner_dim = X1->count(1);
......@@ -117,13 +119,13 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
-1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy);
}
}
......
......@@ -8,7 +8,8 @@ void CompareOp<Context>::EqualRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Equal<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
kernel::Equal<T, Context>(Output(0)->count(),
X1data, X2data, Ydata, ctx());
}
template <class Context>
......
......@@ -7,7 +7,7 @@ void CopyOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
}
......
......@@ -20,10 +20,10 @@ void CTCLossGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scale<T, Context>(Output(0)->count(),
dYdata_host, Gdata, dXdata, &ctx());
dYdata_host, Gdata, dXdata, ctx());
}
template <class Context>
......
......@@ -45,7 +45,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
cudnnSetTensorDesc<T>(&grad_desc, Input(0).dims());
CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(
ctx().cudnn_handle(), prob_desc, grad_desc,
ctx()->cudnn_handle(), prob_desc, grad_desc,
packed_labels.data(), label_lengths.data(),
input_lengths.data(),
ctc_algo, ctc_desc, &workspace_size));
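    // query the scratch space cudnnCTCLoss will need before launching it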
......@@ -58,7 +58,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
auto* WSdata = (uint8_t*)ws()->template caches<Context>({
workspace_size })[0];
CUDNN_CHECK(cudnnCTCLoss(ctx().cudnn_handle(),
CUDNN_CHECK(cudnnCTCLoss(ctx()->cudnn_handle(),
prob_desc, Pdata, packed_labels.data(),
label_lengths.data(), input_lengths.data(),
Ydata, grad_desc, Gdata,
......
......@@ -12,11 +12,13 @@ void L1LossOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Input(0).count(), X0data, X1data, diff_data);
math::Sub<T, Context>(Input(0).count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
CHECK_EQ(Input(0).count(), Input(2).count());
auto* Wdata = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
math::Mul<T, Context>(diff->count(),
Wdata, diff_data, diff_data, ctx());
}
T normalizer = 1;
......@@ -27,11 +29,13 @@ void L1LossOp<Context>::RunWithType() {
}
T loss = math::ASum<T, Context>(diff->count(), diff_data);
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void L1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
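    // reductions such as math::ASum below return host scalars,
    // which presumably requires running on the default stream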
CHECK_EQ(Input(0).count(), Input(1).count());
Output(0)->Reshape({ 1 });
diff = ws()->CreateTensor("/mnt/" + anchor() + "/l1_loss/diff");
......@@ -51,9 +55,11 @@ template <class Context> template <typename T>
void L1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
kernel::AbsGrad<T, Context>(diff->count(), diff_data, diff_data);
ctx()->FinishDeviceCompution();
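    // make sure the async copy above has finished before dYdata_host is read on the host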
kernel::AbsGrad<T, Context>(diff->count(),
diff_data, diff_data, ctx());
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -69,7 +75,7 @@ void L1LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
}
}
......
......@@ -9,12 +9,14 @@ void L2LossOp<Context>::RunWithType() {
auto* X0data = Input(0).template data<T, Context>();
auto* X1data = Input(1).template data<T, Context>();
auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Sub<T, Context>(diff->count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
CHECK_EQ(Input(0).count(), Input(2).count());
auto* Wdata = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
math::Mul<T, Context>(diff->count(),
Wdata, diff_data, diff_data, ctx());
}
T normalizer = 1;
......@@ -23,10 +25,12 @@ void L2LossOp<Context>::RunWithType() {
} else if (normalization == "FULL") {
normalizer = Input(0).count();
}
normalizer *= 2;
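    // the conventional 1/2 factor of the L2 loss is folded into the doubled normalizer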
T loss = T(0.5) * math::Dot<T, Context>(diff->count(),
diff_data, diff_data, &ctx());
math::Set<T, Context>(1, loss / normalizer, Ydata);
T loss;
math::Dot<T, Context>(diff->count(),
diff_data, diff_data, &loss, ctx());
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
......@@ -48,10 +52,11 @@ OPERATOR_SCHEMA(L2Loss).NumInputs(2, 3).NumOutputs(1);
template <class Context> template <typename T>
void L2LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* diff_data = diff->template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
ctx()->FinishDeviceCompution();
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -67,7 +72,7 @@ void L2LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
}
}
......
......@@ -13,11 +13,11 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
auto* Fdata = flags.template mutable_data<T, Context>();
kernel::SigmoidCrossEntropy<T, Context>(
Input(0).count(), Xdata, Tdata, Ldata, Fdata, &ctx());
Input(0).count(), Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -35,11 +35,13 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK_EQ(Input(0).count(), Input(1).count())
<< "\nNumber of predictions must match the number of labels.";
losses.ReshapeLike(Input(0));
......@@ -63,12 +65,12 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
auto* Fdata = flags.template mutable_data<T, Context>();
kernel::SigmoidCrossEntropyGrad<T, Context>(
Input(0).count(), Xdata, Tdata, dXdata, Fdata, &ctx());
Input(0).count(), Xdata, Tdata, dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata); return;
dYdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -83,14 +85,16 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
Output(0)->ReshapeLike(Input(0));
flags.ReshapeLike(Input(0));
......
......@@ -15,11 +15,11 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
kernel::SigmoidFocalLoss<T, Context>(
outer_dim, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Xdata, Tdata, Ldata, Fdata, &ctx());
Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -37,11 +37,13 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SigmoidFocalLossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
axis_dim = Input(0).dim(axis);
inner_dim = Input(0).count(axis + 1);
......@@ -71,12 +73,12 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
kernel::SigmoidFocalLossGradient<T, Context>(
outer_dim, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Xdata, Tdata, dXdata, Fdata, &ctx());
Xdata, Tdata, dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata); return;
dYdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -91,14 +93,16 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SigmoidFocalLossGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
axis_dim = Input(0).dim(axis);
inner_dim = Input(0).count(axis + 1);
......
......@@ -11,20 +11,21 @@ void SmoothL1LossOp<Context>::RunWithType() {
auto* X1data = Input(1).template data<T, Context>();
auto* diff_data = diff->template mutable_data<T, Context>();
auto* error_data = error->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
math::Sub<T, Context>(diff->count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
auto* inside_w_data = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(),
inside_w_data, diff_data, diff_data);
inside_w_data, diff_data, diff_data, ctx());
}
kernel::SmoothL1<T, Context>(
diff->count(), beta, diff_data, error_data);
kernel::SmoothL1<T, Context>(diff->count(),
beta, diff_data, error_data, ctx());
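    // SmoothL1(x) is presumably 0.5 * x^2 / beta for |x| < beta, and |x| - 0.5 * beta otherwise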
if (InputSize() > 3) {
auto* outside_w_data = Input(3).template data<T, Context>();
math::Mul<T, Context>(diff->count(),
outside_w_data, error_data, error_data);
outside_w_data, error_data, error_data, ctx());
}
T normalizer = 1;
......@@ -34,12 +35,14 @@ void SmoothL1LossOp<Context>::RunWithType() {
normalizer = Input(0).count();
}
T loss = math::ASum<T, Context>(error->count(), error_data);
math::Set<T, Context>(1, loss / normalizer, Ydata);
float loss = math::ASum<float, Context>(error->count(), error_data);
math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SmoothL1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK(Input(0).dims() == Input(1).dims());
if (InputSize() > 2) CHECK(Input(0).dims() == Input(2).dims());
if (InputSize() > 3) CHECK(Input(0).dims() == Input(3).dims());
......@@ -64,10 +67,12 @@ template <class Context> template <typename T>
void SmoothL1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
kernel::SmoothL1Grad<T, Context>(
diff->count(), beta, diff_data, diff_data);
ctx()->FinishDeviceCompution();
kernel::SmoothL1Grad<T, Context>(diff->count(),
beta, diff_data, diff_data, ctx());
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -83,16 +88,16 @@ void SmoothL1LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
if (InputSize() > 3) {
auto* inside_w_data = Input(2).template data<T, Context>();
math::Mul<T, Context>(Output(i)->count(),
inside_w_data, dXdata, dXdata);
inside_w_data, dXdata, dXdata, ctx());
}
if (InputSize() > 4) {
auto* outside_w_data = Input(3).template data<T, Context>();
math::Mul<T, Context>(Output(i)->count(),
outside_w_data, dXdata, dXdata);
outside_w_data, dXdata, dXdata, ctx());
}
}
}
......
......@@ -26,15 +26,15 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<T, Context>();
auto* Tdata = Input(1).template data<T, Context>();
auto* Ldata = losses.template mutable_data<T, Context>();
kernel::SoftmaxCrossEntropy<T, Context>(
Input(0).count(), Pdata, Tdata, Ldata);
kernel::SoftmaxCrossEntropy<T, Context>(Input(0).count(),
Pdata, Tdata, Ldata, ctx());
if (normalization == "UNIT") {
Output(0)->Reshape({ outer_dim * inner_dim });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sum<T, Context>(outer_dim * inner_dim,
Input(0).dim(axis), inner_dim,
Ldata, Ydata); return;
Ldata, Ydata, ctx()); return;
}
T normalizer = 1;
......@@ -47,11 +47,13 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(Input(0).count(), Input(1).count())
......@@ -76,16 +78,16 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Tdata = Input(1).template data<T, Context>();
auto* Pdata = prob->template mutable_data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
ctx()->template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
math::Axpy<T, Context>(Output(0)->count(),
-1.0, Tdata, dXdata, &ctx());
-1.0, Tdata, dXdata, ctx());
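    // dX = P - T, the usual softmax cross-entropy gradient w.r.t. the logits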
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(outer_dim * inner_dim,
Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata);
Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata, ctx());
math::Mul<T, Context>(Output(0)->count(),
Pdata, dXdata, dXdata); return;
Pdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -96,10 +98,10 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
......
......@@ -20,11 +20,11 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
outer_dim, Input(0).dim(axis), inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Pdata, Tdata, Idata, this->ignores.count(),
Ldata, Fdata, &ctx());
Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -42,11 +42,13 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SoftmaxFocalLossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count())
......@@ -80,16 +82,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
outer_dim, Output(0)->dim(axis), inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Pdata, Tdata, Idata, this->ignores.count(),
dXdata, Fdata, &ctx());
dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(
Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata);
1.0, dYdata, Pdata, ctx());
math::Mul<T, Context>(Output(0)->count(),
Pdata, dXdata, dXdata); return;
Pdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -104,14 +106,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
this->prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = this->prob->count(0, axis);
inner_dim = this->prob->count(axis + 1);
......
......@@ -21,83 +21,66 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() {
softmax_op->Run();
}
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRunFP16() {
Tensor* XF32 = ws()->CreateTensor(
"/mnt/" + anchor() + "/softmax/xf32");
XF32->ReshapeLike(Input(0));
auto* XdataF16 = Input(0).template data<float16, Context>();
auto* XdataF32 = XF32->template mutable_data<float, Context>();
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), XdataF16, XdataF32);
OperatorDef softmax_def = MakeOperatorDef("Softmax", "",
vector<string>({ XF32->name() }),
vector<string>({ "/mnt/" + anchor() + "/softmax/prob" }));
softmax_def.add_arg()->CopyFrom(this->arg("axis"));
if (def().has_device_option())
softmax_def.mutable_device_option()
->CopyFrom(def().device_option());
if (!softmax_op) softmax_op.reset(
CreateOperator(softmax_def, ws()));
else softmax_op->MutableOp(softmax_def);
softmax_op->Run();
}
template <class Context> template <typename Tx, typename Ty>
void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<Tx, Context>();
auto* Tdata = Input(1).template data<Ty, Context>();
auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>();
auto* Ldata = losses.template mutable_data<Tx, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>();
auto* Ldata = losses.template mutable_data<float, Context>();
auto* Fdata = flags.template mutable_data<float, Context>();
kernel::SparseSoftmaxCrossEntropy<Tx, Ty, Context>(
outer_dim, Input(0).dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(),
Ldata, Fdata, &ctx());
Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
Tx normalizer = 1;
float normalizer = 1;
if (normalization == "VALID") {
normalizer = std::max(
math::ASum<Tx, Context>(
flags.count(), Fdata), (Tx)1.f);
math::ASum<float, Context>(
flags.count(), Fdata), 1.f);
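        // "VALID" appears to normalize by the number of non-ignored entries
        // (the sum of the flags), clamped to at least 1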
} else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0);
} else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim;
}
Tx loss = math::ASum<Tx, Context>(losses.count(), Ldata);
float loss = math::ASum<float, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<Tx, Context>();
math::Set<Tx, Context>(1, loss / normalizer, Ydata);
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count())
<< "\nNumber of predictions must match the number of labels.";
losses.Reshape({ outer_dim * inner_dim });
flags.Reshape({ outer_dim * inner_dim });
prob = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob");
SoftmaxRun();
if (XIsType(Input(0), float) ||
XIsType(Input(0), float16)) {
if (XIsType(Input(0), float16)) SoftmaxRunFP16();
else SoftmaxRun();
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
} else if (XIsType(Input(0), float16)) {
if (XIsType(Input(1), float)) RunWithType<float16, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
DEPLOY_CPU(SparseSoftmaxCrossEntropy);
......@@ -113,62 +96,66 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>();
auto* dXdata = Output(0)->template mutable_data<Tx, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>();
ctx().template Copy<Tx, Context, Context>(
auto* Fdata = flags.template mutable_data<float, Context>();
ctx()->template Copy<Tx, Context, Context>(
prob->count(), dXdata, Pdata);
kernel::SparseSoftmaxCrossEntropyGrad<Tx, Ty, Context>(
outer_dim, Output(0)->dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(),
dXdata, Fdata, &ctx());
dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<Tx, Context>();
kernel::SumGrad<Tx, Context>(
auto* dYdata = Input(-1).template data<float, Context>();
auto* WSdata = ws()->template caches<float, Context>(
{ Input(0).count() })[0];
kernel::SumGrad<float, Context>(
Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata);
math::Mul<Tx, Context>(
Output(0)->count(), Pdata, dXdata, dXdata);
1.0, dYdata, WSdata, ctx());
kernel::TypeA2B<float, Tx, Context>(
Input(0).count(), WSdata, Pdata, ctx());
math::Mul<Tx, Context>(Output(0)->count(),
Pdata, dXdata, dXdata, ctx());
return;
}
Tx normalizer = 1;
float normalizer = 1;
if (normalization == "VALID") {
normalizer = std::max(
math::ASum<Tx, Context>(
flags.count(), Fdata), (Tx)1.f);
math::ASum<float, Context>(
flags.count(), Fdata), 1.f);
} else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0);
} else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim;
}
auto* dYdata = Input(-1).template data<Tx, Context>();
Tx dYdata_host; ctx().template Copy<Tx, CPUContext, Context>(
auto* dYdata = Input(-1).template data<float, Context>();
float dYdata_host; ctx()->template Copy<float, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<Tx, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = prob->count(0, axis);
inner_dim = prob->count(axis + 1);
Output(0)->ReshapeLike(Input(0));
flags.Reshape({ outer_dim * inner_dim });
if (XIsType(Input(0), float) || XIsType(Input(0), float16)) {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
if (XIsType(Input(0), float16)) {
auto* dXdataF32 = Output(0)->template data<float, Context>();
auto* dXdataF16 = prob->template mutable_data<float16, Context>();
kernel::TypeA2B<float, float16, Context>(Output(0)->count(), dXdataF32, dXdataF16);
Output(0)->template CopyFrom<Context>(*prob);
}
} else if (XIsType(Input(0), float16)) {
if (XIsType(Input(1), float)) RunWithType<float16, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -9,23 +9,27 @@ namespace dragon {
template <class Context> template <typename Tx, typename Ty>
void AccuracyOp<Context>::RunWithType() {
static CPUContext cctx;
float* Y1data, *Y2data = nullptr;
Y1data = Output(0)->template mutable_data<float, CPUContext>();
if (OutputSize() > 1) {
math::Set<float, CPUContext>(num_classes, 0,
Output(1)->template mutable_data<float, CPUContext>());
Y2data = Output(1)->template mutable_data<float, CPUContext>();
math::Set<float, CPUContext>(num_classes, 0, Y2data, &cctx);
}
    Map<int, TIndex> num_per_class;
TIndex acc = 0, count = 0;
const Tx* Xdata;
if (XIsType(Input(0), float16)) {
Tensor* XF32 = ws()->CreateTensor("/mnt/" + anchor() + "/accuracy/xf32");
XF32->ReshapeLike(Input(0));
auto* XdataF16 = Input(0).template data<float16, CPUContext>();
auto* XdataF32 = XF32->template mutable_data<float, CPUContext>();
Tensor* X32T = ws()->CreateTensor(
"/mnt/" + anchor() + "/accuracy/f32");
X32T->ReshapeLike(Input(0));
auto* X16 = Input(0).template data<float16, CPUContext>();
auto* X32 = X32T->template mutable_data<float, CPUContext>();
kernel::TypeA2B<float16, float, CPUContext>(
Input(0).count(), XdataF16, XdataF32);
Xdata = XdataF32;
Input(0).count(), X16, X32, &cctx);
Xdata = X32;
} else Xdata = Input(0).template data<Tx, CPUContext>();
auto* labels = Input(1).template data<Ty, CPUContext>();
......@@ -41,15 +45,13 @@ void AccuracyOp<Context>::RunWithType() {
vector<pair<Tx, int> > vec;
for (int k = 0; k < num_classes; k++)
vec.push_back(
std::make_pair(Xdata[i * dim + k * inner_dim + j], k)
);
std::make_pair(Xdata[i * dim + k * inner_dim + j], k));
std::partial_sort(
vec.begin(), vec.begin() + top_k, vec.end(),
std::greater<pair<Tx, int> >());
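                // a hit on any of the top_k highest-scoring classes counts as correct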
for (int k = 0; k < top_k; k++) {
if (vec[k].second == label) {
if (OutputSize() > 1)
Output(1)->template mutable_data<float, CPUContext>()[label]++;
if (OutputSize() > 1) Y2data[label]++;
acc++;
break;
}
......@@ -58,12 +60,11 @@ void AccuracyOp<Context>::RunWithType() {
} // end inner_dim
} // end outer_dim
Output(0)->template mutable_data<float, CPUContext>()[0] = (float)acc / count;
if (OutputSize() > 1) {
auto* acc_per_class = Output(1)->template mutable_data<float, CPUContext>();
Y1data[0] = (float)acc / count;
if (Y2data) {
for (int i = 0; i < num_classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ?
0 : acc_per_class[i] / num_per_class[i];
Y2data[i] = num_per_class[i] == 0 ?
0 : Y2data[i] / num_per_class[i];
}
}
......
......@@ -14,14 +14,14 @@ namespace dragon {
Output(0)->ReshapeLike(Input(0)); \
auto* Xdata = Input(0).template data<type_a, Context>(); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata); \
kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata, ctx()); \
} else { \
TIndex count = Output(0)->count(); \
auto* Xdata = Output(0)->template data<type_a, Context>(); \
auto* Cdata = ws()->template caches<type_b, Context>({ count })[0]; \
kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata); \
kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata, ctx()); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
ctx().template Copy<type_b, Context, Context>(count, Ydata, Cdata); \
ctx()->template Copy<type_b, Context, Context>(count, Ydata, Cdata); \
} \
return; \
}
......
......@@ -11,7 +11,7 @@ void GradientGenerateOp<Context>::RunWithType() {
Output(i)->ReshapeLike(Input(i));
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(),
dragon_cast<T, float>(defaults[i]), dXdata);
dragon_cast<T, float>(defaults[i]), dXdata, ctx());
}
}
......@@ -37,12 +37,13 @@ void GradientGatherOp<Context>::RunWithType() {
CHECK(Output(0)->dims() == Input(indices[i]).dims());
auto* dYdata = Input(indices[i]).template data<T, Context>();
if (i == 0) {
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
count, dXdata, dYdata);
} else {
math::Add<T, Context>(
count, dXdata, dYdata, dXdata);
count, dXdata, dYdata, dXdata, ctx());
}
ctx()->FinishDeviceCompution();
Input(indices[i]).Reset();
}
}
......@@ -68,7 +69,7 @@ template <class Context>
void StopGradientOp<Context>::RunOnDevice() {
if (Output(0)->name() != Input(0).name()) {
Output(0)->ReshapeLike(Input(0));
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
}
}
......
......@@ -14,7 +14,7 @@ void ImageDataOp<Context>::RunWithType() {
kernel::ImageData<Tx, Ty, Context>(
Output(0)->count(), n, c, h, w, Mdata, Sdata,
data_format, Xdata, Ydata);
data_format, Xdata, Ydata, ctx());
}
template <class Context>
......
......@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void InitializeOp<Context>::RunWithType() {
unique_ptr< Filler<T, Context> > f;
f.reset(CreateFiller<T, Context>(filler));
f->Fill(Output(0), &ctx());
f->Fill(Output(0), ctx());
}
template <class Context>
......
......@@ -14,7 +14,7 @@ void MPIBroadcastOp<Context>::RunWithType() {
auto* Xdata = Input(0).template mutable_data<T, CPUContext>();
#endif
MPI_Bcast(Xdata, Input(0).count(), mpi_dtype(), comm_root, comm);
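        // MPI_Bcast is collective: the data rooted at comm_root is propagated to every rank in comm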
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
} else {
#ifdef WITH_MPI_CUDA
auto* Ydata = Output(0)->template mutable_data<T, Context>();
......@@ -62,12 +62,13 @@ void MPIBroadcastGradientOp<Context>::RunWithType() {
#ifdef WITH_MPI_CUDA
auto* dYdata = Input(-1).template mutable_data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dXdata, dYdata);
#else
auto* dYdata = Input(-1).template mutable_data<T, CPUContext>();
auto* dXdata = Output(0)->template mutable_data<T, CPUContext>();
CPUContext::template Copy<T, CPUContext, CPUContext>(
static CPUContext cctx;
cctx.template Copy<T, CPUContext, CPUContext>(
Output(0)->count(), dXdata, dYdata);
#endif
for (int i = 0; i < comm_size; i++) {
......@@ -76,10 +77,10 @@ void MPIBroadcastGradientOp<Context>::RunWithType() {
i, 0, comm, MPI_STATUS_IGNORE);
#ifdef WITH_MPI_CUDA
math::Add<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata);
dYdata, dXdata, dXdata, ctx());
#else
math::Add<T, CPUContext>(Output(0)->count(),
dYdata, dXdata, dXdata);
math::Add<T, CPUContext>(Output(0)->count(),
dYdata, dXdata, dXdata, &cctx);
#endif
}
}
......
......@@ -8,7 +8,7 @@ namespace dragon {
template <class Context> template <typename T>
void MPIGatherOp<Context>::RunWithType() {
if (comm_rank == comm_root) {
Output(comm_rank)->template CopyFrom<Context>(Input(0));
Output(comm_rank)->template CopyFrom<Context>(Input(0), ctx());
for (int i = 0; i < comm_size; i++) {
if (i == comm_root) continue;
#ifdef WITH_MPI_CUDA
......@@ -76,7 +76,8 @@ OPERATOR_SCHEMA(MPIGather).NumInputs(1).NumOutputs(1, INT_MAX);
template <class Context> template <typename T>
void MPIGatherGradientOp<Context>::RunWithType() {
if (comm_rank == comm_root) {
Output(0)->template CopyFrom<Context>(Input(this->comm_rank + 1));
Output(0)->template CopyFrom<Context>(
Input(this->comm_rank + 1), ctx());
for (int i = 0; i < comm_size; i++) {
if (i == comm_root) continue;
#ifdef WITH_MPI_CUDA
......
......@@ -11,7 +11,7 @@ void ArangeOp<Context>::RunWithType() {
count = (stop_ - start_ - 1) / step_ + 1;
Output(0)->Reshape({ count });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Arange<T, Context>(count, start_, step_, Ydata);
kernel::Arange<T, Context>(count, start_, step_, Ydata, ctx());
}
template <class Context>
......
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/argreduce_op.h"
namespace dragon {
......@@ -12,14 +13,15 @@ void ArgReduceOp<Context>::RunWithType() {
auto* Idata = Output(0)->template mutable_data<int64_t, CPUContext>();
auto* Vdata = OutputSize() == 2 ?
Output(1)->template mutable_data<T, CPUContext>() : nullptr;
static CPUContext cctx;
if (operation == "ARGMAX") {
kernel::Argmax<T, CPUContext>(
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata);
top_k, Xdata, Idata, Vdata, &cctx);
} else if (operation == "ARGMIN") {
kernel::Argmin<T, CPUContext>(
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata);
top_k, Xdata, Idata, Vdata, &cctx);
} else LOG(FATAL) << "Unknown operation: [" << operation << "].";
} else {
auto* Xdata = Input(0).template data<T, Context>();
......@@ -29,11 +31,11 @@ void ArgReduceOp<Context>::RunWithType() {
if (operation == "ARGMAX") {
kernel::Argmax<T, Context>(
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata);
top_k, Xdata, Idata, Vdata, ctx());
} else if (operation == "ARGMIN") {
kernel::Argmin<T, Context>(
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata);
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata, ctx());
} else LOG(FATAL) << "Unknown operation: [" << operation << "].";
}
}
......
......@@ -14,7 +14,7 @@ void ConcatOp<Context>::RunWithType() {
kernel::Concat<T, Context>(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim,
concat_offset, Xdata, Ydata);
concat_offset, Xdata, Ydata, ctx());
concat_offset += x_concat_dim;
}
}
......@@ -61,7 +61,7 @@ void ConcatGradientOp<Context>::RunWithType() {
kernel::ConcatGrad<T, Context>(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim,
concat_offset, dYdata, dXdata);
concat_offset, dYdata, dXdata, ctx());
}
concat_offset += x_concat_dim;
}
......
......@@ -17,7 +17,7 @@ void CropOp<Context>::RunWithType() {
kernel::Crop1D<T, Context>(dest->count(),
dim, ed[axis] - st[axis], inner_dim,
st[axis], Xdata, Ydata);
st[axis], Xdata, Ydata, ctx());
}
template <class Context>
......@@ -46,7 +46,7 @@ void CropOp<Context>::Setup() {
// make ends
ed.assign(Input(0).ndim(), 0);
keep_dims.resize(Input(0).ndim(), 0);
keep_dims.assign(Input(0).ndim(), 1);
if (shape.size() + shape_like.size() != 0) {
CHECK(shape.size() * shape_like.size() == 0)
<< "\nCan not set shape and shape_like both.";
......@@ -75,7 +75,6 @@ void CropOp<Context>::Setup() {
// static crop
int n_given = (int)GET_ARGUMENTS_SIZE(ends);
for (int i = 0; i < ed.size(); i++) {
keep_dims[i] = 1;
if (i < n_given) ed[i] = ends(i);
if (ed[i] == 0) ed[i] = Input(0).dim(i);
if (ed[i] == -1) { ed[i] = st[i] + 1; keep_dims[i] = 0; }
......@@ -125,7 +124,7 @@ void CropOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0));
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
// squeeze dimensions
vector<TIndex> squeeze_shape;
for (int i = 0; i < keep_dims.size(); i++)
......@@ -149,6 +148,7 @@ void CropOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
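    // a sketch of why the explicit sync sits here: with non-blocking streams the
    // kernels above are only queued, while the dest buffer may come from the
    // shared workspace cache; finishing the device computation before the swap
    // presumably guarantees the per-axis result is materialized before that
    // buffer is reused as the next destination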
if (process_axes.size() % 2 == 1) {
......@@ -160,7 +160,7 @@ void CropOp<Context>::RunOnDevice() {
// squeeze dimensions
vector<TIndex> squeeze_shape;
    for (int i = 0; i < keep_dims.size(); i++)
if (keep_dims[i]) squeeze_shape.push_back(Output(0)->dim(i));
Output(0)->Reshape(squeeze_shape);
}
......@@ -206,10 +206,10 @@ void CropGradientOp<Context>::RunWithType() {
if (dest == &navigator) {
dXdata = ws()->template caches<T, Context>({ dest->count() })[0];
} else { dXdata = dest->template mutable_data<T, Context>(); }
kernel::Crop1DGrad<T, Context>(dest->count(),
Input(0).dim(axis), dim, inner_dim,
st[axis], ed[axis], dYdata, dXdata);
st[axis], ed[axis], dYdata, dXdata, ctx());
}
template <class Context>
......@@ -226,10 +226,10 @@ void CropGradientOp<Context>::RunOnDevice() {
expand_shape[keep_axes[i]] = Input(-1).dim(i);
Input(-1).Reshape(expand_shape);
    // do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
return;
}
......@@ -248,6 +248,7 @@ void CropGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......
......@@ -12,11 +12,11 @@ void GatherOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::CanonicalAxis<int, Context>(
Input(1).count(), x_slice_dim, indices);
Input(1).count(), x_slice_dim, indices, ctx());
kernel::Gather<T, Context>(
Output(0)->count(), outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, Xdata, Ydata);
kernel::Gather<T, Context>(Output(0)->count(),
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
indices, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -46,13 +46,18 @@ template <class Context> template <typename T>
void GatherGradientOp<Context>::RunWithType() {
auto* indices = Input(1).template data<int, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (!acc_grad) math::Set<T, Context>(Output(0)->count(), 0, dXdata);
T* dXdata = nullptr;
if (!acc_grad) {
dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
} else {
dXdata = Output(0)->template mutable_data<T, Context>(ctx());
}
kernel::GatherGrad<T, Context>(
Input(-1).count(), outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dYdata, dXdata);
kernel::GatherGrad<T, Context>(Input(-1).count(),
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
indices, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -10,10 +10,10 @@ void OneHotOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(),
dragon_cast<T, float>(float(off_value)), Ydata);
dragon_cast<T, float>(float(off_value)), Ydata, ctx());
kernel::OneHot<T, Context>(Input(0).count(),
depth, on_value, Xdata, Ydata);
depth, on_value, Xdata, Ydata, ctx());
}
template <class Context>
......
......@@ -17,7 +17,7 @@ void PadOp<Context>::ConstRunWithType() {
kernel::ConstPad1D<T, Context>(dest->count(),
dim, dim + pad_l[axis] + pad_r[axis], inner_dim,
pad_l[axis], value, Xdata, Ydata);
pad_l[axis], value, Xdata, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -32,7 +32,7 @@ void PadOp<Context>::ReflectRunWithType() {
kernel::ReflectPad1D<T, Context>(dest->count(),
dim, dim + pad_l[axis] + pad_r[axis], inner_dim,
pad_l[axis], Xdata, Ydata);
pad_l[axis], Xdata, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -47,7 +47,7 @@ void PadOp<Context>::EdgeRunWithType() {
kernel::EdgePad1D<T, Context>(dest->count(),
dim, dim + pad_l[axis] + pad_r[axis], inner_dim,
pad_l[axis], Xdata, Ydata);
pad_l[axis], Xdata, Ydata, ctx());
}
template <class Context>
......@@ -61,7 +61,7 @@ void PadOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0));
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
return;
}
......@@ -99,6 +99,7 @@ void PadOp<Context>::RunOnDevice() {
} else {
LOG(FATAL) << "Unsupported padding mode: " << mode << ".";
}
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......@@ -127,7 +128,7 @@ void PadGradientOp<Context>::ConstRunWithType() {
kernel::ConstPad1DGrad<T, Context>(dest->count(),
dim - pad_l[axis] - pad_r[axis], dim, inner_dim,
pad_l[axis], dYdata, dXdata);
pad_l[axis], dYdata, dXdata, ctx());
}
template <class Context> template <typename T>
......@@ -140,11 +141,11 @@ void PadGradientOp<Context>::ReflectRunWithType() {
dXdata = ws()->template caches<T, Context>({ dest->count() })[0];
} else { dXdata = dest->template mutable_data<T, Context>(); }
math::Set<T, Context>(dest->count(), 0, dXdata);
math::Set<T, Context>(dest->count(), 0, dXdata, ctx());
kernel::ReflectPad1DGrad<T, Context>(source->count(),
dim - pad_l[axis] - pad_r[axis], dim, inner_dim,
pad_l[axis], dYdata, dXdata);
pad_l[axis], dYdata, dXdata, ctx());
}
template <class Context> template <typename T>
......@@ -157,11 +158,11 @@ void PadGradientOp<Context>::EdgeRunWithType() {
dXdata = ws()->template caches<T, Context>({ dest->count() })[0];
} else { dXdata = dest->template mutable_data<T, Context>(); }
math::Set<T, Context>(dest->count(), 0, dXdata);
math::Set<T, Context>(dest->count(), 0, dXdata, ctx());
kernel::EdgePad1DGrad<T, Context>(source->count(),
dim - pad_l[axis] - pad_r[axis], dim, inner_dim,
pad_l[axis], dYdata, dXdata);
pad_l[axis], dYdata, dXdata, ctx());
}
template <class Context>
......@@ -175,7 +176,7 @@ void PadGradientOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
return;
}
......@@ -213,6 +214,7 @@ void PadGradientOp<Context>::RunOnDevice() {
} else {
LOG(FATAL) << "Unsupported padding mode: " << mode << ".";
}
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......
......@@ -9,15 +9,15 @@ template <class Context> template <typename T>
void RandomPickOp<Context>::RunWithType() {
auto* indices = pick_indices->template mutable_data<int, CPUContext>();
for (int i = 0; i < pick_indices->count(); i++)
indices[i] = int((*ctx().rand_generator())() % x_slice_dim);
indices[i] = int((*ctx()->rand_generator())() % x_slice_dim);
auto* Xdata = Input(0).template data<T, Context>();
indices = pick_indices->template mutable_data<int, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Gather<T, Context>(
Output(0)->count(), outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, Xdata, Ydata);
kernel::Gather<T, Context>(Output(0)->count(),
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
indices, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -39,7 +39,7 @@ void RandomPickOp<Context>::RunOnDevice() {
if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(*pick_indices);
Output(1)->template CopyFrom<Context>(*pick_indices);
Output(1)->template CopyFrom<Context>(*pick_indices, ctx());
}
}
......@@ -55,11 +55,11 @@ void RandomPickGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata);
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
kernel::GatherGrad<T, Context>(
Input(-1).count(), outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dYdata, dXdata);
kernel::GatherGrad<T, Context>(Input(-1).count(),
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
indices, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -8,14 +8,17 @@ namespace dragon {
template <class Context> template <typename T>
void ReduceOp<Context>::SumRunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
if (axis == -1) {
DECLARE_MULTIPLIER(multiplier, Input(0).count());
auto* Ydata = Output(0)->template mutable_data<T, CPUContext>();
Ydata[0] = math::Dot<T, Context>(
Input(0).count(), multiplier, Xdata, &ctx());
T result_host;
math::Dot<T, Context>(Input(0).count(),
multiplier, Xdata, &result_host, ctx());
ctx()->template Copy<T, Context, CPUContext>(
1, Ydata, &result_host);
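        // the full reduction is staged through a host scalar: math::Dot
        // presumably blocks until the dot product is ready, and the context
        // Copy then pushes it into Ydata so later kernels on this op's stream
        // observe the final value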
} else {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sum<T, Context>(count, axis_dim, inner_dim, Xdata, Ydata);
kernel::Sum<T, Context>(count,
axis_dim, inner_dim, Xdata, Ydata, ctx());
}
}
......@@ -24,7 +27,7 @@ void ReduceOp<Context>::MeanRunWithType() {
SumRunWithType<T>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
T coeff = axis != -1 ? 1.0 / axis_dim : 1.0 / Input(0).count();
math::Scal<T, Context>(Output(0)->count(), coeff, Ydata, &ctx());
math::Scal<T, Context>(Output(0)->count(), coeff, Ydata, ctx());
}
template <class Context>
......@@ -62,11 +65,12 @@ void ReduceGradientOp<Context>::SumRunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (axis == -1) {
auto* dYdata = Input(-1).template data<T, CPUContext>();
math::Set<T, Context>(Output(0)->count(), dYdata[0], dXdata);
math::Set<T, Context>(Output(0)->count(),
dYdata[0], dXdata, ctx());
} else {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(count,
axis_dim, inner_dim, 1.0, dYdata, dXdata);
axis_dim, inner_dim, 1.0, dYdata, dXdata, ctx());
}
}
......@@ -76,11 +80,12 @@ void ReduceGradientOp<Context>::MeanRunWithType() {
if (axis == -1) {
auto* dYdata = Input(-1).template data<T, CPUContext>();
math::Set<T, Context>(Output(0)->count(),
dYdata[0] / Input(0).count(), dXdata);
dYdata[0] / Input(0).count(), dXdata, ctx());
} else {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(count,
axis_dim, inner_dim, 1.0 / axis_dim, dYdata, dXdata);
axis_dim, inner_dim, 1.0 / axis_dim,
dYdata, dXdata, ctx());
}
}
......
......@@ -10,7 +10,7 @@ void RepeatOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Repeat<T, Context>(
Output(0)->count(), outer_dim, dim,
inner_dim, repeats(), Xdata, Ydata);
inner_dim, repeats(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -44,7 +44,7 @@ void RepeatGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::RepeatGrad<T, Context>(
Output(0)->count(), outer_dim, dim, inner_dim,
repeats(), dYdata, dXdata, &ctx());
repeats(), dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -10,8 +10,9 @@ void SliceOp<Context>::RunWithType() {
for (int i = 0; i < nout; i++) {
auto* Ydata = Output(i)->template mutable_data<T, Context>();
TIndex count = Output(i)->count();
kernel::Slice<T, Context>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, slice_offset, Xdata, Ydata);
kernel::Slice<T, Context>(count,
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
slice_offset, Xdata, Ydata, ctx());
slice_offset += y_slice_dim;
}
}
......@@ -46,8 +47,9 @@ void SliceGradientOp<Context>::RunWithType() {
if (Input(i + 1).name() == "ignore") continue;
auto* dYdata = Input(i + 1).template data<T, Context>();
TIndex count = Input(i + 1).count();
kernel::SliceGrad<T, Context>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, slice_offset, dYdata, dXdata);
kernel::SliceGrad<T, Context>(count,
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
slice_offset, dYdata, dXdata, ctx());
slice_offset += y_slice_dim;
}
}
......
......@@ -14,7 +14,7 @@ void StackOp<Context>::RunWithType() {
kernel::Concat<T, Context>(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim,
concat_offset, Xdata, Ydata);
concat_offset, Xdata, Ydata, ctx());
concat_offset += x_concat_dim;
}
}
......@@ -59,7 +59,7 @@ void StackGradientOp<Context>::RunWithType() {
kernel::ConcatGrad<T, Context>(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim,
concat_offset, dYdata, dXdata);
concat_offset, dYdata, dXdata, ctx());
}
concat_offset += x_concat_dim;
}
......
......@@ -22,7 +22,7 @@ void TileOp<Context>::TileRunWithType() {
kernel::Tile<T, Context>(dest->count(),
outer_dim, ex_inner_dim,
multiple, Xdata, Ydata);
multiple, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -35,7 +35,7 @@ void TileOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0));
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
return;
}
......@@ -48,6 +48,7 @@ void TileOp<Context>::RunOnDevice() {
axis = task.second; multiple = task.first;
if (XIsType(Input(0), float)) TileRunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......@@ -82,7 +83,7 @@ void TileGradientOp<Context>::TileRunWithType() {
kernel::TileGrad<T, Context>(
dest->count(), outer_dim, ex_inner_dim,
multiple, dYdata, dXdata, &ctx());
multiple, dYdata, dXdata, ctx());
}
template <class Context>
......@@ -96,7 +97,7 @@ void TileGradientOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
return;
}
......@@ -109,6 +110,7 @@ void TileGradientOp<Context>::RunOnDevice() {
axis = task.second; multiple = task.first;
if (XIsType(Input(0), float)) TileRunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......
......@@ -14,7 +14,7 @@ void TransposeOp<Context>::RunWithType() {
kernel::Transpose<T, Context>(
Output(0)->count(), (int)Output(0)->ndim(),
ORdata, OSdata, NSdata, Xdata, Ydata);
ORdata, OSdata, NSdata, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -75,7 +75,7 @@ void TransposeGradientOp<Context>::RunWithType() {
kernel::TransposeGrad<T, Context>(
Input(-1).count(), order->count(),
ORdata, OSdata, NSdata, dYdata, dXdata);
ORdata, OSdata, NSdata, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -20,23 +20,23 @@ void BatchNormOp<Context>::TrainingRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, Xdata, MXmult,
0, NCdata, &ctx());
0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0, Tmean, &ctx());
0, Tmean, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, Xdata, MXmult,
0, Tmean, &ctx());
0, Tmean, ctx());
}
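    // a worked view of the reduction trick used throughout these ops, assuming
    // MXmult is a multiplier tensor filled with ones:
    //   NCHW: Gemv(NoTrans, NC x S) * ones -> per-(n, c) sums; then
    //         Gemv(Trans,   N  x C) * ones -> per-channel sums, scaled by 1 / NS
    //   NHWC: a single Gemv(Trans, NS x C) * ones reduces straight to channels
    // the Gemm calls below invert this: multiplying ones by the C-vector of
    // statistics broadcasts it back over the full N * C * S layout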
// subtract mean
......@@ -45,37 +45,37 @@ void BatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
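    // the mean has already been subtracted from Ydata, so squaring here and
    // reducing below gives E((X - EX)^2) directly; the algebraically equal
    // E(X^2) - (EX)^2 is avoided, presumably because cancellation can push it
    // slightly negative in floating point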
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
}
// compute moving average
......@@ -92,21 +92,21 @@ void BatchNormOp<Context>::TrainingRunWithType() {
float coeff = m > 1 ? float(m) / (m - 1) : 1;
// History(X) = Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean.count(),
1.0, Tmean, momentum, Hmean, &ctx());
1.0, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
coeff, Tvar, momentum, Hvar, &ctx());
coeff, Tvar, momentum, Hvar, ctx());
} else {
// History(X) = (1 - momentum) * Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean.count(),
1.0 - momentum, Tmean, momentum, Hmean, &ctx());
1.0 - momentum, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
1.0 - momentum, Tvar, momentum, Hvar, &ctx());
1.0 - momentum, Tvar, momentum, Hvar, ctx());
}
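    // a quick check on the two schedules above: the first branch keeps Caffe-style
    // raw accumulators (Cur(X) + momentum * History(X)) and rescales the batch
    // variance by coeff = m / (m - 1), i.e. Bessel's correction, so the stored
    // history is the unbiased estimate (assuming m counts the N * S samples each
    // channel statistic averages over); the second branch is a plain EMA with
    // weight (1 - momentum)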
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -114,20 +114,21 @@ void BatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -145,7 +146,7 @@ void BatchNormOp<Context>::InferenceRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
// scale the mean and variance if necessary
if (mode == "CAFFE") {
......@@ -156,12 +157,12 @@ void BatchNormOp<Context>::InferenceRunWithType() {
const float factor = dragon_cast<float, T>(hFact_data[0]);
const float scale = factor == 0 ? 0 : 1.0 / factor;
math::Scale<T, Context>(mean.count(),
scale, Hmean, Tmean, &ctx());
scale, Hmean, Tmean, ctx());
math::Scale<T, Context>(var->count(),
scale, Hvar, Tvar, &ctx());
scale, Hvar, Tvar, ctx());
} else {
ctx().template Copy<T, Context, Context>(mean.count(), Tmean, Hmean);
ctx().template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
ctx()->template Copy<T, Context, Context>(mean.count(), Tmean, Hmean);
ctx()->template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
}
// subtract mean
......@@ -170,23 +171,23 @@ void BatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -194,20 +195,21 @@ void BatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
template <class Context>
......@@ -246,10 +248,7 @@ void BatchNormOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(BatchNorm);
......@@ -273,97 +272,100 @@ void BatchNormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
auto* Ydata = Input(1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), Ydata, dYdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dYdata, dXdata, ctx());
// sum(dE/dY \cdot Y)
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dXdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dXdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(Output(0)->count(), Ydata, dXdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dXdata, dXdata, ctx());
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
}
    // dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(Output(0)->count(),
1.0, dYdata, -1.0 / NS, dXdata, &ctx());
1.0, dYdata, -1.0 / NS, dXdata, ctx());
// divide by stddev
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
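    // putting the pieces together, with Y = (X - mean) / std (the forward output
    // read from Input(1)) this computes
    //   dX = (dY - mean(dY) - Y * mean(dY \cdot Y)) / std
    // the Axpby forms dY - (1 / NS) * (sum(dY) + Y * sum(dY \cdot Y)), and the
    // Div divides by the std values that WSdata was filled with at the top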
}
template <class Context> template <typename T>
......@@ -381,21 +383,22 @@ void BatchNormGradientOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), dYdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dYdata, WSdata, dXdata, ctx());
}
template <class Context>
......@@ -430,10 +433,7 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(BatchNormGradient);
......
......@@ -20,7 +20,7 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
auto* Td = d.template mutable_data<T, Context>();
auto* Tr = r->template mutable_data<T, Context>();
......@@ -35,11 +35,11 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
auto* hFact_data = Input(3).template mutable_data<T, CPUContext>();
const float factor = dragon_cast<float, T>(hFact_data[0]);
const float scale = factor == 0 ? 0 : 1.0 / factor;
math::Scale<T, Context>(mean.count(), scale, Hmean, THmean, &ctx());
math::Scale<T, Context>(mean.count(), scale, Hvar, THvar, &ctx());
math::Scale<T, Context>(mean.count(), scale, Hmean, THmean, ctx());
math::Scale<T, Context>(mean.count(), scale, Hvar, THvar, ctx());
} else {
ctx().template Copy<T, Context, Context>(mean.count(), THmean, Hmean);
ctx().template Copy<T, Context, Context>(var->count(), THvar, Hvar);
ctx()->template Copy<T, Context, Context>(mean.count(), THmean, Hmean);
ctx()->template Copy<T, Context, Context>(var->count(), THvar, Hvar);
}
// compute mean
......@@ -47,16 +47,16 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, Xdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, Xdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
}
// subtract mean
......@@ -65,37 +65,37 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
}
// compute moving average
......@@ -112,21 +112,21 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
float coeff = m > 1 ? float(m) / (m - 1) : 1;
// History(X) = Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean.count(),
1.0, Tmean, momentum, Hmean, &ctx());
1.0, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
coeff, Tvar, momentum, Hvar, &ctx());
coeff, Tvar, momentum, Hvar, ctx());
} else {
// History(X) = (1 - momentum) * Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean.count(),
1.0 - momentum, Tmean, momentum, Hmean, &ctx());
1.0 - momentum, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
1.0 - momentum, Tvar, momentum, Hvar, &ctx());
1.0 - momentum, Tvar, momentum, Hvar, ctx());
}
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -134,35 +134,36 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// compute renorm
if (!is_recomputing) {
// compute history stddev
math::AddScalar<T, Context>(var->count(), eps, THvar);
math::Sqrt<T, Context>(var->count(), THvar, THvar);
math::AddScalar<T, Context>(var->count(), eps, THvar, ctx());
math::Sqrt<T, Context>(var->count(), THvar, THvar, ctx());
// compute r
math::Div<T, Context>(var->count(), Tvar, THvar, Tr);
math::Clip<T, Context>(var->count(), 1.0 / t_r_max, t_r_max, Tr);
math::Div<T, Context>(var->count(), Tvar, THvar, Tr, ctx());
math::Clip<T, Context>(var->count(), 1.0 / t_r_max, t_r_max, Tr, ctx());
// compute d
math::Sub<T, Context>(mean.count(), Tmean, THmean, Td);
math::Div<T, Context>(mean.count(), Td, THvar, Td);
math::Clip<T, Context>(mean.count(), -t_d_max, t_d_max, Td);
math::Sub<T, Context>(mean.count(), Tmean, THmean, Td, ctx());
math::Div<T, Context>(mean.count(), Td, THvar, Td, ctx());
math::Clip<T, Context>(mean.count(), -t_d_max, t_d_max, Td, ctx());
// update the bound of r & d
t_r_max = r_max / (1.0 + (r_max - 1.0) * exp(-t_val));
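        // in batch renormalization terms (Ioffe, 2017) the block above computes
        //   r = clip(std_batch / std_history, [1 / t_r_max, t_r_max])
        //   d = clip((mean_batch - mean_history) / std_history, [-t_d_max, t_d_max])
        // and the corrected output applied below is y = x_hat * r + d; the update
        // here presumably anneals the clipping bounds from ~1 / ~0 towards
        // r_max / d_max as t_val grows during training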
......@@ -173,7 +174,7 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
// apply renorm
// store x_norm for backward
auto* XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), XNorm_data, Ydata);
// correction: mul by r
......@@ -182,20 +183,21 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tr,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tr,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// correction: add by d
if (data_format == "NCHW") {
......@@ -203,18 +205,18 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Td,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Td,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -233,7 +235,7 @@ void BatchRenormOp<Context>::InferenceRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
// scale the mean and variance if necessary
if (mode == "CAFFE") {
......@@ -243,11 +245,11 @@ void BatchRenormOp<Context>::InferenceRunWithType() {
auto* hFact_data = Input(3).template mutable_data<T, CPUContext>();
const float factor = dragon_cast<float, T>(hFact_data[0]);
const float scale = factor == 0 ? 0 : 1.0 / factor;
math::Scale<T, Context>(mean.count(), scale, Hmean, Tmean, &ctx());
math::Scale<T, Context>(var->count(), scale, Hvar, Tvar, &ctx());
math::Scale<T, Context>(mean.count(), scale, Hmean, Tmean, ctx());
math::Scale<T, Context>(var->count(), scale, Hvar, Tvar, ctx());
} else {
ctx().template Copy<T, Context, Context>(mean.count(), Tmean, Hmean);
ctx().template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
ctx()->template Copy<T, Context, Context>(mean.count(), Tmean, Hmean);
ctx()->template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
}
// subtract mean
......@@ -256,22 +258,22 @@ void BatchRenormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -279,20 +281,21 @@ void BatchRenormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
template <class Context>
......@@ -366,93 +369,96 @@ void BatchRenormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tr,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
    } else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tr,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), dYdata, WSdata, WSdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, WSdata, WSdata, ctx());
// sum(dE/dY \cdot Y)
math::Mul<T, Context>(Output(0)->count(), XNorm_data, WSdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
XNorm_data, WSdata, dXdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dXdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dXdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tmean,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(Output(0)->count(), XNorm_data, dXdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
XNorm_data, dXdata, dXdata, ctx());
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, WSdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tmean,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
}
    // dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(Output(0)->count(),
1.0, WSdata, -1.0 / NS, dXdata, &ctx());
1.0, WSdata, -1.0 / NS, dXdata, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -460,21 +466,24 @@ void BatchRenormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
ctx()->FinishDeviceCompution();
x_norm->Reset();
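    // x_norm caches the forward normalization result; synchronizing right before
    // the Reset presumably ensures the queued kernels have consumed it before
    // its storage is released for reuse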
}
......@@ -493,21 +502,22 @@ void BatchRenormGradientOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), dYdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dYdata, WSdata, dXdata, ctx());
}
template <class Context>
......
......@@ -10,6 +10,8 @@ namespace dragon {
template <class Context> template <typename T>
void CuDNNBatchNormOp<Context>::RunWithType() {
typedef typename CUDNNType<T>::BNParamType BNParamType;
// determine the bn desc
if (Input(0).ndim() == 2) {
bn_mode = CUDNN_BATCHNORM_PER_ACTIVATION;
......@@ -22,7 +24,7 @@ void CuDNNBatchNormOp<Context>::RunWithType() {
<< "The number of dimensions should be at least 3.";
bn_mode = CUDNN_BATCHNORM_SPATIAL;
#if CUDNN_VERSION_MIN(7, 0, 0)
        if (!this->use_global_stats)
bn_mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
#endif
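    // mode selection sketch: 2-d inputs use PER_ACTIVATION statistics, higher
    // ranks use SPATIAL (one statistic per channel); SPATIAL_PERSISTENT is the
    // faster cuDNN 7 variant and is restricted here to training, as it only
    // applies to the ForwardTraining / Backward paths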
if (data_format == "NCHW") {
......@@ -54,32 +56,32 @@ void CuDNNBatchNormOp<Context>::RunWithType() {
// derive the bn desc
CUDNN_CHECK(cudnnDeriveBNTensorDescriptor(bn_desc, input_desc, bn_mode));
TENSOR_FILL(Input(1), vector<TIndex>(1, C)); // history_mean
TENSOR_FILL(Input(2), vector<TIndex>(1, C)); // history_var
TENSOR_FILL(Input(3), vector<TIndex>(1, C)); // scale
TENSOR_FILL(Input(4), vector<TIndex>(1, C)); // bias
TENSOR_FILL_WITH_TYPE(Input(1), vector<TIndex>(1, C), BNParamType); // history_mean
TENSOR_FILL_WITH_TYPE(Input(2), vector<TIndex>(1, C), BNParamType); // history_var
TENSOR_FILL_WITH_TYPE(Input(3), vector<TIndex>(1, C), BNParamType); // scale
TENSOR_FILL_WITH_TYPE(Input(4), vector<TIndex>(1, C), BNParamType); // bias
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Hmean = Input(1).template mutable_data<T, Context>();
auto* Hvar = Input(2).template mutable_data<T, Context>();
auto* Sdata = Input(3).template data<T, Context>();
auto* Bdata = Input(4).template data<T, Context>();
auto* Hmean = Input(1).template mutable_data<BNParamType, Context>();
auto* Hvar = Input(2).template mutable_data<BNParamType, Context>();
auto* Sdata = Input(3).template data<BNParamType, Context>();
auto* Bdata = Input(4).template data<BNParamType, Context>();
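    // scale / bias / running statistics use BNParamType rather than T on purpose:
    // cuDNN keeps the batch-norm parameter tensors in float even when the data
    // tensors are float16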
if (this->use_global_stats) {
CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
ctx().cudnn_handle(), bn_mode,
ctx()->cudnn_handle(), bn_mode,
CUDNNType<T>::one, CUDNNType<T>::zero,
input_desc, Xdata, output_desc, Ydata,
bn_desc, Sdata, Bdata,
Hmean, Hvar, eps64));
} else {
auto* Tmean = mean->template mutable_data<T, Context>();
auto* Tvar = var->template mutable_data<T, Context>();
auto* Tmean = mean->template mutable_data<BNParamType, Context>();
auto* Tvar = var->template mutable_data<BNParamType, Context>();
auto mt = this->is_recomputing ? 0.0 : 1.0 - this->momentum;
CUDNN_CHECK(cudnnBatchNormalizationForwardTraining(
ctx().cudnn_handle(), bn_mode,
ctx()->cudnn_handle(), bn_mode,
CUDNNType<T>::one, CUDNNType<T>::zero,
input_desc, Xdata, output_desc, Ydata,
bn_desc, Sdata, Bdata,
......@@ -131,7 +133,10 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() {
#endif
}
REGISTER_CUDNN_OPERATOR(FusedBatchNorm, CuDNNBatchNormOp<CUDAContext>);
REGISTER_CUDNN_OPERATOR(
FusedBatchNorm,
CuDNNBatchNormOp<CUDAContext>
);
INSTANTIATE_CUDNN_OPERATOR(BatchNorm);
template <class Context>
......@@ -169,6 +174,8 @@ void CuDNNBatchNormGradientOp<Context>::Setup() {
template <class Context> template <typename T>
void CuDNNBatchNormGradientOp<Context>::TrainingRunWithType() {
typedef typename CUDNNType<T>::BNParamType BNParamType;
// determine the bn desc
if (Input(0).ndim() == 2) {
bn_mode = CUDNN_BATCHNORM_PER_ACTIVATION;
......@@ -218,14 +225,14 @@ void CuDNNBatchNormGradientOp<Context>::TrainingRunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Xdata = Input(0).template data<T, Context>();
auto* Sdata = Input(3).template data<T, Context>();
auto* dSdata = Output(1)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* Tmean = mean->template data<T, Context>();
auto* Tvar = var->template data<T, Context>();
auto* Sdata = Input(3).template data<BNParamType, Context>();
auto* dSdata = Output(1)->template mutable_data<BNParamType, Context>();
auto* dBdata = Output(2)->template mutable_data<BNParamType, Context>();
auto* Tmean = mean->template data<BNParamType, Context>();
auto* Tvar = var->template data<BNParamType, Context>();
CUDNN_CHECK(cudnnBatchNormalizationBackward(
ctx().cudnn_handle(), bn_mode,
ctx()->cudnn_handle(), bn_mode,
CUDNNType<T>::one, CUDNNType<T>::zero,
CUDNNType<T>::one, CUDNNType<T>::one,
output_desc, Xdata, input_desc, dYdata,
......@@ -256,16 +263,16 @@ void CuDNNBatchNormGradientOp<Context>::InferenceRunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
}
......@@ -275,12 +282,12 @@ void CuDNNBatchNormGradientOp<Context>::InferenceRunWithType() {
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
// compute stddev
ctx().template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
math::AddScalar<T, Context>(var->count(), this->eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
ctx()->template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
math::AddScalar<T, Context>(var->count(), this->eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide scale by stddev
math::Div<T, Context>(var->count(), Sdata, Tvar, Tvar);
math::Div<T, Context>(var->count(), Sdata, Tvar, Tvar, ctx());
    // compute dE/dY \cdot (scale / std(X))
if (data_format == "NCHW") {
......@@ -288,20 +295,21 @@ void CuDNNBatchNormGradientOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), dYdata, WSdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, WSdata, dXdata, ctx());
}
}
......@@ -314,8 +322,10 @@ void CuDNNBatchNormGradientOp<Context>::RunOnDevice() {
if (this->use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (this->use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
if (this->use_global_stats) {
// fp16 is disabled during inference
LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
} else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
#else
if (XIsType(Input(0), float)) {
......@@ -325,7 +335,10 @@ void CuDNNBatchNormGradientOp<Context>::RunOnDevice() {
#endif
}
REGISTER_CUDNN_OPERATOR(FusedBatchNormGradient, CuDNNBatchNormGradientOp<CUDAContext>);
REGISTER_CUDNN_OPERATOR(
FusedBatchNormGradient,
CuDNNBatchNormGradientOp<CUDAContext>
);
INSTANTIATE_CUDNN_OPERATOR(BatchNormGradient);
} // namespace dragon
......
......@@ -24,23 +24,23 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, Xdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, Xdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
}
// subtract mean
......@@ -49,51 +49,51 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
}
// compute moving average
if (!is_recomputing) {
// History(X) = (1 - momentum) * Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean->count(),
1.0 - momentum, Tmean, momentum, Hmean, &ctx());
1.0 - momentum, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
1.0 - momentum, Tvar, momentum, Hvar, &ctx());
1.0 - momentum, Tvar, momentum, Hvar, ctx());
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -101,24 +101,25 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// store x_norm for backward
auto* XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), XNorm_data, Ydata);
// scale
......@@ -127,20 +128,21 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// shift
if (data_format == "NCHW") {
......@@ -148,18 +150,18 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Bdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -182,9 +184,9 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx().template Copy<T, Context, Context>(mean->count(), Tmean, Hmean);
ctx().template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
ctx()->template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(mean->count(), Tmean, Hmean);
ctx()->template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
// subtract mean
if (data_format == "NCHW") {
......@@ -192,23 +194,23 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -216,20 +218,21 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// scale
if (data_format == "NCHW") {
......@@ -237,20 +240,21 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// shift
if (data_format == "NCHW") {
......@@ -258,18 +262,18 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Bdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -312,10 +316,7 @@ void FusedBatchNormOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
......@@ -341,21 +342,22 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
// gradient w.r.t. scale
if (Output(1)->name() != "ignore") {
auto* dSdata = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(x_norm->count(), XNorm_data, dYdata, WSdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, dYdata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dSdata, &ctx());
1.0, dSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, WSdata, MXmult,
1.0, dSdata, &ctx());
1.0, dSdata, ctx());
}
}
......@@ -366,16 +368,16 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
}
......@@ -387,37 +389,39 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(x_norm->count(), WSdata, dYdata, WSdata);
math::Mul<T, Context>(x_norm->count(),
WSdata, dYdata, WSdata, ctx());
// sum of x_hat * (dl / dx_hat)
math::Mul<T, Context>(x_norm->count(), XNorm_data, WSdata, dXdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, WSdata, dXdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dXdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dXdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
}
// x_hat times the sum
......@@ -426,54 +430,55 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tmean,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
math::Mul<T, Context>(x_norm->count(), XNorm_data, dXdata, dXdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, dXdata, dXdata, ctx());
// subtract the average of x_hat times the sum
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, WSdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tmean,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
}
math::Axpby<T, Context>(x_norm->count(),
1.0, WSdata, -1.0 / NS, dXdata, &ctx());
1.0, WSdata, -1.0 / NS, dXdata, ctx());
// multiply with the inverse std
if (data_format == "NCHW") {
......@@ -481,21 +486,22 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
// divide by stddev
math::Div<T, Context>(x_norm->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(x_norm->count(),
dXdata, WSdata, dXdata, ctx());
}
}
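// A per-channel sketch of the gradient the reductions above assemble, assuming the
// standard batch-norm backward formula (names are illustrative; M = N * S elements
// reduced per channel, gamma_over_std = scale / sqrt(var + eps)):
//   dX = (gamma / std) * (dY - mean(dY) - x_hat * mean(dY * x_hat))
#include <cstddef>
inline void BatchNormBackwardChannel(
    size_t M, float gamma_over_std,
    const float* dy, const float* x_hat, float* dx) {
    double sum_dy = 0.0, sum_dy_xhat = 0.0;
    for (size_t i = 0; i < M; ++i) {
        sum_dy += dy[i];
        sum_dy_xhat += dy[i] * x_hat[i];
    }
    const double mean_dy = sum_dy / M;
    const double mean_dy_xhat = sum_dy_xhat / M;
    for (size_t i = 0; i < M; ++i)
        dx[i] = (float)(gamma_over_std *
            (dy[i] - mean_dy - x_hat[i] * mean_dy_xhat));
}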
......@@ -519,16 +525,16 @@ void FusedBatchNormGradientOp<Context>::InferenceRunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
}
......@@ -538,7 +544,7 @@ void FusedBatchNormGradientOp<Context>::InferenceRunWithType() {
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
// divide scale by stddev
math::Div<T, Context>(var->count(), Sdata, Tvar, Tvar);
math::Div<T, Context>(var->count(), Sdata, Tvar, Tvar, ctx());
// compute dE/dY \cdot (scale / std(X))
if (data_format == "NCHW") {
......@@ -546,20 +552,21 @@ void FusedBatchNormGradientOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), dYdata, WSdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, WSdata, dXdata, ctx());
}
}
......@@ -599,10 +606,7 @@ void FusedBatchNormGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(FusedBatchNormGradient);
......
......@@ -21,14 +21,14 @@ void FusedGroupNormOp<Context>::RunWithType() {
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0 / CGS, Xdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
......@@ -39,26 +39,26 @@ void FusedGroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
-1.0, Tmean, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0 / CGS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -66,15 +66,16 @@ void FusedGroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// store x_norm for backward
auto* XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), XNorm_data, Ydata);
// scale
......@@ -83,20 +84,21 @@ void FusedGroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// shift
if (data_format == "NCHW") {
......@@ -104,18 +106,18 @@ void FusedGroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Bdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -157,8 +159,7 @@ void FusedGroupNormOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
......@@ -184,21 +185,22 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
// gradient w.r.t. scale
if (Output(1)->name() != "ignore") {
auto* dSdata = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(x_norm->count(), XNorm_data, dYdata, WSdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, dYdata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dSdata, &ctx());
1.0, dSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, WSdata, MXmult,
1.0, dSdata, &ctx());
1.0, dSdata, ctx());
}
}
......@@ -209,16 +211,16 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
}
......@@ -230,28 +232,30 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(x_norm->count(), WSdata, dYdata, WSdata);
math::Mul<T, Context>(x_norm->count(),
WSdata, dYdata, WSdata, ctx());
// sum of x_hat * (dl / dx_hat)
math::Mul<T, Context>(x_norm->count(), XNorm_data, WSdata, dXdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, WSdata, dXdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0, dXdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
......@@ -262,28 +266,29 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tmean, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
math::Mul<T, Context>(x_norm->count(), XNorm_data, dXdata, dXdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, dXdata, dXdata, ctx());
// subtract the average of x_hat times the sum
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0, WSdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tmean, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
math::Axpby<T, Context>(x_norm->count(),
1.0, WSdata, -1.0 / CGS, dXdata, &ctx());
1.0, WSdata, -1.0 / CGS, dXdata, ctx());
// multiply with the inverse std
if (data_format == "NCHW") {
......@@ -291,12 +296,13 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// divide by stddev
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
}
}
......@@ -337,8 +343,7 @@ void FusedGroupNormGradientOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(FusedGroupNormGradient);
......
......@@ -15,14 +15,14 @@ void GroupNormOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0 / CGS, Xdata, MXmult,
0, Tmean, &ctx());
0, Tmean, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
......@@ -33,26 +33,26 @@ void GroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
-1.0, Tmean, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0 / CGS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -60,11 +60,12 @@ void GroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
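// A standalone sketch of the normalization above for one (sample, group) slice of
// CGS = (C / G) * S contiguous values (NCHW layout), using the biased estimate
// VAR(X) = E((X - EX)^2) as noted in the comments; names are illustrative.
#include <cmath>
#include <cstddef>
inline void GroupNormSlice(size_t CGS, float eps, const float* x, float* y) {
    double mean = 0.0;
    for (size_t i = 0; i < CGS; ++i) mean += x[i];
    mean /= CGS;
    double var = 0.0;
    for (size_t i = 0; i < CGS; ++i) {
        const double d = x[i] - mean;
        var += d * d;
    }
    var /= CGS;
    const float inv_std = 1.f / std::sqrt((float)var + eps);
    for (size_t i = 0; i < CGS; ++i)
        y[i] = (float)((x[i] - mean) * inv_std);
}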
template <class Context>
......@@ -102,8 +103,7 @@ void GroupNormOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(GroupNorm);
......@@ -127,43 +127,45 @@ void GroupNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
auto* Ydata = Input(1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), Ydata, dYdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dYdata, dXdata, ctx());
// sum(dE/dY \cdot Y)
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0, dXdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(Output(0)->count(), Ydata, dXdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dXdata, dXdata, ctx());
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0, dYdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
......@@ -171,10 +173,11 @@ void GroupNormGradientOp<Context>::RunWithType() {
// dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(Output(0)->count(),
1.0, dYdata, -1.0 / CGS, dXdata, &ctx());
1.0, dYdata, -1.0 / CGS, dXdata, ctx());
// divide by stddev
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
}
template <class Context>
......@@ -210,8 +213,7 @@ void GroupNormGradientOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(GroupNormGradient);
......
......@@ -14,14 +14,14 @@ void InstanceNormOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / S, Xdata, Smult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
auto* x = Xdata;
auto* tm = Tmean;
......@@ -29,7 +29,7 @@ void InstanceNormOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasTrans, S, C,
1.0 / S, x, Smult,
0.0, tm, &ctx());
0.0, tm, ctx());
x += CS;
tm += C;
}
......@@ -41,7 +41,7 @@ void InstanceNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, Tmean, Smult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
auto* y = Ydata;
auto* tm = Tmean;
......@@ -50,7 +50,7 @@ void InstanceNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
S, C, 1,
-1.0, Smult, tm,
1.0, y, &ctx());
1.0, y, ctx());
y += CS;
tm += C;
}
......@@ -58,12 +58,12 @@ void InstanceNormOp<Context>::RunWithType() {
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / S, WSdata, Smult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
auto* x2 = WSdata;
auto* tv = Tvar;
......@@ -71,15 +71,15 @@ void InstanceNormOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasTrans, S, C,
1.0 / S, x2, Smult,
0.0, tv, &ctx());
0.0, tv, ctx());
x2 += CS;
tv += C;
}
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -87,7 +87,7 @@ void InstanceNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, Tvar, Smult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
auto* std = WSdata;
auto* tv = Tvar;
......@@ -96,12 +96,13 @@ void InstanceNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
S, C, 1,
1.0, Smult, tv,
0.0, std, &ctx());
0.0, std, ctx());
std += CS;
tv += C;
}
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
template <class Context>
......@@ -133,8 +134,7 @@ void InstanceNormOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(InstanceNorm);
......@@ -157,7 +157,7 @@ void InstanceNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, Tvar, Smult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
auto* std = WSdata;
auto* tv = Tvar;
......@@ -166,26 +166,27 @@ void InstanceNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
S, C, 1,
1.0, Smult, tv,
0.0, std, &ctx());
0.0, std, ctx());
std += CS;
tv += C;
}
}
auto* Ydata = Input(-2).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), Ydata, dYdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dYdata, dXdata, ctx());
// sum(dE/dY \cdot Y)
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dXdata, Smult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, Tvar, Smult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
for (int i = 0; i < N; i++) {
auto* dx = dXdata;
......@@ -194,12 +195,12 @@ void InstanceNormGradientOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasTrans, S, C,
1.0, dx, Smult,
0, tv, &ctx());
0, tv, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
S, C, 1,
1.0, Smult, tv,
0.0, dx, &ctx());
0.0, dx, ctx());
dx += CS;
tv += C;
}
......@@ -207,19 +208,20 @@ void InstanceNormGradientOp<Context>::RunWithType() {
}
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(Output(0)->count(), Ydata, dXdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dXdata, dXdata, ctx());
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, Smult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, Tvar, Smult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
for (int i = 0; i < N; i++) {
auto* dy = dYdata;
......@@ -229,12 +231,12 @@ void InstanceNormGradientOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasTrans, S, C,
1.0, dy, Smult,
0, tv, &ctx());
0, tv, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
S, C, 1,
1.0, Smult, tv,
1.0, dx, &ctx());
1.0, dx, ctx());
dy += CS;
dx += CS;
tv += C;
......@@ -245,10 +247,11 @@ void InstanceNormGradientOp<Context>::RunWithType() {
// dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(Output(0)->count(),
1.0, dYdata, -1.0 / S, dXdata, &ctx());
1.0, dYdata, -1.0 / S, dXdata, ctx());
// divide by stddev
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
}
template <class Context>
......@@ -279,8 +282,7 @@ void InstanceNormGradientOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(InstanceNormGradient);
......
......@@ -24,35 +24,28 @@ void L2NormOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Bdata = ws()->template caches<T, Context>({ buffer.count() })[0];
auto* Ndata = norm->template mutable_data<T, Context>();
math::Set<T, Context>(norm->count(), dragon_cast<T, float>(eps), Ndata);
math::Set<T, Context>(norm->count(),
dragon_cast<T, float>(eps), Ndata, ctx());
for (int n = 0; n < outer_dim; n++) {
if (across_inner) {
auto* Ndata_ = norm->template mutable_data<float, CPUContext>();
float sum_of_sqr = math::Dot<T, Context>(
buffer.count(), Xdata, Xdata, &ctx());
if (mode == "MEAN") sum_of_sqr = sum_of_sqr / dim;
Ndata_[n] = pow(sum_of_sqr + eps, 0.5);
math::Scale<T, Context>(buffer.count(),
1.0 / Ndata_[n], Xdata, Ydata, &ctx());
} else {
math::Square<T, Context>(buffer.count(), Xdata, Bdata);
// compute T1 = \sum_{i} x_{i,j}^{2}
math::Gemv<T, Context>(
CblasTrans, dim, inner_dim,
mode == "MEAN" ? 1.0 / dim : 1.0, Bdata, Dmult,
1.0, Ndata, &ctx());
// compute T2 = \sqrt{T1}
math::Sqrt<T, Context>(inner_dim, Ndata, Ndata);
// compute T3 = x / [(T2)]_{dim}
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, Ndata,
0.0, Bdata, &ctx());
math::Div<T, Context>(buffer.count(), Xdata, Bdata, Ydata);
Ndata += inner_dim;
}
math::Square<T, Context>(buffer.count(),
Xdata, Bdata, ctx());
// compute T1 = \sum_{i} x_{i,j}^{2}
math::Gemv<T, Context>(
CblasTrans, dim, inner_dim,
mode == "MEAN" ? 1.0 / dim : 1.0, Bdata, Dmult,
1.0, Ndata, ctx());
// compute T2 = \sqrt{T1}
math::Sqrt<T, Context>(inner_dim, Ndata, Ndata, ctx());
// compute T3 = x / [(T2)]_{dim}
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, Ndata,
0.0, Bdata, ctx());
math::Div<T, Context>(buffer.count(),
Xdata, Bdata, Ydata, ctx());
Ndata += inner_dim;
Xdata += buffer.count();
Ydata += buffer.count();
}
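// A plain-CPU sketch of the normalization above for one outer slice laid out as
// (dim, inner_dim), matching T1/T2/T3 in the comments: per inner position j,
//   norm_j = sqrt(eps + reduce_i x_{i,j}^2),  y_{i,j} = x_{i,j} / norm_j,
// where reduce is the sum, or the mean when mode == "MEAN". Names are illustrative.
#include <cmath>
#include <cstddef>
inline void L2NormSlice(
    size_t dim, size_t inner_dim, float eps, bool mean_mode,
    const float* x, float* y) {
    for (size_t j = 0; j < inner_dim; ++j) {
        double sum_sqr = 0.0;
        for (size_t i = 0; i < dim; ++i)
            sum_sqr += double(x[i * inner_dim + j]) * x[i * inner_dim + j];
        if (mean_mode) sum_sqr /= dim;
        const float norm = std::sqrt(eps + (float)sum_sqr);
        for (size_t i = 0; i < dim; ++i)
            y[i * inner_dim + j] = x[i * inner_dim + j] / norm;
    }
}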
......@@ -70,8 +63,6 @@ void L2NormOp<Context>::RunOnDevice() {
outer_dim = Input(0).count(0, axis);
dim = Input(0).count(axis, axis + num_axes);
inner_dim = Input(0).count(axis + num_axes);
if (inner_dim == 1) across_inner = true;
else across_inner = false;
Output(0)->ReshapeLike(Input(0));
......@@ -96,8 +87,8 @@ void L2NormGradientOp<Context>::RunWithType() {
for (int i = 0; i < axis; i++) dims[i] = 1;
buffer.Reshape(dims);
buffer_inner.Reshape({ inner_dim });
vector<T*> BSdata = ws()->template caches<T, Context>({
buffer.count(), buffer_inner.count() });
vector<T*> BSdata = ws()->template caches<T, Context>(
{ buffer.count(), buffer_inner.count() });
auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
......@@ -106,48 +97,42 @@ void L2NormGradientOp<Context>::RunWithType() {
auto* Bdata = BSdata[0], *BInnerdata = BSdata[1];
for (int n = 0; n < outer_dim; n++) {
if (across_inner) {
Ndata = norm->template data<T, CPUContext>();
T sum_of_x_mul_dy = math::Dot<T, Context>(
buffer.count(), Xdata, dYdata, &ctx());
if (mode == "MEAN") sum_of_x_mul_dy = sum_of_x_mul_dy / dim;
math::Scale<T, Context>(buffer.count(),
sum_of_x_mul_dy / Ndata[n] / Ndata[n], Xdata, dXdata, &ctx());
math::Sub<T, Context>(buffer.count(), dYdata, dXdata, dXdata);
math::Scal<T, Context>(buffer.count(),
T(1.0 / Ndata[n]), dXdata, &ctx());
} else {
// compute \sum_{i} x_{i, j}dy_{i, j}
math::Mul<T, Context>(buffer.count(), Xdata, dYdata, Bdata);
math::Gemv<T, Context>(
CblasTrans, dim, inner_dim,
mode == "MEAN" ? 1.0 / dim : 1.0, Bdata, Dmult,
0.0, BInnerdata, &ctx());
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, BInnerdata,
0.0, Bdata, &ctx());
math::Mul<T, Context>(buffer.count(), Xdata, Bdata, dXdata);
// compute T2 = T1 / Normalizer^{2}
math::Pow<T, Context>(inner_dim, 2.0, Ndata, BInnerdata);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, BInnerdata,
0.0, Bdata, &ctx());
math::Div<T, Context>(buffer.count(), dXdata, Bdata, dXdata);
// compute T3 = (dy - T2) / Normalizer
math::Sub<T, Context>(buffer.count(), dYdata, dXdata, dXdata);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, Ndata,
0.0, Bdata, &ctx());
math::Div<T, Context>(buffer.count(), dXdata, Bdata, dXdata);
Ndata += inner_dim;
}
// compute \sum_{i} x_{i, j}dy_{i, j}
math::Mul<T, Context>(buffer.count(),
Xdata, dYdata, Bdata, ctx());
math::Gemv<T, Context>(
CblasTrans, dim, inner_dim,
mode == "MEAN" ? 1.0 / dim : 1.0, Bdata, Dmult,
0.0, BInnerdata, ctx());
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, BInnerdata,
0.0, Bdata, ctx());
math::Mul<T, Context>(buffer.count(),
Xdata, Bdata, dXdata, ctx());
// compute T2 = T1 / Normalizer^{2}
math::Pow<T, Context>(inner_dim,
2.0, Ndata, BInnerdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, BInnerdata,
0.0, Bdata, ctx());
math::Div<T, Context>(buffer.count(),
dXdata, Bdata, dXdata, ctx());
// compute T3 = (dy - T2) / Normalizer
math::Sub<T, Context>(buffer.count(),
dYdata, dXdata, dXdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, Ndata,
0.0, Bdata, ctx());
math::Div<T, Context>(buffer.count(),
dXdata, Bdata, dXdata, ctx());
Ndata += inner_dim;
Xdata += buffer.count();
dYdata += buffer.count();
dXdata += buffer.count();
......@@ -166,8 +151,6 @@ void L2NormGradientOp<Context>::RunOnDevice() {
outer_dim = Input(0).count(0, axis);
dim = Input(0).count(axis, axis + num_axes);
inner_dim = Input(0).count(axis + num_axes);
if (inner_dim == 1) across_inner = true;
else across_inner = false;
Output(0)->ReshapeLike(Input(0));
......
......@@ -23,20 +23,20 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
if (!states_initialized) {
states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size));
ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" +
dragon_cast<string, unsigned long long>(random_seed) + "/states");
if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), dropout_ratio,
dropout_desc, ctx()->cudnn_handle(), dropout_ratio,
Sdata, states_size, random_seed));
} else {
states->Reshape({ (TIndex)states_size });
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnSetDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), dropout_ratio,
dropout_desc, ctx()->cudnn_handle(), dropout_ratio,
Sdata, states_size, random_seed));
}
}
......@@ -48,7 +48,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// setup rnn
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetRNNDescriptor(
ctx().cudnn_handle(), rnn_desc,
ctx()->cudnn_handle(), rnn_desc,
hidden_size, num_layers,
dropout_desc,
rnn_input_mode, rnn_direction, rnn_mode,
......@@ -68,7 +68,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
xs_desc->Set<T>({ batch_size, input_dim, 1 }, { input_dim, 1, 1 });
ys_desc.reset(new cudnnTensorDescriptors(seq_length));
ys_desc->Set<T>({ batch_size, output_dim, 1 }, { output_dim, 1, 1 });
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx().cudnn_handle(),
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx()->cudnn_handle(),
rnn_desc, seq_length, xs_desc->descs(), &workspace_size));
output_dims = { seq_length, batch_size, output_dim };
......@@ -82,7 +82,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// setup packed weights
size_t weights_size; TIndex weights_count;
CUDNN_CHECK(cudnnGetRNNParamsSize(
ctx().cudnn_handle(), rnn_desc, xs_desc->descs()[0],
ctx()->cudnn_handle(), rnn_desc, xs_desc->descs()[0],
&weights_size, CUDNNType<T>::type));
weights_count = (TIndex)weights_size / sizeof(T);
CHECK_EQ(weights_count, Input(1).count())
......@@ -96,7 +96,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// setup rnn workspace
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(
ctx().cudnn_handle(), rnn_desc, seq_length,
ctx()->cudnn_handle(), rnn_desc, seq_length,
xs_desc->descs(), &workspace_size));
}
......@@ -122,7 +122,7 @@ void CuDNNRecurrentOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
auto handle = ctx().cudnn_handle();
auto handle = ctx()->cudnn_handle();
if (phase() == "TRAIN") {
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(handle,
......@@ -157,8 +157,12 @@ void CuDNNRecurrentOp<Context>::RunWithType() {
template <class Context>
void CuDNNRecurrentOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......@@ -182,7 +186,7 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
// check the reserve space
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx().cudnn_handle(),
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx()->cudnn_handle(),
rnn_desc, seq_length, xs_desc->descs(), &reserve_size));
auto* reserveT = ws()->GetTensor("/mnt/" + anchor() + "/rnn/reserve");
CHECK_EQ(reserve_size, reserveT->nbytes());
......@@ -192,7 +196,7 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
auto* RSdata = reserveT->template data<uint8_t, Context>();
#endif
auto handle = ctx().cudnn_handle();
auto handle = ctx()->cudnn_handle();
if (Output(0)->name() != "ignore" ||
Output(1)->name() != "ignore" ||
......@@ -228,13 +232,17 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
template <class Context>
void CuDNNRecurrentGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
Output(0)->ReshapeLike(Input(0)); // dX
Output(1)->ReshapeLike(Input(1)); // dW
Output(2)->ReshapeLike(Input(2)); // dHx
Output(3)->ReshapeLike(Input(3)); // dCx
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -14,7 +14,7 @@ void LSTMCellOp<Context>::RunWithType() {
kernel::LSTMCell<T, Context>(Input(1).count(), Input(1).dim(0),
Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
CXdata, XAdata, Cdata, Hdata);
CXdata, XAdata, Cdata, Hdata, ctx());
}
template <class Context>
......@@ -44,7 +44,7 @@ void LSTMCellGradientOp<Context>::RunWithType() {
kernel::LSTMCellGrad<T, Context>(Input(1).count(), Input(1).dim(0),
Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
CXdata, XAdata, Cdata, dCdata, dHdata, dCXdata, dXdata);
CXdata, XAdata, Cdata, dCdata, dHdata, dCXdata, dXdata, ctx());
}
template <class Context>
......
......@@ -30,7 +30,7 @@ void RNNParamSetOp<Context>::RunWithType() {
<< "\nExcepted the size of param is " << size
<< ", but got " << Input(0).count();
offset += param_type == "bias" ? matrix_count : 0;
ctx().template Copy<T, Context, Context>(size, Wdata + offset, Pdata);
ctx()->template Copy<T, Context, Context>(size, Wdata + offset, Pdata);
}
template <class Context>
......
......@@ -5,7 +5,7 @@
namespace dragon {
template <class Context>
void AdamUpdateOp<Context>::ComputeRunWithFloat() {
void AdamUpdateOp<Context>::ComputeRunWithFloat32() {
Tensor* m = ws()->CreateTensor("/mnt/" + Slot() + "/adam/m");
Tensor* v = ws()->CreateTensor("/mnt/" + Slot() + "/adam/v");
m->ReshapeLike(Input(0));
......@@ -16,12 +16,11 @@ void AdamUpdateOp<Context>::ComputeRunWithFloat() {
float coeff = sqrt(1. - pow(beta2, t)) / (1. - pow(beta1, t));
lr = Param("base_lr") * coeff * this->lr_mult;
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Mdata = m->mutable_data<float, Context>();
auto* Vdata = v->mutable_data<float, Context>();
auto* Mdata = m->mutable_data<float, Context>(ctx());
auto* Vdata = v->mutable_data<float, Context>(ctx());
kernel::AdamUpdate<float, Context>(
Input(0).count(), lr, beta1, beta2, eps,
dXdata, Mdata, Vdata);
kernel::AdamUpdate<float, Context>(Input(0).count(),
lr, beta1, beta2, eps, dXdata, Mdata, Vdata, ctx());
}
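// A sketch of the element-wise recurrence kernel::AdamUpdate is assumed to apply,
// with the bias correction already folded into lr above
// (coeff = sqrt(1 - beta2^t) / (1 - beta1^t)); the result is written back into the
// gradient buffer, which UpdateRunWithFloat32 later subtracts from the weights.
// Names are illustrative.
#include <cmath>
#include <cstddef>
inline void AdamUpdateSketch(
    size_t n, float lr, float beta1, float beta2, float eps,
    float* g /* in: gradient, out: update */, float* m, float* v) {
    for (size_t i = 0; i < n; ++i) {
        m[i] = beta1 * m[i] + (1.f - beta1) * g[i];
        v[i] = beta2 * v[i] + (1.f - beta2) * g[i] * g[i];
        g[i] = lr * m[i] / (std::sqrt(v[i]) + eps);
    }
}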
template <class Context>
......@@ -35,13 +34,19 @@ void AdamUpdateOp<Context>::ComputeRunWithFloat16() {
beta1 = Param("beta1"), beta2 = Param("beta2"), eps = Param("eps");
float coeff = sqrt(1. - pow(beta2, t)) / (1. - pow(beta1, t));
lr = Param("base_lr") * coeff * this->lr_mult;
auto* dXdata = Input(0).template mutable_data<float16, Context>();
auto* Mdata = m->mutable_data<float16, Context>();
auto* Vdata = v->mutable_data<float16, Context>();
kernel::AdamUpdate<float16, Context>(
Input(0).count(), lr, beta1, beta2, eps,
dXdata, Mdata, Vdata);
auto* dX32T = ws()->CreateTensor(Input(0).name() + "/f32");
dX32T->ReshapeLike(Input(0));
auto* dX32 = dX32T->template mutable_data<float, Context>();
auto* dX16 = Input(0).template mutable_data<float16, Context>();
auto* M32 = m->mutable_data<float, Context>(ctx());
auto* V32 = v->mutable_data<float, Context>(ctx());
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), dX16, dX32, ctx());
kernel::AdamUpdate<float, Context>(Input(0).count(),
lr, beta1, beta2, eps, dX32, M32, V32, ctx());
}
DEPLOY_CPU(AdamUpdate);
......
......@@ -32,149 +32,175 @@ void CollectiveUpdateOp<Context>::InitNCCL() {
if (comm_rank == comm_root) NCCL_CHECK(ncclGetUniqueId(&id));
MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, comm_root, comm);
NCCL_CHECK(ncclCommInitRank(&nccl_comm, comm_size, id, comm_rank));
closure = CUDAClosure<Context>(&ctx());
closure = CUDAClosure<Context>(ctx());
#else
LOG(FATAL) << "NCCL was not compiled.";
#endif
}
template <class Context>
void CollectiveUpdateOp<Context>::MPIAllReduceWithFloat() {
for (int j = 0; j < InputSize(); j++) {
TIndex count = Input(j).count();
MPI_Request recv_req;
TIndex segment_size = count / comm_size;
TIndex residual = count % comm_size;
vector<TIndex> segment_sizes(comm_size, segment_size);
for (int i = 0; i < residual; i++) segment_sizes[i]++;
vector<TIndex> segment_ends(comm_size);
segment_ends[0] = segment_sizes[0];
for (int i = 1; i < segment_ends.size(); i++)
segment_ends[i] = segment_sizes[i] + segment_ends[i - 1];
template <class Context> template <typename T>
void CollectiveUpdateOp<Context>::MPIAllReduce(
Tensor* tensor,
MPI_Datatype dtype) {
TIndex count = tensor->count();
MPI_Request recv_req;
TIndex segment_size = count / comm_size;
TIndex residual = count % comm_size;
vector<TIndex> segment_sizes(comm_size, segment_size);
for (int i = 0; i < residual; i++) segment_sizes[i]++;
vector<TIndex> segment_ends(comm_size);
segment_ends[0] = segment_sizes[0];
for (int i = 1; i < segment_ends.size(); i++)
segment_ends[i] = segment_sizes[i] + segment_ends[i - 1];
#ifdef WITH_MPI_CUDA
auto* WSdata = ws()->template caches<float, Context>({ segment_sizes[0] })[0];
auto* dXdata = Input(j).template mutable_data<float, Context>();
auto* WSdata = ws()->template caches<T, Context>({ segment_sizes[0] })[0];
auto* dXdata = tensor->template mutable_data<T, Context>();
#else
auto* WSdata = ws()->template caches<float, CPUContext>({ segment_sizes[0] })[0];
auto* dXdata = Input(j).template mutable_data<float, CPUContext>();
auto* WSdata = ws()->template caches<T, CPUContext>({ segment_sizes[0] })[0];
auto* dXdata = tensor->template mutable_data<T, CPUContext>();
#endif // WITH_MPI_CUDA
int recv_from = (comm_rank - 1 + comm_size) % comm_size;
int send_to = (comm_rank + 1) % comm_size;
// scatter-reduce
for (int i = 0; i < comm_size - 1; i++) {
int recv_chunk = (comm_rank - i - 1 + comm_size) % comm_size;
int send_chunk = (comm_rank - i + comm_size) % comm_size;
auto* segment_send = &(dXdata[
segment_ends[send_chunk] - segment_sizes[send_chunk]
]);
MPI_Irecv(WSdata, segment_sizes[recv_chunk],
MPI_FLOAT, recv_from, 0, comm, &recv_req);
MPI_Send(segment_send, segment_sizes[send_chunk],
MPI_FLOAT, send_to, 0, comm);
auto* segment_update = &(dXdata[
segment_ends[recv_chunk] - segment_sizes[recv_chunk]
]);
MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
int recv_from = (comm_rank - 1 + comm_size) % comm_size;
int send_to = (comm_rank + 1) % comm_size;
// scatter-reduce
for (int i = 0; i < comm_size - 1; i++) {
int recv_chunk = (comm_rank - i - 1 + comm_size) % comm_size;
int send_chunk = (comm_rank - i + comm_size) % comm_size;
auto* segment_send = &(dXdata[
segment_ends[send_chunk] - segment_sizes[send_chunk]]);
MPI_Irecv(WSdata, segment_sizes[recv_chunk],
dtype, recv_from, 0, comm, &recv_req);
MPI_Send(segment_send, segment_sizes[send_chunk],
dtype, send_to, 0, comm);
auto* segment_update = &(dXdata[
segment_ends[recv_chunk] - segment_sizes[recv_chunk]]);
MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
#ifdef WITH_MPI_CUDA
math::Axpy<float, Context>(segment_sizes[recv_chunk],
1.0, WSdata, segment_update, &ctx());
ctx().FinishDeviceCompution();
math::Axpy<T, Context>(segment_sizes[recv_chunk],
1.0, WSdata, segment_update, ctx());
ctx()->FinishDeviceCompution();
#else
math::Axpy<float, CPUContext>(segment_sizes[recv_chunk],
1.0, WSdata, segment_update, &ctx());
math::Axpy<T, CPUContext>(segment_sizes[recv_chunk],
1.0, WSdata, segment_update, ctx());
#endif // WITH_MPI_CUDA
}
}
// allgather
for (int i = 0; i < comm_size - 1; i++) {
int send_chunk = (comm_rank - i + 1 + comm_size) % comm_size;
int recv_chunk = (comm_rank - i + comm_size) % comm_size;
auto* segment_send = &(dXdata[
segment_ends[send_chunk] - segment_sizes[send_chunk]
]);
auto* segment_recv = &(dXdata[
segment_ends[recv_chunk] - segment_sizes[recv_chunk]
]);
MPI_Sendrecv(segment_send, segment_sizes[send_chunk],
MPI_FLOAT, send_to, 0,
segment_recv, segment_sizes[recv_chunk],
MPI_FLOAT, recv_from, 0,
comm, MPI_STATUS_IGNORE);
}
// allgather
for (int i = 0; i < comm_size - 1; i++) {
int send_chunk = (comm_rank - i + 1 + comm_size) % comm_size;
int recv_chunk = (comm_rank - i + comm_size) % comm_size;
auto* segment_send = &(dXdata[
segment_ends[send_chunk] - segment_sizes[send_chunk]]);
auto* segment_recv = &(dXdata[
segment_ends[recv_chunk] - segment_sizes[recv_chunk]]);
MPI_Sendrecv(segment_send, segment_sizes[send_chunk],
dtype, send_to, 0, segment_recv, segment_sizes[recv_chunk],
dtype, recv_from, 0, comm, MPI_STATUS_IGNORE);
}
// normalization
if (comm_size > 1) {
// normalization
if (comm_size > 1) {
#ifdef WITH_MPI_CUDA
math::Scal<float, Context>(count,
1.f / comm_size, dXdata, &ctx());
math::Scal<T, Context>(count, 1.f / comm_size, dXdata, ctx());
#else
math::Scal<float, CPUContext>(count,
1.f / comm_size, dXdata, &ctx());
math::Scal<T, CPUContext>(count, 1.f / comm_size, dXdata, ctx());
#endif // WITH_MPI_CUDA
}
}
}
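// A single-process sketch of the ring all-reduce realized above with
// MPI_Irecv/MPI_Send: the buffer is split into comm_size segments; comm_size - 1
// scatter-reduce steps leave each rank owning the full sum of one segment, and
// comm_size - 1 all-gather steps circulate those reduced segments to every rank.
// Names are illustrative; the operator's normalization step then divides by
// comm_size to turn the sum into an average.
#include <vector>
#include <cstddef>
inline void RingAllReduceSim(std::vector<std::vector<float>>& bufs) {
    const int P = (int)bufs.size();
    const size_t count = bufs[0].size();
    std::vector<size_t> seg_size(P, count / P), seg_end(P);
    for (size_t i = 0; i < count % P; ++i) seg_size[i]++;
    seg_end[0] = seg_size[0];
    for (int i = 1; i < P; ++i) seg_end[i] = seg_end[i - 1] + seg_size[i];
    // scatter-reduce: at step i, rank r accumulates chunk (r - i - 1) from its left neighbor
    for (int i = 0; i < P - 1; ++i)
        for (int r = 0; r < P; ++r) {
            const int left = (r - 1 + P) % P;
            const int chunk = (r - i - 1 + P) % P;
            const size_t off = seg_end[chunk] - seg_size[chunk];
            for (size_t k = 0; k < seg_size[chunk]; ++k)
                bufs[r][off + k] += bufs[left][off + k];
        }
    // all-gather: at step i, rank r copies the reduced chunk (r - i) from its left neighbor
    for (int i = 0; i < P - 1; ++i)
        for (int r = 0; r < P; ++r) {
            const int left = (r - 1 + P) % P;
            const int chunk = (r - i + P) % P;
            const size_t off = seg_end[chunk] - seg_size[chunk];
            for (size_t k = 0; k < seg_size[chunk]; ++k)
                bufs[r][off + k] = bufs[left][off + k];
        }
}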
template <class Context>
void CollectiveUpdateOp<Context>::NCCLAllReduceWithFloat() {
#ifdef WITH_MPI_NCCL
auto stream = closure.cuda_stream(0);
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
auto* dXdata = Input(i).template mutable_data<float, Context>();
NCCL_CHECK(ncclAllReduce((const void*)dXdata, (void*)dXdata,
count, ncclFloat, ncclSum, nccl_comm, stream));
}
closure.Sync();
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
auto* dXdata = Input(i).template mutable_data<float, Context>();
math::Scal<float, Context>(count, 1.f / comm_size, dXdata, &ctx());
}
#endif
}
template <class Context>
void CollectiveUpdateOp<Context>::MPIBcastWithFloat() {
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
template <class Context> template <typename T>
void CollectiveUpdateOp<Context>::MPIBcast(
Tensor* tensor,
MPI_Datatype dtype) {
TIndex count = tensor->count();
#ifdef WITH_MPI_CUDA
auto* dXdata = Input(i).template mutable_data<float, Context>();
auto* dXdata = tensor->template mutable_data<T, Context>();
#else
auto* dXdata = Input(i).template mutable_data<float, CPUContext>();
auto* dXdata = tensor->template mutable_data<T, CPUContext>();
#endif
MPI_Bcast(dXdata, count, MPI_FLOAT, comm_root, comm);
}
MPI_Bcast(dXdata, count, dtype, comm_root, comm);
}
template <class Context>
void CollectiveUpdateOp<Context>::NCCLBcastWithFloat() {
#ifdef WITH_MPI_NCCL
auto stream = closure.cuda_stream(0);
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
auto* dXdata = Input(i).template mutable_data<float, Context>();
NCCL_CHECK(ncclBcast((void*)dXdata, count,
ncclFloat, comm_root, nccl_comm, stream));
}
closure.Sync();
#endif
template <class Context> template <typename T>
void CollectiveUpdateOp<Context>::NCCLAllReduce(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream) {
TIndex count = tensor->count();
auto* dXdata = tensor->template mutable_data<T, Context>();
NCCL_CHECK(ncclAllReduce((const void*)dXdata, (void*)dXdata,
count, dtype, ncclSum, nccl_comm, stream));
}
template <class Context> template <typename T>
void CollectiveUpdateOp<Context>::NCCLBcast(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream) {
TIndex count = tensor->count();
auto* dXdata = tensor->template mutable_data<T, Context>();
NCCL_CHECK(ncclBcast((void*)dXdata,
count, dtype, comm_root, nccl_comm, stream));
}
#endif
template <class Context>
void CollectiveUpdateOp<Context>::RunOnDevice() {
if(XIsType(Input(0), float)) {
if (mode == "MPI_ALLREDUCE") {
MPIAllReduceWithFloat();
} else if (mode == "NCCL_ALLREDUCE") {
NCCLAllReduceWithFloat();
} else if (mode == "MPI_BCAST") {
MPIBcastWithFloat();
} else if (mode == "NCCL_BCAST") {
NCCLBcastWithFloat();
} else LOG(FATAL) << "Unsupported collective mode: " << mode;
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
if (mode == "MPI_ALLREDUCE") {
for (int i = 0; i < InputSize(); i++) {
if (XIsType(Input(i), float))
MPIAllReduce<float>(&Input(i), MPI_FLOAT);
else if (XIsType(Input(i), float16))
MPIAllReduce<float16>(&Input(i), MPI_UNSIGNED_SHORT);
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
} else if (mode == "MPI_BCAST") {
for (int i = 0; i < InputSize(); i++) {
if (XIsType(Input(i), float))
MPIBcast<float>(&Input(i), MPI_FLOAT);
else if (XIsType(Input(i), float16))
MPIBcast<float16>(&Input(i), MPI_UNSIGNED_SHORT);
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
}
#ifdef WITH_MPI_NCCL
else if (mode == "NCCL_ALLREDUCE") {
auto stream = closure.cuda_stream(1);
for (int i = 0; i < InputSize(); i++) {
if (XIsType(Input(i), float))
NCCLAllReduce<float>(&Input(i), ncclFloat, stream);
else if (XIsType(Input(i), float16))
NCCLAllReduce<float16>(&Input(i), ncclHalf, stream);
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
closure.Sync();
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
if (XIsType(Input(i), float)) {
auto* dXdata = Input(i).template mutable_data<float, Context>();
math::Scal<float, Context>(count, 1.f / comm_size, dXdata, ctx());
}
else if (XIsType(Input(i), float16)) {
auto* dXdata = Input(i).template mutable_data<float16, Context>();
math::Scal<float16, Context>(count, 1.f / comm_size, dXdata, ctx());
}
}
} else if (mode == "NCCL_BCAST") {
auto stream = closure.cuda_stream(1);
for (int i = 0; i < InputSize(); i++) {
if (XIsType(Input(i), float))
NCCLBcast<float>(&Input(i), ncclFloat, stream);
else if (XIsType(Input(i), float16))
NCCLBcast<float16>(&Input(i), ncclHalf, stream);
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
closure.Sync();
}
#endif
else LOG(FATAL) << "Unsupported collective mode: " << mode;
}
DEPLOY_CPU(CollectiveUpdate);
......
......@@ -8,7 +8,7 @@ void MovingAverageOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Axpby<T, Context>(Input(0).count(),
1.f - decay, Xdata, decay, Ydata, &ctx());
1.f - decay, Xdata, decay, Ydata, ctx());
}
template <class Context>
......
......@@ -6,16 +6,16 @@
namespace dragon {
template <class Context>
void NesterovUpdateOp<Context>::ComputeRunWithFloat() {
void NesterovUpdateOp<Context>::ComputeRunWithFloat32() {
Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/nesterov/h");
h->ReshapeLike(Input(0));
lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>(ctx());
kernel::NesterovUpdate<float, Context>(
Input(0).count(), lr, momentum, dXdata, Hdata);
Input(0).count(), lr, momentum, dXdata, Hdata, ctx());
}
template <class Context>
......@@ -24,11 +24,18 @@ void NesterovUpdateOp<Context>::ComputeRunWithFloat16() {
h->ReshapeLike(Input(0));
lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
auto* dXdata = Input(0).template mutable_data<float16, Context>();
auto* Hdata = h->template mutable_data<float16, Context>();
kernel::NesterovUpdate<float16, Context>(
Input(0).count(), lr, momentum, dXdata, Hdata);
auto* dX32T = ws()->CreateTensor(Input(0).name() + "/f32");
dX32T->ReshapeLike(Input(0));
auto* dX32 = dX32T->template mutable_data<float, Context>();
auto* dX16 = Input(0).template mutable_data<float16, Context>();
auto* H32 = h->template mutable_data<float, Context>(ctx());
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), dX16, dX32, ctx());
kernel::NesterovUpdate<float, Context>(
Input(0).count(), lr, momentum, dX32, H32, ctx());
}
DEPLOY_CPU(NesterovUpdate);
......
......@@ -5,17 +5,17 @@
namespace dragon {
template <class Context>
void RMSPropUpdateOp<Context>::ComputeRunWithFloat() {
void RMSPropUpdateOp<Context>::ComputeRunWithFloat32() {
Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/rmsprop/h");
h->ReshapeLike(Input(0));
lr = Param("base_lr") * this->lr_mult;
decay = Param("decay"), eps = Param("eps");
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>(ctx());
kernel::RMSPropUpdate<float, Context>(
Input(0).count(), lr, decay, eps, dXdata, Hdata);
Input(0).count(), lr, decay, eps, dXdata, Hdata, ctx());
}
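// A sketch of the recurrence kernel::RMSPropUpdate is assumed to apply; the update
// is written back into the gradient buffer and later subtracted from the weights.
// Names are illustrative.
#include <cmath>
#include <cstddef>
inline void RMSPropUpdateSketch(
    size_t n, float lr, float decay, float eps,
    float* g /* in: gradient, out: update */, float* h /* running mean square */) {
    for (size_t i = 0; i < n; ++i) {
        h[i] = decay * h[i] + (1.f - decay) * g[i] * g[i];
        g[i] = lr * g[i] / (std::sqrt(h[i]) + eps);
    }
}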
template <class Context>
......@@ -25,11 +25,18 @@ void RMSPropUpdateOp<Context>::ComputeRunWithFloat16() {
lr = Param("base_lr") * this->lr_mult;
decay = Param("decay"), eps = Param("eps");
auto* dXdata = Input(0).template mutable_data<float16, Context>();
auto* Hdata = h->template mutable_data<float16, Context>();
kernel::RMSPropUpdate<float16, Context>(
Input(0).count(), lr, decay, eps, dXdata, Hdata);
auto* dX32T = ws()->CreateTensor(Input(0).name() + "/f32");
dX32T->ReshapeLike(Input(0));
auto* dX32 = dX32T->template mutable_data<float, Context>();
auto* dX16 = Input(0).template mutable_data<float16, Context>();
auto* H32 = h->template mutable_data<float, Context>(ctx());
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), dX16, dX32, ctx());
kernel::RMSPropUpdate<float, Context>(
Input(0).count(), lr, decay, eps, dX32, H32, ctx());
}
DEPLOY_CPU(RMSPropUpdate);
......
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context>
void SGDUpdateOp<Context>::ComputeRunWithFloat() {
void SGDUpdateOp<Context>::ComputeRunWithFloat32() {
Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/sgd/h");
h->ReshapeLike(Input(0));
......@@ -14,10 +14,10 @@ void SGDUpdateOp<Context>::ComputeRunWithFloat() {
// momentum correction, see arXiv:1706.02677
if (old_lr > 0) { correction = lr / old_lr; } old_lr = lr;
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>(ctx());
kernel::SGDUpdate<float, Context>(Input(0).count(),
lr, momentum * correction, dXdata, Hdata);
lr, momentum * correction, dXdata, Hdata, ctx());
}
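// A sketch of the recurrence kernel::SGDUpdate is assumed to apply; the
// arXiv:1706.02677 momentum correction (correction = lr / old_lr) is already
// folded into the momentum argument above. The update is written back into the
// gradient buffer and later subtracted from the weights. Names are illustrative.
#include <cstddef>
inline void SGDMomentumSketch(
    size_t n, float lr, float momentum,
    float* g /* in: gradient, out: update */, float* h /* history */) {
    for (size_t i = 0; i < n; ++i) {
        h[i] = momentum * h[i] + lr * g[i];
        g[i] = h[i];
    }
}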
template <class Context>
......@@ -27,11 +27,18 @@ void SGDUpdateOp<Context>::ComputeRunWithFloat16() {
lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
if (old_lr > 0) { correction = lr / old_lr; } old_lr = lr;
auto* dXdata = Input(0).template mutable_data<float16, Context>();
auto* Hdata = h->template mutable_data<float16, Context>();
kernel::SGDUpdate<float16, Context>(Input(0).count(),
lr, momentum * correction, dXdata, Hdata);
auto* dX32T = ws()->CreateTensor(Input(0).name() + "/f32");
dX32T->ReshapeLike(Input(0));
auto* dX32 = dX32T->template mutable_data<float, Context>();
auto* dX16 = Input(0).template mutable_data<float16, Context>();
auto* H32 = h->template mutable_data<float, Context>(ctx());
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), dX16, dX32, ctx());
kernel::SGDUpdate<float, Context>(Input(0).count(),
lr, momentum * correction, dX32, H32, ctx());
}
DEPLOY_CPU(SGDUpdate);
......
#include "core/workspace.h"
#include "utils/cast.h"
#include "utils/math_functions.h"
#include "utils/op_kernel.h"
#include "operators/update/update_op_base.h"
namespace dragon {
......@@ -20,22 +21,24 @@ template <class Context> template <typename T>
void UpdateOpBase<Context>::PreprocessRunWithType() {
// scale
scale_factor = Param("scale_gradient");
if (scale_factor != 1) {
if (scale_factor != 1.f) {
auto* dXdata = Input(0).template mutable_data<T, Context>();
math::Scal<T, Context>(Input(0).count(),
scale_factor, dXdata, &ctx());
scale_factor, dXdata, ctx());
}
// clip
clip_thresh = Param("clip_gradient");
if (clip_thresh > 0) {
auto* dXdata = Input(0).template mutable_data<T, Context>();
float sumsq_grad = math::Dot<T, Context>(
Input(0).count(), dXdata, dXdata, &ctx());
const float l2norm = sqrt(sumsq_grad);
T sumsq_grad;
math::Dot<T, Context>(Input(0).count(),
dXdata, dXdata, &sumsq_grad, ctx());
const float l2norm = sqrt(
dragon_cast<float, T>(sumsq_grad));
if (l2norm > clip_thresh) {
float norm_factor = clip_thresh / l2norm;
math::Scal<T, Context>(Input(0).count(),
norm_factor, dXdata, &ctx());
norm_factor, dXdata, ctx());
}
}
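// A plain-CPU sketch of the clipping rule above: when the gradient's global L2
// norm exceeds clip_thresh, rescale it so that the norm equals clip_thresh.
// Names are illustrative.
#include <cmath>
#include <cstddef>
inline void ClipByGlobalNorm(size_t n, float clip_thresh, float* g) {
    double sumsq = 0.0;
    for (size_t i = 0; i < n; ++i) sumsq += double(g[i]) * g[i];
    const float l2norm = std::sqrt((float)sumsq);
    if (l2norm > clip_thresh) {
        const float factor = clip_thresh / l2norm;
        for (size_t i = 0; i < n; ++i) g[i] *= factor;
    }
}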
// decay
......@@ -44,34 +47,76 @@ void UpdateOpBase<Context>::PreprocessRunWithType() {
auto* dXdata = Input(0).template mutable_data<T, Context>();
auto* Xdata = Output(0)->template data<T, Context>();
math::Axpy<T, Context>(Input(0).count(),
l2_decay, Xdata, dXdata, &ctx());
l2_decay, Xdata, dXdata, ctx());
}
}
template <class Context> template <typename T>
void UpdateOpBase<Context>::UpdateRunWithType() {
auto* dXdata = Input(0).template mutable_data<T, Context>();
auto* Xdata = Output(0)->template mutable_data<T, Context>();
math::Axpy<T, Context>(Output(0)->count(), -1, dXdata, Xdata, &ctx());
T zeroT = dragon_cast<T, float>(0.f);
if (zero_grad) math::Set<T, Context>(Input(0).count(), zeroT, dXdata);
template <class Context>
void UpdateOpBase<Context>::UpdateRunWithFloat32() {
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Xdata = Output(0)->template mutable_data<float, Context>();
// weights update & zero grads
math::Axpy<float, Context>(Output(0)->count(),
-1, dXdata, Xdata, ctx());
if (zero_grad) math::Set<float, Context>(
Input(0).count(), 0.f, dXdata, ctx());
}
template <class Context>
void UpdateOpBase<Context>::UpdateRunWithFloat16() {
/* ------------------------------------------------
*
* Mixed Precision Training
*
* http://arxiv.org/abs/1710.03740
*
* ------------------------------------------------ */
// the "master" weights
auto* X32T = ws()->CreateTensor(Output(0)->name() + "/f32");
X32T->ReshapeLike(Input(0));
// the "master" updates
auto* dX32T = ws()->GetTensor(Input(0).name() + "/f32");
auto* dX32 = dX32T->template data<float, Context>();
auto* X16 = Output(0)->template mutable_data<float16, Context>();
auto* X32 = X32T->template mutable_data<float, Context>();
// X16 -> X32
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), X16, X32, ctx());
// weights update & zero grads
math::Axpy<float, Context>(
Input(0).count(), -1, dX32, X32, ctx());
if (zero_grad) {
float16 zero = dragon_cast<float16, float>(0.f);
auto* dX16 = Input(0).template mutable_data<float16, Context>();
math::Set<float16, Context>(Input(0).count(), zero, dX16, ctx());
}
// X32 -> X16
kernel::TypeA2B<float, float16, Context>(
Input(0).count(), X32, X16, ctx());
}
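// Why the float32 "master" copies above are kept (a note, following the
// arXiv:1710.03740 recipe referenced in the comment block above): with only about
// 10 bits of mantissa, a small update added directly to a float16 weight can be
// rounded away entirely, e.g. 1.0 + 3e-4 rounds back to 1.0 in float16 because the
// spacing of float16 near 1.0 is roughly 9.8e-4. Accumulating the update in
// float32 and casting back with TypeA2B avoids losing such updates.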
template <class Context>
void UpdateOpBase<Context>::RunOnDevice() {
// skip empty param or grad
// skip empty param or grads
if (Input(0).count() == 0 || Output(0)->count() == 0) return;
CHECK(Input(0).dims() == Output(0)->dims())
<< "\nTensor and its gradients should have same dims.\nGot "
<< Output(0)->DimString() << " and " << Input(0).DimString();
if (XIsType(Input(0), float)) {
PreprocessRunWithType<float>();
ComputeRunWithFloat();
UpdateRunWithType<float>();
ComputeRunWithFloat32();
UpdateRunWithFloat32();
} else if (XIsType(Input(0), float16)) {
PreprocessRunWithType<float16>();
ComputeRunWithFloat16();
UpdateRunWithType<float16>();
UpdateRunWithFloat16();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -15,7 +15,7 @@ void BiasAddOp<Context>::RunWithType() {
kernel::BiasAdd<T, Context>(
Output(0)->count(), outer_dim, dim, inner_dim,
data_format, Bdata, multiplier, Ydata, &ctx());
data_format, Bdata, multiplier, Ydata, ctx());
}
template <class Context>
......@@ -45,19 +45,19 @@ void BiasAddGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* dYdata = Input(-1).template data<T, Context>();
auto* dBias = Output(1)->template mutable_data<T, Context>();
auto* dBias = Output(1)->template mutable_data<T, Context>(ctx());
const int y_offset = dim * inner_dim;
for (int n = 0; n < outer_dim; n++) {
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, dim, inner_dim,
1.0, dYdata, multiplier,
1.0, dBias, &ctx());
1.0, dBias, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, inner_dim, dim,
1.0, dYdata, multiplier,
1.0, dBias, &ctx());
1.0, dBias, ctx());
}
dYdata += y_offset;
}
......
......@@ -26,7 +26,7 @@ void BilinearResizeOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::BilinearResize<T, Context>(Output(0)->count(),
n, c, h, w, out_h, out_w, data_format, Xdata, Ydata);
n, c, h, w, out_h, out_w, data_format, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -77,8 +77,10 @@ void BilinearResizeGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
kernel::BilinearResizeGrad<T, Context>(Input(-1).count(),
n, c, h, w, out_h, out_w, data_format, dYdata, dXdata);
n, c, h, w, out_h, out_w, data_format, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -41,7 +41,7 @@ void Conv2dGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
if (HasBias()) {
T* dBdata = Output(2)->template mutable_data<T, Context>();
T* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < Input(2).dim(0); n++)
Db(dYdata + n * y_offset, dBdata);
}
......@@ -49,7 +49,7 @@ void Conv2dGradientOp<Context>::RunWithType() {
for (int n = 0; n < Input(2).dim(0); n++) {
if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
Dw(dYdata + n * y_offset, Xdata + n * x_offset, dWdata);
}
if (Output(0)->name() != "ignore") {
......
......@@ -44,7 +44,7 @@ void Conv2dTransposeGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
if (Output(2)->name() != "ignore") {
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < Input(2).dim(0); n++)
Db(dYdata + n * y_offset, dBdata);
}
......@@ -52,7 +52,7 @@ void Conv2dTransposeGradientOp<Context>::RunWithType() {
for (int n = 0; n < Input(2).dim(0); n++) {
if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
Dw(Xdata + n * x_offset, dYdata + n * y_offset, dWdata);
}
if (Output(0)->name() != "ignore") {
......
......@@ -77,7 +77,7 @@ void ConvOpBase<Context>::Wx(
kernel_dim,
1.0, weights + weight_offset * g,
col_buffer + col_offset * g,
0.0, y + output_offset * g, &ctx());
0.0, y + output_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
......@@ -86,7 +86,7 @@ void ConvOpBase<Context>::Wx(
kernel_dim,
1.0, col_buffer + col_offset * g,
weights + weight_offset * g,
0.0, y + output_offset * g, &ctx());
0.0, y + output_offset * g, ctx());
}
}
}
......@@ -99,13 +99,13 @@ void ConvOpBase<Context>::Pb(const T* bias, T* y) {
CblasNoTrans, CblasNoTrans,
num_output, out_spatial_dim, 1,
1.0, bias, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
out_spatial_dim, num_output, 1,
1.0, multiplier, bias,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -122,7 +122,7 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) {
conv_out_channels / group,
1.0, weights + weight_offset * g,
dy + output_offset * g,
0.0, col_buffer + col_offset * g, &ctx());
0.0, col_buffer + col_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasTrans,
......@@ -131,7 +131,7 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) {
conv_out_channels / group,
1.0, dy + output_offset * g,
weights + weight_offset * g,
0.0, col_buffer + col_offset * g, &ctx());
0.0, col_buffer + col_offset * g, ctx());
}
}
if (!is_1x1) Col2Im(col_buffer, dx);
......@@ -154,7 +154,7 @@ void ConvOpBase<Context>::Dw(const T* dy, const T* x, T *dw) {
conv_out_spatial_dim,
1.0, dy + output_offset * g,
col_buffer + col_offset * g,
1.0, dw + weight_offset * g, &ctx());
1.0, dw + weight_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasTrans, CblasNoTrans,
......@@ -163,7 +163,7 @@ void ConvOpBase<Context>::Dw(const T* dy, const T* x, T *dw) {
conv_out_spatial_dim,
1.0, col_buffer + col_offset * g,
dy + output_offset * g,
1.0, dw + weight_offset * g, &ctx());
1.0, dw + weight_offset * g, ctx());
}
}
}
......@@ -175,12 +175,12 @@ void ConvOpBase<Context>::Db(const T* dy, T* db) {
math::Gemv<T, Context>(
CblasNoTrans, num_output, out_spatial_dim,
1.0, dy, multiplier,
1.0, db, &ctx());
1.0, db, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, out_spatial_dim, num_output,
1.0, dy, multiplier,
1.0, db, &ctx());
1.0, db, ctx());
}
}
......
......@@ -54,13 +54,13 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
}
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &fwd_algo));
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
fwd_algo, &fwd_data_size));
}
......@@ -78,7 +78,7 @@ void CuDNNConv2dOp<Context>::RunWithType() {
auto* WSdata = (uint8_t*)ws()->template
caches<Context>({ fwd_data_size })[0];
auto cudnn_handle = ctx().cudnn_handle();
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
CUDNN_CHECK(cudnnConvolutionForward(cudnn_handle,
......@@ -104,6 +104,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
#endif
Conv2dOp<Context>::Reshape();
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
......@@ -199,24 +201,24 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
}
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx().cudnn_handle(), output_desc,
ctx()->cudnn_handle(), output_desc,
input_desc, conv_desc, filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_filter_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
ctx().cudnn_handle(), output_desc,
ctx()->cudnn_handle(), output_desc,
input_desc, conv_desc, filter_desc,
bwd_filter_algo, &bwd_filter_size));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx().cudnn_handle(), filter_desc,
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_data_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
ctx().cudnn_handle(), filter_desc,
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
bwd_data_algo, &bwd_data_size));
}
......@@ -230,18 +232,18 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({
std::max(bwd_data_size, bwd_filter_size)})[0];
auto cudnn_handle = ctx().cudnn_handle();
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
if (Output(2)->name() != "ignore") {
T* dBdata = Output(2)->template mutable_data<T, Context>();
T* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g));
}
if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardFilter(cudnn_handle,
CUDNNType<T>::one, output_desc, Xdata + x_offset * g,
input_desc, dYdata + y_offset * g,
......@@ -269,6 +271,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
#endif
Conv2dGradientOp<Context>::GradientReshape();
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
......
......@@ -54,13 +54,13 @@ void CuDNNConv2dTransposeOp<Context>::ResetDesc() {
}
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx().cudnn_handle(), filter_desc,
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &fwd_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
ctx().cudnn_handle(), filter_desc,
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
fwd_algo, &fwd_data_size));
}
......@@ -78,7 +78,7 @@ void CuDNNConv2dTransposeOp<Context>::RunWithType() {
auto* WSdata = (uint8_t*)ws()->template
caches<Context>({ fwd_data_size })[0];
auto cudnn_handle = ctx().cudnn_handle();
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
CUDNN_CHECK(cudnnConvolutionBackwardData(cudnn_handle,
......@@ -104,6 +104,8 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
#endif
Conv2dTransposeOp<Context>::Reshape();
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
......@@ -199,24 +201,24 @@ void CuDNNConv2dTransposeGradientOp<Context>::ResetDesc() {
}
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
output_desc, conv_desc, filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_filter_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
output_desc, conv_desc, filter_desc,
bwd_filter_algo, &bwd_filter_size));
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_data_algo));
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
bwd_data_algo, &bwd_data_size));
}
......@@ -230,18 +232,18 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({
std::max(bwd_data_size, bwd_filter_size) })[0];
auto cudnn_handle = ctx().cudnn_handle();
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
if (Output(2)->name() != "ignore") {
T* dBdata = Output(2)->template mutable_data<T, Context>();
T* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g));
}
if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardFilter(cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g,
output_desc, Xdata + x_offset * g,
......@@ -269,6 +271,8 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
#endif
Conv2dTransposeGradientOp<Context>::GradientReshape();
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
......
......@@ -13,7 +13,7 @@ void CuDNNLRNOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnLRNCrossChannelForward(
ctx().cudnn_handle(), norm_desc,
ctx()->cudnn_handle(), norm_desc,
CUDNN_LRN_CROSS_CHANNEL_DIM1,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
......@@ -55,7 +55,7 @@ void CuDNNLRNGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnLRNCrossChannelBackward(
ctx().cudnn_handle(), norm_desc,
ctx()->cudnn_handle(), norm_desc,
CUDNN_LRN_CROSS_CHANNEL_DIM1,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Xdata,
......
......@@ -25,7 +25,7 @@ void CuDNNPooling2dOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnPoolingForward(
ctx().cudnn_handle(), pool_desc,
ctx()->cudnn_handle(), pool_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
}
......@@ -69,7 +69,7 @@ void CuDNNPooling2dGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnPoolingBackward(
ctx().cudnn_handle(), pool_desc,
ctx()->cudnn_handle(), pool_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Xdata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -28,7 +28,7 @@ void DenseConcatGradientOp<Context>::RestoreX1() {
kernel::ConcatGrad<T, Context>(
count, this->outer_dim, this->inner_dim,
this->x_concat_dim, this->y_concat_dim,
0, Ydata, Xdata);
0, Ydata, Xdata, ctx());
}
template <class Context>
......
......@@ -17,11 +17,11 @@ template <class Context> template <typename T>
void LRNOp<Context>::SplitRunWithType() {
sqr_in = ws()->CreateTensor("/mnt/" + anchor() + "/sqr/in");
sqr_in->ReshapeLike(Input(0));
sqr_in->template CopyFrom<Context>(Input(0));
sqr_in->template CopyFrom<Context>(Input(0), ctx());
prod_in = ws()->CreateTensor("/mnt/" + anchor() + "/prod/in");
prod_in->ReshapeLike(Input(0));
prod_in->template CopyFrom<Context>(Input(0));
prod_in->template CopyFrom<Context>(Input(0), ctx());
}
template <class Context> template <typename T>
......@@ -229,7 +229,7 @@ void LRNGradientOp<Context>::SplitRunWithType() {
auto* data0 = g_sqr_in->template data<T, Context>();
auto* data1 = g_prod_in->template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), data0, data1, dXdata);
math::Add<T, Context>(Output(0)->count(), data0, data1, dXdata, ctx());
}
template <class Context>
......
......@@ -26,7 +26,7 @@ void NNResizeOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::NNResize<T, Context>(Output(0)->count(),
n, c, h, w, out_h, out_w, data_format, Xdata, Ydata);
n, c, h, w, out_h, out_w, data_format, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -77,8 +77,10 @@ void NNResizeGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
kernel::NNResizeGrad<T, Context>(Input(-1).count(),
n, c, h, w, out_h, out_w, data_format, dYdata, dXdata);
n, c, h, w, out_h, out_w, data_format, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -17,7 +17,7 @@ void Pooling2dOp<Context>::MAXRunWithType() {
kernel::MAXPooling2d<T, Context>(Output(0)->count(),
n, c, h, w, pool_h, pool_w, kernel_size[0], kernel_size[1],
stride[0], stride[1], pad[0], pad[1],
data_format, Xdata, Mdata, Ydata);
data_format, Xdata, Mdata, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -28,7 +28,7 @@ void Pooling2dOp<Context>::AVGRunWithType() {
kernel::AVGPooling2d<T, Context>(Output(0)->count(),
n, c, h, w, pool_h, pool_w, kernel_size[0], kernel_size[1],
stride[0], stride[1], pad[0], pad[1],
data_format, Xdata, Ydata);
data_format, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -127,8 +127,9 @@ void Pooling2dGradientOp<Context>::MAXRunWithType() {
kernel::MAXPooling2dGrad<T, Context>(Output(0)->count(),
n, c, h, w, pool_h, pool_w, kernel_size[0], kernel_size[1],
stride[0], stride[1], pad[0], pad[1],
data_format, dYdata, Mdata, dXdata);
data_format, dYdata, Mdata, dXdata, ctx());
ctx()->FinishDeviceCompution();
mask->Reset();
}
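With kernels now enqueued on the context's stream, the gradient kernel above may still be reading the mask when control returns to the host; the op calls FinishDeviceCompution() before mask->Reset(), presumably to ensure the buffer is no longer in use when it is released. A self-contained sketch of the same rule, with hypothetical kernel and buffer names:

#include <cuda_runtime.h>

__global__ void MaskedGrad(int n, const int* mask, const float* dy, float* dx) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dx[i] = mask[i] ? dy[i] : 0.f;   // hypothetical masked gradient
}

void BackwardAndRelease(int n, int* d_mask, const float* d_dy, float* d_dx,
                        cudaStream_t stream) {
    MaskedGrad<<<(n + 255) / 256, 256, 0, stream>>>(n, d_mask, d_dy, d_dx);
    cudaStreamSynchronize(stream);   // wait until the kernel is done with d_mask
    cudaFree(d_mask);                // only now is it safe to release the mask
}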
......@@ -140,7 +141,7 @@ void Pooling2dGradientOp<Context>::AVGRunWithType() {
kernel::AVGPooling2dGrad<T, Context>(Output(0)->count(),
n, c, h, w, pool_h, pool_w, kernel_size[0], kernel_size[1],
stride[0], stride[1], pad[0], pad[1],
data_format, dYdata, dXdata);
data_format, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -14,7 +14,8 @@ void ROIAlignOp<Context>::RunWithType() {
kernel::ROIAlign<T, Context>(
Output(0)->count(), Input(0).dim(0), Input(0).dim(1),
Input(0).dim(2), Input(0).dim(3), pool_h, pool_w,
Input(1).dim(0), spatial_scale, sampling_ratio, Xdata, Rdata, Ydata);
Input(1).dim(0), spatial_scale, sampling_ratio,
Xdata, Rdata, Ydata, ctx());
}
template <class Context>
......@@ -38,12 +39,13 @@ void ROIAlignGradientOp<Context>::RunWithType() {
auto* Rdata = Input(1).template data<T, CUDAContext>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata);
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
kernel::ROIAlignGrad<T, Context>(
Input(-1).count(), Output(0)->dim(0), Output(0)->dim(1),
Output(0)->dim(2), Output(0)->dim(3), pool_h, pool_w,
Input(1).dim(0), spatial_scale, sampling_ratio, dYdata, Rdata, dXdata);
Input(1).dim(0), spatial_scale, sampling_ratio,
dYdata, Rdata, dXdata, ctx());
}
template <class Context>
......
......@@ -19,7 +19,8 @@ void ROIPoolingOp<Context>::RunWithType() {
kernel::ROIPooling<T, Context>(
Output(0)->count(), Input(0).dim(0), Input(0).dim(1),
Input(0).dim(2), Input(0).dim(3), pool_h, pool_w,
Input(1).dim(0), spatial_scale, Xdata, Rdata, Mdata, Ydata);
Input(1).dim(0), spatial_scale,
Xdata, Rdata, Mdata, Ydata, ctx());
}
template <class Context>
......@@ -50,7 +51,8 @@ void ROIPoolingGradientOp<Context>::RunWithType() {
kernel::ROIPoolingGrad<T, Context>(
Output(0)->count(), Output(0)->dim(0), Output(0)->dim(1),
Output(0)->dim(2), Output(0)->dim(3), pool_h, pool_w,
Input(1).dim(0), spatial_scale, dYdata, Rdata, Mdata, dXdata);
Input(1).dim(0), spatial_scale,
dYdata, Rdata, Mdata, dXdata, ctx());
}
template <class Context>
......
......@@ -14,7 +14,8 @@ namespace math {
template <> void Set<float, CPUContext>(
const int n,
const float alpha,
float* x) {
float* x,
CPUContext* ctx) {
if (alpha == 0) {
memset(x, 0, sizeof(float) * n);
return;
......@@ -32,7 +33,8 @@ template <> void Set<float, CPUContext>(
template <> void Set<int, CPUContext>(
const int n,
const int alpha,
int* x) {
int* x,
CPUContext* ctx) {
if (alpha == 0) {
memset(x, 0, sizeof(int) * n);
return;
......@@ -50,7 +52,8 @@ template <> void Set<int, CPUContext>(
template <> void Set<float16, CPUContext>(
const int n,
const float16 alpha,
float16* x) {
float16* x,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -164,7 +167,8 @@ template <> void Add<float, CPUContext>(
const int n,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::Add<float>(n, a, b, y);
#else
......@@ -179,7 +183,8 @@ template <> void Add<int, CPUContext>(
const int n,
const int* a,
const int* b,
int* y) {
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -190,7 +195,8 @@ template <> void Add<float16, CPUContext>(
const int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -198,7 +204,8 @@ template <> void Sub<float, CPUContext>(
const int n,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::Sub<float>(n, a, b, y);
#else
......@@ -213,7 +220,8 @@ template <> void Sub<float16, CPUContext>(
const int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -221,7 +229,8 @@ template <> void Mul<float, CPUContext>(
const int n,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::Mul<float>(n, a, b, y);
#else
......@@ -236,7 +245,8 @@ template <> void Mul<float16, CPUContext>(
const int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -244,7 +254,8 @@ template <> void Div<float, CPUContext>(
const int n,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::Div<float>(n, a, b, y);
#else
......@@ -259,7 +270,8 @@ template <> void Div<float16, CPUContext>(
const int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -267,7 +279,8 @@ template <> void Clip<float, CPUContext>(
const int n,
const float low,
const float high,
float* x) {
float* x,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -279,7 +292,8 @@ template <> void Clip<float, CPUContext>(
template <> void Exp<float, CPUContext>(
int n,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -289,7 +303,8 @@ template <> void Exp<float, CPUContext>(
template <> void Log<float, CPUContext>(
int n,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -299,7 +314,8 @@ template <> void Log<float, CPUContext>(
template <> void Square<float, CPUContext>(
int n,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -309,14 +325,16 @@ template <> void Square<float, CPUContext>(
template <> void Square<float16, CPUContext>(
int n,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void Sqrt<float, CPUContext>(
int n,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -326,7 +344,8 @@ template <> void Sqrt<float, CPUContext>(
template <> void Sqrt<float16, CPUContext>(
int n,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -334,7 +353,8 @@ template <> void Pow<float, CPUContext>(
int n,
const float alpha,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -345,7 +365,8 @@ template <> void Pow<float16, CPUContext>(
int n,
const float alpha,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -353,7 +374,8 @@ template <> void Inv<float, CPUContext>(
const int n,
const float numerator,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -364,7 +386,8 @@ template <> void Inv<float16, CPUContext>(
const int n,
const float numerator,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -423,51 +446,51 @@ template <> void Scale<float, CPUContext>(
#endif // WITH_BLAS
}
template <> float StridedDot<float, CPUContext>(
template <> void StridedDot<float, CPUContext>(
const int n,
const float* a,
const int incx,
const float* b,
const int incy,
float* y,
CPUContext* ctx) {
#ifdef WITH_BLAS
return cblas_sdot(n, a, incx, b, incy);
float result = cblas_sdot(n, a, incx, b, incy);
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += a[i] * b[i];
return ret;
float result = 0.f;
int cx = 0, cy = 0;
for (int i = 0; i < n; ++i) {
result += a[cx] * b[cy];
cx += incx; cy += incy;
}
#endif // WITH_BLAS
*y = result;
}
template <> float Dot<float, CPUContext>(
template <> void Dot<float, CPUContext>(
int n,
const float* a,
const float* b,
float* y,
CPUContext* ctx) {
#ifdef WITH_BLAS
return StridedDot<float, CPUContext>(n, a, 1, b, 1, ctx);
#elif WITH_SSE
return sse::Dot<float>(n, a, b);
StridedDot<float, CPUContext>(n, a, 1, b, 1, y, ctx);
#elif WITH_SSE
*y = sse::Dot<float>(n, a, b);
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += a[i] * b[i];
return ret;
float result = 0.f;
for (int i = 0; i < n; ++i) result += a[i] * b[i];
*y = result;
#endif // WITH_BLAS
}
template <> float Dot<float16, CPUContext>(
template <> void Dot<float16, CPUContext>(
int n,
const float16* a,
const float16* b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
return 0;
}
template <> float ASum<float, CPUContext>(
......@@ -475,22 +498,19 @@ template <> float ASum<float, CPUContext>(
const float* x) {
#ifdef WITH_BLAS
return cblas_sasum(n, x, 1);
#elif WITH_SSE
return sse::ASum<float>(n, x);
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += x[i];
return ret;
float result = 0.f;
for (int i = 0; i < n; ++i)
result += std::abs(x[i]);
return result;
#endif // WITH_BLAS
}
template <> void AddScalar<float, CPUContext>(
const int n,
const float alpha,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::AddScalar<float>(n, alpha, y);
#else
......@@ -504,14 +524,16 @@ template <> void AddScalar<float, CPUContext>(
template <> void AddScalar<float16, CPUContext>(
const int n,
const float alpha,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void MulScalar<float, CPUContext>(
const int n,
const float alpha,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::MulScalar<float>(n, alpha, y);
#else
......@@ -525,7 +547,8 @@ template <> void MulScalar<float, CPUContext>(
template <> void MulScalar<float16, CPUContext>(
const int n,
const float alpha,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......
......@@ -18,7 +18,7 @@ __global__ void _Set(
const int n,
const T alpha,
T* x) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
x[idx] = alpha;
}
}
......@@ -26,27 +26,31 @@ __global__ void _Set(
template <> void Set<float, CUDAContext>(
const int n,
const float alpha,
float* x) {
if (alpha == 0) {
CUDA_CHECK(cudaMemset(x, 0, sizeof(float) * n));
return;
float* x,
CUDAContext* ctx) {
if (alpha == 0.f) {
CUDA_CHECK(cudaMemsetAsync(x, 0,
sizeof(float) * n, ctx->cuda_stream()));
} else {
_Set<float>
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x);
}
_Set<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, x);
}
template <> void Set<int, CUDAContext>(
const int n,
const int alpha,
int* x) {
int* x,
CUDAContext* ctx) {
if (alpha == 0) {
CUDA_CHECK(cudaMemset(x, 0, sizeof(int) * n));
return;
CUDA_CHECK(cudaMemsetAsync(x, 0,
sizeof(int) * n, ctx->cuda_stream()));
} else {
_Set<int>
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x);
}
_Set<int>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, x);
}
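These launches now use the four-argument execution configuration <<<blocks, threads, shared_bytes, stream>>>, so the work lands on ctx->cuda_stream() instead of the legacy default stream, and zero-fills become cudaMemsetAsync on the same stream. A self-contained sketch of the pattern with a hypothetical FillAsync helper:

#include <cuda_runtime.h>

__global__ void FillKernel(int n, float v, float* x) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x) x[i] = v;   // grid-stride loop
}

void FillAsync(int n, float v, float* x, cudaStream_t stream) {
    if (v == 0.f) {
        cudaMemsetAsync(x, 0, sizeof(float) * n, stream);    // async, on `stream`
    } else {
        const int threads = 256;
        const int blocks = (n + threads - 1) / threads;
        FillKernel<<<blocks, threads, 0, stream>>>(n, v, x);  // 0 bytes shared memory
    }
}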
template <> void RandomUniform<uint32_t, CUDAContext>(
......@@ -89,7 +93,7 @@ __global__ void _Add(
const T* a,
const T* b,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = a[idx] + b[idx];
}
}
......@@ -98,10 +102,11 @@ template <> void Add<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y) {
float* y,
CUDAContext* ctx) {
_Add<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, a, b, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, a, b, y);
}
template <typename T>
......@@ -110,7 +115,7 @@ __global__ void _Sub(
const T* a,
const T* b,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = a[idx] - b[idx];
}
}
......@@ -119,10 +124,11 @@ template <> void Sub<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y) {
float* y,
CUDAContext* ctx) {
_Sub<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, a, b, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, a, b, y);
}
template <typename T>
......@@ -131,7 +137,7 @@ __global__ void _Mul(
const T* a,
const T* b,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = a[idx] * b[idx];
}
}
......@@ -140,10 +146,11 @@ template <> void Mul<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y) {
float* y,
CUDAContext* ctx) {
_Mul<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, a, b, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, a, b, y);
}
template <typename T>
......@@ -152,7 +159,7 @@ __global__ void _Div(
const T* a,
const T* b,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = a[idx] / b[idx];
}
}
......@@ -161,10 +168,11 @@ template <> void Div<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y) {
float* y,
CUDAContext* ctx) {
_Div<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, a, b, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, a, b, y);
}
template <typename T>
......@@ -173,7 +181,7 @@ __global__ void _Clip(
const T low,
const T high,
T* x) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
x[idx] = x[idx] > high ? high : x[idx];
x[idx] = x[idx] < low ? low : x[idx];
}
......@@ -183,10 +191,11 @@ template <> void Clip<float, CUDAContext>(
const int n,
const float low,
const float high,
float* x) {
float* x,
CUDAContext* ctx) {
_Clip<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, low, high, x);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, low, high, x);
}
template <typename T>
......@@ -194,7 +203,7 @@ __global__ void _Exp(
const int n,
const T* a,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = exp(a[idx]);
}
}
......@@ -202,10 +211,11 @@ __global__ void _Exp(
template <> void Exp<float, CUDAContext>(
int n,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Exp<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, x, y);
}
template <typename T>
......@@ -213,7 +223,7 @@ __global__ void _Log(
const int n,
const T* a,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = log(a[idx]);
}
}
......@@ -221,10 +231,11 @@ __global__ void _Log(
template <> void Log<float, CUDAContext>(
int n,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Log<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, x, y);
}
template <typename T>
......@@ -232,7 +243,7 @@ __global__ void _Square(
const int n,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = x[idx] * x[idx];
}
}
......@@ -240,10 +251,11 @@ __global__ void _Square(
template <> void Square<float, CUDAContext>(
int n,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Square<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, x, y);
}
template <typename T>
......@@ -251,7 +263,7 @@ __global__ void _Sqrt(
const int n,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = sqrt(x[idx]);
}
}
......@@ -259,10 +271,11 @@ __global__ void _Sqrt(
template <> void Sqrt<float, CUDAContext>(
int n,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Sqrt<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, x, y);
}
template <typename T>
......@@ -271,7 +284,7 @@ __global__ void _Pow(
const T alpha,
const T* a,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = pow(a[idx], alpha);
}
}
......@@ -280,10 +293,11 @@ template <> void Pow<float, CUDAContext>(
int n,
const float alpha,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Pow<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x, y);
}
template <typename T>
......@@ -292,7 +306,7 @@ __global__ void _Inv(
const float numerator,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = numerator / x[idx];
}
}
......@@ -301,10 +315,11 @@ template <> void Inv<float, CUDAContext>(
const int n,
const float numerator,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Inv<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, numerator, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, numerator, x, y);
}
/******************** Level-2 ********************/
......@@ -330,26 +345,27 @@ template <> void Scale<float, CUDAContext>(
ctx->cublas_handle(), n, &alpha, y, 1));
}
template <> float StridedDot<float, CUDAContext>(
template <> void StridedDot<float, CUDAContext>(
const int n,
const float* a,
const int incx,
const float* b,
const int incy,
float* y,
CUDAContext* ctx) {
float result;
CUBLAS_CHECK(cublasSdot_v2(ctx->cublas_handle(),
n, a, incx, b, incy, &result));
return result;
n, a, incx, b, incy, y));
}
template <> float Dot<float, CUDAContext>(
template <> void Dot<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y,
CUDAContext* ctx) {
return StridedDot<float, CUDAContext>(
n, a, 1, b, 1, ctx);
StridedDot<float, CUDAContext>(
n, a, 1, b, 1, y, ctx);
ctx->FinishDeviceCompution();
}
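Dot for CUDAContext now writes the result through the y pointer and ends with FinishDeviceCompution(), presumably so the host-visible scalar is complete before the caller reads it when the handle runs on a non-default stream. An illustrative fragment (hypothetical n, d_grad, and ctx; not a standalone translation unit, needs <cmath>):

// Gradient-norm style usage of the new signature: device inputs, host output.
float sumsq = 0.f;
math::Dot<float, CUDAContext>(n, d_grad, d_grad, &sumsq, ctx);
const float l2norm = std::sqrt(sumsq);   // safe to read once Dot has returned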
template <> float ASum<float, CUDAContext>(
......@@ -363,7 +379,7 @@ __global__ void _AddScalar(
const int n,
T alpha,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] += alpha;
}
}
......@@ -371,10 +387,11 @@ __global__ void _AddScalar(
template <> void AddScalar<float, CUDAContext>(
const int n,
const float alpha,
float* y) {
float* y,
CUDAContext* ctx) {
_AddScalar<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, y);
}
template <typename T>
......@@ -382,7 +399,7 @@ __global__ void _MulScalar(
const int n,
T alpha,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] *= alpha;
}
}
......@@ -390,10 +407,11 @@ __global__ void _MulScalar(
template <> void MulScalar<float, CUDAContext>(
const int n,
const float alpha,
float* y) {
float* y,
CUDAContext* ctx) {
_MulScalar<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, y);
}
template <> void Axpy<float, CUDAContext>(
......@@ -427,7 +445,7 @@ template <> void RandomUniform<float, CUDAContext>(
ctx->curand_generator(), x, n));
float range = high - low;
if (range != 1.f) Scal<float, CUDAContext>(n, range, x, ctx);
if (low != 0.f) AddScalar<float, CUDAContext>(n, low, x);
if (low != 0.f) AddScalar<float, CUDAContext>(n, low, x, ctx);
}
/******************** Level-3 ********************/
......
......@@ -18,7 +18,7 @@ __global__ void _SetHalf(
const int n,
const T alpha,
T* x) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
x[idx] = alpha;
}
}
......@@ -26,16 +26,19 @@ __global__ void _SetHalf(
template <> void Set<float16, CUDAContext>(
const int n,
const float16 alpha,
float16* x) {
float16* x,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_SetHalf<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
dragon_cast<half2, float16>(alpha),
reinterpret_cast<half2*>(x));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
dragon_cast<half2, float16>(alpha),
reinterpret_cast<half2*>(x));
} else {
_SetHalf<float16>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n, alpha, x);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x);
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -47,7 +50,7 @@ __global__ void _TypeFloat2Half(
const int n,
const float* a,
half* b) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
b[idx] = __float2half(a[idx]);
}
}
......@@ -64,8 +67,9 @@ template <> void RandomNormal<float16, CUDAContext>(
CURAND_CHECK(curandGenerateNormal(
ctx->curand_generator(), xf32, n, mu, sigma));
_TypeFloat2Half
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, xf32, reinterpret_cast<half*>(x));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
xf32, reinterpret_cast<half*>(x));
CUDAContext::Delete(xf32);
#else
CUDA_FP16_NOT_COMPILED;
......@@ -81,7 +85,7 @@ __global__ void _AddHalf(
const half* a,
const half* b,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(a[idx], b[idx]);
#endif
......@@ -94,7 +98,7 @@ __global__ void _AddHalf2(
const half2* a,
const half2* b,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(a[idx], b[idx]);
#endif
......@@ -106,20 +110,23 @@ template <> void Add<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_AddHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
} else {
_AddHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -133,7 +140,7 @@ __global__ void _SubHalf(
const half* a,
const half* b,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub(a[idx], b[idx]);
#endif
......@@ -146,7 +153,7 @@ __global__ void _SubHalf2(
const half2* a,
const half2* b,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub2(a[idx], b[idx]);
#endif
......@@ -158,20 +165,23 @@ template <> void Sub<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_SubHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
} else {
_SubHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -185,7 +195,7 @@ __global__ void _MulHalf(
const half* a,
const half* b,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], b[idx]);
#endif
......@@ -198,7 +208,7 @@ __global__ void _MulHalf2(
const half2* a,
const half2* b,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], b[idx]);
#endif
......@@ -210,20 +220,23 @@ template <> void Mul<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_MulHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
} else {
_MulHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> > (n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -237,7 +250,7 @@ __global__ void _DivHalf(
const half* a,
const half* b,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hdiv(a[idx], b[idx]);
#endif
......@@ -249,13 +262,15 @@ template <> void Div<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_DivHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -267,7 +282,7 @@ __global__ void _SquareHalf(
const int n,
const half* x,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(x[idx], x[idx]);
#endif
......@@ -279,7 +294,7 @@ __global__ void _SquareHalf2(
const int n,
const half2* x,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(x[idx], x[idx]);
#endif
......@@ -290,18 +305,21 @@ __global__ void _SquareHalf2(
template <> void Square<float16, CUDAContext>(
int n,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_SquareHalf2<half2>
<< < CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_SquareHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS >> > (n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -314,7 +332,7 @@ __global__ void _SqrtHalf(
int n,
const half* x,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = hsqrt(x[idx]);
#endif
......@@ -326,7 +344,7 @@ __global__ void _SqrtHalf2(
const int n,
const half2* x,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = h2sqrt(x[idx]);
#endif
......@@ -337,18 +355,21 @@ __global__ void _SqrtHalf2(
template <> void Sqrt<float16, CUDAContext>(
int n,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_SqrtHalf2<half2>
<< < CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_SqrtHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -362,7 +383,7 @@ __global__ void _PowHalf(
const float alpha,
const half* a,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], a[idx]);
#endif
......@@ -375,7 +396,7 @@ __global__ void _PowHalf2(
const float alpha,
const half2* a,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], a[idx]);
#endif
......@@ -387,19 +408,22 @@ template <> void Pow<float16, CUDAContext>(
int n,
const float alpha,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
CHECK(alpha == float(2)) << "fp16 only supports the power of 2";
if (n % 2 == 0) {
if ((n & 1) == 0) {
_PowHalf2<half2>
<< < CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
alpha, reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
alpha, reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_PowHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
alpha, reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
alpha, reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -413,7 +437,7 @@ __global__ void _InvHalf(
const half numerator,
const half* x,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(hrcp(x[idx]), numerator);
#endif
......@@ -426,7 +450,7 @@ __global__ void _InvHalf2(
const half2 numerator,
const half2* x,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(h2rcp(x[idx]), numerator);
#endif
......@@ -438,20 +462,23 @@ template <> void Inv<float16, CUDAContext>(
const int n,
const float numerator,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_InvHalf2<half2>
<< < CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
dragon_cast<half2, float>(numerator),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
dragon_cast<half2, float>(numerator),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_InvHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
dragon_cast<half, float>(numerator),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
dragon_cast<half, float>(numerator),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -482,27 +509,26 @@ template <> void Scale<float16, CUDAContext>(
const float16* x,
float16* y,
CUDAContext* ctx) {
CUDAContext::Copy<float16, CUDAContext, CUDAContext>(n, y, x);
ctx->Copy<float16, CUDAContext, CUDAContext>(n, y, x);
Scal<float16, CUDAContext>(n, alpha, y, ctx);
}
template <> float Dot<float16, CUDAContext>(
template <> void Dot<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
float16 result;
CUBLAS_CHECK(cublasDotEx(
ctx->cublas_handle(), n,
a, CUDA_R_16F, 1,
b, CUDA_R_16F, 1,
&result, CUDA_R_16F,
y, CUDA_R_16F,
CUDA_R_32F));
return dragon_cast<float, float16>(result);
ctx->FinishDeviceCompution();
#else
CUDA_FP16_NOT_COMPILED;
return 0.;
#endif
}
......@@ -512,7 +538,7 @@ __global__ void _AddScalarHalf(
const int n,
half alpha,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(y[idx], alpha);
#endif
......@@ -524,7 +550,7 @@ __global__ void _AddScalarHalf2(
const int n,
half2 alpha,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(y[idx], alpha);
#endif
......@@ -535,18 +561,21 @@ __global__ void _AddScalarHalf2(
template <> void AddScalar<float16, CUDAContext>(
const int n,
const float alpha,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_AddScalarHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_AddScalarHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -559,7 +588,7 @@ __global__ void _MulScalarHalf(
const int n,
half alpha,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(y[idx], alpha);
#endif
......@@ -571,7 +600,7 @@ __global__ void _MulScalarHalf2(
const int n,
half2 alpha,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(y[idx], alpha);
#endif
......@@ -582,18 +611,21 @@ __global__ void _MulScalarHalf2(
template <> void MulScalar<float16, CUDAContext>(
const int n,
const float alpha,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_MulScalarHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_MulScalarHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -640,11 +672,12 @@ template <> void RandomUniform<float16, CUDAContext>(
CURAND_CHECK(curandGenerateUniform(
ctx->curand_generator(), xf32, n));
_TypeFloat2Half
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, xf32, reinterpret_cast<half*>(x));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
xf32, reinterpret_cast<half*>(x));
float range = high - low;
if (range != float(1)) Scal<float16, CUDAContext>(n, range, x, ctx);
if (low != float(0)) AddScalar<float16, CUDAContext>(n, low, x);
if (range != 1.f) Scal<float16, CUDAContext>(n, range, x, ctx);
if (low != 0.f) AddScalar<float16, CUDAContext>(n, low, x, ctx);
ctx->Delete(xf32);
#else
CUDA_FP16_NOT_COMPILED;
......
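The fp16 kernels above share one dispatch: when the element count is even, the buffers are reinterpreted as half2 pairs and a kernel of half the size uses the paired intrinsics (guarded by __CUDA_ARCH__ >= 530); otherwise a scalar half kernel runs. A self-contained sketch with a hypothetical scale op:

#include <cuda_fp16.h>

__global__ void ScaleHalf2(int m, float alpha, __half2* y) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m;
         i += blockDim.x * gridDim.x) {
#if __CUDA_ARCH__ >= 530
        y[i] = __hmul2(__float2half2_rn(alpha), y[i]);   // two fp16 lanes at once
#endif
    }
}

void ScaleFP16(int n, float alpha, __half* y, cudaStream_t stream) {
    if ((n & 1) == 0) {   // even count: view the buffer as n/2 half2 pairs
        const int m = n >> 1, threads = 256;
        ScaleHalf2<<<(m + threads - 1) / threads, threads, 0, stream>>>(
            m, alpha, reinterpret_cast<__half2*>(y));
    }
    // an odd count would fall back to a scalar __half kernel, as above
}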
......@@ -53,7 +53,8 @@ template<> void Elu<float, CPUContext>(
const int count,
const float alpha,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -68,7 +69,8 @@ template<> void EluGrad<float, CPUContext>(
const float alpha,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -89,7 +91,8 @@ template<> void PRelu<float, CPUContext>(
const string& data_format,
const float* x,
const float* w,
float* y) {
float* y,
CPUContext* ctx) {
if (channel_shared) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
......@@ -130,7 +133,8 @@ template<> void PReluGrad<float, CPUContext>(
const float* dy,
const float* x,
const float* w,
float* dx) {
float* dx,
CPUContext* ctx) {
if (channel_shared) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
......@@ -184,9 +188,10 @@ template<> void PReluWGrad<float, CPUContext>(
}
}
if (channel_shared) {
float w_sum = math::Dot<float, CPUContext>(
channels * dim, bcast_dw, multiplier, ctx);
math::AddScalar<float, CPUContext>(1, w_sum, dw);
float w_sum;
math::Dot<float, CPUContext>(channels * dim,
bcast_dw, multiplier, &w_sum, ctx);
math::AddScalar<float, CPUContext>(1, w_sum, dw, ctx);
} else {
if (data_format == "NCHW") {
math::Gemv<float, CPUContext>(
......@@ -208,7 +213,8 @@ template<> void Relu<float, CPUContext>(
const int count,
const float slope,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -221,7 +227,8 @@ template<> void Relu<float16, CPUContext>(
const int count,
const float slope,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -230,7 +237,8 @@ template<> void ReluGrad<float, CPUContext>(
const float slope,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -244,7 +252,8 @@ template<> void ReluGrad<float, CPUContext>(
template<> void SElu<float, CPUContext>(
const int count,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -258,7 +267,8 @@ template<> void SEluGrad<float, CPUContext>(
const int count,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -276,7 +286,8 @@ T _sigmoid(T x) { return T(1) / (T(1) + exp(-x)); }
template<> void Sigmoid<float, CPUContext>(
const int count,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -287,7 +298,8 @@ template<> void SigmoidGrad<float, CPUContext>(
const int count,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -310,7 +322,7 @@ template<> void Softmax<float, CPUContext>(
CPUContext* ctx) {
const int dim = count / outer_dim;
for (int i = 0; i < outer_dim; ++i) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, scale, x + i*dim);
for (int j = 0; j < classes; ++j) {
for (int k = 0; k < inner_dim; k++)
......@@ -322,13 +334,13 @@ template<> void Softmax<float, CPUContext>(
CblasNoTrans, CblasNoTrans,
classes, inner_dim, 1,
-1.0, sum_multiplier, scale, 1.0, y, ctx);
math::Exp<float, CPUContext>(dim, y, y);
math::Exp<float, CPUContext>(dim, y, y, ctx);
math::Gemv<float, CPUContext>(
CblasTrans, classes, inner_dim,
1.0, y, sum_multiplier,
0.0, scale, ctx);
for (int j = 0; j < classes; ++j) {
math::Div<float, CPUContext>(inner_dim, y, scale, y);
math::Div<float, CPUContext>(inner_dim, y, scale, y, ctx);
y += inner_dim;
}
}
......@@ -348,17 +360,16 @@ template<> void SoftmaxGrad<float, CPUContext>(
const int dim = count / outer_dim;
for (int i = 0; i < outer_dim; ++i) {
for (int k = 0; k < inner_dim; ++k)
scale[k] = math::StridedDot<float, CPUContext>(
classes,
dx + i * dim + k, inner_dim,
y + i*dim + k, inner_dim, ctx);
math::StridedDot<float, CPUContext>(classes,
dx + i * dim + k, inner_dim,
y + i * dim + k, inner_dim, scale + k, ctx);
math::Gemm<float, CPUContext>(
CblasNoTrans, CblasNoTrans,
classes, inner_dim, 1,
-1.0, sum_multiplier, scale,
1.0, dx + i * dim, ctx);
}
math::Mul<float, CPUContext>(count, dx, y, dx);
math::Mul<float, CPUContext>(count, dx, y, dx, ctx);
}
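For reference, the StridedDot / Gemm / Mul sequence above evaluates the softmax Jacobian per inner slice, with dx assumed pre-loaded with the incoming gradient: dx_i = y_i * (dy_i - sum_j dy_j * y_j). A plain reference implementation for a single row (inner_dim == 1), using hypothetical names:

#include <cstddef>
#include <vector>

std::vector<float> SoftmaxGradRow(const std::vector<float>& y,
                                  const std::vector<float>& dy) {
    float dot = 0.f;                                        // sum_j dy_j * y_j
    for (std::size_t j = 0; j < y.size(); ++j) dot += dy[j] * y[j];
    std::vector<float> dx(y.size());
    for (std::size_t i = 0; i < y.size(); ++i) dx[i] = y[i] * (dy[i] - dot);
    return dx;
}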
/******************** activation.tanh ********************/
......@@ -366,7 +377,8 @@ template<> void SoftmaxGrad<float, CPUContext>(
template<> void Tanh<float, CPUContext>(
const int count,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -379,7 +391,8 @@ template<> void TanhGrad<float, CPUContext>(
const int count,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -467,7 +480,8 @@ template <> void Clip<float, CPUContext>(
const float high,
const float* x,
float* mask,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -484,7 +498,8 @@ template <> void Equal<float, CPUContext>(
const int count,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -497,7 +512,8 @@ template <> void Equal<float, CPUContext>(
template<> void AbsGrad<float, CPUContext>(
const int count,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -651,7 +667,8 @@ template<> void SmoothL1<float, CPUContext>(
const int count,
const float beta,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -667,7 +684,8 @@ template<> void SmoothL1Grad<float, CPUContext>(
const int count,
const float beta,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -686,7 +704,8 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(
const int count,
const float* prob,
const float* target,
float* loss) {
float* loss,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -834,6 +853,20 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>(
losses, flags);
}
template <> void SparseSoftmaxCrossEntropy<float16, float, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
......@@ -851,6 +884,20 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
losses, flags);
}
template <> void SparseSoftmaxCrossEntropy<float16, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <typename Tx, typename Ty>
void _SparseSoftmaxCrossEntropyGrad(
const int outer_dim,
......@@ -897,6 +944,20 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>(
num_ignores, dx, flags);
}
template<> void SparseSoftmaxCrossEntropyGrad<float16, float, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const float* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
......@@ -914,6 +975,20 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>(
num_ignores, dx, flags);
}
template<> void SparseSoftmaxCrossEntropyGrad<float16, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
/******************** misc.astype ********************/
template <typename Ta, typename Tb>
......@@ -936,7 +1011,8 @@ void _TypeA2B_v2(const int count, const Ta* a, Tb* b) {
template <> void TypeA2B<type_a, type_b, CPUContext>( \
const int count, \
const type_a* a, \
type_b* b) { \
type_b* b, \
CPUContext* ctx) { \
_TypeA2B<type_a, type_b>(count, a, b); \
}
......@@ -944,7 +1020,8 @@ void _TypeA2B_v2(const int count, const Ta* a, Tb* b) {
template <> void TypeA2B<type_a, type_b, CPUContext>( \
const int count, \
const type_a* a, \
type_b* b) { \
type_b* b, \
CPUContext* ctx) { \
_TypeA2B_v2<type_a, type_b>(count, a, b); \
}
......@@ -952,13 +1029,15 @@ void _TypeA2B_v2(const int count, const Ta* a, Tb* b) {
template <> void TypeA2B<float16, type, CPUContext>( \
const int count, \
const float16* a, \
type* b) { \
type* b, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
} \
template <> void TypeA2B<type, float16, CPUContext>( \
const int count, \
const type* a, \
float16* b) { \
float16* b, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
}
......@@ -1039,7 +1118,8 @@ template <> void ImageData<float, float, CPUContext>(
const float* std_values,
const string& data_format,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_ImageData_NCHW<float, float>(
N, C, H, W, mean_values, std_values, x, y);
......@@ -1059,7 +1139,8 @@ template <> void ImageData<uint8_t, float, CPUContext>(
const float* std_values,
const string& data_format,
const uint8_t* x,
float* y) {
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_ImageData_NCHW<uint8_t, float>(
N, C, H, W, mean_values, std_values, x, y);
......@@ -1079,7 +1160,8 @@ template <> void ImageData<float, float16, CPUContext>(
const float* std_values,
const string& data_format,
const float* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -1093,7 +1175,8 @@ template <> void ImageData<uint8_t, float16, CPUContext>(
const float* std_values,
const string& data_format,
const uint8_t* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -1103,7 +1186,8 @@ template<> void Arange<float, CPUContext>(
const int count,
const int start,
const int step,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1114,7 +1198,8 @@ template<> void Arange<int, CPUContext>(
const int count,
const int start,
const int step,
int* y) {
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1130,7 +1215,8 @@ template<> void Argmax<float, CPUContext>(
const int top_k,
const float* x,
int64_t* indices,
float* values) {
float* values,
CPUContext* ctx) {
vector<pair<float, int> > vec(axis_dim);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
......@@ -1158,7 +1244,8 @@ template<> void Argmin<float, CPUContext>(
const int top_k,
const float* x,
int64_t* indices,
float* values) {
float* values,
CPUContext* ctx) {
vector<pair<float, int> > vec(axis_dim);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
......@@ -1182,7 +1269,8 @@ template<> void Argmin<float, CPUContext>(
template <> void CanonicalAxis<int, CPUContext>(
const int count,
const int dim,
int* y) {
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1198,7 +1286,8 @@ void _Gather(
const int y_slice_dim,
const int* indices,
const T* x,
T* y) {
T* y,
CPUContext* ctx) {
TIndex x_offset, y_offset, x_idx_offset, y_idx_offset;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
......@@ -1206,7 +1295,7 @@ void _Gather(
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
CPUContext::Copy<T, CPUContext, CPUContext>(
ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, y + y_offset, x + x_offset);
}
}
......@@ -1220,9 +1309,10 @@ template <> void Gather<float, CPUContext>(
const int y_slice_dim,
const int* indices,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
_Gather<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y);
x_slice_dim, y_slice_dim, indices, x, y, ctx);
}
template <> void Gather<int, CPUContext>(
......@@ -1233,9 +1323,10 @@ template <> void Gather<int, CPUContext>(
const int y_slice_dim,
const int* indices,
const int* x,
int* y) {
int* y,
CPUContext* ctx) {
_Gather<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y);
x_slice_dim, y_slice_dim, indices, x, y, ctx);
}
template <typename T>
......@@ -1247,7 +1338,8 @@ void _GatherGrad(
const int y_slice_dim,
const int* indices,
const T* dy,
T* dx) {
T* dx,
CPUContext* ctx) {
TIndex x_offset, y_offset, x_idx_offset, y_idx_offset;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
......@@ -1256,7 +1348,7 @@ void _GatherGrad(
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
math::Add<T, CPUContext>(inner_dim,
dy + y_offset, dx + x_offset, dx + x_offset);
dy + y_offset, dx + x_offset, dx + x_offset, ctx);
}
}
}
......@@ -1269,9 +1361,10 @@ template <> void GatherGrad<float, CPUContext>(
const int y_slice_dim,
const int* indices,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
_GatherGrad<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dy, dx);
x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
}
template <> void GatherGrad<int, CPUContext>(
......@@ -1282,9 +1375,10 @@ template <> void GatherGrad<int, CPUContext>(
const int y_slice_dim,
const int* indices,
const int* dy,
int* dx) {
int* dx,
CPUContext* ctx) {
_GatherGrad<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dy, dx);
x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
}
/******************** ndarray.concat ********************/
......@@ -1297,12 +1391,13 @@ template <> void Concat<float, CPUContext>(
const int y_concat_dim,
const int concat_offset,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = n * x_concat_dim * inner_dim;
y_offset = (n * y_concat_dim + concat_offset) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
x_concat_dim * inner_dim, y + y_offset, x + x_offset);
}
}
......@@ -1315,12 +1410,13 @@ template <> void Concat<float16, CPUContext>(
const int y_concat_dim,
const int concat_offset,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = n * x_concat_dim * inner_dim;
y_offset = (n * y_concat_dim + concat_offset) * inner_dim;
CPUContext::Copy<float16, CPUContext, CPUContext>(
ctx->Copy<float16, CPUContext, CPUContext>(
x_concat_dim * inner_dim, y + y_offset, x + x_offset);
}
}
......@@ -1333,12 +1429,13 @@ template <> void ConcatGrad<float, CPUContext>(
const int y_concat_dim,
const int concat_offset,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = n * x_concat_dim * inner_dim;
y_offset = (n * y_concat_dim + concat_offset) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
x_concat_dim * inner_dim, dx + x_offset, dy + y_offset);
}
}
......@@ -1351,12 +1448,13 @@ template <> void ConcatGrad<float16, CPUContext>(
const int y_concat_dim,
const int concat_offset,
const float16* dy,
float16* dx) {
float16* dx,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = n * x_concat_dim * inner_dim;
y_offset = (n * y_concat_dim + concat_offset) * inner_dim;
CPUContext::Copy<float16, CPUContext, CPUContext>(
ctx->Copy<float16, CPUContext, CPUContext>(
x_concat_dim * inner_dim, dx + x_offset, dy + y_offset);
}
}
......@@ -1371,7 +1469,8 @@ void _Crop1D(
const int inner_dim,
const int start,
const T* x,
T* y) {
T* y,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1381,7 +1480,7 @@ void _Crop1D(
const int o = idx / ex_dim;
const T* x_ptr = x + (o * dim + ex_d + start) * inner_dim;
T* y_ptr = y + (o * ex_dim + ex_d) * inner_dim;
CPUContext::Copy<T, CPUContext, CPUContext>(
ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, y_ptr, x_ptr);
}
}
......@@ -1393,8 +1492,10 @@ template<> void Crop1D<int, CPUContext>(
const int inner_dim,
const int start,
const int* x,
int* y) {
_Crop1D<int>(count, dim, ex_dim, inner_dim, start, x, y);
int* y,
CPUContext* ctx) {
_Crop1D<int>(count, dim, ex_dim,
inner_dim, start, x, y, ctx);
}
template<> void Crop1D<float, CPUContext>(
......@@ -1404,8 +1505,10 @@ template<> void Crop1D<float, CPUContext>(
const int inner_dim,
const int start,
const float* x,
float* y) {
_Crop1D<float>(count, dim, ex_dim, inner_dim, start, x, y);
float* y,
CPUContext* ctx) {
_Crop1D<float>(count, dim, ex_dim,
inner_dim, start, x, y, ctx);
}
template <typename T>
......@@ -1417,7 +1520,8 @@ void _Crop1DGrad(
const int start,
const int end,
const T* dy,
T* dx) {
T* dx,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1430,7 +1534,7 @@ void _Crop1DGrad(
for (int i = 0; i < inner_dim; ++i) dx_ptr[i] = 0;
} else {
const T* dy_ptr = dy + (o * ex_dim + d - start) * inner_dim;
CPUContext::Copy<T, CPUContext, CPUContext>(
ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, dx_ptr, dy_ptr);
}
}
......@@ -1444,10 +1548,11 @@ template<> void Crop1DGrad<int, CPUContext>(
const int start,
const int end,
const int* dy,
int* dx) {
int* dx,
CPUContext* ctx) {
_Crop1DGrad<int>(
count, dim, ex_dim, inner_dim,
start, end, dy, dx);
start, end, dy, dx, ctx);
}
template<> void Crop1DGrad<float, CPUContext>(
......@@ -1458,10 +1563,11 @@ template<> void Crop1DGrad<float, CPUContext>(
const int start,
const int end,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
_Crop1DGrad<float>(
count, dim, ex_dim, inner_dim,
start, end, dy, dx);
start, end, dy, dx, ctx);
}
/******************** ndarray.pad ********************/
......@@ -1474,7 +1580,8 @@ template <> void ConstPad1D<float, CPUContext>(
const int pad_l,
const float value,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1488,7 +1595,7 @@ template <> void ConstPad1D<float, CPUContext>(
for (int i = 0; i < inner_dim; ++i) y_ptr[i] = value;
} else {
const float* x_ptr = x + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, y_ptr, x_ptr);
}
}
......@@ -1501,7 +1608,8 @@ template <> void ReflectPad1D<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1518,7 +1626,7 @@ template <> void ReflectPad1D<float, CPUContext>(
y_ptr[i] = x[(o * dim + d) * inner_dim + i];
} else {
const float* x_ptr = x + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, y_ptr, x_ptr);
}
}
......@@ -1531,7 +1639,8 @@ template <> void EdgePad1D<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1546,7 +1655,7 @@ template <> void EdgePad1D<float, CPUContext>(
y_ptr[i] = x[(o * dim + d) * inner_dim + i];
} else {
const float* x_ptr = x + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, y_ptr, x_ptr);
}
}
......@@ -1559,7 +1668,8 @@ template <> void ConstPad1DGrad<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1570,7 +1680,7 @@ template <> void ConstPad1DGrad<float, CPUContext>(
const int ex_d = d + pad_l;
const float* dy_ptr = dy + (o * ex_dim + ex_d) * inner_dim;
float* dx_ptr = dx + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, dx_ptr, dy_ptr);
}
}
......@@ -1582,7 +1692,8 @@ template <> void ReflectPad1DGrad<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
for (int idx = 0; idx < count; ++idx) {
const int i = idx % inner_dim;
const int ex_d = (idx / inner_dim) % ex_dim;
......@@ -1601,7 +1712,8 @@ template <> void EdgePad1DGrad<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
for (int idx = 0; idx < count_v2; ++idx) {
const int ex_d = idx % ex_dim;
......@@ -1613,7 +1725,7 @@ template <> void EdgePad1DGrad<float, CPUContext>(
dx[(o * dim + d) * inner_dim + i] += dy_ptr[i];
} else {
float* dx_ptr = dx + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, dx_ptr, dy_ptr);
}
}
......@@ -1626,7 +1738,8 @@ template <> void OneHot<float, CPUContext>(
const int depth,
const int on_value,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1643,7 +1756,8 @@ template<> void Sum<float, CPUContext>(
const int axis_dim,
const int inner_dim,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1662,7 +1776,8 @@ template<> void SumGrad<float, CPUContext>(
const int inner_dim,
const float coeff,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1682,14 +1797,15 @@ template <> void Repeat<float, CPUContext>(
const int inner_dim,
const int repeats,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < dim; ++j) {
for (int k = 0; k < repeats; ++k) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, y, x);
y += inner_dim;
}
......@@ -1709,7 +1825,7 @@ template <> void RepeatGrad<float, CPUContext>(
CPUContext* ctx) {
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < dim; ++j) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, dx, dy);
dy += inner_dim;
for (int k = 1; k < repeats; ++k) {
......@@ -1732,12 +1848,13 @@ template <> void Slice<float, CPUContext>(
const int y_slice_dim,
const int slice_offset,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + slice_offset) * inner_dim;
y_offset = n * y_slice_dim * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
y_slice_dim * inner_dim, y + y_offset, x + x_offset);
}
}
......@@ -1750,12 +1867,13 @@ template <> void SliceGrad<float, CPUContext>(
const int y_slice_dim,
const int slice_offset,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + slice_offset) * inner_dim;
y_offset = n * y_slice_dim * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
y_slice_dim * inner_dim, dx + x_offset, dy + y_offset);
}
}
......@@ -1768,10 +1886,11 @@ template <> void Tile<float, CPUContext>(
const int ex_inner_dim,
const int multiple,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
for (int i = 0; i < outer_dim; ++i) {
for (int t = 0; t < multiple; ++t) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
ex_inner_dim, y, x);
y += ex_inner_dim;
}
......@@ -1788,7 +1907,7 @@ template <> void TileGrad<float, CPUContext>(
float* dx,
CPUContext* ctx) {
for (int i = 0; i < outer_dim; ++i) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
ex_inner_dim, dx, dy);
dy += ex_inner_dim;
for (int t = 1; t < multiple; ++t) {
......@@ -1809,7 +1928,8 @@ template <> void Transpose<float, CPUContext>(
const int* old_steps,
const int* new_steps,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1831,7 +1951,8 @@ template <> void Transpose<float16, CPUContext>(
const int* old_steps,
const int* new_steps,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -1842,7 +1963,8 @@ template <> void TransposeGrad<float, CPUContext>(
const int* old_steps,
const int* new_steps,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1864,7 +1986,8 @@ template <> void TransposeGrad<float16, CPUContext>(
const int* old_steps,
const int* new_steps,
const float16* dy,
float16* dx) {
float16* dx,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -1877,7 +2000,8 @@ template <> void LSTMCell<float, CPUContext>(
const float* cx,
float* xact,
float* c,
float* h) {
float* h,
CPUContext* ctx) {
float i, f, o, c_;
int f_offset = C, o_offset = 2 * C, c_offset = 3 * C, x_offset = 4 * C;
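// The gate pre-activations are packed along the channel axis as [i | f | o | g],
// each of width C; the offsets below index that layout (x_offset = 4 * C is
// presumably the per-sample stride of xact).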
for (int n = 0; n < N; ++n) {
......@@ -1903,7 +2027,8 @@ template <> void LSTMCellGrad<float, CPUContext>(
const float* dc,
const float* dh,
float* dcx,
float* dx) {
float* dx,
CPUContext* ctx) {
float i, f, o, g, tanh_c, dcx_sum_term;
int f_offset = C,
o_offset = 2 * C,
......@@ -1964,7 +2089,8 @@ template <> void AdamUpdate<float, CPUContext>(
const float eps,
float* g,
float* m,
float* v) {
float* v,
CPUContext* ctx) {
_AdamUpdate<float>(count, lr, beta1, beta2, eps, g, m, v);
}
......@@ -1976,7 +2102,8 @@ template <> void AdamUpdate<float16, CPUContext>(
const float eps,
float16* g,
float16* m,
float16* v) {
float16* v,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -2004,7 +2131,8 @@ template <> void NesterovUpdate<float, CPUContext>(
const float lr,
const float momentum,
float* g,
float* h) {
float* h,
CPUContext* ctx) {
_NesterovUpdate<float>(count, lr, momentum, g, h);
}
......@@ -2013,7 +2141,8 @@ template <> void NesterovUpdate<float16, CPUContext>(
const float lr,
const float momentum,
float16* g,
float16* h) {
float16* h,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -2043,7 +2172,8 @@ template <> void RMSPropUpdate<float, CPUContext>(
const float decay,
const float eps,
float* g,
float* h) {
float* h,
CPUContext* ctx) {
_RMSPropUpdate<float>(count, lr, decay, eps, g, h);
}
......@@ -2053,7 +2183,8 @@ template <> void RMSPropUpdate<float16, CPUContext>(
const float decay,
const float eps,
float16* g,
float16* h) {
float16* h,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -2080,7 +2211,8 @@ template <> void SGDUpdate<float, CPUContext>(
const float lr,
const float momentum,
float* g,
float* h) {
float* h,
CPUContext* ctx) {
_SGDUpdate<float>(count, lr, momentum, g, h);
}
......@@ -2089,7 +2221,8 @@ template <> void SGDUpdate<float16, CPUContext>(
const float lr,
const float momentum,
float16* g,
float16* h) {
float16* h,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -2217,7 +2350,8 @@ template <> void BilinearResize<float, CPUContext>(
const int out_w,
const string& data_format,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const float scale_h = (float)H / out_h;
const float scale_w = (float)W / out_w;
if (data_format == "NCHW") {
......@@ -2326,10 +2460,10 @@ template <> void BilinearResizeGrad<float, CPUContext>(
const int out_w,
const string& data_format,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
const float scale_h = (float)H / out_h;
const float scale_w = (float)W / out_w;
math::Set<float, CPUContext>(N * C * H * W, 0, dx);
if (data_format == "NCHW") {
_BilinearResizeGrad_NCHW<float>(
N, C, H, W, out_h, out_w,
......@@ -2439,7 +2573,8 @@ template <> void Im2Col2d<float, CPUContext>(
const int dilation_w,
const string& data_format,
const float* im,
float* col) {
float* col,
CPUContext* ctx) {
if (data_format == "NCHW") {
const int count = (C * col_h * col_w);
_Im2Col2d_NCHW<float>(
......@@ -2471,8 +2606,9 @@ void _Col2Im2d_NCHW(
const int dilation_h,
const int dilation_w,
const T* col,
T* im) {
math::Set<float, CPUContext>(C * H * W, 0, im);
T* im,
CPUContext* ctx) {
math::Set<float, CPUContext>(C * H * W, 0, im, ctx);
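// Zero the output image first; col2im then scatter-adds every column entry
// back to its source pixel, so the fill has to happen before the accumulation
// loops below.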
const int im_offset = H * W;
for (int c = 0; c < C; ++c, im += im_offset) {
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -2512,8 +2648,9 @@ void _Col2Im2d_NHWC(
const int dilation_h,
const int dilation_w,
const T* col,
T* im) {
math::Set<float, CPUContext>(C * H * W, 0, im);
T* im,
CPUContext* ctx) {
math::Set<float, CPUContext>(C * H * W, 0, im, ctx);
for (int output_h = 0; output_h < col_h; ++output_h) {
const int base_h = -pad_h + stride_h * output_h;
for (int output_w = 0; output_w < col_w; ++output_w) {
......@@ -2552,19 +2689,20 @@ template<> void Col2Im2d<float, CPUContext>(
const int dilation_w,
const string& data_format,
const float* col,
float* im) {
float* im,
CPUContext* ctx) {
if (data_format == "NCHW") {
const int count = (C * H * W);
_Col2Im2d_NCHW<float>(
C, H, W, col_h, col_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, col, im);
dilation_h, dilation_w, col, im, ctx);
} else if (data_format == "NHWC") {
const int count = (H * W * C);
_Col2Im2d_NHWC<float>(
C, H, W, col_h, col_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, col, im);
dilation_h, dilation_w, col, im, ctx);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
......@@ -2632,7 +2770,8 @@ template <> void NNResize<float, CPUContext>(
const int out_w,
const string& data_format,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const float scale_h = (float)H / out_h;
const float scale_w = (float)W / out_w;
if (data_format == "NCHW") {
......@@ -2708,10 +2847,10 @@ template <> void NNResizeGrad<float, CPUContext>(
const int out_w,
const string& data_format,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
const float scale_h = (float)H / out_h;
const float scale_w = (float)W / out_w;
math::Set<float, CPUContext>(N * C * H * W, 0, dx);
if (data_format == "NCHW") {
_NNResizeGrad_NCHW<float>(
N, C, H, W, out_h, out_w,
......@@ -2847,7 +2986,8 @@ template<> void MAXPooling2d<float, CPUContext>(
const string& data_format,
const float* x,
int* mask,
float* y) {
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_MAXPooling2d_NCHW<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
......@@ -2966,7 +3106,8 @@ template<> void AVGPooling2d<float, CPUContext>(
const int pad_w,
const string& data_format,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_AVGPooling2d_NCHW<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
......@@ -2994,10 +3135,11 @@ void _MAXPooling2dGrad_NCHW(
const int pad_w,
const float* dy,
const int* mask,
float* dx) {
float* dx,
CPUContext* ctx) {
int x_offset = H * W;
int y_offset = pool_h * pool_w;
math::Set<float, CPUContext>(N * C * H * W, 0, dx);
math::Set<float, CPUContext>(N * C * H * W, 0, dx, ctx);
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
for (int ph = 0; ph < pool_h; ++ph) {
......@@ -3030,10 +3172,11 @@ void _MAXPooling2dGrad_NHWC(
const int pad_w,
const float* dy,
const int* mask,
float* dx) {
float* dx,
CPUContext* ctx) {
int x_offset = H * W * C;
int y_offset = pool_h * pool_w * C;
math::Set<float, CPUContext>(N * H * W * C, 0, dx);
math::Set<float, CPUContext>(N * H * W * C, 0, dx, ctx);
for (int n = 0; n < N; ++n) {
for (int ph = 0; ph < pool_h; ph++) {
for (int pw = 0; pw < pool_w; ++pw) {
......@@ -3067,15 +3210,16 @@ template<> void MAXPooling2dGrad<float, CPUContext>(
const string& data_format,
const float* dy,
const int* mask,
float* dx) {
float* dx,
CPUContext* ctx) {
if (data_format == "NCHW") {
_MAXPooling2dGrad_NCHW<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dy, mask, dx);
stride_h, stride_w, pad_h, pad_w, dy, mask, dx, ctx);
} else if (data_format == "NHWC") {
_MAXPooling2dGrad_NHWC<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dy, mask, dx);
stride_h, stride_w, pad_h, pad_w, dy, mask, dx, ctx);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
......@@ -3094,10 +3238,11 @@ void _AVGPooling2dGrad_NCHW(
const int pad_h,
const int pad_w,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
int x_offset = H * W;
int y_offset = pool_h * pool_w;
math::Set<float, CPUContext>(N * C * H * W, 0, dx);
math::Set<float, CPUContext>(N * C * H * W, 0, dx, ctx);
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
for (int ph = 0; ph < pool_h; ++ph) {
......@@ -3141,10 +3286,11 @@ void _AVGPooling2dGrad_NHWC(
const int pad_h,
const int pad_w,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
int x_offset = H * W * C;
int y_offset = pool_h * pool_w * C;
math::Set<float, CPUContext>(N * H * W * C, 0, dx);
math::Set<float, CPUContext>(N * H * W * C, 0, dx, ctx);
for (int n = 0; n < N; ++n) {
for (int ph = 0; ph < pool_h; ph++) {
for (int pw = 0; pw < pool_w; ++pw) {
......@@ -3187,15 +3333,16 @@ template<> void AVGPooling2dGrad<float, CPUContext>(
const int pad_w,
const string& data_format,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
if (data_format == "NCHW") {
_AVGPooling2dGrad_NCHW<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dy, dx);
stride_h, stride_w, pad_h, pad_w, dy, dx, ctx);
} else if (data_format == "NHWC") {
_AVGPooling2dGrad_NHWC<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dy, dx);
stride_h, stride_w, pad_h, pad_w, dy, dx, ctx);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
......@@ -3214,12 +3361,11 @@ template<> void ROIPooling<float, CPUContext>(
const float* x,
const float* rois,
int* mask,
float* y) {
float* y,
CPUContext* ctx) {
const TIndex x_offset = H * W,
y_offset = pool_h * pool_w,
im_offset = C * H * W;
math::Set<float, CPUContext>(count, -FLT_MAX, y);
math::Set<int, CPUContext>(count, -1, mask);
for (int n = 0; n < num_rois; ++n) {
int im_idx = rois[0];
int x1 = round(rois[1] * spatial_scale);
......@@ -3248,10 +3394,10 @@ template<> void ROIPooling<float, CPUContext>(
end_w = std::min(end_w, W);
bool is_empty = (end_h == start_h) || (end_w == start_w);
const int pool_idx = ph * pool_w + pw;
if (is_empty) {
y[pool_idx] = 0;
mask[pool_idx] = -1;
}
if (is_empty || im_idx < 0) y[pool_idx] = 0;
else y[pool_idx] = -FLT_MAX;
mask[pool_idx] = -1;
if (im_idx < 0) continue;
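// Each output bin is now initialized inline (replacing the removed global
// fills): empty bins or a negative batch index yield 0, valid bins start from
// -FLT_MAX before the max over the pooling window, and mask starts at -1.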
for (int h = start_h; h < end_h; ++h) {
for (int w = start_w; w < end_w; ++w) {
const int idx = h * W + w;
......@@ -3286,7 +3432,8 @@ template<> void ROIPoolingGrad<float, CPUContext>(
const float* dy,
const float* rois,
const int* mask,
float* dx) {
float* dx,
CPUContext* ctx) {
NOT_IMPLEMENTED;
}
......@@ -3305,7 +3452,8 @@ template<> void ROIAlign<float, CPUContext>(
const int sampling_ratio,
const float* x,
const float* rois,
float* y) {
float* y,
CPUContext* ctx) {
NOT_IMPLEMENTED;
}
......@@ -3322,7 +3470,8 @@ template<> void ROIAlignGrad<float, CPUContext>(
const int sampling_ratio,
const float* dy,
const float* rois,
float* dx) {
float* dx,
CPUContext* ctx) {
NOT_IMPLEMENTED;
}
......
This diff could not be displayed because it is too large.
......@@ -23,7 +23,7 @@ __global__ void _ReluHalf(
const half* x,
half* y) {
const half kZero = __float2half(0.f);
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hgt(x[idx], kZero) ?
x[idx] : __hmul(x[idx], slope);
......@@ -38,7 +38,7 @@ __global__ void _ReluHalf2(
const half2* x,
half2* y) {
const half2 kZero = __float2half2_rn(0.f);
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hbgt2(x[idx], kZero) ?
x[idx] : __hmul2(x[idx], slope);
......@@ -51,20 +51,23 @@ template<> void Relu<float16, CUDAContext>(
const int count,
const float slope,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
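// When count is even, the half2 kernel packs two fp16 values per element,
// halving the launch size; odd counts fall back to the scalar half kernel.
// Both launches are queued on ctx->cuda_stream() instead of the legacy default
// stream (dragon_cast<half2, float>(slope) presumably broadcasts the scalar
// slope into both lanes).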
if (count % 2 == 0) {
if ((count & 1) == 0) {
_ReluHalf2<half2>
<< < CUDA_BLOCKS(count), CUDA_THREADS >> > (count / 2,
dragon_cast<half2, float>(slope),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> > (count >> 1,
dragon_cast<half2, float>(slope),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_ReluHalf<half>
<< < CUDA_BLOCKS(count), CUDA_THREADS >> >(count,
dragon_cast<half, float>(slope),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(slope),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -82,7 +85,7 @@ __global__ void _AffineWithOBiasHalf(
const half* x,
const half* alpha,
half* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hmul(alpha[scale_idx], x[idx]);
......@@ -99,7 +102,7 @@ __global__ void _AffineWithBiasHalf(
const half* alpha,
const half* beta,
half* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hadd(
......@@ -125,25 +128,184 @@ template<> void Affine<float16, CUDAContext>(
#ifdef WITH_CUDA_FP16
if (beta != nullptr) {
_AffineWithBiasHalf<float>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<const half*>(beta),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<const half*>(beta),
reinterpret_cast<half*>(y));
} else {
_AffineWithOBiasHalf<float>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** loss.sparse_softmax_cross_entropy ********************/
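// The fp16 kernels below compute the per-position negative log-likelihood in
// half precision on the context's stream: losses[idx] receives
// -log(max(prob[label], HFLT_MIN)) and flags[idx] is 1 for counted positions
// or 0 for ignored labels, presumably so the caller can normalize by the
// number of valid positions (e.g. loss = sum(losses) / max(sum(flags), 1),
// a normalization sketch not shown in this diff).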
template <typename Ty>
__global__ void _SparseSoftmaxCrossEntropyHalf(
const int count,
const int axis_dim,
const int inner_dim,
const half* prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int oix = idx / inner_dim;
const int iix = idx % inner_dim;
const int label = labels[oix * inner_dim + iix];
int k;
for (k = 0; k < num_ignores; k++) {
if (label == ignores[k]) {
losses[idx] = flags[idx] = 0;
break;
}
}
if (k == num_ignores) {
const half kMIN = __float2half(HFLT_MIN);
half loss = __hneg(
hlog(
__hgt(prob[(oix * axis_dim + label)
* inner_dim + iix], kMIN) ?
prob[(oix * axis_dim + label)
* inner_dim + iix] : kMIN
)
);
losses[idx] = __half2float(loss);
flags[idx] = 1;
}
#endif
}
}
template <> void SparseSoftmaxCrossEntropy<float16, float, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyHalf<float>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores, losses, flags);
}
template <> void SparseSoftmaxCrossEntropy<float16, int64_t, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyHalf<int64_t>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores, losses, flags);
}
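// The gradient kernels below appear to assume dx has been pre-filled with the
// softmax probabilities: subtracting 1 at the target index yields the usual
// prob - one_hot(label) gradient, while ignored labels zero their whole class
// column and set flags[idx] = 0.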
template <typename Ty>
__global__ void _SparseSoftmaxCrossEntropyGradHalf(
const int count,
const int axis_dim,
const int inner_dim,
const half* prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
half* dx,
float* flags) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int oix = idx / inner_dim;
const int iix = idx % inner_dim;
const int label = labels[oix * inner_dim + iix];
int k;
for (k = 0; k < num_ignores; k++)
if (label == ignores[k]) break;
if (k != num_ignores) {
for (int c = 0; c < axis_dim; c++)
dx[(oix * axis_dim + c) * inner_dim + iix]
= __float2half(0.f);
flags[idx] = 0;
} else {
const int x_idx = (oix * axis_dim + label) * inner_dim + iix;
dx[x_idx] = __hsub(dx[x_idx], __float2half(1.f));
flags[idx] = 1;
}
#endif
}
}
template<> void SparseSoftmaxCrossEntropyGrad<float16, float, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const float* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGradHalf<float>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores,
reinterpret_cast<half*>(dx), flags);
}
template<> void SparseSoftmaxCrossEntropyGrad<float16, int64_t, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGradHalf<int64_t>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores,
reinterpret_cast<half*>(dx), flags);
}
/******************** misc.astype ********************/
#ifdef WITH_CUDA_FP16
......@@ -151,7 +313,7 @@ __global__ void _TypeHalf2Float(
const int count,
const half* a,
float* b) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
b[idx] = __half2float(a[idx]);
}
}
......@@ -159,7 +321,7 @@ __global__ void _TypeFloat2Half(
const int count,
const float* a,
half* b) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
b[idx] = __float2half(a[idx]);
}
}
......@@ -168,7 +330,7 @@ __global__ void _TypeHalf2Half(
const int count,
const half* a,
half* b) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
b[idx] = a[idx];
}
}
......@@ -178,14 +340,16 @@ __global__ void _TypeHalf2Half(
template <> void TypeA2B<float16, type, CUDAContext>( \
const int count, \
const float16* a, \
type* b) { \
type* b, \
CUDAContext* ctx) { \
LOG(FATAL) << "CUDAContext has not implemented: float16 -> " \
<< TypeMetaToString(TypeMeta::Make<type>()); \
} \
template <> void TypeA2B<type, float16, CUDAContext>( \
const int count, \
const type* a, \
float16* b) { \
float16* b, \
CUDAContext* ctx) { \
LOG(FATAL) << "CUDAContext has not implemented: " \
<< TypeMetaToString(TypeMeta::Make<type>()) << " -> float16"; \
}
......@@ -194,29 +358,35 @@ __global__ void _TypeHalf2Half(
template <> void TypeA2B<float16, float, CUDAContext>( \
const int count, \
const float16* a, \
float* b) { \
float* b, \
CUDAContext* ctx) { \
_TypeHalf2Float \
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >( \
count, reinterpret_cast<const half*>(a), b); \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >(count, \
reinterpret_cast<const half*>(a), b); \
} \
template <> void TypeA2B<float, float16, CUDAContext>( \
const int count, \
const float* a, \
float16* b) { \
float16* b, \
CUDAContext* ctx) { \
_TypeFloat2Half \
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >( \
count, a, reinterpret_cast<half*>(b)); \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >(count, \
a, reinterpret_cast<half*>(b)); \
}
#ifdef WITH_CUDA_FP16
template <> void TypeA2B<float16, float16, CUDAContext>(
const int count,
const float16* a,
float16* b) {
float16* b,
CUDAContext* ctx) {
_TypeHalf2Half
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b));
}
DEFINE_TYPE_ENABLE_FP16_FP32;
DEFINE_TYPE_DISABLE_FP16(double);
......@@ -227,7 +397,8 @@ DEFINE_TYPE_DISABLE_FP16(uint8_t);
template <> void TypeA2B<float16, float16, CUDAContext>(
const int count,
const float16* a,
float16* b) {
float16* b,
CUDAContext* ctx) {
LOG(FATAL) << "CUDAContext has not implemented: float16 -> float16";
}
DEFINE_TYPE_DISABLE_FP16(float);
......@@ -251,7 +422,7 @@ __global__ void _ImageDataHalf_NCHW(
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int w = idx % W;
const int h = (idx / W) % H;
const int c = (idx / W / H) % C;
......@@ -274,7 +445,7 @@ __global__ void _ImageDataHalf_NHWC(
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int c = idx % C;
float raw_value = x[idx];
if (mean_values) raw_value -= mean_values[c];
......@@ -294,18 +465,21 @@ template <> void ImageData<float, float16, CUDAContext>(
const float* std_values,
const string& data_format,
const float* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<float, half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<float, half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
#else
CUDA_FP16_NOT_COMPILED;
......@@ -322,18 +496,21 @@ template <> void ImageData<uint8_t, float16, CUDAContext>(
const float* std_values,
const string& data_format,
const uint8_t* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<uint8_t, half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<uint8_t, half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
#else
CUDA_FP16_NOT_COMPILED;
......@@ -352,7 +529,7 @@ __global__ void _ConcatHalf(
const int concat_offset,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int tmp = x_concat_dim * inner_dim;
const int outer_idx = idx / tmp;
const int concat_idx = idx % tmp;
......@@ -370,14 +547,16 @@ template <> void Concat<float16, CUDAContext>(
const int y_concat_dim,
const int concat_offset,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_ConcatHalf<half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
outer_dim, inner_dim,
x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -393,7 +572,7 @@ __global__ void _ConcatGradHalf(
const int concat_offset,
const T* dy,
T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int tmp = x_concat_dim * inner_dim;
const int outer_idx = idx / tmp;
const int concat_idx = idx % tmp;
......@@ -411,14 +590,16 @@ template <> void ConcatGrad<float16, CUDAContext>(
const int y_concat_dim,
const int concat_offset,
const float16* dy,
float16* dx) {
float16* dx,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_ConcatGradHalf<half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
outer_dim, inner_dim,
x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -435,7 +616,7 @@ __global__ void _TransposeHalf(
const int* new_steps,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
int x_idx = 0, y_idx = idx;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
......@@ -453,13 +634,15 @@ template <> void Transpose<float16, CUDAContext>(
const int* old_steps,
const int* new_steps,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_TransposeHalf<half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -474,7 +657,7 @@ __global__ void _TransposeGradHalf(
const int* new_steps,
const T* dy,
T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
int x_idx = 0, y_idx = idx;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
......@@ -492,13 +675,15 @@ template <> void TransposeGrad<float16, CUDAContext>(
const int* old_steps,
const int* new_steps,
const float16* dy,
float16* dx) {
float16* dx,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_TransposeGradHalf<half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -516,7 +701,7 @@ __global__ void _AdamUpdateHalf(
half* g,
half* m,
half* v) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half gi = g[i];
half kOne = __float2half(1.f);
......@@ -545,17 +730,19 @@ template <> void AdamUpdate<float16, CUDAContext>(
const float eps,
float16* g,
float16* m,
float16* v) {
float16* v,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_AdamUpdateHalf
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, dragon_cast<half, float>(lr),
dragon_cast<half, float>(beta1),
dragon_cast<half, float>(beta2),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(m),
reinterpret_cast<half*>(v));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(beta1),
dragon_cast<half, float>(beta2),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(m),
reinterpret_cast<half*>(v));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -570,7 +757,7 @@ __global__ void _NesterovUpdateHalf(
const half momentum,
half* g,
half* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half hi = h[i];
half hi_new = h[i] = __hadd(
......@@ -592,7 +779,7 @@ __global__ void _NesterovUpdateHalf2(
const half2 momentum,
half2* g,
half2* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half2 hi = h[i];
half2 hi_new = h[i] = __hadd2(
......@@ -614,22 +801,25 @@ template <> void NesterovUpdate<float16, CUDAContext>(
const float lr,
const float momentum,
float16* g,
float16* h) {
float16* h,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0) {
if ((count & 1) == 0) {
_NesterovUpdateHalf2
<< <CUDA_BLOCKS(count / 2), CUDA_THREADS >> >(
count / 2, dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count >> 1,
dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
} else {
_NesterovUpdateHalf
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -646,7 +836,7 @@ __global__ void _RMSPropUpdateHalf(
const half eps,
half* g,
half* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half gi = g[i];
half kOne = __float2half(1.f);
......@@ -669,15 +859,17 @@ template <> void RMSPropUpdate<float16, CUDAContext>(
const float decay,
const float eps,
float16* g,
float16* h) {
float16* h,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_RMSPropUpdateHalf
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, dragon_cast<half, float>(lr),
dragon_cast<half, float>(decay),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(decay),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -692,7 +884,7 @@ __global__ void _SGDUpdateHalf(
const half momentum,
half* g,
half* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half hi = h[i];
g[i] = h[i] = __hadd(
......@@ -709,7 +901,7 @@ __global__ void _SGDUpdateHalf2(
const half2 momentum,
half2* g,
half2* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half2 hi = h[i];
g[i] = h[i] = __hadd2(
......@@ -726,22 +918,25 @@ template <> void SGDUpdate<float16, CUDAContext>(
const float lr,
const float momentum,
float16* g,
float16* h) {
float16* h,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0) {
if ((count & 1) == 0) {
_SGDUpdateHalf2
<< <CUDA_BLOCKS(count / 2), CUDA_THREADS >> >(
count / 2, dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count >> 1,
dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
} else {
_SGDUpdateHalf
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
}
#else
CUDA_FP16_NOT_COMPILED;
......
......@@ -162,7 +162,7 @@ template<> void Axpby(
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta * y[i];
}
template<> float ASum(
template<> float Sum(
const int n,
const float* x) {
__m128 x1, sum = SSE_FP32_ZERO;
......