Commit 5cd0761b by Ting PAN

Unlock CUDA Async Streams

1 parent 3b990761
Showing with 3672 additions and 2829 deletions
......@@ -52,9 +52,9 @@ using Set = std::unordered_set<Value> ;
/*
* Define the Kernel version.
*
* | Major(2) | Minor(2) | Patch(10) |
* | Major(2) | Minor(2) | Patch(11) |
*/
#define DRAGON_VERSION 2210
#define DRAGON_VERSION 2211
/*
* Define the default random seed.
......
......@@ -34,6 +34,8 @@ class CPUContext {
virtual ~CPUContext() {}
inline void SwitchToDevice() {}
inline void SwitchToDevice(int stream_id) {}
inline void FinishDeviceCompution() {}
inline static void* New(size_t nbytes) {
......@@ -47,7 +49,15 @@ class CPUContext {
return data;
}
inline static void Memset(size_t nbytes, void* ptr) {
inline static void Memset(
size_t nbytes,
void* ptr) {
memset(ptr, 0, nbytes);
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
memset(ptr, 0, nbytes);
}
......@@ -59,18 +69,16 @@ class CPUContext {
memcpy(dst, src, nbytes);
}
inline static void Delete(void* data) { free(data); }
template<class DstContext, class SrcContext>
inline static void MemcpyAsync(
inline void MemcpyAsync(
size_t nbytes,
void* dst,
const void* src) {
NOT_IMPLEMENTED;
memcpy(dst, src, nbytes);
}
template<typename T, class DstContext, class SrcContext>
inline static void Copy(
inline void Copy(
int n,
T* dst,
const T* src) {
......@@ -82,7 +90,10 @@ class CPUContext {
else for (int i = 0; i < n; i++) dst[i] = src[i];
}
inline static void Delete(void* data) { free(data); }
inline int device_id() const { return 0; }
inline void set_stream_id(int stream_id) {}
inline std::mt19937* rand_generator() {
if (!rand_generator_.get())
......
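The CPUContext hunks above add instance-level MemsetAsync, MemcpyAsync, Copy, and set_stream_id so that CPU and CUDA contexts expose the same surface; on the CPU the "async" calls simply fall back to synchronous memset/memcpy. A minimal sketch of that idea, using toy names rather than Dragon's actual headers:

```cpp
#include <cstring>
#include <iostream>

// Toy stand-in for a CPU context: the "async" calls are synchronous,
// which keeps call sites identical to the CUDA path.
struct ToyCPUContext {
    void SwitchToDevice(int /*stream_id*/) {}
    void MemsetAsync(size_t nbytes, void* ptr) { std::memset(ptr, 0, nbytes); }
    void MemcpyAsync(size_t nbytes, void* dst, const void* src) {
        std::memcpy(dst, src, nbytes);
    }
    void FinishDeviceCompution() {}  // nothing to wait for on the CPU
};

// Device-generic helper: a CUDA-like context that maps these calls onto
// cudaMemsetAsync / cudaMemcpyAsync could be dropped in unchanged.
template <class Context>
void ZeroThenCopy(Context* ctx, float* dst, const float* src, int n) {
    ctx->MemsetAsync(n * sizeof(float), dst);
    ctx->MemcpyAsync(n * sizeof(float), dst, src);
    ctx->FinishDeviceCompution();
}

int main() {
    float src[4] = {1, 2, 3, 4}, dst[4];
    ToyCPUContext ctx;
    ZeroThenCopy(&ctx, dst, src, 4);
    std::cout << dst[0] << " " << dst[3] << std::endl;  // 1 4
}
```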
......@@ -23,8 +23,7 @@ namespace dragon {
class CUDAObject {
public:
CUDAObject(int default_stream = 1)
: default_stream(default_stream) {
CUDAObject() {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
cuda_streams[i] = vector<cudaStream_t>();
cublas_handles[i] = vector<cublasHandle_t>();
......@@ -38,7 +37,7 @@ class CUDAObject {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
for (int j = 0; j < cuda_streams[i].size(); j++) {
auto& stream = cuda_streams[i][j];
// follow caffe2, do not check the stream destroying
// follow caffe2: do not check errors when destroying the stream
// Error code 29 (driver shutting down) is inevitable
// TODO(PhyscalX): Can someone solve this issue?
if (stream) cudaStreamDestroy(stream);
......@@ -52,19 +51,21 @@ class CUDAObject {
}
}
/**
* Each device takes a group of streams.
*
* The stream 0 is reserved for default stream,
* stream 1 or higher is created as ``cudaStreamNonBlocking``.
*/
// follow caffe2,
// each device takes a group of non-blocking streams;
// stream 0 is reserved for the default stream,
// as some computations really require it,
// e.g. cublas.asum() and mixed cpu/cuda operations
// besides, some calls, such as cudnn.conv() and cudnn.rnn(),
// produce wrong results if running them on non-blocking streams
// note that caffe2 also uses default streams (within CuDNNState)
cudaStream_t GetStream(int device_id, int stream_id) {
vector<cudaStream_t>& dev_streams = cuda_streams[device_id];
if (dev_streams.size() <= (unsigned)stream_id)
dev_streams.resize(stream_id + 1, nullptr);
if (!dev_streams[stream_id]) {
DeviceGuard guard(device_id);
unsigned int flags = !stream_id && default_stream ?
unsigned int flags = !stream_id ?
cudaStreamDefault : cudaStreamNonBlocking;
CUDA_CHECK(cudaStreamCreateWithFlags(
&dev_streams[stream_id], flags));
......@@ -102,8 +103,6 @@ class CUDAObject {
}
#endif
int default_stream;
vector<cudaStream_t> cuda_streams[CUDA_MAX_DEVICES];
vector<cublasHandle_t> cublas_handles[CUDA_MAX_DEVICES];
#ifdef WITH_CUDNN
......@@ -129,11 +128,10 @@ class CUDAContext {
stream_id_ = stream_id;
}
inline void SwitchToDevice() { SwitchToDevice(0); }
inline void SwitchToDevice() { SwitchToDevice(1); }
inline void FinishDeviceCompution() {
cudaStreamSynchronize(cuda_object_
.GetStream(device_id_, stream_id_));
cudaStreamSynchronize(cuda_stream());
cudaError_t error = cudaGetLastError();
CHECK_EQ(error, cudaSuccess)
<< "\nCUDA Error: " << cudaGetErrorString(error);
......@@ -147,8 +145,17 @@ class CUDAContext {
return data;
}
inline static void Memset(size_t nbytes, void* ptr) {
cudaMemset(ptr, 0, nbytes);
inline static void Memset(
size_t nbytes,
void* ptr) {
CUDA_CHECK(cudaMemset(ptr, 0, nbytes));
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
CUDA_CHECK(cudaMemsetAsync(ptr, 0,
nbytes, cuda_stream()));
}
template<class DstContext, class SrcContext>
......@@ -169,20 +176,22 @@ class CUDAContext {
cudaMemcpyDefault, cuda_stream()));
}
inline static void Delete(void* data) { cudaFree(data); }
template<typename T, class DstContext, class SrcContext>
static void Copy(
inline void Copy(
int n,
T* dst,
const T* src) {
if (dst == src) return;
Memcpy<SrcContext, DstContext>(
MemcpyAsync<SrcContext, DstContext>(
n * sizeof(T), (void*)dst, (const void*)src);
}
inline static void Delete(void* data) { cudaFree(data); }
inline int device_id() const { return device_id_; }
inline void set_stream_id(int stream_id) { stream_id_ = stream_id; }
inline cudaStream_t cuda_stream() {
return cuda_stream(device_id_, stream_id_);
}
......@@ -227,7 +236,7 @@ class CUDAContext {
static thread_local CUDAObject cuda_object_;
private:
int device_id_, stream_id_ = 0, random_seed_;
int device_id_, stream_id_ = 1, random_seed_;
unique_ptr<std::mt19937> rand_generator_;
curandGenerator_t curand_generator_ = nullptr;
};
......@@ -271,7 +280,7 @@ class CUDAClosure {
protected:
Context* ctx_;
CUDAObject cuda_object_ = 0;
CUDAObject cuda_object_;
vector<int> active_streams_;
};
......@@ -283,8 +292,22 @@ class CUDAContext {
CUDAContext(const int device_id = 0) { CUDA_NOT_COMPILED; }
inline void SwitchToDevice() { CUDA_NOT_COMPILED; }
inline void SwitchToDevice(int stream_id) { CUDA_NOT_COMPILED; }
inline void FinishDeviceCompution() { CUDA_NOT_COMPILED; }
inline static void Memset(
size_t nbytes,
void* ptr) {
CUDA_NOT_COMPILED;
}
inline void MemsetAsync(
size_t nbytes,
void* ptr) {
CUDA_NOT_COMPILED;
}
template<class DstContext, class SrcContext>
inline static void Memcpy(
size_t nbytes,
......@@ -302,6 +325,7 @@ class CUDAContext {
}
inline int device_id() const { return 0; }
inline void set_stream_id(int stream_id) {}
};
#endif // WITH_CUDA
......
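The CUDAObject changes drop the old default_stream switch: stream 0 of every device is created with cudaStreamDefault, stream 1 and above are created lazily with cudaStreamNonBlocking, and contexts now default to stream_id_ = 1. A self-contained sketch of that lazy per-device stream pool using only the CUDA runtime API (error handling simplified; names are illustrative):

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

#define CHECK_CUDA(expr)                                       \
    do {                                                       \
        cudaError_t e = (expr);                                \
        if (e != cudaSuccess)                                  \
            printf("CUDA error: %s\n", cudaGetErrorString(e)); \
    } while (0)

constexpr int kMaxDevices = 16;

// Lazily created streams, grouped per device.
// Index 0 uses cudaStreamDefault (blocking); 1+ use cudaStreamNonBlocking.
std::vector<cudaStream_t> g_streams[kMaxDevices];

cudaStream_t GetStream(int device_id, int stream_id) {
    auto& dev_streams = g_streams[device_id];
    if (dev_streams.size() <= (size_t)stream_id)
        dev_streams.resize(stream_id + 1, nullptr);
    if (!dev_streams[stream_id]) {
        int prev = 0;
        CHECK_CUDA(cudaGetDevice(&prev));
        CHECK_CUDA(cudaSetDevice(device_id));
        unsigned int flags =
            (stream_id == 0) ? cudaStreamDefault : cudaStreamNonBlocking;
        CHECK_CUDA(cudaStreamCreateWithFlags(&dev_streams[stream_id], flags));
        CHECK_CUDA(cudaSetDevice(prev));
    }
    return dev_streams[stream_id];
}

int main() {
    // Stream 1 is the new per-context default: work queued on it does not
    // implicitly synchronize with the legacy default stream.
    cudaStream_t s = GetStream(0, 1);
    CHECK_CUDA(cudaStreamSynchronize(s));
    return 0;
}
```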
......@@ -37,7 +37,8 @@ class GraphBase {
virtual bool Run(
const string& include,
const string& exclude) = 0;
const string& exclude,
const int stream_id = 1) = 0;
inline string name() const { return name_; }
......@@ -58,7 +59,8 @@ class Graph final : public GraphBase {
bool Run(
const string& include,
const string& exclude) override;
const string& exclude,
const int stream_id = 1) override;
GraphDef Prune(const GraphDef& meta_graph);
GraphDef MakeUpdate(const GraphDef& meta_graph);
......
......@@ -44,7 +44,7 @@ class OperatorBase {
const string& anchor);
inline void SwitchToPhase(const string& phase) { phase_ = phase; }
virtual void Run() { NOT_IMPLEMENTED; }
virtual void Run(int stream_id = 1) { NOT_IMPLEMENTED; }
inline const string& name() const { return def_.name(); }
inline const string& type() const { return def_.type(); }
......@@ -100,13 +100,13 @@ class Operator : public OperatorBase {
Output(0)->name() == "ignore"));
}
virtual void Run() final {
void Run(int stream_id = 1) final {
if (!allow_run_) return;
if (allow_recompute_) MakeResource();
ctx().SwitchToDevice();
ctx()->SwitchToDevice(stream_id);
MemorySwitch();
RunOnDevice();
if (do_sync_) ctx().FinishDeviceCompution();
if (do_sync_) ctx()->FinishDeviceCompution();
if (allow_recompute_) CleanResource();
}
......@@ -123,7 +123,7 @@ class Operator : public OperatorBase {
virtual void RunOnDevice() = 0;
inline Context& ctx() { return ctx_; }
inline Context* ctx() { return &ctx_; }
inline bool AllowRun() { return allow_run_; }
protected:
......@@ -192,6 +192,27 @@ DECLARE_REGISTRY(
const OperatorDef&,
Workspace*);
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
<< "\nTensor(" << tensor.name() << ") is empty. \n" \
<< "may be specify a filler for it ?"; \
tensor.Reshape(shape); \
unique_ptr< Filler<type, Context> > filler( \
CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, ctx()); \
ctx()->FinishDeviceCompution(); \
} else { \
TIndex count = 1; \
for(int i = 0; i < shape.size(); i++) count *= shape[i]; \
CHECK_EQ(count, tensor.count()) \
<< "\nModel request " << "Tensor(" << tensor.name() << ")'s " \
<< "size is " << count << ", \n" \
<< "but now is " << tensor.count() << ", " \
<< "did you feed the incorrect Tensor before ?"; \
tensor.Reshape(shape); \
}
#define TENSOR_FILL(tensor, shape) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
......@@ -200,7 +221,8 @@ DECLARE_REGISTRY(
tensor.Reshape(shape); \
unique_ptr< Filler<T, Context> > filler( \
CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, &ctx()); \
filler->Fill(&tensor, ctx()); \
ctx()->FinishDeviceCompution(); \
} else { \
TIndex count = 1; \
for(int i = 0; i < shape.size(); i++) count *= shape[i]; \
......@@ -217,7 +239,7 @@ DECLARE_REGISTRY(
if (size > ptr_tensor->count()) { \
ptr_tensor->Reshape({ size }); \
math::Set<T, Context>(size, dragon_cast<T, float>(1.f), \
ptr_tensor->template mutable_data<T, Context>()); \
ptr_tensor->template mutable_data<T, Context>(), ctx()); \
} \
}
......
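Operator::Run now accepts a stream id (default 1), binds the context to that stream before RunOnDevice, and synchronizes only when do_sync_ is set. A stripped-down sketch of that control flow with a toy context, just to show where the stream id and the optional sync sit (class and member names here are illustrative):

```cpp
#include <iostream>

struct ToyContext {
    int stream_id = 1;
    void SwitchToDevice(int sid) { stream_id = sid; }
    // On CUDA this would be cudaStreamSynchronize(cuda_stream()).
    void FinishDeviceCompution() { std::cout << "sync stream " << stream_id << "\n"; }
};

class ToyOperator {
 public:
    explicit ToyOperator(bool do_sync) : do_sync_(do_sync) {}
    virtual ~ToyOperator() = default;

    // Mirrors the structure of Operator<Context>::Run(int stream_id):
    // bind the stream -> enqueue the kernels -> optionally wait for them.
    void Run(int stream_id = 1) {
        ctx_.SwitchToDevice(stream_id);
        RunOnDevice();
        if (do_sync_) ctx_.FinishDeviceCompution();
    }

 protected:
    virtual void RunOnDevice() {
        std::cout << "enqueue work on stream " << ctx_.stream_id << "\n";
    }
    ToyContext ctx_;
    bool do_sync_;
};

int main() {
    ToyOperator op(/*do_sync=*/false);
    op.Run();    // async on stream 1
    op.Run(0);   // explicitly use the default stream
}
```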
......@@ -74,7 +74,9 @@ class Tensor {
for (TIndex i = start; i < end; i++) ret *= dim(i);
return ret;
}
inline TIndex count() const { return size_; }
inline TIndex count(const TIndex start) const {
return count(start, ndim());
}
......@@ -115,14 +117,14 @@ class Tensor {
inline void Corrupt() { is_corrupted_ = true; }
inline bool has_memory() const {
return memory_ || ex_memory_ != nullptr;
return memory_ || ex_memory_ != nullptr;
}
MixedMemory* memory() const {
return own_mem_ ? memory_.get() : ex_memory_;
}
void set_memory(MixedMemory* mem) {
void set_memory(MixedMemory* mem) {
memory_.reset(mem); capacity_ = mem->nbytes();
}
......@@ -197,7 +199,7 @@ class Tensor {
mutable_data_ptr<Context>(&data_ptr);
// call the constructors
if (meta.ctor()) meta_.ctor()(data_ptr, size_);
capacity_ = size_ * meta.itemsize();
capacity_ = size_ * meta.itemsize(), require_init_ = true;
return data_ptr;
}
......@@ -225,6 +227,15 @@ class Tensor {
}
template <typename T, class Context>
T* mutable_data(Context* ctx) {
auto* data = mutable_data<T, Context>();
if (!require_init_) return data;
ctx->MemsetAsync(nbytes(), (void*)data);
require_init_ = false;
return data;
}
template <typename T, class Context>
const T* data() const {
CHECK(meta_ == TypeMeta::Make<T>())
<< "\nThe DType of Tensor(" << name() << ") is "
......@@ -234,27 +245,31 @@ class Tensor {
}
template <class Context>
inline void CopyFrom(const Tensor& other) {
inline void CopyFrom(const Tensor& other, Context* ctx) {
if ((void*)&other == (void*)this) return;
CHECK_EQ(size_, other.size_);
auto* src = other.template raw_data<Context>();
auto* dst = raw_mutable_data<Context>(other.meta_);
if (dst == src) return;
if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CPUContext>()) {
CPUContext::Memcpy<Context, Context>(nbytes(), dst, src);
} else if (TypeMeta::Id<Context>() ==
TypeMeta::Id<CUDAContext>()) {
CUDAContext::Memcpy<Context, Context>(nbytes(), dst, src);
}
ctx->template MemcpyAsync<Context, Context>(
nbytes(), dst, src);
require_init_ = false;
}
inline void Move(MixedMemory* mem) {
if (mem != nullptr) ex_memory_ = mem;
else ex_memory_ = new MixedMemory(TypeMeta::Make<float>(), 4);
own_mem_ = false;
if (mem != nullptr) {
ex_memory_ = mem;
require_init_ = false;
} else {
ex_memory_ = new MixedMemory(
TypeMeta::Make<float>(), 4);
require_init_ = true;
} own_mem_ = false;
}
inline void Share(MixedMemory* mem) { Move(mem); is_shared_ = true; }
inline void Share(MixedMemory* mem) {
Move(mem); is_shared_ = true;
require_init_ = false;
}
inline void Reset() {
size_ = capacity_ = 0;
......@@ -275,7 +290,7 @@ class Tensor {
shared_ptr<MixedMemory> memory_;
MixedMemory* ex_memory_ = nullptr;
bool is_corrupted_ = false, is_shared_ = false;
bool own_mem_ = true;
bool own_mem_ = true, require_init_ = true;
};
} // namespace dragon
......
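The new require_init_ flag appears to defer zero-initialization of freshly allocated storage to the first mutable_data(ctx) call, where it runs as a MemsetAsync on the caller's context; CopyFrom, Move, and Share clear or set the flag so fully overwritten or borrowed memory is never memset. A toy version of the same bookkeeping (names are illustrative):

```cpp
#include <cstdlib>
#include <cstring>
#include <iostream>

struct ToyCtx {
    // A CUDA context would forward to cudaMemsetAsync(ptr, 0, n, stream).
    void MemsetAsync(size_t nbytes, void* ptr) { std::memset(ptr, 0, nbytes); }
};

class ToyTensor {
 public:
    float* mutable_data(ToyCtx* ctx) {
        if (!data_) {                       // first allocation
            data_ = (float*)std::malloc(nbytes_);
            require_init_ = true;           // contents are garbage
        }
        if (require_init_) {                // zero once, on the caller's context
            ctx->MemsetAsync(nbytes_, data_);
            require_init_ = false;
        }
        return data_;
    }
    void CopyFrom(const ToyTensor& other, ToyCtx* /*ctx*/) {
        if (!data_) data_ = (float*)std::malloc(nbytes_);
        std::memcpy(data_, other.data_, nbytes_);  // a ctx->MemcpyAsync in Dragon
        require_init_ = false;  // fully overwritten, the lazy memset is skipped
    }
    ~ToyTensor() { std::free(data_); }

 private:
    size_t nbytes_ = 4 * sizeof(float);
    float* data_ = nullptr;
    bool require_init_ = true;
};

int main() {
    ToyCtx ctx;
    ToyTensor a, b;
    a.mutable_data(&ctx)[0] = 3.f;  // lazily zero-filled, then written
    b.CopyFrom(a, &ctx);            // the copy never pays for the zero fill
    std::cout << b.mutable_data(&ctx)[0] << std::endl;  // 3
}
```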
......@@ -179,29 +179,28 @@ class Workspace {
template <class Context>
inline vector<void*> caches(
const vector<size_t>& segments) {
TIndex total_size = 0;
for (auto& segment : segments) total_size += (TIndex)segment;
Tensor* cacheT = CreateTensor("/share/cache");
cacheT->Reshape({ total_size });
vector<void*> caches(segments.size());
caches[0] = cacheT->template mutable_data<uint8_t, Context>();
TIndex nbytes = 0;
for (auto& segment : segments) nbytes += (TIndex)segment;
Tensor* cache_t = CreateTensor("/share/cache");
cache_t->Reshape({ nbytes });
vector<void*> Bcaches(segments.size());
Bcaches[0] = cache_t->template mutable_data<uint8_t, Context>();
for (int i = 1; i < segments.size(); i++)
caches[i] = (uint8_t*)caches[i - 1] + segments[i - 1];
return caches;
Bcaches[i] = (uint8_t*)Bcaches[i - 1] + segments[i - 1];
return Bcaches;
}
template <typename T, class Context>
inline vector<T*> caches(
const vector<TIndex>& segments) {
TIndex total_count = 0;
for (auto& segment : segments) total_count += segment;
Tensor* cacheT = CreateTensor("/share/cache");
cacheT->Reshape({ total_count });
vector<T*> caches(segments.size());
caches[0] = cacheT->template mutable_data<T, Context>();
for (int i = 1; i < segments.size(); i++)
caches[i] = caches[i - 1] + segments[i - 1];
return caches;
vector<size_t> Tsegments;
for (auto& segment : segments)
Tsegments.emplace_back(segment * sizeof(T));
vector<void*> Bcaches = caches<Context>(Tsegments);
vector<T*> Tcaches(segments.size());
for (int i = 0; i < segments.size(); i++)
Tcaches[i] = (T*)Bcaches[i];
return Tcaches;
}
/******************** Operator ********************/
......@@ -259,11 +258,12 @@ class Workspace {
void RunGraph(
const string& graph_name,
const string& include,
const string& exclude) {
const string& exclude,
const int stream_id = 1) {
if (!graph_map_.count(graph_name))
LOG(FATAL) << "Graph(" << graph_name
<< ") does not exist.";
graph_map_[graph_name]->Run(include, exclude);
graph_map_[graph_name]->Run(include, exclude, stream_id);
}
vector<string> GetGraphs() {
......
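Workspace::caches reshapes a single "/share/cache" tensor to the total byte count and then carves it into segments by pointer arithmetic; the typed overload converts element counts into byte sizes and reuses the byte-level version. The carving is plain pointer math, sketched below with a raw buffer standing in for the shared cache tensor:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Split one contiguous byte buffer into consecutive segments.
// Mirrors the logic of Workspace::caches<Context>(const vector<size_t>&).
std::vector<void*> CarveSegments(uint8_t* base,
                                 const std::vector<size_t>& segments) {
    std::vector<void*> ptrs(segments.size());
    if (segments.empty()) return ptrs;
    ptrs[0] = base;
    for (size_t i = 1; i < segments.size(); ++i)
        ptrs[i] = (uint8_t*)ptrs[i - 1] + segments[i - 1];
    return ptrs;
}

int main() {
    // Typed version: element counts {2, 3} of float become byte sizes first.
    std::vector<size_t> counts = {2, 3};
    std::vector<size_t> bytes;
    size_t total = 0;
    for (size_t c : counts) { bytes.push_back(c * sizeof(float)); total += bytes.back(); }

    std::vector<uint8_t> cache(total);  // stands in for the "/share/cache" tensor
    auto ptrs = CarveSegments(cache.data(), bytes);
    printf("offset of segment 1: %ld bytes\n",
           (long)((uint8_t*)ptrs[1] - (uint8_t*)ptrs[0]));  // 8
}
```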
......@@ -36,7 +36,6 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
USE_OPERATOR_FUNCTIONS;
void SoftmaxRun();
void SoftmaxRunFP16();
void RunOnDevice() override;
template <typename Tx, typename Ty> void RunWithType();
......
......@@ -42,7 +42,7 @@ public:
// simply copy the dY to dX
Output(0)->ReshapeLike(Input(0));
if (Output(0)->name() != Input(-1).name())
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
}
};
......
......@@ -34,7 +34,6 @@ class L2NormOp final : public Operator<Context> {
TIndex axis, num_axes, end_axis;
float eps;
string mode;
bool across_inner;
Tensor* norm, buffer;
TIndex outer_dim, dim, inner_dim, spatial_dim;
};
......@@ -55,7 +54,6 @@ class L2NormGradientOp final : public Operator<Context> {
protected:
TIndex axis, num_axes, end_axis;
string mode;
bool across_inner;
Tensor* norm, buffer, buffer_inner;
TIndex outer_dim, dim, inner_dim;
};
......
......@@ -24,7 +24,7 @@ class AdamUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -43,10 +43,26 @@ class CollectiveUpdateOp final : public Operator<Context> {
void InitNCCL();
void RunOnDevice() override;
void MPIAllReduceWithFloat();
void NCCLAllReduceWithFloat();
void MPIBcastWithFloat();
void NCCLBcastWithFloat();
template <typename T> void MPIAllReduce(
Tensor* tensor,
MPI_Datatype dtype);
template <typename T> void MPIBcast(
Tensor* tensor,
MPI_Datatype dtype);
#ifdef WITH_MPI_NCCL
template <typename T> void NCCLAllReduce(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream);
template <typename T> void NCCLBcast(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream);
#endif
protected:
int comm_size, comm_rank, comm_root;
......
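CollectiveUpdateOp replaces its float-only helpers with templates that take the tensor plus an explicit MPI_Datatype (and, for NCCL, a ncclDataType_t and the CUDA stream). A hedged sketch of what such a templated in-place all-reduce can look like on the MPI side; the buffer, count, and communicator here are placeholders, not Dragon's members:

```cpp
#include <mpi.h>

// In-place sum all-reduce over a typed buffer; T and dtype must agree
// (e.g. float with MPI_FLOAT, double with MPI_DOUBLE).
template <typename T>
void AllReduceSum(T* data, int count, MPI_Datatype dtype, MPI_Comm comm) {
    MPI_Allreduce(MPI_IN_PLACE, data, count, dtype, MPI_SUM, comm);
}

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    float grad[4] = {1.f, 2.f, 3.f, 4.f};
    AllReduceSum(grad, 4, MPI_FLOAT, MPI_COMM_WORLD);  // sums across ranks
    MPI_Finalize();
    return 0;
}
```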
......@@ -24,7 +24,7 @@ class NesterovUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -24,7 +24,7 @@ class RMSPropUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -25,7 +25,7 @@ class SGDUpdateOp final : public UpdateOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_UPDATER_FUNCTIONS(Context);
void ComputeRunWithFloat() override;
void ComputeRunWithFloat32() override;
void ComputeRunWithFloat16() override;
protected:
......
......@@ -35,13 +35,11 @@ class UpdateOpBase : public Operator<Context> {
void RunOnDevice() override;
template <typename T> void PreprocessRunWithType();
virtual void ComputeRunWithFloat() = 0;
virtual void ComputeRunWithFloat32() = 0;
virtual void ComputeRunWithFloat16() = 0;
virtual void ComputeRunWithFloat16() {
LOG(FATAL) << "This Updater does not support FP16.";
}
template <typename T> void UpdateRunWithType();
void UpdateRunWithFloat32();
void UpdateRunWithFloat16();
protected:
float lr_mult, decay_mult;
......
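UpdateOpBase renames ComputeRunWithFloat to ComputeRunWithFloat32 and turns ComputeRunWithFloat16 into a virtual with a fatal default, so updaters without an FP16 kernel no longer need their own stub. The pattern is ordinary virtual dispatch with a failing default, sketched here with toy classes:

```cpp
#include <cstdlib>
#include <iostream>

class ToyUpdateOpBase {
 public:
    virtual ~ToyUpdateOpBase() = default;
    virtual void ComputeRunWithFloat32() = 0;
    // Updaters that lack an FP16 kernel simply inherit this failure.
    virtual void ComputeRunWithFloat16() {
        std::cerr << "This Updater does not support FP16." << std::endl;
        std::abort();
    }
};

class ToySGDUpdateOp final : public ToyUpdateOpBase {
 public:
    void ComputeRunWithFloat32() override { std::cout << "sgd fp32 step\n"; }
    void ComputeRunWithFloat16() override { std::cout << "sgd fp16 step\n"; }
};

int main() {
    ToySGDUpdateOp op;
    ToyUpdateOpBase* base = &op;
    base->ComputeRunWithFloat32();
    base->ComputeRunWithFloat16();
}
```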
......@@ -80,7 +80,8 @@ class ConvOpBase : public Operator<Context> {
dilation[0], dilation[1],
data_format,
im,
col);
col,
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet";
}
template <typename T> void Col2Im(const T* col, T* im) {
......@@ -94,7 +95,8 @@ class ConvOpBase : public Operator<Context> {
dilation[0], dilation[1],
data_format,
col,
im);
im,
ctx());
} else LOG(FATAL) << "ConvNd has not been implemented yet";
}
};
......
......@@ -19,6 +19,8 @@
namespace dragon {
#define HFLT_MIN 6.10e-5F
template <typename DestType, typename SrcType>
DestType dragon_cast(SrcType val);
......
......@@ -29,9 +29,17 @@ namespace dragon {
#ifdef WITH_CUDA
static const int CUDA_THREADS = 1024;
// We do have a server with 10 GPUs :-)
#define CUDA_MAX_DEVICES 10
// The number of cuda threads to use. We set it to
// 1024, which works for compute capability 2.x;
// set it to 512 if using compute capability 1.x.
const int CUDA_THREADS = 1024;
// The maximum number of blocks to use in the default kernel call. We set it to
// 65535, which would work for compute capability 2.x (where 65535 is the limit)
const int CUDA_MAX_BLOCKS = 65535;
// You really need an NVIDIA DGX-2 !!! :-)
#define CUDA_MAX_DEVICES 16
#define CUDA_VERSION_MIN(major, minor, patch) \
(CUDA_VERSION >= (major * 1000 + minor * 100 + patch))
......@@ -67,12 +75,16 @@ static const int CUDA_THREADS = 1024;
} while (0)
#endif // WITH_MPI_NCCL
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; \
i < n; i += blockDim.x * gridDim.x)
inline int CUDA_BLOCKS(const int N) {
return (N + CUDA_THREADS - 1) / CUDA_THREADS;
return std::max(
std::min(
(N + CUDA_THREADS - 1) / CUDA_THREADS,
CUDA_MAX_BLOCKS
), 1);
}
#if CUDA_VERSION_MAX(9, 0, 0)
......
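CUDA_BLOCKS now clamps the grid size to [1, CUDA_MAX_BLOCKS] instead of growing without bound, which is what makes the size_t-indexed, grid-stride CUDA_1D_KERNEL_LOOP safe for element counts larger than 65535 * 1024; the kernel launches elsewhere in this commit also pass an explicit shared-memory size (0) and the context's stream. A small sketch of the clamping arithmetic, with illustrative constant names:

```cpp
#include <algorithm>
#include <cstdio>

constexpr int kCudaThreads = 1024;
constexpr int kCudaMaxBlocks = 65535;

// Mirrors the updated CUDA_BLOCKS(): never more than kCudaMaxBlocks,
// never fewer than 1; a grid-stride kernel loop covers any remainder.
inline int NumBlocks(const int n) {
    return std::max(std::min((n + kCudaThreads - 1) / kCudaThreads,
                             kCudaMaxBlocks), 1);
}

int main() {
    printf("%d\n", NumBlocks(1));        // 1     (clamped up)
    printf("%d\n", NumBlocks(1 << 20));  // 1024
    printf("%d\n", NumBlocks(1 << 30));  // 65535 (clamped down; the size_t
                                         // grid-stride loop covers the rest)
    return 0;
}
```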
......@@ -44,6 +44,7 @@ template<> class CUDNNType<float> {
static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
static float oneval, zeroval;
static const void *one, *zero;
typedef float BNParamType;
};
template<> class CUDNNType<double> {
......@@ -51,6 +52,7 @@ template<> class CUDNNType<double> {
static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
static double oneval, zeroval;
static const void *one, *zero;
typedef double BNParamType;
};
#ifdef WITH_CUDA_FP16
......@@ -59,6 +61,7 @@ template<> class CUDNNType<float16> {
static const cudnnDataType_t type = CUDNN_DATA_HALF;
static float oneval, zeroval;
static const void *one, *zero;
typedef float BNParamType;
};
#endif
......
......@@ -40,7 +40,7 @@ class ConstantFiller final : public Filler<T, Context> {
void Fill(Tensor* tensor, Context* ctx) override {
math::Set<T, Context>(tensor->count(),
dragon_cast<T, float>(filler().value()),
tensor->mutable_data<T, Context>());
tensor->mutable_data<T, Context>(), ctx);
}
protected:
......@@ -71,11 +71,11 @@ class TruncatedNormalFiller final : public Filler<T, Context> {
void Fill(Tensor* tensor, Context* ctx) override {
// implementing this on the gpu is difficult
static CPUContext cpu_ctx;
static CPUContext cctx;
math::RandomTruncatedNormal<T, CPUContext>(tensor->count(),
filler().mean(), filler().std(),
filler().low(), filler().high(),
tensor->mutable_data<T, CPUContext>(), &cpu_ctx);
tensor->mutable_data<T, CPUContext>(), &cctx);
}
protected:
......
......@@ -36,7 +36,8 @@ template <typename T, class Context>
void Set(
const int n,
const T alpha,
T* x);
T* x,
Context* ctx);
template <typename T, class Context>
void RandomUniform(
......@@ -78,73 +79,84 @@ void Add(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Sub(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Mul(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Div(
const int n,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Clip(
const int n,
const float low,
const float high,
T* x);
T* x,
Context* ctx);
template <typename T, class Context>
void Exp(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Log(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Square(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Sqrt(
const int n,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Pow(
const int n,
const float alpha,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Inv(
const int n,
const float numerator,
const T* x,
T* y);
T* y,
Context* ctx);
/******************** Level-2 ********************/
......@@ -164,19 +176,21 @@ void Scale(
Context* ctx);
template <typename T, class Context>
T StridedDot(
void StridedDot(
const int n,
const T* a,
const int incx,
const T* b,
const int incy,
T* y,
Context* ctx);
template <typename T, class Context>
float Dot(
void Dot(
const int n,
const T* a,
const T* b,
T* y,
Context* ctx);
template<typename T, class Context>
......@@ -188,13 +202,15 @@ template<typename T, class Context>
void AddScalar(
const int n,
const float alpha,
T* y);
T* y,
Context* ctx);
template<typename T, class Context>
void MulScalar(
const int n,
const float alpha,
T* y);
T* y,
Context* ctx);
template<typename T, class Context>
void Axpy(
......
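Every math:: routine now threads a Context* through, and reductions such as Dot and StridedDot change from returning a host scalar to writing into an output pointer, presumably so the result can stay on the context's stream instead of forcing an implicit synchronization. A hedged CPU-side sketch of the new-style signature (types and names are illustrative, not Dragon's implementation):

```cpp
#include <cstdio>

struct ToyCPUContext {};  // stands in for the Context* threaded through math::

// Old style:  float Dot(n, a, b)          -> value must be ready on return.
// New style:  void  Dot(n, a, b, y, ctx)  -> result lands in *y, possibly async.
template <typename T>
void Dot(int n, const T* a, const T* b, T* y, ToyCPUContext* /*ctx*/) {
    T acc = T(0);
    for (int i = 0; i < n; ++i) acc += a[i] * b[i];
    *y = acc;  // a GPU version could write into a device pointer on ctx's stream
}

int main() {
    float a[3] = {1, 2, 3}, b[3] = {4, 5, 6}, y = 0;
    ToyCPUContext ctx;
    Dot(3, a, b, &y, &ctx);
    printf("%g\n", y);  // 32
}
```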
......@@ -49,7 +49,8 @@ void Elu(
const int count,
const float alpha,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void EluGrad(
......@@ -57,7 +58,8 @@ void EluGrad(
const float alpha,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** activation.prelu ********************/
......@@ -70,7 +72,8 @@ void PRelu(
const string& data_format,
const T* x,
const T* w,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void PReluGrad(
......@@ -82,7 +85,8 @@ void PReluGrad(
const T* dy,
const T* x,
const T* w,
T* dx);
T* dx,
Context* ctx);
template <typename T, class Context>
void PReluWGrad(
......@@ -106,7 +110,8 @@ void Relu(
const int count,
const float slope,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ReluGrad(
......@@ -114,7 +119,8 @@ void ReluGrad(
const float slope,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** activation.selu ********************/
......@@ -122,14 +128,16 @@ template <typename T, class Context>
void SElu(
const int count,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SEluGrad(
const int count,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** activation.sigmoid ********************/
......@@ -137,14 +145,16 @@ template <typename T, class Context>
void Sigmoid(
const int count,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SigmoidGrad(
const int count,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** activation.softmax ********************/
......@@ -179,14 +189,16 @@ template <typename T, class Context>
void Tanh(
const int count,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void TanhGrad(
const int count,
const T* dy,
const T* y,
T* dx);
T* dx,
Context* ctx);
/******************** arithmetic.affine ********************/
......@@ -223,7 +235,8 @@ void Clip(
const float high,
const T* x,
T* mask,
T* y);
T* y,
Context* ctx);
/******************** control_flow.compare ********************/
......@@ -232,7 +245,8 @@ void Equal(
const int count,
const T* a,
const T* b,
T* y);
T* y,
Context* ctx);
/******************** loss.l1_loss ********************/
......@@ -240,7 +254,8 @@ template <typename T, class Context>
void AbsGrad(
const int count,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** loss.sigmoid_cross_entropy ********************/
......@@ -301,14 +316,16 @@ void SmoothL1(
const int count,
const float beta,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SmoothL1Grad(
const int count,
const float beta,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** loss.softmax_cross_entropy ********************/
......@@ -317,7 +334,8 @@ void SoftmaxCrossEntropy(
const int count,
const T* prob,
const T* target,
T* loss);
T* loss,
Context* ctx);
/******************** loss.softmax_focal_loss ********************/
......@@ -366,8 +384,8 @@ void SparseSoftmaxCrossEntropy(
const Ty* labels,
const int* ignores,
const int num_ignores,
Tx* losses,
Tx* flags,
float* losses,
float* flags,
Context* ctx);
template <typename Tx, typename Ty, class Context>
......@@ -380,7 +398,7 @@ void SparseSoftmaxCrossEntropyGrad(
const int* ignores,
const int num_ignores,
Tx* dx,
Tx* flags,
float* flags,
Context* ctx);
/******************** misc.astype ********************/
......@@ -389,7 +407,8 @@ template <typename Ta, typename Tb, class Context>
void TypeA2B(
const int count,
const Ta* a,
Tb* b);
Tb* b,
Context* ctx);
/******************** misc.image_data ********************/
......@@ -404,7 +423,8 @@ void ImageData(
const float* std_values,
const string& data_format,
const Tx* x,
Ty* y);
Ty* y,
Context* ctx);
/******************** ndarray.arange ********************/
......@@ -413,7 +433,8 @@ void Arange(
const int count,
const int start,
const int step,
T* y);
T* y,
Context* ctx);
/******************** ndarray.argreduce ********************/
......@@ -425,7 +446,8 @@ void Argmax(
const int top_k,
const T* x,
int64_t* indices,
T* values);
T* values,
Context* ctx);
template <typename T, class Context>
void Argmin(
......@@ -435,7 +457,8 @@ void Argmin(
const int top_k,
const T* x,
int64_t* indices,
T* values);
T* values,
Context* ctx);
/******************** ndarray.gather ********************/
......@@ -443,7 +466,8 @@ template <typename T, class Context>
void CanonicalAxis(
const int count,
const int dim,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Gather(
......@@ -454,7 +478,8 @@ void Gather(
const int y_slice_dim,
const int* indices,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void GatherGrad(
......@@ -465,7 +490,8 @@ void GatherGrad(
const int y_slice_dim,
const int* indices,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.concat ********************/
......@@ -478,7 +504,8 @@ void Concat(
const int y_concat_dim,
const int concat_offset,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ConcatGrad(
......@@ -489,7 +516,8 @@ void ConcatGrad(
const int y_concat_dim,
const int concat_offset,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.crop ********************/
......@@ -501,7 +529,8 @@ void Crop1D(
const int inner_dim,
const int start,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void Crop1DGrad(
......@@ -512,7 +541,8 @@ void Crop1DGrad(
const int start,
const int end,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.pad ********************/
......@@ -525,7 +555,8 @@ void ConstPad1D(
const int pad_l,
const float value,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ReflectPad1D(
......@@ -535,7 +566,8 @@ void ReflectPad1D(
const int inner_dim,
const int pad_l,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void EdgePad1D(
......@@ -545,7 +577,8 @@ void EdgePad1D(
const int inner_dim,
const int pad_l,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ConstPad1DGrad(
......@@ -555,7 +588,8 @@ void ConstPad1DGrad(
const int inner_dim,
const int pad_l,
const T* dy,
T* dx);
T* dx,
Context* ctx);
template <typename T, class Context>
void ReflectPad1DGrad(
......@@ -565,7 +599,8 @@ void ReflectPad1DGrad(
const int inner_dim,
const int pad_l,
const T* dy,
T* dx);
T* dx,
Context* ctx);
template <typename T, class Context>
void EdgePad1DGrad(
......@@ -575,7 +610,8 @@ void EdgePad1DGrad(
const int inner_dim,
const int pad_l,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.one_hot ********************/
......@@ -585,7 +621,8 @@ void OneHot(
const int depth,
const int on_value,
const T* x,
T* y);
T* y,
Context* ctx);
/******************** ndarray.reduce ********************/
......@@ -595,7 +632,8 @@ void Sum(
const int axis_dim,
const int inner_dim,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SumGrad(
......@@ -604,7 +642,8 @@ void SumGrad(
const int inner_dim,
const T coeff,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** ndarray.repeat ********************/
......@@ -616,7 +655,8 @@ void Repeat(
const int inner_dim,
const int repeats,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void RepeatGrad(
......@@ -640,7 +680,8 @@ void Slice(
const int y_slice_dim,
const int slice_offset,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void SliceGrad(
......@@ -651,7 +692,8 @@ void SliceGrad(
const int y_slice_dim,
const int slice_offset,
const T* dy,
T* x);
T* x,
Context* ctx);
/******************** ndarray.tile ********************/
......@@ -662,7 +704,8 @@ void Tile(
const int ex_inner_dim,
const int multiple,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void TileGrad(
......@@ -684,7 +727,8 @@ void Transpose(
const int* old_steps,
const int* new_steps,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void TransposeGrad(
......@@ -694,7 +738,8 @@ void TransposeGrad(
const int* old_steps,
const int* new_steps,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** recurrent.lstm_cell ********************/
......@@ -706,7 +751,8 @@ void LSTMCell(
const T* cx,
T* xact,
T* c,
T* h);
T* h,
Context* ctx);
template <typename T, class Context>
void LSTMCellGrad(
......@@ -719,7 +765,8 @@ void LSTMCellGrad(
const T* dc,
const T* dh,
T* dcx,
T* dx);
T* dx,
Context* ctx);
/******************** update.adam_update ********************/
......@@ -732,7 +779,8 @@ void AdamUpdate(
const float eps,
T* g,
T* m,
T* v);
T* v,
Context* ctx);
/******************** update.nesterov_update ********************/
......@@ -742,7 +790,8 @@ void NesterovUpdate(
const float lr,
const float momentum,
T* g,
T* h);
T* h,
Context* ctx);
/******************** update.rmsprop_update ********************/
......@@ -753,7 +802,8 @@ void RMSPropUpdate(
const float decay,
const float eps,
T* g,
T* h);
T* h,
Context* ctx);
/******************** update.sgd_update ********************/
......@@ -763,7 +813,8 @@ void SGDUpdate(
const float lr,
const float momentum,
T* g,
T* h);
T* h,
Context* ctx);
/******************** vision.bias_add ********************/
......@@ -792,7 +843,8 @@ void BilinearResize(
const int out_w,
const string& data_format,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void BilinearResizeGrad(
......@@ -805,7 +857,8 @@ void BilinearResizeGrad(
const int out_w,
const string& data_format,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** vision.conv ********************/
......@@ -826,7 +879,8 @@ void Im2Col2d(
const int dilation_w,
const string& data_format,
const T* im,
T* col);
T* col,
Context* ctx);
template <typename T, class Context>
void Col2Im2d(
......@@ -845,7 +899,8 @@ void Col2Im2d(
const int dilation_w,
const string& data_format,
const T* col,
T* im);
T* im,
Context* ctx);
/******************** vision.nn_resize ********************/
......@@ -860,7 +915,8 @@ void NNResize(
const int out_w,
const string& data_format,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void NNResizeGrad(
......@@ -873,7 +929,8 @@ void NNResizeGrad(
const int out_w,
const string& data_format,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** vision.pooling ********************/
......@@ -895,7 +952,8 @@ void MAXPooling2d(
const string& data_format,
const T* x,
int* mask,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void AVGPooling2d(
......@@ -914,7 +972,8 @@ void AVGPooling2d(
const int pad_w,
const string& data_format,
const T* x,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void MAXPooling2dGrad(
......@@ -934,7 +993,8 @@ void MAXPooling2dGrad(
const string& data_format,
const T* dy,
const int* mask,
T* dx);
T* dx,
Context* ctx);
template <typename T, class Context>
void AVGPooling2dGrad(
......@@ -953,7 +1013,8 @@ void AVGPooling2dGrad(
const int pad_w,
const string& data_format,
const T* dy,
T* dx);
T* dx,
Context* ctx);
/******************** vision.roi_pooling ********************/
......@@ -971,7 +1032,8 @@ void ROIPooling(
const T* x,
const T* rois,
int* mask,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ROIPoolingGrad(
......@@ -987,7 +1049,8 @@ void ROIPoolingGrad(
const T* dy,
const T* rois,
const int* mask,
T* dx);
T* dx,
Context* ctx);
/******************** vision.roi_align ********************/
......@@ -1005,7 +1068,8 @@ void ROIAlign(
const int sampling_ratio,
const T* x,
const T* rois,
T* y);
T* y,
Context* ctx);
template <typename T, class Context>
void ROIAlignGrad(
......@@ -1021,7 +1085,8 @@ void ROIAlignGrad(
const int sampling_ratio,
const float* dy,
const float* rois,
float* dx);
float* dx,
Context* ctx);
} // namespace kernel
......
......@@ -80,7 +80,7 @@ T Dot(
const T* b);
template<typename T>
T ASum(
T Sum(
const int n,
const T* x);
......
......@@ -15,6 +15,7 @@
#ifdef WITH_SSE
#include <immintrin.h>
#include <tmmintrin.h>
#include <cstdint>
namespace dragon {
......
......@@ -250,8 +250,9 @@ void LoadCaffemodel(
void RunGraph(
const std::string& graph_name,
Workspace* ws) {
ws->RunGraph(graph_name, "", "");
Workspace* ws,
const int stream_id) {
ws->RunGraph(graph_name, "", "", stream_id);
}
template <typename T>
......
......@@ -38,8 +38,7 @@ class Device {
EXPORT const int device_id() const { return device_id_; }
private:
int device_type_;
int device_id_;
int device_type_, device_id_;
};
EXPORT Workspace* CreateWorkspace(const std::string& name);
......@@ -61,7 +60,8 @@ EXPORT std::string CreateGraph(
EXPORT void RunGraph(
const std::string& graph_name,
Workspace* ws);
Workspace* ws,
const int stream_id = 1);
EXPORT void CreateTensor(
const std::string& name,
......
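The C++ front end gains a stream_id argument on RunGraph, defaulting to 1 so existing callers pick up the non-blocking stream automatically. A hedged usage sketch against the exported functions shown above; the header path, namespace qualification, and graph name are assumptions made for illustration:

```cpp
#include <string>
#include "dragon.h"  // assumed include path for the exported API above

int main() {
    // Assumes a graph named "GraphDef_1" was created earlier via CreateGraph(...)
    // and that the exported functions live in namespace dragon.
    dragon::Workspace* ws = dragon::CreateWorkspace("default");
    dragon::RunGraph("GraphDef_1", ws);     // stream_id defaults to 1 (non-blocking)
    dragon::RunGraph("GraphDef_1", ws, 0);  // explicitly run on the default stream
    return 0;
}
```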
......@@ -116,7 +116,7 @@ class NumpyFeeder : public TensorFeederBase {
#else
LOG(FATAL) << "CUDA was not compiled.";
#endif
} else{
} else {
CPUContext::Memcpy<CPUContext, CPUContext>(tensor->nbytes(),
tensor->raw_mutable_data<CPUContext>(),
static_cast<void*>(PyArray_DATA(array)));
......
......@@ -18,18 +18,22 @@
PyObject* CreateGradientDefsCC(PyObject* self, PyObject* args) {
PyObject* def_string = nullptr;
PyObject* py_g_outputs = nullptr;
if (!PyArg_ParseTuple(args, "SO!", &def_string, &PyList_Type, &py_g_outputs)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef "
"and a list containing outputs of this GradientOp.");
if (!PyArg_ParseTuple(args, "SO!",
&def_string, &PyList_Type, &py_g_outputs)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef "
"and a list containing outputs of this GradientOp.");
return nullptr;
}
OperatorDef def;
if (!def.ParseFromString(PyBytes_AsStringEx(def_string))) {
PyErr_SetString(PyExc_ValueError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Failed to parse the OperatorDef.");
return nullptr;
}
if (!GradientRegistry()->Has(def.type())) {
PyErr_SetString(PyExc_KeyError, "This Operator does not register GradientOp.");
PyErr_SetString(PyExc_KeyError,
"This Operator does not register GradientOp.");
return nullptr;
}
vector<string> g_outputs;
......@@ -61,9 +65,10 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
PyObject* py_fp_ops, *py_targets;
PyObject* py_input_grads, *py_ignore_grads;
PyObject* py_share_grads, *py_export_graph;
if (!PyArg_ParseTuple(args, "OOOOOO", &py_fp_ops, &py_targets,
&py_input_grads, &py_ignore_grads,
&py_share_grads, &py_export_graph)) {
if (!PyArg_ParseTuple(args, "OOOOOO",
&py_fp_ops, &py_targets,
&py_input_grads, &py_ignore_grads,
&py_share_grads, &py_export_graph)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a list of serialized input ops, targets, "
"input grads, ignore grads and whehter to share grads or log graph.");
......@@ -84,8 +89,8 @@ PyObject* RunGradientFlowCC(PyObject* self, PyObject* args) {
for (auto& grad : input_grads) maker.AddExternalGrad(grad);
for (auto& grad : ignore_grads) maker.AddIgnoreGrad(grad);
maker.Make(fp_ops, targets, bp_ops);
bool share_grads = (bool)PyObject_IsTrue(py_share_grads);
bool export_graph = (bool)PyObject_IsTrue(py_export_graph);
bool share_grads = PyObject_IsTrue(py_share_grads) ? true : false;
bool export_graph = PyObject_IsTrue(py_export_graph) ? true : false;
if (share_grads) maker.Share("/share/buffer/grads", bp_ops);
if (export_graph) {
Tensor* t = ws()->CreateTensor("/export/dynamic_graph/gradient_flow");
......
......@@ -17,7 +17,8 @@
inline PyObject* SetLogLevelCC(PyObject* self, PyObject* args) {
char* cname;
if (!PyArg_ParseTuple(args, "s", &cname)) {
PyErr_SetString(PyExc_ValueError, "Excepted the logging level.");
PyErr_SetString(PyExc_ValueError,
"Excepted the logging level.");
return nullptr;
}
SetLogDestination(StrToLogSeverity(string(cname)));
......
......@@ -17,16 +17,19 @@
inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
PyObject* graph_str;
if (!PyArg_ParseTuple(args, "S", &graph_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of GraphDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of GraphDef.");
return nullptr;
}
GraphDef graph_def;
if (!graph_def.ParseFromString(PyBytes_AsStringEx(graph_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the GraphDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the GraphDef.");
return nullptr;
}
if (!ws()->CreateGraph(graph_def)) {
PyErr_SetString(PyExc_RuntimeError, "Failed to create the Graph.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to create the Graph.");
return nullptr;
}
Py_RETURN_TRUE;
......@@ -34,11 +37,17 @@ inline PyObject* CreateGraphCC(PyObject* self, PyObject* args) {
inline PyObject* RunGraphCC(PyObject* self, PyObject* args) {
char* cname, *include, *exclude;
if (!PyArg_ParseTuple(args, "sss", &cname, &include, &exclude)) {
PyErr_SetString(PyExc_ValueError, "Excepted the graph name, include and exclude rules.");
if (!PyArg_ParseTuple(args, "sss",
&cname, &include, &exclude)) {
PyErr_SetString(PyExc_ValueError,
"Excepted the graph name, include and exclude rules.");
return nullptr;
}
ws()->RunGraph(string(cname), string(include), string(exclude));
ws()->RunGraph(
string(cname),
string(include),
string(exclude)
);
Py_RETURN_TRUE;
}
......
......@@ -19,13 +19,13 @@ inline PyObject* SnapshotCC(PyObject* self, PyObject* args) {
char* path; int format;
PyObject* names; vector<Tensor*> tensors;
if (!PyArg_ParseTuple(args, "sOi", &path, &names, &format)) {
PyErr_SetString(PyExc_ValueError,
PyErr_SetString(PyExc_ValueError,
"Excepted the model path, tensors, and data format.");
return nullptr;
}
switch (format) {
case 0: // Pickle
PyErr_SetString(PyExc_NotImplementedError,
PyErr_SetString(PyExc_NotImplementedError,
"Format depends on Pickle. Can't be used in C++.");
break;
case 1: // CaffeModel
......@@ -42,13 +42,13 @@ inline PyObject* SnapshotCC(PyObject* self, PyObject* args) {
inline PyObject* RestoreCC(PyObject* self, PyObject* args) {
char* path; int format;
if (!PyArg_ParseTuple(args, "si", &path, &format)) {
PyErr_SetString(PyExc_ValueError,
PyErr_SetString(PyExc_ValueError,
"Excepted the model path and data format.");
return nullptr;
}
switch (format) {
case 0: // Pickle
PyErr_SetString(PyExc_NotImplementedError,
PyErr_SetString(PyExc_NotImplementedError,
"Format depends on Pickle. Can't be used in C++.");
break;
case 1: // CaffeModel
......
......@@ -46,7 +46,8 @@ inline PyObject* MPICreateGroupCC(PyObject* self, PyObject* args) {
PyObject *incl, *excl, *ret;
int local_root, world_size;
if (!PyArg_ParseTuple(args, "iOO", &local_root, &incl, &excl)) {
PyErr_SetString(PyExc_ValueError, "Excepted the local root, include and exclued list.");
PyErr_SetString(PyExc_ValueError,
"Excepted the local root, include and exclued list.");
return nullptr;
}
MPI_Group world_group, local_group;
......
......@@ -37,12 +37,14 @@ inline PyObject* NoGradientOperatorsCC(PyObject* self, PyObject* args) {
inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
PyObject* op_str;
if (!PyArg_ParseTuple(args, "S", &op_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the OperatorDef.");
return nullptr;
}
ws()->RunOperator(op_def);
......@@ -52,7 +54,8 @@ inline PyObject* RunOperatorCC(PyObject* self, PyObject* args) {
inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
PyObject* py_ops;
if (!PyArg_ParseTuple(args, "O", &py_ops)) {
PyErr_SetString(PyExc_ValueError, "Excepted a list of serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a list of serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
......@@ -67,12 +70,14 @@ inline PyObject* RunOperatorsCC(PyObject* self, PyObject* args) {
inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
PyObject* op_str;
if (!PyArg_ParseTuple(args, "S", &op_str)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of OperatorDef.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of OperatorDef.");
return nullptr;
}
OperatorDef op_def;
if (!op_def.ParseFromString(PyBytes_AsStringEx(op_str))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the OperatorDef.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the OperatorDef.");
return nullptr;
}
ws()->CreatePersistentOp(op_def);
......@@ -82,9 +87,11 @@ inline PyObject* CreatePersistentOpCC(PyObject* self, PyObject* args) {
inline PyObject* RunPersistentOpCC(PyObject* self, PyObject* args) {
char* key, *anchor;
PyObject* py_inputs, *py_outputs;
if (!PyArg_ParseTuple(args, "ssOO", &key, &anchor, &py_inputs, &py_outputs)) {
PyErr_SetString(PyExc_ValueError, "Excepted a persistent key, anchor, "
"list of inputs and outputs.");
if (!PyArg_ParseTuple(args, "ssOO",
&key, &anchor, &py_inputs, &py_outputs)) {
PyErr_SetString(PyExc_ValueError,
"Excepted a persistent key, anchor, "
"list of inputs and outputs.");
return nullptr;
}
vector<string> inputs, outputs;
......
......@@ -39,12 +39,14 @@ inline PyObject* CreateTensorCC(PyObject* self, PyObject* args) {
inline PyObject* CreateFillerCC(PyObject* self, PyObject* args) {
PyObject* filler_string;
if (!PyArg_ParseTuple(args, "S", &filler_string)) {
PyErr_SetString(PyExc_ValueError, "Excepted a serialized string of TensorFiller.");
PyErr_SetString(PyExc_ValueError,
"Excepted a serialized string of TensorFiller.");
return nullptr;
}
TensorFiller filler_def;
if (!filler_def.ParseFromString(PyBytes_AsStringEx(filler_string))) {
PyErr_SetString(PyExc_RuntimeError, "Failed to parse the TensorFiller.");
PyErr_SetString(PyExc_RuntimeError,
"Failed to parse the TensorFiller.");
return nullptr;
}
ws()->CreateFiller(filler_def);
......@@ -60,7 +62,8 @@ inline PyObject* GetFillerTypeCC(PyObject* self, PyObject* args) {
inline PyObject* RenameTensorCC(PyObject* self, PyObject* args) {
char* ori_name, *tar_name;
if (!PyArg_ParseTuple(args, "ss", &ori_name, &tar_name)) {
PyErr_SetString(PyExc_ValueError, "Excepted the original and target name.");
PyErr_SetString(PyExc_ValueError,
"Excepted the original and target name.");
return nullptr;
}
if (!ws()->HasTensor(tar_name)) {
......@@ -77,7 +80,8 @@ PyObject* TensorFromShapeCC(PyObject* self, PyObject* args) {
char* cname, *dtype;
PyObject* shape, *device_option = nullptr;
if (!PyArg_ParseTuple(args, "sOs|O", &cname, &shape, &dtype, &device_option)) {
PyErr_SetString(PyExc_ValueError, "Excepted the name, shape, dtype and optional device option.");
PyErr_SetString(PyExc_ValueError,
"Excepted the name, shape, dtype and optional device option.");
return nullptr;
}
const TypeMeta& meta = TypeStringToMeta(dtype);
......@@ -119,7 +123,8 @@ PyObject* TensorFromPyArrayCC(PyObject* self, PyObject* args) {
char* cname;
PyArrayObject* original_array = nullptr;
if (!PyArg_ParseTuple(args, "sO", &cname, &original_array)) {
PyErr_SetString(PyExc_ValueError, "Failed to create tensor from numpy.ndarray.\n"
PyErr_SetString(PyExc_ValueError,
"Failed to create tensor from numpy.ndarray.\n"
"Excepted the name and numpy.ndarray both.");
return nullptr;
}
......@@ -214,7 +219,8 @@ inline PyObject* TensorToPyArrayCC(PyObject* self, PyObject* args) {
return nullptr;
}
auto* data = tensor->raw_mutable_data<CPUContext>();
PyObject* array = PyArray_SimpleNewFromData(tensor->ndim(), dims.data(), npy_type, data);
PyObject* array = PyArray_SimpleNewFromData(
(int)tensor->ndim(), dims.data(), npy_type, data);
Py_XINCREF(array);
return array;
}
......
......@@ -30,6 +30,8 @@ class BlobFetcher(Process):
----------
batch_size : int
The size of a training batch.
dtype : str
The data type of the batch. Default is ``float32``.
partition : boolean
Whether to partition batch. Default is ``False``.
prefetch : int
......@@ -42,6 +44,7 @@ class BlobFetcher(Process):
"""
super(BlobFetcher, self).__init__()
self._batch_size = kwargs.get('batch_size', 100)
self._dtype = kwargs.get('dtype', 'float32')
self._partition = kwargs.get('partition', False)
self._mean_values = kwargs.get('mean_values', [])
self._scale = kwargs.get('scale', 1.0)
......@@ -68,7 +71,7 @@ class BlobFetcher(Process):
if ix != self._batch_size - 1: im, labels = self.Q_in.get()
# mean subtraction & numerical scale
im_blob = im_blob.astype(np.float32)
im_blob = im_blob.astype(self._dtype)
if len(self._mean_values) > 0:
im_blob -= self._mean_values
if self._scale != 1.0:
......
......@@ -70,6 +70,8 @@ class DataBatch(object):
The phase of this operator, ``TRAIN`` or ``TEST``. Default is ``TRAIN``.
batch_size : int
The size of a training batch.
dtype : str
The data type of the batch. Default is ``float32``.
partition : boolean
Whether to partition batch. Default is ``False``.
prefetch : int
......
......@@ -49,16 +49,14 @@ class DataReader(Process):
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1
self._part_idx = 0
self._part_idx, self._num_parts = 0, 1
self._cur_idx, self._cur_chunk_idx = 0, 0
self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None
self.daemon = True
......@@ -167,12 +165,13 @@ class DataReader(Process):
self._db.open(self._source)
self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1)
self._epoch_size = int(self._num_entries/ self._num_parts + 1)
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else:
if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size by chunks [For Chunk Shuffle]
......@@ -183,6 +182,11 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
limit = (self._num_parts - 0.5) * self._num_shuffle_parts * self._chunk_size
if self._num_entries <= limit:
# roll back to fully shuffle
self._chunk_size, self._num_shuffle_parts = \
1, int(self._num_entries / self._num_parts) + 1
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
......
......@@ -14,7 +14,7 @@ from __future__ import division
from __future__ import print_function
version = '0.2.2'
full_version = '0.2.2.10'
full_version = '0.2.2.11'
release = False
if not release:
......
......@@ -364,7 +364,7 @@ class BatchNormLayer(Layer):
var = Tensor(scope + '/param:1').Constant(value=0.0)
factor = Tensor(scope + '/param:2').Constant(value=0.0)
# in dragon, setting diff to None will skip computing its grad automatically
# but in bvlc-caffe1, you must set lr_mult = 0 manually
# but in bvlc-caffe, you must set lr_mult = 0 manually
self._blobs.append({'data': mean, 'diff': None})
self._blobs.append({'data': var, 'diff': None})
self._blobs.append({'data': factor, 'diff': None})
......
......@@ -20,7 +20,7 @@ from .arithmetic import (
from .ndarray import (
squeeze, unsqueeze,
sum, mean, argmin, argmax, max, topk,
sum, mean, argmin, argmax, max, min, topk,
cat, gather,
)
......
......@@ -13,7 +13,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.tensor import Tensor
from dragon.vm.torch.ops.primitive import MakeContext, WrapScalar
from dragon.vm.torch.ops.factory import get_module
......@@ -26,7 +25,6 @@ def _fundamental(input, value, op='Add', out=None):
raise TypeError('Type of value should be numerical, got {}.'
.format(type(value)))
value = WrapScalar(value, input._dtype, input._ctx)
ctx = MakeContext(inputs=[input, value])
key = 'torch/ops/{}/{}:{}'.format(op.lower(), ctx[0].lower(), ctx[1])
module = get_module(Fundamental, key, ctx, op_type=op)
......
......@@ -13,7 +13,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.utils.data.io.data_reader import DataReader
from dragon.io.data_reader import DataReader
from dragon.vm.torch.utils.data.io.data_transformer import DataTransformer
......
......@@ -19,7 +19,7 @@ from multiprocessing import Queue
import dragon.core.mpi as mpi
from .data_reader import DataReader
from dragon.io.data_reader import DataReader
from .data_transformer import DataTransformer
from .blob_fetcher import BlobFetcher
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import numpy.random as npr
from multiprocessing import Process
import dragon.config as config
from dragon.tools.db import LMDB
class DataReader(Process):
"""DataReader is deployed to queue encoded str from `LMDB`_.
It adaptively partitions and shuffles records over all distributed nodes.
"""
def __init__(self, **kwargs):
"""Construct a ``DataReader``.
Parameters
----------
source : str
The path of database.
multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean
Whether to shuffle the data. Default is ``False``.
num_chunks : int
The number of chunks to split. Default is ``2048``.
chunk_size : int
The size(MB) of each chunk. Default is -1 (Refer ``num_chunks``).
"""
super(DataReader, self).__init__()
self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False)
self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1)
self._num_parts = 1
self._part_idx = 0
self._random_seed = config.GetRandomSeed()
self._cur_idx = 0
self._cur_chunk_idx = 0
self.Q_out = None
self.daemon = True
def element(self):
"""Get the value of current record.
Returns
-------
str
The encoded str.
"""
return self._db.value()
def redirect(self, target_idx):
"""Redirect to the target position.
Parameters
----------
target_idx : int
The key of instance in ``LMDB``.
Returns
-------
None
Notes
-----
The redirection reopens the ``LMDB``.
You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``.
This helps avoid getting stuck when ``Database Size`` >> ``RAM Size``.
"""
self._db.close()
self._db.open(self._source)
self._cur_idx = target_idx
self._db.set(str(self._cur_idx).zfill(self._zfill))
def reset(self):
"""Reset the cursor and environment.
Returns
-------
None
"""
if self._multiple_nodes or self._use_shuffle:
if self._use_shuffle: self._perm = npr.permutation(self._num_shuffle_parts)
self._cur_chunk_idx = 0
self._start_idx = int(self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx])
self._start_idx = int(self._start_idx * self._chunk_size)
if self._start_idx >= self._num_entries: self.next_chunk()
self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._num_entries, self._end_idx)
else:
self._start_idx = 0
self._end_idx = self._num_entries
self.redirect(self._start_idx)
def next_record(self):
"""Step the cursor of records.
Returns
-------
None
"""
self._cur_idx += 1
self._db.next()
def next_chunk(self):
"""Step the cursor of shuffling chunks.
Returns
-------
None
"""
self._cur_chunk_idx += 1
if self._cur_chunk_idx >= self._num_shuffle_parts: self.reset()
else:
self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[self._cur_chunk_idx]
self._start_idx = self._start_idx * self._chunk_size
if self._start_idx >= self._num_entries: self.next_chunk()
else:
self._end_idx = self._start_idx + self._chunk_size
self._end_idx = min(self._num_entries, self._end_idx)
self.redirect(self._start_idx)
def run(self):
"""Start the process.
Returns
-------
None
"""
# fix seed
npr.seed(self._random_seed)
# init db
self._db = LMDB()
self._db.open(self._source)
self._zfill = self._db.zfill()
self._num_entries = self._db.num_entries()
self._epoch_size = int(self._num_entries / self._num_parts + 1)
if self._use_shuffle:
if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._num_entries / self._chunk_size / self._num_parts) + 1
else:
if self._use_shuffle and self._chunk_size == -1:
# search an optimal chunk size by chunks [For Chunk Shuffle]
max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
self._chunk_size = min_chunk_size
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._num_entries / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts)
# init env
self.reset()
# run
while True:
self.Q_out.put(self.element())
self.next_record()
if self._cur_idx >= self._end_idx:
if self._multiple_nodes or \
self._use_shuffle: self.next_chunk()
else: self.reset()
\ No newline at end of file
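For context, a minimal sketch of how this reader is typically driven, assuming ``DataReader`` subclasses ``multiprocessing.Process`` (as the ``daemon`` flag and ``run()`` override suggest); the LMDB path, queue size, and record count below are illustrative, not part of this change:

import multiprocessing as mp

# hypothetical wiring; only DataReader itself comes from the module patched above
reader = DataReader(source='/data/train_lmdb', shuffle=True, num_chunks=2048)
reader.Q_out = mp.Queue(maxsize=1024)  # run() put()s one encoded record per step
reader.start()                         # child process: seeds numpy, opens the LMDB, then loops

records = [reader.Q_out.get() for _ in range(64)]  # consume serialized records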
......@@ -42,7 +42,7 @@ find_modules()
setup(name = 'dragon',
version='0.2.2.10',
version='0.2.2.11',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon',
author='Ting Pan',
......
......@@ -19,7 +19,8 @@ template <> void GenerateProposals<float, CPUContext>(
const float* scores,
const float* bbox_deltas,
const float* anchors,
float* proposals) {
float* proposals,
CPUContext* ctx) {
float* proposal = proposals;
const int K = feat_h * feat_w;
for (int h = 0; h < feat_h; ++h) {
......@@ -57,7 +58,8 @@ template <> void GenerateProposals_v2<float, CPUContext>(
const float min_box_w,
const float* scores,
const float* bbox_deltas,
float* proposals) {
float* proposals,
CPUContext* ctx) {
float* proposal = proposals;
for (int i = 0; i < total_anchors; ++i) {
// bbox_deltas: [1, 4, total_anchors]
......@@ -98,7 +100,8 @@ template <> void ApplyNMS<float, CPUContext>(
const float thresh,
const float* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CPUContext* ctx) {
int count = 0;
std::vector<char> is_dead(num_boxes);
for (int i = 0; i < num_boxes; ++i) is_dead[i] = 0;
......
......@@ -62,7 +62,7 @@ __global__ void _GenerateProposals(
const T* bbox_deltas,
const T* anchors,
T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) {
CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const int h = idx / A / feat_w;
const int w = (idx / A) % feat_w;
const int a = idx % A;
......@@ -99,13 +99,15 @@ template <> void GenerateProposals<float, CUDAContext>(
const float* scores,
const float* bbox_deltas,
const float* anchors,
float* proposals) {
float* proposals,
CUDAContext* ctx) {
const int num_proposals = A * feat_h * feat_w;
_GenerateProposals<float>
<< <CUDA_BLOCKS(num_proposals), CUDA_THREADS >> >(
num_proposals, A, feat_h, feat_w, stride,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, anchors, proposals);
<< < CUDA_BLOCKS(num_proposals), CUDA_THREADS,
0, ctx->cuda_stream() >> >(num_proposals,
A, feat_h, feat_w, stride,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, anchors, proposals);
}
template <typename T>
......@@ -118,7 +120,7 @@ __global__ void _GenerateProposals_v2(
const T* scores,
const T* bbox_deltas,
T* proposals) {
CUDA_KERNEL_LOOP(idx, nthreads) {
CUDA_1D_KERNEL_LOOP(idx, nthreads) {
const float dx = bbox_deltas[idx];
const float dy = bbox_deltas[nthreads + idx];
const float d_log_w = bbox_deltas[2 * nthreads + idx];
......@@ -139,11 +141,13 @@ template <> void GenerateProposals_v2<float, CUDAContext>(
const float min_box_w,
const float* scores,
const float* bbox_deltas,
float* proposals) {
float* proposals,
CUDAContext* ctx) {
_GenerateProposals_v2<float>
<< <CUDA_BLOCKS(total_anchors), CUDA_THREADS >> >(
total_anchors, im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, proposals);
<< < CUDA_BLOCKS(total_anchors), CUDA_THREADS,
0, ctx->cuda_stream() >> >(total_anchors,
im_h, im_w, min_box_h, min_box_w,
scores, bbox_deltas, proposals);
}
/******************** NMS ********************/
......@@ -170,7 +174,7 @@ __global__ void nms_mask(
const int num_boxes,
const T nms_thresh,
const T* boxes,
unsigned long long* mask) {
uint64_t* mask) {
const int i_start = blockIdx.x * NMS_BLOCK_SIZE;
const int di_end = min(num_boxes - i_start, NMS_BLOCK_SIZE);
const int j_start = blockIdx.y * NMS_BLOCK_SIZE;
......@@ -209,25 +213,30 @@ void _ApplyNMS(
const float thresh,
const T* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CUDAContext* ctx) {
const int num_blocks = DIV_UP(num_boxes, NMS_BLOCK_SIZE);
const dim3 blocks(num_blocks, num_blocks);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(unsigned long long);
size_t mask_nbytes = num_boxes * num_blocks * sizeof(uint64_t);
size_t boxes_nbytes = num_boxes * 5 * sizeof(T);
void* boxes_dev, *mask_dev;
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_nbytes));
CUDA_CHECK(cudaMalloc(&mask_dev, mask_nbytes));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes, boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T> << <blocks, NMS_BLOCK_SIZE >> > (
num_boxes, thresh, (T*)boxes_dev, (unsigned long long*)mask_dev);
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T>
<< < blocks, NMS_BLOCK_SIZE,
0, ctx->cuda_stream() >> > (num_boxes,
thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
CUDA_CHECK(cudaPeekAtLastError());
std::vector<unsigned long long> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev, mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<uint64_t> mask_host(num_boxes * num_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
mask_nbytes, cudaMemcpyDeviceToHost));
std::vector<unsigned long long> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(unsigned long long) * num_blocks);
std::vector<uint64_t> dead_bit(num_blocks);
memset(&dead_bit[0], 0, sizeof(uint64_t) * num_blocks);
int num_selected = 0;
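// greedy host-side scan: keep box i unless a previously kept box suppressed it,
// then fold its 64-bit mask row into dead_bit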
for (int i = 0; i < num_boxes; ++i) {
......@@ -235,7 +244,7 @@ void _ApplyNMS(
const int inblock = i % NMS_BLOCK_SIZE;
if (!(dead_bit[nblock] & (1ULL << inblock))) {
keep_indices[num_selected++] = i;
unsigned long long* mask_i = &mask_host[0] + i * num_blocks;
uint64_t* mask_i = &mask_host[0] + i * num_blocks;
for (int j = nblock; j < num_blocks; ++j) dead_bit[j] |= mask_i[j];
if (num_selected == max_keeps) break;
}
......@@ -251,9 +260,10 @@ template <> void ApplyNMS<float, CUDAContext>(
const float thresh,
const float* boxes,
int* keep_indices,
int& num_keep) {
int& num_keep,
CUDAContext* ctx) {
_ApplyNMS<float>(num_boxes, max_keeps, thresh,
boxes, keep_indices, num_keep);
boxes, keep_indices, num_keep, ctx);
}
} // namespace rcnn
......
......@@ -126,7 +126,8 @@ void GenerateProposals(
const T* scores,
const T* bbox_deltas,
const T* anchors,
T* proposals);
T* proposals,
Context* ctx);
template <typename T, class Context>
void GenerateProposals_v2(
......@@ -137,7 +138,8 @@ void GenerateProposals_v2(
const float min_box_w,
const T* scores,
const T* bbox_deltas,
T* proposals);
T* proposals,
Context* ctx);
template <typename T>
inline void SortProposals(
......@@ -246,7 +248,8 @@ void ApplyNMS(
const T thresh,
const T* boxes,
int* keep_indices,
int& num_keep);
int& num_keep,
Context* ctx);
} // namespace rcnn
......
......@@ -37,7 +37,7 @@ void ProposalOp<Context>::RunWithType() {
Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(),
anchors_.template mutable_data<T, Context>(),
proposals_.template mutable_data<T, Context>());
proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, num_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>());
......@@ -45,7 +45,8 @@ void ProposalOp<Context>::RunWithType() {
rcnn::ApplyNMS<T, Context>(
pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois);
roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(),
......@@ -95,14 +96,15 @@ void ProposalOp<Context>::RunWithType() {
im_height, im_width, min_box_h, min_box_w,
Input(-3).template data<T, Context>(),
Input(-2).template data<T, Context>(),
proposals_.template mutable_data<T, Context>());
proposals_.template mutable_data<T, Context>(), ctx());
rcnn::SortProposals(0, total_proposals - 1, pre_nms_top_n,
proposals_.template mutable_data<T, CPUContext>());
rcnn::ApplyNMS<T, Context>(pre_nms_topn, post_nms_top_n, nms_thresh,
proposals_.template mutable_data<T, Context>(),
roi_indices_.template mutable_data<int, CPUContext>(), num_rois);
roi_indices_.template mutable_data<int, CPUContext>(),
num_rois, ctx());
rcnn::RetrieveRoIs<T>(num_rois, n,
proposals_.template mutable_data<T, CPUContext>(),
......@@ -128,7 +130,7 @@ void ProposalOp<Context>::RunWithType() {
collective_rois.ReshapeLike(*Output(0));
auto* rois = collective_rois.template mutable_data<T, CPUContext>();
CPUContext::template Copy<T, CPUContext, CPUContext>(
ctx()->template Copy<T, CPUContext, CPUContext>(
collective_rois.count(), rois,
Output(0)->template data<T, CPUContext>());
......@@ -147,6 +149,8 @@ void ProposalOp<Context>::RunWithType() {
template <class Context>
void ProposalOp<Context>::RunOnDevice() {
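// this op interleaves CPU routines (SortProposals, RetrieveRoIs) with CUDA kernels,
// so it is pinned to the synchronizing default stream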
ctx()->set_stream_id(0); // enforce default stream
num_images = Input(0).dim(0);
CHECK_EQ(Input(-1).dim(0), num_images)
<< "\nExcepted " << num_images << " groups image info, "
......
......@@ -455,7 +455,10 @@ Graph::Graph(const GraphDef& meta_graph, Workspace* ws)
RecomputingAware(optimized_graph, ws);
}
bool Graph::Run(const string& include, const string& exclude) {
bool Graph::Run(
const string& include,
const string& exclude,
const int stream_id) {
LOG(DEBUG) << "Run Graph: " << name();
for (auto op : ops_) {
if (!include.empty())
......@@ -464,7 +467,7 @@ bool Graph::Run(const string& include, const string& exclude) {
if (op->type().find(exclude) != string::npos) continue;
op->SwitchToPhase(this->args_["phase"].s());
LOG(DEBUG) << "$ Before Operator: " << op->name();
op->Run();
op->Run(stream_id);
LOG(DEBUG) << "$ After Operator: " << op->name();
}
return true;
......
......@@ -8,7 +8,6 @@ void MixedMemory::ToCPU() {
switch (state_) {
case UNINITIALIZED:
cpu_ptr_ = CPUContext::New(nbytes_);
CPUContext::Memset(nbytes_, cpu_ptr_);
state_ = STATE_AT_CPU;
break;
case STATE_AT_CUDA:
......@@ -32,7 +31,6 @@ void MixedMemory::ToCUDA() {
switch (state_) {
case UNINITIALIZED:
cuda_ptr_ = CUDAContext::New(nbytes_);
CUDAContext::Memset(nbytes_, cuda_ptr_);
state_ = STATE_AT_CUDA;
break;
case STATE_AT_CPU:
......
......@@ -15,33 +15,35 @@ void CuDNNDropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") {
if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
if (scale == 1.0)
math::Scal<T, Context>(Output(0)->count(),
1.0 - prob(), Ydata, &ctx());
1.0 - prob(), Ydata, ctx());
}
} else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->CreateTensor("/mnt/" + anchor() + "/dropout/mask");
Tensor* mask = ws()->CreateTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states
if (!states_initialized) {
states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size));
ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
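// the dropout RNG states are cached workspace-wide, keyed by the random seed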
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" +
dragon_cast<string, unsigned long long>(random_seed) + "/states");
Tensor* states = ws()->CreateTensor(
"/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
} else {
states->Reshape({ (TIndex)states_size });
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnSetDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
}
}
......@@ -53,7 +55,7 @@ void CuDNNDropoutOp<Context>::RunWithType() {
mask->Reshape({ (TIndex)reserve_space_size });
auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutForward(
ctx().cudnn_handle(), dropout_desc,
ctx()->cudnn_handle(), dropout_desc,
input_desc, Xdata,
input_desc, Ydata,
Rdata, reserve_space_size));
......@@ -65,7 +67,9 @@ void CuDNNDropoutOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......@@ -76,19 +80,21 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
if (phase() == "TEST") { NOT_IMPLEMENTED; }
else if (phase() == "TRAIN") {
CHECK(use_scale) << "\nCuDNN only supports scale-dropout";
Tensor* mask = ws()->GetTensor("/mnt/" + anchor() + "/dropout/mask");
Tensor* mask = ws()->GetTensor(
"/mnt/" + anchor() + "/dropout/mask");
// determine the dropout states
if (!states_initialized) {
states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size));
ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" +
dragon_cast<string, unsigned long long>(random_seed) + "/states");
Tensor* states = ws()->CreateTensor(
"/share/cudnn/dropout:" + dragon_cast<string,
unsigned long long>(random_seed) + "/states");
if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), prob(),
dropout_desc, ctx()->cudnn_handle(), prob(),
Sdata, states_size, random_seed));
} else { LOG(FATAL) << "Missing states with seed: " << random_seed; }
}
......@@ -101,7 +107,7 @@ void CuDNNDropoutGradientOp<Context>::RunWithType() {
input_desc, &reserve_space_size));
auto* Rdata = mask->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnDropoutBackward(
ctx().cudnn_handle(), dropout_desc,
ctx()->cudnn_handle(), dropout_desc,
input_desc, dYdata,
input_desc, dXdata,
Rdata, reserve_space_size));
......@@ -113,7 +119,9 @@ void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -14,7 +14,7 @@ void CuDNNEluOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
}
......@@ -41,7 +41,7 @@ void CuDNNEluGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,7 +13,7 @@ void CuDNNReluOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
......@@ -49,7 +49,7 @@ void CuDNNReluGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,12 +13,12 @@ void CuDNNSigmoidOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif
......@@ -47,13 +47,13 @@ void CuDNNSigmoidGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -7,8 +7,7 @@ namespace dragon {
template <class Context> template <typename T>
void CuDNNSoftmaxOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim })
);
{ outer_dim, Input(0).dim(axis), inner_dim }));
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
......@@ -16,7 +15,7 @@ void CuDNNSoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxForward(
ctx().cudnn_handle(),
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
......@@ -41,8 +40,7 @@ DEPLOY_CUDNN(Softmax);
template <class Context> template <typename T>
void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
Tensor fake_tensor(vector<TIndex>(
{ outer_dim, Input(0).dim(axis), inner_dim })
);
{ outer_dim, Input(0).dim(axis), inner_dim }));
cudnnSetTensorDesc<T>(&input_desc, &fake_tensor);
cudnnSetTensorDesc<T>(&output_desc, &fake_tensor);
......@@ -50,7 +48,7 @@ void CuDNNSoftmaxGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnSoftmaxBackward(
ctx().cudnn_handle(),
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
CUDNNType<T>::one, input_desc, Ydata, input_desc, dYdata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -13,12 +13,12 @@ void CuDNNTanhOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationForward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
#else
CUDNN_CHECK(cudnnActivationForward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<Dtype>::one, input_desc, Xdata,
CUDNNType<Dtype>::zero, output_desc, Ydata));
#endif
......@@ -47,13 +47,13 @@ void CuDNNTanhGradientOp<Context>::RunWithType() {
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnActivationBackward(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
#else
CUDNN_CHECK(cudnnActivationBackward_v4(
ctx().cudnn_handle(), act_desc,
ctx()->cudnn_handle(), act_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Ydata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -11,10 +11,10 @@ void DropoutOp<Context>::RunWithType() {
float scale = use_scale ? 1.0 / (1.0 - prob()) : 1.0;
if (phase() == "TEST") {
if (Output(0) != &Input(0)) {
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
if (scale == 1.0) math::Scal<T, Context>(
Output(0)->count(), 1.0 - prob(), Ydata, &ctx());
Output(0)->count(), 1.0 - prob(), Ydata, ctx());
}
} else if (phase() == "TRAIN") {
Tensor* mask = ws()->CreateTensor(
......@@ -23,7 +23,7 @@ void DropoutOp<Context>::RunWithType() {
uint32_t* Mdata = mask->template mutable_data<uint32_t, Context>();
kernel::Dropout<T, Context>(
Output(0)->count(), prob(), scale,
Xdata, Mdata, Ydata, &ctx());
Xdata, Mdata, Ydata, ctx());
} else LOG(FATAL) << "Incorrect Op phase: " << phase();
}
......@@ -52,7 +52,8 @@ void DropoutGradientOp<Context>::RunWithType() {
else if (phase() == "TRAIN") {
kernel::DropoutGrad<T, Context>(
Output(0)->count(), prob(), scale,
dYdata, Mdata, dXdata, &ctx());
dYdata, Mdata, dXdata, ctx());
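// the kernel above reads the mask asynchronously; synchronize before releasing it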
ctx()->FinishDeviceCompution();
mask->Reset();
} else LOG(FATAL) << "Incorrect Op phase: " << phase();
}
......
......@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void EluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Elu<T, Context>(Output(0)->count(), alpha, Xdata, Ydata);
kernel::Elu<T, Context>(Output(0)->count(),
alpha, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -30,8 +31,8 @@ void EluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::EluGrad<T, Context>(
Output(0)->count(), alpha, dYdata, Ydata, dXdata);
kernel::EluGrad<T, Context>(Output(0)->count(),
alpha, dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -18,7 +18,7 @@ void PReluOp<Context>::RunWithType() {
kernel::PRelu<T, Context>(
Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format,
Xdata, Wdata, Ydata);
Xdata, Wdata, Ydata, ctx());
}
template <class Context>
......@@ -49,12 +49,12 @@ void PReluGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, channels * dim);
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
auto* dWBdata = ws()->template caches<T, Context>({ channels * dim })[0];
kernel::PReluWGrad<T, Context>(
Input(0).dim(0), Input(0).count(1), channels, dim,
channel_shared ? true : false, data_format,
dYdata, Xdata, multiplier, dWBdata, dWdata, &ctx());
dYdata, Xdata, multiplier, dWBdata, dWdata, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -63,7 +63,7 @@ void PReluGradientOp<Context>::RunWithType() {
kernel::PReluGrad<T, Context>(
Output(0)->count(), channels, dim,
channel_shared ? true : false, data_format,
dYdata, Xdata, Wdata, dXdata);
dYdata, Xdata, Wdata, dXdata, ctx());
}
}
......
......@@ -8,7 +8,8 @@ template <class Context> template <typename T>
void ReluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Relu<T, Context>(Output(0)->count(), slope, Xdata, Ydata);
kernel::Relu<T, Context>(Output(0)->count(),
slope, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -24,15 +25,17 @@ DEPLOY_CPU(Relu);
#ifdef WITH_CUDA
DEPLOY_CUDA(Relu);
#endif
OPERATOR_SCHEMA(Relu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Relu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void ReluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::ReluGrad<T, Context>(
Output(0)->count(), slope, dYdata, Ydata, dXdata);
kernel::ReluGrad<T, Context>(Output(0)->count(),
slope, dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......@@ -47,7 +50,9 @@ DEPLOY_CPU(ReluGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ReluGradient);
#endif
OPERATOR_SCHEMA(ReluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }});
OPERATOR_SCHEMA(ReluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 }});
class GetReluGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SEluOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::SElu<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -23,15 +23,17 @@ DEPLOY_CPU(SElu);
#ifdef WITH_CUDA
DEPLOY_CUDA(SElu);
#endif
OPERATOR_SCHEMA(SElu).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(SElu)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void SEluGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SEluGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::SEluGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......@@ -46,7 +48,9 @@ DEPLOY_CPU(SEluGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SEluGradient);
#endif
OPERATOR_SCHEMA(SEluGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 }});
OPERATOR_SCHEMA(SEluGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 }});
class GetSEluGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void SigmoidOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::Sigmoid<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -30,8 +30,8 @@ void SigmoidGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::SigmoidGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::SigmoidGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -12,13 +12,13 @@ void SoftmaxOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Input(0).count(), Ydata, Xdata);
kernel::Softmax<T, Context>(
Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier,
Xdata, WSdata, Ydata, &ctx());
Xdata, WSdata, Ydata, ctx());
}
template <class Context>
......@@ -36,7 +36,9 @@ DEPLOY_CPU(Softmax);
#ifdef WITH_CUDA
DEPLOY_CUDA(Softmax);
#endif
OPERATOR_SCHEMA(Softmax).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Softmax)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void SoftmaxGradientOp<Context>::RunWithType() {
......@@ -44,15 +46,16 @@ void SoftmaxGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* Ydata = Input(0).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
auto* WSdata = ws()->template caches<T, Context>(
{ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Input(0).count(), dXdata, dYdata);
kernel::SoftmaxGrad<T, Context>(
Output(0)->count(), Input(0).dim(axis),
outer_dim, inner_dim, multiplier,
dYdata, Ydata, WSdata, dXdata, &ctx());
dYdata, Ydata, WSdata, dXdata, ctx());
}
template <class Context>
......@@ -70,7 +73,9 @@ DEPLOY_CPU(SoftmaxGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(SoftmaxGradient);
#endif
OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
OPERATOR_SCHEMA(SoftmaxGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetSoftmaxGradient final : public GradientMakerBase {
public:
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void TanhOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata);
kernel::Tanh<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -30,8 +30,8 @@ void TanhGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context>();
auto* dYdata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::TanhGrad<T, Context>(
Output(0)->count(), dYdata, Ydata, dXdata);
kernel::TanhGrad<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -9,7 +9,7 @@ void AddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y);
math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +19,24 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
x2 = Input(1).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
1.0, y, ctx());
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
1.0, y, &ctx());
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
......@@ -44,7 +45,7 @@ void AddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -77,13 +78,13 @@ void AddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy);
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +109,7 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
outer_dim = X1->dim(0);
inner_dim = X1->count(1);
......@@ -116,13 +117,13 @@ void AddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy);
}
}
......
......@@ -34,7 +34,7 @@ void AffineOp<Context>::RunWithType() {
kernel::Affine<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim,
Xdata, Adata, Bdata, bias_multiplier, Ydata, &ctx());
Xdata, Adata, Bdata, bias_multiplier, Ydata, ctx());
}
template <class Context>
......@@ -58,13 +58,13 @@ void AffineGradientOp<Context>::BiasRunWithType() {
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* dYdata = Input(-1).template data<T, Context>();
auto* dBias = Output(2)->template mutable_data<T, Context>();
auto* dBias = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < outer_dim; n++) {
math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim,
1.0, dYdata, multiplier,
1.0, dBias, &ctx());
1.0, dBias, ctx());
dYdata += dim;
}
}
......@@ -79,45 +79,36 @@ void AffineGradientOp<Context>::ScaleRunWithType() {
bool is_eltwise = (Input(-1).count() == Input(1).count());
auto* dYdata = Input(-1).template data<T, Context>();
auto* Xdata = Input(0).template data<T, Context>();
auto* dScale = Output(1)->template mutable_data<T, Context>();
auto* dScale = Output(1)->template mutable_data<T, Context>(ctx());
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYxX = dXdata;
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX);
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dYxX, ctx());
if (!is_eltwise) {
T* SRes_data = nullptr;
// reduce inner dimensions
if (inner_dim == 1) {
SRes_data = dYxX;
} else if (sum_result.count() == 1) { // handle inner only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dScale += result;
} else {
SRes_data = (outer_dim == 1) ? // handle scale only
SRes_data = (outer_dim == 1) ?
dScale : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier,
SRes_data == dScale ? 1.0 : 0.0, SRes_data, &ctx());
SRes_data == dScale ? 1.0 : 0.0,
SRes_data, ctx());
}
// reduce outer dimensions
if (outer_dim != 1) {
if (scale_dim == 1) { // handle outer only
dScale = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
outer_dim, multiplier, SRes_data, &ctx());
*dScale += result;
} else {
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dScale, &ctx());
}
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dScale, ctx());
}
} else {
math::Axpy<T, Context>(Output(1)->count(),
1.f, dYxX, dScale, &ctx());
1.f, dYxX, dScale, ctx());
}
}
......@@ -131,7 +122,7 @@ void AffineGradientOp<Context>::RunWithType() {
kernel::AffineGrad<T, Context>(
Output(0)->count(), outer_dim, scale_dim, inner_dim,
dYdata, Adata, dXdata, &ctx());
dYdata, Adata, dXdata, ctx());
}
template <class Context>
......
......@@ -15,7 +15,7 @@ void ClipOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template mutable_data<T, Context>();
kernel::Clip<T, Context>(Output(0)->count(),
low, high, Xdata, Mdata, Ydata);
low, high, Xdata, Mdata, Ydata, ctx());
}
template <class Context>
......@@ -30,7 +30,9 @@ DEPLOY_CPU(Clip);
#ifdef WITH_CUDA
DEPLOY_CUDA(Clip);
#endif
OPERATOR_SCHEMA(Clip).NumInputs(1).NumOutputs(1).Inplace({ { 0, 0 } });
OPERATOR_SCHEMA(Clip)
.NumInputs(1).NumOutputs(1)
.Inplace({ { 0, 0 } });
template <class Context> template <typename T>
void ClipGradientOp<Context>::RunWithType() {
......@@ -39,7 +41,8 @@ void ClipGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Mdata = mask->template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dXdata, Mdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dXdata, Mdata, dXdata, ctx());
}
template <class Context>
......@@ -54,7 +57,9 @@ DEPLOY_CPU(ClipGradient);
#ifdef WITH_CUDA
DEPLOY_CUDA(ClipGradient);
#endif
OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1).Inplace({ { 1, 0 } });
OPERATOR_SCHEMA(ClipGradient)
.NumInputs(2).NumOutputs(1)
.Inplace({ { 1, 0 } });
class GetClipGradient final : public GradientMakerBase {
public:
......
......@@ -23,7 +23,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
mul_desc, CUDNN_OP_TENSOR_MUL,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, Ydata));
......@@ -36,7 +36,7 @@ void CuDNNAffineOp<Context>::RunWithType() {
add_desc, CUDNN_OP_TENSOR_ADD,
CUDNNType<T>::type, CUDNN_PROPAGATE_NAN));
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), add_desc,
ctx()->cudnn_handle(), add_desc,
CUDNNType<T>::one, input_desc, Ydata,
CUDNNType<T>::one, param_desc, Bdata,
CUDNNType<T>::zero, input_desc, Ydata));
......@@ -48,7 +48,9 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......@@ -76,17 +78,17 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1));
auto* Xdata = Input(0).template data<T, Context>();
auto* dAdata = Output(1)->template mutable_data<T, Context>();
auto* dAdata = Output(1)->template mutable_data<T, Context>(ctx());
// eltwise
if (Input(0).count() == Input(1).count()) {
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, dAdata));
} else {
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::zero, input_desc, dXdata));
......@@ -97,11 +99,11 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// db = dy
if (Output(2)->name() != "ignore") {
Output(2)->ReshapeLike(Input(1));
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
// eltwise
if (Input(-1).count() == Input(1).count()) {
math::Axpy<T, Context>(Output(2)->count(),
1.f, dYdata, dBdata, &ctx());
1.f, dYdata, dBdata, ctx());
} else {
ComputeBiasGradient_v2<T>(dYdata, dBdata);
}
......@@ -109,7 +111,7 @@ void CuDNNAffineGradientOp<Context>::RunWithType() {
// dx = alpha * dy
CUDNN_CHECK(cudnnOpTensor(
ctx().cudnn_handle(), mul_desc,
ctx()->cudnn_handle(), mul_desc,
CUDNNType<T>::one, input_desc, dYdata,
CUDNNType<T>::one, param_desc, Adata,
CUDNNType<T>::zero, input_desc, dXdata));
......@@ -126,11 +128,11 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];;
CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dYxX,
CUDNNType<T>::one, param_desc, dA));
......@@ -145,32 +147,23 @@ void CuDNNAffineGradientOp<Context>::ComputeScaleGradient_v2(
sum_result.Reshape({ outer_dim * scale_dim });
T* SRes_data = nullptr;
if (inner_dim == 1) SRes_data = dYxX;
else if (sum_result.count() == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
inner_dim, dYxX, multiplier, &ctx());
*dAC += result;
// reduce inner dimensions
if (inner_dim == 1) {
SRes_data = dYxX;
} else {
SRes_data = (outer_dim == 1) ?
dA : sum_result.template mutable_data<T, Context>();
math::Gemv<T, Context>(
CblasNoTrans, sum_result.count(), inner_dim,
1.0, dYxX, multiplier,
SRes_data == dA ? 1.0 : 0.0, SRes_data, &ctx());
SRes_data == dA ? 1.0 : 0.0, SRes_data, ctx());
}
// reduce outer dimensions
if (outer_dim != 1) {
if (scale_dim == 1) {
auto* dAC = Output(1)->template mutable_data<T, CPUContext>();
T result = math::Dot<T, Context>(
outer_dim, multiplier, SRes_data, &ctx());
*dAC += result;
} else {
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dA, &ctx());
}
math::Gemv<T, Context>(
CblasTrans, outer_dim, scale_dim,
1.0, SRes_data, multiplier,
1.0, dA, ctx());
}
}
......@@ -185,11 +178,11 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient(
CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES));
size_t workspace_size = 0;
CUDNN_CHECK(cudnnGetReductionWorkspaceSize(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
input_desc, param_desc, &workspace_size));
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
CUDNN_CHECK(cudnnReduceTensor(
ctx().cudnn_handle(), reduce_desc,
ctx()->cudnn_handle(), reduce_desc,
nullptr, 0, WSdata, workspace_size,
CUDNNType<T>::one, input_desc, dY,
CUDNNType<T>::one, param_desc, dB));
......@@ -205,7 +198,7 @@ void CuDNNAffineGradientOp<Context>::ComputeBiasGradient_v2(
math::Gemv<T, Context>(
CblasNoTrans, scale_dim, inner_dim,
1.0, dY, multiplier,
1.0, dB, &ctx());
1.0, dB, ctx());
dY += dim;
}
}
......
......@@ -9,7 +9,7 @@ void DivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y);
math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,40 @@ void DivOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
}
if (type == 0) {
x2 = Input(1).template data<T, CPUContext>();
float inverse_x2 = 1.f / dragon_cast<float, T>(x2[0]);
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
math::MulScalar<T, Context>(
Output(0)->count(), inverse_x2, y, ctx());
} else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Div<T, Context>(
Output(0)->count(), x1, c, y, ctx());
}
}
......@@ -82,16 +88,16 @@ void DivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T,Context>(X1->count(), dy, x1, c); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2);
math::Mul<T,Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1);
math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
}
}
......@@ -118,23 +124,23 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto cs = ws()->template caches<T, Context>(
{ X1->count(), X2->count() });
math::Mul<T, Context>(X1->count(), dy, x1, cs[0]); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1.0, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X1->count(), dy, x1, cs[0], ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier,
0.0, cs[1], &ctx());
0.0, cs[1], ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, cs[0], multiplier,
0.0, cs[1], &ctx());
0.0, cs[1], ctx());
}
math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2);
math::Mul<T, Context>(X2->count(), cs[1], dx2, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -146,16 +152,16 @@ void DivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
math::Div<T, Context>(X1->count(), dy, dx1, dx1);
math::Div<T, Context>(X1->count(), dy, dx1, dx1, ctx());
}
}
......
......@@ -7,9 +7,13 @@ template <class Context> template <typename T>
void DotOp<Context>::DotRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, CPUContext>();
Ydata[0] = math::Dot<T, Context>(
Input(0).count(), X1data, X2data, &ctx());
auto* Ydata = Output(0)->template mutable_data<T, Context>();
T result_host;
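// math::Dot writes the scalar to the host; copy it into Output(0) through the context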
math::Dot<T, Context>(Input(0).count(),
X1data, X2data, &result_host, ctx());
ctx()->template Copy<T, Context, CPUContext>(
1, Ydata, &result_host);
}
template <class Context> template <typename T>
......@@ -22,7 +26,7 @@ void DotOp<Context>::GemmRunWithType() {
TransB ? CblasTrans : CblasNoTrans,
M, N1, K1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -33,7 +37,7 @@ void DotOp<Context>::GemvRunWithType() {
math::Gemv<T, Context>(
TransA ? CblasTrans : CblasNoTrans, M, N1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
}
template <class Context>
......@@ -98,12 +102,14 @@ void DotGradientOp<Context>::DotRunWithType() {
auto* dYdata = Input(2).template data<T, CPUContext>();
auto* dX1data = Output(0)->template mutable_data<T, Context>();
auto* dX2data = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dX1data, X2data);
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dX2data, X1data);
math::MulScalar<T, Context>(Output(0)->count(), dYdata[0], dX1data);
math::MulScalar<T, Context>(Output(1)->count(), dYdata[0], dX2data);
math::MulScalar<T, Context>(
Output(0)->count(), dYdata[0], dX1data, ctx());
math::MulScalar<T, Context>(
Output(1)->count(), dYdata[0], dX2data, ctx());
}
template <class Context> template <typename T>
......@@ -118,13 +124,13 @@ void DotGradientOp<Context>::GemmRunWithType() {
TransB ? CblasNoTrans : CblasTrans,
M, K1, N1,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K1, N1, M,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
}
template <class Context> template <typename T>
......@@ -138,11 +144,11 @@ void DotGradientOp<Context>::GemvRunWithType() {
CblasNoTrans, CblasNoTrans,
M, N1, 1,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemv<T, Context>(
TransA ? CblasNoTrans : CblasTrans, M, N1,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
}
template <class Context>
......
......@@ -7,10 +7,11 @@ template <class Context> template <typename T>
void EltwiseOp<Context>::SumRunWithType() {
TIndex count = Output(0)->count();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(count, dragon_cast<T, float>(0), Ydata);
math::Set<T, Context>(count,
dragon_cast<T, float>(0), Ydata, ctx());
for (int i = 0; i < InputSize(); ++i) {
math::Axpy<T, Context>(count, coeffs[i],
Input(i).template data<T, Context>(), Ydata, &ctx());
Input(i).template data<T, Context>(), Ydata, ctx());
}
}
......@@ -21,19 +22,24 @@ void EltwiseOp<Context>::ProdRunWithType() {
math::Mul<T, Context>(count,
Input(0).template data<T, Context>(),
Input(1).template data<T, Context>(),
Ydata);
Ydata, ctx());
for (int i = 2; i < InputSize(); i++) {
math::Mul<T, Context>(count,
Ydata,
Input(i).template data<T, Context>(),
Ydata);
Ydata, ctx());
}
}
template <class Context>
void EltwiseOp<Context>::RunOnDevice() {
for (int i = 1; i < InputSize(); i++)
CHECK(Input(i).dims() == Input(0).dims());
for (int i = 1; i < InputSize(); i++) {
CHECK(Input(i).dims() == Input(0).dims())
<< "\nExcepted Input(" << i << ")'s dims as "
<< Input(0).DimString() << ",\n but got "
<< Input(1).DimString() << ".";
}
Output(0)->ReshapeLike(Input(0));
if (operation == "SUM") {
......@@ -65,12 +71,12 @@ void EltwiseGradientOp<Context>::SumRunWithType() {
for (int i = 0; i < OutputSize(); i++) {
if (Output(i)->name() == "ignore") continue;
auto* dXdata = Output(i)->template mutable_data<T, Context>();
if (coeffs[i] == float(1)) {
ctx().template Copy<T, Context, Context>(
if (coeffs[i] == 1.f) {
ctx()->template Copy<T, Context, Context>(
count, dXdata, dYdata);
} else {
math::Scale<T, Context>(count,
coeffs[i], dYdata, dXdata, &ctx());
coeffs[i], dYdata, dXdata, ctx());
}
}
}
......@@ -88,11 +94,11 @@ void EltwiseGradientOp<Context>::ProdRunWithType() {
if (i == j) continue;
auto* Xdata = Input(j).template data<T, Context>();
if (!initialized) {
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata);
ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
initialized = true;
} else math::Mul<T, Context>(count, Xdata, dXdata, dXdata);
} else math::Mul<T, Context>(count, Xdata, dXdata, dXdata, ctx());
}
math::Mul<T, Context>(count, dYdata, dXdata, dXdata);
math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
}
}
......
......@@ -8,7 +8,7 @@ template <class Context> template <typename T>
void ExpOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata);
math::Exp<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -30,7 +30,8 @@ void ExpGradientOp<Context>::RunWithType() {
auto* Ydata = Input(0).template data<T, Context >();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Ydata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, Ydata, dXdata, ctx());
}
template <class Context>
......
......@@ -12,7 +12,7 @@ void GramMatrixOp<Context>::RunWithType() {
CblasNoTrans, CblasTrans,
dim, dim, inner_dim,
1.0, Xdata, Xdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
Xdata += x_offset;
Ydata += y_offset;
}
......@@ -47,7 +47,7 @@ void GramMatrixGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
dim, inner_dim, dim,
2.0, dYdata, Xdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
dYdata += y_offset;
dXdata += x_offset;
}
......
......@@ -23,7 +23,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasTrans,
M, num_output, K,
1.0, Xdata, Wdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M);
......@@ -32,7 +32,7 @@ void InnerProductOp<Context>::TransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, 1,
1.0, multiplier, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -55,7 +55,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, K,
1.0, Xdata, Wdata,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
if (InputSize() > 2) {
DECLARE_MULTIPLIER(multiplier, M);
......@@ -64,7 +64,7 @@ void InnerProductOp<Context>::NoTransRunWithType() {
CblasNoTrans, CblasNoTrans,
M, num_output, 1,
1.0, multiplier, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -102,30 +102,30 @@ void InnerProductGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(Input(1));
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
if (TransW) {
math::Gemm<T, Context>(
CblasTrans, CblasNoTrans,
num_output, K, M,
1.0, dYdata, Xdata,
1.0, dWdata, &ctx());
1.0, dWdata, ctx());
} else {
math::Gemm<T, Context>(
CblasTrans, CblasNoTrans,
K, num_output, M,
1.0, Xdata, dYdata,
1.0, dWdata, &ctx());
1.0, dWdata, ctx());
}
}
if (Output(2)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, M);
Output(2)->Reshape({ num_output });
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
math::Gemv<T, Context>(
CblasTrans, M, num_output,
1.0, dYdata, multiplier,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
if (Output(0)->name() != "ignore") {
......@@ -136,13 +136,13 @@ void InnerProductGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
M, K, num_output,
1.0, dYdata, Wdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else {
math::Gemm<T, Context>(
CblasNoTrans, CblasTrans,
M, K, num_output,
1.0, dYdata, Wdata,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
}
}
......
......@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void LogOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Log<T, Context>(Output(0)->count(), Xdata, Ydata);
math::Log<T, Context>(Output(0)->count(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -29,7 +29,7 @@ void LogGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata);
math::Div<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
}
template <class Context>
......
......@@ -16,7 +16,7 @@ void MatmulOp<Context>::RunWithType() {
TransB ? CblasTrans : CblasNoTrans,
M, N, K1,
1.0, X1data, X2data,
0.0, Ydata, &ctx());
0.0, Ydata, ctx());
X1data += x1_offset;
X2data += x2_offset;
Ydata += y_offset;
......@@ -76,13 +76,13 @@ void MatmulGradientOp<Context>::RunWithType() {
TransB ? CblasNoTrans : CblasTrans,
M, K1, N,
1.0, dYdata, X2data,
0.0, dX1data, &ctx());
0.0, dX1data, ctx());
math::Gemm<T, Context>(
TransA ? CblasNoTrans : CblasTrans,
CblasNoTrans,
K1, N, M,
1.0, X1data, dYdata,
0.0, dX2data, &ctx());
0.0, dX2data, ctx());
X1data += x1_offset;
X2data += x2_offset;
dX1data += x1_offset;
......
......@@ -9,7 +9,7 @@ void MulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,39 @@ void MulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
}
if (type == 0) {
x2 = Input(1).template data<T, CPUContext>();
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
math::MulScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x2[0]), y, ctx());
} else if (type == 1) {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
} else if (type == 2) {
outer_dim = Input(0).dim(0);
inner_dim = Input(0).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), x1, c, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), x1, c, y, ctx());
}
}
......@@ -79,13 +84,13 @@ void MulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
}
}
......@@ -110,19 +115,19 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c);
math::Mul<T, Context>(X1->count(), dy, x1, c, ctx());
if (type == 0 || type == 1) {
            DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
......@@ -135,16 +140,16 @@ void MulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x2,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x2, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
math::Mul<T, Context>(X1->count(), dy, dx1, dx1);
math::Mul<T, Context>(X1->count(), dy, dx1, dx1, ctx());
}
}
......
......@@ -9,16 +9,17 @@ void PowOp<Context>::RunWithType() {
TIndex count = Input(0).count();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0)) {
float value = (power == float(0)) ? float(1) : pow(shift, power);
math::Set<T, Context>(count, dragon_cast<T, float>(value), Ydata);
if (power_scale == 0.f) {
float value = (power == 0.f) ? 1.f : pow(shift, power);
math::Set<T, Context>(count,
dragon_cast<T, float>(value), Ydata, ctx());
return;
}
auto* Xdata = Input(0).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != float(1)) math::Scal<T, Context>(count, scale, Ydata, &ctx());
if (shift != float(0)) math::AddScalar<T, Context>(count, shift, Ydata);
if (power != float(1)) math::Pow<T, Context>(count, power, Ydata, Ydata);
ctx()->template Copy<T, Context, Context>(count, Ydata, Xdata);
if (scale != 1.f) math::Scal<T, Context>(count, scale, Ydata, ctx());
if (shift != 0.f) math::AddScalar<T, Context>(count, shift, Ydata, ctx());
if (power != 1.f) math::Pow<T, Context>(count, power, Ydata, Ydata, ctx());
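    // overall: Y = (shift + scale * X)^power; power_scale is presumably power * scale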
}
template <class Context>
......@@ -42,35 +43,36 @@ void PowGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (power_scale == float(0) || power == float(1)) {
if (power_scale == 0.f || power == 1.f) {
const T value = dragon_cast<T, float>(power_scale);
math::Set<T, Context>(count, value, dXdata);
math::Set<T, Context>(count, value, dXdata, ctx());
} else {
auto* Xdata = Input(0).template data<T, Context>();
if (power == float(2)) {
if (power == 2.f) {
math::Axpby<T, Context>(count,
power_scale * scale, Xdata,
0, dXdata, &ctx());
if (shift != float(0))
math::AddScalar<T, Context>(count, power_scale * shift, dXdata);
} else if (shift == float(0)) {
0, dXdata, ctx());
if (shift != 0.f)
math::AddScalar<T, Context>(count,
power_scale * shift, dXdata, ctx());
} else if (shift == 0.f) {
auto* Ydata = Input(1).template data<T, Context>();
math::Div<T, Context>(count, Ydata, Xdata, dXdata);
math::Scal<T, Context>(count, power, dXdata, &ctx());
math::Div<T, Context>(count, Ydata, Xdata, dXdata, ctx());
math::Scal<T, Context>(count, power, dXdata, ctx());
} else {
auto* Ydata = Input(1).template data<T, Context>();
ctx().template Copy<T, Context, Context>(count, dXdata, Xdata);
if (scale != float(1))
math::Scal<T, Context>(count, scale, dXdata, &ctx());
if (shift != float(0))
math::AddScalar<T, Context>(count, shift, dXdata);
math::Div<T, Context>(count, Ydata, dXdata, dXdata);
if (power_scale != float(1))
math::Scal<T, Context>(count, power_scale, dXdata, &ctx());
ctx()->template Copy<T, Context, Context>(count, dXdata, Xdata);
if (scale != 1.f)
math::Scal<T, Context>(count, scale, dXdata, ctx());
if (shift != 0.f)
math::AddScalar<T, Context>(count, shift, dXdata, ctx());
math::Div<T, Context>(count, Ydata, dXdata, dXdata, ctx());
if (power_scale != 1.f)
math::Scal<T, Context>(count, power_scale, dXdata, ctx());
}
}
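    // chain rule: dX so far holds power_scale * (shift + scale * X)^(power - 1),
    // which is multiplied by dY below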
if (power_scale != float(0))
math::Mul<T, Context>(count, dYdata, dXdata, dXdata);
if (power_scale != 0.f)
math::Mul<T, Context>(count, dYdata, dXdata, dXdata, ctx());
}
template <class Context>
......
......@@ -9,7 +9,7 @@ void RAddOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), x1, x2, y);
math::Add<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +19,24 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(1).count();
inner_dim = 1;
x1 = Input(0).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x1[0]), y, ctx());
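            // X1 is a single scalar in this case, so a plain AddScalar on y is sufficient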
} else {
outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
1.0, y, ctx());
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
1.0, y, &ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,7 +45,7 @@ void RAddOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -77,13 +78,13 @@ void RAddGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(1)->count(), dx2, dy);
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +109,7 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
outer_dim = X2->dim(0);
inner_dim = X2->count(1);
......@@ -116,13 +117,13 @@ void RAddGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X2->count(), dx2, dy);
}
}
......
......@@ -9,7 +9,7 @@ void RDivOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(Output(0)->count(), x1, x2, y);
math::Div<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -34,8 +34,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,8 +44,8 @@ void RDivOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, c, &ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Div<T, Context>(Output(0)->count(), c, x2, y, ctx());
}
}
......@@ -82,16 +82,16 @@ void RDivGradientOp<Context>::EltwiseRunWithType() {
auto* x2 = Input(1).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X1->count() })[0];
math::Mul<T, Context>(X1->count(), dy, x1, c); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2);
math::Mul<T, Context>(X1->count(), dy, x1, c, ctx()); // dY * X1
math::Square<T, Context>(X2->count(), x2, dx2, ctx()); // X2^{2}
math::Inv<T, Context>(X2->count(), -1, dx2, dx2, ctx()); // -1 / X2^{2}
math::Mul<T, Context>(X2->count(), c, dx2, dx2, ctx());
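        // altogether: dX2 = dY * X1 * (-1 / X2^2), the gradient of Y = X1 / X2 w.r.t. X2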
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Div<T, Context>(X1->count(), dy, x2, dx1);
math::Div<T, Context>(X1->count(), dy, x2, dx1, ctx());
}
}
......@@ -116,19 +116,19 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
math::Div<T, Context>(X2->count(), dy, x2, c);
math::Div<T, Context>(X2->count(), dy, x2, c, ctx());
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
......@@ -142,18 +142,18 @@ void RDivGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, multiplier, x1,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, x1, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
math::Div<T, Context>(X2->count(), dx2, x2, dx2);
math::Div<T, Context>(X2->count(), dx2, x2, dx2);
math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
math::Div<T, Context>(X2->count(), dx2, x2, dx2, ctx());
}
}
......
......@@ -9,7 +9,7 @@ void RMulOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), x1, x2, y);
math::Mul<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -18,34 +18,39 @@ void RMulOp<Context>::BroadcastRunWithType(int type) {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({
Output(0)->count() })[0];
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(1).count();
inner_dim = 1;
} else {
outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1);
}
if (type == 0) {
x1 = Input(0).template data<T, CPUContext>();
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
math::MulScalar<T, Context>(Output(0)->count(),
dragon_cast<float, T>(x1[0]), y, ctx());
} else if (type == 1) {
outer_dim = Input(1).count(0, Input(1).axis(-1));
inner_dim = Input(1).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), c, x2, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* c = ws()->template caches<T, Context>(
{ Output(0)->count() })[0];
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, c, &ctx());
math::Mul<T, Context>(Output(0)->count(), c, x2, y);
0.0, c, ctx());
math::Mul<T, Context>(
Output(0)->count(), c, x2, y, ctx());
}
}
......@@ -79,13 +84,13 @@ void RMulGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* x1 = Input(0).template data<T, Context>();
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2);
math::Mul<T, Context>(Output(1)->count(), dy, x1, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1);
math::Mul<T, Context>(Output(0)->count(), dy, x2, dx1, ctx());
}
}
......@@ -110,19 +115,19 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* dx1 = Output(0)->template mutable_data<T, Context>();
auto* c = ws()->template caches<T, Context>({ X2->count() })[0];
math::Mul<T, Context>(X2->count(), dy, x2, c);
math::Mul<T, Context>(X2->count(), dy, x2, c, ctx());
if (type == 0 || type == 1) {
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, c, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
......@@ -135,16 +140,16 @@ void RMulGradientOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
DECLARE_MULTIPLIER(multiplier, inner_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
math::Mul<T, Context>(X2->count(), dy, dx2, dx2);
math::Mul<T, Context>(X2->count(), dy, dx2, dx2, ctx());
}
}
......
......@@ -9,7 +9,7 @@ void RSubOp<Context>::EltwiseRunWithType() {
auto* x1 = Input(0).template data<T, Context>();
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Output(0)->count(), x1, x2, y);
math::Sub<T, Context>(Output(0)->count(), x1, x2, y, ctx());
}
template <class Context> template <typename T>
......@@ -19,7 +19,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x2);
if (type == 0 || type == 1) {
......@@ -35,7 +35,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, multiplier, x1,
-1.0, y, &ctx());
-1.0, y, ctx());
} else if (type == 2) {
outer_dim = Input(1).dim(0);
inner_dim = Input(1).count(1);
......@@ -44,7 +44,7 @@ void RSubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
1.0, x1, multiplier,
-1.0, y, &ctx());
-1.0, y, ctx());
}
}
......@@ -78,12 +78,12 @@ void RSubGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(
Output(1)->count(), -1, dy, dx2, &ctx());
Output(1)->count(), -1, dy, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -108,7 +108,7 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
} else if (type == 2) {
outer_dim = X2->dim(0);
inner_dim = X2->count(1);
......@@ -116,14 +116,14 @@ void RSubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
1.0, dy, multiplier,
0.0, dx1, &ctx());
0.0, dx1, ctx());
}
}
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(
X2->count(), -1, dy, dx2, &ctx());
X2->count(), -1, dy, dx2, ctx());
}
}
......
......@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void SquareOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata);
math::Pow<T, Context>(Output(0)->count(), 2.0, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -29,8 +29,8 @@ void SquareGradientOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata);
math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, &ctx());
math::Mul<T, Context>(Output(0)->count(), dYdata, Xdata, dXdata, ctx());
math::Scal<T, Context>(Output(0)->count(), 2.0, dXdata, ctx());
}
template <class Context>
......
......@@ -9,7 +9,8 @@ void SubOp<Context>::EltwiseRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
math::Sub<T, Context>(Output(0)->count(),
X1data, X2data, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -19,23 +20,24 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
auto* x2 = Input(1).template data<T, Context>();
auto* y = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), y, x1);
if (type == 0 || type == 1) {
if (type == 0) {
outer_dim = Input(0).count();
inner_dim = 1;
x2 = Input(1).template data<T, CPUContext>();
math::AddScalar<T, Context>(Output(0)->count(),
-dragon_cast<float, T>(x2[0]), y, ctx());
} else {
outer_dim = Input(0).count(0, Input(0).axis(-1));
inner_dim = Input(0).dim(-1);
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, multiplier, x2,
1.0, y, ctx());
}
DECLARE_MULTIPLIER(multiplier, outer_dim);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, multiplier, x2,
1.0, y, &ctx());
}
else if (type == 2) {
outer_dim = Input(0).dim(0);
......@@ -45,7 +47,7 @@ void SubOp<Context>::BroadcastRunWithType(int type) {
CblasNoTrans, CblasNoTrans,
outer_dim, inner_dim, 1,
-1.0, x2, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -79,12 +81,12 @@ void SubGradientOp<Context>::EltwiseRunWithType() {
if (Output(1)->name() != "ignore") {
auto* dx2 = Output(1)->template mutable_data<T, Context>();
math::Scale<T, Context>(Output(1)->count(),
-1.0, dy, dx2, &ctx());
-1.0, dy, dx2, ctx());
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dx1, dy);
}
}
......@@ -109,7 +111,7 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasTrans, outer_dim, inner_dim,
-1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
} else if (type == 2) {
outer_dim = X1->dim(0);
inner_dim = X1->count(1);
......@@ -117,13 +119,13 @@ void SubGradientOp<Context>::BroadcastRunWithType(int type) {
math::Gemv<T, Context>(
CblasNoTrans, outer_dim, inner_dim,
-1.0, dy, multiplier,
0.0, dx2, &ctx());
0.0, dx2, ctx());
}
}
if (Output(0)->name() != "ignore") {
auto* dx1 = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
X1->count(), dx1, dy);
}
}
......
......@@ -8,7 +8,8 @@ void CompareOp<Context>::EqualRunWithType() {
auto* X1data = Input(0).template data<T, Context>();
auto* X2data = Input(1).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Equal<T, Context>(Output(0)->count(), X1data, X2data, Ydata);
kernel::Equal<T, Context>(Output(0)->count(),
X1data, X2data, Ydata, ctx());
}
template <class Context>
......
......@@ -7,7 +7,7 @@ void CopyOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), Ydata, Xdata);
}
......
......@@ -20,10 +20,10 @@ void CTCLossGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scale<T, Context>(Output(0)->count(),
dYdata_host, Gdata, dXdata, &ctx());
dYdata_host, Gdata, dXdata, ctx());
}
template <class Context>
......
......@@ -45,7 +45,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
cudnnSetTensorDesc<T>(&grad_desc, Input(0).dims());
CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(
ctx().cudnn_handle(), prob_desc, grad_desc,
ctx()->cudnn_handle(), prob_desc, grad_desc,
packed_labels.data(), label_lengths.data(),
input_lengths.data(),
ctc_algo, ctc_desc, &workspace_size));
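    // query the scratch space cudnnCTCLoss will need before launching it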
......@@ -58,7 +58,7 @@ void CuDNNCTCLossOp<Context>::RunWithType() {
auto* WSdata = (uint8_t*)ws()->template caches<Context>({
workspace_size })[0];
CUDNN_CHECK(cudnnCTCLoss(ctx().cudnn_handle(),
CUDNN_CHECK(cudnnCTCLoss(ctx()->cudnn_handle(),
prob_desc, Pdata, packed_labels.data(),
label_lengths.data(), input_lengths.data(),
Ydata, grad_desc, Gdata,
......
......@@ -12,11 +12,13 @@ void L1LossOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(Input(0).count(), X0data, X1data, diff_data);
math::Sub<T, Context>(Input(0).count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
CHECK_EQ(Input(0).count(), Input(2).count());
auto* Wdata = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
math::Mul<T, Context>(diff->count(),
Wdata, diff_data, diff_data, ctx());
}
T normalizer = 1;
......@@ -27,11 +29,13 @@ void L1LossOp<Context>::RunWithType() {
}
T loss = math::ASum<T, Context>(diff->count(), diff_data);
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void L1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
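    // reductions such as math::ASum below return host scalars,
    // which presumably requires running on the default stream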
CHECK_EQ(Input(0).count(), Input(1).count());
Output(0)->Reshape({ 1 });
diff = ws()->CreateTensor("/mnt/" + anchor() + "/l1_loss/diff");
......@@ -51,9 +55,11 @@ template <class Context> template <typename T>
void L1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
kernel::AbsGrad<T, Context>(diff->count(), diff_data, diff_data);
ctx()->FinishDeviceCompution();
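    // make sure the async copy above has finished before dYdata_host is read on the host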
kernel::AbsGrad<T, Context>(diff->count(),
diff_data, diff_data, ctx());
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -69,7 +75,7 @@ void L1LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
}
}
......
......@@ -9,12 +9,14 @@ void L2LossOp<Context>::RunWithType() {
auto* X0data = Input(0).template data<T, Context>();
auto* X1data = Input(1).template data<T, Context>();
auto* diff_data = diff->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Sub<T, Context>(diff->count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
CHECK_EQ(Input(0).count(), Input(2).count());
auto* Wdata = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(), Wdata, diff_data, diff_data);
math::Mul<T, Context>(diff->count(),
Wdata, diff_data, diff_data, ctx());
}
T normalizer = 1;
......@@ -23,10 +25,12 @@ void L2LossOp<Context>::RunWithType() {
} else if (normalization == "FULL") {
normalizer = Input(0).count();
}
normalizer *= 2;
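    // the conventional 1/2 factor of the L2 loss is folded into the doubled normalizer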
T loss = T(0.5) * math::Dot<T, Context>(diff->count(),
diff_data, diff_data, &ctx());
math::Set<T, Context>(1, loss / normalizer, Ydata);
T loss;
math::Dot<T, Context>(diff->count(),
diff_data, diff_data, &loss, ctx());
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
......@@ -48,10 +52,11 @@ OPERATOR_SCHEMA(L2Loss).NumInputs(2, 3).NumOutputs(1);
template <class Context> template <typename T>
void L2LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* diff_data = diff->template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
ctx()->FinishDeviceCompution();
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -67,7 +72,7 @@ void L2LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
}
}
......
......@@ -13,11 +13,11 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
auto* Fdata = flags.template mutable_data<T, Context>();
kernel::SigmoidCrossEntropy<T, Context>(
Input(0).count(), Xdata, Tdata, Ldata, Fdata, &ctx());
Input(0).count(), Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -35,11 +35,13 @@ void SigmoidCrossEntropyOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK_EQ(Input(0).count(), Input(1).count())
<< "\nNumber of predictions must match the number of labels.";
losses.ReshapeLike(Input(0));
......@@ -63,12 +65,12 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
auto* Fdata = flags.template mutable_data<T, Context>();
kernel::SigmoidCrossEntropyGrad<T, Context>(
Input(0).count(), Xdata, Tdata, dXdata, Fdata, &ctx());
Input(0).count(), Xdata, Tdata, dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata); return;
dYdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -83,14 +85,16 @@ void SigmoidCrossEntropyGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
Output(0)->ReshapeLike(Input(0));
flags.ReshapeLike(Input(0));
......
......@@ -15,11 +15,11 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
kernel::SigmoidFocalLoss<T, Context>(
outer_dim, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Xdata, Tdata, Ldata, Fdata, &ctx());
Xdata, Tdata, Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -37,11 +37,13 @@ void SigmoidFocalLossOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SigmoidFocalLossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
axis_dim = Input(0).dim(axis);
inner_dim = Input(0).count(axis + 1);
......@@ -71,12 +73,12 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
kernel::SigmoidFocalLossGradient<T, Context>(
outer_dim, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Xdata, Tdata, dXdata, Fdata, &ctx());
Xdata, Tdata, dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata); return;
dYdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -91,14 +93,16 @@ void SigmoidFocalLossGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SigmoidFocalLossGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
axis_dim = Input(0).dim(axis);
inner_dim = Input(0).count(axis + 1);
......
......@@ -11,20 +11,21 @@ void SmoothL1LossOp<Context>::RunWithType() {
auto* X1data = Input(1).template data<T, Context>();
auto* diff_data = diff->template mutable_data<T, Context>();
auto* error_data = error->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Sub<T, Context>(diff->count(), X0data, X1data, diff_data);
math::Sub<T, Context>(diff->count(),
X0data, X1data, diff_data, ctx());
if (InputSize() > 2) {
auto* inside_w_data = Input(2).template data<T, Context>();
math::Mul<T, Context>(diff->count(),
inside_w_data, diff_data, diff_data);
inside_w_data, diff_data, diff_data, ctx());
}
kernel::SmoothL1<T, Context>(
diff->count(), beta, diff_data, error_data);
kernel::SmoothL1<T, Context>(diff->count(),
beta, diff_data, error_data, ctx());
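    // SmoothL1(x) is presumably 0.5 * x^2 / beta for |x| < beta, and |x| - 0.5 * beta otherwise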
if (InputSize() > 3) {
auto* outside_w_data = Input(3).template data<T, Context>();
math::Mul<T, Context>(diff->count(),
outside_w_data, error_data, error_data);
outside_w_data, error_data, error_data, ctx());
}
T normalizer = 1;
......@@ -34,12 +35,14 @@ void SmoothL1LossOp<Context>::RunWithType() {
normalizer = Input(0).count();
}
T loss = math::ASum<T, Context>(error->count(), error_data);
math::Set<T, Context>(1, loss / normalizer, Ydata);
float loss = math::ASum<float, Context>(error->count(), error_data);
math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SmoothL1LossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
CHECK(Input(0).dims() == Input(1).dims());
if (InputSize() > 2) CHECK(Input(0).dims() == Input(2).dims());
if (InputSize() > 3) CHECK(Input(0).dims() == Input(3).dims());
......@@ -64,10 +67,12 @@ template <class Context> template <typename T>
void SmoothL1LossGradientOp<Context>::RunWithType() {
auto* diff_data = diff->template mutable_data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
kernel::SmoothL1Grad<T, Context>(
diff->count(), beta, diff_data, diff_data);
ctx()->FinishDeviceCompution();
kernel::SmoothL1Grad<T, Context>(diff->count(),
beta, diff_data, diff_data, ctx());
T alpha = dYdata_host, normalizer = 1;
if (normalization == "BATCH_SIZE") {
......@@ -83,16 +88,16 @@ void SmoothL1LossGradientOp<Context>::RunWithType() {
const T sign = (i == 0) ? 1 : -1;
alpha *= sign;
math::Axpby<T, Context>(Output(i)->count(),
alpha, diff_data, 0, dXdata, &ctx());
alpha, diff_data, 0, dXdata, ctx());
if (InputSize() > 3) {
auto* inside_w_data = Input(2).template data<T, Context>();
math::Mul<T, Context>(Output(i)->count(),
inside_w_data, dXdata, dXdata);
inside_w_data, dXdata, dXdata, ctx());
}
if (InputSize() > 4) {
auto* outside_w_data = Input(3).template data<T, Context>();
math::Mul<T, Context>(Output(i)->count(),
outside_w_data, dXdata, dXdata);
outside_w_data, dXdata, dXdata, ctx());
}
}
}
......
......@@ -26,15 +26,15 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<T, Context>();
auto* Tdata = Input(1).template data<T, Context>();
auto* Ldata = losses.template mutable_data<T, Context>();
kernel::SoftmaxCrossEntropy<T, Context>(
Input(0).count(), Pdata, Tdata, Ldata);
kernel::SoftmaxCrossEntropy<T, Context>(Input(0).count(),
Pdata, Tdata, Ldata, ctx());
if (normalization == "UNIT") {
Output(0)->Reshape({ outer_dim * inner_dim });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sum<T, Context>(outer_dim * inner_dim,
Input(0).dim(axis), inner_dim,
Ldata, Ydata); return;
Ldata, Ydata, ctx()); return;
}
T normalizer = 1;
......@@ -47,11 +47,13 @@ void SoftmaxCrossEntropyOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(Input(0).count(), Input(1).count())
......@@ -76,16 +78,16 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Tdata = Input(1).template data<T, Context>();
auto* Pdata = prob->template mutable_data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
ctx()->template Copy<T, Context, Context>(prob->count(), dXdata, Pdata);
math::Axpy<T, Context>(Output(0)->count(),
-1.0, Tdata, dXdata, &ctx());
-1.0, Tdata, dXdata, ctx());
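    // dX = P - T, the usual softmax cross-entropy gradient w.r.t. the logits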
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(outer_dim * inner_dim,
Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata);
Input(0).dim(axis), inner_dim, 1.0, dYdata, Pdata, ctx());
math::Mul<T, Context>(Output(0)->count(),
Pdata, dXdata, dXdata); return;
Pdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -96,10 +98,10 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
......
......@@ -20,11 +20,11 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
outer_dim, Input(0).dim(axis), inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Pdata, Tdata, Idata, this->ignores.count(),
Ldata, Fdata, &ctx());
Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
......@@ -42,11 +42,13 @@ void SoftmaxFocalLossOp<Context>::RunWithType() {
T loss = math::ASum<T, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(1, loss / normalizer, Ydata);
math::Set<T, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SoftmaxFocalLossOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count())
......@@ -80,16 +82,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
outer_dim, Output(0)->dim(axis), inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
Pdata, Tdata, Idata, this->ignores.count(),
dXdata, Fdata, &ctx());
dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(
Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata);
1.0, dYdata, Pdata, ctx());
math::Mul<T, Context>(Output(0)->count(),
Pdata, dXdata, dXdata); return;
Pdata, dXdata, dXdata, ctx()); return;
}
T normalizer = 1;
......@@ -104,14 +106,16 @@ void SoftmaxFocalLossGradientOp<Context>::RunWithType() {
}
auto* dYdata = Input(-1).template data<T, Context>();
T dYdata_host; ctx().template Copy<T, CPUContext, Context>(
T dYdata_host; ctx()->template Copy<T, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<T, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SoftmaxFocalLossGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
this->prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = this->prob->count(0, axis);
inner_dim = this->prob->count(axis + 1);
......
......@@ -21,83 +21,66 @@ void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRun() {
softmax_op->Run();
}
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::SoftmaxRunFP16() {
Tensor* XF32 = ws()->CreateTensor(
"/mnt/" + anchor() + "/softmax/xf32");
XF32->ReshapeLike(Input(0));
auto* XdataF16 = Input(0).template data<float16, Context>();
auto* XdataF32 = XF32->template mutable_data<float, Context>();
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), XdataF16, XdataF32);
OperatorDef softmax_def = MakeOperatorDef("Softmax", "",
vector<string>({ XF32->name() }),
vector<string>({ "/mnt/" + anchor() + "/softmax/prob" }));
softmax_def.add_arg()->CopyFrom(this->arg("axis"));
if (def().has_device_option())
softmax_def.mutable_device_option()
->CopyFrom(def().device_option());
if (!softmax_op) softmax_op.reset(
CreateOperator(softmax_def, ws()));
else softmax_op->MutableOp(softmax_def);
softmax_op->Run();
}
template <class Context> template <typename Tx, typename Ty>
void SparseSoftmaxCrossEntropyOp<Context>::RunWithType() {
auto* Pdata = prob->template data<Tx, Context>();
auto* Tdata = Input(1).template data<Ty, Context>();
auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>();
auto* Ldata = losses.template mutable_data<Tx, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>();
auto* Ldata = losses.template mutable_data<float, Context>();
auto* Fdata = flags.template mutable_data<float, Context>();
kernel::SparseSoftmaxCrossEntropy<Tx, Ty, Context>(
outer_dim, Input(0).dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(),
Ldata, Fdata, &ctx());
Ldata, Fdata, ctx());
if (normalization == "UNIT") {
Output(0)->ReshapeLike(losses);
Output(0)->template CopyFrom<Context>(losses);
Output(0)->template CopyFrom<Context>(losses, ctx());
return;
}
Tx normalizer = 1;
float normalizer = 1;
if (normalization == "VALID") {
normalizer = std::max(
math::ASum<Tx, Context>(
flags.count(), Fdata), (Tx)1.f);
math::ASum<float, Context>(
flags.count(), Fdata), 1.f);
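        // "VALID" appears to normalize by the number of non-ignored entries
        // (the sum of the flags), clamped to at least 1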
} else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0);
} else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim;
}
Tx loss = math::ASum<Tx, Context>(losses.count(), Ldata);
float loss = math::ASum<float, Context>(losses.count(), Ldata);
Output(0)->Reshape({ 1 });
auto* Ydata = Output(0)->template mutable_data<Tx, Context>();
math::Set<Tx, Context>(1, loss / normalizer, Ydata);
auto* Ydata = Output(0)->template mutable_data<float, Context>();
math::Set<float, Context>(1, loss / normalizer, Ydata, ctx());
}
template <class Context>
void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
outer_dim = Input(0).count(0, axis);
inner_dim = Input(0).count(axis + 1);
CHECK_EQ(outer_dim * inner_dim, Input(1).count())
<< "\nNumber of predictions must match the number of labels.";
losses.Reshape({ outer_dim * inner_dim });
flags.Reshape({ outer_dim * inner_dim });
prob = ws()->CreateTensor("/mnt/" + anchor() + "/softmax/prob");
SoftmaxRun();
if (XIsType(Input(0), float) ||
XIsType(Input(0), float16)) {
if (XIsType(Input(0), float16)) SoftmaxRunFP16();
else SoftmaxRun();
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
} else if (XIsType(Input(0), float16)) {
if (XIsType(Input(1), float)) RunWithType<float16, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
DEPLOY_CPU(SparseSoftmaxCrossEntropy);
......@@ -113,62 +96,66 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::RunWithType() {
auto* Idata = !ignores.count() ? nullptr :
ignores.template data<int, Context>();
auto* dXdata = Output(0)->template mutable_data<Tx, Context>();
auto* Fdata = flags.template mutable_data<Tx, Context>();
ctx().template Copy<Tx, Context, Context>(
auto* Fdata = flags.template mutable_data<float, Context>();
ctx()->template Copy<Tx, Context, Context>(
prob->count(), dXdata, Pdata);
kernel::SparseSoftmaxCrossEntropyGrad<Tx, Ty, Context>(
outer_dim, Output(0)->dim(axis), inner_dim,
Pdata, Tdata, Idata, ignores.count(),
dXdata, Fdata, &ctx());
dXdata, Fdata, ctx());
if (normalization == "UNIT") {
auto* dYdata = Input(-1).template data<Tx, Context>();
kernel::SumGrad<Tx, Context>(
auto* dYdata = Input(-1).template data<float, Context>();
auto* WSdata = ws()->template caches<float, Context>(
{ Input(0).count() })[0];
kernel::SumGrad<float, Context>(
Input(0).count() / Input(0).dim(axis),
Input(0).dim(axis), inner_dim,
1.0, dYdata, Pdata);
math::Mul<Tx, Context>(
Output(0)->count(), Pdata, dXdata, dXdata);
1.0, dYdata, WSdata, ctx());
kernel::TypeA2B<float, Tx, Context>(
Input(0).count(), WSdata, Pdata, ctx());
math::Mul<Tx, Context>(Output(0)->count(),
Pdata, dXdata, dXdata, ctx());
return;
}
Tx normalizer = 1;
float normalizer = 1;
if (normalization == "VALID") {
normalizer = std::max(
math::ASum<Tx, Context>(
flags.count(), Fdata), (Tx)1.f);
math::ASum<float, Context>(
flags.count(), Fdata), 1.f);
} else if (normalization == "BATCH_SIZE") {
normalizer = Input(0).dim(0);
} else if (normalization == "FULL") {
normalizer = outer_dim * inner_dim;
}
auto* dYdata = Input(-1).template data<Tx, Context>();
Tx dYdata_host; ctx().template Copy<Tx, CPUContext, Context>(
auto* dYdata = Input(-1).template data<float, Context>();
float dYdata_host; ctx()->template Copy<float, CPUContext, Context>(
1, &dYdata_host, dYdata);
math::Scal<Tx, Context>(Output(0)->count(),
dYdata_host / normalizer, dXdata, &ctx());
dYdata_host / normalizer, dXdata, ctx());
}
template <class Context>
void SparseSoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
prob = ws()->GetTensor("/mnt/" + anchor() + "/softmax/prob");
outer_dim = prob->count(0, axis);
inner_dim = prob->count(axis + 1);
Output(0)->ReshapeLike(Input(0));
flags.Reshape({ outer_dim * inner_dim });
if (XIsType(Input(0), float) || XIsType(Input(0), float16)) {
if (XIsType(Input(0), float)) {
if (XIsType(Input(1), float)) RunWithType<float, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
if (XIsType(Input(0), float16)) {
auto* dXdataF32 = Output(0)->template data<float, Context>();
auto* dXdataF16 = prob->template mutable_data<float16, Context>();
kernel::TypeA2B<float, float16, Context>(Output(0)->count(), dXdataF32, dXdataF16);
Output(0)->template CopyFrom<Context>(*prob);
}
} else if (XIsType(Input(0), float16)) {
if (XIsType(Input(1), float)) RunWithType<float16, float>();
else if (XIsType(Input(1), int64_t)) RunWithType<float16, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -9,23 +9,27 @@ namespace dragon {
template <class Context> template <typename Tx, typename Ty>
void AccuracyOp<Context>::RunWithType() {
static CPUContext cctx;
float* Y1data, *Y2data = nullptr;
Y1data = Output(0)->template mutable_data<float, CPUContext>();
if (OutputSize() > 1) {
math::Set<float, CPUContext>(num_classes, 0,
Output(1)->template mutable_data<float, CPUContext>());
Y2data = Output(1)->template mutable_data<float, CPUContext>();
math::Set<float, CPUContext>(num_classes, 0, Y2data, &cctx);
}
    Map<int, TIndex> num_per_class;
TIndex acc = 0, count = 0;
const Tx* Xdata;
if (XIsType(Input(0), float16)) {
Tensor* XF32 = ws()->CreateTensor("/mnt/" + anchor() + "/accuracy/xf32");
XF32->ReshapeLike(Input(0));
auto* XdataF16 = Input(0).template data<float16, CPUContext>();
auto* XdataF32 = XF32->template mutable_data<float, CPUContext>();
Tensor* X32T = ws()->CreateTensor(
"/mnt/" + anchor() + "/accuracy/f32");
X32T->ReshapeLike(Input(0));
auto* X16 = Input(0).template data<float16, CPUContext>();
auto* X32 = X32T->template mutable_data<float, CPUContext>();
kernel::TypeA2B<float16, float, CPUContext>(
Input(0).count(), XdataF16, XdataF32);
Xdata = XdataF32;
Input(0).count(), X16, X32, &cctx);
Xdata = X32;
} else Xdata = Input(0).template data<Tx, CPUContext>();
auto* labels = Input(1).template data<Ty, CPUContext>();
......@@ -41,15 +45,13 @@ void AccuracyOp<Context>::RunWithType() {
vector<pair<Tx, int> > vec;
for (int k = 0; k < num_classes; k++)
vec.push_back(
std::make_pair(Xdata[i * dim + k * inner_dim + j], k)
);
std::make_pair(Xdata[i * dim + k * inner_dim + j], k));
std::partial_sort(
vec.begin(), vec.begin() + top_k, vec.end(),
std::greater<pair<Tx, int> >());
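                // a hit on any of the top_k highest-scoring classes counts as correct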
for (int k = 0; k < top_k; k++) {
if (vec[k].second == label) {
if (OutputSize() > 1)
Output(1)->template mutable_data<float, CPUContext>()[label]++;
if (OutputSize() > 1) Y2data[label]++;
acc++;
break;
}
......@@ -58,12 +60,11 @@ void AccuracyOp<Context>::RunWithType() {
} // end inner_dim
} // end outer_dim
Output(0)->template mutable_data<float, CPUContext>()[0] = (float)acc / count;
if (OutputSize() > 1) {
auto* acc_per_class = Output(1)->template mutable_data<float, CPUContext>();
Y1data[0] = (float)acc / count;
if (Y2data) {
for (int i = 0; i < num_classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ?
0 : acc_per_class[i] / num_per_class[i];
Y2data[i] = num_per_class[i] == 0 ?
0 : Y2data[i] / num_per_class[i];
}
}
......
......@@ -14,14 +14,14 @@ namespace dragon {
Output(0)->ReshapeLike(Input(0)); \
auto* Xdata = Input(0).template data<type_a, Context>(); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata); \
kernel::TypeA2B<type_a, type_b, Context>(Input(0).count(), Xdata, Ydata, ctx()); \
} else { \
TIndex count = Output(0)->count(); \
auto* Xdata = Output(0)->template data<type_a, Context>(); \
auto* Cdata = ws()->template caches<type_b, Context>({ count })[0]; \
kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata); \
kernel::TypeA2B<type_a, type_b, Context>(count, Xdata, Cdata, ctx()); \
auto* Ydata = Output(0)->template mutable_data<type_b, Context>(); \
ctx().template Copy<type_b, Context, Context>(count, Ydata, Cdata); \
ctx()->template Copy<type_b, Context, Context>(count, Ydata, Cdata); \
} \
return; \
}
......
......@@ -11,7 +11,7 @@ void GradientGenerateOp<Context>::RunWithType() {
Output(i)->ReshapeLike(Input(i));
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(),
dragon_cast<T, float>(defaults[i]), dXdata);
dragon_cast<T, float>(defaults[i]), dXdata, ctx());
}
}
......@@ -37,12 +37,13 @@ void GradientGatherOp<Context>::RunWithType() {
CHECK(Output(0)->dims() == Input(indices[i]).dims());
auto* dYdata = Input(indices[i]).template data<T, Context>();
if (i == 0) {
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
count, dXdata, dYdata);
} else {
math::Add<T, Context>(
count, dXdata, dYdata, dXdata);
count, dXdata, dYdata, dXdata, ctx());
}
ctx()->FinishDeviceCompution();
Input(indices[i]).Reset();
}
}
......@@ -68,7 +69,7 @@ template <class Context>
void StopGradientOp<Context>::RunOnDevice() {
if (Output(0)->name() != Input(0).name()) {
Output(0)->ReshapeLike(Input(0));
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
}
}
......
......@@ -14,7 +14,7 @@ void ImageDataOp<Context>::RunWithType() {
kernel::ImageData<Tx, Ty, Context>(
Output(0)->count(), n, c, h, w, Mdata, Sdata,
data_format, Xdata, Ydata);
data_format, Xdata, Ydata, ctx());
}
template <class Context>
......
......@@ -7,7 +7,7 @@ template <class Context> template <typename T>
void InitializeOp<Context>::RunWithType() {
unique_ptr< Filler<T, Context> > f;
f.reset(CreateFiller<T, Context>(filler));
f->Fill(Output(0), &ctx());
f->Fill(Output(0), ctx());
}
template <class Context>
......
......@@ -14,7 +14,7 @@ void MPIBroadcastOp<Context>::RunWithType() {
auto* Xdata = Input(0).template mutable_data<T, CPUContext>();
#endif
MPI_Bcast(Xdata, Input(0).count(), mpi_dtype(), comm_root, comm);
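        // MPI_Bcast is collective: the data rooted at comm_root is propagated to every rank in comm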
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
} else {
#ifdef WITH_MPI_CUDA
auto* Ydata = Output(0)->template mutable_data<T, Context>();
......@@ -62,12 +62,13 @@ void MPIBroadcastGradientOp<Context>::RunWithType() {
#ifdef WITH_MPI_CUDA
auto* dYdata = Input(-1).template mutable_data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), dXdata, dYdata);
#else
auto* dYdata = Input(-1).template mutable_data<T, CPUContext>();
auto* dXdata = Output(0)->template mutable_data<T, CPUContext>();
CPUContext::template Copy<T, CPUContext, CPUContext>(
static CPUContext cctx;
cctx.template Copy<T, CPUContext, CPUContext>(
Output(0)->count(), dXdata, dYdata);
#endif
for (int i = 0; i < comm_size; i++) {
......@@ -76,10 +77,10 @@ void MPIBroadcastGradientOp<Context>::RunWithType() {
i, 0, comm, MPI_STATUS_IGNORE);
#ifdef WITH_MPI_CUDA
math::Add<T, Context>(Output(0)->count(),
dYdata, dXdata, dXdata);
dYdata, dXdata, dXdata, ctx());
#else
math::Add<T, CPUContext>(Output(0)->count(),
dYdata, dXdata, dXdata);
math::Add<T, CPUContext>(Output(0)->count(),
dYdata, dXdata, dXdata, &cctx);
#endif
}
}
......
......@@ -8,7 +8,7 @@ namespace dragon {
template <class Context> template <typename T>
void MPIGatherOp<Context>::RunWithType() {
if (comm_rank == comm_root) {
Output(comm_rank)->template CopyFrom<Context>(Input(0));
Output(comm_rank)->template CopyFrom<Context>(Input(0), ctx());
for (int i = 0; i < comm_size; i++) {
if (i == comm_root) continue;
#ifdef WITH_MPI_CUDA
......@@ -76,7 +76,8 @@ OPERATOR_SCHEMA(MPIGather).NumInputs(1).NumOutputs(1, INT_MAX);
template <class Context> template <typename T>
void MPIGatherGradientOp<Context>::RunWithType() {
if (comm_rank == comm_root) {
Output(0)->template CopyFrom<Context>(Input(this->comm_rank + 1));
Output(0)->template CopyFrom<Context>(
Input(this->comm_rank + 1), ctx());
for (int i = 0; i < comm_size; i++) {
if (i == comm_root) continue;
#ifdef WITH_MPI_CUDA
......
......@@ -11,7 +11,7 @@ void ArangeOp<Context>::RunWithType() {
count = (stop_ - start_ - 1) / step_ + 1;
Output(0)->Reshape({ count });
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Arange<T, Context>(count, start_, step_, Ydata);
kernel::Arange<T, Context>(count, start_, step_, Ydata, ctx());
}
template <class Context>
......
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "operators/ndarray/argreduce_op.h"
namespace dragon {
......@@ -12,14 +13,15 @@ void ArgReduceOp<Context>::RunWithType() {
auto* Idata = Output(0)->template mutable_data<int64_t, CPUContext>();
auto* Vdata = OutputSize() == 2 ?
Output(1)->template mutable_data<T, CPUContext>() : nullptr;
static CPUContext cctx;
if (operation == "ARGMAX") {
kernel::Argmax<T, CPUContext>(
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata);
top_k, Xdata, Idata, Vdata, &cctx);
} else if (operation == "ARGMIN") {
kernel::Argmin<T, CPUContext>(
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata);
top_k, Xdata, Idata, Vdata, &cctx);
} else LOG(FATAL) << "Unknown operation: [" << operation << "].";
} else {
auto* Xdata = Input(0).template data<T, Context>();
......@@ -29,11 +31,11 @@ void ArgReduceOp<Context>::RunWithType() {
if (operation == "ARGMAX") {
kernel::Argmax<T, Context>(
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata);
top_k, Xdata, Idata, Vdata, ctx());
} else if (operation == "ARGMIN") {
kernel::Argmin<T, Context>(
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata);
count, axis_dim, inner_dim,
top_k, Xdata, Idata, Vdata, ctx());
} else LOG(FATAL) << "Unknown operation: [" << operation << "].";
}
}
......
......@@ -14,7 +14,7 @@ void ConcatOp<Context>::RunWithType() {
kernel::Concat<T, Context>(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim,
concat_offset, Xdata, Ydata);
concat_offset, Xdata, Ydata, ctx());
concat_offset += x_concat_dim;
}
}
......@@ -61,7 +61,7 @@ void ConcatGradientOp<Context>::RunWithType() {
kernel::ConcatGrad<T, Context>(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim,
concat_offset, dYdata, dXdata);
concat_offset, dYdata, dXdata, ctx());
}
concat_offset += x_concat_dim;
}
......
......@@ -17,7 +17,7 @@ void CropOp<Context>::RunWithType() {
kernel::Crop1D<T, Context>(dest->count(),
dim, ed[axis] - st[axis], inner_dim,
st[axis], Xdata, Ydata);
st[axis], Xdata, Ydata, ctx());
}
template <class Context>
......@@ -46,7 +46,7 @@ void CropOp<Context>::Setup() {
// make ends
ed.assign(Input(0).ndim(), 0);
keep_dims.resize(Input(0).ndim(), 0);
keep_dims.assign(Input(0).ndim(), 1);
if (shape.size() + shape_like.size() != 0) {
CHECK(shape.size() * shape_like.size() == 0)
<< "\nCan not set shape and shape_like both.";
......@@ -75,7 +75,6 @@ void CropOp<Context>::Setup() {
// static crop
int n_given = (int)GET_ARGUMENTS_SIZE(ends);
for (int i = 0; i < ed.size(); i++) {
keep_dims[i] = 1;
if (i < n_given) ed[i] = ends(i);
if (ed[i] == 0) ed[i] = Input(0).dim(i);
if (ed[i] == -1) { ed[i] = st[i] + 1; keep_dims[i] = 0; }
......@@ -125,7 +124,7 @@ void CropOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0));
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
// squeeze dimensions
vector<TIndex> squeeze_shape;
for (int i = 0; i < keep_dims.size(); i++)
......@@ -149,6 +148,7 @@ void CropOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
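    // a sketch of why the explicit sync sits here: with non-blocking streams the
    // kernels above are only queued, while the dest buffer may come from the
    // shared workspace cache; finishing the device computation before the swap
    // presumably guarantees the per-axis result is materialized before that
    // buffer is reused as the next destination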
if (process_axes.size() % 2 == 1) {
......@@ -160,7 +160,7 @@ void CropOp<Context>::RunOnDevice() {
// squeeze dimensions
vector<TIndex> squeeze_shape;
    for (int i = 0; i < keep_dims.size(); i++)
if (keep_dims[i]) squeeze_shape.push_back(Output(0)->dim(i));
Output(0)->Reshape(squeeze_shape);
}
......@@ -206,10 +206,10 @@ void CropGradientOp<Context>::RunWithType() {
if (dest == &navigator) {
dXdata = ws()->template caches<T, Context>({ dest->count() })[0];
} else { dXdata = dest->template mutable_data<T, Context>(); }
kernel::Crop1DGrad<T, Context>(dest->count(),
Input(0).dim(axis), dim, inner_dim,
st[axis], ed[axis], dYdata, dXdata);
st[axis], ed[axis], dYdata, dXdata, ctx());
}
template <class Context>
......@@ -226,10 +226,10 @@ void CropGradientOp<Context>::RunOnDevice() {
expand_shape[keep_axes[i]] = Input(-1).dim(i);
Input(-1).Reshape(expand_shape);
    // do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
return;
}
......@@ -248,6 +248,7 @@ void CropGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), int)) RunWithType<int>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "int32" });
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......
......@@ -12,11 +12,11 @@ void GatherOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::CanonicalAxis<int, Context>(
Input(1).count(), x_slice_dim, indices);
Input(1).count(), x_slice_dim, indices, ctx());
kernel::Gather<T, Context>(
Output(0)->count(), outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, Xdata, Ydata);
kernel::Gather<T, Context>(Output(0)->count(),
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
indices, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -46,13 +46,18 @@ template <class Context> template <typename T>
void GatherGradientOp<Context>::RunWithType() {
auto* indices = Input(1).template data<int, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (!acc_grad) math::Set<T, Context>(Output(0)->count(), 0, dXdata);
T* dXdata = nullptr;
if (!acc_grad) {
dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
} else {
dXdata = Output(0)->template mutable_data<T, Context>(ctx());
}
kernel::GatherGrad<T, Context>(
Input(-1).count(), outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dYdata, dXdata);
kernel::GatherGrad<T, Context>(Input(-1).count(),
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
indices, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -10,10 +10,10 @@ void OneHotOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(),
dragon_cast<T, float>(float(off_value)), Ydata);
dragon_cast<T, float>(float(off_value)), Ydata, ctx());
kernel::OneHot<T, Context>(Input(0).count(),
depth, on_value, Xdata, Ydata);
depth, on_value, Xdata, Ydata, ctx());
}
template <class Context>
......
......@@ -17,7 +17,7 @@ void PadOp<Context>::ConstRunWithType() {
kernel::ConstPad1D<T, Context>(dest->count(),
dim, dim + pad_l[axis] + pad_r[axis], inner_dim,
pad_l[axis], value, Xdata, Ydata);
pad_l[axis], value, Xdata, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -32,7 +32,7 @@ void PadOp<Context>::ReflectRunWithType() {
kernel::ReflectPad1D<T, Context>(dest->count(),
dim, dim + pad_l[axis] + pad_r[axis], inner_dim,
pad_l[axis], Xdata, Ydata);
pad_l[axis], Xdata, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -47,7 +47,7 @@ void PadOp<Context>::EdgeRunWithType() {
kernel::EdgePad1D<T, Context>(dest->count(),
dim, dim + pad_l[axis] + pad_r[axis], inner_dim,
pad_l[axis], Xdata, Ydata);
pad_l[axis], Xdata, Ydata, ctx());
}
template <class Context>
......@@ -61,7 +61,7 @@ void PadOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0));
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
return;
}
......@@ -99,6 +99,7 @@ void PadOp<Context>::RunOnDevice() {
} else {
LOG(FATAL) << "Unsupported padding mode: " << mode << ".";
}
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......@@ -127,7 +128,7 @@ void PadGradientOp<Context>::ConstRunWithType() {
kernel::ConstPad1DGrad<T, Context>(dest->count(),
dim - pad_l[axis] - pad_r[axis], dim, inner_dim,
pad_l[axis], dYdata, dXdata);
pad_l[axis], dYdata, dXdata, ctx());
}
template <class Context> template <typename T>
......@@ -140,11 +141,11 @@ void PadGradientOp<Context>::ReflectRunWithType() {
dXdata = ws()->template caches<T, Context>({ dest->count() })[0];
} else { dXdata = dest->template mutable_data<T, Context>(); }
math::Set<T, Context>(dest->count(), 0, dXdata);
math::Set<T, Context>(dest->count(), 0, dXdata, ctx());
kernel::ReflectPad1DGrad<T, Context>(source->count(),
dim - pad_l[axis] - pad_r[axis], dim, inner_dim,
pad_l[axis], dYdata, dXdata);
pad_l[axis], dYdata, dXdata, ctx());
}
template <class Context> template <typename T>
......@@ -157,11 +158,11 @@ void PadGradientOp<Context>::EdgeRunWithType() {
dXdata = ws()->template caches<T, Context>({ dest->count() })[0];
} else { dXdata = dest->template mutable_data<T, Context>(); }
math::Set<T, Context>(dest->count(), 0, dXdata);
math::Set<T, Context>(dest->count(), 0, dXdata, ctx());
kernel::EdgePad1DGrad<T, Context>(source->count(),
dim - pad_l[axis] - pad_r[axis], dim, inner_dim,
pad_l[axis], dYdata, dXdata);
pad_l[axis], dYdata, dXdata, ctx());
}
template <class Context>
......@@ -175,7 +176,7 @@ void PadGradientOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
return;
}
......@@ -213,6 +214,7 @@ void PadGradientOp<Context>::RunOnDevice() {
} else {
LOG(FATAL) << "Unsupported padding mode: " << mode << ".";
}
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......
......@@ -9,15 +9,15 @@ template <class Context> template <typename T>
void RandomPickOp<Context>::RunWithType() {
auto* indices = pick_indices->template mutable_data<int, CPUContext>();
for (int i = 0; i < pick_indices->count(); i++)
indices[i] = int((*ctx().rand_generator())() % x_slice_dim);
indices[i] = int((*ctx()->rand_generator())() % x_slice_dim);
auto* Xdata = Input(0).template data<T, Context>();
indices = pick_indices->template mutable_data<int, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Gather<T, Context>(
Output(0)->count(), outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, Xdata, Ydata);
kernel::Gather<T, Context>(Output(0)->count(),
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
indices, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -39,7 +39,7 @@ void RandomPickOp<Context>::RunOnDevice() {
if (Output(1)->name() != "ignore") {
Output(1)->ReshapeLike(*pick_indices);
Output(1)->template CopyFrom<Context>(*pick_indices);
Output(1)->template CopyFrom<Context>(*pick_indices, ctx());
}
}
......@@ -55,11 +55,11 @@ void RandomPickGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata);
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
kernel::GatherGrad<T, Context>(
Input(-1).count(), outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dYdata, dXdata);
kernel::GatherGrad<T, Context>(Input(-1).count(),
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
indices, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -8,14 +8,17 @@ namespace dragon {
template <class Context> template <typename T>
void ReduceOp<Context>::SumRunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
if (axis == -1) {
DECLARE_MULTIPLIER(multiplier, Input(0).count());
auto* Ydata = Output(0)->template mutable_data<T, CPUContext>();
Ydata[0] = math::Dot<T, Context>(
Input(0).count(), multiplier, Xdata, &ctx());
T result_host;
math::Dot<T, Context>(Input(0).count(),
multiplier, Xdata, &result_host, ctx());
ctx()->template Copy<T, Context, CPUContext>(
1, Ydata, &result_host);
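        // the full reduction is staged through a host scalar: math::Dot
        // presumably blocks until the dot product is ready, and the context
        // Copy then pushes it into Ydata so later kernels on this op's stream
        // observe the final value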
} else {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Sum<T, Context>(count, axis_dim, inner_dim, Xdata, Ydata);
kernel::Sum<T, Context>(count,
axis_dim, inner_dim, Xdata, Ydata, ctx());
}
}
......@@ -24,7 +27,7 @@ void ReduceOp<Context>::MeanRunWithType() {
SumRunWithType<T>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
T coeff = axis != -1 ? 1.0 / axis_dim : 1.0 / Input(0).count();
math::Scal<T, Context>(Output(0)->count(), coeff, Ydata, &ctx());
math::Scal<T, Context>(Output(0)->count(), coeff, Ydata, ctx());
}
template <class Context>
......@@ -62,11 +65,12 @@ void ReduceGradientOp<Context>::SumRunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
if (axis == -1) {
auto* dYdata = Input(-1).template data<T, CPUContext>();
math::Set<T, Context>(Output(0)->count(), dYdata[0], dXdata);
math::Set<T, Context>(Output(0)->count(),
dYdata[0], dXdata, ctx());
} else {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(count,
axis_dim, inner_dim, 1.0, dYdata, dXdata);
axis_dim, inner_dim, 1.0, dYdata, dXdata, ctx());
}
}
......@@ -76,11 +80,12 @@ void ReduceGradientOp<Context>::MeanRunWithType() {
if (axis == -1) {
auto* dYdata = Input(-1).template data<T, CPUContext>();
math::Set<T, Context>(Output(0)->count(),
dYdata[0] / Input(0).count(), dXdata);
dYdata[0] / Input(0).count(), dXdata, ctx());
} else {
auto* dYdata = Input(-1).template data<T, Context>();
kernel::SumGrad<T, Context>(count,
axis_dim, inner_dim, 1.0 / axis_dim, dYdata, dXdata);
axis_dim, inner_dim, 1.0 / axis_dim,
dYdata, dXdata, ctx());
}
}
......
......@@ -10,7 +10,7 @@ void RepeatOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Repeat<T, Context>(
Output(0)->count(), outer_dim, dim,
inner_dim, repeats(), Xdata, Ydata);
inner_dim, repeats(), Xdata, Ydata, ctx());
}
template <class Context>
......@@ -44,7 +44,7 @@ void RepeatGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::RepeatGrad<T, Context>(
Output(0)->count(), outer_dim, dim, inner_dim,
repeats(), dYdata, dXdata, &ctx());
repeats(), dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -10,8 +10,9 @@ void SliceOp<Context>::RunWithType() {
for (int i = 0; i < nout; i++) {
auto* Ydata = Output(i)->template mutable_data<T, Context>();
TIndex count = Output(i)->count();
kernel::Slice<T, Context>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, slice_offset, Xdata, Ydata);
kernel::Slice<T, Context>(count,
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
slice_offset, Xdata, Ydata, ctx());
slice_offset += y_slice_dim;
}
}
......@@ -46,8 +47,9 @@ void SliceGradientOp<Context>::RunWithType() {
if (Input(i + 1).name() == "ignore") continue;
auto* dYdata = Input(i + 1).template data<T, Context>();
TIndex count = Input(i + 1).count();
kernel::SliceGrad<T, Context>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, slice_offset, dYdata, dXdata);
kernel::SliceGrad<T, Context>(count,
outer_dim, inner_dim, x_slice_dim, y_slice_dim,
slice_offset, dYdata, dXdata, ctx());
slice_offset += y_slice_dim;
}
}
......
......@@ -14,7 +14,7 @@ void StackOp<Context>::RunWithType() {
kernel::Concat<T, Context>(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim,
concat_offset, Xdata, Ydata);
concat_offset, Xdata, Ydata, ctx());
concat_offset += x_concat_dim;
}
}
......@@ -59,7 +59,7 @@ void StackGradientOp<Context>::RunWithType() {
kernel::ConcatGrad<T, Context>(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim,
concat_offset, dYdata, dXdata);
concat_offset, dYdata, dXdata, ctx());
}
concat_offset += x_concat_dim;
}
......
......@@ -22,7 +22,7 @@ void TileOp<Context>::TileRunWithType() {
kernel::Tile<T, Context>(dest->count(),
outer_dim, ex_inner_dim,
multiple, Xdata, Ydata);
multiple, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -35,7 +35,7 @@ void TileOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(0));
Output(0)->template CopyFrom<Context>(Input(0));
Output(0)->template CopyFrom<Context>(Input(0), ctx());
return;
}
......@@ -48,6 +48,7 @@ void TileOp<Context>::RunOnDevice() {
axis = task.second; multiple = task.first;
if (XIsType(Input(0), float)) TileRunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......@@ -82,7 +83,7 @@ void TileGradientOp<Context>::TileRunWithType() {
kernel::TileGrad<T, Context>(
dest->count(), outer_dim, ex_inner_dim,
multiple, dYdata, dXdata, &ctx());
multiple, dYdata, dXdata, ctx());
}
template <class Context>
......@@ -96,7 +97,7 @@ void TileGradientOp<Context>::RunOnDevice() {
// do nothing
if (process_axes.size() == 0) {
Output(0)->ReshapeLike(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1));
Output(0)->template CopyFrom<Context>(Input(-1), ctx());
return;
}
......@@ -109,6 +110,7 @@ void TileGradientOp<Context>::RunOnDevice() {
axis = task.second; multiple = task.first;
if (XIsType(Input(0), float)) TileRunWithType<float>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
ctx()->FinishDeviceCompution();
    // use the buffer to protect X if the number of tasks >= 2
std::swap(source, dest);
if (process_axes.size() % 2 == 1) {
......
......@@ -14,7 +14,7 @@ void TransposeOp<Context>::RunWithType() {
kernel::Transpose<T, Context>(
Output(0)->count(), (int)Output(0)->ndim(),
ORdata, OSdata, NSdata, Xdata, Ydata);
ORdata, OSdata, NSdata, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -75,7 +75,7 @@ void TransposeGradientOp<Context>::RunWithType() {
kernel::TransposeGrad<T, Context>(
Input(-1).count(), order->count(),
ORdata, OSdata, NSdata, dYdata, dXdata);
ORdata, OSdata, NSdata, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -20,23 +20,23 @@ void BatchNormOp<Context>::TrainingRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, Xdata, MXmult,
0, NCdata, &ctx());
0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0, Tmean, &ctx());
0, Tmean, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, Xdata, MXmult,
0, Tmean, &ctx());
0, Tmean, ctx());
}
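    // a worked view of the reduction trick used throughout these ops, assuming
    // MXmult is a multiplier tensor filled with ones:
    //   NCHW: Gemv(NoTrans, NC x S) * ones -> per-(n, c) sums; then
    //         Gemv(Trans,   N  x C) * ones -> per-channel sums, scaled by 1 / NS
    //   NHWC: a single Gemv(Trans, NS x C) * ones reduces straight to channels
    // the Gemm calls below invert this: multiplying ones by the C-vector of
    // statistics broadcasts it back over the full N * C * S layout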
// subtract mean
......@@ -45,37 +45,37 @@ void BatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
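    // the mean has already been subtracted from Ydata, so squaring here and
    // reducing below gives E((X - EX)^2) directly; the algebraically equal
    // E(X^2) - (EX)^2 is avoided, presumably because cancellation can push it
    // slightly negative in floating point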
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
}
// compute moving average
......@@ -92,21 +92,21 @@ void BatchNormOp<Context>::TrainingRunWithType() {
float coeff = m > 1 ? float(m) / (m - 1) : 1;
// History(X) = Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean.count(),
1.0, Tmean, momentum, Hmean, &ctx());
1.0, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
coeff, Tvar, momentum, Hvar, &ctx());
coeff, Tvar, momentum, Hvar, ctx());
} else {
// History(X) = (1 - momentum) * Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean.count(),
1.0 - momentum, Tmean, momentum, Hmean, &ctx());
1.0 - momentum, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
1.0 - momentum, Tvar, momentum, Hvar, &ctx());
1.0 - momentum, Tvar, momentum, Hvar, ctx());
}
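    // a quick check on the two schedules above: the first branch keeps Caffe-style
    // raw accumulators (Cur(X) + momentum * History(X)) and rescales the batch
    // variance by coeff = m / (m - 1), i.e. Bessel's correction, so the stored
    // history is the unbiased estimate (assuming m counts the N * S samples each
    // channel statistic averages over); the second branch is a plain EMA with
    // weight (1 - momentum)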
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -114,20 +114,21 @@ void BatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -145,7 +146,7 @@ void BatchNormOp<Context>::InferenceRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
// scale the mean and variance if necessary
if (mode == "CAFFE") {
......@@ -156,12 +157,12 @@ void BatchNormOp<Context>::InferenceRunWithType() {
const float factor = dragon_cast<float, T>(hFact_data[0]);
const float scale = factor == 0 ? 0 : 1.0 / factor;
math::Scale<T, Context>(mean.count(),
scale, Hmean, Tmean, &ctx());
scale, Hmean, Tmean, ctx());
math::Scale<T, Context>(var->count(),
scale, Hvar, Tvar, &ctx());
scale, Hvar, Tvar, ctx());
} else {
ctx().template Copy<T, Context, Context>(mean.count(), Tmean, Hmean);
ctx().template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
ctx()->template Copy<T, Context, Context>(mean.count(), Tmean, Hmean);
ctx()->template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
}
// subtract mean
......@@ -170,23 +171,23 @@ void BatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -194,20 +195,21 @@ void BatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
template <class Context>
......@@ -246,10 +248,7 @@ void BatchNormOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(BatchNorm);
......@@ -273,97 +272,100 @@ void BatchNormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
auto* Ydata = Input(1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), Ydata, dYdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dYdata, dXdata, ctx());
// sum(dE/dY \cdot Y)
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dXdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dXdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(Output(0)->count(), Ydata, dXdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dXdata, dXdata, ctx());
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
}
    // dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(Output(0)->count(),
1.0, dYdata, -1.0 / NS, dXdata, &ctx());
1.0, dYdata, -1.0 / NS, dXdata, ctx());
// divide by stddev
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
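    // putting the pieces together, with Y = (X - mean) / std (the forward output
    // read from Input(1)) this computes
    //   dX = (dY - mean(dY) - Y * mean(dY \cdot Y)) / std
    // the Axpby forms dY - (1 / NS) * (sum(dY) + Y * sum(dY \cdot Y)), and the
    // Div divides by the std values that WSdata was filled with at the top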
}
template <class Context> template <typename T>
......@@ -381,21 +383,22 @@ void BatchNormGradientOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), dYdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dYdata, WSdata, dXdata, ctx());
}
template <class Context>
......@@ -430,10 +433,7 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(BatchNormGradient);
......
......@@ -20,7 +20,7 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
auto* Td = d.template mutable_data<T, Context>();
auto* Tr = r->template mutable_data<T, Context>();
......@@ -35,11 +35,11 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
auto* hFact_data = Input(3).template mutable_data<T, CPUContext>();
const float factor = dragon_cast<float, T>(hFact_data[0]);
const float scale = factor == 0 ? 0 : 1.0 / factor;
math::Scale<T, Context>(mean.count(), scale, Hmean, THmean, &ctx());
math::Scale<T, Context>(mean.count(), scale, Hvar, THvar, &ctx());
math::Scale<T, Context>(mean.count(), scale, Hmean, THmean, ctx());
math::Scale<T, Context>(mean.count(), scale, Hvar, THvar, ctx());
} else {
ctx().template Copy<T, Context, Context>(mean.count(), THmean, Hmean);
ctx().template Copy<T, Context, Context>(var->count(), THvar, Hvar);
ctx()->template Copy<T, Context, Context>(mean.count(), THmean, Hmean);
ctx()->template Copy<T, Context, Context>(var->count(), THvar, Hvar);
}
// compute mean
......@@ -47,16 +47,16 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, Xdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, Xdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
}
// subtract mean
......@@ -65,37 +65,37 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
}
// compute moving average
......@@ -112,21 +112,21 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
float coeff = m > 1 ? float(m) / (m - 1) : 1;
// History(X) = Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean.count(),
1.0, Tmean, momentum, Hmean, &ctx());
1.0, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
coeff, Tvar, momentum, Hvar, &ctx());
coeff, Tvar, momentum, Hvar, ctx());
} else {
// History(X) = (1 - momentum) * Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean.count(),
1.0 - momentum, Tmean, momentum, Hmean, &ctx());
1.0 - momentum, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
1.0 - momentum, Tvar, momentum, Hvar, &ctx());
1.0 - momentum, Tvar, momentum, Hvar, ctx());
}
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -134,35 +134,36 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// compute renorm
if (!is_recomputing) {
// compute history stddev
math::AddScalar<T, Context>(var->count(), eps, THvar);
math::Sqrt<T, Context>(var->count(), THvar, THvar);
math::AddScalar<T, Context>(var->count(), eps, THvar, ctx());
math::Sqrt<T, Context>(var->count(), THvar, THvar, ctx());
// compute r
math::Div<T, Context>(var->count(), Tvar, THvar, Tr);
math::Clip<T, Context>(var->count(), 1.0 / t_r_max, t_r_max, Tr);
math::Div<T, Context>(var->count(), Tvar, THvar, Tr, ctx());
math::Clip<T, Context>(var->count(), 1.0 / t_r_max, t_r_max, Tr, ctx());
// compute d
math::Sub<T, Context>(mean.count(), Tmean, THmean, Td);
math::Div<T, Context>(mean.count(), Td, THvar, Td);
math::Clip<T, Context>(mean.count(), -t_d_max, t_d_max, Td);
math::Sub<T, Context>(mean.count(), Tmean, THmean, Td, ctx());
math::Div<T, Context>(mean.count(), Td, THvar, Td, ctx());
math::Clip<T, Context>(mean.count(), -t_d_max, t_d_max, Td, ctx());
// update the bound of r & d
t_r_max = r_max / (1.0 + (r_max - 1.0) * exp(-t_val));
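        // in batch renormalization terms (Ioffe, 2017) the block above computes
        //   r = clip(std_batch / std_history, [1 / t_r_max, t_r_max])
        //   d = clip((mean_batch - mean_history) / std_history, [-t_d_max, t_d_max])
        // and the corrected output applied below is y = x_hat * r + d; the update
        // here presumably anneals the clipping bounds from ~1 / ~0 towards
        // r_max / d_max as t_val grows during training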
......@@ -173,7 +174,7 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
// apply renorm
// store x_norm for backward
auto* XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), XNorm_data, Ydata);
// correction: mul by r
......@@ -182,20 +183,21 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tr,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tr,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// correction: add by d
if (data_format == "NCHW") {
......@@ -203,18 +205,18 @@ void BatchRenormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Td,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Td,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -233,7 +235,7 @@ void BatchRenormOp<Context>::InferenceRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
// scale the mean and variance if necessary
if (mode == "CAFFE") {
......@@ -243,11 +245,11 @@ void BatchRenormOp<Context>::InferenceRunWithType() {
auto* hFact_data = Input(3).template mutable_data<T, CPUContext>();
const float factor = dragon_cast<float, T>(hFact_data[0]);
const float scale = factor == 0 ? 0 : 1.0 / factor;
math::Scale<T, Context>(mean.count(), scale, Hmean, Tmean, &ctx());
math::Scale<T, Context>(var->count(), scale, Hvar, Tvar, &ctx());
math::Scale<T, Context>(mean.count(), scale, Hmean, Tmean, ctx());
math::Scale<T, Context>(var->count(), scale, Hvar, Tvar, ctx());
} else {
ctx().template Copy<T, Context, Context>(mean.count(), Tmean, Hmean);
ctx().template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
ctx()->template Copy<T, Context, Context>(mean.count(), Tmean, Hmean);
ctx()->template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
}
// subtract mean
......@@ -256,22 +258,22 @@ void BatchRenormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans, NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -279,20 +281,21 @@ void BatchRenormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
template <class Context>
......@@ -366,93 +369,96 @@ void BatchRenormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tr,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
    } else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tr,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), dYdata, WSdata, WSdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, WSdata, WSdata, ctx());
// sum(dE/dY \cdot Y)
math::Mul<T, Context>(Output(0)->count(), XNorm_data, WSdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
XNorm_data, WSdata, dXdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dXdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dXdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tmean,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(Output(0)->count(), XNorm_data, dXdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
XNorm_data, dXdata, dXdata, ctx());
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, WSdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tmean,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
}
    // dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(Output(0)->count(),
1.0, WSdata, -1.0 / NS, dXdata, &ctx());
1.0, WSdata, -1.0 / NS, dXdata, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -460,21 +466,24 @@ void BatchRenormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
ctx()->FinishDeviceCompution();
x_norm->Reset();
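    // x_norm caches the forward normalization result; synchronizing right before
    // the Reset presumably ensures the queued kernels have consumed it before
    // its storage is released for reuse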
}
......@@ -493,21 +502,22 @@ void BatchRenormGradientOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), dYdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dYdata, WSdata, dXdata, ctx());
}
template <class Context>
......
......@@ -10,6 +10,8 @@ namespace dragon {
template <class Context> template <typename T>
void CuDNNBatchNormOp<Context>::RunWithType() {
typedef typename CUDNNType<T>::BNParamType BNParamType;
// determine the bn desc
if (Input(0).ndim() == 2) {
bn_mode = CUDNN_BATCHNORM_PER_ACTIVATION;
......@@ -22,7 +24,7 @@ void CuDNNBatchNormOp<Context>::RunWithType() {
<< "The number of dimensions should be at least 3.";
bn_mode = CUDNN_BATCHNORM_SPATIAL;
#if CUDNN_VERSION_MIN(7, 0, 0)
        if (!this->use_global_stats)
bn_mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
#endif
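    // mode selection sketch: 2-d inputs use PER_ACTIVATION statistics, higher
    // ranks use SPATIAL (one statistic per channel); SPATIAL_PERSISTENT is the
    // faster cuDNN 7 variant and is restricted here to training, as it only
    // applies to the ForwardTraining / Backward paths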
if (data_format == "NCHW") {
......@@ -54,32 +56,32 @@ void CuDNNBatchNormOp<Context>::RunWithType() {
// derive the bn desc
CUDNN_CHECK(cudnnDeriveBNTensorDescriptor(bn_desc, input_desc, bn_mode));
TENSOR_FILL(Input(1), vector<TIndex>(1, C)); // history_mean
TENSOR_FILL(Input(2), vector<TIndex>(1, C)); // history_var
TENSOR_FILL(Input(3), vector<TIndex>(1, C)); // scale
TENSOR_FILL(Input(4), vector<TIndex>(1, C)); // bias
TENSOR_FILL_WITH_TYPE(Input(1), vector<TIndex>(1, C), BNParamType); // history_mean
TENSOR_FILL_WITH_TYPE(Input(2), vector<TIndex>(1, C), BNParamType); // history_var
TENSOR_FILL_WITH_TYPE(Input(3), vector<TIndex>(1, C), BNParamType); // scale
TENSOR_FILL_WITH_TYPE(Input(4), vector<TIndex>(1, C), BNParamType); // bias
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Hmean = Input(1).template mutable_data<T, Context>();
auto* Hvar = Input(2).template mutable_data<T, Context>();
auto* Sdata = Input(3).template data<T, Context>();
auto* Bdata = Input(4).template data<T, Context>();
auto* Hmean = Input(1).template mutable_data<BNParamType, Context>();
auto* Hvar = Input(2).template mutable_data<BNParamType, Context>();
auto* Sdata = Input(3).template data<BNParamType, Context>();
auto* Bdata = Input(4).template data<BNParamType, Context>();
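    // scale / bias / running statistics use BNParamType rather than T on purpose:
    // cuDNN keeps the batch-norm parameter tensors in float even when the data
    // tensors are float16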
if (this->use_global_stats) {
CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
ctx().cudnn_handle(), bn_mode,
ctx()->cudnn_handle(), bn_mode,
CUDNNType<T>::one, CUDNNType<T>::zero,
input_desc, Xdata, output_desc, Ydata,
bn_desc, Sdata, Bdata,
Hmean, Hvar, eps64));
} else {
auto* Tmean = mean->template mutable_data<T, Context>();
auto* Tvar = var->template mutable_data<T, Context>();
auto* Tmean = mean->template mutable_data<BNParamType, Context>();
auto* Tvar = var->template mutable_data<BNParamType, Context>();
auto mt = this->is_recomputing ? 0.0 : 1.0 - this->momentum;
CUDNN_CHECK(cudnnBatchNormalizationForwardTraining(
ctx().cudnn_handle(), bn_mode,
ctx()->cudnn_handle(), bn_mode,
CUDNNType<T>::one, CUDNNType<T>::zero,
input_desc, Xdata, output_desc, Ydata,
bn_desc, Sdata, Bdata,
......@@ -131,7 +133,10 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() {
#endif
}
REGISTER_CUDNN_OPERATOR(FusedBatchNorm, CuDNNBatchNormOp<CUDAContext>);
REGISTER_CUDNN_OPERATOR(
FusedBatchNorm,
CuDNNBatchNormOp<CUDAContext>
);
INSTANTIATE_CUDNN_OPERATOR(BatchNorm);
template <class Context>
......@@ -169,6 +174,8 @@ void CuDNNBatchNormGradientOp<Context>::Setup() {
template <class Context> template <typename T>
void CuDNNBatchNormGradientOp<Context>::TrainingRunWithType() {
typedef typename CUDNNType<T>::BNParamType BNParamType;
// determine the bn desc
if (Input(0).ndim() == 2) {
bn_mode = CUDNN_BATCHNORM_PER_ACTIVATION;
......@@ -218,14 +225,14 @@ void CuDNNBatchNormGradientOp<Context>::TrainingRunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
auto* Xdata = Input(0).template data<T, Context>();
auto* Sdata = Input(3).template data<T, Context>();
auto* dSdata = Output(1)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* Tmean = mean->template data<T, Context>();
auto* Tvar = var->template data<T, Context>();
auto* Sdata = Input(3).template data<BNParamType, Context>();
auto* dSdata = Output(1)->template mutable_data<BNParamType, Context>();
auto* dBdata = Output(2)->template mutable_data<BNParamType, Context>();
auto* Tmean = mean->template data<BNParamType, Context>();
auto* Tvar = var->template data<BNParamType, Context>();
CUDNN_CHECK(cudnnBatchNormalizationBackward(
ctx().cudnn_handle(), bn_mode,
ctx()->cudnn_handle(), bn_mode,
CUDNNType<T>::one, CUDNNType<T>::zero,
CUDNNType<T>::one, CUDNNType<T>::one,
output_desc, Xdata, input_desc, dYdata,
......@@ -256,16 +263,16 @@ void CuDNNBatchNormGradientOp<Context>::InferenceRunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
}
......@@ -275,12 +282,12 @@ void CuDNNBatchNormGradientOp<Context>::InferenceRunWithType() {
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
// compute stddev
ctx().template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
math::AddScalar<T, Context>(var->count(), this->eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
ctx()->template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
math::AddScalar<T, Context>(var->count(), this->eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide scale by stddev
math::Div<T, Context>(var->count(), Sdata, Tvar, Tvar);
math::Div<T, Context>(var->count(), Sdata, Tvar, Tvar, ctx());
    // compute dE/dY \cdot (scale / std(X))
if (data_format == "NCHW") {
......@@ -288,20 +295,21 @@ void CuDNNBatchNormGradientOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), dYdata, WSdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, WSdata, dXdata, ctx());
}
}
......@@ -314,8 +322,10 @@ void CuDNNBatchNormGradientOp<Context>::RunOnDevice() {
if (this->use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (this->use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
if (this->use_global_stats) {
// fp16 is disabled during inference
LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
} else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
#else
if (XIsType(Input(0), float)) {
......@@ -325,7 +335,10 @@ void CuDNNBatchNormGradientOp<Context>::RunOnDevice() {
#endif
}
REGISTER_CUDNN_OPERATOR(FusedBatchNormGradient, CuDNNBatchNormGradientOp<CUDAContext>);
REGISTER_CUDNN_OPERATOR(
FusedBatchNormGradient,
CuDNNBatchNormGradientOp<CUDAContext>
);
INSTANTIATE_CUDNN_OPERATOR(BatchNormGradient);
} // namespace dragon
......
......@@ -24,23 +24,23 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, Xdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, Xdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
}
// subtract mean
......@@ -49,51 +49,51 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / NS, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0 / NS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
}
// compute moving average
if (!is_recomputing) {
// History(X) = (1 - momentum) * Cur(X) + momentum * History(X)
math::Axpby<T, Context>(mean->count(),
1.0 - momentum, Tmean, momentum, Hmean, &ctx());
1.0 - momentum, Tmean, momentum, Hmean, ctx());
math::Axpby<T, Context>(var->count(),
1.0 - momentum, Tvar, momentum, Hvar, &ctx());
1.0 - momentum, Tvar, momentum, Hvar, ctx());
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -101,24 +101,25 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// store x_norm for backward
auto* XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), XNorm_data, Ydata);
// scale
......@@ -127,20 +128,21 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// shift
if (data_format == "NCHW") {
......@@ -148,18 +150,18 @@ void FusedBatchNormOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Bdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -182,9 +184,9 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx().template Copy<T, Context, Context>(mean->count(), Tmean, Hmean);
ctx().template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
ctx()->template Copy<T, Context, Context>(Input(0).count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(mean->count(), Tmean, Hmean);
ctx()->template Copy<T, Context, Context>(var->count(), Tvar, Hvar);
// subtract mean
if (data_format == "NCHW") {
......@@ -192,23 +194,23 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
-1.0, MXmult, Tmean,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -216,20 +218,21 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// scale
if (data_format == "NCHW") {
......@@ -237,20 +240,21 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// shift
if (data_format == "NCHW") {
......@@ -258,18 +262,18 @@ void FusedBatchNormOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Bdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -312,10 +316,7 @@ void FusedBatchNormOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
......@@ -341,21 +342,22 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
// gradient w.r.t. scale
if (Output(1)->name() != "ignore") {
auto* dSdata = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(x_norm->count(), XNorm_data, dYdata, WSdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, dYdata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dSdata, &ctx());
1.0, dSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, WSdata, MXmult,
1.0, dSdata, &ctx());
1.0, dSdata, ctx());
}
}
......@@ -366,16 +368,16 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
}
......@@ -387,37 +389,39 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(x_norm->count(), WSdata, dYdata, WSdata);
math::Mul<T, Context>(x_norm->count(),
WSdata, dYdata, WSdata, ctx());
// sum of x_hat * (dl / dx_hat)
math::Mul<T, Context>(x_norm->count(), XNorm_data, WSdata, dXdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, WSdata, dXdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dXdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dXdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
}
// x_hat times the sum
......@@ -426,54 +430,55 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tmean,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
}
math::Mul<T, Context>(x_norm->count(), XNorm_data, dXdata, dXdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, dXdata, dXdata, ctx());
// subtract the average of x_hat times the sum
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tmean,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, WSdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tmean,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
}
math::Axpby<T, Context>(x_norm->count(),
1.0, WSdata, -1.0 / NS, dXdata, &ctx());
1.0, WSdata, -1.0 / NS, dXdata, ctx());
// multiply with the inverse std
if (data_format == "NCHW") {
......@@ -481,21 +486,22 @@ void FusedBatchNormGradientOp<Context>::TrainingRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
// divide by stddev
math::Div<T, Context>(x_norm->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(x_norm->count(),
dXdata, WSdata, dXdata, ctx());
}
}
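// A per-channel sketch of the gradient the reductions above assemble, assuming the
// standard batch-norm backward formula (names are illustrative; M = N * S elements
// reduced per channel, gamma_over_std = scale / sqrt(var + eps)):
//   dX = (gamma / std) * (dY - mean(dY) - x_hat * mean(dY * x_hat))
#include <cstddef>
inline void BatchNormBackwardChannel(
    size_t M, float gamma_over_std,
    const float* dy, const float* x_hat, float* dx) {
    double sum_dy = 0.0, sum_dy_xhat = 0.0;
    for (size_t i = 0; i < M; ++i) {
        sum_dy += dy[i];
        sum_dy_xhat += dy[i] * x_hat[i];
    }
    const double mean_dy = sum_dy / M;
    const double mean_dy_xhat = sum_dy_xhat / M;
    for (size_t i = 0; i < M; ++i)
        dx[i] = (float)(gamma_over_std *
            (dy[i] - mean_dy - x_hat[i] * mean_dy_xhat));
}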
......@@ -519,16 +525,16 @@ void FusedBatchNormGradientOp<Context>::InferenceRunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
}
......@@ -538,7 +544,7 @@ void FusedBatchNormGradientOp<Context>::InferenceRunWithType() {
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
// divide scale by stddev
math::Div<T, Context>(var->count(), Sdata, Tvar, Tvar);
math::Div<T, Context>(var->count(), Sdata, Tvar, Tvar, ctx());
// compute dE/dY \cdot (scale / std(X))
if (data_format == "NCHW") {
......@@ -546,20 +552,21 @@ void FusedBatchNormGradientOp<Context>::InferenceRunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Tvar,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Tvar,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), dYdata, WSdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
dYdata, WSdata, dXdata, ctx());
}
}
......@@ -599,10 +606,7 @@ void FusedBatchNormGradientOp<Context>::RunOnDevice() {
if (XIsType(Input(0), float)) {
if (use_global_stats) InferenceRunWithType<float>();
else TrainingRunWithType<float>();
} else if (XIsType(Input(0), float16)) {
if (use_global_stats) InferenceRunWithType<float16>();
else TrainingRunWithType<float16>();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(FusedBatchNormGradient);
......
......@@ -21,14 +21,14 @@ void FusedGroupNormOp<Context>::RunWithType() {
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0 / CGS, Xdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
......@@ -39,26 +39,26 @@ void FusedGroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
-1.0, Tmean, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0 / CGS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -66,15 +66,16 @@ void FusedGroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// store x_norm for backward
auto* XNorm_data = x_norm->template mutable_data<T, Context>();
ctx().template Copy<T, Context, Context>(
ctx()->template Copy<T, Context, Context>(
Output(0)->count(), XNorm_data, Ydata);
// scale
......@@ -83,20 +84,21 @@ void FusedGroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
// shift
if (data_format == "NCHW") {
......@@ -104,18 +106,18 @@ void FusedGroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Bdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Bdata,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
}
}
......@@ -157,8 +159,7 @@ void FusedGroupNormOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
......@@ -184,21 +185,22 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
// gradient w.r.t. scale
if (Output(1)->name() != "ignore") {
auto* dSdata = Output(1)->template mutable_data<T, Context>();
math::Mul<T, Context>(x_norm->count(), XNorm_data, dYdata, WSdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, dYdata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, WSdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dSdata, &ctx());
1.0, dSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, WSdata, MXmult,
1.0, dSdata, &ctx());
1.0, dSdata, ctx());
}
}
......@@ -209,16 +211,16 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, MXmult,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemv<T, Context>(
CblasTrans, N, C,
1.0, NCdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, NS, C,
1.0, dYdata, MXmult,
1.0, dBdata, &ctx());
1.0, dBdata, ctx());
}
}
......@@ -230,28 +232,30 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
N, C, 1,
1.0, MXmult, Sdata,
0.0, NCdata, &ctx());
0.0, NCdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, NCdata, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NS, C, 1,
1.0, MXmult, Sdata,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
}
math::Mul<T, Context>(x_norm->count(), WSdata, dYdata, WSdata);
math::Mul<T, Context>(x_norm->count(),
WSdata, dYdata, WSdata, ctx());
// sum of x_hat * (dl / dx_hat)
math::Mul<T, Context>(x_norm->count(), XNorm_data, WSdata, dXdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, WSdata, dXdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0, dXdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
......@@ -262,28 +266,29 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tmean, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
math::Mul<T, Context>(x_norm->count(), XNorm_data, dXdata, dXdata);
math::Mul<T, Context>(x_norm->count(),
XNorm_data, dXdata, dXdata, ctx());
// subtract the average of x_hat times the sum
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0, WSdata, MXmult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tmean, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
math::Axpby<T, Context>(x_norm->count(),
1.0, WSdata, -1.0 / CGS, dXdata, &ctx());
1.0, WSdata, -1.0 / CGS, dXdata, ctx());
// multiply with the inverse std
if (data_format == "NCHW") {
......@@ -291,12 +296,13 @@ void FusedGroupNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// divide by stddev
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
}
}
......@@ -337,8 +343,7 @@ void FusedGroupNormGradientOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(FusedGroupNormGradient);
......
......@@ -15,14 +15,14 @@ void GroupNormOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* NCdata = nc.template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0 / CGS, Xdata, MXmult,
0, Tmean, &ctx());
0, Tmean, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
......@@ -33,26 +33,26 @@ void GroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
-1.0, Tmean, MXmult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0 / CGS, WSdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -60,11 +60,12 @@ void GroupNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
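// A standalone sketch of the normalization above for one (sample, group) slice of
// CGS = (C / G) * S contiguous values (NCHW layout), using the biased estimate
// VAR(X) = E((X - EX)^2) as noted in the comments; names are illustrative.
#include <cmath>
#include <cstddef>
inline void GroupNormSlice(size_t CGS, float eps, const float* x, float* y) {
    double mean = 0.0;
    for (size_t i = 0; i < CGS; ++i) mean += x[i];
    mean /= CGS;
    double var = 0.0;
    for (size_t i = 0; i < CGS; ++i) {
        const double d = x[i] - mean;
        var += d * d;
    }
    var /= CGS;
    const float inv_std = 1.f / std::sqrt((float)var + eps);
    for (size_t i = 0; i < CGS; ++i)
        y[i] = (float)((x[i] - mean) * inv_std);
}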
template <class Context>
......@@ -102,8 +103,7 @@ void GroupNormOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(GroupNorm);
......@@ -127,43 +127,45 @@ void GroupNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
auto* Ydata = Input(1).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), Ydata, dYdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dYdata, dXdata, ctx());
// sum(dE/dY \cdot Y)
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0, dXdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(Output(0)->count(), Ydata, dXdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dXdata, dXdata, ctx());
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NG, CGS,
1.0, dYdata, MXmult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NG, CGS, 1,
1.0, Tvar, MXmult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
NOT_IMPLEMENTED;
}
......@@ -171,10 +173,11 @@ void GroupNormGradientOp<Context>::RunWithType() {
// dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(Output(0)->count(),
1.0, dYdata, -1.0 / CGS, dXdata, &ctx());
1.0, dYdata, -1.0 / CGS, dXdata, ctx());
// divide by stddev
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
}
template <class Context>
......@@ -210,8 +213,7 @@ void GroupNormGradientOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(GroupNormGradient);
......
......@@ -14,14 +14,14 @@ void InstanceNormOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* WSdata = ws()->template caches<T, Context>({ Input(0).count() })[0];
ctx().template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
ctx()->template Copy<T, Context, Context>(Output(0)->count(), Ydata, Xdata);
// compute mean
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / S, Xdata, Smult,
0.0, Tmean, &ctx());
0.0, Tmean, ctx());
} else if (data_format == "NHWC") {
auto* x = Xdata;
auto* tm = Tmean;
......@@ -29,7 +29,7 @@ void InstanceNormOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasTrans, S, C,
1.0 / S, x, Smult,
0.0, tm, &ctx());
0.0, tm, ctx());
x += CS;
tm += C;
}
......@@ -41,7 +41,7 @@ void InstanceNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NC, S, 1,
-1.0, Tmean, Smult,
1.0, Ydata, &ctx());
1.0, Ydata, ctx());
} else if (data_format == "NHWC") {
auto* y = Ydata;
auto* tm = Tmean;
......@@ -50,7 +50,7 @@ void InstanceNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
S, C, 1,
-1.0, Smult, tm,
1.0, y, &ctx());
1.0, y, ctx());
y += CS;
tm += C;
}
......@@ -58,12 +58,12 @@ void InstanceNormOp<Context>::RunWithType() {
// compute variance
// note that we use VAR(X) = E((X - EX) ^ 2)
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata);
math::Square<T, Context>(Output(0)->count(), Ydata, WSdata, ctx());
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0 / S, WSdata, Smult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
} else if (data_format == "NHWC") {
auto* x2 = WSdata;
auto* tv = Tvar;
......@@ -71,15 +71,15 @@ void InstanceNormOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasTrans, S, C,
1.0 / S, x2, Smult,
0.0, tv, &ctx());
0.0, tv, ctx());
x2 += CS;
tv += C;
}
}
// compute stddev
math::AddScalar<T, Context>(var->count(), eps, Tvar);
math::Sqrt<T, Context>(var->count(), Tvar, Tvar);
math::AddScalar<T, Context>(var->count(), eps, Tvar, ctx());
math::Sqrt<T, Context>(var->count(), Tvar, Tvar, ctx());
// divide by stddev
if (data_format == "NCHW") {
......@@ -87,7 +87,7 @@ void InstanceNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, Tvar, Smult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
auto* std = WSdata;
auto* tv = Tvar;
......@@ -96,12 +96,13 @@ void InstanceNormOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
S, C, 1,
1.0, Smult, tv,
0.0, std, &ctx());
0.0, std, ctx());
std += CS;
tv += C;
}
}
math::Div<T, Context>(Output(0)->count(), Ydata, WSdata, Ydata);
math::Div<T, Context>(Output(0)->count(),
Ydata, WSdata, Ydata, ctx());
}
template <class Context>
......@@ -133,8 +134,7 @@ void InstanceNormOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(InstanceNorm);
......@@ -157,7 +157,7 @@ void InstanceNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, Tvar, Smult,
0.0, WSdata, &ctx());
0.0, WSdata, ctx());
} else if (data_format == "NHWC") {
auto* std = WSdata;
auto* tv = Tvar;
......@@ -166,26 +166,27 @@ void InstanceNormGradientOp<Context>::RunWithType() {
CblasNoTrans, CblasNoTrans,
S, C, 1,
1.0, Smult, tv,
0.0, std, &ctx());
0.0, std, ctx());
std += CS;
tv += C;
}
}
auto* Ydata = Input(-2).template data<T, Context>();
math::Mul<T, Context>(Output(0)->count(), Ydata, dYdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dYdata, dXdata, ctx());
// sum(dE/dY \cdot Y)
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dXdata, Smult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, Tvar, Smult,
0.0, dXdata, &ctx());
0.0, dXdata, ctx());
} else if (data_format == "NHWC") {
for (int i = 0; i < N; i++) {
auto* dx = dXdata;
......@@ -194,12 +195,12 @@ void InstanceNormGradientOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasTrans, S, C,
1.0, dx, Smult,
0, tv, &ctx());
0, tv, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
S, C, 1,
1.0, Smult, tv,
0.0, dx, &ctx());
0.0, dx, ctx());
dx += CS;
tv += C;
}
......@@ -207,19 +208,20 @@ void InstanceNormGradientOp<Context>::RunWithType() {
}
// sum(dE/dY \cdot Y) \cdot Y
math::Mul<T, Context>(Output(0)->count(), Ydata, dXdata, dXdata);
math::Mul<T, Context>(Output(0)->count(),
Ydata, dXdata, dXdata, ctx());
// sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, NC, S,
1.0, dYdata, Smult,
0.0, Tvar, &ctx());
0.0, Tvar, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
NC, S, 1,
1.0, Tvar, Smult,
1.0, dXdata, &ctx());
1.0, dXdata, ctx());
} else if (data_format == "NHWC") {
for (int i = 0; i < N; i++) {
auto* dy = dYdata;
......@@ -229,12 +231,12 @@ void InstanceNormGradientOp<Context>::RunWithType() {
math::Gemv<T, Context>(
CblasTrans, S, C,
1.0, dy, Smult,
0, tv, &ctx());
0, tv, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
S, C, 1,
1.0, Smult, tv,
1.0, dx, &ctx());
1.0, dx, ctx());
dy += CS;
dx += CS;
tv += C;
......@@ -245,10 +247,11 @@ void InstanceNormGradientOp<Context>::RunWithType() {
// dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y
// = dE/dY - mean(sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y)
math::Axpby<T, Context>(Output(0)->count(),
1.0, dYdata, -1.0 / S, dXdata, &ctx());
1.0, dYdata, -1.0 / S, dXdata, ctx());
// divide by stddev
math::Div<T, Context>(Output(0)->count(), dXdata, WSdata, dXdata);
math::Div<T, Context>(Output(0)->count(),
dXdata, WSdata, dXdata, ctx());
}
template <class Context>
......@@ -279,8 +282,7 @@ void InstanceNormGradientOp<Context>::RunOnDevice() {
Setup();
if (XIsType(Input(0), float)) RunWithType<float>();
else if (XIsType(Input(0), float16)) RunWithType<float16>();
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
}
DEPLOY_CPU(InstanceNormGradient);
......
......@@ -24,35 +24,28 @@ void L2NormOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
auto* Bdata = ws()->template caches<T, Context>({ buffer.count() })[0];
auto* Ndata = norm->template mutable_data<T, Context>();
math::Set<T, Context>(norm->count(), dragon_cast<T, float>(eps), Ndata);
math::Set<T, Context>(norm->count(),
dragon_cast<T, float>(eps), Ndata, ctx());
for (int n = 0; n < outer_dim; n++) {
if (across_inner) {
auto* Ndata_ = norm->template mutable_data<float, CPUContext>();
float sum_of_sqr = math::Dot<T, Context>(
buffer.count(), Xdata, Xdata, &ctx());
if (mode == "MEAN") sum_of_sqr = sum_of_sqr / dim;
Ndata_[n] = pow(sum_of_sqr + eps, 0.5);
math::Scale<T, Context>(buffer.count(),
1.0 / Ndata_[n], Xdata, Ydata, &ctx());
} else {
math::Square<T, Context>(buffer.count(), Xdata, Bdata);
// compute T1 = \sum_{i} x_{i,j}^{2}
math::Gemv<T, Context>(
CblasTrans, dim, inner_dim,
mode == "MEAN" ? 1.0 / dim : 1.0, Bdata, Dmult,
1.0, Ndata, &ctx());
// compute T2 = \sqrt{T1}
math::Sqrt<T, Context>(inner_dim, Ndata, Ndata);
// compute T3 = x / [(T2)]_{dim}
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, Ndata,
0.0, Bdata, &ctx());
math::Div<T, Context>(buffer.count(), Xdata, Bdata, Ydata);
Ndata += inner_dim;
}
math::Square<T, Context>(buffer.count(),
Xdata, Bdata, ctx());
// compute T1 = \sum_{i} x_{i,j}^{2}
math::Gemv<T, Context>(
CblasTrans, dim, inner_dim,
mode == "MEAN" ? 1.0 / dim : 1.0, Bdata, Dmult,
1.0, Ndata, ctx());
// compute T2 = \sqrt{T1}
math::Sqrt<T, Context>(inner_dim, Ndata, Ndata, ctx());
// compute T3 = x / [(T2)]_{dim}
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, Ndata,
0.0, Bdata, ctx());
math::Div<T, Context>(buffer.count(),
Xdata, Bdata, Ydata, ctx());
Ndata += inner_dim;
Xdata += buffer.count();
Ydata += buffer.count();
}
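// A plain-CPU sketch of the normalization above for one outer slice laid out as
// (dim, inner_dim), matching T1/T2/T3 in the comments: per inner position j,
//   norm_j = sqrt(eps + reduce_i x_{i,j}^2),  y_{i,j} = x_{i,j} / norm_j,
// where reduce is the sum, or the mean when mode == "MEAN". Names are illustrative.
#include <cmath>
#include <cstddef>
inline void L2NormSlice(
    size_t dim, size_t inner_dim, float eps, bool mean_mode,
    const float* x, float* y) {
    for (size_t j = 0; j < inner_dim; ++j) {
        double sum_sqr = 0.0;
        for (size_t i = 0; i < dim; ++i)
            sum_sqr += double(x[i * inner_dim + j]) * x[i * inner_dim + j];
        if (mean_mode) sum_sqr /= dim;
        const float norm = std::sqrt(eps + (float)sum_sqr);
        for (size_t i = 0; i < dim; ++i)
            y[i * inner_dim + j] = x[i * inner_dim + j] / norm;
    }
}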
......@@ -70,8 +63,6 @@ void L2NormOp<Context>::RunOnDevice() {
outer_dim = Input(0).count(0, axis);
dim = Input(0).count(axis, axis + num_axes);
inner_dim = Input(0).count(axis + num_axes);
if (inner_dim == 1) across_inner = true;
else across_inner = false;
Output(0)->ReshapeLike(Input(0));
......@@ -96,8 +87,8 @@ void L2NormGradientOp<Context>::RunWithType() {
for (int i = 0; i < axis; i++) dims[i] = 1;
buffer.Reshape(dims);
buffer_inner.Reshape({ inner_dim });
vector<T*> BSdata = ws()->template caches<T, Context>({
buffer.count(), buffer_inner.count() });
vector<T*> BSdata = ws()->template caches<T, Context>(
{ buffer.count(), buffer_inner.count() });
auto* Xdata = Input(0).template data<T, Context>();
auto* dYdata = Input(-1).template data<T, Context>();
......@@ -106,48 +97,42 @@ void L2NormGradientOp<Context>::RunWithType() {
auto* Bdata = BSdata[0], *BInnerdata = BSdata[1];
for (int n = 0; n < outer_dim; n++) {
if (across_inner) {
Ndata = norm->template data<T, CPUContext>();
T sum_of_x_mul_dy = math::Dot<T, Context>(
buffer.count(), Xdata, dYdata, &ctx());
if (mode == "MEAN") sum_of_x_mul_dy = sum_of_x_mul_dy / dim;
math::Scale<T, Context>(buffer.count(),
sum_of_x_mul_dy / Ndata[n] / Ndata[n], Xdata, dXdata, &ctx());
math::Sub<T, Context>(buffer.count(), dYdata, dXdata, dXdata);
math::Scal<T, Context>(buffer.count(),
T(1.0 / Ndata[n]), dXdata, &ctx());
} else {
// compute \sum_{i} x_{i, j}dy_{i, j}
math::Mul<T, Context>(buffer.count(), Xdata, dYdata, Bdata);
math::Gemv<T, Context>(
CblasTrans, dim, inner_dim,
mode == "MEAN" ? 1.0 / dim : 1.0, Bdata, Dmult,
0.0, BInnerdata, &ctx());
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, BInnerdata,
0.0, Bdata, &ctx());
math::Mul<T, Context>(buffer.count(), Xdata, Bdata, dXdata);
// compute T2 = T1 / Normalizer^{2}
math::Pow<T, Context>(inner_dim, 2.0, Ndata, BInnerdata);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, BInnerdata,
0.0, Bdata, &ctx());
math::Div<T, Context>(buffer.count(), dXdata, Bdata, dXdata);
// compute T3 = (dy - T2) / Normalizer
math::Sub<T, Context>(buffer.count(), dYdata, dXdata, dXdata);
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, Ndata,
0.0, Bdata, &ctx());
math::Div<T, Context>(buffer.count(), dXdata, Bdata, dXdata);
Ndata += inner_dim;
}
// compute \sum_{i} x_{i, j}dy_{i, j}
math::Mul<T, Context>(buffer.count(),
Xdata, dYdata, Bdata, ctx());
math::Gemv<T, Context>(
CblasTrans, dim, inner_dim,
mode == "MEAN" ? 1.0 / dim : 1.0, Bdata, Dmult,
0.0, BInnerdata, ctx());
// compute T1 = x[(\sum_{i} x_{i, j}dy_{i, j})]_{dim}
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, BInnerdata,
0.0, Bdata, ctx());
math::Mul<T, Context>(buffer.count(),
Xdata, Bdata, dXdata, ctx());
// compute T2 = T1 / Normalizer^{2}
math::Pow<T, Context>(inner_dim,
2.0, Ndata, BInnerdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, BInnerdata,
0.0, Bdata, ctx());
math::Div<T, Context>(buffer.count(),
dXdata, Bdata, dXdata, ctx());
// compute T3 = (dy - T2) / Normalizer
math::Sub<T, Context>(buffer.count(),
dYdata, dXdata, dXdata, ctx());
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
dim, inner_dim, 1,
1.0, Dmult, Ndata,
0.0, Bdata, ctx());
math::Div<T, Context>(buffer.count(),
dXdata, Bdata, dXdata, ctx());
Ndata += inner_dim;
Xdata += buffer.count();
dYdata += buffer.count();
dXdata += buffer.count();
......@@ -166,8 +151,6 @@ void L2NormGradientOp<Context>::RunOnDevice() {
outer_dim = Input(0).count(0, axis);
dim = Input(0).count(axis, axis + num_axes);
inner_dim = Input(0).count(axis + num_axes);
if (inner_dim == 1) across_inner = true;
else across_inner = false;
Output(0)->ReshapeLike(Input(0));
......
......@@ -23,20 +23,20 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
if (!states_initialized) {
states_initialized = true;
CUDNN_CHECK(cudnnDropoutGetStatesSize(
ctx().cudnn_handle(), &states_size));
ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
Tensor* states = ws()->CreateTensor("/share/cudnn/dropout:" +
dragon_cast<string, unsigned long long>(random_seed) + "/states");
if (states->count() > 0) {
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), dropout_ratio,
dropout_desc, ctx()->cudnn_handle(), dropout_ratio,
Sdata, states_size, random_seed));
} else {
states->Reshape({ (TIndex)states_size });
auto* Sdata = states->template mutable_data<uint8_t, Context>();
CUDNN_CHECK(cudnnSetDropoutDescriptor(
dropout_desc, ctx().cudnn_handle(), dropout_ratio,
dropout_desc, ctx()->cudnn_handle(), dropout_ratio,
Sdata, states_size, random_seed));
}
}
......@@ -48,7 +48,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// setup rnn
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetRNNDescriptor(
ctx().cudnn_handle(), rnn_desc,
ctx()->cudnn_handle(), rnn_desc,
hidden_size, num_layers,
dropout_desc,
rnn_input_mode, rnn_direction, rnn_mode,
......@@ -68,7 +68,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
xs_desc->Set<T>({ batch_size, input_dim, 1 }, { input_dim, 1, 1 });
ys_desc.reset(new cudnnTensorDescriptors(seq_length));
ys_desc->Set<T>({ batch_size, output_dim, 1 }, { output_dim, 1, 1 });
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx().cudnn_handle(),
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx()->cudnn_handle(),
rnn_desc, seq_length, xs_desc->descs(), &workspace_size));
output_dims = { seq_length, batch_size, output_dim };
......@@ -82,7 +82,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// setup packed weights
size_t weights_size; TIndex weights_count;
CUDNN_CHECK(cudnnGetRNNParamsSize(
ctx().cudnn_handle(), rnn_desc, xs_desc->descs()[0],
ctx()->cudnn_handle(), rnn_desc, xs_desc->descs()[0],
&weights_size, CUDNNType<T>::type));
weights_count = (TIndex)weights_size / sizeof(T);
CHECK_EQ(weights_count, Input(1).count())
......@@ -96,7 +96,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
// setup rnn workspace
CUDNN_CHECK(cudnnGetRNNWorkspaceSize(
ctx().cudnn_handle(), rnn_desc, seq_length,
ctx()->cudnn_handle(), rnn_desc, seq_length,
xs_desc->descs(), &workspace_size));
}
......@@ -122,7 +122,7 @@ void CuDNNRecurrentOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
auto handle = ctx().cudnn_handle();
auto handle = ctx()->cudnn_handle();
if (phase() == "TRAIN") {
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(handle,
......@@ -157,8 +157,12 @@ void CuDNNRecurrentOp<Context>::RunWithType() {
template <class Context>
void CuDNNRecurrentOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......@@ -182,7 +186,7 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({ workspace_size })[0];
// check the reserve space
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx().cudnn_handle(),
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx()->cudnn_handle(),
rnn_desc, seq_length, xs_desc->descs(), &reserve_size));
auto* reserveT = ws()->GetTensor("/mnt/" + anchor() + "/rnn/reserve");
CHECK_EQ(reserve_size, reserveT->nbytes());
......@@ -192,7 +196,7 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
auto* RSdata = reserveT->template data<uint8_t, Context>();
#endif
auto handle = ctx().cudnn_handle();
auto handle = ctx()->cudnn_handle();
if (Output(0)->name() != "ignore" ||
Output(1)->name() != "ignore" ||
......@@ -228,13 +232,17 @@ void CuDNNRecurrentGradientOp<Context>::RunWithType() {
template <class Context>
void CuDNNRecurrentGradientOp<Context>::RunOnDevice() {
ctx()->set_stream_id(0); // enforce default stream
Output(0)->ReshapeLike(Input(0)); // dX
Output(1)->ReshapeLike(Input(1)); // dW
Output(2)->ReshapeLike(Input(2)); // dHx
Output(3)->ReshapeLike(Input(3)); // dCx
if (XIsType(Input(0), float)) RunWithType<float>();
#ifdef WITH_CUDA_FP16
else if (XIsType(Input(0), float16)) RunWithType<float16>();
#endif
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -14,7 +14,7 @@ void LSTMCellOp<Context>::RunWithType() {
kernel::LSTMCell<T, Context>(Input(1).count(), Input(1).dim(0),
Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
CXdata, XAdata, Cdata, Hdata);
CXdata, XAdata, Cdata, Hdata, ctx());
}
template <class Context>
......@@ -44,7 +44,7 @@ void LSTMCellGradientOp<Context>::RunWithType() {
kernel::LSTMCellGrad<T, Context>(Input(1).count(), Input(1).dim(0),
Input(1).ndim() == 2 ? Input(1).dim(1) : Input(1).dim(2),
CXdata, XAdata, Cdata, dCdata, dHdata, dCXdata, dXdata);
CXdata, XAdata, Cdata, dCdata, dHdata, dCXdata, dXdata, ctx());
}
template <class Context>
......
......@@ -30,7 +30,7 @@ void RNNParamSetOp<Context>::RunWithType() {
<< "\nExcepted the size of param is " << size
<< ", but got " << Input(0).count();
offset += param_type == "bias" ? matrix_count : 0;
ctx().template Copy<T, Context, Context>(size, Wdata + offset, Pdata);
ctx()->template Copy<T, Context, Context>(size, Wdata + offset, Pdata);
}
template <class Context>
......
......@@ -5,7 +5,7 @@
namespace dragon {
template <class Context>
void AdamUpdateOp<Context>::ComputeRunWithFloat() {
void AdamUpdateOp<Context>::ComputeRunWithFloat32() {
Tensor* m = ws()->CreateTensor("/mnt/" + Slot() + "/adam/m");
Tensor* v = ws()->CreateTensor("/mnt/" + Slot() + "/adam/v");
m->ReshapeLike(Input(0));
......@@ -16,12 +16,11 @@ void AdamUpdateOp<Context>::ComputeRunWithFloat() {
float coeff = sqrt(1. - pow(beta2, t)) / (1. - pow(beta1, t));
lr = Param("base_lr") * coeff * this->lr_mult;
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Mdata = m->mutable_data<float, Context>();
auto* Vdata = v->mutable_data<float, Context>();
auto* Mdata = m->mutable_data<float, Context>(ctx());
auto* Vdata = v->mutable_data<float, Context>(ctx());
kernel::AdamUpdate<float, Context>(
Input(0).count(), lr, beta1, beta2, eps,
dXdata, Mdata, Vdata);
kernel::AdamUpdate<float, Context>(Input(0).count(),
lr, beta1, beta2, eps, dXdata, Mdata, Vdata, ctx());
}
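// A sketch of the element-wise recurrence kernel::AdamUpdate is assumed to apply,
// with the bias correction already folded into lr above
// (coeff = sqrt(1 - beta2^t) / (1 - beta1^t)); the result is written back into the
// gradient buffer, which UpdateRunWithFloat32 later subtracts from the weights.
// Names are illustrative.
#include <cmath>
#include <cstddef>
inline void AdamUpdateSketch(
    size_t n, float lr, float beta1, float beta2, float eps,
    float* g /* in: gradient, out: update */, float* m, float* v) {
    for (size_t i = 0; i < n; ++i) {
        m[i] = beta1 * m[i] + (1.f - beta1) * g[i];
        v[i] = beta2 * v[i] + (1.f - beta2) * g[i] * g[i];
        g[i] = lr * m[i] / (std::sqrt(v[i]) + eps);
    }
}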
template <class Context>
......@@ -35,13 +34,19 @@ void AdamUpdateOp<Context>::ComputeRunWithFloat16() {
beta1 = Param("beta1"), beta2 = Param("beta2"), eps = Param("eps");
float coeff = sqrt(1. - pow(beta2, t)) / (1. - pow(beta1, t));
lr = Param("base_lr") * coeff * this->lr_mult;
auto* dXdata = Input(0).template mutable_data<float16, Context>();
auto* Mdata = m->mutable_data<float16, Context>();
auto* Vdata = v->mutable_data<float16, Context>();
kernel::AdamUpdate<float16, Context>(
Input(0).count(), lr, beta1, beta2, eps,
dXdata, Mdata, Vdata);
auto* dX32T = ws()->CreateTensor(Input(0).name() + "/f32");
dX32T->ReshapeLike(Input(0));
auto* dX32 = dX32T->template mutable_data<float, Context>();
auto* dX16 = Input(0).template mutable_data<float16, Context>();
auto* M32 = m->mutable_data<float, Context>(ctx());
auto* V32 = v->mutable_data<float, Context>(ctx());
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), dX16, dX32, ctx());
kernel::AdamUpdate<float, Context>(Input(0).count(),
lr, beta1, beta2, eps, dX32, M32, V32, ctx());
}
DEPLOY_CPU(AdamUpdate);
......
......@@ -32,149 +32,175 @@ void CollectiveUpdateOp<Context>::InitNCCL() {
if (comm_rank == comm_root) NCCL_CHECK(ncclGetUniqueId(&id));
MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, comm_root, comm);
NCCL_CHECK(ncclCommInitRank(&nccl_comm, comm_size, id, comm_rank));
closure = CUDAClosure<Context>(&ctx());
closure = CUDAClosure<Context>(ctx());
#else
LOG(FATAL) << "NCCL was not compiled.";
#endif
}
template <class Context>
void CollectiveUpdateOp<Context>::MPIAllReduceWithFloat() {
for (int j = 0; j < InputSize(); j++) {
TIndex count = Input(j).count();
MPI_Request recv_req;
TIndex segment_size = count / comm_size;
TIndex residual = count % comm_size;
vector<TIndex> segment_sizes(comm_size, segment_size);
for (int i = 0; i < residual; i++) segment_sizes[i]++;
vector<TIndex> segment_ends(comm_size);
segment_ends[0] = segment_sizes[0];
for (int i = 1; i < segment_ends.size(); i++)
segment_ends[i] = segment_sizes[i] + segment_ends[i - 1];
template <class Context> template <typename T>
void CollectiveUpdateOp<Context>::MPIAllReduce(
Tensor* tensor,
MPI_Datatype dtype) {
TIndex count = tensor->count();
MPI_Request recv_req;
TIndex segment_size = count / comm_size;
TIndex residual = count % comm_size;
vector<TIndex> segment_sizes(comm_size, segment_size);
for (int i = 0; i < residual; i++) segment_sizes[i]++;
vector<TIndex> segment_ends(comm_size);
segment_ends[0] = segment_sizes[0];
for (int i = 1; i < segment_ends.size(); i++)
segment_ends[i] = segment_sizes[i] + segment_ends[i - 1];
#ifdef WITH_MPI_CUDA
auto* WSdata = ws()->template caches<float, Context>({ segment_sizes[0] })[0];
auto* dXdata = Input(j).template mutable_data<float, Context>();
auto* WSdata = ws()->template caches<T, Context>({ segment_sizes[0] })[0];
auto* dXdata = tensor->template mutable_data<T, Context>();
#else
auto* WSdata = ws()->template caches<float, CPUContext>({ segment_sizes[0] })[0];
auto* dXdata = Input(j).template mutable_data<float, CPUContext>();
auto* WSdata = ws()->template caches<T, CPUContext>({ segment_sizes[0] })[0];
auto* dXdata = tensor->template mutable_data<T, CPUContext>();
#endif // WITH_MPI_CUDA
int recv_from = (comm_rank - 1 + comm_size) % comm_size;
int send_to = (comm_rank + 1) % comm_size;
// scatter-reduce
for (int i = 0; i < comm_size - 1; i++) {
int recv_chunk = (comm_rank - i - 1 + comm_size) % comm_size;
int send_chunk = (comm_rank - i + comm_size) % comm_size;
auto* segment_send = &(dXdata[
segment_ends[send_chunk] - segment_sizes[send_chunk]
]);
MPI_Irecv(WSdata, segment_sizes[recv_chunk],
MPI_FLOAT, recv_from, 0, comm, &recv_req);
MPI_Send(segment_send, segment_sizes[send_chunk],
MPI_FLOAT, send_to, 0, comm);
auto* segment_update = &(dXdata[
segment_ends[recv_chunk] - segment_sizes[recv_chunk]
]);
MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
int recv_from = (comm_rank - 1 + comm_size) % comm_size;
int send_to = (comm_rank + 1) % comm_size;
// scatter-reduce
for (int i = 0; i < comm_size - 1; i++) {
int recv_chunk = (comm_rank - i - 1 + comm_size) % comm_size;
int send_chunk = (comm_rank - i + comm_size) % comm_size;
auto* segment_send = &(dXdata[
segment_ends[send_chunk] - segment_sizes[send_chunk]]);
MPI_Irecv(WSdata, segment_sizes[recv_chunk],
dtype, recv_from, 0, comm, &recv_req);
MPI_Send(segment_send, segment_sizes[send_chunk],
dtype, send_to, 0, comm);
auto* segment_update = &(dXdata[
segment_ends[recv_chunk] - segment_sizes[recv_chunk]]);
MPI_Wait(&recv_req, MPI_STATUS_IGNORE);
#ifdef WITH_MPI_CUDA
math::Axpy<float, Context>(segment_sizes[recv_chunk],
1.0, WSdata, segment_update, &ctx());
ctx().FinishDeviceCompution();
math::Axpy<T, Context>(segment_sizes[recv_chunk],
1.0, WSdata, segment_update, ctx());
ctx()->FinishDeviceCompution();
#else
math::Axpy<float, CPUContext>(segment_sizes[recv_chunk],
1.0, WSdata, segment_update, &ctx());
math::Axpy<T, CPUContext>(segment_sizes[recv_chunk],
1.0, WSdata, segment_update, ctx());
#endif // WITH_MPI_CUDA
}
}
// allgather
for (int i = 0; i < comm_size - 1; i++) {
int send_chunk = (comm_rank - i + 1 + comm_size) % comm_size;
int recv_chunk = (comm_rank - i + comm_size) % comm_size;
auto* segment_send = &(dXdata[
segment_ends[send_chunk] - segment_sizes[send_chunk]
]);
auto* segment_recv = &(dXdata[
segment_ends[recv_chunk] - segment_sizes[recv_chunk]
]);
MPI_Sendrecv(segment_send, segment_sizes[send_chunk],
MPI_FLOAT, send_to, 0,
segment_recv, segment_sizes[recv_chunk],
MPI_FLOAT, recv_from, 0,
comm, MPI_STATUS_IGNORE);
}
// allgather
for (int i = 0; i < comm_size - 1; i++) {
int send_chunk = (comm_rank - i + 1 + comm_size) % comm_size;
int recv_chunk = (comm_rank - i + comm_size) % comm_size;
auto* segment_send = &(dXdata[
segment_ends[send_chunk] - segment_sizes[send_chunk]]);
auto* segment_recv = &(dXdata[
segment_ends[recv_chunk] - segment_sizes[recv_chunk]]);
MPI_Sendrecv(segment_send, segment_sizes[send_chunk],
dtype, send_to, 0, segment_recv, segment_sizes[recv_chunk],
dtype, recv_from, 0, comm, MPI_STATUS_IGNORE);
}
// normalization
if (comm_size > 1) {
// normalization
if (comm_size > 1) {
#ifdef WITH_MPI_CUDA
math::Scal<float, Context>(count,
1.f / comm_size, dXdata, &ctx());
math::Scal<T, Context>(count, 1.f / comm_size, dXdata, ctx());
#else
math::Scal<float, CPUContext>(count,
1.f / comm_size, dXdata, &ctx());
math::Scal<T, CPUContext>(count, 1.f / comm_size, dXdata, ctx());
#endif // WITH_MPI_CUDA
}
}
}
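// A single-process sketch of the ring all-reduce realized above with
// MPI_Irecv/MPI_Send: the buffer is split into comm_size segments; comm_size - 1
// scatter-reduce steps leave each rank owning the full sum of one segment, and
// comm_size - 1 all-gather steps circulate those reduced segments to every rank.
// Names are illustrative; the operator's normalization step then divides by
// comm_size to turn the sum into an average.
#include <vector>
#include <cstddef>
inline void RingAllReduceSim(std::vector<std::vector<float>>& bufs) {
    const int P = (int)bufs.size();
    const size_t count = bufs[0].size();
    std::vector<size_t> seg_size(P, count / P), seg_end(P);
    for (size_t i = 0; i < count % P; ++i) seg_size[i]++;
    seg_end[0] = seg_size[0];
    for (int i = 1; i < P; ++i) seg_end[i] = seg_end[i - 1] + seg_size[i];
    // scatter-reduce: at step i, rank r accumulates chunk (r - i - 1) from its left neighbor
    for (int i = 0; i < P - 1; ++i)
        for (int r = 0; r < P; ++r) {
            const int left = (r - 1 + P) % P;
            const int chunk = (r - i - 1 + P) % P;
            const size_t off = seg_end[chunk] - seg_size[chunk];
            for (size_t k = 0; k < seg_size[chunk]; ++k)
                bufs[r][off + k] += bufs[left][off + k];
        }
    // all-gather: at step i, rank r copies the reduced chunk (r - i) from its left neighbor
    for (int i = 0; i < P - 1; ++i)
        for (int r = 0; r < P; ++r) {
            const int left = (r - 1 + P) % P;
            const int chunk = (r - i + P) % P;
            const size_t off = seg_end[chunk] - seg_size[chunk];
            for (size_t k = 0; k < seg_size[chunk]; ++k)
                bufs[r][off + k] = bufs[left][off + k];
        }
}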
template <class Context>
void CollectiveUpdateOp<Context>::NCCLAllReduceWithFloat() {
#ifdef WITH_MPI_NCCL
auto stream = closure.cuda_stream(0);
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
auto* dXdata = Input(i).template mutable_data<float, Context>();
NCCL_CHECK(ncclAllReduce((const void*)dXdata, (void*)dXdata,
count, ncclFloat, ncclSum, nccl_comm, stream));
}
closure.Sync();
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
auto* dXdata = Input(i).template mutable_data<float, Context>();
math::Scal<float, Context>(count, 1.f / comm_size, dXdata, &ctx());
}
#endif
}
template <class Context>
void CollectiveUpdateOp<Context>::MPIBcastWithFloat() {
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
template <class Context> template <typename T>
void CollectiveUpdateOp<Context>::MPIBcast(
Tensor* tensor,
MPI_Datatype dtype) {
TIndex count = tensor->count();
#ifdef WITH_MPI_CUDA
auto* dXdata = Input(i).template mutable_data<float, Context>();
auto* dXdata = tensor->template mutable_data<T, Context>();
#else
auto* dXdata = Input(i).template mutable_data<float, CPUContext>();
auto* dXdata = tensor->template mutable_data<T, CPUContext>();
#endif
MPI_Bcast(dXdata, count, MPI_FLOAT, comm_root, comm);
}
MPI_Bcast(dXdata, count, dtype, comm_root, comm);
}
template <class Context>
void CollectiveUpdateOp<Context>::NCCLBcastWithFloat() {
#ifdef WITH_MPI_NCCL
auto stream = closure.cuda_stream(0);
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
auto* dXdata = Input(i).template mutable_data<float, Context>();
NCCL_CHECK(ncclBcast((void*)dXdata, count,
ncclFloat, comm_root, nccl_comm, stream));
}
closure.Sync();
#endif
template <class Context> template <typename T>
void CollectiveUpdateOp<Context>::NCCLAllReduce(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream) {
TIndex count = tensor->count();
auto* dXdata = tensor->template mutable_data<T, Context>();
NCCL_CHECK(ncclAllReduce((const void*)dXdata, (void*)dXdata,
count, dtype, ncclSum, nccl_comm, stream));
}
template <class Context> template <typename T>
void CollectiveUpdateOp<Context>::NCCLBcast(
Tensor* tensor,
ncclDataType_t dtype,
cudaStream_t& stream) {
TIndex count = tensor->count();
auto* dXdata = tensor->template mutable_data<T, Context>();
NCCL_CHECK(ncclBcast((void*)dXdata,
count, dtype, comm_root, nccl_comm, stream));
}
#endif
template <class Context>
void CollectiveUpdateOp<Context>::RunOnDevice() {
if(XIsType(Input(0), float)) {
if (mode == "MPI_ALLREDUCE") {
MPIAllReduceWithFloat();
} else if (mode == "NCCL_ALLREDUCE") {
NCCLAllReduceWithFloat();
} else if (mode == "MPI_BCAST") {
MPIBcastWithFloat();
} else if (mode == "NCCL_BCAST") {
NCCLBcastWithFloat();
} else LOG(FATAL) << "Unsupported collective mode: " << mode;
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
if (mode == "MPI_ALLREDUCE") {
for (int i = 0; i < InputSize(); i++) {
if (XIsType(Input(i), float))
MPIAllReduce<float>(&Input(i), MPI_FLOAT);
else if (XIsType(Input(i), float16))
MPIAllReduce<float16>(&Input(i), MPI_UNSIGNED_SHORT);
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
} else if (mode == "MPI_BCAST") {
for (int i = 0; i < InputSize(); i++) {
if (XIsType(Input(i), float))
MPIBcast<float>(&Input(i), MPI_FLOAT);
else if (XIsType(Input(i), float16))
MPIBcast<float16>(&Input(i), MPI_UNSIGNED_SHORT);
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
}
#ifdef WITH_MPI_NCCL
else if (mode == "NCCL_ALLREDUCE") {
auto stream = closure.cuda_stream(1);
for (int i = 0; i < InputSize(); i++) {
if (XIsType(Input(i), float))
NCCLAllReduce<float>(&Input(i), ncclFloat, stream);
else if (XIsType(Input(i), float16))
NCCLAllReduce<float16>(&Input(i), ncclHalf, stream);
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
closure.Sync();
for (int i = 0; i < InputSize(); i++) {
TIndex count = Input(i).count();
if (XIsType(Input(i), float)) {
auto* dXdata = Input(i).template mutable_data<float, Context>();
math::Scal<float, Context>(count, 1.f / comm_size, dXdata, ctx());
}
else if (XIsType(Input(i), float16)) {
auto* dXdata = Input(i).template mutable_data<float16, Context>();
math::Scal<float16, Context>(count, 1.f / comm_size, dXdata, ctx());
}
}
} else if (mode == "NCCL_BCAST") {
auto stream = closure.cuda_stream(1);
for (int i = 0; i < InputSize(); i++) {
if (XIsType(Input(i), float))
NCCLBcast<float>(&Input(i), ncclFloat, stream);
else if (XIsType(Input(i), float16))
NCCLBcast<float16>(&Input(i), ncclHalf, stream);
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
closure.Sync();
}
#endif
else LOG(FATAL) << "Unsupported collective mode: " << mode;
}
DEPLOY_CPU(CollectiveUpdate);
......
......@@ -8,7 +8,7 @@ void MovingAverageOp<Context>::RunWithType() {
auto* Xdata = Input(0).template data<T, Context>();
auto* Ydata = Output(0)->template mutable_data<T, Context>();
math::Axpby<T, Context>(Input(0).count(),
1.f - decay, Xdata, decay, Ydata, &ctx());
1.f - decay, Xdata, decay, Ydata, ctx());
}
template <class Context>
......
......@@ -6,16 +6,16 @@
namespace dragon {
template <class Context>
void NesterovUpdateOp<Context>::ComputeRunWithFloat() {
void NesterovUpdateOp<Context>::ComputeRunWithFloat32() {
Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/nesterov/h");
h->ReshapeLike(Input(0));
lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>(ctx());
kernel::NesterovUpdate<float, Context>(
Input(0).count(), lr, momentum, dXdata, Hdata);
Input(0).count(), lr, momentum, dXdata, Hdata, ctx());
}
template <class Context>
......@@ -24,11 +24,18 @@ void NesterovUpdateOp<Context>::ComputeRunWithFloat16() {
h->ReshapeLike(Input(0));
lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
auto* dXdata = Input(0).template mutable_data<float16, Context>();
auto* Hdata = h->template mutable_data<float16, Context>();
kernel::NesterovUpdate<float16, Context>(
Input(0).count(), lr, momentum, dXdata, Hdata);
auto* dX32T = ws()->CreateTensor(Input(0).name() + "/f32");
dX32T->ReshapeLike(Input(0));
auto* dX32 = dX32T->template mutable_data<float, Context>();
auto* dX16 = Input(0).template mutable_data<float16, Context>();
auto* H32 = h->template mutable_data<float, Context>(ctx());
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), dX16, dX32, ctx());
kernel::NesterovUpdate<float, Context>(
Input(0).count(), lr, momentum, dX32, H32, ctx());
}
DEPLOY_CPU(NesterovUpdate);
......
......@@ -5,17 +5,17 @@
namespace dragon {
template <class Context>
void RMSPropUpdateOp<Context>::ComputeRunWithFloat() {
void RMSPropUpdateOp<Context>::ComputeRunWithFloat32() {
Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/rmsprop/h");
h->ReshapeLike(Input(0));
lr = Param("base_lr") * this->lr_mult;
decay = Param("decay"), eps = Param("eps");
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>(ctx());
kernel::RMSPropUpdate<float, Context>(
Input(0).count(), lr, decay, eps, dXdata, Hdata);
Input(0).count(), lr, decay, eps, dXdata, Hdata, ctx());
}
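// A sketch of the recurrence kernel::RMSPropUpdate is assumed to apply; the update
// is written back into the gradient buffer and later subtracted from the weights.
// Names are illustrative.
#include <cmath>
#include <cstddef>
inline void RMSPropUpdateSketch(
    size_t n, float lr, float decay, float eps,
    float* g /* in: gradient, out: update */, float* h /* running mean square */) {
    for (size_t i = 0; i < n; ++i) {
        h[i] = decay * h[i] + (1.f - decay) * g[i] * g[i];
        g[i] = lr * g[i] / (std::sqrt(h[i]) + eps);
    }
}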
template <class Context>
......@@ -25,11 +25,18 @@ void RMSPropUpdateOp<Context>::ComputeRunWithFloat16() {
lr = Param("base_lr") * this->lr_mult;
decay = Param("decay"), eps = Param("eps");
auto* dXdata = Input(0).template mutable_data<float16, Context>();
auto* Hdata = h->template mutable_data<float16, Context>();
kernel::RMSPropUpdate<float16, Context>(
Input(0).count(), lr, decay, eps, dXdata, Hdata);
auto* dX32T = ws()->CreateTensor(Input(0).name() + "/f32");
dX32T->ReshapeLike(Input(0));
auto* dX32 = dX32T->template mutable_data<float, Context>();
auto* dX16 = Input(0).template mutable_data<float16, Context>();
auto* H32 = h->template mutable_data<float, Context>(ctx());
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), dX16, dX32, ctx());
kernel::RMSPropUpdate<float, Context>(
Input(0).count(), lr, decay, eps, dX32, H32, ctx());
}
DEPLOY_CPU(RMSPropUpdate);
......
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context>
void SGDUpdateOp<Context>::ComputeRunWithFloat() {
void SGDUpdateOp<Context>::ComputeRunWithFloat32() {
Tensor* h = ws()->CreateTensor("/mnt/" + Slot() + "/sgd/h");
h->ReshapeLike(Input(0));
......@@ -14,10 +14,10 @@ void SGDUpdateOp<Context>::ComputeRunWithFloat() {
// momentum correction, see arXiv:1706.02677
if (old_lr > 0) { correction = lr / old_lr; } old_lr = lr;
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>();
auto* Hdata = h->template mutable_data<float, Context>(ctx());
kernel::SGDUpdate<float, Context>(Input(0).count(),
lr, momentum * correction, dXdata, Hdata);
lr, momentum * correction, dXdata, Hdata, ctx());
}
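// A sketch of the recurrence kernel::SGDUpdate is assumed to apply; the
// arXiv:1706.02677 momentum correction (correction = lr / old_lr) is already
// folded into the momentum argument above. The update is written back into the
// gradient buffer and later subtracted from the weights. Names are illustrative.
#include <cstddef>
inline void SGDMomentumSketch(
    size_t n, float lr, float momentum,
    float* g /* in: gradient, out: update */, float* h /* history */) {
    for (size_t i = 0; i < n; ++i) {
        h[i] = momentum * h[i] + lr * g[i];
        g[i] = h[i];
    }
}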
template <class Context>
......@@ -27,11 +27,18 @@ void SGDUpdateOp<Context>::ComputeRunWithFloat16() {
lr = Param("base_lr") * this->lr_mult, momentum = Param("momentum");
if (old_lr > 0) { correction = lr / old_lr; } old_lr = lr;
auto* dXdata = Input(0).template mutable_data<float16, Context>();
auto* Hdata = h->template mutable_data<float16, Context>();
kernel::SGDUpdate<float16, Context>(Input(0).count(),
lr, momentum * correction, dXdata, Hdata);
auto* dX32T = ws()->CreateTensor(Input(0).name() + "/f32");
dX32T->ReshapeLike(Input(0));
auto* dX32 = dX32T->template mutable_data<float, Context>();
auto* dX16 = Input(0).template mutable_data<float16, Context>();
auto* H32 = h->template mutable_data<float, Context>(ctx());
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), dX16, dX32, ctx());
kernel::SGDUpdate<float, Context>(Input(0).count(),
lr, momentum * correction, dX32, H32, ctx());
}
DEPLOY_CPU(SGDUpdate);
......
#include "core/workspace.h"
#include "utils/cast.h"
#include "utils/math_functions.h"
#include "utils/op_kernel.h"
#include "operators/update/update_op_base.h"
namespace dragon {
......@@ -20,22 +21,24 @@ template <class Context> template <typename T>
void UpdateOpBase<Context>::PreprocessRunWithType() {
// scale
scale_factor = Param("scale_gradient");
if (scale_factor != 1) {
if (scale_factor != 1.f) {
auto* dXdata = Input(0).template mutable_data<T, Context>();
math::Scal<T, Context>(Input(0).count(),
scale_factor, dXdata, &ctx());
scale_factor, dXdata, ctx());
}
// clip
clip_thresh = Param("clip_gradient");
if (clip_thresh > 0) {
auto* dXdata = Input(0).template mutable_data<T, Context>();
float sumsq_grad = math::Dot<T, Context>(
Input(0).count(), dXdata, dXdata, &ctx());
const float l2norm = sqrt(sumsq_grad);
T sumsq_grad;
math::Dot<T, Context>(Input(0).count(),
dXdata, dXdata, &sumsq_grad, ctx());
const float l2norm = sqrt(
dragon_cast<float, T>(sumsq_grad));
if (l2norm > clip_thresh) {
float norm_factor = clip_thresh / l2norm;
math::Scal<T, Context>(Input(0).count(),
norm_factor, dXdata, &ctx());
norm_factor, dXdata, ctx());
}
}
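// A plain-CPU sketch of the clipping rule above: when the gradient's global L2
// norm exceeds clip_thresh, rescale it so that the norm equals clip_thresh.
// Names are illustrative.
#include <cmath>
#include <cstddef>
inline void ClipByGlobalNorm(size_t n, float clip_thresh, float* g) {
    double sumsq = 0.0;
    for (size_t i = 0; i < n; ++i) sumsq += double(g[i]) * g[i];
    const float l2norm = std::sqrt((float)sumsq);
    if (l2norm > clip_thresh) {
        const float factor = clip_thresh / l2norm;
        for (size_t i = 0; i < n; ++i) g[i] *= factor;
    }
}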
// decay
......@@ -44,34 +47,76 @@ void UpdateOpBase<Context>::PreprocessRunWithType() {
auto* dXdata = Input(0).template mutable_data<T, Context>();
auto* Xdata = Output(0)->template data<T, Context>();
math::Axpy<T, Context>(Input(0).count(),
l2_decay, Xdata, dXdata, &ctx());
l2_decay, Xdata, dXdata, ctx());
}
}
template <class Context> template <typename T>
void UpdateOpBase<Context>::UpdateRunWithType() {
auto* dXdata = Input(0).template mutable_data<T, Context>();
auto* Xdata = Output(0)->template mutable_data<T, Context>();
math::Axpy<T, Context>(Output(0)->count(), -1, dXdata, Xdata, &ctx());
T zeroT = dragon_cast<T, float>(0.f);
if (zero_grad) math::Set<T, Context>(Input(0).count(), zeroT, dXdata);
template <class Context>
void UpdateOpBase<Context>::UpdateRunWithFloat32() {
auto* dXdata = Input(0).template mutable_data<float, Context>();
auto* Xdata = Output(0)->template mutable_data<float, Context>();
// weights update & zero grads
math::Axpy<float, Context>(Output(0)->count(),
-1, dXdata, Xdata, ctx());
if (zero_grad) math::Set<float, Context>(
Input(0).count(), 0.f, dXdata, ctx());
}
template <class Context>
void UpdateOpBase<Context>::UpdateRunWithFloat16() {
/* ------------------------------------------------
*
* Mixed Precision Training
*
* http://arxiv.org/abs/1710.03740
*
* ------------------------------------------------ */
// the "master" weights
auto* X32T = ws()->CreateTensor(Output(0)->name() + "/f32");
X32T->ReshapeLike(Input(0));
// the "master" updates
auto* dX32T = ws()->GetTensor(Input(0).name() + "/f32");
auto* dX32 = dX32T->template data<float, Context>();
auto* X16 = Output(0)->template mutable_data<float16, Context>();
auto* X32 = X32T->template mutable_data<float, Context>();
// X16 -> X32
kernel::TypeA2B<float16, float, Context>(
Input(0).count(), X16, X32, ctx());
// weights update & zero grads
math::Axpy<float, Context>(
Input(0).count(), -1, dX32, X32, ctx());
if (zero_grad) {
float16 zero = dragon_cast<float16, float>(0.f);
auto* dX16 = Input(0).template mutable_data<float16, Context>();
math::Set<float16, Context>(Input(0).count(), zero, dX16, ctx());
}
// X32 -> X16
kernel::TypeA2B<float, float16, Context>(
Input(0).count(), X32, X16, ctx());
}
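// Why the float32 "master" copies above are kept (a note, following the
// arXiv:1710.03740 recipe referenced in the comment block above): with only about
// 10 bits of mantissa, a small update added directly to a float16 weight can be
// rounded away entirely, e.g. 1.0 + 3e-4 rounds back to 1.0 in float16 because the
// spacing of float16 near 1.0 is roughly 9.8e-4. Accumulating the update in
// float32 and casting back with TypeA2B avoids losing such updates.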
template <class Context>
void UpdateOpBase<Context>::RunOnDevice() {
// skip empty param or grad
// skip empty param or grads
if (Input(0).count() == 0 || Output(0)->count() == 0) return;
CHECK(Input(0).dims() == Output(0)->dims())
<< "\nTensor and its gradients should have same dims.\nGot "
<< Output(0)->DimString() << " and " << Input(0).DimString();
if (XIsType(Input(0), float)) {
PreprocessRunWithType<float>();
ComputeRunWithFloat();
UpdateRunWithType<float>();
ComputeRunWithFloat32();
UpdateRunWithFloat32();
} else if (XIsType(Input(0), float16)) {
PreprocessRunWithType<float16>();
ComputeRunWithFloat16();
UpdateRunWithType<float16>();
UpdateRunWithFloat16();
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
}
......
......@@ -15,7 +15,7 @@ void BiasAddOp<Context>::RunWithType() {
kernel::BiasAdd<T, Context>(
Output(0)->count(), outer_dim, dim, inner_dim,
data_format, Bdata, multiplier, Ydata, &ctx());
data_format, Bdata, multiplier, Ydata, ctx());
}
template <class Context>
......@@ -45,19 +45,19 @@ void BiasAddGradientOp<Context>::RunWithType() {
if (Output(1)->name() != "ignore") {
DECLARE_MULTIPLIER(multiplier, inner_dim);
auto* dYdata = Input(-1).template data<T, Context>();
auto* dBias = Output(1)->template mutable_data<T, Context>();
auto* dBias = Output(1)->template mutable_data<T, Context>(ctx());
const int y_offset = dim * inner_dim;
for (int n = 0; n < outer_dim; n++) {
if (data_format == "NCHW") {
math::Gemv<T, Context>(
CblasNoTrans, dim, inner_dim,
1.0, dYdata, multiplier,
1.0, dBias, &ctx());
1.0, dBias, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, inner_dim, dim,
1.0, dYdata, multiplier,
1.0, dBias, &ctx());
1.0, dBias, ctx());
}
dYdata += y_offset;
}
......
......@@ -26,7 +26,7 @@ void BilinearResizeOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::BilinearResize<T, Context>(Output(0)->count(),
n, c, h, w, out_h, out_w, data_format, Xdata, Ydata);
n, c, h, w, out_h, out_w, data_format, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -77,8 +77,10 @@ void BilinearResizeGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
kernel::BilinearResizeGrad<T, Context>(Input(-1).count(),
n, c, h, w, out_h, out_w, data_format, dYdata, dXdata);
n, c, h, w, out_h, out_w, data_format, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -41,7 +41,7 @@ void Conv2dGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
if (HasBias()) {
T* dBdata = Output(2)->template mutable_data<T, Context>();
T* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < Input(2).dim(0); n++)
Db(dYdata + n * y_offset, dBdata);
}
......@@ -49,7 +49,7 @@ void Conv2dGradientOp<Context>::RunWithType() {
for (int n = 0; n < Input(2).dim(0); n++) {
if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
Dw(dYdata + n * y_offset, Xdata + n * x_offset, dWdata);
}
if (Output(0)->name() != "ignore") {
......
......@@ -44,7 +44,7 @@ void Conv2dTransposeGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
if (Output(2)->name() != "ignore") {
auto* dBdata = Output(2)->template mutable_data<T, Context>();
auto* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
for (int n = 0; n < Input(2).dim(0); n++)
Db(dYdata + n * y_offset, dBdata);
}
......@@ -52,7 +52,7 @@ void Conv2dTransposeGradientOp<Context>::RunWithType() {
for (int n = 0; n < Input(2).dim(0); n++) {
if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
Dw(Xdata + n * x_offset, dYdata + n * y_offset, dWdata);
}
if (Output(0)->name() != "ignore") {
......
......@@ -77,7 +77,7 @@ void ConvOpBase<Context>::Wx(
kernel_dim,
1.0, weights + weight_offset * g,
col_buffer + col_offset * g,
0.0, y + output_offset * g, &ctx());
0.0, y + output_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
......@@ -86,7 +86,7 @@ void ConvOpBase<Context>::Wx(
kernel_dim,
1.0, col_buffer + col_offset * g,
weights + weight_offset * g,
0.0, y + output_offset * g, &ctx());
0.0, y + output_offset * g, ctx());
}
}
}
......@@ -99,13 +99,13 @@ void ConvOpBase<Context>::Pb(const T* bias, T* y) {
CblasNoTrans, CblasNoTrans,
num_output, out_spatial_dim, 1,
1.0, bias, multiplier,
1.0, y, &ctx());
1.0, y, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasNoTrans,
out_spatial_dim, num_output, 1,
1.0, multiplier, bias,
1.0, y, &ctx());
1.0, y, ctx());
}
}
......@@ -122,7 +122,7 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) {
conv_out_channels / group,
1.0, weights + weight_offset * g,
dy + output_offset * g,
0.0, col_buffer + col_offset * g, &ctx());
0.0, col_buffer + col_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasNoTrans, CblasTrans,
......@@ -131,7 +131,7 @@ void ConvOpBase<Context>::Dx(const T* dy, const T* weights, T* dx) {
conv_out_channels / group,
1.0, dy + output_offset * g,
weights + weight_offset * g,
0.0, col_buffer + col_offset * g, &ctx());
0.0, col_buffer + col_offset * g, ctx());
}
}
if (!is_1x1) Col2Im(col_buffer, dx);
......@@ -154,7 +154,7 @@ void ConvOpBase<Context>::Dw(const T* dy, const T* x, T *dw) {
conv_out_spatial_dim,
1.0, dy + output_offset * g,
col_buffer + col_offset * g,
1.0, dw + weight_offset * g, &ctx());
1.0, dw + weight_offset * g, ctx());
} else if (data_format == "NHWC") {
math::Gemm<T, Context>(
CblasTrans, CblasNoTrans,
......@@ -163,7 +163,7 @@ void ConvOpBase<Context>::Dw(const T* dy, const T* x, T *dw) {
conv_out_spatial_dim,
1.0, col_buffer + col_offset * g,
dy + output_offset * g,
1.0, dw + weight_offset * g, &ctx());
1.0, dw + weight_offset * g, ctx());
}
}
}
......@@ -175,12 +175,12 @@ void ConvOpBase<Context>::Db(const T* dy, T* db) {
math::Gemv<T, Context>(
CblasNoTrans, num_output, out_spatial_dim,
1.0, dy, multiplier,
1.0, db, &ctx());
1.0, db, ctx());
} else if (data_format == "NHWC") {
math::Gemv<T, Context>(
CblasTrans, out_spatial_dim, num_output,
1.0, dy, multiplier,
1.0, db, &ctx());
1.0, db, ctx());
}
}
......
......@@ -54,13 +54,13 @@ void CuDNNConv2dOp<Context>::ResetDesc() {
}
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &fwd_algo));
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
fwd_algo, &fwd_data_size));
}
......@@ -78,7 +78,7 @@ void CuDNNConv2dOp<Context>::RunWithType() {
auto* WSdata = (uint8_t*)ws()->template
caches<Context>({ fwd_data_size })[0];
auto cudnn_handle = ctx().cudnn_handle();
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
CUDNN_CHECK(cudnnConvolutionForward(cudnn_handle,
......@@ -104,6 +104,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
#endif
Conv2dOp<Context>::Reshape();
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
......@@ -199,24 +201,24 @@ void CuDNNConv2dGradientOp<Context>::ResetDesc() {
}
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx().cudnn_handle(), output_desc,
ctx()->cudnn_handle(), output_desc,
input_desc, conv_desc, filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_filter_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
ctx().cudnn_handle(), output_desc,
ctx()->cudnn_handle(), output_desc,
input_desc, conv_desc, filter_desc,
bwd_filter_algo, &bwd_filter_size));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx().cudnn_handle(), filter_desc,
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_data_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
ctx().cudnn_handle(), filter_desc,
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
bwd_data_algo, &bwd_data_size));
}
......@@ -230,18 +232,18 @@ void CuDNNConv2dGradientOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({
std::max(bwd_data_size, bwd_filter_size)})[0];
auto cudnn_handle = ctx().cudnn_handle();
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
if (Output(2)->name() != "ignore") {
T* dBdata = Output(2)->template mutable_data<T, Context>();
T* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g));
}
if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardFilter(cudnn_handle,
CUDNNType<T>::one, output_desc, Xdata + x_offset * g,
input_desc, dYdata + y_offset * g,
......@@ -269,6 +271,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
#endif
Conv2dGradientOp<Context>::GradientReshape();
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
......
......@@ -54,13 +54,13 @@ void CuDNNConv2dTransposeOp<Context>::ResetDesc() {
}
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx().cudnn_handle(), filter_desc,
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &fwd_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
ctx().cudnn_handle(), filter_desc,
ctx()->cudnn_handle(), filter_desc,
input_desc, conv_desc, output_desc,
fwd_algo, &fwd_data_size));
}
......@@ -78,7 +78,7 @@ void CuDNNConv2dTransposeOp<Context>::RunWithType() {
auto* WSdata = (uint8_t*)ws()->template
caches<Context>({ fwd_data_size })[0];
auto cudnn_handle = ctx().cudnn_handle();
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
CUDNN_CHECK(cudnnConvolutionBackwardData(cudnn_handle,
......@@ -104,6 +104,8 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
#endif
Conv2dTransposeOp<Context>::Reshape();
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
......@@ -199,24 +201,24 @@ void CuDNNConv2dTransposeGradientOp<Context>::ResetDesc() {
}
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
output_desc, conv_desc, filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_filter_algo));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
output_desc, conv_desc, filter_desc,
bwd_filter_algo, &bwd_filter_size));
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
WORKSPACE_LIMIT_BYTES, &bwd_data_algo));
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
ctx().cudnn_handle(), input_desc,
ctx()->cudnn_handle(), input_desc,
filter_desc, conv_desc, output_desc,
bwd_data_algo, &bwd_data_size));
}
......@@ -230,18 +232,18 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunWithType() {
auto* WSdata = ws()->template caches<Context>({
std::max(bwd_data_size, bwd_filter_size) })[0];
auto cudnn_handle = ctx().cudnn_handle();
auto cudnn_handle = ctx()->cudnn_handle();
for (int g = 0; g < cudnn_group; g++) {
if (Output(2)->name() != "ignore") {
T* dBdata = Output(2)->template mutable_data<T, Context>();
T* dBdata = Output(2)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardBias(cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g,
CUDNNType<T>::one, bias_desc, dBdata + bias_offset * g));
}
if (Output(1)->name() != "ignore") {
auto* Xdata = Input(0).template data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>();
auto* dWdata = Output(1)->template mutable_data<T, Context>(ctx());
CUDNN_CHECK(cudnnConvolutionBackwardFilter(cudnn_handle,
CUDNNType<T>::one, input_desc, dYdata + y_offset * g,
output_desc, Xdata + x_offset * g,
......@@ -269,6 +271,8 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
#endif
Conv2dTransposeGradientOp<Context>::GradientReshape();
ctx()->set_stream_id(0); // enforce default stream
if (XIsType(Input(0), float)) {
#if CUDNN_VERSION_MIN(6, 0, 0)
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc,
......
......@@ -13,7 +13,7 @@ void CuDNNLRNOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnLRNCrossChannelForward(
ctx().cudnn_handle(), norm_desc,
ctx()->cudnn_handle(), norm_desc,
CUDNN_LRN_CROSS_CHANNEL_DIM1,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
......@@ -55,7 +55,7 @@ void CuDNNLRNGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnLRNCrossChannelBackward(
ctx().cudnn_handle(), norm_desc,
ctx()->cudnn_handle(), norm_desc,
CUDNN_LRN_CROSS_CHANNEL_DIM1,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Xdata,
......
......@@ -25,7 +25,7 @@ void CuDNNPooling2dOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnPoolingForward(
ctx().cudnn_handle(), pool_desc,
ctx()->cudnn_handle(), pool_desc,
CUDNNType<T>::one, input_desc, Xdata,
CUDNNType<T>::zero, output_desc, Ydata));
}
......@@ -69,7 +69,7 @@ void CuDNNPooling2dGradientOp<Context>::RunWithType() {
auto* dXdata = Output(0)->template mutable_data<T, Context>();
CUDNN_CHECK(cudnnPoolingBackward(
ctx().cudnn_handle(), pool_desc,
ctx()->cudnn_handle(), pool_desc,
CUDNNType<T>::one, input_desc, Ydata,
input_desc, dYdata, output_desc, Xdata,
CUDNNType<T>::zero, output_desc, dXdata));
......
......@@ -28,7 +28,7 @@ void DenseConcatGradientOp<Context>::RestoreX1() {
kernel::ConcatGrad<T, Context>(
count, this->outer_dim, this->inner_dim,
this->x_concat_dim, this->y_concat_dim,
0, Ydata, Xdata);
0, Ydata, Xdata, ctx());
}
template <class Context>
......
......@@ -17,11 +17,11 @@ template <class Context> template <typename T>
void LRNOp<Context>::SplitRunWithType() {
sqr_in = ws()->CreateTensor("/mnt/" + anchor() + "/sqr/in");
sqr_in->ReshapeLike(Input(0));
sqr_in->template CopyFrom<Context>(Input(0));
sqr_in->template CopyFrom<Context>(Input(0), ctx());
prod_in = ws()->CreateTensor("/mnt/" + anchor() + "/prod/in");
prod_in->ReshapeLike(Input(0));
prod_in->template CopyFrom<Context>(Input(0));
prod_in->template CopyFrom<Context>(Input(0), ctx());
}
template <class Context> template <typename T>
......@@ -229,7 +229,7 @@ void LRNGradientOp<Context>::SplitRunWithType() {
auto* data0 = g_sqr_in->template data<T, Context>();
auto* data1 = g_prod_in->template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Add<T, Context>(Output(0)->count(), data0, data1, dXdata);
math::Add<T, Context>(Output(0)->count(), data0, data1, dXdata, ctx());
}
template <class Context>
......
......@@ -26,7 +26,7 @@ void NNResizeOp<Context>::RunWithType() {
auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::NNResize<T, Context>(Output(0)->count(),
n, c, h, w, out_h, out_w, data_format, Xdata, Ydata);
n, c, h, w, out_h, out_w, data_format, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -77,8 +77,10 @@ void NNResizeGradientOp<Context>::RunWithType() {
auto* dYdata = Input(-1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
kernel::NNResizeGrad<T, Context>(Input(-1).count(),
n, c, h, w, out_h, out_w, data_format, dYdata, dXdata);
n, c, h, w, out_h, out_w, data_format, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -17,7 +17,7 @@ void Pooling2dOp<Context>::MAXRunWithType() {
kernel::MAXPooling2d<T, Context>(Output(0)->count(),
n, c, h, w, pool_h, pool_w, kernel_size[0], kernel_size[1],
stride[0], stride[1], pad[0], pad[1],
data_format, Xdata, Mdata, Ydata);
data_format, Xdata, Mdata, Ydata, ctx());
}
template <class Context> template <typename T>
......@@ -28,7 +28,7 @@ void Pooling2dOp<Context>::AVGRunWithType() {
kernel::AVGPooling2d<T, Context>(Output(0)->count(),
n, c, h, w, pool_h, pool_w, kernel_size[0], kernel_size[1],
stride[0], stride[1], pad[0], pad[1],
data_format, Xdata, Ydata);
data_format, Xdata, Ydata, ctx());
}
template <class Context>
......@@ -127,8 +127,9 @@ void Pooling2dGradientOp<Context>::MAXRunWithType() {
kernel::MAXPooling2dGrad<T, Context>(Output(0)->count(),
n, c, h, w, pool_h, pool_w, kernel_size[0], kernel_size[1],
stride[0], stride[1], pad[0], pad[1],
data_format, dYdata, Mdata, dXdata);
data_format, dYdata, Mdata, dXdata, ctx());
ctx()->FinishDeviceCompution();
mask->Reset();
}
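With kernels now enqueued on the context's stream, the gradient kernel above may still be reading the mask when control returns to the host; the op calls FinishDeviceCompution() before mask->Reset(), presumably to ensure the buffer is no longer in use when it is released. A self-contained sketch of the same rule, with hypothetical kernel and buffer names:

#include <cuda_runtime.h>

__global__ void MaskedGrad(int n, const int* mask, const float* dy, float* dx) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dx[i] = mask[i] ? dy[i] : 0.f;   // hypothetical masked gradient
}

void BackwardAndRelease(int n, int* d_mask, const float* d_dy, float* d_dx,
                        cudaStream_t stream) {
    MaskedGrad<<<(n + 255) / 256, 256, 0, stream>>>(n, d_mask, d_dy, d_dx);
    cudaStreamSynchronize(stream);   // wait until the kernel is done with d_mask
    cudaFree(d_mask);                // only now is it safe to release the mask
}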
......@@ -140,7 +141,7 @@ void Pooling2dGradientOp<Context>::AVGRunWithType() {
kernel::AVGPooling2dGrad<T, Context>(Output(0)->count(),
n, c, h, w, pool_h, pool_w, kernel_size[0], kernel_size[1],
stride[0], stride[1], pad[0], pad[1],
data_format, dYdata, dXdata);
data_format, dYdata, dXdata, ctx());
}
template <class Context>
......
......@@ -14,7 +14,8 @@ void ROIAlignOp<Context>::RunWithType() {
kernel::ROIAlign<T, Context>(
Output(0)->count(), Input(0).dim(0), Input(0).dim(1),
Input(0).dim(2), Input(0).dim(3), pool_h, pool_w,
Input(1).dim(0), spatial_scale, sampling_ratio, Xdata, Rdata, Ydata);
Input(1).dim(0), spatial_scale, sampling_ratio,
Xdata, Rdata, Ydata, ctx());
}
template <class Context>
......@@ -38,12 +39,13 @@ void ROIAlignGradientOp<Context>::RunWithType() {
auto* Rdata = Input(1).template data<T, CUDAContext>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
math::Set<T, Context>(Output(0)->count(), 0, dXdata);
math::Set<T, Context>(Output(0)->count(), 0, dXdata, ctx());
kernel::ROIAlignGrad<T, Context>(
Input(-1).count(), Output(0)->dim(0), Output(0)->dim(1),
Output(0)->dim(2), Output(0)->dim(3), pool_h, pool_w,
Input(1).dim(0), spatial_scale, sampling_ratio, dYdata, Rdata, dXdata);
Input(1).dim(0), spatial_scale, sampling_ratio,
dYdata, Rdata, dXdata, ctx());
}
template <class Context>
......
......@@ -19,7 +19,8 @@ void ROIPoolingOp<Context>::RunWithType() {
kernel::ROIPooling<T, Context>(
Output(0)->count(), Input(0).dim(0), Input(0).dim(1),
Input(0).dim(2), Input(0).dim(3), pool_h, pool_w,
Input(1).dim(0), spatial_scale, Xdata, Rdata, Mdata, Ydata);
Input(1).dim(0), spatial_scale,
Xdata, Rdata, Mdata, Ydata, ctx());
}
template <class Context>
......@@ -50,7 +51,8 @@ void ROIPoolingGradientOp<Context>::RunWithType() {
kernel::ROIPoolingGrad<T, Context>(
Output(0)->count(), Output(0)->dim(0), Output(0)->dim(1),
Output(0)->dim(2), Output(0)->dim(3), pool_h, pool_w,
Input(1).dim(0), spatial_scale, dYdata, Rdata, Mdata, dXdata);
Input(1).dim(0), spatial_scale,
dYdata, Rdata, Mdata, dXdata, ctx());
}
template <class Context>
......
......@@ -14,7 +14,8 @@ namespace math {
template <> void Set<float, CPUContext>(
const int n,
const float alpha,
float* x) {
float* x,
CPUContext* ctx) {
if (alpha == 0) {
memset(x, 0, sizeof(float) * n);
return;
......@@ -32,7 +33,8 @@ template <> void Set<float, CPUContext>(
template <> void Set<int, CPUContext>(
const int n,
const int alpha,
int* x) {
int* x,
CPUContext* ctx) {
if (alpha == 0) {
memset(x, 0, sizeof(int) * n);
return;
......@@ -50,7 +52,8 @@ template <> void Set<int, CPUContext>(
template <> void Set<float16, CPUContext>(
const int n,
const float16 alpha,
float16* x) {
float16* x,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -164,7 +167,8 @@ template <> void Add<float, CPUContext>(
const int n,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::Add<float>(n, a, b, y);
#else
......@@ -179,7 +183,8 @@ template <> void Add<int, CPUContext>(
const int n,
const int* a,
const int* b,
int* y) {
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -190,7 +195,8 @@ template <> void Add<float16, CPUContext>(
const int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -198,7 +204,8 @@ template <> void Sub<float, CPUContext>(
const int n,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::Sub<float>(n, a, b, y);
#else
......@@ -213,7 +220,8 @@ template <> void Sub<float16, CPUContext>(
const int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -221,7 +229,8 @@ template <> void Mul<float, CPUContext>(
const int n,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::Mul<float>(n, a, b, y);
#else
......@@ -236,7 +245,8 @@ template <> void Mul<float16, CPUContext>(
const int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -244,7 +254,8 @@ template <> void Div<float, CPUContext>(
const int n,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::Div<float>(n, a, b, y);
#else
......@@ -259,7 +270,8 @@ template <> void Div<float16, CPUContext>(
const int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -267,7 +279,8 @@ template <> void Clip<float, CPUContext>(
const int n,
const float low,
const float high,
float* x) {
float* x,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -279,7 +292,8 @@ template <> void Clip<float, CPUContext>(
template <> void Exp<float, CPUContext>(
int n,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -289,7 +303,8 @@ template <> void Exp<float, CPUContext>(
template <> void Log<float, CPUContext>(
int n,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -299,7 +314,8 @@ template <> void Log<float, CPUContext>(
template <> void Square<float, CPUContext>(
int n,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -309,14 +325,16 @@ template <> void Square<float, CPUContext>(
template <> void Square<float16, CPUContext>(
int n,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void Sqrt<float, CPUContext>(
int n,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -326,7 +344,8 @@ template <> void Sqrt<float, CPUContext>(
template <> void Sqrt<float16, CPUContext>(
int n,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -334,7 +353,8 @@ template <> void Pow<float, CPUContext>(
int n,
const float alpha,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -345,7 +365,8 @@ template <> void Pow<float16, CPUContext>(
int n,
const float alpha,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -353,7 +374,8 @@ template <> void Inv<float, CPUContext>(
const int n,
const float numerator,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
......@@ -364,7 +386,8 @@ template <> void Inv<float16, CPUContext>(
const int n,
const float numerator,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -423,51 +446,51 @@ template <> void Scale<float, CPUContext>(
#endif // WITH_BLAS
}
template <> float StridedDot<float, CPUContext>(
template <> void StridedDot<float, CPUContext>(
const int n,
const float* a,
const int incx,
const float* b,
const int incy,
float* y,
CPUContext* ctx) {
#ifdef WITH_BLAS
return cblas_sdot(n, a, incx, b, incy);
float result = cblas_sdot(n, a, incx, b, incy);
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += a[i] * b[i];
return ret;
float result = 0.f;
int cx = 0, cy = 0;
for (int i = 0; i < n; ++i) {
result += a[cx] * b[cy];
cx += incx; cy += incy;
}
#endif // WITH_BLAS
*y = result;
}
template <> float Dot<float, CPUContext>(
template <> void Dot<float, CPUContext>(
int n,
const float* a,
const float* b,
float* y,
CPUContext* ctx) {
#ifdef WITH_BLAS
return StridedDot<float, CPUContext>(n, a, 1, b, 1, ctx);
#elif WITH_SSE
return sse::Dot<float>(n, a, b);
StridedDot<float, CPUContext>(n, a, 1, b, 1, y, ctx);
#elif WITH_SSE
*y = sse::Dot<float>(n, a, b);
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += a[i] * b[i];
return ret;
float result = 0.f;
for (int i = 0; i < n; ++i) result += a[i] * b[i];
*y = result;
#endif // WITH_BLAS
}
template <> float Dot<float16, CPUContext>(
template <> void Dot<float16, CPUContext>(
int n,
const float16* a,
const float16* b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
return 0;
}
template <> float ASum<float, CPUContext>(
......@@ -475,22 +498,19 @@ template <> float ASum<float, CPUContext>(
const float* x) {
#ifdef WITH_BLAS
return cblas_sasum(n, x, 1);
#elif WITH_SSE
return sse::ASum<float>(n, x);
#else
float ret = 0.f;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(n))
#endif
for (int i = 0; i < n; ++i) ret += x[i];
return ret;
float result = 0.f;
for (int i = 0; i < n; ++i)
result += std::abs(x[i]);
return result;
#endif // WITH_BLAS
}
template <> void AddScalar<float, CPUContext>(
const int n,
const float alpha,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::AddScalar<float>(n, alpha, y);
#else
......@@ -504,14 +524,16 @@ template <> void AddScalar<float, CPUContext>(
template <> void AddScalar<float16, CPUContext>(
const int n,
const float alpha,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void MulScalar<float, CPUContext>(
const int n,
const float alpha,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_SSE
sse::MulScalar<float>(n, alpha, y);
#else
......@@ -525,7 +547,8 @@ template <> void MulScalar<float, CPUContext>(
template <> void MulScalar<float16, CPUContext>(
const int n,
const float alpha,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......
......@@ -18,7 +18,7 @@ __global__ void _Set(
const int n,
const T alpha,
T* x) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
x[idx] = alpha;
}
}
......@@ -26,27 +26,31 @@ __global__ void _Set(
template <> void Set<float, CUDAContext>(
const int n,
const float alpha,
float* x) {
if (alpha == 0) {
CUDA_CHECK(cudaMemset(x, 0, sizeof(float) * n));
return;
float* x,
CUDAContext* ctx) {
if (alpha == 0.f) {
CUDA_CHECK(cudaMemsetAsync(x, 0,
sizeof(float) * n, ctx->cuda_stream()));
} else {
_Set<float>
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x);
}
_Set<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, x);
}
template <> void Set<int, CUDAContext>(
const int n,
const int alpha,
int* x) {
int* x,
CUDAContext* ctx) {
if (alpha == 0) {
CUDA_CHECK(cudaMemset(x, 0, sizeof(int) * n));
return;
CUDA_CHECK(cudaMemsetAsync(x, 0,
sizeof(int) * n, ctx->cuda_stream()));
} else {
_Set<int>
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x);
}
_Set<int>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, x);
}
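These launches now use the four-argument execution configuration <<<blocks, threads, shared_bytes, stream>>>, so the work lands on ctx->cuda_stream() instead of the legacy default stream, and zero-fills become cudaMemsetAsync on the same stream. A self-contained sketch of the pattern with a hypothetical FillAsync helper:

#include <cuda_runtime.h>

__global__ void FillKernel(int n, float v, float* x) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x) x[i] = v;   // grid-stride loop
}

void FillAsync(int n, float v, float* x, cudaStream_t stream) {
    if (v == 0.f) {
        cudaMemsetAsync(x, 0, sizeof(float) * n, stream);    // async, on `stream`
    } else {
        const int threads = 256;
        const int blocks = (n + threads - 1) / threads;
        FillKernel<<<blocks, threads, 0, stream>>>(n, v, x);  // 0 bytes shared memory
    }
}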
template <> void RandomUniform<uint32_t, CUDAContext>(
......@@ -89,7 +93,7 @@ __global__ void _Add(
const T* a,
const T* b,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = a[idx] + b[idx];
}
}
......@@ -98,10 +102,11 @@ template <> void Add<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y) {
float* y,
CUDAContext* ctx) {
_Add<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, a, b, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, a, b, y);
}
template <typename T>
......@@ -110,7 +115,7 @@ __global__ void _Sub(
const T* a,
const T* b,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = a[idx] - b[idx];
}
}
......@@ -119,10 +124,11 @@ template <> void Sub<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y) {
float* y,
CUDAContext* ctx) {
_Sub<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, a, b, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, a, b, y);
}
template <typename T>
......@@ -131,7 +137,7 @@ __global__ void _Mul(
const T* a,
const T* b,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = a[idx] * b[idx];
}
}
......@@ -140,10 +146,11 @@ template <> void Mul<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y) {
float* y,
CUDAContext* ctx) {
_Mul<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, a, b, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, a, b, y);
}
template <typename T>
......@@ -152,7 +159,7 @@ __global__ void _Div(
const T* a,
const T* b,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = a[idx] / b[idx];
}
}
......@@ -161,10 +168,11 @@ template <> void Div<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y) {
float* y,
CUDAContext* ctx) {
_Div<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, a, b, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, a, b, y);
}
template <typename T>
......@@ -173,7 +181,7 @@ __global__ void _Clip(
const T low,
const T high,
T* x) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
x[idx] = x[idx] > high ? high : x[idx];
x[idx] = x[idx] < low ? low : x[idx];
}
......@@ -183,10 +191,11 @@ template <> void Clip<float, CUDAContext>(
const int n,
const float low,
const float high,
float* x) {
float* x,
CUDAContext* ctx) {
_Clip<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, low, high, x);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, low, high, x);
}
template <typename T>
......@@ -194,7 +203,7 @@ __global__ void _Exp(
const int n,
const T* a,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = exp(a[idx]);
}
}
......@@ -202,10 +211,11 @@ __global__ void _Exp(
template <> void Exp<float, CUDAContext>(
int n,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Exp<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, x, y);
}
template <typename T>
......@@ -213,7 +223,7 @@ __global__ void _Log(
const int n,
const T* a,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = log(a[idx]);
}
}
......@@ -221,10 +231,11 @@ __global__ void _Log(
template <> void Log<float, CUDAContext>(
int n,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Log<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, x, y);
}
template <typename T>
......@@ -232,7 +243,7 @@ __global__ void _Square(
const int n,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = x[idx] * x[idx];
}
}
......@@ -240,10 +251,11 @@ __global__ void _Square(
template <> void Square<float, CUDAContext>(
int n,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Square<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, x, y);
}
template <typename T>
......@@ -251,7 +263,7 @@ __global__ void _Sqrt(
const int n,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = sqrt(x[idx]);
}
}
......@@ -259,10 +271,11 @@ __global__ void _Sqrt(
template <> void Sqrt<float, CUDAContext>(
int n,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Sqrt<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, x, y);
}
template <typename T>
......@@ -271,7 +284,7 @@ __global__ void _Pow(
const T alpha,
const T* a,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = pow(a[idx], alpha);
}
}
......@@ -280,10 +293,11 @@ template <> void Pow<float, CUDAContext>(
int n,
const float alpha,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Pow<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x, y);
}
template <typename T>
......@@ -292,7 +306,7 @@ __global__ void _Inv(
const float numerator,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] = numerator / x[idx];
}
}
......@@ -301,10 +315,11 @@ template <> void Inv<float, CUDAContext>(
const int n,
const float numerator,
const float* x,
float* y) {
float* y,
CUDAContext* ctx) {
_Inv<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, numerator, x, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, numerator, x, y);
}
/******************** Level-2 ********************/
......@@ -330,26 +345,27 @@ template <> void Scale<float, CUDAContext>(
ctx->cublas_handle(), n, &alpha, y, 1));
}
template <> float StridedDot<float, CUDAContext>(
template <> void StridedDot<float, CUDAContext>(
const int n,
const float* a,
const int incx,
const float* b,
const int incy,
float* y,
CUDAContext* ctx) {
float result;
CUBLAS_CHECK(cublasSdot_v2(ctx->cublas_handle(),
n, a, incx, b, incy, &result));
return result;
n, a, incx, b, incy, y));
}
template <> float Dot<float, CUDAContext>(
template <> void Dot<float, CUDAContext>(
int n,
const float* a,
const float* b,
float* y,
CUDAContext* ctx) {
return StridedDot<float, CUDAContext>(
n, a, 1, b, 1, ctx);
StridedDot<float, CUDAContext>(
n, a, 1, b, 1, y, ctx);
ctx->FinishDeviceCompution();
}
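Dot for CUDAContext now writes the result through the y pointer and ends with FinishDeviceCompution(), presumably so the host-visible scalar is complete before the caller reads it when the handle runs on a non-default stream. An illustrative fragment (hypothetical n, d_grad, and ctx; not a standalone translation unit, needs <cmath>):

// Gradient-norm style usage of the new signature: device inputs, host output.
float sumsq = 0.f;
math::Dot<float, CUDAContext>(n, d_grad, d_grad, &sumsq, ctx);
const float l2norm = std::sqrt(sumsq);   // safe to read once Dot has returned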
template <> float ASum<float, CUDAContext>(
......@@ -363,7 +379,7 @@ __global__ void _AddScalar(
const int n,
T alpha,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] += alpha;
}
}
......@@ -371,10 +387,11 @@ __global__ void _AddScalar(
template <> void AddScalar<float, CUDAContext>(
const int n,
const float alpha,
float* y) {
float* y,
CUDAContext* ctx) {
_AddScalar<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, y);
}
template <typename T>
......@@ -382,7 +399,7 @@ __global__ void _MulScalar(
const int n,
T alpha,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
y[idx] *= alpha;
}
}
......@@ -390,10 +407,11 @@ __global__ void _MulScalar(
template <> void MulScalar<float, CUDAContext>(
const int n,
const float alpha,
float* y) {
float* y,
CUDAContext* ctx) {
_MulScalar<float>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, alpha, y);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, y);
}
template <> void Axpy<float, CUDAContext>(
......@@ -427,7 +445,7 @@ template <> void RandomUniform<float, CUDAContext>(
ctx->curand_generator(), x, n));
float range = high - low;
if (range != 1.f) Scal<float, CUDAContext>(n, range, x, ctx);
if (low != 0.f) AddScalar<float, CUDAContext>(n, low, x);
if (low != 0.f) AddScalar<float, CUDAContext>(n, low, x, ctx);
}
/******************** Level-3 ********************/
......
......@@ -18,7 +18,7 @@ __global__ void _SetHalf(
const int n,
const T alpha,
T* x) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
x[idx] = alpha;
}
}
......@@ -26,16 +26,19 @@ __global__ void _SetHalf(
template <> void Set<float16, CUDAContext>(
const int n,
const float16 alpha,
float16* x) {
float16* x,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_SetHalf<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
dragon_cast<half2, float16>(alpha),
reinterpret_cast<half2*>(x));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
dragon_cast<half2, float16>(alpha),
reinterpret_cast<half2*>(x));
} else {
_SetHalf<float16>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n, alpha, x);
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n, alpha, x);
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -47,7 +50,7 @@ __global__ void _TypeFloat2Half(
const int n,
const float* a,
half* b) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
b[idx] = __float2half(a[idx]);
}
}
......@@ -64,8 +67,9 @@ template <> void RandomNormal<float16, CUDAContext>(
CURAND_CHECK(curandGenerateNormal(
ctx->curand_generator(), xf32, n, mu, sigma));
_TypeFloat2Half
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, xf32, reinterpret_cast<half*>(x));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
xf32, reinterpret_cast<half*>(x));
CUDAContext::Delete(xf32);
#else
CUDA_FP16_NOT_COMPILED;
......@@ -81,7 +85,7 @@ __global__ void _AddHalf(
const half* a,
const half* b,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(a[idx], b[idx]);
#endif
......@@ -94,7 +98,7 @@ __global__ void _AddHalf2(
const half2* a,
const half2* b,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(a[idx], b[idx]);
#endif
......@@ -106,20 +110,23 @@ template <> void Add<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_AddHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
} else {
_AddHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -133,7 +140,7 @@ __global__ void _SubHalf(
const half* a,
const half* b,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub(a[idx], b[idx]);
#endif
......@@ -146,7 +153,7 @@ __global__ void _SubHalf2(
const half2* a,
const half2* b,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub2(a[idx], b[idx]);
#endif
......@@ -158,20 +165,23 @@ template <> void Sub<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_SubHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
} else {
_SubHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -185,7 +195,7 @@ __global__ void _MulHalf(
const half* a,
const half* b,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], b[idx]);
#endif
......@@ -198,7 +208,7 @@ __global__ void _MulHalf2(
const half2* a,
const half2* b,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], b[idx]);
#endif
......@@ -210,20 +220,23 @@ template <> void Mul<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_MulHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
} else {
_MulHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> > (n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -237,7 +250,7 @@ __global__ void _DivHalf(
const half* a,
const half* b,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hdiv(a[idx], b[idx]);
#endif
......@@ -249,13 +262,15 @@ template <> void Div<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_DivHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -267,7 +282,7 @@ __global__ void _SquareHalf(
const int n,
const half* x,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(x[idx], x[idx]);
#endif
......@@ -279,7 +294,7 @@ __global__ void _SquareHalf2(
const int n,
const half2* x,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(x[idx], x[idx]);
#endif
......@@ -290,18 +305,21 @@ __global__ void _SquareHalf2(
template <> void Square<float16, CUDAContext>(
int n,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_SquareHalf2<half2>
<< < CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_SquareHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS >> > (n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -314,7 +332,7 @@ __global__ void _SqrtHalf(
int n,
const half* x,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = hsqrt(x[idx]);
#endif
......@@ -326,7 +344,7 @@ __global__ void _SqrtHalf2(
const int n,
const half2* x,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = h2sqrt(x[idx]);
#endif
......@@ -337,18 +355,21 @@ __global__ void _SqrtHalf2(
template <> void Sqrt<float16, CUDAContext>(
int n,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_SqrtHalf2<half2>
<< < CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_SqrtHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -362,7 +383,7 @@ __global__ void _PowHalf(
const float alpha,
const half* a,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], a[idx]);
#endif
......@@ -375,7 +396,7 @@ __global__ void _PowHalf2(
const float alpha,
const half2* a,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], a[idx]);
#endif
......@@ -387,19 +408,22 @@ template <> void Pow<float16, CUDAContext>(
int n,
const float alpha,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
CHECK(alpha == float(2)) << "fp16 only supports the power of 2";
if (n % 2 == 0) {
if ((n & 1) == 0) {
_PowHalf2<half2>
<< < CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
alpha, reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
alpha, reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_PowHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
alpha, reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
alpha, reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -413,7 +437,7 @@ __global__ void _InvHalf(
const half numerator,
const half* x,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(hrcp(x[idx]), numerator);
#endif
......@@ -426,7 +450,7 @@ __global__ void _InvHalf2(
const half2 numerator,
const half2* x,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(h2rcp(x[idx]), numerator);
#endif
......@@ -438,20 +462,23 @@ template <> void Inv<float16, CUDAContext>(
const int n,
const float numerator,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_InvHalf2<half2>
<< < CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
dragon_cast<half2, float>(numerator),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
dragon_cast<half2, float>(numerator),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_InvHalf<half>
<< < CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
dragon_cast<half, float>(numerator),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
dragon_cast<half, float>(numerator),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -482,27 +509,26 @@ template <> void Scale<float16, CUDAContext>(
const float16* x,
float16* y,
CUDAContext* ctx) {
CUDAContext::Copy<float16, CUDAContext, CUDAContext>(n, y, x);
ctx->Copy<float16, CUDAContext, CUDAContext>(n, y, x);
Scal<float16, CUDAContext>(n, alpha, y, ctx);
}
template <> float Dot<float16, CUDAContext>(
template <> void Dot<float16, CUDAContext>(
int n,
const float16* a,
const float16* b,
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
float16 result;
CUBLAS_CHECK(cublasDotEx(
ctx->cublas_handle(), n,
a, CUDA_R_16F, 1,
b, CUDA_R_16F, 1,
&result, CUDA_R_16F,
y, CUDA_R_16F,
CUDA_R_32F));
return dragon_cast<float, float16>(result);
ctx->FinishDeviceCompution();
#else
CUDA_FP16_NOT_COMPILED;
return 0.;
#endif
}
......@@ -512,7 +538,7 @@ __global__ void _AddScalarHalf(
const int n,
half alpha,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(y[idx], alpha);
#endif
......@@ -524,7 +550,7 @@ __global__ void _AddScalarHalf2(
const int n,
half2 alpha,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(y[idx], alpha);
#endif
......@@ -535,18 +561,21 @@ __global__ void _AddScalarHalf2(
template <> void AddScalar<float16, CUDAContext>(
const int n,
const float alpha,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_AddScalarHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_AddScalarHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -559,7 +588,7 @@ __global__ void _MulScalarHalf(
const int n,
half alpha,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(y[idx], alpha);
#endif
......@@ -571,7 +600,7 @@ __global__ void _MulScalarHalf2(
const int n,
half2 alpha,
half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
CUDA_1D_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(y[idx], alpha);
#endif
......@@ -582,18 +611,21 @@ __global__ void _MulScalarHalf2(
template <> void MulScalar<float16, CUDAContext>(
const int n,
const float alpha,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
if ((n & 1) == 0) {
_MulScalarHalf2<half2>
<< <CUDA_BLOCKS(n / 2), CUDA_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n >> 1,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_MulScalarHalf<half>
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -640,11 +672,12 @@ template <> void RandomUniform<float16, CUDAContext>(
CURAND_CHECK(curandGenerateUniform(
ctx->curand_generator(), xf32, n));
_TypeFloat2Half
<< <CUDA_BLOCKS(n), CUDA_THREADS >> >(
n, xf32, reinterpret_cast<half*>(x));
<< < CUDA_BLOCKS(n), CUDA_THREADS,
0, ctx->cuda_stream() >> >(n,
xf32, reinterpret_cast<half*>(x));
float range = high - low;
if (range != float(1)) Scal<float16, CUDAContext>(n, range, x, ctx);
if (low != float(0)) AddScalar<float16, CUDAContext>(n, low, x);
if (range != 1.f) Scal<float16, CUDAContext>(n, range, x, ctx);
if (low != 0.f) AddScalar<float16, CUDAContext>(n, low, x, ctx);
ctx->Delete(xf32);
#else
CUDA_FP16_NOT_COMPILED;
......
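The fp16 kernels above share one dispatch: when the element count is even, the buffers are reinterpreted as half2 pairs and a kernel of half the size uses the paired intrinsics (guarded by __CUDA_ARCH__ >= 530); otherwise a scalar half kernel runs. A self-contained sketch with a hypothetical scale op:

#include <cuda_fp16.h>

__global__ void ScaleHalf2(int m, float alpha, __half2* y) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m;
         i += blockDim.x * gridDim.x) {
#if __CUDA_ARCH__ >= 530
        y[i] = __hmul2(__float2half2_rn(alpha), y[i]);   // two fp16 lanes at once
#endif
    }
}

void ScaleFP16(int n, float alpha, __half* y, cudaStream_t stream) {
    if ((n & 1) == 0) {   // even count: view the buffer as n/2 half2 pairs
        const int m = n >> 1, threads = 256;
        ScaleHalf2<<<(m + threads - 1) / threads, threads, 0, stream>>>(
            m, alpha, reinterpret_cast<__half2*>(y));
    }
    // an odd count would fall back to a scalar __half kernel, as above
}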
......@@ -53,7 +53,8 @@ template<> void Elu<float, CPUContext>(
const int count,
const float alpha,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -68,7 +69,8 @@ template<> void EluGrad<float, CPUContext>(
const float alpha,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -89,7 +91,8 @@ template<> void PRelu<float, CPUContext>(
const string& data_format,
const float* x,
const float* w,
float* y) {
float* y,
CPUContext* ctx) {
if (channel_shared) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
......@@ -130,7 +133,8 @@ template<> void PReluGrad<float, CPUContext>(
const float* dy,
const float* x,
const float* w,
float* dx) {
float* dx,
CPUContext* ctx) {
if (channel_shared) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
......@@ -184,9 +188,10 @@ template<> void PReluWGrad<float, CPUContext>(
}
}
if (channel_shared) {
float w_sum = math::Dot<float, CPUContext>(
channels * dim, bcast_dw, multiplier, ctx);
math::AddScalar<float, CPUContext>(1, w_sum, dw);
float w_sum;
math::Dot<float, CPUContext>(channels * dim,
bcast_dw, multiplier, &w_sum, ctx);
math::AddScalar<float, CPUContext>(1, w_sum, dw, ctx);
} else {
if (data_format == "NCHW") {
math::Gemv<float, CPUContext>(
......@@ -208,7 +213,8 @@ template<> void Relu<float, CPUContext>(
const int count,
const float slope,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -221,7 +227,8 @@ template<> void Relu<float16, CPUContext>(
const int count,
const float slope,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -230,7 +237,8 @@ template<> void ReluGrad<float, CPUContext>(
const float slope,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -244,7 +252,8 @@ template<> void ReluGrad<float, CPUContext>(
template<> void SElu<float, CPUContext>(
const int count,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -258,7 +267,8 @@ template<> void SEluGrad<float, CPUContext>(
const int count,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -276,7 +286,8 @@ T _sigmoid(T x) { return T(1) / (T(1) + exp(-x)); }
template<> void Sigmoid<float, CPUContext>(
const int count,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -287,7 +298,8 @@ template<> void SigmoidGrad<float, CPUContext>(
const int count,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -310,7 +322,7 @@ template<> void Softmax<float, CPUContext>(
CPUContext* ctx) {
const int dim = count / outer_dim;
for (int i = 0; i < outer_dim; ++i) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, scale, x + i*dim);
for (int j = 0; j < classes; ++j) {
for (int k = 0; k < inner_dim; k++)
......@@ -322,13 +334,13 @@ template<> void Softmax<float, CPUContext>(
CblasNoTrans, CblasNoTrans,
classes, inner_dim, 1,
-1.0, sum_multiplier, scale, 1.0, y, ctx);
math::Exp<float, CPUContext>(dim, y, y);
math::Exp<float, CPUContext>(dim, y, y, ctx);
math::Gemv<float, CPUContext>(
CblasTrans, classes, inner_dim,
1.0, y, sum_multiplier,
0.0, scale, ctx);
for (int j = 0; j < classes; ++j) {
math::Div<float, CPUContext>(inner_dim, y, scale, y);
math::Div<float, CPUContext>(inner_dim, y, scale, y, ctx);
y += inner_dim;
}
}
......@@ -348,17 +360,16 @@ template<> void SoftmaxGrad<float, CPUContext>(
const int dim = count / outer_dim;
for (int i = 0; i < outer_dim; ++i) {
for (int k = 0; k < inner_dim; ++k)
scale[k] = math::StridedDot<float, CPUContext>(
classes,
dx + i * dim + k, inner_dim,
y + i*dim + k, inner_dim, ctx);
math::StridedDot<float, CPUContext>(classes,
dx + i * dim + k, inner_dim,
y + i * dim + k, inner_dim, scale + k, ctx);
math::Gemm<float, CPUContext>(
CblasNoTrans, CblasNoTrans,
classes, inner_dim, 1,
-1.0, sum_multiplier, scale,
1.0, dx + i * dim, ctx);
}
math::Mul<float, CPUContext>(count, dx, y, dx);
math::Mul<float, CPUContext>(count, dx, y, dx, ctx);
}
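For reference, the StridedDot / Gemm / Mul sequence above evaluates the softmax Jacobian per inner slice, with dx assumed pre-loaded with the incoming gradient: dx_i = y_i * (dy_i - sum_j dy_j * y_j). A plain reference implementation for a single row (inner_dim == 1), using hypothetical names:

#include <cstddef>
#include <vector>

std::vector<float> SoftmaxGradRow(const std::vector<float>& y,
                                  const std::vector<float>& dy) {
    float dot = 0.f;                                        // sum_j dy_j * y_j
    for (std::size_t j = 0; j < y.size(); ++j) dot += dy[j] * y[j];
    std::vector<float> dx(y.size());
    for (std::size_t i = 0; i < y.size(); ++i) dx[i] = y[i] * (dy[i] - dot);
    return dx;
}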
/******************** activation.tanh ********************/
......@@ -366,7 +377,8 @@ template<> void SoftmaxGrad<float, CPUContext>(
template<> void Tanh<float, CPUContext>(
const int count,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -379,7 +391,8 @@ template<> void TanhGrad<float, CPUContext>(
const int count,
const float* dy,
const float* y,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -467,7 +480,8 @@ template <> void Clip<float, CPUContext>(
const float high,
const float* x,
float* mask,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -484,7 +498,8 @@ template <> void Equal<float, CPUContext>(
const int count,
const float* a,
const float* b,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -497,7 +512,8 @@ template <> void Equal<float, CPUContext>(
template<> void AbsGrad<float, CPUContext>(
const int count,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -651,7 +667,8 @@ template<> void SmoothL1<float, CPUContext>(
const int count,
const float beta,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -667,7 +684,8 @@ template<> void SmoothL1Grad<float, CPUContext>(
const int count,
const float beta,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -686,7 +704,8 @@ template <> void SoftmaxCrossEntropy<float, CPUContext>(
const int count,
const float* prob,
const float* target,
float* loss) {
float* loss,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -834,6 +853,20 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CPUContext>(
losses, flags);
}
template <> void SparseSoftmaxCrossEntropy<float16, float, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
......@@ -851,6 +884,20 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CPUContext>(
losses, flags);
}
template <> void SparseSoftmaxCrossEntropy<float16, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <typename Tx, typename Ty>
void _SparseSoftmaxCrossEntropyGrad(
const int outer_dim,
......@@ -897,6 +944,20 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CPUContext>(
num_ignores, dx, flags);
}
template<> void SparseSoftmaxCrossEntropyGrad<float16, float, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const float* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
......@@ -914,6 +975,20 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CPUContext>(
num_ignores, dx, flags);
}
template<> void SparseSoftmaxCrossEntropyGrad<float16, int64_t, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
/******************** misc.astype ********************/
template <typename Ta, typename Tb>
......@@ -936,7 +1011,8 @@ void _TypeA2B_v2(const int count, const Ta* a, Tb* b) {
template <> void TypeA2B<type_a, type_b, CPUContext>( \
const int count, \
const type_a* a, \
type_b* b) { \
type_b* b, \
CPUContext* ctx) { \
_TypeA2B<type_a, type_b>(count, a, b); \
}
......@@ -944,7 +1020,8 @@ void _TypeA2B_v2(const int count, const Ta* a, Tb* b) {
template <> void TypeA2B<type_a, type_b, CPUContext>( \
const int count, \
const type_a* a, \
type_b* b) { \
type_b* b, \
CPUContext* ctx) { \
_TypeA2B_v2<type_a, type_b>(count, a, b); \
}
......@@ -952,13 +1029,15 @@ void _TypeA2B_v2(const int count, const Ta* a, Tb* b) {
template <> void TypeA2B<float16, type, CPUContext>( \
const int count, \
const float16* a, \
type* b) { \
type* b, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
} \
template <> void TypeA2B<type, float16, CPUContext>( \
const int count, \
const type* a, \
float16* b) { \
float16* b, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
}
......@@ -1039,7 +1118,8 @@ template <> void ImageData<float, float, CPUContext>(
const float* std_values,
const string& data_format,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_ImageData_NCHW<float, float>(
N, C, H, W, mean_values, std_values, x, y);
......@@ -1059,7 +1139,8 @@ template <> void ImageData<uint8_t, float, CPUContext>(
const float* std_values,
const string& data_format,
const uint8_t* x,
float* y) {
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_ImageData_NCHW<uint8_t, float>(
N, C, H, W, mean_values, std_values, x, y);
......@@ -1079,7 +1160,8 @@ template <> void ImageData<float, float16, CPUContext>(
const float* std_values,
const string& data_format,
const float* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -1093,7 +1175,8 @@ template <> void ImageData<uint8_t, float16, CPUContext>(
const float* std_values,
const string& data_format,
const uint8_t* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -1103,7 +1186,8 @@ template<> void Arange<float, CPUContext>(
const int count,
const int start,
const int step,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1114,7 +1198,8 @@ template<> void Arange<int, CPUContext>(
const int count,
const int start,
const int step,
int* y) {
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1130,7 +1215,8 @@ template<> void Argmax<float, CPUContext>(
const int top_k,
const float* x,
int64_t* indices,
float* values) {
float* values,
CPUContext* ctx) {
vector<pair<float, int> > vec(axis_dim);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
......@@ -1158,7 +1244,8 @@ template<> void Argmin<float, CPUContext>(
const int top_k,
const float* x,
int64_t* indices,
float* values) {
float* values,
CPUContext* ctx) {
vector<pair<float, int> > vec(axis_dim);
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
......@@ -1182,7 +1269,8 @@ template<> void Argmin<float, CPUContext>(
template <> void CanonicalAxis<int, CPUContext>(
const int count,
const int dim,
int* y) {
int* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1198,7 +1286,8 @@ void _Gather(
const int y_slice_dim,
const int* indices,
const T* x,
T* y) {
T* y,
CPUContext* ctx) {
TIndex x_offset, y_offset, x_idx_offset, y_idx_offset;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
......@@ -1206,7 +1295,7 @@ void _Gather(
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
CPUContext::Copy<T, CPUContext, CPUContext>(
ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, y + y_offset, x + x_offset);
}
}
......@@ -1220,9 +1309,10 @@ template <> void Gather<float, CPUContext>(
const int y_slice_dim,
const int* indices,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
_Gather<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y);
x_slice_dim, y_slice_dim, indices, x, y, ctx);
}
template <> void Gather<int, CPUContext>(
......@@ -1233,9 +1323,10 @@ template <> void Gather<int, CPUContext>(
const int y_slice_dim,
const int* indices,
const int* x,
int* y) {
int* y,
CPUContext* ctx) {
_Gather<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, x, y);
x_slice_dim, y_slice_dim, indices, x, y, ctx);
}
template <typename T>
......@@ -1247,7 +1338,8 @@ void _GatherGrad(
const int y_slice_dim,
const int* indices,
const T* dy,
T* dx) {
T* dx,
CPUContext* ctx) {
TIndex x_offset, y_offset, x_idx_offset, y_idx_offset;
for (int i = 0; i < y_slice_dim; ++i) {
y_idx_offset = i;
......@@ -1256,7 +1348,7 @@ void _GatherGrad(
x_offset = (n * x_slice_dim + x_idx_offset) * inner_dim;
y_offset = (n * y_slice_dim + y_idx_offset) * inner_dim;
math::Add<T, CPUContext>(inner_dim,
dy + y_offset, dx + x_offset, dx + x_offset);
dy + y_offset, dx + x_offset, dx + x_offset, ctx);
}
}
}
......@@ -1269,9 +1361,10 @@ template <> void GatherGrad<float, CPUContext>(
const int y_slice_dim,
const int* indices,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
_GatherGrad<float>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dy, dx);
x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
}
template <> void GatherGrad<int, CPUContext>(
......@@ -1282,9 +1375,10 @@ template <> void GatherGrad<int, CPUContext>(
const int y_slice_dim,
const int* indices,
const int* dy,
int* dx) {
int* dx,
CPUContext* ctx) {
_GatherGrad<int>(count, outer_dim, inner_dim,
x_slice_dim, y_slice_dim, indices, dy, dx);
x_slice_dim, y_slice_dim, indices, dy, dx, ctx);
}
/******************** ndarray.concat ********************/
......@@ -1297,12 +1391,13 @@ template <> void Concat<float, CPUContext>(
const int y_concat_dim,
const int concat_offset,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = n * x_concat_dim * inner_dim;
y_offset = (n * y_concat_dim + concat_offset) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
x_concat_dim * inner_dim, y + y_offset, x + x_offset);
}
}
......@@ -1315,12 +1410,13 @@ template <> void Concat<float16, CPUContext>(
const int y_concat_dim,
const int concat_offset,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = n * x_concat_dim * inner_dim;
y_offset = (n * y_concat_dim + concat_offset) * inner_dim;
CPUContext::Copy<float16, CPUContext, CPUContext>(
ctx->Copy<float16, CPUContext, CPUContext>(
x_concat_dim * inner_dim, y + y_offset, x + x_offset);
}
}
......@@ -1333,12 +1429,13 @@ template <> void ConcatGrad<float, CPUContext>(
const int y_concat_dim,
const int concat_offset,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = n * x_concat_dim * inner_dim;
y_offset = (n * y_concat_dim + concat_offset) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
x_concat_dim * inner_dim, dx + x_offset, dy + y_offset);
}
}
......@@ -1351,12 +1448,13 @@ template <> void ConcatGrad<float16, CPUContext>(
const int y_concat_dim,
const int concat_offset,
const float16* dy,
float16* dx) {
float16* dx,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = n * x_concat_dim * inner_dim;
y_offset = (n * y_concat_dim + concat_offset) * inner_dim;
CPUContext::Copy<float16, CPUContext, CPUContext>(
ctx->Copy<float16, CPUContext, CPUContext>(
x_concat_dim * inner_dim, dx + x_offset, dy + y_offset);
}
}
......@@ -1371,7 +1469,8 @@ void _Crop1D(
const int inner_dim,
const int start,
const T* x,
T* y) {
T* y,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1381,7 +1480,7 @@ void _Crop1D(
const int o = idx / ex_dim;
const T* x_ptr = x + (o * dim + ex_d + start) * inner_dim;
T* y_ptr = y + (o * ex_dim + ex_d) * inner_dim;
CPUContext::Copy<T, CPUContext, CPUContext>(
ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, y_ptr, x_ptr);
}
}
......@@ -1393,8 +1492,10 @@ template<> void Crop1D<int, CPUContext>(
const int inner_dim,
const int start,
const int* x,
int* y) {
_Crop1D<int>(count, dim, ex_dim, inner_dim, start, x, y);
int* y,
CPUContext* ctx) {
_Crop1D<int>(count, dim, ex_dim,
inner_dim, start, x, y, ctx);
}
template<> void Crop1D<float, CPUContext>(
......@@ -1404,8 +1505,10 @@ template<> void Crop1D<float, CPUContext>(
const int inner_dim,
const int start,
const float* x,
float* y) {
_Crop1D<float>(count, dim, ex_dim, inner_dim, start, x, y);
float* y,
CPUContext* ctx) {
_Crop1D<float>(count, dim, ex_dim,
inner_dim, start, x, y, ctx);
}
template <typename T>
......@@ -1417,7 +1520,8 @@ void _Crop1DGrad(
const int start,
const int end,
const T* dy,
T* dx) {
T* dx,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1430,7 +1534,7 @@ void _Crop1DGrad(
for (int i = 0; i < inner_dim; ++i) dx_ptr[i] = 0;
} else {
const T* dy_ptr = dy + (o * ex_dim + d - start) * inner_dim;
CPUContext::Copy<T, CPUContext, CPUContext>(
ctx->Copy<T, CPUContext, CPUContext>(
inner_dim, dx_ptr, dy_ptr);
}
}
......@@ -1444,10 +1548,11 @@ template<> void Crop1DGrad<int, CPUContext>(
const int start,
const int end,
const int* dy,
int* dx) {
int* dx,
CPUContext* ctx) {
_Crop1DGrad<int>(
count, dim, ex_dim, inner_dim,
start, end, dy, dx);
start, end, dy, dx, ctx);
}
template<> void Crop1DGrad<float, CPUContext>(
......@@ -1458,10 +1563,11 @@ template<> void Crop1DGrad<float, CPUContext>(
const int start,
const int end,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
_Crop1DGrad<float>(
count, dim, ex_dim, inner_dim,
start, end, dy, dx);
start, end, dy, dx, ctx);
}
/******************** ndarray.pad ********************/
......@@ -1474,7 +1580,8 @@ template <> void ConstPad1D<float, CPUContext>(
const int pad_l,
const float value,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1488,7 +1595,7 @@ template <> void ConstPad1D<float, CPUContext>(
for (int i = 0; i < inner_dim; ++i) y_ptr[i] = value;
} else {
const float* x_ptr = x + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, y_ptr, x_ptr);
}
}
......@@ -1501,7 +1608,8 @@ template <> void ReflectPad1D<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1518,7 +1626,7 @@ template <> void ReflectPad1D<float, CPUContext>(
y_ptr[i] = x[(o * dim + d) * inner_dim + i];
} else {
const float* x_ptr = x + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, y_ptr, x_ptr);
}
}
......@@ -1531,7 +1639,8 @@ template <> void EdgePad1D<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1546,7 +1655,7 @@ template <> void EdgePad1D<float, CPUContext>(
y_ptr[i] = x[(o * dim + d) * inner_dim + i];
} else {
const float* x_ptr = x + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, y_ptr, x_ptr);
}
}
......@@ -1559,7 +1668,8 @@ template <> void ConstPad1DGrad<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count_v2))
......@@ -1570,7 +1680,7 @@ template <> void ConstPad1DGrad<float, CPUContext>(
const int ex_d = d + pad_l;
const float* dy_ptr = dy + (o * ex_dim + ex_d) * inner_dim;
float* dx_ptr = dx + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, dx_ptr, dy_ptr);
}
}
......@@ -1582,7 +1692,8 @@ template <> void ReflectPad1DGrad<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
for (int idx = 0; idx < count; ++idx) {
const int i = idx % inner_dim;
const int ex_d = (idx / inner_dim) % ex_dim;
......@@ -1601,7 +1712,8 @@ template <> void EdgePad1DGrad<float, CPUContext>(
const int inner_dim,
const int pad_l,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
const int count_v2 = count / inner_dim;
for (int idx = 0; idx < count_v2; ++idx) {
const int ex_d = idx % ex_dim;
......@@ -1613,7 +1725,7 @@ template <> void EdgePad1DGrad<float, CPUContext>(
dx[(o * dim + d) * inner_dim + i] += dy_ptr[i];
} else {
float* dx_ptr = dx + (o * dim + d) * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, dx_ptr, dy_ptr);
}
}
......@@ -1626,7 +1738,8 @@ template <> void OneHot<float, CPUContext>(
const int depth,
const int on_value,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1643,7 +1756,8 @@ template<> void Sum<float, CPUContext>(
const int axis_dim,
const int inner_dim,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1662,7 +1776,8 @@ template<> void SumGrad<float, CPUContext>(
const int inner_dim,
const float coeff,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1682,14 +1797,15 @@ template <> void Repeat<float, CPUContext>(
const int inner_dim,
const int repeats,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < dim; ++j) {
for (int k = 0; k < repeats; ++k) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, y, x);
y += inner_dim;
}
......@@ -1709,7 +1825,7 @@ template <> void RepeatGrad<float, CPUContext>(
CPUContext* ctx) {
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < dim; ++j) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
inner_dim, dx, dy);
dy += inner_dim;
for (int k = 1; k < repeats; ++k) {
......@@ -1732,12 +1848,13 @@ template <> void Slice<float, CPUContext>(
const int y_slice_dim,
const int slice_offset,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + slice_offset) * inner_dim;
y_offset = n * y_slice_dim * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
y_slice_dim * inner_dim, y + y_offset, x + x_offset);
}
}
......@@ -1750,12 +1867,13 @@ template <> void SliceGrad<float, CPUContext>(
const int y_slice_dim,
const int slice_offset,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
TIndex x_offset, y_offset;
for (int n = 0; n < outer_dim; ++n) {
x_offset = (n * x_slice_dim + slice_offset) * inner_dim;
y_offset = n * y_slice_dim * inner_dim;
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
y_slice_dim * inner_dim, dx + x_offset, dy + y_offset);
}
}
......@@ -1768,10 +1886,11 @@ template <> void Tile<float, CPUContext>(
const int ex_inner_dim,
const int multiple,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
for (int i = 0; i < outer_dim; ++i) {
for (int t = 0; t < multiple; ++t) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
ex_inner_dim, y, x);
y += ex_inner_dim;
}
......@@ -1788,7 +1907,7 @@ template <> void TileGrad<float, CPUContext>(
float* dx,
CPUContext* ctx) {
for (int i = 0; i < outer_dim; ++i) {
CPUContext::Copy<float, CPUContext, CPUContext>(
ctx->Copy<float, CPUContext, CPUContext>(
ex_inner_dim, dx, dy);
dy += ex_inner_dim;
for (int t = 1; t < multiple; ++t) {
......@@ -1809,7 +1928,8 @@ template <> void Transpose<float, CPUContext>(
const int* old_steps,
const int* new_steps,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1831,7 +1951,8 @@ template <> void Transpose<float16, CPUContext>(
const int* old_steps,
const int* new_steps,
const float16* x,
float16* y) {
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -1842,7 +1963,8 @@ template <> void TransposeGrad<float, CPUContext>(
const int* old_steps,
const int* new_steps,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
......@@ -1864,7 +1986,8 @@ template <> void TransposeGrad<float16, CPUContext>(
const int* old_steps,
const int* new_steps,
const float16* dy,
float16* dx) {
float16* dx,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -1877,7 +2000,8 @@ template <> void LSTMCell<float, CPUContext>(
const float* cx,
float* xact,
float* c,
float* h) {
float* h,
CPUContext* ctx) {
float i, f, o, c_;
int f_offset = C, o_offset = 2 * C, c_offset = 3 * C, x_offset = 4 * C;
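// The gate pre-activations are packed along the channel axis as [i | f | o | g],
// each of width C; the offsets below index that layout (x_offset = 4 * C is
// presumably the per-sample stride of xact).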
for (int n = 0; n < N; ++n) {
......@@ -1903,7 +2027,8 @@ template <> void LSTMCellGrad<float, CPUContext>(
const float* dc,
const float* dh,
float* dcx,
float* dx) {
float* dx,
CPUContext* ctx) {
float i, f, o, g, tanh_c, dcx_sum_term;
int f_offset = C,
o_offset = 2 * C,
......@@ -1964,7 +2089,8 @@ template <> void AdamUpdate<float, CPUContext>(
const float eps,
float* g,
float* m,
float* v) {
float* v,
CPUContext* ctx) {
_AdamUpdate<float>(count, lr, beta1, beta2, eps, g, m, v);
}
......@@ -1976,7 +2102,8 @@ template <> void AdamUpdate<float16, CPUContext>(
const float eps,
float16* g,
float16* m,
float16* v) {
float16* v,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -2004,7 +2131,8 @@ template <> void NesterovUpdate<float, CPUContext>(
const float lr,
const float momentum,
float* g,
float* h) {
float* h,
CPUContext* ctx) {
_NesterovUpdate<float>(count, lr, momentum, g, h);
}
......@@ -2013,7 +2141,8 @@ template <> void NesterovUpdate<float16, CPUContext>(
const float lr,
const float momentum,
float16* g,
float16* h) {
float16* h,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -2043,7 +2172,8 @@ template <> void RMSPropUpdate<float, CPUContext>(
const float decay,
const float eps,
float* g,
float* h) {
float* h,
CPUContext* ctx) {
_RMSPropUpdate<float>(count, lr, decay, eps, g, h);
}
......@@ -2053,7 +2183,8 @@ template <> void RMSPropUpdate<float16, CPUContext>(
const float decay,
const float eps,
float16* g,
float16* h) {
float16* h,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -2080,7 +2211,8 @@ template <> void SGDUpdate<float, CPUContext>(
const float lr,
const float momentum,
float* g,
float* h) {
float* h,
CPUContext* ctx) {
_SGDUpdate<float>(count, lr, momentum, g, h);
}
......@@ -2089,7 +2221,8 @@ template <> void SGDUpdate<float16, CPUContext>(
const float lr,
const float momentum,
float16* g,
float16* h) {
float16* h,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
......@@ -2217,7 +2350,8 @@ template <> void BilinearResize<float, CPUContext>(
const int out_w,
const string& data_format,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const float scale_h = (float)H / out_h;
const float scale_w = (float)W / out_w;
if (data_format == "NCHW") {
......@@ -2326,10 +2460,10 @@ template <> void BilinearResizeGrad<float, CPUContext>(
const int out_w,
const string& data_format,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
const float scale_h = (float)H / out_h;
const float scale_w = (float)W / out_w;
math::Set<float, CPUContext>(N * C * H * W, 0, dx);
if (data_format == "NCHW") {
_BilinearResizeGrad_NCHW<float>(
N, C, H, W, out_h, out_w,
......@@ -2439,7 +2573,8 @@ template <> void Im2Col2d<float, CPUContext>(
const int dilation_w,
const string& data_format,
const float* im,
float* col) {
float* col,
CPUContext* ctx) {
if (data_format == "NCHW") {
const int count = (C * col_h * col_w);
_Im2Col2d_NCHW<float>(
......@@ -2471,8 +2606,9 @@ void _Col2Im2d_NCHW(
const int dilation_h,
const int dilation_w,
const T* col,
T* im) {
math::Set<float, CPUContext>(C * H * W, 0, im);
T* im,
CPUContext* ctx) {
math::Set<float, CPUContext>(C * H * W, 0, im, ctx);
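// Zero the output image first; col2im then scatter-adds every column entry
// back to its source pixel, so the fill has to happen before the accumulation
// loops below.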
const int im_offset = H * W;
for (int c = 0; c < C; ++c, im += im_offset) {
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -2512,8 +2648,9 @@ void _Col2Im2d_NHWC(
const int dilation_h,
const int dilation_w,
const T* col,
T* im) {
math::Set<float, CPUContext>(C * H * W, 0, im);
T* im,
CPUContext* ctx) {
math::Set<float, CPUContext>(C * H * W, 0, im, ctx);
for (int output_h = 0; output_h < col_h; ++output_h) {
const int base_h = -pad_h + stride_h * output_h;
for (int output_w = 0; output_w < col_w; ++output_w) {
......@@ -2552,19 +2689,20 @@ template<> void Col2Im2d<float, CPUContext>(
const int dilation_w,
const string& data_format,
const float* col,
float* im) {
float* im,
CPUContext* ctx) {
if (data_format == "NCHW") {
const int count = (C * H * W);
_Col2Im2d_NCHW<float>(
C, H, W, col_h, col_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, col, im);
dilation_h, dilation_w, col, im, ctx);
} else if (data_format == "NHWC") {
const int count = (H * W * C);
_Col2Im2d_NHWC<float>(
C, H, W, col_h, col_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w,
dilation_h, dilation_w, col, im);
dilation_h, dilation_w, col, im, ctx);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
......@@ -2632,7 +2770,8 @@ template <> void NNResize<float, CPUContext>(
const int out_w,
const string& data_format,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
const float scale_h = (float)H / out_h;
const float scale_w = (float)W / out_w;
if (data_format == "NCHW") {
......@@ -2708,10 +2847,10 @@ template <> void NNResizeGrad<float, CPUContext>(
const int out_w,
const string& data_format,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
const float scale_h = (float)H / out_h;
const float scale_w = (float)W / out_w;
math::Set<float, CPUContext>(N * C * H * W, 0, dx);
if (data_format == "NCHW") {
_NNResizeGrad_NCHW<float>(
N, C, H, W, out_h, out_w,
......@@ -2847,7 +2986,8 @@ template<> void MAXPooling2d<float, CPUContext>(
const string& data_format,
const float* x,
int* mask,
float* y) {
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_MAXPooling2d_NCHW<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
......@@ -2966,7 +3106,8 @@ template<> void AVGPooling2d<float, CPUContext>(
const int pad_w,
const string& data_format,
const float* x,
float* y) {
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_AVGPooling2d_NCHW<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
......@@ -2994,10 +3135,11 @@ void _MAXPooling2dGrad_NCHW(
const int pad_w,
const float* dy,
const int* mask,
float* dx) {
float* dx,
CPUContext* ctx) {
int x_offset = H * W;
int y_offset = pool_h * pool_w;
math::Set<float, CPUContext>(N * C * H * W, 0, dx);
math::Set<float, CPUContext>(N * C * H * W, 0, dx, ctx);
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
for (int ph = 0; ph < pool_h; ++ph) {
......@@ -3030,10 +3172,11 @@ void _MAXPooling2dGrad_NHWC(
const int pad_w,
const float* dy,
const int* mask,
float* dx) {
float* dx,
CPUContext* ctx) {
int x_offset = H * W * C;
int y_offset = pool_h * pool_w * C;
math::Set<float, CPUContext>(N * H * W * C, 0, dx);
math::Set<float, CPUContext>(N * H * W * C, 0, dx, ctx);
for (int n = 0; n < N; ++n) {
for (int ph = 0; ph < pool_h; ph++) {
for (int pw = 0; pw < pool_w; ++pw) {
......@@ -3067,15 +3210,16 @@ template<> void MAXPooling2dGrad<float, CPUContext>(
const string& data_format,
const float* dy,
const int* mask,
float* dx) {
float* dx,
CPUContext* ctx) {
if (data_format == "NCHW") {
_MAXPooling2dGrad_NCHW<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dy, mask, dx);
stride_h, stride_w, pad_h, pad_w, dy, mask, dx, ctx);
} else if (data_format == "NHWC") {
_MAXPooling2dGrad_NHWC<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dy, mask, dx);
stride_h, stride_w, pad_h, pad_w, dy, mask, dx, ctx);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
......@@ -3094,10 +3238,11 @@ void _AVGPooling2dGrad_NCHW(
const int pad_h,
const int pad_w,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
int x_offset = H * W;
int y_offset = pool_h * pool_w;
math::Set<float, CPUContext>(N * C * H * W, 0, dx);
math::Set<float, CPUContext>(N * C * H * W, 0, dx, ctx);
for (int n = 0; n < N; ++n) {
for (int c = 0; c < C; ++c) {
for (int ph = 0; ph < pool_h; ++ph) {
......@@ -3141,10 +3286,11 @@ void _AVGPooling2dGrad_NHWC(
const int pad_h,
const int pad_w,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
int x_offset = H * W * C;
int y_offset = pool_h * pool_w * C;
math::Set<float, CPUContext>(N * H * W * C, 0, dx);
math::Set<float, CPUContext>(N * H * W * C, 0, dx, ctx);
for (int n = 0; n < N; ++n) {
for (int ph = 0; ph < pool_h; ph++) {
for (int pw = 0; pw < pool_w; ++pw) {
......@@ -3187,15 +3333,16 @@ template<> void AVGPooling2dGrad<float, CPUContext>(
const int pad_w,
const string& data_format,
const float* dy,
float* dx) {
float* dx,
CPUContext* ctx) {
if (data_format == "NCHW") {
_AVGPooling2dGrad_NCHW<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dy, dx);
stride_h, stride_w, pad_h, pad_w, dy, dx, ctx);
} else if (data_format == "NHWC") {
_AVGPooling2dGrad_NHWC<float>(
N, C, H, W, pool_h, pool_w, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dy, dx);
stride_h, stride_w, pad_h, pad_w, dy, dx, ctx);
} else LOG(FATAL) << "Unknown data format: " << data_format;
}
......@@ -3214,12 +3361,11 @@ template<> void ROIPooling<float, CPUContext>(
const float* x,
const float* rois,
int* mask,
float* y) {
float* y,
CPUContext* ctx) {
const TIndex x_offset = H * W,
y_offset = pool_h * pool_w,
im_offset = C * H * W;
math::Set<float, CPUContext>(count, -FLT_MAX, y);
math::Set<int, CPUContext>(count, -1, mask);
for (int n = 0; n < num_rois; ++n) {
int im_idx = rois[0];
int x1 = round(rois[1] * spatial_scale);
......@@ -3248,10 +3394,10 @@ template<> void ROIPooling<float, CPUContext>(
end_w = std::min(end_w, W);
bool is_empty = (end_h == start_h) || (end_w == start_w);
const int pool_idx = ph * pool_w + pw;
if (is_empty) {
y[pool_idx] = 0;
mask[pool_idx] = -1;
}
if (is_empty || im_idx < 0) y[pool_idx] = 0;
else y[pool_idx] = -FLT_MAX;
mask[pool_idx] = -1;
if (im_idx < 0) continue;
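// Each output bin is now initialized inline (replacing the removed global
// fills): empty bins or a negative batch index yield 0, valid bins start from
// -FLT_MAX before the max over the pooling window, and mask starts at -1.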
for (int h = start_h; h < end_h; ++h) {
for (int w = start_w; w < end_w; ++w) {
const int idx = h * W + w;
......@@ -3286,7 +3432,8 @@ template<> void ROIPoolingGrad<float, CPUContext>(
const float* dy,
const float* rois,
const int* mask,
float* dx) {
float* dx,
CPUContext* ctx) {
NOT_IMPLEMENTED;
}
......@@ -3305,7 +3452,8 @@ template<> void ROIAlign<float, CPUContext>(
const int sampling_ratio,
const float* x,
const float* rois,
float* y) {
float* y,
CPUContext* ctx) {
NOT_IMPLEMENTED;
}
......@@ -3322,7 +3470,8 @@ template<> void ROIAlignGrad<float, CPUContext>(
const int sampling_ratio,
const float* dy,
const float* rois,
float* dx) {
float* dx,
CPUContext* ctx) {
NOT_IMPLEMENTED;
}
......
This diff could not be displayed because it is too large.
......@@ -23,7 +23,7 @@ __global__ void _ReluHalf(
const half* x,
half* y) {
const half kZero = __float2half(0.f);
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hgt(x[idx], kZero) ?
x[idx] : __hmul(x[idx], slope);
......@@ -38,7 +38,7 @@ __global__ void _ReluHalf2(
const half2* x,
half2* y) {
const half2 kZero = __float2half2_rn(0.f);
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hbgt2(x[idx], kZero) ?
x[idx] : __hmul2(x[idx], slope);
......@@ -51,20 +51,23 @@ template<> void Relu<float16, CUDAContext>(
const int count,
const float slope,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
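// When count is even, the half2 kernel packs two fp16 values per element,
// halving the launch size; odd counts fall back to the scalar half kernel.
// Both launches are queued on ctx->cuda_stream() instead of the legacy default
// stream (dragon_cast<half2, float>(slope) presumably broadcasts the scalar
// slope into both lanes).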
if (count % 2 == 0) {
if ((count & 1) == 0) {
_ReluHalf2<half2>
<< < CUDA_BLOCKS(count), CUDA_THREADS >> > (count / 2,
dragon_cast<half2, float>(slope),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> > (count >> 1,
dragon_cast<half2, float>(slope),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_ReluHalf<half>
<< < CUDA_BLOCKS(count), CUDA_THREADS >> >(count,
dragon_cast<half, float>(slope),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(slope),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -82,7 +85,7 @@ __global__ void _AffineWithOBiasHalf(
const half* x,
const half* alpha,
half* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hmul(alpha[scale_idx], x[idx]);
......@@ -99,7 +102,7 @@ __global__ void _AffineWithBiasHalf(
const half* alpha,
const half* beta,
half* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hadd(
......@@ -125,25 +128,184 @@ template<> void Affine<float16, CUDAContext>(
#ifdef WITH_CUDA_FP16
if (beta != nullptr) {
_AffineWithBiasHalf<float>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<const half*>(beta),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<const half*>(beta),
reinterpret_cast<half*>(y));
} else {
_AffineWithOBiasHalf<float>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<half*>(y));
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** loss.sparse_softmax_cross_entropy ********************/
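// The fp16 kernels below compute the per-position negative log-likelihood in
// half precision on the context's stream: losses[idx] receives
// -log(max(prob[label], HFLT_MIN)) and flags[idx] is 1 for counted positions
// or 0 for ignored labels, presumably so the caller can normalize by the
// number of valid positions (e.g. loss = sum(losses) / max(sum(flags), 1),
// a normalization sketch not shown in this diff).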
template <typename Ty>
__global__ void _SparseSoftmaxCrossEntropyHalf(
const int count,
const int axis_dim,
const int inner_dim,
const half* prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int oix = idx / inner_dim;
const int iix = idx % inner_dim;
const int label = labels[oix * inner_dim + iix];
int k;
for (k = 0; k < num_ignores; k++) {
if (label == ignores[k]) {
losses[idx] = flags[idx] = 0;
break;
}
}
if (k == num_ignores) {
const half kMIN = __float2half(HFLT_MIN);
half loss = __hneg(
hlog(
__hgt(prob[(oix * axis_dim + label)
* inner_dim + iix], kMIN) ?
prob[(oix * axis_dim + label)
* inner_dim + iix] : kMIN
)
);
losses[idx] = __half2float(loss);
flags[idx] = 1;
}
#endif
}
}
template <> void SparseSoftmaxCrossEntropy<float16, float, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const float* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyHalf<float>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores, losses, flags);
}
template <> void SparseSoftmaxCrossEntropy<float16, int64_t, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float* losses,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyHalf<int64_t>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores, losses, flags);
}
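// The gradient kernels below appear to assume dx has been pre-filled with the
// softmax probabilities: subtracting 1 at the target index yields the usual
// prob - one_hot(label) gradient, while ignored labels zero their whole class
// column and set flags[idx] = 0.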
template <typename Ty>
__global__ void _SparseSoftmaxCrossEntropyGradHalf(
const int count,
const int axis_dim,
const int inner_dim,
const half* prob,
const Ty* labels,
const int* ignores,
const int num_ignores,
half* dx,
float* flags) {
CUDA_1D_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int oix = idx / inner_dim;
const int iix = idx % inner_dim;
const int label = labels[oix * inner_dim + iix];
int k;
for (k = 0; k < num_ignores; k++)
if (label == ignores[k]) break;
if (k != num_ignores) {
for (int c = 0; c < axis_dim; c++)
dx[(oix * axis_dim + c) * inner_dim + iix]
= __float2half(0.f);
flags[idx] = 0;
} else {
const int x_idx = (oix * axis_dim + label) * inner_dim + iix;
dx[x_idx] = __hsub(dx[x_idx], __float2half(1.f));
flags[idx] = 1;
}
#endif
}
}
template<> void SparseSoftmaxCrossEntropyGrad<float16, float, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const float* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGradHalf<float>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores,
reinterpret_cast<half*>(dx), flags);
}
template<> void SparseSoftmaxCrossEntropyGrad<float16, int64_t, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* prob,
const int64_t* labels,
const int* ignores,
const int num_ignores,
float16* dx,
float* flags,
CUDAContext* ctx) {
const int num_preds = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGradHalf<int64_t>
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
num_preds, axis_dim, inner_dim,
reinterpret_cast<const half*>(prob), labels,
ignores, num_ignores,
reinterpret_cast<half*>(dx), flags);
}
/******************** misc.astype ********************/
#ifdef WITH_CUDA_FP16
......@@ -151,7 +313,7 @@ __global__ void _TypeHalf2Float(
const int count,
const half* a,
float* b) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
b[idx] = __half2float(a[idx]);
}
}
......@@ -159,7 +321,7 @@ __global__ void _TypeFloat2Half(
const int count,
const float* a,
half* b) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
b[idx] = __float2half(a[idx]);
}
}
......@@ -168,7 +330,7 @@ __global__ void _TypeHalf2Half(
const int count,
const half* a,
half* b) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
b[idx] = a[idx];
}
}
......@@ -178,14 +340,16 @@ __global__ void _TypeHalf2Half(
template <> void TypeA2B<float16, type, CUDAContext>( \
const int count, \
const float16* a, \
type* b) { \
type* b, \
CUDAContext* ctx) { \
LOG(FATAL) << "CUDAContext has not implemented: float16 -> " \
<< TypeMetaToString(TypeMeta::Make<type>()); \
} \
template <> void TypeA2B<type, float16, CUDAContext>( \
const int count, \
const type* a, \
float16* b) { \
float16* b, \
CUDAContext* ctx) { \
LOG(FATAL) << "CUDAContext has not implemented: " \
<< TypeMetaToString(TypeMeta::Make<type>()) << " -> float16"; \
}
......@@ -194,29 +358,35 @@ __global__ void _TypeHalf2Half(
template <> void TypeA2B<float16, float, CUDAContext>( \
const int count, \
const float16* a, \
float* b) { \
float* b, \
CUDAContext* ctx) { \
_TypeHalf2Float \
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >( \
count, reinterpret_cast<const half*>(a), b); \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >(count, \
reinterpret_cast<const half*>(a), b); \
} \
template <> void TypeA2B<float, float16, CUDAContext>( \
const int count, \
const float* a, \
float16* b) { \
float16* b, \
CUDAContext* ctx) { \
_TypeFloat2Half \
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >( \
count, a, reinterpret_cast<half*>(b)); \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >(count, \
a, reinterpret_cast<half*>(b)); \
}
#ifdef WITH_CUDA_FP16
template <> void TypeA2B<float16, float16, CUDAContext>(
const int count,
const float16* a,
float16* b) {
float16* b,
CUDAContext* ctx) {
_TypeHalf2Half
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b));
}
DEFINE_TYPE_ENABLE_FP16_FP32;
DEFINE_TYPE_DISABLE_FP16(double);
......@@ -227,7 +397,8 @@ DEFINE_TYPE_DISABLE_FP16(uint8_t);
template <> void TypeA2B<float16, float16, CUDAContext>(
const int count,
const float16* a,
float16* b) {
float16* b,
CUDAContext* ctx) {
LOG(FATAL) << "CUDAContext has not implemented: float16 -> float16";
}
DEFINE_TYPE_DISABLE_FP16(float);
......@@ -251,7 +422,7 @@ __global__ void _ImageDataHalf_NCHW(
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int w = idx % W;
const int h = (idx / W) % H;
const int c = (idx / W / H) % C;
......@@ -274,7 +445,7 @@ __global__ void _ImageDataHalf_NHWC(
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int c = idx % C;
float raw_value = x[idx];
if (mean_values) raw_value -= mean_values[c];
......@@ -294,18 +465,21 @@ template <> void ImageData<float, float16, CUDAContext>(
const float* std_values,
const string& data_format,
const float* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<float, half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<float, half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
#else
CUDA_FP16_NOT_COMPILED;
......@@ -322,18 +496,21 @@ template <> void ImageData<uint8_t, float16, CUDAContext>(
const float* std_values,
const string& data_format,
const uint8_t* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<uint8_t, half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<uint8_t, half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
N, C, H, W, mean_values, std_values,
x, reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
#else
CUDA_FP16_NOT_COMPILED;
......@@ -352,7 +529,7 @@ __global__ void _ConcatHalf(
const int concat_offset,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int tmp = x_concat_dim * inner_dim;
const int outer_idx = idx / tmp;
const int concat_idx = idx % tmp;
......@@ -370,14 +547,16 @@ template <> void Concat<float16, CUDAContext>(
const int y_concat_dim,
const int concat_offset,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_ConcatHalf<half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
outer_dim, inner_dim,
x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -393,7 +572,7 @@ __global__ void _ConcatGradHalf(
const int concat_offset,
const T* dy,
T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
const int tmp = x_concat_dim * inner_dim;
const int outer_idx = idx / tmp;
const int concat_idx = idx % tmp;
......@@ -411,14 +590,16 @@ template <> void ConcatGrad<float16, CUDAContext>(
const int y_concat_dim,
const int concat_offset,
const float16* dy,
float16* dx) {
float16* dx,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_ConcatGradHalf<half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, outer_dim, inner_dim,
x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
outer_dim, inner_dim,
x_concat_dim, y_concat_dim, concat_offset,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -435,7 +616,7 @@ __global__ void _TransposeHalf(
const int* new_steps,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
int x_idx = 0, y_idx = idx;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
......@@ -453,13 +634,15 @@ template <> void Transpose<float16, CUDAContext>(
const int* old_steps,
const int* new_steps,
const float16* x,
float16* y) {
float16* y,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_TransposeHalf<half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -474,7 +657,7 @@ __global__ void _TransposeGradHalf(
const int* new_steps,
const T* dy,
T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
CUDA_1D_KERNEL_LOOP(idx, count) {
int x_idx = 0, y_idx = idx;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
......@@ -492,13 +675,15 @@ template <> void TransposeGrad<float16, CUDAContext>(
const int* old_steps,
const int* new_steps,
const float16* dy,
float16* dx) {
float16* dx,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_TransposeGradHalf<half>
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
ndim, order, old_steps, new_steps,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -516,7 +701,7 @@ __global__ void _AdamUpdateHalf(
half* g,
half* m,
half* v) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half gi = g[i];
half kOne = __float2half(1.f);
......@@ -545,17 +730,19 @@ template <> void AdamUpdate<float16, CUDAContext>(
const float eps,
float16* g,
float16* m,
float16* v) {
float16* v,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_AdamUpdateHalf
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, dragon_cast<half, float>(lr),
dragon_cast<half, float>(beta1),
dragon_cast<half, float>(beta2),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(m),
reinterpret_cast<half*>(v));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(beta1),
dragon_cast<half, float>(beta2),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(m),
reinterpret_cast<half*>(v));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -570,7 +757,7 @@ __global__ void _NesterovUpdateHalf(
const half momentum,
half* g,
half* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half hi = h[i];
half hi_new = h[i] = __hadd(
......@@ -592,7 +779,7 @@ __global__ void _NesterovUpdateHalf2(
const half2 momentum,
half2* g,
half2* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half2 hi = h[i];
half2 hi_new = h[i] = __hadd2(
......@@ -614,22 +801,25 @@ template <> void NesterovUpdate<float16, CUDAContext>(
const float lr,
const float momentum,
float16* g,
float16* h) {
float16* h,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0) {
if ((count & 1) == 0) {
_NesterovUpdateHalf2
<< <CUDA_BLOCKS(count / 2), CUDA_THREADS >> >(
count / 2, dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count >> 1,
dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
} else {
_NesterovUpdateHalf
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
}
#else
CUDA_FP16_NOT_COMPILED;
......@@ -646,7 +836,7 @@ __global__ void _RMSPropUpdateHalf(
const half eps,
half* g,
half* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half gi = g[i];
half kOne = __float2half(1.f);
......@@ -669,15 +859,17 @@ template <> void RMSPropUpdate<float16, CUDAContext>(
const float decay,
const float eps,
float16* g,
float16* h) {
float16* h,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
_RMSPropUpdateHalf
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, dragon_cast<half, float>(lr),
dragon_cast<half, float>(decay),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(decay),
dragon_cast<half, float>(eps),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
#else
CUDA_FP16_NOT_COMPILED;
#endif
......@@ -692,7 +884,7 @@ __global__ void _SGDUpdateHalf(
const half momentum,
half* g,
half* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half hi = h[i];
g[i] = h[i] = __hadd(
......@@ -709,7 +901,7 @@ __global__ void _SGDUpdateHalf2(
const half2 momentum,
half2* g,
half2* h) {
CUDA_KERNEL_LOOP(i, count) {
CUDA_1D_KERNEL_LOOP(i, count) {
#if __CUDA_ARCH__ >= 530
half2 hi = h[i];
g[i] = h[i] = __hadd2(
......@@ -726,22 +918,25 @@ template <> void SGDUpdate<float16, CUDAContext>(
const float lr,
const float momentum,
float16* g,
float16* h) {
float16* h,
CUDAContext* ctx) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0) {
if ((count & 1) == 0) {
_SGDUpdateHalf2
<< <CUDA_BLOCKS(count / 2), CUDA_THREADS >> >(
count / 2, dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count >> 1,
dragon_cast<half2, float>(lr),
dragon_cast<half2, float>(momentum),
reinterpret_cast<half2*>(g),
reinterpret_cast<half2*>(h));
} else {
_SGDUpdateHalf
<< <CUDA_BLOCKS(count), CUDA_THREADS >> >(
count, dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(count,
dragon_cast<half, float>(lr),
dragon_cast<half, float>(momentum),
reinterpret_cast<half*>(g),
reinterpret_cast<half*>(h));
}
#else
CUDA_FP16_NOT_COMPILED;
......
......@@ -162,7 +162,7 @@ template<> void Axpby(
SSE_LOOP2(i, n) y[i] = alpha * x[i] + beta * y[i];
}
template<> float ASum(
template<> float Sum(
const int n,
const float* x) {
__m128 x1, sum = SSE_FP32_ZERO;
......