Commit 494774d3 by Ting PAN

Optimize training update operators

Summary:
This commit fuses weight decay and the mixed-precision weight copy
into the update kernels, reducing training latency by removing the
separate decay and cast passes over each parameter.
1 parent fb47d86f
Showing with 2315 additions and 2003 deletions
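Concretely, the old update kernels (see the SGD, Adam and RMSprop files later in this diff) consumed only the gradient and state buffers and wrote the scaled step back into the gradient, leaving weight decay and the float16 weight copy to separate operators. The new kernels take the master weights x and the gradient g as const inputs, apply decay, update the state, subtract the step from y, and optionally emit a low-precision copy y_copy. A minimal NumPy sketch of the fused semantics (illustrative only; names follow the new kernel signatures):

import numpy as np

def momentum_sgd_fused(x, g, m, y, lr, momentum, wd, y_copy=None):
    # Fused L2 weight decay: previously a separate pass over the parameter.
    grad = g + wd * x if wd > 0 else g
    m[:] = momentum * m + grad        # in-place momentum buffer update
    y[:] -= lr * m                    # apply the step to the master weights
    if y_copy is not None:            # fused mixed-precision copy (was a cast op)
        y_copy[:] = y.astype(y_copy.dtype)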
...@@ -418,9 +418,9 @@ class Normalize(Layer): ...@@ -418,9 +418,9 @@ class Normalize(Layer):
def __call__(self, bottom): def __call__(self, bottom):
if len(self.blobs) == 0: if len(self.blobs) == 0:
self.build(bottom) self.build(bottom)
outputs = [normalization_ops.lp_normalize(bottom, **self.norm_args)] outputs = [normalization_ops.lp_norm(bottom, **self.norm_args)]
outputs += [blob['data'] for blob in self.blobs] outputs += [blob['data'] for blob in self.blobs]
return array_ops.channel_affine(outputs, **self.scale_args) return math_ops.affine(outputs, **self.scale_args)
class Permute(Layer): class Permute(Layer):
...@@ -591,8 +591,7 @@ class Scale(Layer): ...@@ -591,8 +591,7 @@ class Scale(Layer):
param = layer_param.scale_param param = layer_param.scale_param
self.axis = param.axis self.axis = param.axis
self.num_axes = param.num_axes self.num_axes = param.num_axes
end_axis = -1 if self.num_axes < 1 else self.axis + self.num_axes - 1 self.call_args = {'axis': list(range(self.axis, self.axis + self.num_axes))}
self.call_args = {'axis': self.axis, 'end_axis': end_axis}
self.filler = caffe_pb2.FillerParameter(type='constant', value=1) self.filler = caffe_pb2.FillerParameter(type='constant', value=1)
self.filler = param.filler if param.HasField('filler') else self.filler self.filler = param.filler if param.HasField('filler') else self.filler
self.bias_filler = param.bias_filler self.bias_filler = param.bias_filler
...@@ -609,7 +608,7 @@ class Scale(Layer): ...@@ -609,7 +608,7 @@ class Scale(Layer):
if len(self.blobs) == 0: if len(self.blobs) == 0:
self.build(bottom) self.build(bottom)
inputs = [bottom] + [blob['data'] for blob in self.blobs] inputs = [bottom] + [blob['data'] for blob in self.blobs]
return array_ops.channel_affine(inputs, **self.call_args) return math_ops.affine(inputs, **self.call_args)
class Slice(Layer): class Slice(Layer):
......
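The Scale layer now passes an explicit axis list instead of (axis, end_axis). For the common per-channel case (axis=1, num_axes=1 on NCHW data), the renamed math_ops.affine reduces to the broadcast below; shapes are hypothetical, for illustration:

import numpy as np

x = np.random.randn(2, 3, 4, 4).astype("float32")   # NCHW input
scale = np.ones(3, "float32")
bias = np.zeros(3, "float32")

# affine(..., axis=[1]) scales and shifts along the channel axis:
y = x * scale.reshape(1, 3, 1, 1) + bias.reshape(1, 3, 1, 1)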
...@@ -16,8 +16,8 @@ from __future__ import print_function ...@@ -16,8 +16,8 @@ from __future__ import print_function
from dragon.core.framework import workspace from dragon.core.framework import workspace
from dragon.core.io.kpl_record import KPLRecordDataset from dragon.core.io.kpl_record import KPLRecordDataset
from dragon.core.ops import array_ops
from dragon.core.ops import framework_ops from dragon.core.ops import framework_ops
from dragon.core.ops import normalization_ops
from dragon.utils import vision from dragon.utils import vision
from dragon.vm.caffe.core.layer import Layer from dragon.vm.caffe.core.layer import Layer
...@@ -121,5 +121,5 @@ class Data(Layer): ...@@ -121,5 +121,5 @@ class Data(Layer):
data._shape = (self.data_args['batch_size'], data._shape = (self.data_args['batch_size'],
None, None, len(self.norm_args['mean'])) None, None, len(self.norm_args['mean']))
label._shape = (self.data_args['batch_size'], None) label._shape = (self.data_args['batch_size'], None)
data = array_ops.channel_normalize(data, **self.norm_args) data = normalization_ops.channel_norm(data, **self.norm_args)
return data, label return data, label
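For reference, normalization_ops.channel_norm performs the per-channel standardization (with dtype conversion) implemented by the ChannelNorm kernels later in this diff. A NumPy sketch for an NHWC batch, using hypothetical ImageNet-style means:

import numpy as np

x = np.random.randint(0, 256, (2, 4, 4, 3)).astype("uint8")
mean = np.array([104.0, 117.0, 123.0], "float32")
std = np.array([1.0, 1.0, 1.0], "float32")
y = (x.astype("float32") - mean) / std   # broadcast along the channel axis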
...@@ -9,6 +9,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) ...@@ -9,6 +9,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
# ---[ Compiler flags # ---[ Compiler flags
if (MSVC) if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_ENABLE_EXTENDED_ALIGNED_STORAGE")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}
/wd4003 /wd4114 /wd4003 /wd4114
......
...@@ -36,16 +36,6 @@ dragon ...@@ -36,16 +36,6 @@ dragon
`cast(...) <dragon/cast.html>`_ `cast(...) <dragon/cast.html>`_
: Cast the data type of input. : Cast the data type of input.
`channel_affine(...) <dragon/channel_affine.html>`_
: Apply affine transformation to each channel of input.
`channel_normalize(...) <dragon/channel_normalize.html>`_
: Apply normalization to each channel of input.
`channel_shuffle(...) <dragon/channel_shuffle.html>`_
: Apply group shuffle to each channel of input.
`[Zhang et al., 2017] <https://arxiv.org/abs/1707.01083>`_.
`concat(...) <dragon/concat.html>`_ `concat(...) <dragon/concat.html>`_
: Concatenate the inputs along the given axis. : Concatenate the inputs along the given axis.
...@@ -211,9 +201,6 @@ dragon ...@@ -211,9 +201,6 @@ dragon
dragon/boolean_mask dragon/boolean_mask
dragon/broadcast_to dragon/broadcast_to
dragon/cast dragon/cast
dragon/channel_affine
dragon/channel_normalize
dragon/channel_shuffle
dragon/concat dragon/concat
dragon/constant dragon/constant
dragon/device dragon/device
......
...@@ -24,6 +24,9 @@ dragon.cuda ...@@ -24,6 +24,9 @@ dragon.cuda
`memory_allocated(...) <cuda/memory_allocated.html>`_ `memory_allocated(...) <cuda/memory_allocated.html>`_
: Return the size of memory used by tensors in current workspace. : Return the size of memory used by tensors in current workspace.
`set_cublas_flags(...) <cuda/set_cublas_flags.html>`_
: Set the flags of cuBLAS library.
`set_cudnn_flags(...) <cuda/set_cudnn_flags.html>`_ `set_cudnn_flags(...) <cuda/set_cudnn_flags.html>`_
: Set the flags of cuDNN library. : Set the flags of cuDNN library.
...@@ -44,6 +47,7 @@ dragon.cuda ...@@ -44,6 +47,7 @@ dragon.cuda
cuda/get_device_capability cuda/get_device_capability
cuda/is_available cuda/is_available
cuda/memory_allocated cuda/memory_allocated
cuda/set_cublas_flags
cuda/set_cudnn_flags cuda/set_cudnn_flags
cuda/set_default_device cuda/set_default_device
cuda/set_device cuda/set_device
......
channel_affine set_cublas_flags
============== ================
.. autofunction:: dragon.channel_affine .. autofunction:: dragon.cuda.set_cublas_flags
.. raw:: html .. raw:: html
<style> <style>
h1:before { h1:before {
content: "dragon."; content: "dragon.cuda.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
...@@ -12,12 +12,18 @@ dragon.math ...@@ -12,12 +12,18 @@ dragon.math
`add(...) <math/add.html>`_ `add(...) <math/add.html>`_
: Compute the element-wise addition. : Compute the element-wise addition.
`affine(...) <math/affine.html>`_
: Apply the affine transformation to input.
`argmax(...) <math/argmax.html>`_ `argmax(...) <math/argmax.html>`_
: Compute the index of maximum elements along the given axis. : Compute the index of maximum elements along the given axis.
`argmin(...) <math/argmin.html>`_ `argmin(...) <math/argmin.html>`_
: Compute the index of minimum elements along the given axis. : Compute the index of minimum elements along the given axis.
`atan2(...) <math/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`ceil(...) <math/ceil.html>`_ `ceil(...) <math/ceil.html>`_
: Compute the smallest integer not less than input. : Compute the smallest integer not less than input.
...@@ -81,9 +87,6 @@ dragon.math ...@@ -81,9 +87,6 @@ dragon.math
`logical_xor(...) <math/logical_xor.html>`_ `logical_xor(...) <math/logical_xor.html>`_
: Compute the element-wise XOR logical operation. : Compute the element-wise XOR logical operation.
`lp_normalize(...) <math/lp_normalize.html>`_
: Apply the lp normalization.
`matmul(...) <math/matmul.html>`_ `matmul(...) <math/matmul.html>`_
: Compute the matrix multiplication. : Compute the matrix multiplication.
...@@ -158,8 +161,10 @@ dragon.math ...@@ -158,8 +161,10 @@ dragon.math
math/abs math/abs
math/add math/add
math/affine
math/argmax math/argmax
math/argmin math/argmin
math/atan2
math/ceil math/ceil
math/clip math/clip
math/cos math/cos
...@@ -181,7 +186,6 @@ dragon.math ...@@ -181,7 +186,6 @@ dragon.math
math/logical_not math/logical_not
math/logical_or math/logical_or
math/logical_xor math/logical_xor
math/lp_normalize
math/matmul math/matmul
math/max math/max
math/maximum math/maximum
......
lp_normalize affine
============ ======
.. autofunction:: dragon.math.lp_normalize .. autofunction:: dragon.math.affine
.. raw:: html .. raw:: html
......
channel_normalize atan2
================= =====
.. autofunction:: dragon.channel_normalize .. autofunction:: dragon.math.atan2
.. raw:: html .. raw:: html
<style> <style>
h1:before { h1:before {
content: "dragon."; content: "dragon.math.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
...@@ -28,6 +28,13 @@ dragon.nn ...@@ -28,6 +28,13 @@ dragon.nn
`bias_add(...) <nn/bias_add.html>`_ `bias_add(...) <nn/bias_add.html>`_
: Add the bias across channels to input. : Add the bias across channels to input.
`channel_norm(...) <nn/channel_norm.html>`_
: Apply the normalization to each channel of input.
`channel_shuffle(...) <nn/channel_shuffle.html>`_
: Apply the group shuffle to each channel of input.
`[Zhang et al., 2017] <https://arxiv.org/abs/1707.01083>`_.
`conv(...) <nn/conv.html>`_ `conv(...) <nn/conv.html>`_
: Apply the n-dimension convolution. : Apply the n-dimension convolution.
...@@ -107,6 +114,9 @@ dragon.nn ...@@ -107,6 +114,9 @@ dragon.nn
`log_softmax(...) <nn/log_softmax.html>`_ `log_softmax(...) <nn/log_softmax.html>`_
: Compute the composite of logarithm and softmax. : Compute the composite of logarithm and softmax.
`lp_norm(...) <nn/lp_norm.html>`_
: Apply the lp normalization.
`moments(...) <nn/moments.html>`_ `moments(...) <nn/moments.html>`_
: Compute the mean and variance of input along the given axis. : Compute the mean and variance of input along the given axis.
...@@ -157,6 +167,8 @@ dragon.nn ...@@ -157,6 +167,8 @@ dragon.nn
nn/RNN nn/RNN
nn/batch_norm nn/batch_norm
nn/bias_add nn/bias_add
nn/channel_norm
nn/channel_shuffle
nn/conv nn/conv
nn/conv_transpose nn/conv_transpose
nn/conv1d nn/conv1d
...@@ -180,6 +192,7 @@ dragon.nn ...@@ -180,6 +192,7 @@ dragon.nn
nn/leaky_relu nn/leaky_relu
nn/local_response_norm nn/local_response_norm
nn/log_softmax nn/log_softmax
nn/lp_norm
nn/moments nn/moments
nn/pool nn/pool
nn/pool1d nn/pool1d
......
channel_norm
============
.. autofunction:: dragon.nn.channel_norm
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
channel_shuffle channel_shuffle
=============== ===============
.. autofunction:: dragon.channel_shuffle .. autofunction:: dragon.nn.channel_shuffle
.. raw:: html .. raw:: html
<style> <style>
h1:before { h1:before {
content: "dragon."; content: "dragon.nn.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
lp_norm
=======
.. autofunction:: dragon.nn.lp_norm
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
...@@ -21,6 +21,9 @@ vm.tensorflow.math ...@@ -21,6 +21,9 @@ vm.tensorflow.math
`argmin(...) <math/argmin.html>`_ `argmin(...) <math/argmin.html>`_
: Compute the index of minimum elements along the given axis. : Compute the index of minimum elements along the given axis.
`atan2(...) <math/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`ceil(...) <math/ceil.html>`_ `ceil(...) <math/ceil.html>`_
: Compute the smallest integer not less than input. : Compute the smallest integer not less than input.
...@@ -134,6 +137,7 @@ vm.tensorflow.math ...@@ -134,6 +137,7 @@ vm.tensorflow.math
math/add_n math/add_n
math/argmax math/argmax
math/argmin math/argmin
math/atan2
math/ceil math/ceil
math/cos math/cos
math/cumsum math/cumsum
......
channel_normalize atan2
================= =====
.. autofunction:: dragon.vm.torch.channel_normalize .. autofunction:: dragon.vm.tensorflow.math.atan2
.. raw:: html .. raw:: html
<style> <style>
h1:before { h1:before {
content: "torch."; content: "tf.math.";
color: #103d3e; color: #103d3e;
} }
</style> </style>
...@@ -51,6 +51,9 @@ vm.torch ...@@ -51,6 +51,9 @@ vm.torch
`argsort(...) <torch/argsort.html>`_ `argsort(...) <torch/argsort.html>`_
: Return the index of sorted elements along the given dimension. : Return the index of sorted elements along the given dimension.
`atan2(...) <torch/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`baddbmm(...) <torch/baddbmm.html>`_ `baddbmm(...) <torch/baddbmm.html>`_
: Add input to the result of batched matrix-matrix multiplication. : Add input to the result of batched matrix-matrix multiplication.
...@@ -75,12 +78,6 @@ vm.torch ...@@ -75,12 +78,6 @@ vm.torch
`ceil(...) <torch/ceil.html>`_ `ceil(...) <torch/ceil.html>`_
: Compute the smallest integer not less than input. : Compute the smallest integer not less than input.
`channel_affine(...) <torch/channel_affine.html>`_
: Apply affine transformation to each channel of input.
`channel_normalize(...) <torch/channel_normalize.html>`_
: Apply normalization to each channel of input.
`chunk(...) <torch/chunk.html>`_ `chunk(...) <torch/chunk.html>`_
: Split input into a specific number of chunks. : Split input into a specific number of chunks.
...@@ -345,6 +342,7 @@ vm.torch ...@@ -345,6 +342,7 @@ vm.torch
torch/argmax torch/argmax
torch/argmin torch/argmin
torch/argsort torch/argsort
torch/atan2
torch/baddbmm torch/baddbmm
torch/bitwise_and torch/bitwise_and
torch/bitwise_not torch/bitwise_not
...@@ -353,8 +351,6 @@ vm.torch ...@@ -353,8 +351,6 @@ vm.torch
torch/bmm torch/bmm
torch/cat torch/cat
torch/ceil torch/ceil
torch/channel_affine
torch/channel_normalize
torch/chunk torch/chunk
torch/clamp torch/clamp
torch/cos torch/cos
......
...@@ -73,6 +73,10 @@ argsort ...@@ -73,6 +73,10 @@ argsort
####### #######
.. automethod:: dragon.vm.torch.Tensor.argsort .. automethod:: dragon.vm.torch.Tensor.argsort
atan2
#####
.. automethod:: dragon.vm.torch.Tensor.atan2
backward backward
######## ########
.. automethod:: dragon.vm.torch.Tensor.backward .. automethod:: dragon.vm.torch.Tensor.backward
...@@ -699,6 +703,7 @@ zero\_ ...@@ -699,6 +703,7 @@ zero\_
.. _torch.argmax(...): argmax.html .. _torch.argmax(...): argmax.html
.. _torch.argmin(...): argmin.html .. _torch.argmin(...): argmin.html
.. _torch.argsort(...): argsort.html .. _torch.argsort(...): argsort.html
.. _torch.atan2(...): atan2.html
.. _torch.baddbmm(...): baddbmm.html .. _torch.baddbmm(...): baddbmm.html
.. _torch.bitwise_and(...): bitwise_and.html .. _torch.bitwise_and(...): bitwise_and.html
.. _torch.bitwise_not(...): bitwise_not.html .. _torch.bitwise_not(...): bitwise_not.html
......
channel_affine atan2
============== =====
.. autofunction:: dragon.vm.torch.channel_affine .. autofunction:: dragon.vm.torch.atan2
.. raw:: html .. raw:: html
......
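A short usage sketch of the new atan2 function, assuming the usual tensor constructor of the torch vm:

from dragon.vm import torch

y = torch.tensor([1.0, -1.0])
x = torch.tensor([1.0, 1.0])
theta = torch.atan2(y, x)   # element-wise angle of the point (x, y), in radians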
...@@ -6,12 +6,16 @@ vm.torch.backends ...@@ -6,12 +6,16 @@ vm.torch.backends
Modules Modules
------- -------
`Module cuda <backends/cuda.html>`_
: The CUDA backend module.
`Module cudnn <backends/cudnn.html>`_ `Module cudnn <backends/cudnn.html>`_
: The cuDNN backend module. : The cuDNN backend module.
.. toctree:: .. toctree::
:hidden: :hidden:
backends/cuda
backends/cudnn backends/cudnn
.. raw:: html .. raw:: html
......
cuda
====
Properties
----------
matmul.allow_tf32
#################
.. data:: dragon.vm.torch.backends.cuda.matmul.allow_tf32
:annotation: = False
The flag that allows TF32 math type for matmul or not.
Functions
---------
is_built
########
.. automethod:: dragon.vm.torch.backends.cuda.is_built
.. raw:: html
<style>
h1:before {
content: "torch.backends.";
color: #103d3e;
}
</style>
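Putting the new module together, a minimal usage sketch based on the attributes documented above:

from dragon.vm import torch

if torch.backends.cuda.is_built():
    # Opt in to TF32 tensor-core math for matmul (defaults to False).
    torch.backends.cuda.matmul.allow_tf32 = True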
...@@ -24,8 +24,8 @@ vm.torch.nn ...@@ -24,8 +24,8 @@ vm.torch.nn
`class AdaptiveMaxPool3d <nn/AdaptiveMaxPool3d.html>`_ `class AdaptiveMaxPool3d <nn/AdaptiveMaxPool3d.html>`_
: Apply the 3d adaptive max pooling. : Apply the 3d adaptive max pooling.
`class AffineChannel <nn/AffineChannel.html>`_ `class Affine <nn/Affine.html>`_
: Apply affine transformation along the channels. : Apply the affine transformation.
`class AvgPool1d <nn/AvgPool1d.html>`_ `class AvgPool1d <nn/AvgPool1d.html>`_
: Apply the 1d average pooling. : Apply the 1d average pooling.
...@@ -312,7 +312,7 @@ vm.torch.nn ...@@ -312,7 +312,7 @@ vm.torch.nn
nn/AdaptiveMaxPool1d nn/AdaptiveMaxPool1d
nn/AdaptiveMaxPool2d nn/AdaptiveMaxPool2d
nn/AdaptiveMaxPool3d nn/AdaptiveMaxPool3d
nn/AffineChannel nn/Affine
nn/AvgPool1d nn/AvgPool1d
nn/AvgPool2d nn/AvgPool2d
nn/AvgPool3d nn/AvgPool3d
......
AffineChannel Affine
============= ======
.. autoclass:: dragon.vm.torch.nn.AffineChannel .. autoclass:: dragon.vm.torch.nn.Affine
__init__ __init__
-------- --------
.. automethod:: dragon.vm.torch.nn.AffineChannel.__init__ .. automethod:: dragon.vm.torch.nn.Affine.__init__
.. _torch.channel_affine(...): ../channel_affine.html .. _torch.nn.functional.affine(...): functional/affine.html
.. raw:: html .. raw:: html
......
...@@ -24,6 +24,9 @@ vm.torch.nn.functional ...@@ -24,6 +24,9 @@ vm.torch.nn.functional
`adaptive_max_pool3d(...) <functional/adaptive_max_pool3d.html>`_ `adaptive_max_pool3d(...) <functional/adaptive_max_pool3d.html>`_
: Apply the 3d adaptive max pooling to input. : Apply the 3d adaptive max pooling to input.
`affine(...) <functional/affine.html>`_
: Apply the affine transformation to input.
`avg_pool1d(...) <functional/avg_pool1d.html>`_ `avg_pool1d(...) <functional/avg_pool1d.html>`_
: Apply the 1d average pooling to input. : Apply the 1d average pooling to input.
...@@ -40,8 +43,11 @@ vm.torch.nn.functional ...@@ -40,8 +43,11 @@ vm.torch.nn.functional
`binary_cross_entropy_with_logits(...) <functional/binary_cross_entropy_with_logits.html>`_ `binary_cross_entropy_with_logits(...) <functional/binary_cross_entropy_with_logits.html>`_
: Compute the sigmoid cross entropy with contiguous target. : Compute the sigmoid cross entropy with contiguous target.
`channel_norm(...) <nn/channel_norm.html>`_
: Apply the normalization to each channel of input.
`channel_shuffle(...) <functional/channel_shuffle.html>`_ `channel_shuffle(...) <functional/channel_shuffle.html>`_
: Apply group shuffle to each channel of input. : Apply the group shuffle to each channel of input.
`[Zhang et al., 2017] <https://arxiv.org/abs/1707.01083>`_. `[Zhang et al., 2017] <https://arxiv.org/abs/1707.01083>`_.
`conv1d(...) <functional/conv1d.html>`_ `conv1d(...) <functional/conv1d.html>`_
...@@ -229,11 +235,13 @@ vm.torch.nn.functional ...@@ -229,11 +235,13 @@ vm.torch.nn.functional
functional/adaptive_max_pool1d functional/adaptive_max_pool1d
functional/adaptive_max_pool2d functional/adaptive_max_pool2d
functional/adaptive_max_pool3d functional/adaptive_max_pool3d
functional/affine
functional/avg_pool1d functional/avg_pool1d
functional/avg_pool2d functional/avg_pool2d
functional/avg_pool3d functional/avg_pool3d
functional/batch_norm functional/batch_norm
functional/binary_cross_entropy_with_logits functional/binary_cross_entropy_with_logits
functional/channel_norm
functional/channel_shuffle functional/channel_shuffle
functional/conv1d functional/conv1d
functional/conv2d functional/conv2d
......
affine
======
.. autofunction:: dragon.vm.torch.nn.functional.affine
.. _torch.nn.affine(...): ../Affine.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
channel_norm
============
.. autofunction:: dragon.vm.torch.nn.functional.channel_norm
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
...@@ -56,15 +56,16 @@ class CUDAObjects { ...@@ -56,15 +56,16 @@ class CUDAObjects {
auto& handle = handles[stream_id]; auto& handle = handles[stream_id];
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasSetStream(handle, stream(device_id, stream_id))); CUBLAS_CHECK(cublasSetStream(handle, stream(device_id, stream_id)));
}
auto& handle = handles[stream_id];
#if CUDA_VERSION >= 11000 #if CUDA_VERSION >= 11000
if (cudnn_allow_tf32_) { if (cublas_allow_tf32_) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else { } else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}
#endif
} }
return handles[stream_id]; #endif
return handle;
} }
/*! \brief Return the specified cudnn handle */ /*! \brief Return the specified cudnn handle */
...@@ -150,6 +151,9 @@ class CUDAObjects { ...@@ -150,6 +151,9 @@ class CUDAObjects {
Map<string, ncclComm_t> nccl_comms_[CUDA_MAX_DEVICES]; Map<string, ncclComm_t> nccl_comms_[CUDA_MAX_DEVICES];
#endif #endif
/*! \brief The flag that allows cuBLAS TF32 math type or not */
bool cublas_allow_tf32_ = false;
/*! \brief The flag that uses cuDNN or not */ /*! \brief The flag that uses cuDNN or not */
bool cudnn_enabled_ = true; bool cudnn_enabled_ = true;
......
...@@ -20,32 +20,32 @@ namespace dragon { ...@@ -20,32 +20,32 @@ namespace dragon {
/*! /*!
* \brief Registry to create class instances. * \brief Registry to create class instances.
*/ */
template <class KeyType, class ObjectType, class... Args> template <class KeyT, class ClassT, class... Args>
class Registry { class Registry {
public: public:
typedef std::function<ObjectType*(Args...)> Creator; typedef std::function<ClassT*(Args...)> Creator;
/*! \brief Create an instance of specified class */ /*! \brief Create an instance of specified class */
ObjectType* Create(const KeyType& key, Args... args) { ClassT* Create(const KeyT& key, Args... args) {
CHECK(registry_.count(key)) << "\nKey(" << key << ") has not registered."; CHECK(registry_.count(key)) << "\nKey(" << key << ") has not registered.";
return registry_[key](args...); return registry_[key](args...);
} }
/*! \brief Return whether the specified class is registered */ /*! \brief Return whether the specified class is registered */
bool Has(const KeyType& key) { bool Has(const KeyT& key) {
return (registry_.count(key)) != 0; return (registry_.count(key)) != 0;
} }
/*! \brief Register a class with the creator */ /*! \brief Register a class with the creator */
void Register(const KeyType& key, Creator creator) { void Register(const KeyT& key, Creator creator) {
CHECK(!registry_.count(key)) CHECK(!registry_.count(key))
<< "\nKey(" << key << ") has already registered."; << "\nKey(" << key << ") has already registered.";
registry_[key] = creator; registry_[key] = creator;
} }
/*! \brief Return the key of registered classes */ /*! \brief Return the key of registered classes */
vector<KeyType> keys() { vector<KeyT> keys() {
vector<KeyType> ret; vector<KeyT> ret;
for (const auto& it : registry_) { for (const auto& it : registry_) {
ret.push_back(it.first); ret.push_back(it.first);
} }
...@@ -54,50 +54,49 @@ class Registry { ...@@ -54,50 +54,49 @@ class Registry {
private: private:
/*! \brief The registry map */ /*! \brief The registry map */
Map<KeyType, Creator> registry_; Map<KeyT, Creator> registry_;
}; };
/*! /*!
* \brief Register creator into the registry. * \brief Register creator into the registry.
*/ */
template <class KeyType, class ObjectType, class... Args> template <class KeyT, class ClassT, class... Args>
class Registerer { class Registerer {
public: public:
/*! \brief Constructor with key and creator */ /*! \brief Constructor with key and creator */
Registerer( Registerer(
const KeyType& key, const KeyT& key,
Registry<KeyType, ObjectType, Args...>* registry, Registry<KeyT, ClassT, Args...>* registry,
typename Registry<KeyType, ObjectType, Args...>::Creator creator, typename Registry<KeyT, ClassT, Args...>::Creator creator,
const string& help_msg = "") { const string& help_msg = "") {
registry->Register(key, creator); registry->Register(key, creator);
} }
/*! \brief Return the default creator */ /*! \brief Return the default creator */
template <class DerivedType> template <class DerivedT>
static ObjectType* DefaultCreator(Args... args) { static ClassT* DefaultCreator(Args... args) {
return new DerivedType(args...); return new DerivedT(args...);
} }
}; };
// Used in *.h files // Used in *.h files.
#define DECLARE_TYPED_REGISTRY(RegistryName, KeyType, ObjectType, ...) \ #define DECLARE_TYPED_REGISTRY(RegistryName, KeyT, ClassT, ...) \
DRAGON_API Registry<KeyType, ObjectType, ##__VA_ARGS__>* RegistryName(); \ DRAGON_API Registry<KeyT, ClassT, ##__VA_ARGS__>* RegistryName(); \
typedef Registerer<KeyType, ObjectType, ##__VA_ARGS__> \ typedef Registerer<KeyT, ClassT, ##__VA_ARGS__> Registerer##RegistryName;
Registerer##RegistryName;
// Used in *.cc files.
// Used in *.cc files #define DEFINE_TYPED_REGISTRY(RegistryName, KeyT, ClassT, ...) \
#define DEFINE_TYPED_REGISTRY(RegistryName, KeyType, ObjectType, ...) \ Registry<KeyT, ClassT, ##__VA_ARGS__>* RegistryName() { \
Registry<KeyType, ObjectType, ##__VA_ARGS__>* RegistryName() { \ static Registry<KeyT, ClassT, ##__VA_ARGS__>* registry = \
static Registry<KeyType, ObjectType, ##__VA_ARGS__>* registry = \ new Registry<KeyT, ClassT, ##__VA_ARGS__>(); \
new Registry<KeyType, ObjectType, ##__VA_ARGS__>(); \ return registry; \
return registry; \
} }
#define DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ #define DECLARE_REGISTRY(RegistryName, ClassT, ...) \
DECLARE_TYPED_REGISTRY(RegistryName, string, ObjectType, ##__VA_ARGS__) DECLARE_TYPED_REGISTRY(RegistryName, string, ClassT, ##__VA_ARGS__)
#define DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ #define DEFINE_REGISTRY(RegistryName, ClassT, ...) \
DEFINE_TYPED_REGISTRY(RegistryName, string, ObjectType, ##__VA_ARGS__) DEFINE_TYPED_REGISTRY(RegistryName, string, ClassT, ##__VA_ARGS__)
#define REGISTER_TYPED_CLASS(RegistryName, key, ...) \ #define REGISTER_TYPED_CLASS(RegistryName, key, ...) \
static Registerer##RegistryName ANONYMOUS_VARIABLE(g_##RegistryName)( \ static Registerer##RegistryName ANONYMOUS_VARIABLE(g_##RegistryName)( \
......
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
void _ChannelAffine(
const int N,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
if (S == 1) {
if (bias != nullptr) {
EigenArrayMap<T>(y, C, N) = (ConstEigenArrayMap<T>(x, C, N).colwise() *
ConstEigenVectorArrayMap<T>(scale, C))
.colwise() +
ConstEigenVectorArrayMap<T>(bias, C);
} else {
EigenArrayMap<T>(y, C, N) = ConstEigenArrayMap<T>(x, C, N).colwise() *
ConstEigenVectorArrayMap<T>(scale, C);
}
return;
}
for (int i = 0; i < N; ++i) {
for (int j = 0; j < C; ++j) {
if (bias != nullptr) {
EigenVectorArrayMap<T>(y, S) =
ConstEigenVectorArrayMap<T>(x, S) * scale[j] + bias[j];
} else {
EigenVectorArrayMap<T>(y, S) =
ConstEigenVectorArrayMap<T>(x, S) * scale[j];
}
x += S;
y += S;
}
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void ChannelAffine<float16, CPUContext>(
const int N,
const int S,
const int C,
const float16* x,
const float16* w,
const float16* b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelAffine<T, CPUContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CPUContext* ctx) { \
_ChannelAffine(N, S, C, x, scale, bias, y); \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T, typename AccT>
__global__ void _ChannelAffine(
const int NxCxS,
const int S,
const int C,
const T* x,
const T* scale,
T* y) {
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
y[i] = convert::To<T>(
convert::To<AccT>(x[i]) *
convert::To<AccT>(__ldg(scale + (i / S) % C)));
}
}
template <typename T, typename AccT>
__global__ void _ChannelAffine(
const int NxCxS,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
const int j = (i / S) % C;
y[i] = convert::To<T>(
fma(convert::To<AccT>(x[i]),
convert::To<AccT>(__ldg(scale + j)),
convert::To<AccT>(__ldg(bias + j))));
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelAffine<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (bias != nullptr) { \
_ChannelAffine<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_ChannelAffine<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
void _ChannelShuffle(
const int N,
const int S,
const int G,
const int K,
const T* x,
T* y) {
for (int i = 0; i < N; ++i) {
for (int gi = 0; gi < G; ++gi) {
for (int ki = 0; ki < K; ++ki) {
std::memcpy(
y + ((i * K + ki) * G + gi) * S,
x + ((i * G + gi) * K + ki) * S,
S * sizeof(T));
}
}
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelShuffle<T, CPUContext>( \
const int N, \
const int S, \
const int C, \
const int G, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_ChannelShuffle(N, S, G, C / G, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
__global__ void _ChannelShuffle(
const int NxCxS,
const int S,
const int G,
const int K,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(index, NxCxS) {
const int j = index % S;
const int gi = index / S % G;
const int ki = index / S / G % K;
const int i = index / S / G / K;
y[index] = x[((i * G + gi) * K + ki) * S + j];
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelShuffle<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const int G, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
_ChannelShuffle<<< \
CUDA_BLOCKS(NxCxS), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>(NxCxS, S, G, C / G, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
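The index arithmetic in both ChannelShuffle kernels is the standard reshape-transpose-reshape. An equivalent NumPy sketch, with C = G * K channels and S flattened spatial elements:

import numpy as np

def channel_shuffle(x, G):
    N, C, S = x.shape                       # x viewed as (N, C, S)
    K = C // G
    # (N, G, K, S) -> swap group and per-group channel axes -> (N, C, S)
    return x.reshape(N, G, K, S).transpose(0, 2, 1, 3).reshape(N, C, S)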
...@@ -52,14 +52,23 @@ __global__ void _ComputeCounts( ...@@ -52,14 +52,23 @@ __global__ void _ComputeCounts(
CUDAContext* ctx) { \ CUDAContext* ctx) { \
math::Copy(dim, x, y, ctx); \ math::Copy(dim, x, y, ctx); \
auto policy = thrust::cuda::par.on(ctx->cuda_stream()); \ auto policy = thrust::cuda::par.on(ctx->cuda_stream()); \
auto* data = reinterpret_cast<math::ScalarType<T>::type*>(y); \
thrust::device_vector<int> order1(dim), order2(dim); \ thrust::device_vector<int> order1(dim), order2(dim); \
thrust::sequence(policy, order1.begin(), order1.end()); \ thrust::sequence(policy, order1.begin(), order1.end()); \
thrust::sequence(policy, order2.begin(), order2.end()); \ thrust::sequence(policy, order2.begin(), order2.end()); \
thrust::sort_by_key( \ thrust::sort_by_key( \
policy, y, y + dim, order1.begin(), math::LessFunctor<T>()); \ policy, \
data, \
data + dim, \
order1.begin(), \
math::LessFunctor<math::ScalarType<T>::type>()); \
auto last = thrust::unique_by_key( \ auto last = thrust::unique_by_key( \
policy, y, y + dim, order2.begin(), math::EqualFunctor<T>()); \ policy, \
int n = num[0] = last.first - y; \ data, \
data + dim, \
order2.begin(), \
math::EqualFunctor<math::ScalarType<T>::type>()); \
int n = num[0] = last.first - data; \
if (inverse_index) { \ if (inverse_index) { \
_RemapInverse<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _RemapInverse<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
dim, n, order1.data(), order2.data(), inverse_index); \ dim, n, order1.data(), order2.data(), inverse_index); \
......
...@@ -8,7 +8,7 @@ namespace kernels { ...@@ -8,7 +8,7 @@ namespace kernels {
namespace { namespace {
template <typename InputT, typename OutputT> template <typename InputT, typename OutputT>
void _ChannelNormalize( void _ChannelNorm(
const int axis, const int axis,
const int num_dims, const int num_dims,
const int64_t* x_strides, const int64_t* x_strides,
...@@ -19,15 +19,14 @@ void _ChannelNormalize( ...@@ -19,15 +19,14 @@ void _ChannelNormalize(
OutputT* y) { OutputT* y) {
const auto N = math::utils::Prod(num_dims, y_dims); const auto N = math::utils::Prod(num_dims, y_dims);
vec64_t idx(num_dims, 0); vec64_t idx(num_dims, 0);
int64_t xi, wi;
for (int yi = 0; yi < N; ++yi) { for (int yi = 0; yi < N; ++yi) {
xi = 0; int64_t xi = 0, wi;
for (int d = num_dims - 1; d >= 0; --d) { for (int d = num_dims - 1; d >= 0; --d) {
xi += idx[d] * x_strides[d]; xi += idx[d] * x_strides[d];
if (d == axis) wi = idx[d]; if (d == axis) wi = idx[d];
} }
y[yi] = const float val = convert::To<float>(x[xi]);
convert::To<OutputT>((convert::To<float>(x[xi]) - mean[wi]) / std[wi]); y[yi] = convert::To<OutputT>((val - mean[wi]) / std[wi]);
math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
} }
} }
...@@ -36,19 +35,19 @@ void _ChannelNormalize( ...@@ -36,19 +35,19 @@ void _ChannelNormalize(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \ #define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \ template <> \
void ChannelNormalize<InputT, OutputT, CPUContext>( \ void ChannelNorm<InputT, OutputT, CPUContext>( \
const int axis, \ const int axis, \
const int num_dims, \ const int num_dims, \
const int64_t* x_strides, \ const int64_t* x_strides, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const InputT* x, \ const InputT* x, \
const float* mean, \ const float* mean, \
const float* std, \ const float* std, \
OutputT* y, \ OutputT* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ChannelNormalize(axis, num_dims, x_strides, y_dims, x, mean, std, y); \ _ChannelNorm(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
} }
DEFINE_KERNEL_LAUNCHER(uint8_t, float16); DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
......
...@@ -11,7 +11,7 @@ namespace kernels { ...@@ -11,7 +11,7 @@ namespace kernels {
namespace { namespace {
template <typename InputT, typename OutputT, int D> template <typename InputT, typename OutputT, int D>
__global__ void _ChannelNormalize( __global__ void _ChannelNorm(
const int N, const int N,
const int axis, const int axis,
const int num_dims, const int num_dims,
...@@ -38,31 +38,27 @@ __global__ void _ChannelNormalize( ...@@ -38,31 +38,27 @@ __global__ void _ChannelNormalize(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \ #define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \ template <> \
void ChannelNormalize<InputT, OutputT, CUDAContext>( \ void ChannelNorm<InputT, OutputT, CUDAContext>( \
const int axis, \ const int axis, \
const int num_dims, \ const int num_dims, \
const int64_t* x_strides, \ const int64_t* x_strides, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const InputT* x, \ const InputT* x, \
const float* mean, \ const float* mean, \
const float* std, \ const float* std, \
OutputT* y, \ OutputT* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \ SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
const auto N = math::utils::Prod(num_dims, y_dims); \ const auto N = math::utils::Prod(num_dims, y_dims); \
for (int i = 0; i < num_dims; ++i) { \ for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \ X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \ Y_dims.data[i] = y_dims[i]; \
} \ } \
_ChannelNormalize<<< \ _ChannelNorm<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
CUDA_BLOCKS(N), \ N, axis, num_dims, X_strides, Y_dims, x, mean, std, y); \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N, axis, num_dims, X_strides, Y_dims, x, mean, std, y); \
} }
DEFINE_KERNEL_LAUNCHER(uint8_t, float16); DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
......
...@@ -8,7 +8,7 @@ namespace kernels { ...@@ -8,7 +8,7 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T>
void _L1Normalize( void _L1Norm(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -28,7 +28,7 @@ void _L1Normalize( ...@@ -28,7 +28,7 @@ void _L1Normalize(
} }
template <typename T> template <typename T>
void _L2Normalize( void _L2Norm(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -48,7 +48,7 @@ void _L2Normalize( ...@@ -48,7 +48,7 @@ void _L2Normalize(
} }
template <typename T> template <typename T>
void _L1NormalizeGrad( void _L1NormGrad(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -73,7 +73,7 @@ void _L1NormalizeGrad( ...@@ -73,7 +73,7 @@ void _L1NormalizeGrad(
} }
template <typename T> template <typename T>
void _L2NormalizeGrad( void _L2NormGrad(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -101,7 +101,7 @@ void _L2NormalizeGrad( ...@@ -101,7 +101,7 @@ void _L2NormalizeGrad(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> template <>
void L1Normalize<float16, CPUContext>( void L1Norm<float16, CPUContext>(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -114,7 +114,7 @@ void L1Normalize<float16, CPUContext>( ...@@ -114,7 +114,7 @@ void L1Normalize<float16, CPUContext>(
} }
template <> template <>
void L2Normalize<float16, CPUContext>( void L2Norm<float16, CPUContext>(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -127,7 +127,7 @@ void L2Normalize<float16, CPUContext>( ...@@ -127,7 +127,7 @@ void L2Normalize<float16, CPUContext>(
} }
template <> template <>
void L1NormalizeGrad<float16, CPUContext>( void L1NormGrad<float16, CPUContext>(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -138,10 +138,10 @@ void L1NormalizeGrad<float16, CPUContext>( ...@@ -138,10 +138,10 @@ void L1NormalizeGrad<float16, CPUContext>(
float16* dx, float16* dx,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} // L1NormalizeGrad } // L1NormGrad
template <> template <>
void L2NormalizeGrad<float16, CPUContext>( void L2NormGrad<float16, CPUContext>(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -152,7 +152,7 @@ void L2NormalizeGrad<float16, CPUContext>( ...@@ -152,7 +152,7 @@ void L2NormalizeGrad<float16, CPUContext>(
float16* dx, float16* dx,
CPUContext* ctx) { CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED; CPU_FP16_NOT_SUPPORTED;
} // L2NormalizeGrad } // L2NormGrad
#define DEFINE_KERNEL_LAUNCHER(name, T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
...@@ -183,14 +183,14 @@ void L2NormalizeGrad<float16, CPUContext>( ...@@ -183,14 +183,14 @@ void L2NormalizeGrad<float16, CPUContext>(
_##name<T>(N, S, C, normalizer, eps, dy, x, dx); \ _##name<T>(N, S, C, normalizer, eps, dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float); DEFINE_KERNEL_LAUNCHER(L1Norm, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double); DEFINE_KERNEL_LAUNCHER(L1Norm, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float); DEFINE_KERNEL_LAUNCHER(L2Norm, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double); DEFINE_KERNEL_LAUNCHER(L2Norm, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
...@@ -12,7 +12,7 @@ namespace kernels { ...@@ -12,7 +12,7 @@ namespace kernels {
namespace { namespace {
template <typename T, typename AccT> template <typename T, typename AccT>
__global__ void _L1Normalize( __global__ void _L1Norm(
const int NxS, const int NxS,
const int S, const int S,
const int C, const int C,
...@@ -41,7 +41,7 @@ __global__ void _L1Normalize( ...@@ -41,7 +41,7 @@ __global__ void _L1Normalize(
} }
template <typename T, typename AccT> template <typename T, typename AccT>
__global__ void _L2Normalize( __global__ void _L2Norm(
const int NxS, const int NxS,
const int S, const int S,
const int C, const int C,
...@@ -70,7 +70,7 @@ __global__ void _L2Normalize( ...@@ -70,7 +70,7 @@ __global__ void _L2Normalize(
} }
template <typename T, typename AccT> template <typename T, typename AccT>
__global__ void _L1NormalizeGrad( __global__ void _L1NormGrad(
const int NxS, const int NxS,
const int S, const int S,
const int C, const int C,
...@@ -107,7 +107,7 @@ __global__ void _L1NormalizeGrad( ...@@ -107,7 +107,7 @@ __global__ void _L1NormalizeGrad(
} }
template <typename T, typename AccT> template <typename T, typename AccT>
__global__ void _L2NormalizeGrad( __global__ void _L2NormGrad(
const int NxS, const int NxS,
const int S, const int S,
const int C, const int C,
...@@ -195,18 +195,18 @@ __global__ void _L2NormalizeGrad( ...@@ -195,18 +195,18 @@ __global__ void _L2NormalizeGrad(
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, float); DEFINE_KERNEL_LAUNCHER(L1Norm, float16, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float); DEFINE_KERNEL_LAUNCHER(L1Norm, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double); DEFINE_KERNEL_LAUNCHER(L1Norm, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, float); DEFINE_KERNEL_LAUNCHER(L2Norm, float16, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float); DEFINE_KERNEL_LAUNCHER(L2Norm, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double); DEFINE_KERNEL_LAUNCHER(L2Norm, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double); DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double); DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, double, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
namespace kernels { namespace kernels {
template <> namespace {
void Adam<float, CPUContext>(
template <typename T, typename CopyT>
void _Adam(
const int N, const int N,
const float lr, const T lr,
const float beta1, const T beta1,
const float beta2, const T beta2,
const float eps, const T eps,
float* g, const T wd,
float* m, const T* x,
float* v, const T* g,
CPUContext* ctx) { T* m,
T* v,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
float gi = g[i]; const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
float mi = m[i] = m[i] * beta1 + gi * (1 - beta1); const T mi = m[i] = std::fma(beta1, m[i], (T(1) - beta1) * gi);
float vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); const T vi = v[i] = std::fma(beta2, v[i], (T(1) - beta2) * gi * gi);
g[i] = lr * mi / (std::sqrt(vi) + eps); y[i] -= lr * mi / (std::sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
template <> template <typename T, typename CopyT>
void AdamW<float, CPUContext>( void _AdamW(
const int N, const int N,
const float lr, const T lr,
const float beta1, const T beta1,
const float beta2, const T beta2,
const float eps, const T eps,
const float wd, const T wd,
const float* x, const T* x,
float* g, const T* g,
float* m, T* m,
float* v, T* v,
CPUContext* ctx) { T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
float gi = g[i]; const T gi = g[i];
float mi = m[i] = m[i] * beta1 + gi * (1 - beta1); const T mi = m[i] = std::fma(beta1, m[i], (T(1) - beta1) * gi);
float vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); const T vi = v[i] = std::fma(beta2, v[i], (T(1) - beta2) * gi * gi);
g[i] = lr * mi / (std::sqrt(vi) + eps) + wd * x[i]; y[i] -= wd > T(0) ? std::fma(wd, x[i], lr * mi / (std::sqrt(vi) + eps))
: lr * mi / (std::sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float beta1, \
const float beta2, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(beta1), \
convert::To<T>(beta2), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(Adam, float, float16);
DEFINE_KERNEL_LAUNCHER(Adam, float, float);
DEFINE_KERNEL_LAUNCHER(Adam, double, double);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float16);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float);
DEFINE_KERNEL_LAUNCHER(AdamW, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels } // namespace kernels
} // namespace dragon } // namespace dragon
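The two kernels above differ only in where weight decay enters: _Adam folds wd * x into the gradient before the moment updates (classic L2 regularization), while _AdamW adds wd * x directly to the step (decoupled decay); bias correction is not part of these kernels and is presumably folded into lr by the caller. A NumPy sketch of the fused update:

import numpy as np

def adam_fused(x, g, m, v, y, lr, beta1, beta2, eps, wd,
               decoupled=False, y_copy=None):
    grad = g + wd * x if (wd > 0 and not decoupled) else g
    m[:] = beta1 * m + (1 - beta1) * grad
    v[:] = beta2 * v + (1 - beta2) * grad * grad
    step = lr * m / (np.sqrt(v) + eps)
    if wd > 0 and decoupled:
        step = step + wd * x                 # AdamW: decay applied to the step
    y[:] -= step
    if y_copy is not None:
        y_copy[:] = y.astype(y_copy.dtype)   # fused float16 weight copy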
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -9,25 +10,32 @@ namespace kernels { ...@@ -9,25 +10,32 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T, typename CopyT>
__global__ void _Adam( __global__ void _Adam(
const int N, const int N,
const T lr, const T lr,
const T beta1, const T beta1,
const T beta2, const T beta2,
const T eps, const T eps,
T* g, const T wd,
const T* x,
const T* g,
T* m, T* m,
T* v) { T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) { CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i]; const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
T mi = m[i] = m[i] * beta1 + gi * (1 - beta1); const T mi = m[i] = fma(beta1, m[i], (T(1) - beta1) * gi);
T vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); const T vi = v[i] = fma(beta2, v[i], (T(1) - beta2) * gi * gi);
g[i] = lr * mi / (sqrt(vi) + eps); y[i] -= lr * mi / (sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
template <typename T> template <typename T, typename CopyT>
__global__ void _AdamW( __global__ void _AdamW(
const int N, const int N,
const T lr, const T lr,
...@@ -36,14 +44,20 @@ __global__ void _AdamW( ...@@ -36,14 +44,20 @@ __global__ void _AdamW(
const T eps, const T eps,
const T wd, const T wd,
const T* x, const T* x,
T* g, const T* g,
T* m, T* m,
T* v) { T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) { CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i]; const T gi = g[i];
T mi = m[i] = m[i] * beta1 + gi * (1 - beta1); const T mi = m[i] = fma(beta1, m[i], (T(1) - beta1) * gi);
T vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); const T vi = v[i] = fma(beta2, v[i], (T(1) - beta2) * gi * gi);
g[i] = lr * mi / (sqrt(vi) + eps) + wd * x[i]; y[i] -= wd > T(0) ? fma(wd, x[i], lr * mi / (sqrt(vi) + eps))
: lr * mi / (sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
...@@ -51,37 +65,44 @@ __global__ void _AdamW( ...@@ -51,37 +65,44 @@ __global__ void _AdamW(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
void Adam<float, CUDAContext>( template <> \
const int N, void name<T, CopyT, CUDAContext>( \
const float lr, const int N, \
const float beta1, const float lr, \
const float beta2, const float beta1, \
const float eps, const float beta2, \
float* g, const float eps, \
float* m, const float wd, \
float* v, const T* x, \
CUDAContext* ctx) { const T* g, \
_Adam<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( T* m, \
N, lr, beta1, beta2, eps, g, m, v); T* v, \
} T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(beta1), \
convert::To<T>(beta2), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
template <> DEFINE_KERNEL_LAUNCHER(Adam, float, float16);
void AdamW<float, CUDAContext>( DEFINE_KERNEL_LAUNCHER(Adam, float, float);
const int N, DEFINE_KERNEL_LAUNCHER(Adam, double, double);
const float lr, DEFINE_KERNEL_LAUNCHER(AdamW, float, float16);
const float beta1, DEFINE_KERNEL_LAUNCHER(AdamW, float, float);
const float beta2, DEFINE_KERNEL_LAUNCHER(AdamW, double, double);
const float eps, #undef DEFINE_KERNEL_LAUNCHER
const float wd,
const float* x,
float* g,
float* m,
float* v,
CUDAContext* ctx) {
_AdamW<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, beta1, beta2, eps, wd, x, g, m, v);
}
} // namespace kernels } // namespace kernels
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
namespace kernels { namespace kernels {
template <> namespace {
void RMSprop<float, CPUContext>(
template <typename T, typename CopyT>
void _RMSprop(
const int N, const int N,
const float lr, const T lr,
const float momentum, const T momentum,
const float decay, const T alpha,
const float eps, const T eps,
float* g, const T wd,
float* m, const T* x,
float* v, const T* g,
CPUContext* ctx) { T* m,
T* v,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
float gi = g[i]; const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
float vi = v[i] = decay * v[i] + (1 - decay) * gi * gi; const T vi = v[i] = std::fma(alpha, v[i], (T(1) - alpha) * gi * gi);
float mi = m[i] = std::fma(momentum, m[i], gi / (std::sqrt(vi) + eps)); const T mi = m[i] = std::fma(momentum, m[i], gi / (std::sqrt(vi) + eps));
g[i] = lr * mi; y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float momentum, \
const float alpha, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(alpha), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float16);
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float);
DEFINE_KERNEL_LAUNCHER(RMSprop, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels } // namespace kernels
} // namespace dragon } // namespace dragon
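Same fusion pattern for RMSprop; note the decay argument is renamed to alpha, and the momentum buffer accumulates the normalized gradient. A NumPy sketch:

import numpy as np

def rmsprop_fused(x, g, m, v, y, lr, momentum, alpha, eps, wd, y_copy=None):
    grad = g + wd * x if wd > 0 else g
    v[:] = alpha * v + (1 - alpha) * grad * grad
    m[:] = momentum * m + grad / (np.sqrt(v) + eps)
    y[:] -= lr * m
    if y_copy is not None:
        y_copy[:] = y.astype(y_copy.dtype)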
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -9,21 +10,28 @@ namespace kernels { ...@@ -9,21 +10,28 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T, typename CopyT>
__global__ void _RMSprop( __global__ void _RMSprop(
const int N, const int N,
const T lr, const T lr,
const T momentum, const T momentum,
const T decay, const T alpha,
const T eps, const T eps,
T* g, const T wd,
const T* x,
const T* g,
T* m, T* m,
T* v) { T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) { CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i]; const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
T vi = v[i] = decay * v[i] + (1 - decay) * gi * gi; const T vi = v[i] = fma(alpha, v[i], (T(1) - alpha) * gi * gi);
T mi = m[i] = fma(momentum, m[i], gi / (sqrt(vi) + eps)); const T mi = m[i] = fma(momentum, m[i], gi / (sqrt(vi) + eps));
g[i] = lr * mi; y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
...@@ -31,20 +39,41 @@ __global__ void _RMSprop( ...@@ -31,20 +39,41 @@ __global__ void _RMSprop(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
void RMSprop<float, CUDAContext>( template <> \
const int N, void name<T, CopyT, CUDAContext>( \
const float lr, const int N, \
const float momentum, const float lr, \
const float decay, const float momentum, \
const float eps, const float alpha, \
float* g, const float eps, \
float* m, const float wd, \
float* v, const T* x, \
CUDAContext* ctx) { const T* g, \
_RMSprop<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( T* m, \
N, lr, momentum, decay, eps, g, m, v); T* v, \
} T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(alpha), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float16);
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float);
DEFINE_KERNEL_LAUNCHER(RMSprop, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels } // namespace kernels
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
namespace kernels { namespace kernels {
template <> namespace {
void MomentumSGD<float, CPUContext>(
template <typename T, typename CopyT>
void _MomentumSGD(
const int N, const int N,
const float lr, const T lr,
const float momentum, const T momentum,
float* g, const T wd,
float* m, const T* x,
CPUContext* ctx) { const T* g,
T* m,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
float mi = m[i] = std::fma(momentum, m[i], g[i]); const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
g[i] = lr * mi; const T mi = m[i] = std::fma(momentum, m[i], gi);
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
template <> template <typename T, typename CopyT>
void NesterovSGD<float, CPUContext>( void _NesterovSGD(
const int N, const int N,
const float lr, const T lr,
const float momentum, const T momentum,
float* g, const T wd,
float* m, const T* x,
CPUContext* ctx) { const T* g,
T* m,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
float gi = g[i]; const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
float mi = m[i] = std::fma(momentum, m[i], gi); const T mi = m[i] = std::fma(momentum, m[i], gi);
g[i] = lr * std::fma(momentum, mi, gi); y[i] -= lr * std::fma(momentum, mi, gi);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float momentum, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(wd), \
x, \
g, \
m, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, double, double);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels } // namespace kernels
} // namespace dragon } // namespace dragon
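The two CPU kernels differ only in the final step: classic momentum descends along lr * m, while the Nesterov variant descends along lr * (momentum * m + g), looking one step ahead of the updated velocity. A self-contained scalar sketch (hypothetical helpers; weight decay is assumed already folded into g as in the kernels above):

// One scalar step of each SGD variant.
inline void MomentumStep(float lr, float mu, float g, float& m, float& y) {
  m = mu * m + g;
  y -= lr * m;             // step along the velocity
}

inline void NesterovStep(float lr, float mu, float g, float& m, float& y) {
  m = mu * m + g;
  y -= lr * (mu * m + g);  // look ahead along the updated velocity
}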
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -9,22 +10,45 @@ namespace kernels { ...@@ -9,22 +10,45 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T, typename CopyT>
__global__ void __global__ void _MomentumSGD(
_MomentumSGD(const int N, const T lr, const T momentum, T* g, T* m) { const int N,
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) { CUDA_1D_KERNEL_LOOP(i, N) {
T mi = m[i] = fma(momentum, m[i], g[i]); const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
g[i] = lr * mi; const T mi = m[i] = fma(momentum, m[i], gi);
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
template <typename T> template <typename T, typename CopyT>
__global__ void __global__ void _NesterovSGD(
_NesterovSGD(const int N, const T lr, const T momentum, T* g, T* m) { const int N,
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) { CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i]; const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
T mi = m[i] = fma(momentum, m[i], gi); const T mi = m[i] = fma(momentum, m[i], gi);
g[i] = lr * fma(momentum, mi, gi); y[i] -= lr * fma(momentum, mi, gi);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
} }
} }
...@@ -32,29 +56,38 @@ _NesterovSGD(const int N, const T lr, const T momentum, T* g, T* m) { ...@@ -32,29 +56,38 @@ _NesterovSGD(const int N, const T lr, const T momentum, T* g, T* m) {
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
void MomentumSGD<float, CUDAContext>( template <> \
const int N, void name<T, CopyT, CUDAContext>( \
const float lr, const int N, \
const float momentum, const float lr, \
float* g, const float momentum, \
float* m, const float wd, \
CUDAContext* ctx) { const T* x, \
_MomentumSGD<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( const T* g, \
N, lr, momentum, g, m); T* m, \
} T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(wd), \
x, \
g, \
m, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
template <> DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float16);
void NesterovSGD<float, CUDAContext>( DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float);
const int N, DEFINE_KERNEL_LAUNCHER(MomentumSGD, double, double);
const float lr, DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float16);
const float momentum, DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float);
float* g, DEFINE_KERNEL_LAUNCHER(NesterovSGD, double, double);
float* m, #undef DEFINE_KERNEL_LAUNCHER
CUDAContext* ctx) {
_NesterovSGD<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, momentum, g, m);
}
} // namespace kernels } // namespace kernels
......
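A note on the launcher: float16 is a host-side storage type, while the kernel is instantiated with math::ScalarType<CopyT>::type, which names the native device representation; the reinterpret_cast when forwarding y_copy bridges the two. This reading follows from the cast in the macro above; the exact mapping is an implementation detail of the math utilities.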
...@@ -91,16 +91,24 @@ void RegisterModule_cuda(py::module& m) { ...@@ -91,16 +91,24 @@ void RegisterModule_cuda(py::module& m) {
#endif #endif
}); });
/*! \brief Set the flags of cuBLAS library */
m.def("cublasSetFlags", [](int allow_tf32) {
#ifdef USE_CUDA
auto& ctx = CUDAContext::objects();
if (allow_tf32 >= 0) ctx.cublas_allow_tf32_ = allow_tf32;
#endif
});
/*! \brief Set the flags of cuDNN library */ /*! \brief Set the flags of cuDNN library */
m.def( m.def(
"cudnnSetFlags", "cudnnSetFlags",
[](bool enabled, bool benchmark, bool deterministic, bool allow_tf32) { [](int enabled, int benchmark, int deterministic, int allow_tf32) {
#ifdef USE_CUDA #ifdef USE_CUDA
auto& cuda_objects = CUDAContext::objects(); auto& ctx = CUDAContext::objects();
cuda_objects.cudnn_enabled_ = enabled; if (enabled >= 0) ctx.cudnn_enabled_ = enabled;
cuda_objects.cudnn_deterministic_ = deterministic; if (benchmark >= 0) ctx.cudnn_benchmark_ = benchmark;
cuda_objects.cudnn_benchmark_ = benchmark; if (deterministic >= 0) ctx.cudnn_deterministic_ = deterministic;
cuda_objects.cudnn_allow_tf32_ = allow_tf32; if (allow_tf32 >= 0) ctx.cudnn_allow_tf32_ = allow_tf32;
#endif #endif
}); });
......
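The bindings switch from bool to int so that a negative value means "leave the flag as it is"; only non-negative values are written through. A minimal sketch of this tri-state convention (hypothetical struct and setter, not the actual binding):

// Tri-state setter: -1 keeps the current value, 0 disables, 1 enables.
struct LibraryFlags {
  bool cudnn_enabled = true;
  bool cudnn_benchmark = false;
};

void SetFlags(LibraryFlags& flags, int enabled, int benchmark) {
  if (enabled >= 0) flags.cudnn_enabled = enabled != 0;
  if (benchmark >= 0) flags.cudnn_benchmark = benchmark != 0;
}

// SetFlags(flags, -1, 1) toggles only benchmark and keeps enabled as-is.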
...@@ -132,8 +132,8 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -132,8 +132,8 @@ PYBIND11_MODULE(libdragon_python, m) {
PRINT(INFO) << GetVerboseDef(def.DebugString(), "graph"); PRINT(INFO) << GetVerboseDef(def.DebugString(), "graph");
} }
} }
// Return the graph name may be different from the def // The returned graph name may differ from the one in the def.
// We will make a unique dummy name on creating the graph // A unique dummy name is generated when creating the graph.
return graph->name(); return graph->name();
}) })
...@@ -175,8 +175,8 @@ PYBIND11_MODULE(libdragon_python, m) { ...@@ -175,8 +175,8 @@ PYBIND11_MODULE(libdragon_python, m) {
GraphDef init_graph, pred_graph; GraphDef init_graph, pred_graph;
onnx::ONNXBackend onnx_backend; onnx::ONNXBackend onnx_backend;
onnx_backend.Prepare(model_path, &init_graph, &pred_graph); onnx_backend.Prepare(model_path, &init_graph, &pred_graph);
// Serializing to Python is intractable // Serializing to Python is intractable.
// We should apply the initializer immediately // We should apply the initializer immediately.
self->RunGraph(self->CreateGraph(init_graph)->name()); self->RunGraph(self->CreateGraph(init_graph)->name());
return py::bytes(pred_graph.SerializeAsString()); return py::bytes(pred_graph.SerializeAsString());
}); });
......
...@@ -24,14 +24,14 @@ PythonPluginOp<Context>::PythonPluginOp(const OperatorDef& def, Workspace* ws) ...@@ -24,14 +24,14 @@ PythonPluginOp<Context>::PythonPluginOp(const OperatorDef& def, Workspace* ws)
Py_Initialize(); Py_Initialize();
auto* module = PyImport_ImportModule(module_name_.c_str()); auto* module = PyImport_ImportModule(module_name_.c_str());
CHECK(module) << "\nFailed to import module: " << module; CHECK(module) << "\nFailed to import module: " << module;
auto* module_dict = PyModule_GetDict(module); auto* module_dict = PyModule_GetDict(module);
auto* op_class = PyDict_GetItemString(module_dict, class_name_.c_str()); auto* op_class = PyDict_GetItemString(module_dict, class_name_.c_str());
CHECK(op_class) << "\nFailed to import class: " << class_name_ CHECK(op_class) << "\nFailed to import class: " << class_name_
<< " from module: " << module_name_; << " from module: " << module_name_;
self_ = PyObject_CallObject(op_class, NULL); self_ = PyObject_CallObject(op_class, NULL);
// Project inputs and outputs. // Set inputs and outputs.
inputs_ = PyList_New(InputSize()); inputs_ = PyList_New(InputSize());
outputs_ = PyList_New(OutputSize()); outputs_ = PyList_New(OutputSize());
for (int i = 0; i < InputSize(); i++) { for (int i = 0; i < InputSize(); i++) {
...@@ -41,16 +41,15 @@ PythonPluginOp<Context>::PythonPluginOp(const OperatorDef& def, Workspace* ws) ...@@ -41,16 +41,15 @@ PythonPluginOp<Context>::PythonPluginOp(const OperatorDef& def, Workspace* ws)
PyList_SetItem(outputs_, i, PyBytes_FromStdString(Output(i)->name())); PyList_SetItem(outputs_, i, PyBytes_FromStdString(Output(i)->name()));
} }
// Set: self.kwargs_str // Attr: "kwargs_str"
PyObject_SetAttr( PyObject_SetAttr(
self_, self_,
PyBytes_FromRawString("kwargs_str"), PyBytes_FromRawString("kwargs_str"),
PyBytes_FromStdString(kwargs_str_)); PyBytes_FromStdString(kwargs_str_));
// Method: self.setup(inputs, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("setup"))) { if (PyObject_HasAttr(self_, PyBytes_FromRawString("setup"))) {
CHECK(PyObject_CallMethod(self_, "setup", "OO", inputs_, outputs_)) CHECK(PyObject_CallMethod(self_, "setup", "OO", inputs_, outputs_))
<< CallMethodHelper("setup"); << CallMethodHelper("setup"); // Method: setup(inputs, outputs)
} }
} }
...@@ -67,27 +66,24 @@ string PythonPluginOp<Context>::CallMethodHelper(const string& method_name) { ...@@ -67,27 +66,24 @@ string PythonPluginOp<Context>::CallMethodHelper(const string& method_name) {
template <class Context> template <class Context>
void PythonPluginOp<Context>::RunOnDevice() { void PythonPluginOp<Context>::RunOnDevice() {
// GIL may have been released // GIL may have been released.
pybind11::gil_scoped_acquire g; pybind11::gil_scoped_acquire g;
// Atrribute: self.phase // Attr: phase
PyObject_SetAttr( PyObject_SetAttr(
self_, PyBytes_FromRawString("phase"), PyBytes_FromStdString(phase())); self_, PyBytes_FromRawString("phase"), PyBytes_FromStdString(phase()));
// Method: self.reshape(input, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("reshape"))) { if (PyObject_HasAttr(self_, PyBytes_FromRawString("reshape"))) {
CHECK(PyObject_CallMethod(self_, "reshape", "OO", inputs_, outputs_)) CHECK(PyObject_CallMethod(self_, "reshape", "OO", inputs_, outputs_))
<< CallMethodHelper("reshape"); << CallMethodHelper("reshape"); // Method: reshape(input, outputs)
} }
// Method: self.run(input, outputs)
// Method: self.forward(input, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("forward"))) { if (PyObject_HasAttr(self_, PyBytes_FromRawString("forward"))) {
CHECK(PyObject_CallMethod(self_, "forward", "OO", inputs_, outputs_)) CHECK(PyObject_CallMethod(self_, "forward", "OO", inputs_, outputs_))
<< CallMethodHelper("forward"); << CallMethodHelper("forward"); // Method: run(input, outputs)
} else if (PyObject_HasAttr(self_, PyBytes_FromRawString("run"))) { } else if (PyObject_HasAttr(self_, PyBytes_FromRawString("run"))) {
CHECK(PyObject_CallMethod(self_, "run", "OO", inputs_, outputs_)) CHECK(PyObject_CallMethod(self_, "run", "OO", inputs_, outputs_))
<< CallMethodHelper("run"); << CallMethodHelper("run"); // Method: forward(input, outputs)
} }
} }
......
...@@ -13,7 +13,6 @@ void ONNXBackend::Prepare( ...@@ -13,7 +13,6 @@ void ONNXBackend::Prepare(
ModelProto onnx_model; ModelProto onnx_model;
CHECK(ReadProtoFromBinaryFile(onnx_model_path.c_str(), &onnx_model)) CHECK(ReadProtoFromBinaryFile(onnx_model_path.c_str(), &onnx_model))
<< "\nFailed to parse the onnx model."; << "\nFailed to parse the onnx model.";
int opset_version = -1; int opset_version = -1;
for (const auto& imp : onnx_model.opset_import()) { for (const auto& imp : onnx_model.opset_import()) {
if ((!imp.has_domain()) || imp.domain().empty()) { if ((!imp.has_domain()) || imp.domain().empty()) {
...@@ -31,7 +30,6 @@ void ONNXBackend::Prepare( ...@@ -31,7 +30,6 @@ void ONNXBackend::Prepare(
std::cout << "Unrecognized operator set " << opset_version << std::endl; std::cout << "Unrecognized operator set " << opset_version << std::endl;
} }
} }
if (opset_version < 0) { if (opset_version < 0) {
if (onnx_model.ir_version() >= 0x00000003) { if (onnx_model.ir_version() >= 0x00000003) {
LOG(FATAL) << "Model with IR version >= 3 " LOG(FATAL) << "Model with IR version >= 3 "
...@@ -40,7 +38,6 @@ void ONNXBackend::Prepare( ...@@ -40,7 +38,6 @@ void ONNXBackend::Prepare(
opset_version = 1; opset_version = 1;
} }
} }
ONNXToDragon(onnx_model, opset_version, true, init_graph, pred_graph); ONNXToDragon(onnx_model, opset_version, true, init_graph, pred_graph);
} }
...@@ -52,22 +49,23 @@ void ONNXBackend::ONNXToDragon( ...@@ -52,22 +49,23 @@ void ONNXBackend::ONNXToDragon(
GraphDef* pred_graph) { GraphDef* pred_graph) {
ModelProto init_model = ModelProto(); ModelProto init_model = ModelProto();
ModelProto pred_model = onnx_model; ModelProto pred_model = onnx_model;
pred_graph->set_name(onnx_model.graph().name()); pred_graph->set_name(onnx_model.graph().name());
init_graph->set_name(onnx_model.graph().name() + "/init"); init_graph->set_name(onnx_model.graph().name() + "/init");
ValueInfoMap graph_value_infos{}; ValueInfoMap graph_value_infos{};
InitializerMap graph_initializer{}; InitializerMap graph_initializer{};
// Collect graph inputs.
for (const auto& vi : onnx_model.graph().input()) for (const auto& v : onnx_model.graph().input()) {
graph_value_infos[vi.name()].CopyFrom(vi); graph_value_infos[v.name()].CopyFrom(v);
}
for (const auto& vi : onnx_model.graph().output()) // Collect graph outputs.
graph_value_infos[vi.name()].CopyFrom(vi); for (const auto& v : onnx_model.graph().output()) {
graph_value_infos[v.name()].CopyFrom(v);
for (const auto& vi : onnx_model.graph().value_info()) }
graph_value_infos[vi.name()].CopyFrom(vi); // Collect graph values.
for (const auto& v : onnx_model.graph().value_info()) {
graph_value_infos[v.name()].CopyFrom(v);
}
// Collect graph initializers.
for (const auto& tensor : onnx_model.graph().initializer()) { for (const auto& tensor : onnx_model.graph().initializer()) {
if (include_initializers) { if (include_initializers) {
auto* op_def = init_graph->add_op(); auto* op_def = init_graph->add_op();
...@@ -76,16 +74,18 @@ void ONNXBackend::ONNXToDragon( ...@@ -76,16 +74,18 @@ void ONNXBackend::ONNXToDragon(
} }
graph_initializer[tensor.name()] = &tensor; graph_initializer[tensor.name()] = &tensor;
} }
// Convert to graph defs.
auto converter = [&](const ModelProto& model, GraphDef* graph) mutable { auto converter = [&](const ModelProto& model, GraphDef* graph) mutable {
for (const auto& node : model.graph().node()) { for (const auto& node : model.graph().node()) {
ValueInfoMap value_infos{}; ValueInfoMap value_infos{};
InitializerMap initializer{}; InitializerMap initializer{};
for (const auto& name : node.input()) { for (const auto& name : node.input()) {
if (graph_value_infos.count(name)) if (graph_value_infos.count(name)) {
value_infos[name].CopyFrom(graph_value_infos[name]); value_infos[name].CopyFrom(graph_value_infos[name]);
if (graph_initializer.count(name)) }
if (graph_initializer.count(name)) {
initializer[name] = graph_initializer[name]; initializer[name] = graph_initializer[name];
}
} }
auto onnx_node = ONNXNode(node); auto onnx_node = ONNXNode(node);
auto returns = ONNXNodeToOps( auto returns = ONNXNodeToOps(
...@@ -98,23 +98,18 @@ void ONNXBackend::ONNXToDragon( ...@@ -98,23 +98,18 @@ void ONNXBackend::ONNXToDragon(
} }
} }
}; };
converter(pred_model, pred_graph); converter(pred_model, pred_graph);
// Add external inputs.
// Set(Initializer) + Set(Placehoders) = Set(Inputs)
Set<string> initializer; Set<string> initializer;
for (const auto& e : onnx_model.graph().initializer()) { for (const auto& v : onnx_model.graph().initializer()) {
initializer.insert(e.name()); initializer.insert(v.name());
} }
// Add External Inputs
for (const auto& e : onnx_model.graph().input()) { for (const auto& e : onnx_model.graph().input()) {
if (initializer.count(e.name()) == 0) { if (initializer.count(e.name()) == 0) {
pred_graph->add_input(e.name()); pred_graph->add_input(e.name());
} }
} }
// Add external outputs.
// Add External Outputs
for (const auto& e : onnx_model.graph().output()) { for (const auto& e : onnx_model.graph().output()) {
pred_graph->add_output(e.name()); pred_graph->add_output(e.name());
} }
......
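The external-input pass is a set difference: any graph input that is not backed by an initializer must be fed by the caller. A compact standalone sketch of the same rule:

#include <set>
#include <string>
#include <vector>

// Graph inputs without an initializer are the true placeholders.
std::vector<std::string> ExternalInputs(
    const std::vector<std::string>& inputs,
    const std::set<std::string>& initializers) {
  std::vector<std::string> external;
  for (const auto& name : inputs) {
    if (initializers.count(name) == 0) external.push_back(name);
  }
  return external;
}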
#include "dragon/operators/array/channel_shuffle_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void ChannelShuffleOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
CHECK_EQ(X.dim(axis) % group_, 0)
<< "\nThe " << X.dim(axis) << " channels "
<< "can not be split into " << group_ << " groups.";
kernels::ChannelShuffle(
X.count(0, axis),
X.count(axis + 1),
X.dim(axis),
group_,
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ChannelShuffleOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void ChannelShuffleGradientOp<Context>::DoRunWithType() {
auto &dY = Input(0), *dX = Output(0);
GET_OP_AXIS_ARG(axis, dY.ndim(), -1);
kernels::ChannelShuffle(
dY.count(0, axis),
dY.count(axis + 1),
dY.dim(axis),
dY.dim(axis) / group_,
dY.template data<T, Context>(),
dX->ReshapeLike(dY)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ChannelShuffleGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(ChannelShuffle);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffle);
#endif
DEPLOY_CPU_OPERATOR(ChannelShuffleGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffleGradient);
#endif
OPERATOR_SCHEMA(ChannelShuffle)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(ChannelShuffleGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(ChannelShuffle, SimpleGradientMaker);
} // namespace dragon
#include "dragon/operators/array/shuffle_op.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void ChannelShuffleOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
CHECK_EQ(X.dim(axis) % group_, 0)
<< "\nThe " << X.dim(axis) << " channels "
<< "can not be split into " << group_ << " groups.";
auto G = group_, K = X.dim(axis) / group_;
if (def().type() == "ChannelShuffleGradient") std::swap(G, K);
math::Transpose(
4,
vec64_t({X.count(0, axis), G, K, X.count(axis + 1)}).data(),
vec64_t({0, 2, 1, 3}).data(),
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
DEPLOY_CPU_OPERATOR(ChannelShuffle);
REGISTER_CPU_OPERATOR(ChannelShuffleGradient, ChannelShuffleOp<CPUContext>);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffle);
REGISTER_CUDA_OPERATOR(ChannelShuffleGradient, ChannelShuffleOp<CUDAContext>);
#endif
OPERATOR_SCHEMA(ChannelShuffle).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(ChannelShuffleGradient).NumInputs(1).NumOutputs(1);
REGISTER_GRADIENT(ChannelShuffle, SimpleGradientMaker);
} // namespace dragon
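The rewritten operator expresses channel shuffle as a 4-D transpose: view the input as (outer, G, K, inner) with K = C / G and swap the two middle axes. The gradient is the same transpose with G and K exchanged, which is why a single op class registered under both names suffices. A naive index-level reference of the forward pass (not the shipped math::Transpose):

// y[n][k][g][j] = x[n][g][k][j], i.e. permutation (0, 2, 1, 3).
void ChannelShuffleRef(int outer, int G, int K, int inner,
                       const float* x, float* y) {
  for (int n = 0; n < outer; ++n)
    for (int g = 0; g < G; ++g)
      for (int k = 0; k < K; ++k)
        for (int j = 0; j < inner; ++j)
          y[((n * K + k) * G + g) * inner + j] =
              x[((n * G + g) * K + k) * inner + j];
}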
...@@ -10,8 +10,8 @@ ...@@ -10,8 +10,8 @@
* ------------------------------------------------------------ * ------------------------------------------------------------
*/ */
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_ #ifndef DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_ #define DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#include "dragon/core/operator.h" #include "dragon/core/operator.h"
...@@ -25,7 +25,9 @@ class ChannelShuffleOp final : public Operator<Context> { ...@@ -25,7 +25,9 @@ class ChannelShuffleOp final : public Operator<Context> {
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {} group_(OP_SINGLE_ARG(int64_t, "group", 1)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -34,22 +36,6 @@ class ChannelShuffleOp final : public Operator<Context> { ...@@ -34,22 +36,6 @@ class ChannelShuffleOp final : public Operator<Context> {
int64_t group_; int64_t group_;
}; };
template <class Context>
class ChannelShuffleGradientOp final : public Operator<Context> {
public:
ChannelShuffleGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
int64_t group_;
};
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_ #endif // DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#include "dragon/operators/array/channel_affine_op.h" #include "dragon/operators/math/affine_op.h"
#include "dragon/core/workspace.h" #include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void ChannelAffineOp<Context>::DoRunWithType() { void AffineOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), *Y = Output(0, {0}); auto &X = Input(0), &W = Input(1), *Y = Output(0, {0});
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
vec64_t affine_dims( // Compute affine dimensions.
{X.dims().begin() + axis, X.dims().begin() + end_axis + 1}); vec64_t affine_dims;
for (auto axis : axes_) {
axis = axis < 0 ? axis + X.ndim() : axis;
CHECK(axis >= 0 && axis < X.ndim())
<< "\nExcepted the axis in [-" << X.ndim() << ", " << X.ndim()
<< "), got " << axis << ".";
affine_dims.push_back(X.dim(axis));
}
CHECK(W.dims() == affine_dims) CHECK(W.dims() == affine_dims)
<< "\nExcepted the weight shape is " << Tensor::DimString(affine_dims) << "\nExcepted the weight shape is " << Tensor::DimString(affine_dims)
<< ", got " << W.DimString() << "."; << ", got " << W.DimString() << ".";
...@@ -23,10 +27,11 @@ void ChannelAffineOp<Context>::DoRunWithType() { ...@@ -23,10 +27,11 @@ void ChannelAffineOp<Context>::DoRunWithType() {
<< ", got " << Input(2).DimString() << "."; << ", got " << Input(2).DimString() << ".";
} }
kernels::ChannelAffine( math::Affine(
X.count(0, axis), X.ndim(),
X.count(end_axis + 1), X.dims().data(),
X.count(axis, end_axis + 1), axes_.size(),
axes_.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
W.template data<T, Context>(), W.template data<T, Context>(),
InputSize() <= 2 ? nullptr : Input(2).template data<T, Context>(), InputSize() <= 2 ? nullptr : Input(2).template data<T, Context>(),
...@@ -35,28 +40,30 @@ void ChannelAffineOp<Context>::DoRunWithType() { ...@@ -35,28 +40,30 @@ void ChannelAffineOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void ChannelAffineOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void ChannelAffineGradientOp<Context>::DoRunWithType() { void AffineGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), &dY = Input(2); auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
vec64_t affine_dims = {X.count(0, axis), // Compute reduce axes.
X.count(axis, end_axis + 1), vec64_t reduce_axes;
X.count(end_axis + 1)}, for (int i = 0; i < X.ndim(); ++i) {
affine_axes = {0, 2}; bool keep = true;
for (auto axis : axes_) {
axis = axis < 0 ? axis + X.ndim() : axis;
if (axis == i) keep = false;
}
if (keep) reduce_axes.push_back(i);
}
// Scratch to save the intermediates.
T* data = nullptr;
if (dW->has_name() && X.count() != W.count()) {
data = ctx()->workspace()->template data<T, Context>(X.count());
}
// dW = dY * X // dW = dY * X
if (dW->has_name()) { if (dW->has_name()) {
Output(1)->ReshapeLike(Input(1));
auto* x = Input(0).template data<T, Context>();
auto* dw = Output(1)->template mutable_data<T, Context>();
if (X.count() == W.count()) { if (X.count() == W.count()) {
math::Mul( math::Mul(
X.count(), X.count(),
...@@ -65,20 +72,19 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() { ...@@ -65,20 +72,19 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
dW->ReshapeLike(W)->template mutable_data<T, Context>(), dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx()); ctx());
} else { } else {
T* scratch = ctx()->workspace()->template data<T, Context>(X.count());
math::Mul( math::Mul(
X.count(), X.count(),
dY.template data<T, Context>(), dY.template data<T, Context>(),
X.template data<T, Context>(), X.template data<T, Context>(),
scratch, data,
ctx()); ctx());
math::ReduceSum( math::ReduceSum(
3, X.ndim(),
affine_dims.data(), X.dims().data(),
2, reduce_axes.size(),
affine_axes.data(), reduce_axes.data(),
1.f, 1.f,
scratch, data,
dW->ReshapeLike(W)->template mutable_data<T, Context>(), dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
...@@ -90,10 +96,10 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() { ...@@ -90,10 +96,10 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
dB->ReshapeLike(W)->CopyFrom(dY, ctx()); dB->ReshapeLike(W)->CopyFrom(dY, ctx());
} else { } else {
math::ReduceSum( math::ReduceSum(
3, X.ndim(),
affine_dims.data(), X.dims().data(),
2, reduce_axes.size(),
affine_axes.data(), reduce_axes.data(),
1.f, 1.f,
dY.template data<T, Context>(), dY.template data<T, Context>(),
dB->ReshapeLike(W)->template mutable_data<T, Context>(), dB->ReshapeLike(W)->template mutable_data<T, Context>(),
...@@ -103,11 +109,11 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() { ...@@ -103,11 +109,11 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
// dX = dY * W // dX = dY * W
if (dX->has_name()) { if (dX->has_name()) {
Output(0)->ReshapeLike(Input(-1)); math::Affine(
kernels::ChannelAffine( X.ndim(),
X.count(0, axis), X.dims().data(),
X.count(end_axis + 1), axes_.size(),
X.count(axis, end_axis + 1), axes_.data(),
dY.template data<T, Context>(), dY.template data<T, Context>(),
W.template data<T, Context>(), W.template data<T, Context>(),
(const T*)nullptr, (const T*)nullptr,
...@@ -116,22 +122,17 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() { ...@@ -116,22 +122,17 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
} }
} }
template <class Context> DEPLOY_CPU_OPERATOR(Affine);
void ChannelAffineGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(ChannelAffine);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelAffine); DEPLOY_CUDA_OPERATOR(Affine);
#endif #endif
DEPLOY_CPU_OPERATOR(ChannelAffineGradient); DEPLOY_CPU_OPERATOR(AffineGradient);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelAffineGradient); DEPLOY_CUDA_OPERATOR(AffineGradient);
#endif #endif
OPERATOR_SCHEMA(ChannelAffine) OPERATOR_SCHEMA(Affine)
/* X, W, B */ /* X, W, B */
.NumInputs(2, 3) .NumInputs(2, 3)
/* Y */ /* Y */
...@@ -139,7 +140,7 @@ OPERATOR_SCHEMA(ChannelAffine) ...@@ -139,7 +140,7 @@ OPERATOR_SCHEMA(ChannelAffine)
/* X => Y */ /* X => Y */
.AllowInplace({{0, 0}}); .AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ChannelAffineGradient) OPERATOR_SCHEMA(AffineGradient)
/* X, W, dY */ /* X, W, dY */
.NumInputs(3) .NumInputs(3)
/* dX, dW, dB */ /* dX, dW, dB */
...@@ -163,6 +164,6 @@ class GradientMaker final : public GradientMakerBase { ...@@ -163,6 +164,6 @@ class GradientMaker final : public GradientMakerBase {
} // namespace } // namespace
REGISTER_GRADIENT(ChannelAffine, GradientMaker); REGISTER_GRADIENT(Affine, GradientMaker);
} // namespace dragon } // namespace dragon
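The generalized op computes y = x * w (+ b) with w and b broadcast along the requested axes, and its gradient reduces over every remaining axis, exactly the reduce_axes built above. A reference sketch for the common case axes = {1} on an (N, C, S) view (hypothetical helper):

// y[n][c][s] = x[n][c][s] * w[c] + b[c] for axes = {1} on (N, C, S).
void AffineRef(int N, int C, int S, const float* x,
               const float* w, const float* b, float* y) {
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int s = 0; s < S; ++s) {
        const int i = (n * C + c) * S + s;
        y[i] = x[i] * w[c] + (b ? b[c] : 0.f);
      }
}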
...@@ -10,37 +10,49 @@ ...@@ -10,37 +10,49 @@
* ------------------------------------------------------------ * ------------------------------------------------------------
*/ */
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_ #ifndef DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_ #define DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#include "dragon/core/operator.h" #include "dragon/core/operator.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
class ChannelAffineOp final : public Operator<Context> { class AffineOp final : public Operator<Context> {
public: public:
SIMPLE_CTOR_DTOR(ChannelAffineOp); AffineOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected:
vec64_t axes_;
}; };
template <class Context> template <class Context>
class ChannelAffineGradientOp final : public Operator<Context> { class AffineGradientOp final : public Operator<Context> {
public: public:
SIMPLE_CTOR_DTOR(ChannelAffineGradientOp); AffineGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected:
vec64_t axes_;
}; };
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_ #endif // DRAGON_OPERATORS_MATH_AFFINE_OP_H_
...@@ -26,6 +26,7 @@ DISPATCH_WITH_TENSOR_TYPES(IsInf, dtypes::Floating, Input(0)); ...@@ -26,6 +26,7 @@ DISPATCH_WITH_TENSOR_TYPES(IsInf, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(IsNaN, dtypes::Floating, Input(0)); DISPATCH_WITH_TENSOR_TYPES(IsNaN, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(IsFinite, dtypes::Floating, Input(0)); DISPATCH_WITH_TENSOR_TYPES(IsFinite, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Pow, dtypes::Floating, Input(0)); DISPATCH_WITH_TENSOR_TYPES(Pow, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Atan2, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Minimum, dtypes::Numerical, Input(0)); DISPATCH_WITH_TENSOR_TYPES(Minimum, dtypes::Numerical, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Maximum, dtypes::Numerical, Input(0)); DISPATCH_WITH_TENSOR_TYPES(Maximum, dtypes::Numerical, Input(0));
DISPATCH_WITH_TENSOR_TYPES(BitwiseNot, dtypes::Bitwise, Input(0)); DISPATCH_WITH_TENSOR_TYPES(BitwiseNot, dtypes::Bitwise, Input(0));
...@@ -120,6 +121,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(BitwiseNot, T); ...@@ -120,6 +121,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(BitwiseNot, T);
} }
DEFINE_SIMPLE_BINARY_OP_IMPL(Pow, T); DEFINE_SIMPLE_BINARY_OP_IMPL(Pow, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Atan2, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Minimum, T); DEFINE_SIMPLE_BINARY_OP_IMPL(Minimum, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Maximum, T); DEFINE_SIMPLE_BINARY_OP_IMPL(Maximum, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(BitwiseAnd, T); DEFINE_SIMPLE_BINARY_OP_IMPL(BitwiseAnd, T);
...@@ -152,6 +154,7 @@ DEPLOY_CPU_OPERATOR(IsInf); ...@@ -152,6 +154,7 @@ DEPLOY_CPU_OPERATOR(IsInf);
DEPLOY_CPU_OPERATOR(IsNaN); DEPLOY_CPU_OPERATOR(IsNaN);
DEPLOY_CPU_OPERATOR(IsFinite); DEPLOY_CPU_OPERATOR(IsFinite);
DEPLOY_CPU_OPERATOR(Pow); DEPLOY_CPU_OPERATOR(Pow);
DEPLOY_CPU_OPERATOR(Atan2);
DEPLOY_CPU_OPERATOR(Minimum); DEPLOY_CPU_OPERATOR(Minimum);
DEPLOY_CPU_OPERATOR(Maximum); DEPLOY_CPU_OPERATOR(Maximum);
DEPLOY_CPU_OPERATOR(BitwiseNot); DEPLOY_CPU_OPERATOR(BitwiseNot);
...@@ -186,6 +189,7 @@ DEPLOY_CUDA_OPERATOR(IsInf); ...@@ -186,6 +189,7 @@ DEPLOY_CUDA_OPERATOR(IsInf);
DEPLOY_CUDA_OPERATOR(IsNaN); DEPLOY_CUDA_OPERATOR(IsNaN);
DEPLOY_CUDA_OPERATOR(IsFinite); DEPLOY_CUDA_OPERATOR(IsFinite);
DEPLOY_CUDA_OPERATOR(Pow); DEPLOY_CUDA_OPERATOR(Pow);
DEPLOY_CUDA_OPERATOR(Atan2);
DEPLOY_CUDA_OPERATOR(Minimum); DEPLOY_CUDA_OPERATOR(Minimum);
DEPLOY_CUDA_OPERATOR(Maximum); DEPLOY_CUDA_OPERATOR(Maximum);
DEPLOY_CUDA_OPERATOR(BitwiseNot); DEPLOY_CUDA_OPERATOR(BitwiseNot);
...@@ -222,6 +226,7 @@ OPERATOR_SCHEMA(IsNaN).NumInputs(1).NumOutputs(1); ...@@ -222,6 +226,7 @@ OPERATOR_SCHEMA(IsNaN).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(IsFinite).NumInputs(1).NumOutputs(1); OPERATOR_SCHEMA(IsFinite).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(Not).NumInputs(1).NumOutputs(1); OPERATOR_SCHEMA(Not).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(Pow).NumInputs(2).NumOutputs(1); OPERATOR_SCHEMA(Pow).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Atan2).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Minimum).NumInputs(2).NumOutputs(1); OPERATOR_SCHEMA(Minimum).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Maximum).NumInputs(2).NumOutputs(1); OPERATOR_SCHEMA(Maximum).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(BitwiseAnd) OPERATOR_SCHEMA(BitwiseAnd)
...@@ -250,6 +255,7 @@ NO_GRADIENT(Round); ...@@ -250,6 +255,7 @@ NO_GRADIENT(Round);
NO_GRADIENT(IsInf); NO_GRADIENT(IsInf);
NO_GRADIENT(IsNaN); NO_GRADIENT(IsNaN);
NO_GRADIENT(IsFinite); NO_GRADIENT(IsFinite);
NO_GRADIENT(Atan2);
NO_GRADIENT(BitwiseNot); NO_GRADIENT(BitwiseNot);
NO_GRADIENT(BitwiseAnd); NO_GRADIENT(BitwiseAnd);
NO_GRADIENT(BitwiseOr); NO_GRADIENT(BitwiseOr);
......
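Atan2 slots into the existing binary elementwise pattern: two inputs, one output, broadcasting handled by the shared dispatch, and no gradient registered. Its per-element semantics are the standard two-argument arctangent:

#include <cmath>

// Elementwise reference: y[i] = atan2(a[i], b[i]).
void Atan2Ref(int n, const float* a, const float* b, float* y) {
  for (int i = 0; i < n; ++i) y[i] = std::atan2(a[i], b[i]);
}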
...@@ -70,7 +70,7 @@ inline vec32_t CheckOutputAliases( ...@@ -70,7 +70,7 @@ inline vec32_t CheckOutputAliases(
return available_aliases; return available_aliases;
} }
// Unary ElementwiseOp // Unary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Abs); DECLARE_ELEMENTWISE_OP(Abs);
DECLARE_ELEMENTWISE_OP(Ceil); DECLARE_ELEMENTWISE_OP(Ceil);
DECLARE_ELEMENTWISE_OP(Cos); DECLARE_ELEMENTWISE_OP(Cos);
...@@ -101,12 +101,13 @@ DECLARE_ELEMENTWISE_OP(SignGradient); ...@@ -101,12 +101,13 @@ DECLARE_ELEMENTWISE_OP(SignGradient);
DECLARE_ELEMENTWISE_OP(SinGradient); DECLARE_ELEMENTWISE_OP(SinGradient);
DECLARE_ELEMENTWISE_OP(SqrtGradient); DECLARE_ELEMENTWISE_OP(SqrtGradient);
DECLARE_ELEMENTWISE_OP(SquareGradient); DECLARE_ELEMENTWISE_OP(SquareGradient);
// Binary ElementwiseOp // Binary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Add); DECLARE_ELEMENTWISE_OP(Add);
DECLARE_ELEMENTWISE_OP(Sub); DECLARE_ELEMENTWISE_OP(Sub);
DECLARE_ELEMENTWISE_OP(Mul); DECLARE_ELEMENTWISE_OP(Mul);
DECLARE_ELEMENTWISE_OP(Div); DECLARE_ELEMENTWISE_OP(Div);
DECLARE_ELEMENTWISE_OP(Pow); DECLARE_ELEMENTWISE_OP(Pow);
DECLARE_ELEMENTWISE_OP(Atan2);
DECLARE_ELEMENTWISE_OP(Minimum); DECLARE_ELEMENTWISE_OP(Minimum);
DECLARE_ELEMENTWISE_OP(Maximum); DECLARE_ELEMENTWISE_OP(Maximum);
DECLARE_ELEMENTWISE_OP(BitwiseAnd); DECLARE_ELEMENTWISE_OP(BitwiseAnd);
...@@ -128,7 +129,7 @@ DECLARE_ELEMENTWISE_OP(DivGradient); ...@@ -128,7 +129,7 @@ DECLARE_ELEMENTWISE_OP(DivGradient);
DECLARE_ELEMENTWISE_OP(PowGradient); DECLARE_ELEMENTWISE_OP(PowGradient);
DECLARE_ELEMENTWISE_OP(MinimumGradient); DECLARE_ELEMENTWISE_OP(MinimumGradient);
DECLARE_ELEMENTWISE_OP(MaximumGradient); DECLARE_ELEMENTWISE_OP(MaximumGradient);
// Trinary ElementwiseOp // Trinary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Where); DECLARE_ELEMENTWISE_OP(Where);
DECLARE_ELEMENTWISE_OP(WhereGradient); DECLARE_ELEMENTWISE_OP(WhereGradient);
#undef DECLARE_ELEMENTWISE_OP #undef DECLARE_ELEMENTWISE_OP
......
...@@ -199,11 +199,6 @@ void MatMulOp<Context>::DoRunWithType() { ...@@ -199,11 +199,6 @@ void MatMulOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void MatMulOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void MatMulGradientOp<Context>::DoRunWithType() { void MatMulGradientOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), &dY = Input(2); auto &A = Input(0), &B = Input(1), &dY = Input(2);
...@@ -590,11 +585,6 @@ void MatMulGradientOp<Context>::DoRunWithType() { ...@@ -590,11 +585,6 @@ void MatMulGradientOp<Context>::DoRunWithType() {
} }
} }
template <class Context>
void MatMulGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(MatMul); DEPLOY_CPU_OPERATOR(MatMul);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(MatMul); DEPLOY_CUDA_OPERATOR(MatMul);
......
...@@ -23,7 +23,9 @@ class MatMulOp final : public Operator<Context> { ...@@ -23,7 +23,9 @@ class MatMulOp final : public Operator<Context> {
SIMPLE_CTOR_DTOR(MatMulOp); SIMPLE_CTOR_DTOR(MatMulOp);
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -35,7 +37,9 @@ class MatMulGradientOp final : public Operator<Context> { ...@@ -35,7 +37,9 @@ class MatMulGradientOp final : public Operator<Context> {
SIMPLE_CTOR_DTOR(MatMulGradientOp); SIMPLE_CTOR_DTOR(MatMulGradientOp);
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
......
#include "dragon/operators/array/channel_normalize_op.h" #include "dragon/operators/normalization/channel_norm_op.h"
#include "dragon/core/workspace.h" #include "dragon/core/workspace.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -6,7 +6,7 @@ namespace dragon { ...@@ -6,7 +6,7 @@ namespace dragon {
template <class Context> template <class Context>
template <typename InputT, typename OutputT> template <typename InputT, typename OutputT>
void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() { void ChannelNormOp<Context>::DoRunWithTypeAndCast() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1); GET_OP_AXIS_ARG(axis, X.ndim(), -1);
...@@ -30,7 +30,7 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() { ...@@ -30,7 +30,7 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
<< "\nProviding " << X_mean_.count() << " values to normalize Dimension(" << "\nProviding " << X_mean_.count() << " values to normalize Dimension("
<< Y_dims[axis] << ")."; << Y_dims[axis] << ").";
kernels::ChannelNormalize( kernels::ChannelNorm(
axis, axis,
num_dims, num_dims,
X_strides.data(), X_strides.data(),
...@@ -44,7 +44,7 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() { ...@@ -44,7 +44,7 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void ChannelNormalizeOp<Context>::DoRunWithType() { void ChannelNormOp<Context>::DoRunWithType() {
if (data_type() == "float16") { if (data_type() == "float16") {
DoRunWithTypeAndCast<T, float16>(); DoRunWithTypeAndCast<T, float16>();
} else if (data_type() == "float32") { } else if (data_type() == "float32") {
...@@ -58,21 +58,21 @@ void ChannelNormalizeOp<Context>::DoRunWithType() { ...@@ -58,21 +58,21 @@ void ChannelNormalizeOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void ChannelNormalizeOp<Context>::RunOnDevice() { void ChannelNormOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0)); DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
} }
DEPLOY_CPU_OPERATOR(ChannelNormalize); DEPLOY_CPU_OPERATOR(ChannelNorm);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelNormalize); DEPLOY_CUDA_OPERATOR(ChannelNorm);
#endif #endif
OPERATOR_SCHEMA(ChannelNormalize) OPERATOR_SCHEMA(ChannelNorm)
/* X */ /* X */
.NumInputs(1) .NumInputs(1)
/* Y */ /* Y */
.NumOutputs(1); .NumOutputs(1);
NO_GRADIENT(ChannelNormalize); NO_GRADIENT(ChannelNorm);
} // namespace dragon } // namespace dragon
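ChannelNorm (the renamed ChannelNormalize) subtracts a per-channel mean and divides by a per-channel std, optionally permuting dimensions and casting the output dtype. Ignoring the permutation and cast, the per-element rule is simple; a reference sketch on an (N, C, S) view:

// y[i] = (x[i] - mean[c]) / std[c] along axis 1 of (N, C, S).
void ChannelNormRef(int N, int C, int S, const float* x,
                    const float* mean, const float* std_, float* y) {
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int s = 0; s < S; ++s) {
        const int i = (n * C + c) * S + s;
        y[i] = (x[i] - mean[c]) / std_[c];
      }
}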
...@@ -10,17 +10,17 @@ ...@@ -10,17 +10,17 @@
* ------------------------------------------------------------ * ------------------------------------------------------------
*/ */
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_ #ifndef DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_ #define DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#include "dragon/core/operator.h" #include "dragon/core/operator.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
class ChannelNormalizeOp final : public Operator<Context> { class ChannelNormOp final : public Operator<Context> {
public: public:
ChannelNormalizeOp(const OperatorDef& def, Workspace* ws) ChannelNormOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) { : Operator<Context>(def, ws) {
INITIALIZE_OP_REPEATED_ARG(int64_t, perm); INITIALIZE_OP_REPEATED_ARG(int64_t, perm);
auto mean = OP_REPEATED_ARG(float, "mean"); auto mean = OP_REPEATED_ARG(float, "mean");
...@@ -50,8 +50,8 @@ class ChannelNormalizeOp final : public Operator<Context> { ...@@ -50,8 +50,8 @@ class ChannelNormalizeOp final : public Operator<Context> {
DECLARE_OP_REPEATED_ARG(int64_t, perm); DECLARE_OP_REPEATED_ARG(int64_t, perm);
}; };
DEFINE_OP_REPEATED_ARG(int64_t, ChannelNormalizeOp, perm); DEFINE_OP_REPEATED_ARG(int64_t, ChannelNormOp, perm);
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_ #endif // DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#include "dragon/operators/normalization/lp_normalize_op.h" #include "dragon/operators/normalization/lp_norm_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void LpNormalizeOp<Context>::DoRunWithType() { void LpNormOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1); GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis); GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
auto reduce_dim = X.count(axis, end_axis + 1); auto reduce_dim = X.count(axis, end_axis + 1);
// Normalize input with a scaled Lp-norm
if (p_ == 1) { if (p_ == 1) {
kernels::L1Normalize( kernels::L1Norm(
X.count(0, axis), X.count(0, axis),
X.count(end_axis + 1), X.count(end_axis + 1),
reduce_dim, reduce_dim,
...@@ -25,7 +22,7 @@ void LpNormalizeOp<Context>::DoRunWithType() { ...@@ -25,7 +22,7 @@ void LpNormalizeOp<Context>::DoRunWithType() {
Y->ReshapeLike(X)->template mutable_data<T, Context>(), Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (p_ == 2) { } else if (p_ == 2) {
kernels::L2Normalize( kernels::L2Norm(
X.count(0, axis), X.count(0, axis),
X.count(end_axis + 1), X.count(end_axis + 1),
reduce_dim, reduce_dim,
...@@ -40,20 +37,15 @@ void LpNormalizeOp<Context>::DoRunWithType() { ...@@ -40,20 +37,15 @@ void LpNormalizeOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void LpNormalizeOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void LpNormalizeGradientOp<Context>::DoRunWithType() { void LpNormGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(1), *dX = Output(0); auto &X = Input(0), &dY = Input(1), *dX = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1); GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis); GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
auto reduce_dim = X.count(axis, end_axis + 1); auto reduce_dim = X.count(axis, end_axis + 1);
if (p_ == 1) { if (p_ == 1) {
kernels::L1NormalizeGrad( kernels::L1NormGrad(
X.count(0, axis), X.count(0, axis),
X.count(end_axis + 1), X.count(end_axis + 1),
reduce_dim, reduce_dim,
...@@ -64,7 +56,7 @@ void LpNormalizeGradientOp<Context>::DoRunWithType() { ...@@ -64,7 +56,7 @@ void LpNormalizeGradientOp<Context>::DoRunWithType() {
dX->ReshapeLike(X)->template mutable_data<T, Context>(), dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (p_ == 2) { } else if (p_ == 2) {
kernels::L2NormalizeGrad( kernels::L2NormGrad(
X.count(0, axis), X.count(0, axis),
X.count(end_axis + 1), X.count(end_axis + 1),
reduce_dim, reduce_dim,
...@@ -79,33 +71,28 @@ void LpNormalizeGradientOp<Context>::DoRunWithType() { ...@@ -79,33 +71,28 @@ void LpNormalizeGradientOp<Context>::DoRunWithType() {
} }
} }
template <class Context> DEPLOY_CPU_OPERATOR(LpNorm);
void LpNormalizeGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(LpNormalize);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LpNormalize); DEPLOY_CUDA_OPERATOR(LpNorm);
#endif #endif
DEPLOY_CPU_OPERATOR(LpNormalizeGradient); DEPLOY_CPU_OPERATOR(LpNormGradient);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LpNormalizeGradient); DEPLOY_CUDA_OPERATOR(LpNormGradient);
#endif #endif
OPERATOR_SCHEMA(LpNormalize) OPERATOR_SCHEMA(LpNorm)
/* X */ /* X */
.NumInputs(1) .NumInputs(1)
/* Y */ /* Y */
.NumOutputs(1); .NumOutputs(1);
OPERATOR_SCHEMA(LpNormalizeGradient) OPERATOR_SCHEMA(LpNormGradient)
/* X, dY */ /* X, dY */
.NumInputs(2) .NumInputs(2)
/* dX */ /* dX */
.NumOutputs(1); .NumOutputs(1);
REGISTER_GRADIENT(LpNormalize, GenericGradientMaker); REGISTER_GRADIENT(LpNorm, GenericGradientMaker);
} // namespace dragon } // namespace dragon
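LpNorm divides the input by its L1 or L2 norm over the axis span, with epsilon guarding a vanishing norm and the reduction argument selecting SUM or MEAN normalization. A minimal L2 sketch over one contiguous span, assuming SUM reduction and one common epsilon placement (the kernel's exact convention may differ):

#include <algorithm>
#include <cmath>

// y = x / max(||x||_2, eps) over a contiguous span of size d.
void L2NormRef(int d, float eps, const float* x, float* y) {
  float sq = 0.f;
  for (int i = 0; i < d; ++i) sq += x[i] * x[i];
  const float norm = std::max(std::sqrt(sq), eps);
  for (int i = 0; i < d; ++i) y[i] = x[i] / norm;
}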
...@@ -10,24 +10,26 @@ ...@@ -10,24 +10,26 @@
* ------------------------------------------------------------ * ------------------------------------------------------------
*/ */
#ifndef DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_ #ifndef DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
#define DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_ #define DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
#include "dragon/core/operator.h" #include "dragon/core/operator.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
class LpNormalizeOp final : public Operator<Context> { class LpNormOp final : public Operator<Context> {
public: public:
LpNormalizeOp(const OperatorDef& def, Workspace* ws) LpNormOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
p_(OP_SINGLE_ARG(int64_t, "p", 2)), p_(OP_SINGLE_ARG(int64_t, "p", 2)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)), epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)),
reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {} reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -39,16 +41,18 @@ class LpNormalizeOp final : public Operator<Context> { ...@@ -39,16 +41,18 @@ class LpNormalizeOp final : public Operator<Context> {
}; };
template <class Context> template <class Context>
class LpNormalizeGradientOp final : public Operator<Context> { class LpNormGradientOp final : public Operator<Context> {
public: public:
LpNormalizeGradientOp(const OperatorDef& def, Workspace* ws) LpNormGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
p_(OP_SINGLE_ARG(int64_t, "p", 2)), p_(OP_SINGLE_ARG(int64_t, "p", 2)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)), epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)),
reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {} reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -61,4 +65,4 @@ class LpNormalizeGradientOp final : public Operator<Context> { ...@@ -61,4 +65,4 @@ class LpNormalizeGradientOp final : public Operator<Context> {
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_ #endif // DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
...@@ -5,46 +5,41 @@ ...@@ -5,46 +5,41 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
void AdamOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) { template <typename T, typename CopyT>
void AdamOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::Adam( kernels::Adam(
dX->count(), dX->count(),
lr_ * correction_, lr_ * correction_,
beta1_, beta1_,
beta2_, beta2_,
eps_, eps_,
dX->template mutable_data<float, Context>(), this->weight_decay_,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(), X->template data<T, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(), dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx()); ctx());
} }
template <class Context> template <class Context>
void AdamWOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) { template <typename T, typename CopyT>
if (lambda_ > 0.f) { void AdamWOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::AdamW( kernels::AdamW(
dX->count(), dX->count(),
lr_ * correction_, lr_ * correction_,
beta1_, beta1_,
beta2_, beta2_,
eps_, eps_,
this->lr_ * lambda_, lr_ * this->weight_decay_,
X->template data<float, Context>(), X->template data<T, Context>(),
dX->template mutable_data<float, Context>(), dX->template mutable_data<T, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(), GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(), GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
ctx()); X->template mutable_data<T, Context>(),
} else { Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
kernels::Adam( ctx());
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
}
} }
DEPLOY_CPU_OPERATOR(Adam); DEPLOY_CPU_OPERATOR(Adam);
......
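The fusion preserves the key distinction between the two optimizers: Adam folds weight decay into the gradient before the moment updates (classic L2 regularization, passing this->weight_decay_ directly), while AdamW decays the weights outside the adaptive moments, passing lr * weight_decay. A scalar sketch of both rules, with bias correction folded into lr and hypothetical names:

#include <cmath>

// Adam: coupled decay enters the moments through the gradient.
void AdamStep(float lr, float b1, float b2, float eps, float wd,
              float x, float g, float& m, float& v, float& y) {
  const float gi = g + wd * x;
  m = b1 * m + (1.f - b1) * gi;
  v = b2 * v + (1.f - b2) * gi * gi;
  y -= lr * m / (std::sqrt(v) + eps);
}

// AdamW: decoupled decay acts on the weights directly.
void AdamWStep(float lr, float b1, float b2, float eps, float wd,
               float x, float g, float& m, float& v, float& y) {
  m = b1 * m + (1.f - b1) * g;
  v = b2 * v + (1.f - b2) * g * g;
  y -= lr * (m / (std::sqrt(v) + eps) + wd * x);
}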
...@@ -5,16 +5,21 @@ ...@@ -5,16 +5,21 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
void RMSpropOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) { template <typename T, typename CopyT>
void RMSpropOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::RMSprop( kernels::RMSprop(
dX->count(), dX->count(),
lr_, lr_,
momentum_, momentum_,
decay_, alpha_,
eps_, eps_,
dX->template mutable_data<float, Context>(), this->weight_decay_,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(), X->template data<T, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(), dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx()); ctx());
} }
......
...@@ -6,33 +6,44 @@ ...@@ -6,33 +6,44 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
void MomentumSGDOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) { template <typename T, typename CopyT>
void MomentumSGDOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::MomentumSGD( kernels::MomentumSGD(
dX->count(), dX->count(),
lr_, lr_,
momentum_, momentum_,
dX->template mutable_data<float, Context>(), this->weight_decay_,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(), X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx()); ctx());
} }
template <class Context> template <class Context>
void NesterovSGDOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) { template <typename T, typename CopyT>
void NesterovSGDOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::NesterovSGD( kernels::NesterovSGD(
dX->count(), dX->count(),
lr_, lr_,
momentum_, momentum_,
dX->template mutable_data<float, Context>(), this->weight_decay_,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(), X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx()); ctx());
} }
template <class Context> template <class Context>
void LARSOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) { template <typename T, typename CopyT>
void LARSOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
float trust_ratio = 0.f; float trust_ratio = 0.f;
if (trust_coef_ > 0.f) { if (trust_coef_ > 0.f) {
auto* x = X->template data<float, Context>(); auto* x = X->template data<T, Context>();
auto* dx = dX->template mutable_data<float, Context>(); auto* dx = dX->template mutable_data<T, Context>();
float x_norm = std::sqrt(math::Dot(X->count(), x, x, ctx())); float x_norm = std::sqrt(math::Dot(X->count(), x, x, ctx()));
float dx_norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx())); float dx_norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (x_norm > 0.f && dx_norm > 0.f) { if (x_norm > 0.f && dx_norm > 0.f) {
...@@ -43,16 +54,20 @@ void LARSOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) { ...@@ -43,16 +54,20 @@ void LARSOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) {
math::Scale( math::Scale(
dX->count(), dX->count(),
trust_ratio, trust_ratio,
dX->template data<float, Context>(), dX->template data<T, Context>(),
dX->template mutable_data<float, Context>(), dX->template mutable_data<T, Context>(),
ctx()); ctx());
} }
kernels::MomentumSGD( kernels::MomentumSGD(
dX->count(), dX->count(),
lr_, lr_,
momentum_, momentum_,
dX->template mutable_data<float, Context>(), this->weight_decay_,
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(), X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx()); ctx());
} }
......
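LARS rescales each tensor's gradient by a trust ratio before the usual momentum update, so that the step size tracks the weight norm of that tensor: trust = trust_coef * ||x|| / ||dx|| when both norms are positive. A compact sketch of the ratio (a reading of the code above; the no-scaling fallback is assumed):

#include <cmath>

// Per-tensor LARS trust ratio; 1.0 means "leave the gradient unscaled".
float TrustRatio(float trust_coef, float x_norm, float dx_norm) {
  if (trust_coef > 0.f && x_norm > 0.f && dx_norm > 0.f) {
    return trust_coef * x_norm / dx_norm;
  }
  return 1.f;
}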
...@@ -37,17 +37,14 @@ class UpdateOpBase : public Operator<Context> { ...@@ -37,17 +37,14 @@ class UpdateOpBase : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> template <typename T>
void TransformGrad(Tensor* dX, Tensor* X); void TransformGrad(Tensor* dX);
virtual void ComputeUpdate(Tensor* dX, Tensor* X) = 0; virtual void ApplyUpdate(Tensor* dX, Tensor* X, Tensor* Y) = 0;
template <typename T>
void ApplyUpdate(Tensor* dX, Tensor* X);
template <typename T> template <typename T>
T GetHyper(const string& key); T GetHyper(const string& key);
Tensor* Slot(const string& key); Tensor* GetState(const string& key);
protected: protected:
int weight_index_; int weight_index_;
...@@ -55,9 +52,26 @@ class UpdateOpBase : public Operator<Context> { ...@@ -55,9 +52,26 @@ class UpdateOpBase : public Operator<Context> {
float clip_norm_, clip_value_; float clip_norm_, clip_value_;
}; };
#define USE_UPDATE_FUNCTIONS \ #define USE_UPDATE_FUNCTIONS \
using UpdateOpBase<Context>::GetHyper; \ using UpdateOpBase<Context>::GetHyper; \
using UpdateOpBase<Context>::Slot using UpdateOpBase<Context>::GetState; \
void ApplyUpdate(Tensor* dX, Tensor* X, Tensor* Y) override { \
if (dX->template IsType<float>()) { \
if (Y == nullptr) { \
DoRunWithType<float, float>(dX, X, Y); \
} else if (Y->template IsType<float16>()) { \
DoRunWithType<float, float16>(dX, X, Y); \
} else { \
LOG(FATAL) << MessageForUnsupported( \
dtypes::to_string(Y->meta()), {"float16", "float32"}); \
} \
} else if (dX->template IsType<double>()) { \
DoRunWithType<double, double>(dX, X, Y); \
} else { \
LOG(FATAL) << MessageForUnsupported( \
dtypes::to_string(dX->meta()), {"float32", "float64"}); \
} \
}
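This macro now centralizes the dtype dispatch for `ApplyUpdate`: `T` is the compute type and `CopyT` the type of the optional low-precision weight copy `Y` that the fused kernels write back. A Python sketch of the same routing (the `do_run_with_type` stand-in is illustrative, not part of the C++ API):

```python
def do_run_with_type(T, CopyT, dX, X, Y):
    # Illustrative stand-in for the templated DoRunWithType kernels.
    print('DoRunWithType<%s, %s>' % (T, CopyT))

def apply_update(dX, X, Y=None):
    # Mirror of USE_UPDATE_FUNCTIONS: pick <T, CopyT> from the dtypes.
    if dX.dtype == 'float32':
        if Y is None:
            do_run_with_type('float32', 'float32', dX, X, Y)
        elif Y.dtype == 'float16':
            do_run_with_type('float32', 'float16', dX, X, Y)
        else:
            raise TypeError('Y must be float16 or absent.')
    elif dX.dtype == 'float64':
        do_run_with_type('float64', 'float64', dX, X, Y)
    else:
        raise TypeError('Grads must be float32 or float64.')
```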
template <class Context> template <class Context>
class MomentumSGDOp final : public UpdateOpBase<Context> { class MomentumSGDOp final : public UpdateOpBase<Context> {
...@@ -73,7 +87,8 @@ class MomentumSGDOp final : public UpdateOpBase<Context> { ...@@ -73,7 +87,8 @@ class MomentumSGDOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments(); UpdateOpBase<Context>::GetArguments();
} }
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override; template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected: protected:
float lr_, momentum_; float lr_, momentum_;
...@@ -93,7 +108,8 @@ class NesterovSGDOp final : public UpdateOpBase<Context> { ...@@ -93,7 +108,8 @@ class NesterovSGDOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments(); UpdateOpBase<Context>::GetArguments();
} }
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override; template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected: protected:
float lr_, momentum_; float lr_, momentum_;
...@@ -110,15 +126,16 @@ class RMSpropOp final : public UpdateOpBase<Context> { ...@@ -110,15 +126,16 @@ class RMSpropOp final : public UpdateOpBase<Context> {
void GetArguments() override { void GetArguments() override {
lr_ = this->template GetHyper<float>("lr"); lr_ = this->template GetHyper<float>("lr");
momentum_ = this->template GetHyper<float>("momentum"); momentum_ = this->template GetHyper<float>("momentum");
decay_ = this->template GetHyper<float>("decay"); alpha_ = this->template GetHyper<float>("alpha");
eps_ = this->template GetHyper<float>("eps"); eps_ = this->template GetHyper<float>("eps");
UpdateOpBase<Context>::GetArguments(); UpdateOpBase<Context>::GetArguments();
} }
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override; template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected: protected:
float lr_, momentum_, decay_, eps_; float lr_, momentum_, alpha_, eps_;
}; };
template <class Context> template <class Context>
...@@ -139,7 +156,8 @@ class AdamOp : public UpdateOpBase<Context> { ...@@ -139,7 +156,8 @@ class AdamOp : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments(); UpdateOpBase<Context>::GetArguments();
} }
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override; template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected: protected:
int64_t t_; int64_t t_;
...@@ -163,16 +181,15 @@ class AdamWOp final : public UpdateOpBase<Context> { ...@@ -163,16 +181,15 @@ class AdamWOp final : public UpdateOpBase<Context> {
t_++; t_++;
correction_ = sqrt(1.f - pow(beta2_, t_)) / (1.f - pow(beta1_, t_)); correction_ = sqrt(1.f - pow(beta2_, t_)) / (1.f - pow(beta1_, t_));
UpdateOpBase<Context>::GetArguments(); UpdateOpBase<Context>::GetArguments();
lambda_ = this->weight_decay_;
this->weight_decay_ = 0.f;
} }
void ComputeUpdate(Tensor* dX, Tensor* X) override; template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected: protected:
int64_t t_; int64_t t_;
float lr_, beta1_, beta2_; float lr_, beta1_, beta2_;
float eps_, correction_, lambda_; float eps_, correction_;
}; };
template <class Context> template <class Context>
...@@ -190,14 +207,13 @@ class LARSOp final : public UpdateOpBase<Context> { ...@@ -190,14 +207,13 @@ class LARSOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments(); UpdateOpBase<Context>::GetArguments();
} }
void ComputeUpdate(Tensor* dX, Tensor* X) override; template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected: protected:
float lr_, momentum_, trust_coef_; float lr_, momentum_, trust_coef_;
}; };
#undef USE_UPDATE_FUNCTIONS
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_UPDATE_OP_H_ #endif // DRAGON_OPERATORS_TRAINING_UPDATE_OP_H_
...@@ -13,67 +13,40 @@ T UpdateOpBase<Context>::GetHyper(const string& key) { ...@@ -13,67 +13,40 @@ T UpdateOpBase<Context>::GetHyper(const string& key) {
} }
template <class Context> template <class Context>
Tensor* UpdateOpBase<Context>::Slot(const string& key) { Tensor* UpdateOpBase<Context>::GetState(const string& key) {
const string& weight_name = Output(weight_index_)->name(); const string& weight_name = Output(weight_index_)->name();
return workspace()->CreateTensor(name() + "/" + weight_name + "/" + key); return workspace()->CreateTensor(name() + "/" + weight_name + "/" + key);
} }
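`Slot` becomes `GetState`, and the per-weight naming scheme is kept: op name, then weight name, then the state key, so every weight owns persistent buffers in the workspace. For an op named `"MomentumSGD"` updating `"conv1/W"` (names here are illustrative), the momentum buffer would live at:

```python
name, weight_name, key = 'MomentumSGD', 'conv1/W', 'm'
print(name + '/' + weight_name + '/' + key)  # MomentumSGD/conv1/W/m
```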
template <class Context> template <class Context>
template <typename T> template <typename T>
void UpdateOpBase<Context>::TransformGrad(Tensor* dX, Tensor* X) { void UpdateOpBase<Context>::TransformGrad(Tensor* dX) {
// Scale.
if (grad_scale_ != 1.f) { if (grad_scale_ != 1.f) {
auto* dx = dX->template mutable_data<T, Context>(); auto* dx = dX->template mutable_data<T, Context>();
math::Scale(dX->count(), grad_scale_, dx, dx, ctx()); math::Scale(dX->count(), grad_scale_, dx, dx, ctx());
} }
// Clip.
if (clip_norm_ > 0.f) { if (clip_norm_ > 0.f) {
auto* dx = dX->template mutable_data<T, Context>(); auto* dx = dX->template mutable_data<T, Context>();
float grad_norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx())); float norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (grad_norm > clip_norm_) { if (norm > clip_norm_) {
math::Scale(dX->count(), clip_norm_ / grad_norm, dx, dx, ctx()); math::Scale(dX->count(), clip_norm_ / norm, dx, dx, ctx());
} }
} else if (clip_value_ > 0.f) { } else if (clip_value_ > 0.f) {
auto* dx = dX->template mutable_data<T, Context>(); auto* dx = dX->template mutable_data<T, Context>();
kernels::Clip(dX->count(), -clip_value_, clip_value_, dx, dx, ctx()); kernels::Clip(dX->count(), -clip_value_, clip_value_, dx, dx, ctx());
} }
// Penalty.
if (weight_decay_ > 0.f) {
math::Axpy(
X->count(),
weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
}
template <class Context>
template <typename T>
void UpdateOpBase<Context>::ApplyUpdate(Tensor* dX, Tensor* X) {
math::Sub(
X->count(),
X->template data<T, Context>(),
dX->template data<T, Context>(),
X->template mutable_data<T, Context>(),
ctx());
} }
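After this change `TransformGrad` only scales and clips: the weight-decay `Axpy` and the separate `Sub`-based `ApplyUpdate` are deleted because decay and the weight update are now fused into the kernels. A NumPy sketch of what remains, assuming the same ordering (scale, then clip-by-norm, else clip-by-value):

```python
import numpy as np

def transform_grad(dx, grad_scale=1.0, clip_norm=0.0, clip_value=0.0):
    # Sketch of UpdateOpBase::TransformGrad after the refactor.
    if grad_scale != 1.0:
        dx = dx * grad_scale
    if clip_norm > 0.0:
        norm = np.sqrt(np.dot(dx.ravel(), dx.ravel()))
        if norm > clip_norm:
            dx = dx * (clip_norm / norm)
    elif clip_value > 0.0:
        dx = np.clip(dx, -clip_value, clip_value)
    return dx
```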
template <class Context> template <class Context>
void UpdateOpBase<Context>::RunOnDevice() { void UpdateOpBase<Context>::RunOnDevice() {
GetArguments(); GetArguments();
for (int i = 0; i < InputSize(); ++i) { for (weight_index_ = 0; weight_index_ < InputSize(); ++weight_index_) {
weight_index_ = i; auto &dX = Input(weight_index_), *X = Output(weight_index_);
auto &dX = Input(i), *X = Output(i); if (dX.count() == 0 || X->count() == 0) continue;
if (dX.count() == 0 || X->count() == 0) return;
CHECK(dX.dims() == X->dims()) CHECK(dX.dims() == X->dims())
<< "\nWeight and grad should have the same dimensions." << "\nWeight and grad should have the same dimensions."
<< "\nGot" << X->DimString() << " and " << dX.DimString(); << "\nGot" << X->DimString() << " and " << dX.DimString();
if (dX.template IsType<float>()) { if (dX.template IsType<float16>()) {
TransformGrad<float>(&dX, X);
ComputeUpdate(&dX, X);
ApplyUpdate<float>(&dX, X);
} else if (dX.template IsType<float16>()) {
auto* X_master = workspace()->CreateTensor(X->name() + "_master"); auto* X_master = workspace()->CreateTensor(X->name() + "_master");
auto* X_grad = ctx()->workspace()->CreateTensor("BufferShared"); auto* X_grad = ctx()->workspace()->CreateTensor("BufferShared");
if (X_master->count() != X->count()) { if (X_master->count() != X->count()) {
...@@ -88,17 +61,17 @@ void UpdateOpBase<Context>::RunOnDevice() { ...@@ -88,17 +61,17 @@ void UpdateOpBase<Context>::RunOnDevice() {
dX.template data<float16, Context>(), dX.template data<float16, Context>(),
X_grad->ReshapeLike(dX)->template mutable_data<float, Context>(), X_grad->ReshapeLike(dX)->template mutable_data<float, Context>(),
ctx()); ctx());
TransformGrad<float>(X_grad, X_master); TransformGrad<float>(X_grad);
ComputeUpdate(X_grad, X_master); ApplyUpdate(X_grad, X_master, X);
ApplyUpdate<float>(X_grad, X_master); } else if (dX.template IsType<float>()) {
math::Cast( TransformGrad<float>(&dX);
X->count(), ApplyUpdate(&dX, X, nullptr);
X_master->template data<float, Context>(), } else if (dX.template IsType<double>()) {
X->template mutable_data<float16, Context>(), TransformGrad<double>(&dX);
ctx()); ApplyUpdate(&dX, X, nullptr);
} else { } else {
LOG(FATAL) << MessageForUnsupported( LOG(FATAL) << MessageForUnsupported(
dtypes::to_string(dX.meta()), {"float16", "float32"}); dtypes::to_string(dX.meta()), {"float16", "float32", "float64"});
} }
} }
} }
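The float16 branch keeps a float32 master copy of every half-precision weight plus a shared float32 buffer for the cast gradient; the fused kernel then updates the master weights and refreshes the float16 copy in the same pass. A sketch of the per-weight flow, reusing the `transform_grad`/`apply_update` stand-ins sketched above, with `state` as a dict standing in for the workspace:

```python
import numpy as np

def run_update_step(name, dX, X, state):
    # Sketch of UpdateOpBase::RunOnDevice for one (grad, weight) pair.
    if dX.dtype == np.float16:
        master = state.setdefault(name + '_master', X.astype(np.float32))
        grad = transform_grad(dX.astype(np.float32))  # Shared cast buffer.
        apply_update(grad, master, Y=X)  # Kernel also writes the fp16 copy.
    elif dX.dtype in (np.float32, np.float64):
        apply_update(transform_grad(dX), X, Y=None)
    else:
        raise TypeError('unsupported grad dtype: %s' % dX.dtype)
```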
......
...@@ -58,9 +58,6 @@ from dragon.core.ops import tensor_ops as _ ...@@ -58,9 +58,6 @@ from dragon.core.ops import tensor_ops as _
from dragon.core.ops.array_ops import assign from dragon.core.ops.array_ops import assign
from dragon.core.ops.array_ops import boolean_mask from dragon.core.ops.array_ops import boolean_mask
from dragon.core.ops.array_ops import broadcast_to from dragon.core.ops.array_ops import broadcast_to
from dragon.core.ops.array_ops import channel_affine
from dragon.core.ops.array_ops import channel_normalize
from dragon.core.ops.array_ops import channel_shuffle
from dragon.core.ops.array_ops import concat from dragon.core.ops.array_ops import concat
from dragon.core.ops.array_ops import expand_dims from dragon.core.ops.array_ops import expand_dims
from dragon.core.ops.array_ops import flatten from dragon.core.ops.array_ops import flatten
......
...@@ -21,6 +21,7 @@ from dragon.core.device.cuda import current_device ...@@ -21,6 +21,7 @@ from dragon.core.device.cuda import current_device
from dragon.core.device.cuda import get_device_capability from dragon.core.device.cuda import get_device_capability
from dragon.core.device.cuda import is_available from dragon.core.device.cuda import is_available
from dragon.core.device.cuda import memory_allocated from dragon.core.device.cuda import memory_allocated
from dragon.core.device.cuda import set_cublas_flags
from dragon.core.device.cuda import set_cudnn_flags from dragon.core.device.cuda import set_cudnn_flags
from dragon.core.device.cuda import set_default_device from dragon.core.device.cuda import set_default_device
from dragon.core.device.cuda import set_device from dragon.core.device.cuda import set_device
......
...@@ -17,8 +17,10 @@ from dragon.core.ops.activation_ops import sigmoid ...@@ -17,8 +17,10 @@ from dragon.core.ops.activation_ops import sigmoid
from dragon.core.ops.activation_ops import tanh from dragon.core.ops.activation_ops import tanh
from dragon.core.ops.math_ops import abs from dragon.core.ops.math_ops import abs
from dragon.core.ops.math_ops import add from dragon.core.ops.math_ops import add
from dragon.core.ops.math_ops import affine
from dragon.core.ops.math_ops import argmax from dragon.core.ops.math_ops import argmax
from dragon.core.ops.math_ops import argmin from dragon.core.ops.math_ops import argmin
from dragon.core.ops.math_ops import atan2
from dragon.core.ops.math_ops import ceil from dragon.core.ops.math_ops import ceil
from dragon.core.ops.math_ops import clip from dragon.core.ops.math_ops import clip
from dragon.core.ops.math_ops import cos from dragon.core.ops.math_ops import cos
...@@ -60,7 +62,6 @@ from dragon.core.ops.math_ops import sqrt ...@@ -60,7 +62,6 @@ from dragon.core.ops.math_ops import sqrt
from dragon.core.ops.math_ops import square from dragon.core.ops.math_ops import square
from dragon.core.ops.math_ops import sub from dragon.core.ops.math_ops import sub
from dragon.core.ops.math_ops import sum from dragon.core.ops.math_ops import sum
from dragon.core.ops.normalization_ops import lp_normalize
from dragon.core.ops.sort_ops import top_k from dragon.core.ops.sort_ops import top_k
__all__ = [_s for _s in dir() if not _s.startswith('_')] __all__ = [_s for _s in dir() if not _s.startswith('_')]
...@@ -34,12 +34,15 @@ from dragon.core.ops.activation_ops import relu6 ...@@ -34,12 +34,15 @@ from dragon.core.ops.activation_ops import relu6
from dragon.core.ops.activation_ops import selu from dragon.core.ops.activation_ops import selu
from dragon.core.ops.activation_ops import silu from dragon.core.ops.activation_ops import silu
from dragon.core.ops.activation_ops import softmax from dragon.core.ops.activation_ops import softmax
from dragon.core.ops.array_ops import channel_shuffle
from dragon.core.ops.math_ops import moments from dragon.core.ops.math_ops import moments
from dragon.core.ops.normalization_ops import batch_norm from dragon.core.ops.normalization_ops import batch_norm
from dragon.core.ops.normalization_ops import channel_norm
from dragon.core.ops.normalization_ops import group_norm from dragon.core.ops.normalization_ops import group_norm
from dragon.core.ops.normalization_ops import instance_norm from dragon.core.ops.normalization_ops import instance_norm
from dragon.core.ops.normalization_ops import layer_norm from dragon.core.ops.normalization_ops import layer_norm
from dragon.core.ops.normalization_ops import local_response_norm from dragon.core.ops.normalization_ops import local_response_norm
from dragon.core.ops.normalization_ops import lp_norm
from dragon.core.ops.normalization_ops import sync_batch_norm from dragon.core.ops.normalization_ops import sync_batch_norm
from dragon.core.ops.vision_ops import bias_add from dragon.core.ops.vision_ops import bias_add
from dragon.core.ops.vision_ops import conv from dragon.core.ops.vision_ops import conv
......
...@@ -78,16 +78,13 @@ def cast_args(**kwargs): ...@@ -78,16 +78,13 @@ def cast_args(**kwargs):
return {'dtype': kwargs.get('dtype', 'float32')} return {'dtype': kwargs.get('dtype', 'float32')}
@register('ChannelAffine') @register('Affine')
def channel_affine_args(**kwargs): def affine_args(**kwargs):
return { return {'axes': kwargs.get('axes', None)}
'axis': kwargs.get('axis', -1),
'end_axis': kwargs.get('end_axis', kwargs.get('axis', -1)),
}
@register('ChannelNormalize') @register('ChannelNorm')
def channel_normalize_args(**kwargs): def channel_norm_args(**kwargs):
return { return {
'axis': kwargs.get('axis', -1), 'axis': kwargs.get('axis', -1),
'mean': kwargs.get('mean', None), 'mean': kwargs.get('mean', None),
...@@ -323,8 +320,8 @@ def loss_args(**kwargs): ...@@ -323,8 +320,8 @@ def loss_args(**kwargs):
return {'reduction': kwargs.get('reduction', 'MEAN')} return {'reduction': kwargs.get('reduction', 'MEAN')}
@register('LpNormalize') @register('LpNorm')
def lp_normalize_args(**kwargs): def lp_norm_args(**kwargs):
return { return {
'p': kwargs.get('p', 2), 'p': kwargs.get('p', 2),
'axis': kwargs.get('axis', -1), 'axis': kwargs.get('axis', -1),
......
...@@ -81,6 +81,7 @@ def binary_shape_spec(inputs, outputs): ...@@ -81,6 +81,7 @@ def binary_shape_spec(inputs, outputs):
@register([ @register([
'Add', 'Add',
'Atan2',
'BitwiseAnd', 'BitwiseAnd',
'BitwiseOr', 'BitwiseOr',
'BitwiseXor', 'BitwiseXor',
...@@ -403,7 +404,7 @@ def gemm_spec(args, inputs, outputs): ...@@ -403,7 +404,7 @@ def gemm_spec(args, inputs, outputs):
return outputs return outputs
@register('ChannelNormalize') @register('ChannelNorm')
def channel_normalize_spec(args, inputs, outputs): def channel_normalize_spec(args, inputs, outputs):
outputs[0]._dtype = args['dtype'] outputs[0]._dtype = args['dtype']
try: try:
......
...@@ -62,11 +62,23 @@ def current_device(): ...@@ -62,11 +62,23 @@ def current_device():
return backend.cudaGetDevice() return backend.cudaGetDevice()
def set_cublas_flags(allow_tf32=None):
"""Set the flags of cuBLAS library.
Parameters
----------
allow_tf32 : bool, optional
Allow TF32 tensor core operation or not (``None``: leave unchanged).
"""
backend.cublasSetFlags(-1 if allow_tf32 is None else allow_tf32)
def set_cudnn_flags( def set_cudnn_flags(
enabled=True, enabled=None,
benchmark=False, benchmark=None,
deterministic=False, deterministic=None,
allow_tf32=False, allow_tf32=None,
): ):
"""Set the flags of cuDNN library. """Set the flags of cuDNN library.
...@@ -82,7 +94,11 @@ def set_cudnn_flags( ...@@ -82,7 +94,11 @@ def set_cudnn_flags(
Allow TF32 tensor core operation or not. Allow TF32 tensor core operation or not.
""" """
backend.cudnnSetFlags(enabled, benchmark, deterministic, allow_tf32) backend.cudnnSetFlags(
-1 if enabled is None else enabled,
-1 if benchmark is None else benchmark,
-1 if deterministic is None else deterministic,
-1 if allow_tf32 is None else allow_tf32)
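With the tri-state arguments, `None` now means "leave the current flag unchanged" (mapped to `-1` for the backend) instead of forcing a default. Assuming the public aliases under `dragon.cuda` that the import list above suggests, usage looks like:

```python
import dragon

# Enable TF32 tensor cores for cuBLAS GEMMs.
dragon.cuda.set_cublas_flags(allow_tf32=True)

# Flip only the benchmark flag; other cuDNN flags keep their values.
dragon.cuda.set_cudnn_flags(benchmark=True)
```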
def get_device_capability(device_index=None): def get_device_capability(device_index=None):
......
...@@ -122,107 +122,16 @@ def broadcast_to(inputs, shape, **kwargs): ...@@ -122,107 +122,16 @@ def broadcast_to(inputs, shape, **kwargs):
return OpLib.add('Expand', **args) return OpLib.add('Expand', **args)
@OpSchema.num_inputs(2, 3)
def channel_affine(inputs, axis=-1, end_axis=None, **kwargs):
r"""Apply affine transformation to each channel of input.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input, weight and optional bias tensor.
axis : int, optional, default=-1
The first channel axis.
end_axis : int, optional
The last channel axis.
Returns
-------
dragon.Tensor
The output tensor.
"""
outputs = kwargs.pop('outputs', [None])
if context.executing_eagerly():
return OpLib.execute(
'ChannelAffine', inputs, outputs=outputs,
axis=axis, end_axis=end_axis)
return OpLib.add('ChannelAffine', inputs,
axis=axis, end_axis=end_axis, **kwargs)
@OpSchema.num_inputs(1)
@OpSchema.convert_arg('perm')
def channel_normalize(
inputs,
mean,
std,
axis=-1,
dtype='float32',
perm=None,
**kwargs
):
"""Apply normalization to each channel of input.
:attr:`axis` can be negative:
```python
m = s = (1., 1., 1.)
x = dragon.constant([1, 2, 3])
print(dragon.channel_normalize(x, m, s, axis=0)) # [0., 1., 2.]
print(dragon.channel_normalize(x, m, s, axis=-1)) # Equivalent
```
If :attr:`perm` provided, :attr:`axis` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = dragon.constant([[1, 2, 3]])
# Provided 3 values to normalize the last axis
# with length 1, only the first value will be taken
print(dragon.channel_normalize(x, m, s, perm=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
inputs : dragon.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
axis : int, optional, default=-1
The channel axis.
dtype : str, optional, default='float32'
The output data type.
perm : Sequence[Union[int, dragon.Tensor]], optional
The output permutation.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = OpSchema.parse_args(locals())
if context.executing_eagerly():
return OpLib.execute(
'ChannelNormalize', inputs,
axis=axis, mean=mean, std=std, dtype=dtype,
ndim=len(args['perm']) if perm is not None else 0,
perm=args['perm'])
return OpLib.add('ChannelNormalize', **args)
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
def channel_shuffle(inputs, axis=-1, group=1, **kwargs): def channel_shuffle(inputs, axis=-1, group=1, **kwargs):
"""Apply group shuffle to each channel of input. """Apply the group shuffle to each channel of input.
`[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_. `[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_.
Examples: Examples:
```python ```python
x = dragon.constant([1, 2, 3, 4]) x = dragon.constant([1, 2, 3, 4])
print(dragon.channel_shuffle(x, group=2)) # [1, 3, 2, 4] print(dragon.nn.channel_shuffle(x, group=2)) # [1, 3, 2, 4]
``` ```
Parameters Parameters
......
...@@ -82,6 +82,30 @@ def add(inputs, **kwargs): ...@@ -82,6 +82,30 @@ def add(inputs, **kwargs):
return OpLib.add('Add', inputs, **kwargs) return OpLib.add('Add', inputs, **kwargs)
@OpSchema.num_inputs(2, 3)
def affine(inputs, axis=-1, **kwargs):
"""Apply affine transformation to input.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input, scale and bias tensor.
axis : Union[int, Sequence[int]], optional, default=-1
The axis to apply.
Returns
-------
dragon.Tensor
The output tensor.
"""
axes = nest.flatten(axis)
outputs = kwargs.pop('outputs', [None])
if context.executing_eagerly():
return OpLib.execute('Affine', inputs, outputs=outputs, axes=axes)
return OpLib.add('Affine', inputs, axes=axes, **kwargs)
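Unlike the removed `channel_affine`, which took a contiguous `[axis, end_axis]` range, `affine` accepts an explicit axis or list of axes. A usage sketch (shapes are illustrative):

```python
import dragon

x = dragon.ones((2, 3, 4))
w = dragon.ones((3,))  # Scale for axis 1.
b = dragon.ones((3,))  # Optional bias, broadcast the same way.
y = dragon.math.affine([x, w, b], axis=1)
```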
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
def argmax(inputs, axis=0, keepdims=False, **kwargs): def argmax(inputs, axis=0, keepdims=False, **kwargs):
"""Compute the index of maximum elements along the given axis. """Compute the index of maximum elements along the given axis.
...@@ -149,6 +173,37 @@ def argmin(inputs, axis=0, keepdims=False, **kwargs): ...@@ -149,6 +173,37 @@ def argmin(inputs, axis=0, keepdims=False, **kwargs):
@OpSchema.num_inputs(2) @OpSchema.num_inputs(2)
def atan2(inputs, **kwargs):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input1}}{\text{input2}})
Examples:
```python
y = dragon.constant(1)
x = dragon.constant(2)
print(dragon.math.atan2([y, x])) # 0.46364761
```
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input1 and input2 tensor.
Returns
-------
dragon.Tensor
The output tensor.
"""
inputs = constant_ops.remove_scalars(inputs)
if context.executing_eagerly():
return OpLib.execute('Atan2', inputs)
return OpLib.add('Atan2', inputs, **kwargs)
@OpSchema.num_inputs(2)
def bitwise_and(inputs, **kwargs): def bitwise_and(inputs, **kwargs):
r"""Compute the element-wise AND bitwise operation. r"""Compute the element-wise AND bitwise operation.
......
...@@ -72,6 +72,69 @@ def batch_norm( ...@@ -72,6 +72,69 @@ def batch_norm(
return OpLib.add('BatchNorm', **args) return OpLib.add('BatchNorm', **args)
@OpSchema.num_inputs(1)
@OpSchema.convert_arg('perm')
def channel_norm(
inputs,
mean,
std,
axis=-1,
dtype='float32',
perm=None,
**kwargs
):
"""Apply the normalization to each channel of input.
:attr:`axis` can be negative:
```python
m = s = (1., 1., 1.)
x = dragon.constant([1, 2, 3])
print(dragon.nn.channel_norm(x, m, s, axis=0)) # [0., 1., 2.]
print(dragon.nn.channel_norm(x, m, s, axis=-1)) # Equivalent
```
If :attr:`perm` is provided, :attr:`axis` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = dragon.constant([[1, 2, 3]])
# 3 values are provided to normalize the last axis,
# but its length is 1, so only the first value is taken
print(dragon.nn.channel_norm(x, m, s, perm=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
inputs : dragon.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
axis : int, optional, default=-1
The channel axis.
dtype : str, optional, default='float32'
The output data type.
perm : Sequence[Union[int, dragon.Tensor]], optional
The output permutation.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = OpSchema.parse_args(locals())
if context.executing_eagerly():
return OpLib.execute(
'ChannelNorm', inputs,
axis=axis, mean=mean, std=std, dtype=dtype,
ndim=len(args['perm']) if perm is not None else 0,
perm=args['perm'])
return OpLib.add('ChannelNorm', **args)
@OpSchema.num_inputs(3) @OpSchema.num_inputs(3)
def group_norm(inputs, axis=-1, group=0, epsilon=1e-5, **kwargs): def group_norm(inputs, axis=-1, group=0, epsilon=1e-5, **kwargs):
r"""Apply the group normalization. r"""Apply the group normalization.
...@@ -180,7 +243,7 @@ def layer_norm(inputs, axis=-1, epsilon=1e-5, **kwargs): ...@@ -180,7 +243,7 @@ def layer_norm(inputs, axis=-1, epsilon=1e-5, **kwargs):
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
def lp_normalize( def lp_norm(
inputs, inputs,
axis=-1, axis=-1,
end_axis=None, end_axis=None,
...@@ -200,15 +263,15 @@ def lp_normalize( ...@@ -200,15 +263,15 @@ def lp_normalize(
```python ```python
x = dragon.constant([[1, 2, 3], [4, 5, 6]], 'float32') x = dragon.constant([[1, 2, 3], [4, 5, 6]], 'float32')
# A negative axis is the last-k axis # A negative axis is the last-k axis
print(dragon.math.lp_normalize(x, 1)) print(dragon.nn.lp_norm(x, 1))
print(dragon.math.lp_normalize(x, -1)) # Equivalent print(dragon.nn.lp_norm(x, -1)) # Equivalent
``` ```
More than one axis could be specified to reduce: More than one axis could be specified to reduce:
```python ```python
# Along the continuous axes: [axis, end_axis] # Along the continuous axes: [axis, end_axis]
print(dragon.math.lp_normalize(x, axis=0, end_axis=1)) print(dragon.nn.lp_norm(x, axis=0, end_axis=1))
``` ```
Parameters Parameters
...@@ -236,9 +299,9 @@ def lp_normalize( ...@@ -236,9 +299,9 @@ def lp_normalize(
reduction = reduction.upper() reduction = reduction.upper()
if context.executing_eagerly(): if context.executing_eagerly():
return OpLib.execute( return OpLib.execute(
'LpNormalize', inputs, p=p, axis=axis, end_axis=end_axis, 'LpNorm', inputs, p=p, axis=axis, end_axis=end_axis,
epsilon=epsilon, reduction=reduction) epsilon=epsilon, reduction=reduction)
return OpLib.add('LpNormalize', inputs, p=p, axis=axis, end_axis=end_axis, return OpLib.add('LpNorm', inputs, p=p, axis=axis, end_axis=end_axis,
epsilon=epsilon, reduction=reduction, **kwargs) epsilon=epsilon, reduction=reduction, **kwargs)
......
...@@ -24,9 +24,11 @@ class Adam(optimizer.Optimizer): ...@@ -24,9 +24,11 @@ class Adam(optimizer.Optimizer):
The **Adam** update is defined as: The **Adam** update is defined as:
.. math:: .. math::
\text{Adam}(g) = \frac{\text{lr} * m_{t}}{\sqrt{v_{t}} + \epsilon} \\ \text{Adam}(g) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad \quad \\ \text{where}\quad
\begin{cases} \begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\ m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases} \end{cases}
...@@ -62,12 +64,13 @@ class AdamW(Adam): ...@@ -62,12 +64,13 @@ class AdamW(Adam):
The **AdamW** update is defined as: The **AdamW** update is defined as:
.. math:: .. math::
\text{AdamW}(g, p) = \text{lr} * (\frac{m_{t}}{\sqrt{v_{t}} + \epsilon} \text{AdamW}(g, p) = \text{lr} * (\frac{\text{correction} * m_{t}}
+ \lambda p) \\ {\sqrt{v_{t}} + \epsilon} + \lambda p) \\
\quad \\ \text{where}\quad \quad \\ \text{where}\quad
\begin{cases} \begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\ m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} \\
\end{cases} \end{cases}
""" """
......
...@@ -27,13 +27,13 @@ class RMSprop(optimizer.Optimizer): ...@@ -27,13 +27,13 @@ class RMSprop(optimizer.Optimizer):
\text{RMSprop}(g) = \text{lr} * m_{t} \\ \text{RMSprop}(g) = \text{lr} * m_{t} \\
\quad \\ \text{where} \quad \quad \\ \text{where} \quad
\begin{cases} \begin{cases}
v_{t} = \text{decay} * v_{t-1} + (1 - \text{decay}) * g^{2} \\ v_{t} = \alpha * v_{t-1} + (1 - \alpha) * g^{2} \\
m_{t} = \text{momentum} * m_{t-1} + \frac{g}{\sqrt{v_{t}} + \epsilon} m_{t} = \text{momentum} * m_{t-1} + \frac{g}{\sqrt{v_{t}} + \epsilon}
\end{cases} \end{cases}
""" """
def __init__(self, lr=0.01, momentum=0, decay=0.9, eps=1e-8, **kwargs): def __init__(self, lr=0.01, momentum=0, alpha=0.9, eps=1e-8, **kwargs):
r"""Create a ``RMSProp`` optimizer. r"""Create a ``RMSProp`` optimizer.
Parameters Parameters
...@@ -42,8 +42,8 @@ class RMSprop(optimizer.Optimizer): ...@@ -42,8 +42,8 @@ class RMSprop(optimizer.Optimizer):
The initial value to :math:`\text{lr}`. The initial value to :math:`\text{lr}`.
momentum : float, optional, default=0 momentum : float, optional, default=0
The initial value to :math:`\text{momentum}`. The initial value to :math:`\text{momentum}`.
decay : float, optional, default=0.9 alpha : float, optional, default=0.9
The initial value to :math:`\text{decay}`. The initial value to :math:`\alpha`.
eps : float, optional, default=1e-8 eps : float, optional, default=1e-8
The initial value to :math:`\epsilon`. The initial value to :math:`\epsilon`.
...@@ -51,5 +51,5 @@ class RMSprop(optimizer.Optimizer): ...@@ -51,5 +51,5 @@ class RMSprop(optimizer.Optimizer):
super(RMSprop, self).__init__(**kwargs) super(RMSprop, self).__init__(**kwargs)
self._set_hyper('lr', lr) self._set_hyper('lr', lr)
self._set_hyper('momentum', momentum) self._set_hyper('momentum', momentum)
self._set_hyper('decay', decay) self._set_hyper('alpha', alpha)
self._set_hyper('eps', eps) self._set_hyper('eps', eps)
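The hyper-parameter is renamed from `decay` to `alpha`, matching the running-average coefficient in the formula, so callers passing `decay=` must migrate. Assuming the usual public namespace:

```python
import dragon

opt = dragon.optimizers.RMSprop(lr=0.01, momentum=0.9, alpha=0.99, eps=1e-8)
```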
...@@ -51,41 +51,6 @@ def cast_exporter(op_def, context): ...@@ -51,41 +51,6 @@ def cast_exporter(op_def, context):
return node, const_tensors return node, const_tensors
@export_util.register('ChannelAffine')
def channel_affine_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelAffine')
for arg in op_def.arg:
if arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'end_axis':
helper.add_attribute(node, 'end_axis', arg.i)
return node, const_tensors
@export_util.register('ChannelNormalize')
def channel_normalize_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelNormalize')
for arg in op_def.arg:
if arg.name == 'mean':
helper.add_attribute(node, 'mean', arg.floats)
elif arg.name == 'std':
helper.add_attribute(node, 'std', arg.floats)
elif arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'dtype':
helper.add_attribute(node, 'dtype', arg.s)
elif arg.name == 'perm':
helper.add_attribute(node, 'perm', arg.ints)
elif arg.name == 'perm_desc':
values = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'perm', values)
return node, const_tensors
@export_util.register('Concat') @export_util.register('Concat')
def concat_exporter(op_def, context): def concat_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals()) node, const_tensors = export_util.translate(**locals())
......
...@@ -31,6 +31,17 @@ def add_exporter(op_def, context): ...@@ -31,6 +31,17 @@ def add_exporter(op_def, context):
return node, const_tensors return node, const_tensors
@export_util.register('Affine')
def affine_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'Affine')
for arg in op_def.arg:
if arg.name == 'axes':
helper.add_attribute(node, 'axes', arg.ints)
return node, const_tensors
@export_util.register('Div') @export_util.register('Div')
def div_exporter(op_def, context): def div_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals()) node, const_tensors = export_util.translate(**locals())
......
...@@ -32,6 +32,28 @@ def batch_norm_exporter(op_def, context): ...@@ -32,6 +32,28 @@ def batch_norm_exporter(op_def, context):
return node, const_tensors return node, const_tensors
@export_util.register('ChannelNorm')
def channel_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelNorm')
for arg in op_def.arg:
if arg.name == 'mean':
helper.add_attribute(node, 'mean', arg.floats)
elif arg.name == 'std':
helper.add_attribute(node, 'std', arg.floats)
elif arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'dtype':
helper.add_attribute(node, 'dtype', arg.s)
elif arg.name == 'perm':
helper.add_attribute(node, 'perm', arg.ints)
elif arg.name == 'perm_desc':
values = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'perm', values)
return node, const_tensors
@export_util.register('GroupNorm') @export_util.register('GroupNorm')
def group_norm_exporter(op_def, context): def group_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals()) node, const_tensors = export_util.translate(**locals())
...@@ -49,8 +71,8 @@ def group_norm_exporter(op_def, context): ...@@ -49,8 +71,8 @@ def group_norm_exporter(op_def, context):
return node, const_tensors return node, const_tensors
@export_util.register('LpNormalize') @export_util.register('LpNorm')
def lp_normalize_exporter(op_def, context): def lp_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals()) node, const_tensors = export_util.translate(**locals())
node.op_type = 'LpNormalization' node.op_type = 'LpNormalization'
axis, end_axis = None, None axis, end_axis = None, None
......
...@@ -33,9 +33,6 @@ constexpr int CUDA_WARP_SIZE = 32; ...@@ -33,9 +33,6 @@ constexpr int CUDA_WARP_SIZE = 32;
/*! \brief The number of cuda threads in a block */ /*! \brief The number of cuda threads in a block */
constexpr int CUDA_THREADS = 256; constexpr int CUDA_THREADS = 256;
/*! \brief The maximum number of blocks to use in a default kernel call */
constexpr int CUDA_MAX_BLOCKS = 4096;
/*! \brief The maximum number of devices in a single machine */ /*! \brief The maximum number of devices in a single machine */
constexpr int CUDA_MAX_DEVICES = 16; constexpr int CUDA_MAX_DEVICES = 16;
...@@ -82,12 +79,15 @@ constexpr int CUDA_TENSOR_MAX_DIMS = 8; ...@@ -82,12 +79,15 @@ constexpr int CUDA_TENSOR_MAX_DIMS = 8;
for (size_t j = threadIdx.x; j < m; j += blockDim.x) for (size_t j = threadIdx.x; j < m; j += blockDim.x)
inline int CUDA_BLOCKS(const int N) { inline int CUDA_BLOCKS(const int N) {
return std::max( int device, sm_count, threads_per_sm;
std::min((N + CUDA_THREADS - 1) / CUDA_THREADS, CUDA_MAX_BLOCKS), 1); CUDA_CHECK(cudaGetDevice(&device));
} CUDA_CHECK(cudaDeviceGetAttribute(
&sm_count, cudaDevAttrMultiProcessorCount, device));
inline int CUDA_2D_BLOCKS(const int N) { CUDA_CHECK(cudaDeviceGetAttribute(
return std::max(std::min(N, CUDA_MAX_BLOCKS), 1); &threads_per_sm, cudaDevAttrMaxThreadsPerMultiProcessor, device));
const auto num_blocks = (N + CUDA_THREADS - 1) / CUDA_THREADS;
const auto max_blocks = sm_count * threads_per_sm / CUDA_THREADS * 32;
return std::max(1, std::min(num_blocks, max_blocks));
} }
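`CUDA_BLOCKS` now sizes the grid from the actual device instead of the fixed `CUDA_MAX_BLOCKS = 4096`: the cap is 32 waves of the thread blocks the device can keep resident at once. A Python sketch of the arithmetic, with the device attributes passed in as assumptions:

```python
def cuda_blocks(n, threads=256, sm_count=80, threads_per_sm=2048):
    # Cap the grid at 32x the number of resident thread blocks.
    num_blocks = (n + threads - 1) // threads
    max_blocks = sm_count * threads_per_sm // threads * 32
    return max(1, min(num_blocks, max_blocks))

print(cuda_blocks(1 << 20))  # 4096 blocks for 1M elements at 256 threads.
```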
#if CUDA_VERSION_MAX(9, 0) #if CUDA_VERSION_MAX(9, 0)
......
...@@ -84,6 +84,7 @@ DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Sub, T); ...@@ -84,6 +84,7 @@ DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Sub, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Mul, T); DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Mul, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Div, T); DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Div, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Pow, T); DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Pow, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Atan2, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Minimum, T); DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Minimum, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Maximum, T); DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Maximum, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Equal, bool); DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Equal, bool);
...@@ -434,6 +435,7 @@ DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(And, bool, std::logical_and); ...@@ -434,6 +435,7 @@ DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(And, bool, std::logical_and);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Or, bool, std::logical_or); DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Or, bool, std::logical_or);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Xor, bool, math::XorFunctor); DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Xor, bool, math::XorFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Pow, T, math::PowFunctor); DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Pow, T, math::PowFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Atan2, T, math::Atan2Functor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Minimum, T, math::MinFunctor); DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Minimum, T, math::MinFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Maximum, T, math::MaxFunctor); DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Maximum, T, math::MaxFunctor);
#undef DEFINE_ROWWISE_COLWISE_BIANRY_FUNC #undef DEFINE_ROWWISE_COLWISE_BIANRY_FUNC
...@@ -469,6 +471,7 @@ DEFINE_BROADCAST_BINARY_FUNC(And, bool, std::logical_and); ...@@ -469,6 +471,7 @@ DEFINE_BROADCAST_BINARY_FUNC(And, bool, std::logical_and);
DEFINE_BROADCAST_BINARY_FUNC(Or, bool, std::logical_or); DEFINE_BROADCAST_BINARY_FUNC(Or, bool, std::logical_or);
DEFINE_BROADCAST_BINARY_FUNC(Xor, bool, math::XorFunctor); DEFINE_BROADCAST_BINARY_FUNC(Xor, bool, math::XorFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Pow, T, math::PowFunctor); DEFINE_BROADCAST_BINARY_FUNC(Pow, T, math::PowFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Atan2, T, math::Atan2Functor);
DEFINE_BROADCAST_BINARY_FUNC(Minimum, T, math::MinFunctor); DEFINE_BROADCAST_BINARY_FUNC(Minimum, T, math::MinFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Maximum, T, math::MaxFunctor); DEFINE_BROADCAST_BINARY_FUNC(Maximum, T, math::MaxFunctor);
#undef DEFINE_BROADCAST_BINARY_FUNC #undef DEFINE_BROADCAST_BINARY_FUNC
...@@ -612,6 +615,9 @@ DEFINE_BINARY_FUNC(Div, float, float); ...@@ -612,6 +615,9 @@ DEFINE_BINARY_FUNC(Div, float, float);
DEFINE_BINARY_FUNC(Div, double, double); DEFINE_BINARY_FUNC(Div, double, double);
DEFINE_BINARY_FUNC(Pow, float, float); DEFINE_BINARY_FUNC(Pow, float, float);
DEFINE_BINARY_FUNC(Pow, double, double); DEFINE_BINARY_FUNC(Pow, double, double);
DEFINE_BINARY_FUNC(Atan2, float16, float16);
DEFINE_BINARY_FUNC(Atan2, float, float);
DEFINE_BINARY_FUNC(Atan2, double, double);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t); DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t); DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t);
DEFINE_BINARY_FUNC(Minimum, int, int); DEFINE_BINARY_FUNC(Minimum, int, int);
......
...@@ -388,6 +388,9 @@ DEFINE_BINARY_FUNC(Div, double, double, math::DividesFunctor); ...@@ -388,6 +388,9 @@ DEFINE_BINARY_FUNC(Div, double, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, float16, math::PowFunctor); DEFINE_BINARY_FUNC(Pow, float16, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, float, math::PowFunctor); DEFINE_BINARY_FUNC(Pow, float, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, double, math::PowFunctor); DEFINE_BINARY_FUNC(Pow, double, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Atan2, float16, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t, math::MinFunctor); DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t, math::MinFunctor); DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, int, math::MinFunctor); DEFINE_BINARY_FUNC(Minimum, int, int, math::MinFunctor);
......
...@@ -122,6 +122,17 @@ DRAGON_API void Pow( ...@@ -122,6 +122,17 @@ DRAGON_API void Pow(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
DRAGON_API void Atan2(
const int A_ndim,
const int64_t* A_dims,
const int B_ndim,
const int64_t* B_dims,
const T* a,
const T* b,
T* y,
Context* ctx);
template <typename T, class Context>
DRAGON_API void Minimum( DRAGON_API void Minimum(
const int A_ndim, const int A_ndim,
const int64_t* A_dims, const int64_t* A_dims,
......
...@@ -550,6 +550,9 @@ DEFINE_BINARY_FUNC(Maximum, double, double, max); ...@@ -550,6 +550,9 @@ DEFINE_BINARY_FUNC(Maximum, double, double, max);
_SimpleBinaryFunc(N, Functor<InputT>(), a, b, y); \ _SimpleBinaryFunc(N, Functor<InputT>(), a, b, y); \
} }
DEFINE_BINARY_FUNC(Atan2, float16, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(BitwiseAnd, bool, bool, std::bit_and); DEFINE_BINARY_FUNC(BitwiseAnd, bool, bool, std::bit_and);
DEFINE_BINARY_FUNC(BitwiseAnd, uint8_t, uint8_t, std::bit_and); DEFINE_BINARY_FUNC(BitwiseAnd, uint8_t, uint8_t, std::bit_and);
DEFINE_BINARY_FUNC(BitwiseAnd, int8_t, int8_t, std::bit_and); DEFINE_BINARY_FUNC(BitwiseAnd, int8_t, int8_t, std::bit_and);
......
...@@ -342,7 +342,10 @@ _Where(const int N, const T* a, const T* b, const bool* c, T* y) { ...@@ -342,7 +342,10 @@ _Where(const int N, const T* a, const T* b, const bool* c, T* y) {
DRAGON_API void name<InputT, CUDAContext>( \ DRAGON_API void name<InputT, CUDAContext>( \
const int N, const InputT* x, OutputT* y, CUDAContext* ctx) { \ const int N, const InputT* x, OutputT* y, CUDAContext* ctx) { \
_SimpleUnaryFunc<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _SimpleUnaryFunc<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, Functor<InputT>(), x, y); \ N, \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(x), \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
} }
DEFINE_UNARY_FUNC(BitwiseNot, bool, bool, math::BitNotFunctor); DEFINE_UNARY_FUNC(BitwiseNot, bool, bool, math::BitNotFunctor);
...@@ -706,6 +709,87 @@ DEFINE_APPLY_MASK_FUNC(float, float); ...@@ -706,6 +709,87 @@ DEFINE_APPLY_MASK_FUNC(float, float);
DEFINE_APPLY_MASK_FUNC(double, double); DEFINE_APPLY_MASK_FUNC(double, double);
#undef DEFINE_APPLY_MASK_FUNC #undef DEFINE_APPLY_MASK_FUNC
#define DEFINE_BINARY_FUNC(name, T, Functor) \
template <> \
DRAGON_API void name<T, CUDAContext>( \
const int N, const T* a, const T* b, T* y, CUDAContext* ctx) { \
using ScalarT = typename math::ScalarType<T>::type; \
using ScalarT2 = typename math::ScalarType<T>::type2; \
if ((N & 1) == 0 && sizeof(ScalarT) != sizeof(ScalarT2)) { \
_SimpleBinaryFunc<<< \
CUDA_BLOCKS(N >> 1), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N >> 1, \
Functor<ScalarT2>(), \
reinterpret_cast<const ScalarT2*>(a), \
reinterpret_cast<const ScalarT2*>(b), \
reinterpret_cast<ScalarT2*>(y)); \
} else { \
_SimpleBinaryFunc<<< \
CUDA_BLOCKS(N), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N, \
Functor<ScalarT>(), \
reinterpret_cast<const ScalarT*>(a), \
reinterpret_cast<const ScalarT*>(b), \
reinterpret_cast<ScalarT*>(y)); \
} \
}
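This new same-type overload packs even-length float16 arrays into `half2` so each thread handles two elements; odd lengths, or types whose paired scalar is no wider (`sizeof(ScalarT) == sizeof(ScalarT2)`), fall back to the scalar kernel. A NumPy sketch of the pairing decision:

```python
import numpy as np

def binary_func(a, b, op):
    # Sketch of the half2 fast path: two elements per unit of work.
    n = a.size
    if a.dtype == np.float16 and (n & 1) == 0:
        a2 = a.reshape(n >> 1, 2)  # Stand-in for the half2 view.
        b2 = b.reshape(n >> 1, 2)
        return op(a2, b2).reshape(n)
    return op(a, b)  # Scalar kernel path.
```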
DEFINE_BINARY_FUNC(Add, uint8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int64_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float16, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, double, math::PlusFunctor);
DEFINE_BINARY_FUNC(Sub, uint8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int64_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float16, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, double, math::MinusFunctor);
DEFINE_BINARY_FUNC(Mul, uint8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int64_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float16, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, double, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Div, uint8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int64_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float16, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Atan2, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int64_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float16, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, double, math::MinFunctor);
DEFINE_BINARY_FUNC(Maximum, uint8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int64_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float16, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, double, math::MaxFunctor);
#undef DEFINE_BINARY_FUNC
#define DEFINE_BINARY_FUNC(name, InputT, OutputT, Functor) \ #define DEFINE_BINARY_FUNC(name, InputT, OutputT, Functor) \
template <> \ template <> \
DRAGON_API void name<InputT, CUDAContext>( \ DRAGON_API void name<InputT, CUDAContext>( \
...@@ -726,51 +810,6 @@ DEFINE_APPLY_MASK_FUNC(double, double); ...@@ -726,51 +810,6 @@ DEFINE_APPLY_MASK_FUNC(double, double);
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \ reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
} }
DEFINE_BINARY_FUNC(Add, uint8_t, uint8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int8_t, int8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int, int, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int64_t, int64_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float16, float16, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float, float, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, double, double, math::PlusFunctor);
DEFINE_BINARY_FUNC(Sub, uint8_t, uint8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int8_t, int8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int, int, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int64_t, int64_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float16, float16, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float, float, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, double, double, math::MinusFunctor);
DEFINE_BINARY_FUNC(Mul, uint8_t, uint8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int8_t, int8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int, int, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int64_t, int64_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float16, float16, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float, float, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, double, double, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Div, uint8_t, uint8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int8_t, int8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int, int, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int64_t, int64_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float16, float16, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float, float, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, double, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, int, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int64_t, int64_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float16, float16, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float, float, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, double, double, math::MinFunctor);
DEFINE_BINARY_FUNC(Maximum, uint8_t, uint8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int8_t, int8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int, int, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int64_t, int64_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float16, float16, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float, float, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, double, double, math::MaxFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, bool, bool, math::BitAndFunctor); DEFINE_BINARY_FUNC(BitwiseAnd, bool, bool, math::BitAndFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, uint8_t, uint8_t, math::BitAndFunctor); DEFINE_BINARY_FUNC(BitwiseAnd, uint8_t, uint8_t, math::BitAndFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, int8_t, int8_t, math::BitAndFunctor); DEFINE_BINARY_FUNC(BitwiseAnd, int8_t, int8_t, math::BitAndFunctor);
......
...@@ -126,6 +126,9 @@ template <typename T, class Context> ...@@ -126,6 +126,9 @@ template <typename T, class Context>
DRAGON_API void Pow(const int N, const T* a, const T* b, T* y, Context* ctx); DRAGON_API void Pow(const int N, const T* a, const T* b, T* y, Context* ctx);
template <typename T, class Context> template <typename T, class Context>
DRAGON_API void Atan2(const int N, const T* a, const T* b, T* y, Context* ctx);
template <typename T, class Context>
DRAGON_API void DRAGON_API void
Minimum(const int N, const T* a, const T* b, T* y, Context* ctx); Minimum(const int N, const T* a, const T* b, T* y, Context* ctx);
......
...@@ -16,24 +16,25 @@ ...@@ -16,24 +16,25 @@
#include "dragon/core/types.h" #include "dragon/core/types.h"
#include "dragon/utils/conversions.h" #include "dragon/utils/conversions.h"
#if defined(__CUDA_ARCH__)
#define HOSTDEVICE_DECL inline __host__ __device__
#else
#define HOSTDEVICE_DECL inline
#endif
namespace dragon { namespace dragon {
namespace math { namespace math {
/* /*
* Arithmetic Functors */ * Arithmetic Functors
*/
template <typename T> template <typename T>
struct IdentityFunctor { struct IdentityFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& x) const {
inline __device__ T operator()(const T& x) const {
return x; return x;
} }
#else
inline T operator()(const T& x) const {
return x;
}
#endif
}; };
template <typename T> template <typename T>
...@@ -76,15 +77,9 @@ struct AbsFunctor<half2> { ...@@ -76,15 +77,9 @@ struct AbsFunctor<half2> {
template <typename T> template <typename T>
struct SqrFunctor { struct SqrFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& x) const {
inline __device__ T operator()(const T& x) const {
return x * x; return x * x;
} }
#else
inline T operator()(const T& x) const {
return x * x;
}
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -115,40 +110,16 @@ struct SqrFunctor<half2> { ...@@ -115,40 +110,16 @@ struct SqrFunctor<half2> {
template <typename T> template <typename T>
struct MaxFunctor { struct MaxFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs; return lhs < rhs ? rhs : lhs;
} }
#endif
}; };
template <> template <>
struct MaxFunctor<float16> { struct MaxFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs))
? rhs
: lhs;
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs))
? rhs
: lhs;
#endif
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const { inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs) ? rhs : lhs; return convert::To<float>(lhs) < convert::To<float>(rhs) ? rhs : lhs;
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -176,40 +147,16 @@ struct MaxFunctor<half2> { ...@@ -176,40 +147,16 @@ struct MaxFunctor<half2> {
template <typename T> template <typename T>
struct MinFunctor { struct MinFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs; return lhs < rhs ? lhs : rhs;
} }
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs;
}
#endif
}; };
template <> template <>
struct MinFunctor<float16> { struct MinFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs))
? lhs
: rhs;
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs))
? lhs
: rhs;
#endif
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const { inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs) ? lhs : rhs; return convert::To<float>(lhs) < convert::To<float>(rhs) ? lhs : rhs;
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -237,39 +184,17 @@ struct MinFunctor<half2> { ...@@ -237,39 +184,17 @@ struct MinFunctor<half2> {
template <typename T> template <typename T>
struct PlusFunctor { struct PlusFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs; return lhs + rhs;
} }
#endif
}; };
template <> template <>
struct PlusFunctor<float16> { struct PlusFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hadd(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) +
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const { inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>( return convert::To<float16>(
convert::To<float>(lhs) + convert::To<float>(rhs)); convert::To<float>(lhs) + convert::To<float>(rhs));
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -300,39 +225,17 @@ struct PlusFunctor<half2> { ...@@ -300,39 +225,17 @@ struct PlusFunctor<half2> {
template <typename T> template <typename T>
struct MinusFunctor { struct MinusFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs; return lhs - rhs;
} }
#endif
}; };
template <> template <>
struct MinusFunctor<float16> { struct MinusFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hsub(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) -
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const { inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>( return convert::To<float16>(
convert::To<float>(lhs) - convert::To<float>(rhs)); convert::To<float>(lhs) - convert::To<float>(rhs));
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -363,39 +266,17 @@ struct MinusFunctor<half2> { ...@@ -363,39 +266,17 @@ struct MinusFunctor<half2> {
template <typename T> template <typename T>
struct MultipliesFunctor { struct MultipliesFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs; return lhs * rhs;
} }
#endif
}; };
template <> template <>
struct MultipliesFunctor<float16> { struct MultipliesFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hmul(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) *
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const { inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>( return convert::To<float16>(
convert::To<float>(lhs) * convert::To<float>(rhs)); convert::To<float>(lhs) * convert::To<float>(rhs));
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -426,39 +307,17 @@ struct MultipliesFunctor<half2> { ...@@ -426,39 +307,17 @@ struct MultipliesFunctor<half2> {
template <typename T> template <typename T>
struct DividesFunctor { struct DividesFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs; return lhs / rhs;
} }
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs;
}
#endif
}; };
template <> template <>
struct DividesFunctor<float16> { struct DividesFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hdiv(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) /
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const { inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>( return convert::To<float16>(
convert::To<float>(lhs) / convert::To<float>(rhs)); convert::To<float>(lhs) / convert::To<float>(rhs));
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -498,20 +357,10 @@ struct PowFunctor { ...@@ -498,20 +357,10 @@ struct PowFunctor {
template <> template <>
struct PowFunctor<float16> { struct PowFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
half ret = __float2half(
pow(__half2float(*reinterpret_cast<const half*>(&lhs)),
__half2float(*reinterpret_cast<const half*>(&rhs))));
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const { inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>( return convert::To<float16>(
std::pow(convert::To<float>(lhs), convert::To<float>(rhs))); std::pow(convert::To<float>(lhs), convert::To<float>(rhs)));
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -532,34 +381,104 @@ struct PowFunctor<half2> { ...@@ -532,34 +381,104 @@ struct PowFunctor<half2> {
}; };
#endif #endif
/*
* Logical Functors
*/
template <typename T> template <typename T>
struct NotFunctor { struct Atan2Functor {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& x) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return !x; return atan2(lhs, rhs);
} }
#else #else
inline bool operator()(const T& x) const { inline T operator()(const T& lhs, const T& rhs) const {
return !x; return std::atan2(lhs, rhs);
} }
#endif #endif
}; };
template <> template <>
struct NotFunctor<float16> { struct Atan2Functor<float16> {
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
std::atan2(convert::To<float>(lhs), convert::To<float>(rhs)));
}
};
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& x) const { template <>
return !__half2float(*reinterpret_cast<const half*>(&x)); struct Atan2Functor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
return __float2half(atan2f(__half2float(lhs), __half2float(rhs)));
}
};
template <>
struct Atan2Functor<half2> {
inline __device__ half2 operator()(const half2& lhs, const half2& rhs) const {
const float2 v1 = __half22float2(lhs);
const float2 v2 = __half22float2(rhs);
return __floats2half2_rn(atan2f(v1.x, v2.x), atan2f(v1.y, v2.y));
} }
};
#endif
template <typename T>
struct FMAFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x, const T& y, const T& z) const {
return fma(x, y, z);
}
#else
inline T operator()(const T& x, const T& y, const T& z) const {
return std::fma(x, y, z);
}
#endif
};
#if defined(__CUDA_ARCH__)
template <>
struct FMAFunctor<half> {
inline __device__ half
operator()(const half& x, const half& y, const half& z) const {
#if __CUDA_ARCH__ >= 530
return __hfma(x, y, z);
#else
return __float2half(
fmaf(__half2float(x), __half2float(y), __half2float(z)));
#endif
}
};
template <>
struct FMAFunctor<half2> {
inline __device__ half2
operator()(const half2& x, const half2& y, const half2& z) const {
#if __CUDA_ARCH__ >= 530
return __hfma2(x, y, z);
#else #else
const float2 v1 = __half22float2(x);
const float2 v2 = __half22float2(y);
const float2 v3 = __half22float2(z);
return __floats2half2_rn(fmaf(v1.x, v2.x, v3.x), fmaf(v1.y, v2.y, v3.y));
#endif
}
};
#endif
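Both new functors follow the existing pattern in this header: a generic body that defers to the standard library on host and to CUDA intrinsics on device, plus `half`/`half2` specializations that fall back to float math below sm_53. A minimal host-side sketch of their behavior (the struct bodies mirror the generic branches above; the `main` harness is illustrative only):

```cpp
#include <cassert>
#include <cmath>

// Host-side behavior of the two new functors (generic branch only);
// the CUDA branches defer to atan2f / __hfma intrinsics as shown above.
template <typename T>
struct Atan2Functor {
  inline T operator()(const T& lhs, const T& rhs) const {
    return std::atan2(lhs, rhs);  // quadrant-aware arc-tangent of lhs / rhs
  }
};

template <typename T>
struct FMAFunctor {
  inline T operator()(const T& x, const T& y, const T& z) const {
    return std::fma(x, y, z);  // x * y + z with a single rounding
  }
};

int main() {
  assert(std::abs(Atan2Functor<float>()(1.f, 2.f) - 0.4636476f) < 1e-6f);
  assert(FMAFunctor<float>()(2.f, 3.f, 1.f) == 7.f);  // exact in binary32
  return 0;
}
```

`FMAFunctor` is what the new `_AffineChannel` CUDA kernels further below use to fuse the scale and bias into one instruction.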
/*
* Logical Functors
*/
template <typename T>
struct NotFunctor {
HOSTDEVICE_DECL bool operator()(const T& x) const {
return !x;
}
};
template <>
struct NotFunctor<float16> {
inline bool operator()(const float16& x) const { inline bool operator()(const float16& x) const {
return !convert::To<float>(x); return !convert::To<float>(x);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -573,30 +492,16 @@ struct NotFunctor<half> { ...@@ -573,30 +492,16 @@ struct NotFunctor<half> {
template <typename T> template <typename T>
struct AndFunctor { struct AndFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs && rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs && rhs; return lhs && rhs;
} }
#endif
}; };
template <> template <>
struct AndFunctor<float16> { struct AndFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return __half2float(*reinterpret_cast<const half*>(&lhs)) &&
__half2float(*reinterpret_cast<const half*>(&rhs));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) && convert::To<float>(rhs); return convert::To<float>(lhs) && convert::To<float>(rhs);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -610,30 +515,16 @@ struct AndFunctor<half> { ...@@ -610,30 +515,16 @@ struct AndFunctor<half> {
template <typename T> template <typename T>
struct OrFunctor { struct OrFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs || rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs || rhs; return lhs || rhs;
} }
#endif
}; };
template <> template <>
struct OrFunctor<float16> { struct OrFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return __half2float(*reinterpret_cast<const half*>(&lhs)) ||
__half2float(*reinterpret_cast<const half*>(&rhs));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) || convert::To<float>(rhs); return convert::To<float>(lhs) || convert::To<float>(rhs);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -647,32 +538,17 @@ struct OrFunctor<half> { ...@@ -647,32 +538,17 @@ struct OrFunctor<half> {
template <typename T> template <typename T>
struct XorFunctor { struct XorFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return convert::To<bool>(lhs) ^ convert::To<bool>(rhs);
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return convert::To<bool>(lhs) ^ convert::To<bool>(rhs); return convert::To<bool>(lhs) ^ convert::To<bool>(rhs);
} }
#endif
}; };
template <> template <>
struct XorFunctor<float16> { struct XorFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return convert::To<bool>(
__half2float(*reinterpret_cast<const half*>(&lhs))) ^
convert::To<bool>(__half2float(*reinterpret_cast<const half*>(&rhs)));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<bool>(convert::To<float>(lhs)) ^ return convert::To<bool>(convert::To<float>(lhs)) ^
convert::To<bool>(convert::To<float>(rhs)); convert::To<bool>(convert::To<float>(rhs));
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -691,54 +567,30 @@ struct XorFunctor<half> { ...@@ -691,54 +567,30 @@ struct XorFunctor<half> {
template <typename T> template <typename T>
struct BitNotFunctor { struct BitNotFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& x) const {
inline __device__ T operator()(const T& x) const {
return ~x; return ~x;
} }
#else
inline T operator()(const T& x) const {
return ~x;
}
#endif
}; };
template <typename T> template <typename T>
struct BitAndFunctor { struct BitAndFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs & rhs; return lhs & rhs;
} }
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs & rhs;
}
#endif
}; };
template <typename T> template <typename T>
struct BitOrFunctor { struct BitOrFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs | rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs | rhs; return lhs | rhs;
} }
#endif
}; };
template <typename T> template <typename T>
struct BitXorFunctor { struct BitXorFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs ^ rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs ^ rhs; return lhs ^ rhs;
} }
#endif
}; };
/* /*
...@@ -747,36 +599,16 @@ struct BitXorFunctor { ...@@ -747,36 +599,16 @@ struct BitXorFunctor {
template <typename T> template <typename T>
struct EqualFunctor { struct EqualFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs; return lhs == rhs;
} }
#endif
}; };
template <> template <>
struct EqualFunctor<float16> { struct EqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __heq(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) ==
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) == convert::To<float>(rhs); return convert::To<float>(lhs) == convert::To<float>(rhs);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -794,36 +626,16 @@ struct EqualFunctor<half> { ...@@ -794,36 +626,16 @@ struct EqualFunctor<half> {
template <typename T> template <typename T>
struct NotEqualFunctor { struct NotEqualFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs; return lhs != rhs;
} }
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs;
}
#endif
}; };
template <> template <>
struct NotEqualFunctor<float16> { struct NotEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hne(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) !=
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) != convert::To<float>(rhs); return convert::To<float>(lhs) != convert::To<float>(rhs);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -841,36 +653,16 @@ struct NotEqualFunctor<half> { ...@@ -841,36 +653,16 @@ struct NotEqualFunctor<half> {
template <typename T> template <typename T>
struct GreaterFunctor { struct GreaterFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs; return lhs > rhs;
} }
#endif
}; };
template <> template <>
struct GreaterFunctor<float16> { struct GreaterFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hgt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) >
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) > convert::To<float>(rhs); return convert::To<float>(lhs) > convert::To<float>(rhs);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -888,36 +680,16 @@ struct GreaterFunctor<half> { ...@@ -888,36 +680,16 @@ struct GreaterFunctor<half> {
template <typename T> template <typename T>
struct LessFunctor { struct LessFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs; return lhs < rhs;
} }
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs;
}
#endif
}; };
template <> template <>
struct LessFunctor<float16> { struct LessFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs); return convert::To<float>(lhs) < convert::To<float>(rhs);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -935,36 +707,16 @@ struct LessFunctor<half> { ...@@ -935,36 +707,16 @@ struct LessFunctor<half> {
template <typename T> template <typename T>
struct GreaterEqualFunctor { struct GreaterEqualFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs; return lhs >= rhs;
} }
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs;
}
#endif
}; };
template <> template <>
struct GreaterEqualFunctor<float16> { struct GreaterEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hge(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) >=
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) >= convert::To<float>(rhs); return convert::To<float>(lhs) >= convert::To<float>(rhs);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -982,36 +734,16 @@ struct GreaterEqualFunctor<half> { ...@@ -982,36 +734,16 @@ struct GreaterEqualFunctor<half> {
template <typename T> template <typename T>
struct LessEqualFunctor { struct LessEqualFunctor {
#if defined(__CUDA_ARCH__) HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs; return lhs <= rhs;
} }
#endif
}; };
template <> template <>
struct LessEqualFunctor<float16> { struct LessEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hle(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const { inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) <= convert::To<float>(rhs); return convert::To<float>(lhs) <= convert::To<float>(rhs);
} }
#endif
}; };
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
...@@ -1031,4 +763,6 @@ struct LessEqualFunctor<half> { ...@@ -1031,4 +763,6 @@ struct LessEqualFunctor<half> {
} // namespace dragon } // namespace dragon
#undef HOSTDEVICE_DECL
#endif // DRAGON_UTILS_MATH_FUNCTIONAL_H_ #endif // DRAGON_UTILS_MATH_FUNCTIONAL_H_
...@@ -108,13 +108,11 @@ void _GenericReduce( ...@@ -108,13 +108,11 @@ void _GenericReduce(
} \ } \
if (math::utils::IsRowwiseReduce( \ if (math::utils::IsRowwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \ num_dims, dims, out_dims.data(), &rows, &cols)) { \
_RowwiseReduce##name(rows, cols, scale, x, y); \ return _RowwiseReduce##name(rows, cols, scale, x, y); \
return; \
} \ } \
if (math::utils::IsColwiseReduce( \ if (math::utils::IsColwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \ num_dims, dims, out_dims.data(), &rows, &cols)) { \
_ColwiseReduce##name(rows, cols, scale, x, y); \ return _ColwiseReduce##name(rows, cols, scale, x, y); \
return; \
} \ } \
vec64_t transpose_axes(num_dims); \ vec64_t transpose_axes(num_dims); \
vec64_t transpose_strides(num_dims); \ vec64_t transpose_strides(num_dims); \
......
#include "dragon/utils/math/transform.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/math/reduce.h"
namespace dragon {
namespace math {
namespace {
template <typename T>
void _AffineChannel(
const int N,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
EigenArrayMap<T> Y(y, C, N);
ConstEigenArrayMap<T> X(x, C, N);
Y = X.colwise() * ConstEigenVectorArrayMap<T>(scale, C);
if (bias != nullptr) {
Y.colwise() += ConstEigenVectorArrayMap<T>(bias, C);
}
}
template <typename T>
void _AffineChannel(
const int N,
const int C,
const int S,
const T* x,
const T* scale,
const T* bias,
T* y) {
const auto CxS = C * S;
for (int i = 0; i < N; ++i) {
EigenArrayMap<T> Y(y + i * CxS, S, C);
ConstEigenArrayMap<T> X(x + i * CxS, S, C);
Y = X.rowwise() * ConstEigenVectorArrayMap<T>(scale, C).transpose();
if (bias != nullptr) {
Y.rowwise() += ConstEigenVectorArrayMap<T>(bias, C).transpose();
}
}
}
template <typename T>
void _AffineImpl(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y) {
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], x, scale, bias, y);
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], dims[2], x, scale, bias, y);
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
}
} // namespace
template <>
void Affine<float16, CPUContext>(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const float16* x,
const float16* scale,
const float16* bias,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_AFFINE_FUNC(T) \
template <> \
void Affine<T, CPUContext>( \
const int num_dims, \
const int64_t* dims, \
const int num_axes, \
const int64_t* axes, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CPUContext* ctx) { \
vec64_t new_dims, new_axes; \
math::utils::CollapseReduceAxes( \
num_dims, dims, num_axes, axes, new_dims, new_axes); \
_AffineImpl( \
new_dims.size(), \
new_dims.data(), \
new_axes.size(), \
new_axes.data(), \
x, \
scale, \
bias, \
y); \
}
DEFINE_AFFINE_FUNC(float);
DEFINE_AFFINE_FUNC(double);
#undef DEFINE_AFFINE_FUNC
} // namespace math
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math/functional.h"
#include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/transform.h"
#include "dragon/utils/math/types.h"
#include "dragon/utils/math/utils.h"
namespace dragon {
namespace math {
namespace {
template <typename T>
__global__ void _AffineChannel(
const int NxC,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
auto op3 = math::FMAFunctor<T>();
auto op2 = math::MultipliesFunctor<T>();
CUDA_1D_KERNEL_LOOP(i, NxC) {
if (bias != nullptr) {
y[i] = op3(x[i], __ldg(scale + i % C), __ldg(bias + i % C));
} else {
y[i] = op2(x[i], __ldg(scale + i % C));
}
}
}
template <typename T>
__global__ void _AffineChannel(
const int NxCxS,
const int C,
const int S,
const T* x,
const T* scale,
const T* bias,
T* y) {
auto op3 = math::FMAFunctor<T>();
auto op2 = math::MultipliesFunctor<T>();
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
const int j = (i / S) % C;
if (bias != nullptr) {
y[i] = op3(x[i], __ldg(scale + j), __ldg(bias + j));
} else {
y[i] = op2(x[i], __ldg(scale + j));
}
}
}
template <typename T>
void _AffineImpl(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y,
CUDAContext* ctx) {
const auto N = math::utils::Prod(num_dims, dims);
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], x, scale, bias, y);
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], dims[2], x, scale, bias, y);
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
}
} // namespace
#define DEFINE_AFFINE_FUNC(T) \
template <> \
void Affine<T, CUDAContext>( \
const int num_dims, \
const int64_t* dims, \
const int num_axes, \
const int64_t* axes, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
vec64_t new_dims, new_axes; \
math::utils::CollapseReduceAxes( \
num_dims, dims, num_axes, axes, new_dims, new_axes); \
_AffineImpl( \
new_dims.size(), \
new_dims.data(), \
new_axes.size(), \
new_axes.data(), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y), \
ctx); \
}
DEFINE_AFFINE_FUNC(float);
DEFINE_AFFINE_FUNC(float16);
DEFINE_AFFINE_FUNC(double);
#undef DEFINE_AFFINE_FUNC
} // namespace math
} // namespace dragon
#endif // USE_CUDA
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_MATH_TRANSFORM_H_
#define DRAGON_UTILS_MATH_TRANSFORM_H_
#include "dragon/core/context.h"
namespace dragon {
namespace math {
template <typename T, class Context>
DRAGON_API void Affine(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y,
Context* ctx);
} // namespace math
} // namespace dragon
#endif // DRAGON_UTILS_MATH_TRANSFORM_H_
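For orientation, a hedged sketch of calling the new `math::Affine` entry point from host code; the shapes, the `ScaleChannels` wrapper, and the default-constructed context are illustrative, not taken from the tests:

```cpp
#include <cstdint>
#include <vector>
#include "dragon/core/context.h"
#include "dragon/utils/math/transform.h"

// Scale each of C = 3 channels of an (N, C, S) = (2, 3, 20) tensor.
void ScaleChannels() {
  std::vector<float> x(2 * 3 * 20, 1.f), y(2 * 3 * 20);
  std::vector<float> scale = {1.f, 2.f, 3.f};
  const int64_t dims[] = {2, 3, 20};
  const int64_t axes[] = {1};  // affine over the channel axis only
  dragon::CPUContext ctx;
  dragon::math::Affine<float, dragon::CPUContext>(
      3, dims, 1, axes, x.data(), scale.data(),
      /* bias = */ nullptr, y.data(), &ctx);
}
```

A null `bias` is legal: both the CPU and CUDA implementations above branch on `bias != nullptr` and skip the addition.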
...@@ -141,8 +141,7 @@ void _TransposeImpl( ...@@ -141,8 +141,7 @@ void _TransposeImpl(
CUDAContext* ctx) { CUDAContext* ctx) {
auto aligned_size = sizeof(T); auto aligned_size = sizeof(T);
if (axes.back() == D - 1) { if (axes.back() == D - 1) {
const auto N = math::utils::Prod(D, dims.data()); aligned_size = utils::GetAlignedSize<T, 16>(dims[D - 1], x, y);
aligned_size = utils::GetAlignedSize<T, 16>(N, x, y);
} }
SimpleArray<int, D> X_dims, X_strides, Y_dims; SimpleArray<int, D> X_dims, X_strides, Y_dims;
for (int i = 0; i < D; ++i) { for (int i = 0; i < D; ++i) {
......
...@@ -27,6 +27,7 @@ template <typename T> ...@@ -27,6 +27,7 @@ template <typename T>
class ScalarType { class ScalarType {
public: public:
typedef T type; typedef T type;
typedef T type2;
}; };
#if defined(__CUDACC__) #if defined(__CUDACC__)
...@@ -34,6 +35,7 @@ template <> ...@@ -34,6 +35,7 @@ template <>
class ScalarType<float16> { class ScalarType<float16> {
public: public:
typedef half type; typedef half type;
typedef half2 type2;
}; };
#endif #endif
......
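The new `type2` alias pairs each scalar with its packed counterpart: the scalar itself by default, `half2` for `float16` under CUDA. This is what lets fp16 kernels process two lanes per iteration. A minimal CUDA sketch of the idea (the `AddHalf2` kernel is illustrative; the packed intrinsic needs sm_53+):

```cpp
#include <cuda_fp16.h>

// Illustrative packed-add kernel: n2 is the number of half2 elements,
// i.e. half the number of fp16 values (callers handle odd tails).
__global__ void AddHalf2(const int n2, const half2* a, const half2* b,
                         half2* c) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n2) {
    c[i] = __hadd2(a[i], b[i]);  // adds both fp16 lanes at once
  }
}
```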
...@@ -16,9 +16,9 @@ ...@@ -16,9 +16,9 @@
#include "dragon/utils/conversions.h" #include "dragon/utils/conversions.h"
#if defined(__CUDACC__) #if defined(__CUDACC__)
#define MATH_UTILS_DECL inline __host__ __device__ #define HOSTDEVICE_DECL inline __host__ __device__
#else #else
#define MATH_UTILS_DECL inline #define HOSTDEVICE_DECL inline
#endif #endif
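Note the consolidated macro branches on `__CUDACC__` (defined whenever NVCC compiles the translation unit) rather than `__CUDA_ARCH__` (defined only during device compilation passes), which is what allows one `__host__ __device__` body to replace the duplicated host/device definitions removed throughout functional.h. A minimal sketch of the pattern, mirroring the headers above:

```cpp
#if defined(__CUDACC__)
#define HOSTDEVICE_DECL inline __host__ __device__
#else
#define HOSTDEVICE_DECL inline
#endif

// A single definition now serves both host and device builds.
template <typename T>
struct PlusFunctor {
  HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
    return lhs + rhs;
  }
};

#undef HOSTDEVICE_DECL  // both headers undefine it to avoid leaking the macro
```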
#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \ #define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \
...@@ -41,28 +41,28 @@ namespace utils { ...@@ -41,28 +41,28 @@ namespace utils {
template < template <
typename T, typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0> typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsInf(const T x) { HOSTDEVICE_DECL T IsInf(const T x) {
return false; return false;
} }
template < template <
typename T, typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0> typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsNaN(const T x) { HOSTDEVICE_DECL T IsNaN(const T x) {
return false; return false;
} }
template < template <
typename T, typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0> typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsFinite(const T x) { HOSTDEVICE_DECL T IsFinite(const T x) {
return true; return true;
} }
template < template <
typename T, typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0> typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsInf(T x) { HOSTDEVICE_DECL bool IsInf(T x) {
#if defined(__CUDACC__) #if defined(__CUDACC__)
return isinf(x); return isinf(x);
#else #else
...@@ -73,7 +73,7 @@ MATH_UTILS_DECL bool IsInf(T x) { ...@@ -73,7 +73,7 @@ MATH_UTILS_DECL bool IsInf(T x) {
template < template <
typename T, typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0> typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsNaN(T x) { HOSTDEVICE_DECL bool IsNaN(T x) {
#if defined(__CUDACC__) #if defined(__CUDACC__)
return isnan(x); return isnan(x);
#else #else
...@@ -84,7 +84,7 @@ MATH_UTILS_DECL bool IsNaN(T x) { ...@@ -84,7 +84,7 @@ MATH_UTILS_DECL bool IsNaN(T x) {
template < template <
typename T, typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0> typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsFinite(T x) { HOSTDEVICE_DECL bool IsFinite(T x) {
#if defined(__CUDACC__) #if defined(__CUDACC__)
return isfinite(x); return isfinite(x);
#else #else
...@@ -106,27 +106,27 @@ inline bool IsFinite(float16 x) { ...@@ -106,27 +106,27 @@ inline bool IsFinite(float16 x) {
} }
template <typename T> template <typename T>
MATH_UTILS_DECL bool IsAGeZeroAndALtB(const T a, const T b) { HOSTDEVICE_DECL bool IsAGeZeroAndALtB(const T a, const T b) {
return static_cast<unsigned int>(a) < static_cast<unsigned int>(b); return static_cast<unsigned int>(a) < static_cast<unsigned int>(b);
} }
template <typename T> template <typename T>
MATH_UTILS_DECL T Sign(const T x) { HOSTDEVICE_DECL T Sign(const T x) {
return x > T(0) ? T(1) : (x < T(0) ? T(-1) : T(0)); return x > T(0) ? T(1) : (x < T(0) ? T(-1) : T(0));
} }
template <typename T> template <typename T>
MATH_UTILS_DECL T Identity(const T x) { HOSTDEVICE_DECL T Identity(const T x) {
return x; return x;
} }
template <typename T> template <typename T>
MATH_UTILS_DECL T Square(const T x) { HOSTDEVICE_DECL T Square(const T x) {
return x * x; return x * x;
} }
template <typename T> template <typename T>
MATH_UTILS_DECL T Cube(const T x) { HOSTDEVICE_DECL T Cube(const T x) {
return x * x * x; return x * x * x;
} }
...@@ -247,4 +247,6 @@ void IncreaseIndexInDims(const int num_dims, const DimT* dims, IndexT* index) { ...@@ -247,4 +247,6 @@ void IncreaseIndexInDims(const int num_dims, const DimT* dims, IndexT* index) {
} // namespace dragon } // namespace dragon
#undef HOSTDEVICE_DECL
#endif // DRAGON_UTILS_MATH_UTILS_H_ #endif // DRAGON_UTILS_MATH_UTILS_H_
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "dragon/utils/math/random.h" #include "dragon/utils/math/random.h"
#include "dragon/utils/math/reduce.h" #include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/sort.h" #include "dragon/utils/math/sort.h"
#include "dragon/utils/math/transform.h"
#include "dragon/utils/math/transpose.h" #include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/types.h" #include "dragon/utils/math/types.h"
#include "dragon/utils/math/utils.h" #include "dragon/utils/math/utils.h"
......
...@@ -284,39 +284,6 @@ void BooleanMaskGrad( ...@@ -284,39 +284,6 @@ void BooleanMaskGrad(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void ChannelAffine(
const int N,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y,
Context* ctx);
template <typename InputT, typename OutputT, class Context>
void ChannelNormalize(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const InputT* x,
const float* mean,
const float* std,
OutputT* y,
Context* ctx);
template <typename T, class Context>
void ChannelShuffle(
const int N,
const int S,
const int C,
const int G,
const T* x,
T* y,
Context* ctx);
template <typename T, class Context>
void ConstPad( void ConstPad(
const int num_dims, const int num_dims,
const int64_t* x_dims, const int64_t* x_dims,
...@@ -813,6 +780,18 @@ void TopK( ...@@ -813,6 +780,18 @@ void TopK(
* NormalizationOp Kernels * NormalizationOp Kernels
*/ */
template <typename InputT, typename OutputT, class Context>
void ChannelNorm(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const InputT* x,
const float* mean,
const float* std,
OutputT* y,
Context* ctx);
template <typename T, typename AccT, class Context> template <typename T, typename AccT, class Context>
void BatchNormExpectation( void BatchNormExpectation(
const int N, const int N,
...@@ -923,7 +902,7 @@ void GroupNormGrad( ...@@ -923,7 +902,7 @@ void GroupNormGrad(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void L1Normalize( void L1Norm(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -934,7 +913,7 @@ void L1Normalize( ...@@ -934,7 +913,7 @@ void L1Normalize(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void L1NormalizeGrad( void L1NormGrad(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -946,7 +925,7 @@ void L1NormalizeGrad( ...@@ -946,7 +925,7 @@ void L1NormalizeGrad(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void L2Normalize( void L2Norm(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -957,7 +936,7 @@ void L2Normalize( ...@@ -957,7 +936,7 @@ void L2Normalize(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void L2NormalizeGrad( void L2NormGrad(
const int N, const int N,
const int S, const int S,
const int C, const int C,
...@@ -1012,19 +991,23 @@ void LSTMCellGrad( ...@@ -1012,19 +991,23 @@ void LSTMCellGrad(
* TrainingOp Kernels * TrainingOp Kernels
*/ */
template <typename T, class Context> template <typename T, typename CopyT, class Context>
void Adam( void Adam(
const int N, const int N,
const float lr, const float lr,
const float beta1, const float beta1,
const float beta2, const float beta2,
const float eps, const float eps,
T* g, const float wd,
const T* x,
const T* g,
T* m, T* m,
T* v, T* v,
T* y,
CopyT* y_copy,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, typename CopyT, class Context>
void AdamW( void AdamW(
const int N, const int N,
const float lr, const float lr,
...@@ -1033,39 +1016,53 @@ void AdamW( ...@@ -1033,39 +1016,53 @@ void AdamW(
const float eps, const float eps,
const float wd, const float wd,
const T* x, const T* x,
T* g, const T* g,
T* m, T* m,
T* v, T* v,
T* y,
CopyT* y_copy,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, typename CopyT, class Context>
void MomentumSGD( void MomentumSGD(
const int N, const int N,
const float lr, const float lr,
const float momentum, const float momentum,
T* g, const float wd,
const T* x,
const T* g,
T* m, T* m,
T* y,
CopyT* y_copy,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, typename CopyT, class Context>
void NesterovSGD( void NesterovSGD(
const int N, const int N,
const float lr, const float lr,
const float momentum, const float momentum,
T* g, const float wd,
const T* x,
const T* g,
T* m, T* m,
T* y,
CopyT* y_copy,
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, typename CopyT, class Context>
void RMSprop( void RMSprop(
const int N, const int N,
const float lr, const float lr,
const float momentum, const float momentum,
const float decay, const float alpha,
const float eps, const float eps,
T* g, const float wd,
const T* x,
const T* g,
T* m, T* m,
T* v, T* v,
T* y,
CopyT* y_copy,
Context* ctx); Context* ctx);
/* /*
......
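Every updater now takes the parameter `x`, a const gradient `g`, an output `y`, and an optional low-precision `y_copy`, with the weight decay `wd` passed directly to the kernel. As a hedged reference for what the fused signature implies — `MomentumSGDRef` is hypothetical, and the placement of `lr` relative to the momentum accumulation is an assumption, not taken from the kernels — MomentumSGD would behave roughly like:

```cpp
#include <cstddef>

// Reference semantics only; the real kernels run per device context.
template <typename T, typename CopyT>
void MomentumSGDRef(const std::size_t N, const float lr,
                    const float momentum, const float wd, const T* x,
                    const T* g, T* m, T* y, CopyT* y_copy) {
  for (std::size_t i = 0; i < N; ++i) {
    const T grad = g[i] + wd * x[i];     // weight decay fused into the grad
    m[i] = momentum * m[i] + lr * grad;  // momentum accumulation
    y[i] = x[i] - m[i];                  // parameter update
    if (y_copy != nullptr) {
      y_copy[i] = static_cast<CopyT>(y[i]);  // fused mixed-precision copy
    }
  }
}
```

Folding `wd` and the `CopyT` cast into the update pass avoids launching separate decay and conversion kernels for every parameter.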
...@@ -18,6 +18,7 @@ from dragon.vm.tensorflow.core.ops.math_ops import add ...@@ -18,6 +18,7 @@ from dragon.vm.tensorflow.core.ops.math_ops import add
from dragon.vm.tensorflow.core.ops.math_ops import add_n from dragon.vm.tensorflow.core.ops.math_ops import add_n
from dragon.vm.tensorflow.core.ops.math_ops import argmax from dragon.vm.tensorflow.core.ops.math_ops import argmax
from dragon.vm.tensorflow.core.ops.math_ops import argmin from dragon.vm.tensorflow.core.ops.math_ops import argmin
from dragon.vm.tensorflow.core.ops.math_ops import atan2
from dragon.vm.tensorflow.core.ops.math_ops import cast from dragon.vm.tensorflow.core.ops.math_ops import cast
from dragon.vm.tensorflow.core.ops.math_ops import ceil from dragon.vm.tensorflow.core.ops.math_ops import ceil
from dragon.vm.tensorflow.core.ops.math_ops import cos from dragon.vm.tensorflow.core.ops.math_ops import cos
......
...@@ -27,9 +27,11 @@ class Adam(optimizer.Optimizer): ...@@ -27,9 +27,11 @@ class Adam(optimizer.Optimizer):
The **Adam** update is defined as: The **Adam** update is defined as:
.. math:: .. math::
\text{Adam}(g) = \text{lr} * \frac{m_{t}}{\sqrt{v_{t}} + \epsilon} \\ \text{Adam}(g) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad \quad \\ \text{where}\quad
\begin{cases} \begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\ m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases} \end{cases}
......
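For readers comparing against the standard formulation: with :math:`\hat{m}_{t} = m_{t} / (1 - \beta_{1}^{t})` and :math:`\hat{v}_{t} = v_{t} / (1 - \beta_{2}^{t})`, the scalar correction reproduces the usual bias-corrected step, since (up to where :math:`\epsilon` is applied)

.. math::
    \frac{\hat{m}_{t}}{\sqrt{\hat{v}_{t}}}
        = \frac{m_{t}}{1 - \beta_{1}^{t}} *
          \frac{\sqrt{1 - \beta_{2}^{t}}}{\sqrt{v_{t}}}
        = \frac{\text{correction} * m_{t}}{\sqrt{v_{t}}}

Computing one scalar correction per step is cheaper than rescaling both moment buffers.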
...@@ -61,8 +61,8 @@ class RMSprop(optimizer.Optimizer): ...@@ -61,8 +61,8 @@ class RMSprop(optimizer.Optimizer):
super(RMSprop, self).__init__(name, **kwargs) super(RMSprop, self).__init__(name, **kwargs)
self._set_hyper('lr', learning_rate) self._set_hyper('lr', learning_rate)
self._set_hyper('momentum', momentum) self._set_hyper('momentum', momentum)
self._set_hyper('decay', rho) self._set_hyper('alpha', rho)
self._set_hyper('eps', epsilon) self._set_hyper('eps', epsilon)
self._hyper_aliases['learning_rate'] = 'lr' self._hyper_aliases['learning_rate'] = 'lr'
self._hyper_aliases['rho'] = 'decay' self._hyper_aliases['rho'] = 'alpha'
self._hyper_aliases['eps'] = 'epsilon' self._hyper_aliases['eps'] = 'epsilon'
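Only the hyper-parameter key changes here ('decay' becomes 'alpha', matching the renamed kernel argument); the recurrence itself, as exercised by `test_rmsprop_update` further below, is

.. math::
    v_{t} = \alpha * v_{t-1} + (1 - \alpha) * g^{2} \\
    m_{t} = \text{momentum} * m_{t-1} + \frac{g}{\sqrt{v_{t}} + \epsilon} \\
    x_{t} = x_{t-1} - \text{lr} * m_{t}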
...@@ -184,6 +184,35 @@ def argmin(input, axis=None, name=None): ...@@ -184,6 +184,35 @@ def argmin(input, axis=None, name=None):
return math_ops.argmin(input, axis=axis, name=name) return math_ops.argmin(input, axis=axis, name=name)
def atan2(y, x, name=None):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input1}}{\text{input2}})
```python
y = tf.constant(1.)
x = tf.constant(2.)
print(tf.math.atan2(y, x)) # 0.46364761
```
Parameters
----------
y : dragon.Tensor
The input1 tensor.
x : dragon.Tensor
The input2 tensor.
name : str, optional
The operation name.
Returns
-------
dragon.Tensor
The output tensor.
"""
return math_ops.atan2([y, x], name=name)
def cast(x, dtype, name=None): def cast(x, dtype, name=None):
"""Cast the data type of input. """Cast the data type of input.
......
...@@ -129,13 +129,8 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None): ...@@ -129,13 +129,8 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None):
The output tensor. The output tensor.
""" """
return normalization_ops.lp_normalize( return normalization_ops.lp_norm(
x, x, p=2, axis=axis, epsilon=epsilon, name=name)
p=2,
axis=axis,
epsilon=epsilon,
name=name,
)
def moments(x, axes=None, keepdims=False, name=None): def moments(x, axes=None, keepdims=False, name=None):
......
...@@ -501,8 +501,8 @@ class TestOpSpecWithTensorDesc(unittest.TestCase): ...@@ -501,8 +501,8 @@ class TestOpSpecWithTensorDesc(unittest.TestCase):
self.assertEqual(dragon.broadcast_to( self.assertEqual(dragon.broadcast_to(
self.sym2, shape=self.shape1).shape, (None,) * len(self.sym2.shape)) self.sym2, shape=self.shape1).shape, (None,) * len(self.sym2.shape))
def test_channel_normalize(self): def test_channel_norm(self):
func = functools.partial(dragon.channel_normalize, func = functools.partial(dragon.nn.channel_norm,
mean=(1., 1., 1.), std=(1., 1., 1.)) mean=(1., 1., 1.), std=(1., 1., 1.))
with dragon.graph_mode(): with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None) self.assertEqual(func(self.sym1).shape, None)
......
...@@ -31,6 +31,9 @@ class TestCUDA(unittest.TestCase): ...@@ -31,6 +31,9 @@ class TestCUDA(unittest.TestCase):
stream.synchronize() stream.synchronize()
dragon.cuda.synchronize() dragon.cuda.synchronize()
def test_cublas(self):
dragon.cuda.set_cublas_flags()
def test_cudnn(self): def test_cudnn(self):
dragon.cuda.set_cudnn_flags() dragon.cuda.set_cudnn_flags()
......
...@@ -572,51 +572,6 @@ class TestArrayOps(OpTestCase): ...@@ -572,51 +572,6 @@ class TestArrayOps(OpTestCase):
with dragon.device('cuda'): with dragon.device('cuda'):
self.test_cast() self.test_cast()
def test_channel_affine(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data1 = arange((2, 3, 4, 5))
data2, data3 = arange((3, 4)), arange((3, 4))
data4 = arange(data1.shape)
grad1 = data4 * np.expand_dims(data2, -1)
grad2 = np.sum(data4 * data1, (0, 3))
grad3 = np.sum(data4, (0, 3))
x, w, b = new_tensor(data1), new_tensor(data2), new_tensor(data3)
with dragon.GradientTape() as tape:
tape.watch([x, w, b])
y = dragon.channel_affine([x, w, b], axis=1, end_axis=2)
dy = new_tensor(data4)
dx, dw, db = tape.gradient(y, [x, w, b], output_gradients=[dy])
self.assertEqual(
[y, dx, dw, db],
[data1 * np.expand_dims(data2, -1) +
np.expand_dims(data3, -1),
grad1, grad2, grad3])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_affine_cuda(self):
with dragon.device('cuda'):
self.test_channel_affine()
def test_channel_normalize(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'perm': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'perm': (0, 2, 1)})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for shape, args, kwargs in entries:
perm = kwargs['perm']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = dragon.ones(shape, dtype='uint8')
y = dragon.channel_normalize(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_normalize_cuda(self):
with dragon.device('cuda'):
self.test_channel_normalize()
def test_channel_shuffle(self): def test_channel_shuffle(self):
entries = [(0, 2), (1, 4)] entries = [(0, 2), (1, 4)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
...@@ -630,7 +585,7 @@ class TestArrayOps(OpTestCase): ...@@ -630,7 +585,7 @@ class TestArrayOps(OpTestCase):
x, dy = new_tensor(data), new_tensor(data) x, dy = new_tensor(data), new_tensor(data)
with dragon.GradientTape() as tape: with dragon.GradientTape() as tape:
tape.watch(x) tape.watch(x)
y = dragon.channel_shuffle(x, axis, group) y = dragon.nn.channel_shuffle(x, axis, group)
dx = tape.gradient(y, [x], output_gradients=[dy])[0] dx = tape.gradient(y, [x], output_gradients=[dy])[0]
self.assertEqual( self.assertEqual(
[y, dx], [data.reshape(shape1).transpose(perm).reshape(data.shape), [y, dx], [data.reshape(shape1).transpose(perm).reshape(data.shape),
...@@ -1676,6 +1631,32 @@ class TestMathOps(OpTestCase): ...@@ -1676,6 +1631,32 @@ class TestMathOps(OpTestCase):
with dragon.device('cuda'): with dragon.device('cuda'):
self.test_add() self.test_add()
def test_affine(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data1 = arange((2, 3, 4, 5))
data2, data3 = arange((3, 4)), arange((3, 4))
data4 = arange(data1.shape)
grad1 = data4 * np.expand_dims(data2, -1)
grad2 = np.sum(data4 * data1, (0, 3))
grad3 = np.sum(data4, (0, 3))
x, w, b = new_tensor(data1), new_tensor(data2), new_tensor(data3)
with dragon.GradientTape() as tape:
tape.watch([x, w, b])
y = dragon.math.affine([x, w, b], axis=(1, 2))
dy = new_tensor(data4)
dx, dw, db = tape.gradient(y, [x, w, b], output_gradients=[dy])
self.assertEqual(
[y, dx, dw, db],
[data1 * np.expand_dims(data2, -1) +
np.expand_dims(data3, -1),
grad1, grad2, grad3])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_affine_cuda(self):
with dragon.device('cuda'):
self.test_affine()
def test_argmax(self): def test_argmax(self):
entries = [(0, True), (0, False), (1, True), (1, False)] entries = [(0, True), (0, False), (1, True), (1, False)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
...@@ -1712,6 +1693,20 @@ class TestMathOps(OpTestCase): ...@@ -1712,6 +1693,20 @@ class TestMathOps(OpTestCase):
with dragon.device('cuda'): with dragon.device('cuda'):
self.test_argmin() self.test_argmin()
def test_atan2(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for a_shape, b_shape in self.binary_test_shapes:
data1, data2 = arange(a_shape), arange(b_shape, 1)
a, b = new_tensor(data1), new_tensor(data2)
y = dragon.math.atan2([a, b])
self.assertEqual(y, np.arctan2(data1, data2))
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_atan2_cuda(self):
with dragon.device('cuda'):
self.test_atan2()
def test_bitwise_and(self): def test_bitwise_and(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution): with execution_context().mode(execution):
...@@ -2738,6 +2733,25 @@ class TestNormalizationOps(OpTestCase): ...@@ -2738,6 +2733,25 @@ class TestNormalizationOps(OpTestCase):
with dragon.device('cuda'), self.cudnn_ws.as_default(): with dragon.device('cuda'), self.cudnn_ws.as_default():
self.test_batch_norm() self.test_batch_norm()
def test_channel_norm(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'perm': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'perm': (0, 2, 1)})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for shape, args, kwargs in entries:
perm = kwargs['perm']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = dragon.ones(shape, dtype='uint8')
y = dragon.nn.channel_norm(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_norm_cuda(self):
with dragon.device('cuda'):
self.test_channel_norm()
def test_group_norm(self): def test_group_norm(self):
eps = 1e-5 eps = 1e-5
entries = [((1, 4), (4,), -1, 2, (2,)), entries = [((1, 4), (4,), -1, 2, (2,)),
...@@ -2904,7 +2918,7 @@ class TestNormalizationOps(OpTestCase): ...@@ -2904,7 +2918,7 @@ class TestNormalizationOps(OpTestCase):
with dragon.device('cuda'), self.cudnn_ws.as_default(): with dragon.device('cuda'), self.cudnn_ws.as_default():
self.test_local_response_norm(test_cudnn=True, prec=1e-2) self.test_local_response_norm(test_cudnn=True, prec=1e-2)
def test_lp_normalize(self): def test_lp_norm(self):
entries = [(0, 1, 1e-12, 'sum'), entries = [(0, 1, 1e-12, 'sum'),
(0, 1, 1e-12, 'mean'), (0, 1, 1e-12, 'mean'),
(0, 2, 1e-12, 'sum'), (0, 2, 1e-12, 'sum'),
...@@ -2921,7 +2935,7 @@ class TestNormalizationOps(OpTestCase): ...@@ -2921,7 +2935,7 @@ class TestNormalizationOps(OpTestCase):
x, dy = new_tensor(data1), new_tensor(data2) x, dy = new_tensor(data1), new_tensor(data2)
with dragon.GradientTape() as tape: with dragon.GradientTape() as tape:
tape.watch(x) tape.watch(x)
y = dragon.math.lp_normalize( y = dragon.nn.lp_norm(
x, axis, p=p, epsilon=eps, reduction=reduction) x, axis, p=p, epsilon=eps, reduction=reduction)
dx = tape.gradient(y, [x], output_gradients=[dy])[0] dx = tape.gradient(y, [x], output_gradients=[dy])[0]
norm = np.abs(data1) if p == 1 else np.square(data1) norm = np.abs(data1) if p == 1 else np.square(data1)
...@@ -2930,9 +2944,9 @@ class TestNormalizationOps(OpTestCase): ...@@ -2930,9 +2944,9 @@ class TestNormalizationOps(OpTestCase):
self.assertEqual([y, dx], [data1 / max(norm, eps), grad]) self.assertEqual([y, dx], [data1 / max(norm, eps), grad])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable') @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_lp_normalize_cuda(self): def test_lp_norm_cuda(self):
with dragon.device('cuda'): with dragon.device('cuda'):
self.test_lp_normalize() self.test_lp_norm()
class TestRNNOps(OpTestCase): class TestRNNOps(OpTestCase):
...@@ -3028,7 +3042,7 @@ class TestTrainingOps(OpTestCase): ...@@ -3028,7 +3042,7 @@ class TestTrainingOps(OpTestCase):
def test_rmsprop_update(self): def test_rmsprop_update(self):
with execution_context().mode('EAGER_MODE'): with execution_context().mode('EAGER_MODE'):
momentum, lr = self.rmsprop.momentum, self.rmsprop.lr momentum, lr = self.rmsprop.momentum, self.rmsprop.lr
decay, eps = self.rmsprop.decay, self.rmsprop.eps alpha, eps = self.rmsprop.alpha, self.rmsprop.eps
data1 = uniform((2, 3)) data1 = uniform((2, 3))
data2, data3 = np.zeros((2, 3), 'float32'), np.zeros((2, 3), 'float32') data2, data3 = np.zeros((2, 3), 'float32'), np.zeros((2, 3), 'float32')
param = new_tensor(data1) param = new_tensor(data1)
...@@ -3036,7 +3050,7 @@ class TestTrainingOps(OpTestCase): ...@@ -3036,7 +3050,7 @@ class TestTrainingOps(OpTestCase):
data4 = uniform((2, 3)) data4 = uniform((2, 3))
grad = new_tensor(data4) grad = new_tensor(data4)
self.rmsprop.apply_gradients([[grad, param]]) self.rmsprop.apply_gradients([[grad, param]])
data3 = decay * data3 + (1 - decay) * np.square(data4) data3 = alpha * data3 + (1 - alpha) * np.square(data4)
data2 = momentum * data2 + (data4 / (np.sqrt(data3) + eps)) data2 = momentum * data2 + (data4 / (np.sqrt(data3) + eps))
data1 -= lr * data2 data1 -= lr * data2
self.assertEqual(param, data1) self.assertEqual(param, data1)
......
...@@ -20,6 +20,17 @@ from dragon.core.testing.unittest.common_utils import run_tests ...@@ -20,6 +20,17 @@ from dragon.core.testing.unittest.common_utils import run_tests
from dragon.vm import torch from dragon.vm import torch
class TestCUDA(unittest.TestCase):
"""Test the CUDA backend."""
def test_library(self):
_ = torch.backends.cuda.is_built()
def test_set_flags(self):
torch.backends.cuda.matmul.allow_tf32 = False
self.assertEqual(torch.backends.cuda.matmul.allow_tf32, False)
class TestCuDNN(unittest.TestCase): class TestCuDNN(unittest.TestCase):
"""Test the CuDNN backend.""" """Test the CuDNN backend."""
......
...@@ -169,7 +169,7 @@ class TestModule(unittest.TestCase): ...@@ -169,7 +169,7 @@ class TestModule(unittest.TestCase):
class TestModules(OpTestCase): class TestModules(OpTestCase):
"""Test the nn module class.""" """Test the nn module class."""
def test_affine_channel(self): def test_affine(self):
data1 = arange((2, 3, 4, 5)) data1 = arange((2, 3, 4, 5))
data2, data3 = arange((1, 3, 1, 1)), arange((1, 3, 1, 1)) data2, data3 = arange((1, 3, 1, 1)), arange((1, 3, 1, 1))
w, b = new_tensor(data2.flatten()), new_tensor(data3.flatten()) w, b = new_tensor(data2.flatten()), new_tensor(data3.flatten())
...@@ -181,21 +181,19 @@ class TestModules(OpTestCase): ...@@ -181,21 +181,19 @@ class TestModules(OpTestCase):
for bias, fix_weight, fix_bias in entries: for bias, fix_weight, fix_bias in entries:
x = new_tensor(data1) x = new_tensor(data1)
try: try:
m = torch.nn.AffineChannel( m = torch.nn.Affine(
num_features=3, num_features=3,
bias=bias, bias=bias,
fix_weight=fix_weight, fix_weight=fix_weight,
fix_bias=fix_bias, fix_bias=fix_bias,
inplace=True, inplace=True)
)
except ValueError: except ValueError:
m = torch.nn.AffineChannel( m = torch.nn.Affine(
num_features=3, num_features=3,
bias=bias, bias=bias,
fix_weight=fix_weight, fix_weight=fix_weight,
fix_bias=fix_bias, fix_bias=fix_bias,
inplace=False, inplace=False)
)
m.weight.copy_(w) m.weight.copy_(w)
result = data1 * data2 result = data1 * data2
if bias: if bias:
...@@ -262,6 +260,18 @@ class TestModules(OpTestCase): ...@@ -262,6 +260,18 @@ class TestModules(OpTestCase):
y, _ = m(x), repr(m) y, _ = m(x), repr(m)
self.assertEqual(y, result) self.assertEqual(y, result)
def test_channel_norm(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'dims': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'dims': (0, 2, 1)})]
for shape, args, kwargs in entries:
perm = kwargs['dims']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = torch.ones(shape, dtype='uint8')
y = torch.nn.functional.channel_norm(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
def test_channel_shuffle(self): def test_channel_shuffle(self):
entries = [(1, 4)] entries = [(1, 4)]
for axis, group in entries: for axis, group in entries:
......
...@@ -127,6 +127,12 @@ class TestTensorOps(OpTestCase): ...@@ -127,6 +127,12 @@ class TestTensorOps(OpTestCase):
result = np.expand_dims(result, axis) result = np.expand_dims(result, axis)
self.assertEqual(x.argmin(axis, keepdims), result) self.assertEqual(x.argmin(axis, keepdims), result)
def test_atan2(self):
for a_shape, b_shape in self.binary_test_shapes:
data1, data2 = arange(a_shape), arange(b_shape, 1)
a, b = new_tensor(data1, False), new_tensor(data2, False)
self.assertEqual(a.atan2(b), np.arctan2(data1, data2))
def test_baddbmm(self): def test_baddbmm(self):
entries = [((2, 2, 3), (2, 3, 4), (2, 2, 4))] entries = [((2, 2, 3), (2, 3, 4), (2, 2, 4))]
for a_shape, b_shape, c_shape in entries: for a_shape, b_shape, c_shape in entries:
...@@ -944,18 +950,6 @@ class TestTorchOps(OpTestCase): ...@@ -944,18 +950,6 @@ class TestTorchOps(OpTestCase):
y = torch.cat([x, x], dim=axis) y = torch.cat([x, x], dim=axis)
self.assertEqual(y, np.concatenate([data, data], axis=axis)) self.assertEqual(y, np.concatenate([data, data], axis=axis))
def test_channel_normalize(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'dims': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'dims': (0, 2, 1)})]
for shape, args, kwargs in entries:
perm = kwargs['dims']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = torch.ones(shape, dtype='uint8')
y = torch.channel_normalize(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
def test_linspace(self): def test_linspace(self):
entries = [([[0., 5.], [10., 40.], 5], {'dim': 0, 'dtype': 'float32'}), entries = [([[0., 5.], [10., 40.], 5], {'dim': 0, 'dtype': 'float32'}),
([[0., 5.], [10., 40.], 5], {'dim': 1, 'dtype': 'float32'}), ([[0., 5.], [10., 40.], 5], {'dim': 1, 'dtype': 'float32'}),
......
...@@ -49,8 +49,6 @@ from dragon.vm.torch.core.tensor import Tensor ...@@ -49,8 +49,6 @@ from dragon.vm.torch.core.tensor import Tensor
from dragon.vm.torch.core.ops import tensor_ops as _ from dragon.vm.torch.core.ops import tensor_ops as _
from dragon.vm.torch.core.ops.array_ops import broadcast_to from dragon.vm.torch.core.ops.array_ops import broadcast_to
from dragon.vm.torch.core.ops.array_ops import cat from dragon.vm.torch.core.ops.array_ops import cat
from dragon.vm.torch.core.ops.array_ops import channel_affine
from dragon.vm.torch.core.ops.array_ops import channel_normalize
from dragon.vm.torch.core.ops.array_ops import chunk from dragon.vm.torch.core.ops.array_ops import chunk
from dragon.vm.torch.core.ops.array_ops import flatten from dragon.vm.torch.core.ops.array_ops import flatten
from dragon.vm.torch.core.ops.array_ops import flip from dragon.vm.torch.core.ops.array_ops import flip
...@@ -71,7 +69,6 @@ from dragon.vm.torch.core.ops.array_ops import scatter_add ...@@ -71,7 +69,6 @@ from dragon.vm.torch.core.ops.array_ops import scatter_add
from dragon.vm.torch.core.ops.array_ops import split from dragon.vm.torch.core.ops.array_ops import split
from dragon.vm.torch.core.ops.array_ops import squeeze from dragon.vm.torch.core.ops.array_ops import squeeze
from dragon.vm.torch.core.ops.array_ops import stack from dragon.vm.torch.core.ops.array_ops import stack
from dragon.vm.torch.core.ops.math_ops import sum
from dragon.vm.torch.core.ops.array_ops import tile from dragon.vm.torch.core.ops.array_ops import tile
from dragon.vm.torch.core.ops.array_ops import transpose from dragon.vm.torch.core.ops.array_ops import transpose
from dragon.vm.torch.core.ops.array_ops import tril from dragon.vm.torch.core.ops.array_ops import tril
...@@ -97,6 +94,7 @@ from dragon.vm.torch.core.ops.math_ops import add ...@@ -97,6 +94,7 @@ from dragon.vm.torch.core.ops.math_ops import add
from dragon.vm.torch.core.ops.math_ops import addmm from dragon.vm.torch.core.ops.math_ops import addmm
from dragon.vm.torch.core.ops.math_ops import argmax from dragon.vm.torch.core.ops.math_ops import argmax
from dragon.vm.torch.core.ops.math_ops import argmin from dragon.vm.torch.core.ops.math_ops import argmin
from dragon.vm.torch.core.ops.math_ops import atan2
from dragon.vm.torch.core.ops.math_ops import baddbmm from dragon.vm.torch.core.ops.math_ops import baddbmm
from dragon.vm.torch.core.ops.math_ops import bitwise_and from dragon.vm.torch.core.ops.math_ops import bitwise_and
from dragon.vm.torch.core.ops.math_ops import bitwise_not from dragon.vm.torch.core.ops.math_ops import bitwise_not
...@@ -144,6 +142,7 @@ from dragon.vm.torch.core.ops.math_ops import sin ...@@ -144,6 +142,7 @@ from dragon.vm.torch.core.ops.math_ops import sin
from dragon.vm.torch.core.ops.math_ops import sqrt from dragon.vm.torch.core.ops.math_ops import sqrt
from dragon.vm.torch.core.ops.math_ops import square from dragon.vm.torch.core.ops.math_ops import square
from dragon.vm.torch.core.ops.math_ops import sub from dragon.vm.torch.core.ops.math_ops import sub
from dragon.vm.torch.core.ops.math_ops import sum
from dragon.vm.torch.core.ops.random_ops import normal from dragon.vm.torch.core.ops.random_ops import normal
from dragon.vm.torch.core.ops.random_ops import rand from dragon.vm.torch.core.ops.random_ops import rand
from dragon.vm.torch.core.ops.random_ops import randn from dragon.vm.torch.core.ops.random_ops import randn
......
...@@ -15,6 +15,7 @@ from __future__ import division as _division ...@@ -15,6 +15,7 @@ from __future__ import division as _division
from __future__ import print_function as _print_function from __future__ import print_function as _print_function
# Modules # Modules
from dragon.vm.torch.core.backends import cuda
from dragon.vm.torch.core.backends import cudnn from dragon.vm.torch.core.backends import cudnn
__all__ = [_s for _s in dir() if not _s.startswith('_')] __all__ = [_s for _s in dir() if not _s.startswith('_')]
...@@ -56,6 +56,7 @@ from dragon.vm.torch.core.nn.modules.dropout import Dropout ...@@ -56,6 +56,7 @@ from dragon.vm.torch.core.nn.modules.dropout import Dropout
from dragon.vm.torch.core.nn.modules.dropout import DropPath from dragon.vm.torch.core.nn.modules.dropout import DropPath
from dragon.vm.torch.core.nn.modules.flatten import Flatten from dragon.vm.torch.core.nn.modules.flatten import Flatten
from dragon.vm.torch.core.nn.modules.fold import Unfold from dragon.vm.torch.core.nn.modules.fold import Unfold
from dragon.vm.torch.core.nn.modules.linear import Affine
from dragon.vm.torch.core.nn.modules.linear import Identity from dragon.vm.torch.core.nn.modules.linear import Identity
from dragon.vm.torch.core.nn.modules.linear import Linear from dragon.vm.torch.core.nn.modules.linear import Linear
from dragon.vm.torch.core.nn.modules.loss import CTCLoss from dragon.vm.torch.core.nn.modules.loss import CTCLoss
...@@ -68,7 +69,6 @@ from dragon.vm.torch.core.nn.modules.loss import NLLLoss ...@@ -68,7 +69,6 @@ from dragon.vm.torch.core.nn.modules.loss import NLLLoss
from dragon.vm.torch.core.nn.modules.loss import SigmoidFocalLoss from dragon.vm.torch.core.nn.modules.loss import SigmoidFocalLoss
from dragon.vm.torch.core.nn.modules.loss import SmoothL1Loss from dragon.vm.torch.core.nn.modules.loss import SmoothL1Loss
from dragon.vm.torch.core.nn.modules.module import Module from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.modules.normalization import AffineChannel
from dragon.vm.torch.core.nn.modules.normalization import GroupNorm from dragon.vm.torch.core.nn.modules.normalization import GroupNorm
from dragon.vm.torch.core.nn.modules.normalization import LayerNorm from dragon.vm.torch.core.nn.modules.normalization import LayerNorm
from dragon.vm.torch.core.nn.modules.normalization import LocalResponseNorm from dragon.vm.torch.core.nn.modules.normalization import LocalResponseNorm
......
...@@ -20,11 +20,13 @@ from dragon.vm.torch.core.nn.functional import adaptive_avg_pool3d ...@@ -20,11 +20,13 @@ from dragon.vm.torch.core.nn.functional import adaptive_avg_pool3d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool1d from dragon.vm.torch.core.nn.functional import adaptive_max_pool1d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool2d from dragon.vm.torch.core.nn.functional import adaptive_max_pool2d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool3d from dragon.vm.torch.core.nn.functional import adaptive_max_pool3d
from dragon.vm.torch.core.nn.functional import affine
from dragon.vm.torch.core.nn.functional import avg_pool1d from dragon.vm.torch.core.nn.functional import avg_pool1d
from dragon.vm.torch.core.nn.functional import avg_pool2d from dragon.vm.torch.core.nn.functional import avg_pool2d
from dragon.vm.torch.core.nn.functional import avg_pool3d from dragon.vm.torch.core.nn.functional import avg_pool3d
from dragon.vm.torch.core.nn.functional import batch_norm from dragon.vm.torch.core.nn.functional import batch_norm
from dragon.vm.torch.core.nn.functional import binary_cross_entropy_with_logits from dragon.vm.torch.core.nn.functional import binary_cross_entropy_with_logits
from dragon.vm.torch.core.nn.functional import channel_norm
from dragon.vm.torch.core.nn.functional import channel_shuffle from dragon.vm.torch.core.nn.functional import channel_shuffle
from dragon.vm.torch.core.nn.functional import conv1d from dragon.vm.torch.core.nn.functional import conv1d
from dragon.vm.torch.core.nn.functional import conv2d from dragon.vm.torch.core.nn.functional import conv2d
......
...@@ -173,6 +173,7 @@ class Function(object): ...@@ -173,6 +173,7 @@ class Function(object):
outputs_id.append(outputs[i].id) outputs_id.append(outputs[i].id)
else: else:
if isinstance(spec, Tensor): if isinstance(spec, Tensor):
spec._device = device.copy()
outputs.append(spec) outputs.append(spec)
outputs_id.append(spec.id) outputs_id.append(spec.id)
else: else:
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""CUDA backend."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.device import cuda
from dragon.core.framework import sysconfig
class CuBLASModule(object):
"""CuBLAS module class."""
def __init__(self):
self._allow_tf32 = False
@property
def allow_tf32(self):
"""The flag that allows cuBLAS TF32 math type or not."""
return self._allow_tf32
@allow_tf32.setter
def allow_tf32(self, value):
self._allow_tf32 = value
cuda.set_cublas_flags(allow_tf32=value)
def is_built():
"""Return a bool reporting if built with CUDA support.
Returns
-------
bool
``True`` if built with CUDA support, otherwise ``False``.
"""
version = sysconfig.get_build_info().get('cuda_version', None)
return version is not None
# Module instances.
matmul = CuBLASModule()
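For reference, a minimal usage sketch of the new module (assuming it is exposed as `torch.backends.cuda`, as the `backends` import added above suggests; the effect requires a CUDA build and TF32-capable hardware):
```python
from dragon.vm import torch

if torch.backends.cuda.is_built():
    # Opt in to TF32 math for cuBLAS matmuls; the setter forwards
    # the flag to cuda.set_cublas_flags(allow_tf32=True).
    torch.backends.cuda.matmul.allow_tf32 = True
    print(torch.backends.cuda.matmul.allow_tf32)  # True
```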
...@@ -37,7 +37,7 @@ class CuDNNModule(object): ...@@ -37,7 +37,7 @@ class CuDNNModule(object):
@allow_tf32.setter @allow_tf32.setter
def allow_tf32(self, value): def allow_tf32(self, value):
self._allow_tf32 = value self._allow_tf32 = value
self._set_flags() cuda.set_cudnn_flags(allow_tf32=value)
@property @property
def benchmark(self): def benchmark(self):
...@@ -47,7 +47,7 @@ class CuDNNModule(object): ...@@ -47,7 +47,7 @@ class CuDNNModule(object):
@benchmark.setter @benchmark.setter
def benchmark(self, value): def benchmark(self, value):
self._benchmark = value self._benchmark = value
self._set_flags() cuda.set_cudnn_flags(benchmark=value)
@property @property
def deterministic(self): def deterministic(self):
...@@ -57,7 +57,7 @@ class CuDNNModule(object): ...@@ -57,7 +57,7 @@ class CuDNNModule(object):
@deterministic.setter @deterministic.setter
def deterministic(self, value): def deterministic(self, value):
self._deterministic = value self._deterministic = value
self._set_flags() cuda.set_cudnn_flags(deterministic=value)
@property @property
def enabled(self): def enabled(self):
...@@ -67,7 +67,7 @@ class CuDNNModule(object): ...@@ -67,7 +67,7 @@ class CuDNNModule(object):
@enabled.setter @enabled.setter
def enabled(self, value): def enabled(self, value):
self._enabled = value self._enabled = value
self._set_flags() cuda.set_cudnn_flags(enabled=value)
@staticmethod @staticmethod
def is_available(): def is_available():
...@@ -97,15 +97,6 @@ class CuDNNModule(object): ...@@ -97,15 +97,6 @@ class CuDNNModule(object):
version = major * 1000 + minor * 100 + patch version = major * 1000 + minor * 100 + patch
return version return version
def _set_flags(self):
"""Set all flags with current value."""
cuda.set_cudnn_flags(
enabled=self._enabled,
benchmark=self._benchmark,
deterministic=self._deterministic,
allow_tf32=self._allow_tf32,
)
# Module instances. # Module instances.
sys.modules[__name__] = CuDNNModule() sys.modules[__name__] = CuDNNModule()
......
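The refactor above replaces the shared `_set_flags()` helper with one targeted `cuda.set_cudnn_flags(...)` call per setter, so changing a single flag no longer re-sends the other three. A minimal sketch of the resulting usage (flag values are illustrative):
```python
from dragon.vm import torch

# Each assignment now issues exactly one backend call, e.g.
# cuda.set_cudnn_flags(benchmark=True), rather than pushing
# enabled/benchmark/deterministic/allow_tf32 all at once.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
```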
...@@ -170,8 +170,42 @@ def adaptive_max_pool3d(input, output_size): ...@@ -170,8 +170,42 @@ def adaptive_max_pool3d(input, output_size):
return _pool('MAX', utils._triple, input, **args) return _pool('MAX', utils._triple, input, **args)
def affine(input, weight, bias=None, dim=-1, out=None):
r"""Apply affine transformation to input.
.. math:: \text{out} = \text{input} \times \text{weight} + \text{bias}
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
weight : dragon.vm.torch.Tensor
The weight tensor.
bias : dragon.vm.torch.Tensor, optional
The bias tensor.
dim : Union[int, Sequence[int]], optional
The dimension(s) to apply the transformation.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.nn.Affine(...)`_
"""
return Function.apply(
'Affine', input.device,
[input, weight] + ([bias] if bias else []), outputs=[out],
axes=nest.flatten(dim))
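A short hedged example of the new functional (shapes are illustrative; `nn.functional` follows the alias used in the docstrings above):
```python
from dragon.vm import torch
from dragon.vm.torch import nn

x = torch.ones(2, 3, 4)
w = torch.ones(3) * 2.
b = torch.ones(3)
# Scale and shift along dim 1: y = x * 2 + 1, broadcast over dims 0 and 2.
y = nn.functional.affine(x, w, b, dim=1)
print(y.shape)  # (2, 3, 4)
```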
def avg_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False): def avg_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 1d average pooling to input. """Apply the 1d average pooling to input.
Parameters Parameters
---------- ----------
...@@ -200,7 +234,7 @@ def avg_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False): ...@@ -200,7 +234,7 @@ def avg_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def avg_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False): def avg_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 2d average pooling to input. """Apply the 2d average pooling to input.
Parameters Parameters
---------- ----------
...@@ -229,7 +263,7 @@ def avg_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False): ...@@ -229,7 +263,7 @@ def avg_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def avg_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False): def avg_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 3d average pooling to input. """Apply the 3d average pooling to input.
Parameters Parameters
---------- ----------
...@@ -267,7 +301,7 @@ def batch_norm( ...@@ -267,7 +301,7 @@ def batch_norm(
momentum=0.1, momentum=0.1,
eps=1e-5, eps=1e-5,
): ):
r"""Apply the batch normalization to input. """Apply the batch normalization to input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_. `[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
Parameters Parameters
...@@ -315,7 +349,7 @@ def binary_cross_entropy_with_logits( ...@@ -315,7 +349,7 @@ def binary_cross_entropy_with_logits(
reduction='mean', reduction='mean',
pos_weight=None, pos_weight=None,
): ):
r"""Compute the sigmoid cross entropy with contiguous target. """Compute the sigmoid cross entropy with contiguous target.
Parameters Parameters
---------- ----------
...@@ -353,6 +387,55 @@ def binary_cross_entropy_with_logits( ...@@ -353,6 +387,55 @@ def binary_cross_entropy_with_logits(
[input, target], reduction=reduction.upper()) [input, target], reduction=reduction.upper())
def channel_norm(input, mean, std, dim=-1, dtype='float32', dims=None):
"""Apply the normalization to each channel of input.
:attr:`dim` can be negative:
```python
m = s = (1., 1., 1.)
x = torch.tensor([1, 2, 3])
print(nn.functional.channel_norm(x, m, s, dim=0)) # [0., 1., 2.]
print(nn.functional.channel_norm(x, m, s, dim=-1)) # Equivalent
```
If :attr:`dims` is provided, :attr:`dim` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = torch.tensor([[1, 2, 3]])
# 3 values are provided to normalize the last dimension,
# which has length 1, so only the first value is taken
print(nn.functional.channel_norm(x, m, s, dims=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
dim : int, optional, default=-1
The channel dimension.
dtype : str, optional, default='float32'
The output data type.
dims : Sequence[int], optional
The order of output dimensions.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelNorm', input.device, [input],
axis=dim, mean=mean, std=std, dtype=dtype,
ndim=len(dims) if dims is not None else 0, perm=dims)
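A hedged NumPy cross-check of the first docstring example above; the op subtracts `mean` and divides by `std` along the channel dimension, casting to `dtype` (values are illustrative):
```python
import numpy as np
from dragon.vm import torch
from dragon.vm.torch import nn

x = torch.tensor([1, 2, 3])
y = nn.functional.channel_norm(x, (1., 2., 3.), (1., 1., 1.), dim=0)
# Reference computation: subtract mean, divide std along dim 0.
ref = (np.array([1., 2., 3.]) - np.array([1., 2., 3.])) / np.ones(3)
print(y)    # [0., 0., 0.]
print(ref)  # [0. 0. 0.]
```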
def channel_shuffle(input, groups): def channel_shuffle(input, groups):
"""Apply group shuffle to each channel of input. """Apply group shuffle to each channel of input.
`[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_. `[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_.
...@@ -387,7 +470,7 @@ def conv1d( ...@@ -387,7 +470,7 @@ def conv1d(
dilation=1, dilation=1,
groups=1, groups=1,
): ):
r"""Apply the 1d convolution to input. """Apply the 1d convolution to input.
Parameters Parameters
---------- ----------
...@@ -428,7 +511,7 @@ def conv2d( ...@@ -428,7 +511,7 @@ def conv2d(
dilation=1, dilation=1,
groups=1, groups=1,
): ):
r"""Apply the 2d convolution to input. """Apply the 2d convolution to input.
Parameters Parameters
---------- ----------
...@@ -469,7 +552,7 @@ def conv3d( ...@@ -469,7 +552,7 @@ def conv3d(
dilation=1, dilation=1,
groups=1, groups=1,
): ):
r"""Apply the 3d convolution to input. """Apply the 3d convolution to input.
Parameters Parameters
---------- ----------
...@@ -511,7 +594,7 @@ def conv_transpose1d( ...@@ -511,7 +594,7 @@ def conv_transpose1d(
groups=1, groups=1,
dilation=1, dilation=1,
): ):
r"""Apply the 1d deconvolution to input. """Apply the 1d deconvolution to input.
Parameters Parameters
---------- ----------
...@@ -555,7 +638,7 @@ def conv_transpose2d( ...@@ -555,7 +638,7 @@ def conv_transpose2d(
groups=1, groups=1,
dilation=1, dilation=1,
): ):
r"""Apply the 2d deconvolution to input. """Apply the 2d deconvolution to input.
Parameters Parameters
---------- ----------
...@@ -599,7 +682,7 @@ def conv_transpose3d( ...@@ -599,7 +682,7 @@ def conv_transpose3d(
groups=1, groups=1,
dilation=1, dilation=1,
): ):
r"""Apply the 3d deconvolution to input. """Apply the 3d deconvolution to input.
Parameters Parameters
---------- ----------
...@@ -747,7 +830,7 @@ def depthwise_conv2d( ...@@ -747,7 +830,7 @@ def depthwise_conv2d(
padding=0, padding=0,
dilation=1, dilation=1,
): ):
r"""Apply the 2d depthwise convolution to input. """Apply the 2d depthwise convolution to input.
Parameters Parameters
---------- ----------
...@@ -778,7 +861,7 @@ def depthwise_conv2d( ...@@ -778,7 +861,7 @@ def depthwise_conv2d(
def dropout(input, p=0.5, training=True, inplace=False): def dropout(input, p=0.5, training=True, inplace=False):
r"""Set the elements of the input to zero randomly. """Set the elements of the input to zero randomly.
`[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_. `[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
Parameters Parameters
...@@ -810,7 +893,7 @@ def dropout(input, p=0.5, training=True, inplace=False): ...@@ -810,7 +893,7 @@ def dropout(input, p=0.5, training=True, inplace=False):
def drop_block2d(input, p=0.5, block_size=1, training=True, inplace=False): def drop_block2d(input, p=0.5, block_size=1, training=True, inplace=False):
r"""Set the blocks over input to zero randomly. """Set the blocks over input to zero randomly.
Parameters Parameters
---------- ----------
...@@ -994,6 +1077,15 @@ def group_norm(input, num_groups, weight, bias, eps=1e-5): ...@@ -994,6 +1077,15 @@ def group_norm(input, num_groups, weight, bias, eps=1e-5):
def hardsigmoid(input, inplace=False): def hardsigmoid(input, inplace=False):
r"""Apply the hard sigmoid function to input. r"""Apply the hard sigmoid function to input.
The **HardSigmoid** function is defined as:
.. math::
\text{Hardsigmoid}(x) = \begin{cases}
0 & \text{if~} x \le -3, \\
1 & \text{if~} x \ge +3, \\
x / 6 + 1 / 2 & \text{otherwise}
\end{cases}
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
...@@ -1020,6 +1112,15 @@ def hardswish(input): ...@@ -1020,6 +1112,15 @@ def hardswish(input):
r"""Apply the hard swish function to input. r"""Apply the hard swish function to input.
`[Howard et.al, 2019] <https://arxiv.org/abs/1905.02244>`_. `[Howard et.al, 2019] <https://arxiv.org/abs/1905.02244>`_.
The **HardSwish** function is defined as:
.. math::
\text{HardSwish}(x) = \begin{cases}
0 & \text{if~} x \le -3, \\
x & \text{if~} x \ge +3, \\
x \cdot (x + 3) / 6 & \text{otherwise}
\end{cases}
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
...@@ -1161,7 +1262,7 @@ def kl_div( ...@@ -1161,7 +1262,7 @@ def kl_div(
def l1_loss(input, target, size_average=None, reduce=None, reduction='mean'): def l1_loss(input, target, size_average=None, reduce=None, reduction='mean'):
r"""Compute the element-wise absolute value difference. """Compute the element-wise absolute value difference.
Parameters Parameters
---------- ----------
...@@ -1196,7 +1297,7 @@ def l1_loss(input, target, size_average=None, reduce=None, reduction='mean'): ...@@ -1196,7 +1297,7 @@ def l1_loss(input, target, size_average=None, reduce=None, reduction='mean'):
def layer_norm(input, normalized_shape, weight, bias, eps=1e-5): def layer_norm(input, normalized_shape, weight, bias, eps=1e-5):
r"""Apply the layer normalization to input. """Apply the layer normalization to input.
`[Ba et.al, 2016] <https://arxiv.org/abs/1607.06450>`_ `[Ba et.al, 2016] <https://arxiv.org/abs/1607.06450>`_
Parameters Parameters
...@@ -1387,7 +1488,7 @@ def lstm_cell(input, cx): ...@@ -1387,7 +1488,7 @@ def lstm_cell(input, cx):
def max_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False): def max_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 1d max pooling to input. """Apply the 1d max pooling to input.
Parameters Parameters
---------- ----------
...@@ -1416,7 +1517,7 @@ def max_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False): ...@@ -1416,7 +1517,7 @@ def max_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def max_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False): def max_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 2d max pooling to input. """Apply the 2d max pooling to input.
Parameters Parameters
---------- ----------
...@@ -1445,7 +1546,7 @@ def max_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False): ...@@ -1445,7 +1546,7 @@ def max_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def max_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False): def max_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 3d max pooling to input. """Apply the 3d max pooling to input.
Parameters Parameters
---------- ----------
...@@ -1474,11 +1575,7 @@ def max_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False): ...@@ -1474,11 +1575,7 @@ def max_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def mse_loss(input, target, size_average=None, reduce=None, reduction='mean'): def mse_loss(input, target, size_average=None, reduce=None, reduction='mean'):
r"""Compute the element-wise squared error. """Compute the element-wise squared error.
The ``MSELoss`` function is defined as:
.. math:: \text{MSELoss}(x, y) = (x - y)^{2}
Parameters Parameters
---------- ----------
...@@ -1726,7 +1823,7 @@ def normalize(input, p=2, dim=1, end_dim=None, eps=1e-12, out=None): ...@@ -1726,7 +1823,7 @@ def normalize(input, p=2, dim=1, end_dim=None, eps=1e-12, out=None):
""" """
return Function.apply( return Function.apply(
'LpNormalize', input.device, [input], outputs=[out], 'LpNorm', input.device, [input], outputs=[out],
p=p, axis=dim, end_axis=end_dim, epsilon=eps, reduction='SUM') p=p, axis=dim, end_axis=end_dim, epsilon=eps, reduction='SUM')
......
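For the HardSigmoid and HardSwish definitions added to the docstrings above, a minimal NumPy sketch of the two piecewise functions (illustration only, not the kernel code):
```python
import numpy as np

def hardsigmoid(x):
    # 0 if x <= -3, 1 if x >= +3, else x / 6 + 1 / 2.
    return np.clip(x / 6. + 0.5, 0., 1.)

def hardswish(x):
    # 0 if x <= -3, x if x >= +3, else x * (x + 3) / 6.
    return x * np.clip(x / 6. + 0.5, 0., 1.)

print(hardsigmoid(np.array([-4., 0., 4.])))  # [0.  0.5 1. ]
print(hardswish(np.array([-4., 0., 4.])))    # [-0.  0.  4.]
```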
...@@ -19,9 +19,97 @@ import math ...@@ -19,9 +19,97 @@ import math
from dragon.vm.torch.core.nn import functional as F from dragon.vm.torch.core.nn import functional as F
from dragon.vm.torch.core.nn.modules.module import Module from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.parameter import Parameter from dragon.vm.torch.core.nn.parameter import Parameter
from dragon.vm.torch.core.ops import constant_ops
from dragon.vm.torch.core.tensor import Tensor from dragon.vm.torch.core.tensor import Tensor
class Affine(Module):
"""Apply affine transformation.
Affine is often taken as a post-processing of normalization.
Examples:
```python
m = torch.nn.Affine(5)
# Apply a 2d transformation
x2d = torch.ones(3, 5)
y2d = m(x2d)
# Apply a 3d transformation
x3d = torch.ones(3, 5, 4)
y3d = m(x3d)
# Apply a 4d transformation
x4d = torch.ones(3, 5, 2, 2)
y4d = m(x4d)
```
See Also
--------
`torch.nn.functional.affine(...)`_
"""
def __init__(
self,
num_features,
bias=True,
fix_weight=False,
fix_bias=False,
inplace=False,
):
"""Create an ``AffineChannel`` module.
Parameters
----------
num_features : int
The number of channels.
bias : bool, optional, default=True
``True`` to attach a bias.
fix_weight : bool, optional, default=False
``True`` to freeze the ``weight``.
fix_bias : bool, optional, default=False
``True`` to freeze the ``bias``.
inplace : bool, optional, default=False
Whether to do the operation in-place.
"""
super(Affine, self).__init__()
self.num_features = num_features
self.inplace = inplace
if not fix_weight:
self.weight = Parameter(constant_ops.ones(num_features))
if inplace:
raise ValueError('In-place operation requires fixed weight.')
else:
self.register_buffer('weight', constant_ops.ones(num_features))
if bias:
if not fix_bias:
self.bias = Parameter(constant_ops.zeros(num_features))
else:
self.register_buffer('bias', constant_ops.zeros(num_features))
else:
self.bias = None
def extra_repr(self):
s = '{num_features}, ' \
'inplace={inplace}'.format(**self.__dict__)
if self.bias is None:
s += ', bias=False'
return s
def forward(self, input):
return F.affine(
input,
self.weight,
self.bias,
dim=1,
out=input if self.inplace else None,
)
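A brief usage note for the new module: `weight` initializes to ones and `bias` to zeros, so the transform starts as the identity until the parameters are trained. A hedged sketch:
```python
from dragon.vm import torch

m = torch.nn.Affine(5)
x = torch.ones(3, 5)
y = m(x)  # Initially y == x * 1 + 0 along dim 1.
print(y.shape)  # (3, 5)
```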
class Identity(Module): class Identity(Module):
r"""Apply the identity transformation. r"""Apply the identity transformation.
......
...@@ -20,98 +20,10 @@ from dragon.core.util import nest ...@@ -20,98 +20,10 @@ from dragon.core.util import nest
from dragon.vm.torch.core.nn import functional as F from dragon.vm.torch.core.nn import functional as F
from dragon.vm.torch.core.nn.modules.module import Module from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.parameter import Parameter from dragon.vm.torch.core.nn.parameter import Parameter
from dragon.vm.torch.core.ops import array_ops
from dragon.vm.torch.core.ops import constant_ops from dragon.vm.torch.core.ops import constant_ops
from dragon.vm.torch.core.tensor import Tensor from dragon.vm.torch.core.tensor import Tensor
class AffineChannel(Module):
"""Apply affine transformation to channels.
Affine is often taken as a post-processing of normalization.
Examples:
```python
m = torch.nn.AffineChannel(5)
# Apply a 2d transformation
x2d = torch.ones(3, 5)
y2d = m(x2d)
# Apply a 3d transformation
x3d = torch.ones(3, 5, 4)
y3d = m(x3d)
# Apply a 4d transformation
x4d = torch.ones(3, 5, 2, 2)
y4d = m(x4d)
```
See Also
--------
`torch.channel_affine(...)`_
"""
def __init__(
self,
num_features,
bias=True,
fix_weight=False,
fix_bias=False,
inplace=False,
):
"""Create an ``AffineChannel`` module.
Parameters
----------
num_features : int
The number of channels.
bias : bool, optional, default=True
``True`` to attach a bias.
fix_weight : bool, optional, default=False
``True`` to frozen the ``weight``.
fix_bias : bool, optional, default=False
``True`` to frozen the ``bias``.
inplace : bool, optional, default=False
Whether to do the operation in-place.
"""
super(AffineChannel, self).__init__()
self.num_features = num_features
self.inplace = inplace
if not fix_weight:
self.weight = Parameter(constant_ops.ones(num_features))
if inplace:
raise ValueError('In-place operation requires fixed weight.')
else:
self.register_buffer('weight', constant_ops.ones(num_features))
if bias:
if not fix_bias:
self.bias = Parameter(constant_ops.zeros(num_features))
else:
self.register_buffer('bias', constant_ops.zeros(num_features))
else:
self.bias = None
def extra_repr(self):
s = '{num_features}, ' \
'inplace={inplace}'.format(**self.__dict__)
if self.bias is None:
s += ', bias=False'
return s
def forward(self, input):
return array_ops.channel_affine(
input,
self.weight,
self.bias,
dim=1,
out=input if self.inplace else None,
)
class GroupNorm(Module): class GroupNorm(Module):
r"""Apply the group normalization. r"""Apply the group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_. `[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
......
...@@ -42,85 +42,6 @@ def cat(tensors, dim=0, out=None): ...@@ -42,85 +42,6 @@ def cat(tensors, dim=0, out=None):
'Concat', tensors[0].device, tensors, outputs=[out], axis=dim) 'Concat', tensors[0].device, tensors, outputs=[out], axis=dim)
def channel_affine(input, weight, bias=None, dim=-1, end_dim=None, out=None):
"""Apply affine transformation to each channel of input.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
weight : dragon.vm.torch.Tensor
The weight tensor.
bias : dragon.vm.torch.Tensor, optional
The bias tensor.
dim : int, optional, default=-1
The first channel dimension.
end_dim : int, optional
The last channel dimension.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelAffine', input.device,
[input, weight] + ([bias] if bias else []), outputs=[out],
axis=dim, end_axis=end_dim)
def channel_normalize(input, mean, std, dim=-1, dtype='float32', dims=None):
"""Apply normalization to each channel of input.
:attr:`dim` can be negative:
```python
m = s = (1., 1., 1.)
x = torch.tensor([1, 2, 3])
print(torch.channel_normalize(x, m, s, dim=0)) # [0., 1., 2.]
print(torch.channel_normalize(x, m, s, dim=-1)) # Equivalent
```
If :attr:`dims` provided, :attr:`dim` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = torch.tensor([[1, 2, 3]])
# Provided 3 values to normalize the last dimension
# with length 1, only the first value will be taken
print(torch.channel_normalize(x, m, s, dims=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
dim : int, optional, default=-1
The channel dimension.
dtype : str, optional, default='float32'
The output data type.
dims : Sequence[int], optional
The order of output dimensions.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelNormalize', input.device, [input],
axis=dim, mean=mean, std=std, dtype=dtype,
ndim=len(dims) if dims is not None else 0, perm=dims)
def chunk(tensor, chunks, dim=0, copy=True): def chunk(tensor, chunks, dim=0, copy=True):
"""Split input into a specific number of chunks. """Split input into a specific number of chunks.
......
...@@ -168,6 +168,37 @@ def argmin(input, dim, keepdim=False, out=None): ...@@ -168,6 +168,37 @@ def argmin(input, dim, keepdim=False, out=None):
axis=dim, keepdims=keepdim) axis=dim, keepdims=keepdim)
def atan2(input, other, out=None):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input}}{\text{other}})
Examples:
```python
y = torch.tensor(1.)
x = torch.tensor(2.)
print(torch.atan2(y, x)) # 0.46364761
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
other : Union[dragon.vm.torch.Tensor, number]
The tensor to divide by.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _binary_func(input, other, 'Atan2', out)
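A quick NumPy cross-check of the docstring example above:
```python
import numpy as np
from dragon.vm import torch

y = torch.tensor(1.)
x = torch.tensor(2.)
print(torch.atan2(y, x))   # 0.46364761
print(np.arctan2(1., 2.))  # 0.4636476090008061
```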
def baddbmm(input, batch1, batch2, beta=1, alpha=1, out=None): def baddbmm(input, batch1, batch2, beta=1, alpha=1, out=None):
r"""Add input to the result of batched matrix-matrix multiplication. r"""Add input to the result of batched matrix-matrix multiplication.
......
...@@ -186,6 +186,29 @@ def argsort(self, dim=-1, descending=False): ...@@ -186,6 +186,29 @@ def argsort(self, dim=-1, descending=False):
return sort_ops.argsort(self, dim, descending) return sort_ops.argsort(self, dim, descending)
def atan2(self, other):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{self}}{\text{other}})
Parameters
----------
other : Union[dragon.vm.torch.Tensor, number]
The value to divide.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.atan2(...)`_
"""
return math_ops.atan2(self, other)
def baddbmm(self, batch1, batch2, beta=1, alpha=1): def baddbmm(self, batch1, batch2, beta=1, alpha=1):
r"""Add the result of batched matrix-matrix multiplication. r"""Add the result of batched matrix-matrix multiplication.
...@@ -3051,6 +3074,7 @@ Tensor.addmm = addmm ...@@ -3051,6 +3074,7 @@ Tensor.addmm = addmm
Tensor.argmax = argmax Tensor.argmax = argmax
Tensor.argmin = argmin Tensor.argmin = argmin
Tensor.argsort = argsort Tensor.argsort = argsort
Tensor.atan2 = atan2
Tensor.backward = backward Tensor.backward = backward
Tensor.baddbmm = baddbmm Tensor.baddbmm = baddbmm
Tensor.baddbmm_ = baddbmm_ Tensor.baddbmm_ = baddbmm_
......
...@@ -28,9 +28,11 @@ class Adam(Optimizer): ...@@ -28,9 +28,11 @@ class Adam(Optimizer):
The **Adam** update is defined as: The **Adam** update is defined as:
.. math:: .. math::
\text{Adam}(g) = \text{lr} * \frac{m_{t}}{\sqrt{v_{t}} + \epsilon} \\ \text{Adam}(g) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad \quad \\ \text{where}\quad
\begin{cases} \begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\ m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases} \end{cases}
...@@ -88,12 +90,13 @@ class AdamW(Adam): ...@@ -88,12 +90,13 @@ class AdamW(Adam):
The **AdamW** update is defined as: The **AdamW** update is defined as:
.. math:: .. math::
\text{AdamW}(g, p) = \text{lr} * (\frac{m_{t}}{\sqrt{v_{t}} + \epsilon} \text{AdamW}(g, p) = \text{lr} * (\frac{\text{correction} * m_{t}}
+ \lambda p) \\ {\sqrt{v_{t}} + \epsilon} + \lambda p) \\
\quad \\ \text{where}\quad \quad \\ \text{where}\quad
\begin{cases} \begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\ m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} \\
\end{cases} \end{cases}
""" """
......
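To make the fused bias correction above concrete, a hedged NumPy sketch of a single Adam step (illustrative only; the actual update runs inside the fused kernels):
```python
import numpy as np

def adam_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    """One bias-corrected Adam step following the formula above."""
    m = beta1 * m + (1. - beta1) * g
    v = beta2 * v + (1. - beta2) * np.square(g)
    # correction = sqrt(1 - beta2^t) / (1 - beta1^t), fused into the update.
    correction = np.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)
    p = p - lr * correction * m / (np.sqrt(v) + eps)
    return p, m, v
```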
...@@ -78,5 +78,4 @@ class RMSprop(Optimizer): ...@@ -78,5 +78,4 @@ class RMSprop(Optimizer):
defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps,
centered=centered, weight_decay=weight_decay) centered=centered, weight_decay=weight_decay)
super(RMSprop, self).__init__(params, defaults, **kwargs) super(RMSprop, self).__init__(params, defaults, **kwargs)
self._hyper['alpha'][0] = 'decay'
self._hyper.pop('centered') # Unsupported. self._hyper.pop('centered') # Unsupported.
...@@ -372,6 +372,27 @@ class Tensor(object): ...@@ -372,6 +372,27 @@ class Tensor(object):
""" """
def atan2(self, other):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{self}}{\text{other}})
Parameters
----------
other : Union[dragon.vm.torch.Tensor, number]
The value to divide by.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.atan2(...)`_
"""
def backward(self, gradient=None, retain_graph=False): def backward(self, gradient=None, retain_graph=False):
"""Compute the derivatives of this tensor w.r.t. graph leaves. """Compute the derivatives of this tensor w.r.t. graph leaves.
......