Commit a96b9375 by Ting PAN

Update documentation

1 parent 01602fc9
<p align="center">
  <img width="100%" src="https://dragon.seetatech.com/download/dragon/assets/banner.png"/>
</p>

[Dragon](https://dragon.seetatech.com) is a machine learning library that provides diverse programming styles for AI modeling. It builds a virtual machine for the computation graph on top of a carefully designed intermediate representation, decoupling execution from the specific invocation. As a result, it can fuse modern frameworks and integrations together, powered by a unified engine.

Dragon is devoted to providing a universal but invisible interface for designing AI models. Developers can keep their codebase and familiar interfaces in this framework; it aims to relieve them of the burden of porting projects written for other frameworks while achieving similar or even better performance.

Dragon actively tracks the releases of [PyTorch](https://www.pytorch.org/) and [TensorFlow](https://www.tensorflow.org) and dispatches AI computation to diverse accelerators, including the newest NVIDIA GPUs and Apple Silicon processors. It is the first deep learning framework that focuses on developing multiple styles rather than promoting a private interface. We will always learn from the AI community to evolve Dragon over time.

## Installation

See the [install guide](https://dragon.seetatech.com/install) for the pip package or how to build from source.

## License

[BSD 2-Clause license](https://github.com/seetaresearch/dragon/blob/master/LICENSE)
...@@ -41,7 +41,6 @@ from dragon.vm.dali.core.ops.image_ops import WarpAffine
from dragon.vm.dali.core.ops.math_ops import Normalize
from dragon.vm.dali.core.ops.random_ops import CoinFlip
from dragon.vm.dali.core.ops.random_ops import Uniform
from dragon.vm.dali.core.ops.reader_ops import CGRecordReader
from dragon.vm.dali.core.ops.reader_ops import TFRecordReader
__all__ = [_s for _s in dir() if not _s.startswith('_')]
...@@ -14,147 +14,16 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import json
import multiprocessing as mp
import os

try:
    from nvidia.dali import ops
    from nvidia.dali import tfrecord as tfrec
except ImportError:
    from dragon.core.util import deprecation
    ops = deprecation.NotInstalled('nvidia.dali')
    tfrec = deprecation.NotInstalled('nvidia.dali')

try:
    import codewithgpu
except ImportError:
    codewithgpu = deprecation.NotInstalled('codewithgpu')

from dragon.vm.dali.core.framework import context
from dragon.vm.dali.core.ops.builtin_ops import ExternalSource
class CGRecordReader(object):
    """Read examples from the CGRecord.

    Examples:

    ```python
    class MyPipeline(dali.Pipeline):

        def __init__(self):
            super(MyPipeline, self).__init__()
            # Assume that we have the following files:
            # /path/to/records/00000.data
            # /path/to/records/00000.index
            # /path/to/records/METADATA
            self.reader = dali.ops.CGRecordReader(
                path='/path/to/records',
                features=('image', 'label'),
                pipeline=self,
                # Shuffle locally within the next ``initial_fill`` examples.
                # Shuffling weakens as ``initial_fill`` decreases
                # and is disabled if ``initial_fill`` is set to **1**.
                random_shuffle=True, initial_fill=1024)

        def iter_step(self):
            self.reader.feed_inputs()

        def define_graph(self):
            inputs = self.reader()
    ```

    """

    def __init__(
        self,
        path,
        features,
        pipeline,
        shard_id=0,
        num_shards=1,
        random_shuffle=False,
        initial_fill=1024,
        **kwargs
    ):
        """Create a ``CGRecordReader``.

        Parameters
        ----------
        path : str
            The folder of record files.
        features : Sequence[str], required
            The name of features to extract.
        pipeline : nvidia.dali.Pipeline, required
            The pipeline to connect to.
        shard_id : int, optional, default=0
            The index of partition to read.
        num_shards : int, optional, default=1
            The total number of partitions over dataset.
        random_shuffle : bool, optional, default=False
            Whether to shuffle the data.
        initial_fill : int, optional, default=1024
            The length of sampling sequence for shuffle.

        """
        self._pipe = pipeline
        self._batch_size = pipeline.batch_size
        self._prefetch_depth = pipeline._prefetch_queue_depth
        self._buffer = mp.Queue(self._prefetch_depth * self._batch_size)
        self._dataset_reader = codewithgpu.DatasetReader(
            path=path, output_queue=self._buffer,
            partition_idx=shard_id, num_partitions=num_shards,
            shuffle=random_shuffle, initial_fill=initial_fill, **kwargs)
        self._dataset_reader.start()

        with context.device('cpu'):
            self.features = dict((k, ExternalSource()) for k in features)

        def cleanup():
            self.terminate()

        import atexit
        atexit.register(cleanup)

    def example_to_data(self, example):
        """Define the translation from example to array data.

        Override this method to implement the translation.

        """
        raise NotImplementedError

    def feed_inputs(self):
        """Feed the data to edge references.

        Call this method in the ``Pipeline.iter_setup(...)``.

        """
        feed_dict = collections.defaultdict(list)
        for i in range(self._pipe.batch_size):
            data = self.example_to_data(self._buffer.get())
            for k, v in data.items():
                feed_dict[k].append(v)
        for k, v in self.features.items():
            self._pipe.feed_input(self.features[k], feed_dict[k])

    def terminate(self):
        """Terminate the reader."""
        self._dataset_reader.terminate()
        self._dataset_reader.join()

    def __call__(self, *args, **kwargs):
        """Create the edge references for features.

        Call this method in the ``Pipeline.define_graph(...)``.

        Returns
        -------
        Dict[str, _EdgeReference]
            The feature reference dict.

        """
        self.features = dict((k, v()) for k, v in self.features.items())
        return self.features
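# --- Usage sketch (illustration only, not part of the diff above) ---
# A pipeline feeds a ``CGRecordReader`` by subclassing it and overriding
# ``example_to_data``; the feature names and example layout below are
# hypothetical assumptions.
import numpy as np

class MyReader(CGRecordReader):

    def example_to_data(self, example):
        # Assume each example is a dict with encoded image bytes and an integer label.
        image = np.frombuffer(example['image'], dtype='uint8')
        label = np.array(example['label'], dtype='int64')
        return {'image': image, 'label': label}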
class TFRecordReader(object):
...@@ -232,10 +101,14 @@ class TFRecordReader(object):
        if meta_data_file is None:
            raise FileNotFoundError('Expected meta data file: %s' % meta_data_file)
        with open(os.path.join(path, meta_data_file), 'r') as f:
            features = json.load(f)['features']
        for k in list(features.keys()):
            shape, dtype, default_value = features[k]
            dtype = getattr(tfrec, 'string' if dtype == 'bytes' else dtype)
            if shape is None:
                features[k] = tfrec.VarLenFeature(dtype, default_value)
            else:
                features[k] = tfrec.FixedLenFeature(shape, dtype, default_value)
        data_files.sort()
        index_files.sort()
        data = [os.path.join(path, e) for e in data_files]
......
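For reference, here is a minimal sketch of how the parsed METADATA file maps to DALI feature specs, assuming the JSON layout implied by the loop above; the concrete feature names and values are hypothetical:

```python
import json
from nvidia.dali import tfrecord as tfrec

# Hypothetical METADATA contents: {"features": {name: [shape, dtype, default_value]}}.
meta = json.loads('{"features": {"image": [null, "bytes", ""], "label": [[1], "int64", -1]}}')

features = {}
for name, (shape, dtype, default_value) in meta['features'].items():
    dtype = getattr(tfrec, 'string' if dtype == 'bytes' else dtype)
    if shape is None:
        features[name] = tfrec.VarLenFeature(dtype, default_value)  # variable-length bytes
    else:
        features[name] = tfrec.FixedLenFeature(shape, dtype, default_value)  # fixed-shape value
```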
...@@ -16,10 +16,10 @@ Requirements
pip install sphinx
```
- sphinx-seeta-theme
```bash
pip install sphinx-seeta-theme
```
- doxygen (C++ API only)
......
...@@ -2083,7 +2083,7 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
PREDEFINED = DRAGON_API= USE_MPI USE_CUDA USE_CUDNN USE_MPS USE_NCCL
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
......
...@@ -15,6 +15,9 @@ dragon/core
`class Graph <core/Graph.html>`_
: Graph to execute operators sequentially.
`class MPSContext <core/MPSContext.html>`_
: The mps device context.
`class Operator <core/Operator.html>`_
: The base operator class with context.
...@@ -22,10 +25,10 @@ dragon/core
: Class to record the schema of operator.
`class Tensor <core/Tensor.html>`_
: The base tensor class.
`class TypeMeta <core/TypeMeta.html>`_
: The meta class for all types.
`class UnifiedMemory <core/UnifiedMemory.html>`_
: Memory to manage both the host and device data.
...@@ -39,6 +42,7 @@ dragon/core
core/CPUContext
core/CUDAContext
core/Graph
core/MPSContext
core/Operator
core/OpSchema
core/Tensor
......
...@@ -10,6 +10,29 @@ Constructors
.. doxygenfunction:: dragon::CPUContext::CPUContext(unsigned int random_seed)
.. doxygenfunction:: dragon::CPUContext::CPUContext(const DeviceOption &option)
Public Properties
-----------------
device
######
.. doxygenfunction:: dragon::CPUContext::device
rand_generator
##############
.. doxygenfunction:: dragon::CPUContext::rand_generator
stream
######
.. doxygenfunction:: dragon::CPUContext::stream
workspace
#########
.. doxygenfunction:: dragon::CPUContext::workspace
set_stream
##########
.. doxygenfunction:: dragon::CPUContext::set_stream
Public Functions
----------------
...@@ -49,26 +72,6 @@ SwitchToDevice
##############
.. doxygenfunction:: dragon::CPUContext::SwitchToDevice
device
######
.. doxygenfunction:: dragon::CPUContext::device
rand_generator
##############
.. doxygenfunction:: dragon::CPUContext::rand_generator
set_stream
##########
.. doxygenfunction:: dragon::CPUContext::set_stream
stream
######
.. doxygenfunction:: dragon::CPUContext::stream
workspace
#########
.. doxygenfunction:: dragon::CPUContext::workspace
.. raw:: html
<style>
......
...@@ -10,6 +10,65 @@ Constructors
.. doxygenfunction:: dragon::CUDAContext::CUDAContext(int device)
.. doxygenfunction:: dragon::CUDAContext::CUDAContext(const DeviceOption &option)
Public Properties
-----------------
cublas_handle
#############
.. doxygenfunction:: dragon::CUDAContext::cublas_handle
cuda_stream
###########
.. doxygenfunction:: dragon::CUDAContext::cuda_stream()
cuda_stream
###########
.. doxygenfunction:: dragon::CUDAContext::cuda_stream(int device, int stream)
cudnn_handle
############
.. doxygenfunction:: dragon::CUDAContext::cudnn_handle
curand_generator
################
.. doxygenfunction:: dragon::CUDAContext::curand_generator
current_device
##############
.. doxygenfunction:: dragon::CUDAContext::current_device
device
######
.. doxygenfunction:: dragon::CUDAContext::device
mutex
#####
.. doxygenfunction:: dragon::CUDAContext::mutex
objects
#######
.. doxygenfunction:: dragon::CUDAContext::objects
rand_generator
##############
.. doxygenfunction:: dragon::CUDAContext::rand_generator
stream
######
.. doxygenfunction:: dragon::CUDAContext::stream
workspace
#########
.. doxygenfunction:: dragon::CUDAContext::workspace()
workspace
#########
.. doxygenfunction:: dragon::CUDAContext::workspace(int device, int stream)
set_stream
##########
.. doxygenfunction:: dragon::CUDAContext::set_stream
Public Functions
----------------
...@@ -21,6 +80,10 @@ Delete
######
.. doxygenfunction:: dragon::CUDAContext::Delete
DeleteHost
##########
.. doxygenfunction:: dragon::CUDAContext::DeleteHost
FinishDeviceComputation
#######################
.. doxygenfunction:: dragon::CUDAContext::FinishDeviceComputation
...@@ -49,6 +112,10 @@ New
###
.. doxygenfunction:: dragon::CUDAContext::New
NewHost
#######
.. doxygenfunction:: dragon::CUDAContext::NewHost
SwitchToDevice
##############
.. doxygenfunction:: dragon::CUDAContext::SwitchToDevice
...@@ -57,50 +124,6 @@ SynchronizeStream
#################
.. doxygenfunction:: dragon::CUDAContext::SynchronizeStream
cublas_handle
#############
.. doxygenfunction:: dragon::CUDAContext::cublas_handle
cuda_stream
###########
.. doxygenfunction:: dragon::CUDAContext::cuda_stream()
cuda_stream
###########
.. doxygenfunction:: dragon::CUDAContext::cuda_stream(int device, int stream)
cudnn_handle
############
.. doxygenfunction:: dragon::CUDAContext::cudnn_handle
curand_generator
################
.. doxygenfunction:: dragon::CUDAContext::curand_generator
rand_generator
##############
.. doxygenfunction:: dragon::CUDAContext::rand_generator
device
######
.. doxygenfunction:: dragon::CUDAContext::device
set_stream
##########
.. doxygenfunction:: dragon::CUDAContext::set_stream
stream
######
.. doxygenfunction:: dragon::CUDAContext::stream
workspace
#########
.. doxygenfunction:: dragon::CUDAContext::workspace()
workspace
#########
.. doxygenfunction:: dragon::CUDAContext::workspace(int device, int stream)
.. raw:: html
<style>
......
...@@ -8,16 +8,8 @@ Constructors
.. doxygenfunction:: dragon::Graph::Graph(const GraphDef& def, Workspace* ws)
Public Properties
-----------------
Create
######
.. doxygenfunction:: dragon::Graph::Create
Run
###
.. doxygenfunction:: dragon::Graph::Run
arg
###
...@@ -31,14 +23,14 @@ def
###
.. doxygenfunction:: dragon::Graph::def
optimized_def
#############
.. doxygenfunction:: dragon::Graph::optimized_def
name
####
.. doxygenfunction:: dragon::Graph::name
optimized_def
#############
.. doxygenfunction:: dragon::Graph::optimized_def
phase
#####
.. doxygenfunction:: dragon::Graph::phase
...@@ -47,6 +39,17 @@ workspace
#########
.. doxygenfunction:: dragon::Graph::workspace
Public Functions
----------------
Create
######
.. doxygenfunction:: dragon::Graph::Create
Run
###
.. doxygenfunction:: dragon::Graph::Run
.. raw:: html
<style>
......
MPSContext
==========
.. doxygenclass:: dragon::MPSContext
Constructors
------------
.. doxygenfunction:: dragon::MPSContext::MPSContext()
.. doxygenfunction:: dragon::MPSContext::MPSContext(int device)
.. doxygenfunction:: dragon::MPSContext::MPSContext(const DeviceOption &option)
Public Properties
-----------------
current_device
##############
.. doxygenfunction:: dragon::MPSContext::current_device
device
######
.. doxygenfunction:: dragon::MPSContext::device
mps_stream
##########
.. doxygenfunction:: dragon::MPSContext::mps_stream
mutex
#####
.. doxygenfunction:: dragon::MPSContext::mutex
objects
#######
.. doxygenfunction:: dragon::MPSContext::objects
rand_generator
##############
.. doxygenfunction:: dragon::MPSContext::rand_generator
stream
######
.. doxygenfunction:: dragon::MPSContext::stream
workspace
#########
.. doxygenfunction:: dragon::MPSContext::workspace()
workspace
#########
.. doxygenfunction:: dragon::MPSContext::workspace(int device, int stream)
set_stream
##########
.. doxygenfunction:: dragon::MPSContext::set_stream
Public Functions
----------------
Delete
######
.. doxygenfunction:: dragon::MPSContext::Delete
FinishDeviceComputation
#######################
.. doxygenfunction:: dragon::MPSContext::FinishDeviceComputation
Memset
######
.. doxygenfunction:: dragon::MPSContext::Memset
MemsetAsync
###########
.. doxygenfunction:: dragon::MPSContext::MemsetAsync
Memcpy
######
.. doxygenfunction:: dragon::MPSContext::Memcpy(size_t n, void *dest, const void *src)
Memcpy
######
.. doxygenfunction:: dragon::MPSContext::Memcpy(size_t n, void *dest, const void *src, int device)
MemcpyAsync
###########
.. doxygenfunction:: dragon::MPSContext::MemcpyAsync
New
###
.. doxygenfunction:: dragon::MPSContext::New
NewShared
#########
.. doxygenfunction:: dragon::MPSContext::NewShared
NewSharedFromBytes
##################
.. doxygenfunction:: dragon::MPSContext::NewSharedFromBytes
NewSharedFromBuffer
###################
.. doxygenfunction:: dragon::MPSContext::NewSharedFromBuffer
SwitchToDevice
##############
.. doxygenfunction:: dragon::MPSContext::SwitchToDevice
SynchronizeStream
#################
.. doxygenfunction:: dragon::MPSContext::SynchronizeStream
.. raw:: html
<style>
h1:before {
content: "dragon::";
color: #103d3e;
}
</style>
...@@ -8,6 +8,45 @@ Constructors
.. doxygenfunction:: dragon::Operator::Operator(const OperatorDef &def, Workspace *ws)
Public Properties
-----------------
arg
###
.. doxygenfunction:: dragon::Operator::arg
args
####
.. doxygenfunction:: dragon::Operator::args
data_format
###########
.. doxygenfunction:: dragon::Operator::data_format
data_type
#########
.. doxygenfunction:: dragon::Operator::data_type
def
###
.. doxygenfunction:: dragon::Operator::def
name
####
.. doxygenfunction:: dragon::Operator::name
phase
#####
.. doxygenfunction:: dragon::Operator::phase
type
####
.. doxygenfunction:: dragon::Operator::type
workspace
#########
.. doxygenfunction:: dragon::Operator::workspace
Public Functions
----------------
...@@ -63,42 +102,6 @@ Run
###
.. doxygenfunction:: dragon::Operator::Run
arg
###
.. doxygenfunction:: dragon::Operator::arg
args
####
.. doxygenfunction:: dragon::Operator::args
data_format
###########
.. doxygenfunction:: dragon::Operator::data_format
data_type
#########
.. doxygenfunction:: dragon::Operator::data_type
def
###
.. doxygenfunction:: dragon::Operator::def
name
####
.. doxygenfunction:: dragon::Operator::name
type
####
.. doxygenfunction:: dragon::Operator::type
phase
#####
.. doxygenfunction:: dragon::Operator::phase
workspace
#########
.. doxygenfunction:: dragon::Operator::workspace
.. raw:: html
<style>
......
...@@ -12,48 +12,8 @@ Constructors
.. doxygenfunction:: dragon::Tensor::Tensor(const vec32_t &dims)
.. doxygenfunction:: dragon::Tensor::Tensor(const TypeMeta &meta)
Public Properties
-----------------
CopyFrom
########
.. doxygenfunction:: dragon::Tensor::CopyFrom(Tensor &other, Context *ctx)
CopyFrom
########
.. doxygenfunction:: dragon::Tensor::CopyFrom(const vector<VectorType> &other)
CopyTo
######
.. doxygenfunction:: dragon::Tensor::CopyTo
DimString
#########
.. doxygenfunction:: dragon::Tensor::DimString() const
DimString
#########
.. doxygenfunction:: dragon::Tensor::DimString(const vector<int64_t> &dims)
IsType
######
.. doxygenfunction:: dragon::Tensor::IsType
MapFrom
#######
.. doxygenfunction:: dragon::Tensor::MapFrom
Reset
#####
.. doxygenfunction:: dragon::Tensor::Reset
Reshape
#######
.. doxygenfunction:: dragon::Tensor::Reshape
ReshapeLike
###########
.. doxygenfunction:: dragon::Tensor::ReshapeLike
axis
####
...@@ -91,18 +51,6 @@ empty
#####
.. doxygenfunction:: dragon::Tensor::empty
has_memory
##########
.. doxygenfunction:: dragon::Tensor::has_memory
has_name
########
.. doxygenfunction:: dragon::Tensor::has_name
meta
####
.. doxygenfunction:: dragon::Tensor::meta
memory
######
.. doxygenfunction:: dragon::Tensor::memory
...@@ -111,6 +59,10 @@ memory_state
############
.. doxygenfunction:: dragon::Tensor::memory_state
meta
####
.. doxygenfunction:: dragon::Tensor::meta
mutable_data
############
.. doxygenfunction:: dragon::Tensor::mutable_data
...@@ -151,6 +103,57 @@ version
#######
.. doxygenfunction:: dragon::Tensor::version
has_memory
##########
.. doxygenfunction:: dragon::Tensor::has_memory
has_name
########
.. doxygenfunction:: dragon::Tensor::has_name
Public Functions
----------------
CopyFrom
########
.. doxygenfunction:: dragon::Tensor::CopyFrom(Tensor &other, Context *ctx)
CopyFrom
########
.. doxygenfunction:: dragon::Tensor::CopyFrom(const vector<VectorType> &other)
CopyTo
######
.. doxygenfunction:: dragon::Tensor::CopyTo
DimString
#########
.. doxygenfunction:: dragon::Tensor::DimString() const
DimString
#########
.. doxygenfunction:: dragon::Tensor::DimString(const vector<int64_t> &dims)
IsType
######
.. doxygenfunction:: dragon::Tensor::IsType
MapFrom
#######
.. doxygenfunction:: dragon::Tensor::MapFrom
Reset
#####
.. doxygenfunction:: dragon::Tensor::Reset
Reshape
#######
.. doxygenfunction:: dragon::Tensor::Reshape
ReshapeLike
###########
.. doxygenfunction:: dragon::Tensor::ReshapeLike
.. raw:: html
<style>
......
...@@ -9,6 +9,29 @@ Constructors
.. doxygenfunction:: dragon::TypeMeta::TypeMeta()
.. doxygenfunction:: dragon::TypeMeta::TypeMeta(const TypeMeta &src)
Public Properties
-----------------
copy
####
.. doxygenfunction:: dragon::TypeMeta::copy
ctor
####
.. doxygenfunction:: dragon::TypeMeta::ctor
dtor
####
.. doxygenfunction:: dragon::TypeMeta::dtor
id
##
.. doxygenfunction:: dragon::TypeMeta::id
itemsize
########
.. doxygenfunction:: dragon::TypeMeta::itemsize
Public Functions
----------------
...@@ -40,26 +63,6 @@ Match
#####
.. doxygenfunction:: dragon::TypeMeta::Match
copy
####
.. doxygenfunction:: dragon::TypeMeta::copy
ctor
####
.. doxygenfunction:: dragon::TypeMeta::ctor
dtor
####
.. doxygenfunction:: dragon::TypeMeta::dtor
id
##
.. doxygenfunction:: dragon::TypeMeta::id
itemsize
########
.. doxygenfunction:: dragon::TypeMeta::itemsize
.. raw:: html
<style>
......
...@@ -16,20 +16,8 @@ State
#####
.. doxygenenum:: dragon::UnifiedMemory::State
Public Properties
-----------------
SwitchToCUDADevice
##################
.. doxygenfunction:: dragon::UnifiedMemory::SwitchToCUDADevice
ToCPU
#####
.. doxygenfunction:: dragon::UnifiedMemory::ToCPU
ToCUDA
######
.. doxygenfunction:: dragon::UnifiedMemory::ToCUDA
cpu_data
########
...@@ -47,6 +35,26 @@ info
####
.. doxygenfunction:: dragon::UnifiedMemory::info
mps_data
########
.. doxygenfunction:: dragon::UnifiedMemory::mps_data
order
#####
.. doxygenfunction:: dragon::UnifiedMemory::order
size
####
.. doxygenfunction:: dragon::UnifiedMemory::size() const
size
####
.. doxygenfunction:: dragon::UnifiedMemory::size(const string &device_type, int device_id) const
state
#####
.. doxygenfunction:: dragon::UnifiedMemory::state
mutable_cpu_data
################
.. doxygenfunction:: dragon::UnifiedMemory::mutable_cpu_data
...@@ -63,17 +71,32 @@ set_cuda_data
#############
.. doxygenfunction:: dragon::UnifiedMemory::set_cuda_data
set_order
#########
.. doxygenfunction:: dragon::UnifiedMemory::set_order
Public Functions
----------------
SwitchToCUDADevice
##################
.. doxygenfunction:: dragon::UnifiedMemory::SwitchToCUDADevice
SwitchToMPSDevice
##################
.. doxygenfunction:: dragon::UnifiedMemory::SwitchToMPSDevice
ToCPU
##### #####
.. doxygenfunction:: dragon::UnifiedMemory::state .. doxygenfunction:: dragon::UnifiedMemory::ToCPU
ToCUDA
######
.. doxygenfunction:: dragon::UnifiedMemory::ToCUDA
ToMPS
#####
.. doxygenfunction:: dragon::UnifiedMemory::ToMPS
.. raw:: html
......
...@@ -8,6 +8,29 @@ Constructors
.. doxygenfunction:: dragon::Workspace::Workspace(const string &name)
Public Properties
-----------------
data
####
.. doxygenfunction:: dragon::Workspace::data(size_t size, const string &name = "BufferShared")
data
####
.. doxygenfunction:: dragon::Workspace::data(int64_t size, const string &name = "BufferShared")
graphs
######
.. doxygenfunction:: dragon::Workspace::graphs
name
####
.. doxygenfunction:: dragon::Workspace::name
tensors
#######
.. doxygenfunction:: dragon::Workspace::tensors
Public Functions
----------------
...@@ -55,26 +78,6 @@ UniqueName
##########
.. doxygenfunction:: dragon::Workspace::UniqueName
data
####
.. doxygenfunction:: dragon::Workspace::data(size_t size, const string &name = "BufferShared")
data
####
.. doxygenfunction:: dragon::Workspace::data(int64_t size, const string &name = "BufferShared")
graphs
######
.. doxygenfunction:: dragon::Workspace::graphs
name
####
.. doxygenfunction:: dragon::Workspace::name
tensors
#######
.. doxygenfunction:: dragon::Workspace::tensors
.. raw:: html
<style>
......
...@@ -21,9 +21,6 @@ vm.dali.ops
`class Cast <ops/Cast.html>`_
: Cast the data type of input.
`class CGRecordReader <ops/CGRecordReader.html>`_
: Read examples from the cg-record file.
`class CoinFlip <ops/CoinFlip.html>`_
: Sample values from a bernoulli distribution.
...@@ -101,7 +98,6 @@ vm.dali.ops
ops/Brightness
ops/BrightnessContrast
ops/Cast
ops/CGRecordReader
ops/CoinFlip
ops/ColorSpaceConversion
ops/ColorTwist
......
CGRecordReader
===============
.. autoclass:: dragon.vm.dali.ops.CGRecordReader
__init__
--------
.. automethod:: dragon.vm.dali.ops.CGRecordReader.__init__
Methods
-------
example_to_data
###############
.. automethod:: dragon.vm.dali.ops.CGRecordReader.example_to_data
feed_inputs
###########
.. automethod:: dragon.vm.dali.ops.CGRecordReader.feed_inputs
__call__
########
.. automethod:: dragon.vm.dali.ops.CGRecordReader.__call__
.. raw:: html
<style>
h1:before {
content: "dali.ops.";
color: #103d3e;
}
</style>
...@@ -18,6 +18,9 @@ dragon.cuda
`get_device_capability(...) <cuda/get_device_capability.html>`_
: Return the capability of specified device.
`get_device_name(...) <cuda/get_device_name.html>`_
: Return the name of specified device.
`is_available(...) <cuda/is_available.html>`_
: Return a bool reporting if runtime is available.
...@@ -45,6 +48,7 @@ dragon.cuda
cuda/Stream
cuda/current_device
cuda/get_device_capability
cuda/get_device_name
cuda/is_available
cuda/memory_allocated
cuda/set_cublas_flags
......
get_device_name
===============
.. autofunction:: dragon.cuda.get_device_name
.. raw:: html
<style>
h1:before {
content: "dragon.cuda.";
color: #103d3e;
}
</style>
...@@ -12,6 +12,9 @@ vm.torch.cuda
`get_device_capability(...) <cuda/get_device_capability.html>`_
: Return the capability of specified device.
`get_device_name(...) <cuda/get_device_name.html>`_
: Return the name of specified device.
`is_available(...) <cuda/is_available.html>`_
: Return a bool reporting if runtime is available.
...@@ -26,6 +29,7 @@ vm.torch.cuda
cuda/current_device
cuda/get_device_capability
cuda/get_device_name
cuda/is_available
cuda/set_device
cuda/synchronize
......
get_device_name
===============
.. autofunction:: dragon.vm.torch.cuda.get_device_name
.. raw:: html
<style>
h1:before {
content: "torch.cuda.";
color: #103d3e;
}
</style>
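A minimal usage sketch of the mirrored PyTorch-style API (assuming, as in PyTorch, that the function accepts an optional device index):

```python
from dragon.vm import torch

if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # e.g. the name of GPU 0
```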
...@@ -32,7 +32,7 @@ struct DRAGON_API TypeRegister {
};

/*!
 * \brief The meta class for all types.
 *
 * TypeMeta is commonly used for type identification:
 *
......
...@@ -7,16 +7,16 @@ namespace kernels {

namespace {

template <typename T>
__global__ void
_BiasAdd(const int NxC, const int C, const T* x, const T* bias, T* y) {
  const math::PlusFunctor<T> functor;
  CUDA_1D_KERNEL_LOOP(i, NxC) {
    y[i] = functor(x[i], __ldg(bias + i % C));
  }
}

template <typename T>
__global__ void _BiasAdd(
    const int NxCxS,
    const int S,
...@@ -24,43 +24,41 @@ __global__ void _BiasAdd(
    const T* x,
    const T* bias,
    T* y) {
  const math::PlusFunctor<T> functor;
  CUDA_1D_KERNEL_LOOP(i, NxCxS) {
    y[i] = functor(x[i], __ldg(bias + i / S % C));
  }
}

} // namespace

#define DEFINE_KERNEL_LAUNCHER(T)                                            \
  template <>                                                                \
  void BiasAdd<T, CUDAContext>(                                              \
      const int N,                                                           \
      const int S,                                                           \
      const int C,                                                           \
      const T* x,                                                            \
      const T* bias,                                                         \
      T* y,                                                                  \
      CUDAContext* ctx) {                                                    \
    const auto NxCxS = N * C * S;                                            \
    if (S == 1) {                                                            \
      _BiasAdd<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
          NxCxS,                                                             \
          C,                                                                 \
          reinterpret_cast<const math::ScalarType<T>::type*>(x),             \
          reinterpret_cast<const math::ScalarType<T>::type*>(bias),          \
          reinterpret_cast<math::ScalarType<T>::type*>(y));                  \
    } else {                                                                 \
      _BiasAdd<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
          NxCxS,                                                             \
          S,                                                                 \
          C,                                                                 \
          reinterpret_cast<const math::ScalarType<T>::type*>(x),             \
          reinterpret_cast<const math::ScalarType<T>::type*>(bias),          \
          reinterpret_cast<math::ScalarType<T>::type*>(y));                  \
    }                                                                        \
  }

DEFINE_KERNEL_LAUNCHER(uint8_t);
......
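The two kernels above implement a channel-wise bias broadcast over an NCHW-style layout; below is a minimal NumPy sketch of the same semantics (the shapes are illustrative assumptions):

```python
import numpy as np

N, C, S = 2, 3, 4  # batch, channels, flattened spatial size
x = np.random.randn(N, C, S).astype('float32')
bias = np.random.randn(C).astype('float32')

# Flattened view matching the CUDA indexing: the bias index is (i / S) % C.
y_flat = x.reshape(-1) + bias[(np.arange(N * C * S) // S) % C]

# Equivalent broadcasted form; the S == 1 case reduces to the first (NxC) kernel.
y = x + bias[None, :, None]
assert np.allclose(y_flat.reshape(N, C, S), y)
```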
#include "dragon/kernels/vision/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
const static string METAL_SHADERS = R"(
#include <metal_stdlib>
using namespace metal;
constant uint uint_arg1 [[function_constant(0)]]; // C
constant uint uint_arg2 [[function_constant(1)]]; // S
template <typename T>
kernel void BiasAdd(
device const T* x,
device const T* bias,
device T* y,
const uint index [[thread_position_in_grid]]) {
y[index] = x[index] + bias[index % uint_arg1];
}
template <typename T>
kernel void SpatialBiasAdd(
device const T* x,
device const T* bias,
device T* y,
const uint index [[thread_position_in_grid]]) {
y[index] = x[index] + bias[index / uint_arg2 % uint_arg1];
}
#define INSTANTIATE_KERNEL(name, T) \
template [[host_name(#name"_"#T)]] \
kernel void name(device const T*, device const T*, device T*, uint);
INSTANTIATE_KERNEL(BiasAdd, half);
INSTANTIATE_KERNEL(BiasAdd, float);
INSTANTIATE_KERNEL(SpatialBiasAdd, half);
INSTANTIATE_KERNEL(SpatialBiasAdd, float);
#if defined(__HAVE_NATIVE_DOUBLE__)
INSTANTIATE_KERNEL(BiasAdd, double);
INSTANTIATE_KERNEL(SpatialBiasAdd, double);
#endif // defined(__HAVE_NATIVE_DOUBLE__)
#undef INSTANTIATE_KERNEL
)";
} // namespace
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void BiasAdd<T, MPSContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* bias, \
T* y, \
MPSContext* ctx) { \
const uint arg1 = C, arg2 = S; \
auto kernel = MPSKernel::TypedString<T>("BiasAdd"); \
vector<MPSConstant> args({MPSConstant(&arg1, MTLDataTypeUInt, 0)}); \
MTLComputePipelineState_t pso = nil; \
if (S == 1) { \
pso = MPSKernel(kernel, METAL_SHADERS).GetState(ctx, args); \
} else { \
args.emplace_back(MPSConstant(&arg2, MTLDataTypeUInt, 1)); \
pso = MPSKernel("Spatial" + kernel, METAL_SHADERS).GetState(ctx, args); \
} \
auto* command_buffer = ctx->mps_stream()->command_buffer(); \
auto* encoder = [command_buffer computeCommandEncoder]; \
[encoder setComputePipelineState:pso]; \
[encoder setBuffer:id<MTLBuffer>(x) offset:0 atIndex:0]; \
[encoder setBuffer:id<MTLBuffer>(bias) offset:0 atIndex:1]; \
[encoder setBuffer:id<MTLBuffer>(y) offset:0 atIndex:2]; \
MPSDispatchThreads((N * C * S), encoder, pso); \
[encoder endEncoding]; \
[encoder release]; \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
...@@ -10,11 +10,11 @@ const static string METAL_SHADERS = R"(
#include <metal_stdlib>
using namespace metal;

constant int int_arg1 [[function_constant(0)]]; // C
constant int int_arg2 [[function_constant(1)]]; // H
constant int int_arg3 [[function_constant(2)]]; // W
constant int int_arg4 [[function_constant(3)]]; // out_h
constant int int_arg5 [[function_constant(4)]]; // out_w
constant float float_arg1 [[function_constant(5)]]; // spatial_scale
constant int int_arg6 [[function_constant(6)]]; // sampling_ratio
constant bool bool_arg1 [[function_constant(7)]]; // aligned
......
...@@ -31,11 +31,6 @@ void BiasAddOp<Context>::DoRunWithType() {
}

template <class Context>
void BiasAddOp<Context>::RunOnDevice() {
  DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}

template <class Context>
template <typename T>
void BiasAddGradientOp<Context>::DoRunWithType() {
  auto &dY = Input(0), *dX = Output(0), *dB = Output(1);
...@@ -67,20 +62,16 @@ void BiasAddGradientOp<Context>::DoRunWithType() {
  }
}

template <class Context>
void BiasAddGradientOp<Context>::RunOnDevice() {
  DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}

DEPLOY_CPU_OPERATOR(BiasAdd);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(BiasAdd);
#endif

DEPLOY_CPU_OPERATOR(BiasAddGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(BiasAdd);
DEPLOY_CUDA_OPERATOR(BiasAddGradient);
#endif

#ifdef USE_MPS
DEPLOY_MPS_OPERATOR(BiasAdd, BiasAdd);
DEPLOY_MPS_OPERATOR(BiasAddGradient, BiasAddGradient);
#endif

OPERATOR_SCHEMA(BiasAdd)
    /* X, B */
......
...@@ -23,7 +23,9 @@ class BiasAddOp final : public Operator<Context> {
  SIMPLE_CTOR_DTOR(BiasAddOp);
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override {
    DispatchHelper<dtypes::Floating>::Call(this, Input(0));
  }

  template <typename T>
  void DoRunWithType();
...@@ -35,7 +37,9 @@ class BiasAddGradientOp final : public Operator<Context> {
  SIMPLE_CTOR_DTOR(BiasAddGradientOp);
  USE_OPERATOR_FUNCTIONS;

  void RunOnDevice() override {
    DispatchHelper<dtypes::Floating>::Call(this, Input(0));
  }

  template <typename T>
  void DoRunWithType();
...@@ -58,7 +62,9 @@ class CuDNNBiasAddGradientOp final : public Operator<Context> {
    CuDNNDestroyTensorDesc(&input_desc_);
  }

  void RunOnDevice() override {
    DispatchHelper<dtypes::Floating>::Call(this, Input(0));
  }

  template <typename T>
  void DoRunWithType();
......
...@@ -39,11 +39,6 @@ void CuDNNBiasAddGradientOp<Context>::DoRunWithType() {
  }
}

template <class Context>
void CuDNNBiasAddGradientOp<Context>::RunOnDevice() {
  DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}

DEPLOY_CUDNN_OPERATOR(BiasAddGradient);

} // namespace dragon
......
...@@ -22,6 +22,7 @@ try:
    import onnx
except ImportError:
    onnx = None
from packaging.version import parse as version_parse

from dragon.core.autograph import context as eager_context
from dragon.core.autograph.graph_lib import GraphLib
...@@ -50,6 +51,9 @@ class DragonFrontend(object):
        (12, '1.7.0'),
        (13, '1.8.0'),
        (14, '1.9.0'),
        (15, '1.10.0'),
        (16, '1.11.0'),
        (17, '1.12.0'),
    ])

    @classmethod
...@@ -254,9 +258,9 @@ class DragonFrontend(object):
                detail_msg += ' * Opset = %d, ONNX >= %s,\n' % (k, v)
            raise ValueError(detail_msg + '}')
        onnx_version = cls.OPSET_VERSIONS[opset_version]
        if version_parse(onnx.__version__) < version_parse(onnx_version):
            raise RuntimeError(
                'OpSet {} requires ONNX version >= {} '
                '({} currently installed.)'
                .format(opset_version, onnx_version, onnx.__version__))
        return opset_version
......
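A quick note on the switch to `packaging.version.parse` above: comparing version strings lexicographically breaks once a component reaches two digits, so a plain string comparison would wrongly reject ONNX 1.10+ for older opsets. A minimal sketch:

```python
from packaging.version import parse as version_parse

# Lexicographic string comparison gets multi-digit components wrong.
assert ('1.9.0' < '1.10.0') is False                      # string compare: incorrect ordering
assert version_parse('1.9.0') < version_parse('1.10.0')   # semantic compare: correct
```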
...@@ -8,6 +8,7 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Python setup script."""

from __future__ import absolute_import
from __future__ import division
...@@ -81,7 +82,6 @@ class BuildPyCommand(setuptools.command.build_py.build_py):
    """Enhanced 'build_py' command."""

    def build_packages(self):
        clean_builds()
        shutil.copytree('dragon/python', self.build_lib + '/dragon')
        shutil.copytree('dali', self.build_lib + '/dragon/vm/dali')
        shutil.copytree('keras', self.build_lib + '/dragon/vm/keras')
......
...@@ -3988,6 +3988,11 @@ class TestVisionOps(OpTestCase):
        with dragon.device('cuda'):
            self.test_bias_add()

    @unittest.skipIf(not TEST_MPS, 'MPS unavailable')
    def test_bias_add_mps(self):
        with dragon.device('mps'):
            self.test_bias_add()

    def test_conv1d(self, prec=1e-3, test_nhwc=True):
        entries = [((2, 2, 2), (3, 2, 1), (3,), 1, 1, 0, 1, 1, 'NCHW'),
                   ((2, 2, 2), (3, 2, 3), (3,), 3, 1, 1, 1, 1, 'NCHW'),
......