diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index f62ad49123..2818dfa6d3 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -23,6 +23,16 @@ if (GIT_FOUND) OUTPUT_VARIABLE CURRENT_RELEASE OUTPUT_STRIP_TRAILING_WHITESPACE ) + set(CURRENT_COMMIT "") + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --tags + RESULT_VARIABLE result + OUTPUT_VARIABLE CURRENT_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if (NOT "${CURRENT_RELEASE}" STREQUAL "${CURRENT_COMMIT}") + set(CURRENT_RELEASE "master") + endif () endif (GIT_FOUND) set(DOXYGEN_INPUT_DIR ../dpctl-capi) diff --git a/docs/README.md b/docs/README.md index 7c14af8a0c..73bead65f5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,10 +4,10 @@ What? Generator scripts for dpCtl API documentation. To run these scripts, follow the following steps: -`mkdir build` -`cd build` -`cmake -DDPCTL_DOCGEN_PREFIX=` -`make Sphinx` +`mkdir build`
+`cd build`
+`cmake -DDPCTL_DOCGEN_PREFIX=<path/to/output/dir>`
+`make Sphinx`
The `DPCTL_DOCGEN_PREFIX` flag is optional and can be omitted to generate the documents in the current source directory in a sub-directory called diff --git a/docs/dpCtl.dptensor_api.rst b/docs/dpCtl.dptensor_api.rst new file mode 100644 index 0000000000..51cebf7c05 --- /dev/null +++ b/docs/dpCtl.dptensor_api.rst @@ -0,0 +1,8 @@ +.. _dpCtl.dptensor_api: + +######################### +dpCtl dptensor Python API +######################### + +.. automodule:: dpctl.dptensor + :members: diff --git a/docs/dpCtl.program_api.rst b/docs/dpCtl.program_api.rst new file mode 100644 index 0000000000..c6163e26c5 --- /dev/null +++ b/docs/dpCtl.program_api.rst @@ -0,0 +1,8 @@ +.. _dpCtl.program_api: + +######################## +dpCtl Program Python API +######################## + +.. automodule:: dpctl.program + :members: diff --git a/docs/index.rst b/docs/index.rst index bf9a77efda..6bc50c3f16 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,10 +19,9 @@ Indices and tables * :ref:`search` .. toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: Contents: self - dpCtl Python API - dpctl.memory Python API + toc_pyapi api/dpCtl-CAPI_root diff --git a/docs/toc_pyapi.rst b/docs/toc_pyapi.rst new file mode 100644 index 0000000000..2750aa41ab --- /dev/null +++ b/docs/toc_pyapi.rst @@ -0,0 +1,10 @@ +Python API +================ + +.. toctree:: + :maxdepth: 1 + + dpctl - SYCL runtime wrapper classes and queue manager + dpctl.memory - USM memory manager + dpctl.dptensor - Data-parallel tensor containers + dpctl.program - Program manager diff --git a/dpctl/__init__.py b/dpctl/__init__.py index d453eeeb45..97277188f5 100644 --- a/dpctl/__init__.py +++ b/dpctl/__init__.py @@ -1,5 +1,4 @@ -# ===---------------- __init__.py - dpctl module -------*- Cython -*--------===# -# +# ===-----------------------------------------------------------------------===# # Data Parallel Control (dpCtl) # # Copyright 2020 Intel Corporation @@ -17,11 +16,7 @@ # limitations under the License. 
# # ===-----------------------------------------------------------------------===# -# -# \file -# The top-level dpctl module. -# -# ===-----------------------------------------------------------------------===# + """ **Data Parallel Control (dpCtl)** @@ -31,8 +26,14 @@ a common runtime to manage specific SYCL resources, such as devices and USM memory, for SYCL-based Python packages and extension modules. - Currently, dpCtl has two main features: a global SYCL queue manager - and a USM memory manager. + The main features presently provided by dpCtl are: + + * A SYCL queue manager exposed directly inside the top-level `dpctl` + module. + * A USM memory manager (`dpctl.memory`) that provides Python objects + implementing the Python buffer protocol using USM shared and USM host + allocators. The memory manager also exposes various utility functions + to wrap SYCL's USM allocators, deallocators, `memcpy` functions, *etc.* """ __author__ = "Intel Corp." diff --git a/dpctl/_sycl_core.pyx b/dpctl/_sycl_core.pyx index 997ec0edef..f163090403 100644 --- a/dpctl/_sycl_core.pyx +++ b/dpctl/_sycl_core.pyx @@ -303,7 +303,7 @@ cdef class SyclQueue: """ @staticmethod - cdef SyclQueue _create (DPCTLSyclQueueRef qref): + cdef SyclQueue _create(DPCTLSyclQueueRef qref): if qref is NULL: raise SyclQueueCreationError("Queue creation failed.") cdef SyclQueue ret = SyclQueue.__new__(SyclQueue) @@ -605,7 +605,7 @@ cdef class _SyclRTManager: cdef dict _backend_enum_ty_dict cdef dict _device_enum_ty_dict - def __cinit__ (self): + def __cinit__(self): self._backend_str_ty_dict = { "opencl" : _backend_type._OPENCL, @@ -627,7 +627,7 @@ cdef class _SyclRTManager: device_type.gpu : _device_type._GPU, } - def _set_as_current_queue (self, backend_ty, device_ty, device_id): + def _set_as_current_queue(self, backend_ty, device_ty, device_id): cdef DPCTLSyclQueueRef queue_ref try : @@ -642,45 +642,47 @@ cdef class _SyclRTManager: raise UnsupportedBackendError("Backend can only be opencl or " "level-0") 
- def _remove_current_queue (self): + def _remove_current_queue(self): DPCTLQueueMgr_PopQueue() - def dump (self): + def dump(self): """ Prints information about the Runtime object. """ DPCTLPlatform_DumpInfo() - def print_available_backends (self): - """ Prints the available backends. + def print_available_backends(self): + """ Prints the available SYCL backends. """ print(self._backend_str_ty_dict.keys()) - cpdef get_current_backend (self): - """ Returns the backend for the current queue as `backend_type` enum + cpdef get_current_backend(self): + """ Returns the backend for the current queue as a `backend_type` enum """ return self.get_current_queue().get_sycl_backend() - cpdef get_current_device_type (self): - """ Returns current device type as `device_type` enum + cpdef get_current_device_type(self): + """ Returns current device type as a `device_type` enum """ return self.get_current_queue().get_sycl_device().get_device_type() - cpdef SyclQueue get_current_queue (self): - """ Returns the activated SYCL queue as a PyCapsule. + cpdef SyclQueue get_current_queue(self): + """ Returns the currently activate SYCL queue as a new SyclQueue object. + If there are no active queues then a SyclQueueCreationError exception is + raised. """ return SyclQueue._create(DPCTLQueueMgr_GetCurrentQueue()) - def get_num_activated_queues (self): - """ Return the number of currently activated queues for this thread. + def get_num_activated_queues(self): + """ Returns the number of currently activated queues for this thread. """ return DPCTLQueueMgr_GetNumActivatedQueues() - def get_num_platforms (self): + def get_num_platforms(self): """ Returns the number of available non-host SYCL platforms. 
""" return DPCTLPlatform_GetNumNonHostPlatforms() - def get_num_queues (self, backend_ty, device_ty): + def get_num_queues(self, backend_ty, device_ty): cdef size_t num = 0 try : beTy = self._backend_enum_ty_dict[backend_ty] @@ -699,7 +701,7 @@ cdef class _SyclRTManager: return num - def has_gpu_queues (self, backend_ty=backend_type.opencl): + def has_gpu_queues(self, backend_ty=backend_type.opencl): cdef size_t num = 0 try : beTy = self._backend_enum_ty_dict[backend_ty] @@ -714,7 +716,7 @@ cdef class _SyclRTManager: else: return False - def has_cpu_queues (self, backend_ty=backend_type.opencl): + def has_cpu_queues(self, backend_ty=backend_type.opencl): cdef size_t num = 0 try : beTy = self._backend_enum_ty_dict[backend_ty] @@ -729,21 +731,21 @@ cdef class _SyclRTManager: else: return False - def has_sycl_platforms (self): + def has_sycl_platforms(self): cdef size_t num_platforms = DPCTLPlatform_GetNumNonHostPlatforms() if num_platforms: return True else: return False - def is_in_device_context (self): + def is_in_device_context(self): cdef size_t num = DPCTLQueueMgr_GetNumActivatedQueues() if num: return True else: return False - def set_default_queue (self, backend_ty, device_ty, device_id): + def set_default_queue(self, backend_ty, device_ty, device_id): cdef DPCTLSyclQueueRef ret try : if isinstance(backend_ty, str): @@ -785,8 +787,17 @@ set_default_queue = _mgr.set_default_queue is_in_device_context = _mgr.is_in_device_context cpdef SyclQueue get_current_queue(): - """ - Obtain current Sycl Queue from Data Parallel Control package. + """ Returns the currently activate SYCL queue as a new SyclQueue object. + + Returns: + SyclQueue: If there is a currently active SYCL queue that queue + is returned wrapped in a SyclQueue object. The SyclQueue object + owns a copy of the currently active SYCL queue as an opaque + `DPCTLSyclQueueRef` pointer. The pointer is freed when the SyclQueue + is garbage collected. 
+ + Raises: + SyclQueueCreationError: If no currently active SYCL queue found. """ return _mgr.get_current_queue() @@ -805,7 +816,7 @@ cpdef get_current_backend(): from contextlib import contextmanager @contextmanager -def device_context (str queue_str="opencl:gpu:0"): +def device_context(str queue_str="opencl:gpu:0"): """ The SYCL queue defined by the "backend:device type:device id" tuple is set as the currently active queue, *i.e.*, a subsequent call to diff --git a/dpctl/dptensor/__init__.py b/dpctl/dptensor/__init__.py index c7695fcd4f..e427b82718 100644 --- a/dpctl/dptensor/__init__.py +++ b/dpctl/dptensor/__init__.py @@ -1 +1,32 @@ +# ===-----------------------------------------------------------------------===# +# Data Parallel Control (dpCtl) +# +# Copyright 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===-----------------------------------------------------------------------===# +""" + **Data Parallel Tensor Collection** + + `dpctl.dptensor` is an experimental collection of tensor implementations + that will implement future Python data API (https://data-apis.github.io/array-api/latest/). + + Available tensor implementations: + + * `numpy_usm_shared`: Provides a `numpy.ndarray` sub-class whose \ + underlying memory buffer is allocated with a USM shared memory allocator. 
+ +""" + import dpctl.dptensor.numpy_usm_shared diff --git a/dpctl/dptensor/numpy_usm_shared.py b/dpctl/dptensor/numpy_usm_shared.py index 2c790bc1dd..ff7a5315e4 100644 --- a/dpctl/dptensor/numpy_usm_shared.py +++ b/dpctl/dptensor/numpy_usm_shared.py @@ -190,8 +190,8 @@ def __array_finalize__(self, obj): return # When called in new-from-template, `obj` is another instance of our own # subclass, that we might use to update the new `self` instance. - # However, when called from view casting, `obj` can be an instance of any - # subclass of ndarray, including our own. + # However, when called from view casting, `obj` can be an instance of + # any subclass of ndarray, including our own. if hasattr(obj, array_interface_property): return for ext_checker in ndarray.external_usm_checkers: @@ -204,14 +204,16 @@ def __array_finalize__(self, obj): return ob = ob.base - # Just raise an exception since __array_ufunc__ makes all reasonable cases not - # need the code below. + # Just raise an exception since __array_ufunc__ makes all + # reasonable cases not need the code below. raise ValueError( - "Non-USM allocated ndarray can not viewed as a USM-allocated one without a copy" + "Non-USM allocated ndarray can not viewed as a USM-allocated \ + one without a copy" ) - # Tell Numba to not treat this type just like a NumPy ndarray but to propagate its type. - # This way it will use the custom numpy_usm_shared allocator. + # Tell Numba to not treat this type just like a NumPy ndarray but to + # propagate its type. This way it will use the custom numpy_usm_shared + # allocator. __numba_no_subtype_ndarray__ = True # Convert to a NumPy ndarray. @@ -257,8 +259,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): out_as_np = np.ndarray(out.shape, out.dtype, out) kwargs["out"] = out_as_np else: - # If they manually gave numpy_usm_shared as out kwarg then we have to also - # cast as regular NumPy ndarray to avoid recursion. 
+ # If they manually gave numpy_usm_shared as out kwarg then we + # have to also cast as regular NumPy ndarray to avoid recursion. if isinstance(kwargs["out"], ndarray): out = kwargs["out"] kwargs["out"] = np.ndarray(out.shape, out.dtype, out) @@ -282,7 +284,8 @@ def isdef(x): cname = c[0] if isdef(cname): continue - # For now we do the simple thing and copy the types from NumPy module into numpy_usm_shared module. + # For now we do the simple thing and copy the types from NumPy module + # into numpy_usm_shared module. new_func = "%s = np.%s" % (cname, cname) try: the_code = compile(new_func, "__init__", "exec")