From 06faa506bf6c99fcb20241eaece16e578ac4c893 Mon Sep 17 00:00:00 2001
From: vmoens
Date: Thu, 9 Nov 2023 09:42:03 -0500
Subject: [PATCH 1/5] init

---
 tutorials/sphinx-tutorials/dqn_with_rnn.py | 163 +++++++++++---------
 tutorials/sphinx-tutorials/pendulum.py     | 165 +++++++++++----------
 2 files changed, 179 insertions(+), 149 deletions(-)

diff --git a/tutorials/sphinx-tutorials/dqn_with_rnn.py b/tutorials/sphinx-tutorials/dqn_with_rnn.py
index 6fbc8218ffb..14470617eef 100644
--- a/tutorials/sphinx-tutorials/dqn_with_rnn.py
+++ b/tutorials/sphinx-tutorials/dqn_with_rnn.py
@@ -1,49 +1,66 @@
 # -*- coding: utf-8 -*-
+
 """
 Recurrent DQN: Training recurrent policies
 ==========================================
 **Author**: `Vincent Moens `_
 
-Memory-based policies are crucial not only when the observations are partially
-observable but also when the time dimension must be taken into account to
-make informed decisions.
-
-Recurrent neural network have long been a popular tool for memory-based
-policies. The idea is to keep a recurrent state in memory between two
-consecutive steps, and use this as an input to the policy along with the
-current observation.
-
-This tutorial shows how to incorporate an RNN in a policy.
-
-Key learnings:
-
-- Incorporating an RNN in an actor in TorchRL;
-- Using that memory-based policy with a replay buffer and a loss module.
+.. grid:: 2

-The core idea of using RNNs in TorchRL is to use TensorDict as a data carrier
-for the hidden states from one step to another. We'll build a policy that
-reads the previous recurrent state from the current tensordict, and writes the
-current recurrent states in the tensordict of the next state:
+    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn

-.. figure:: /_static/img/rollout_recurrent.png
-  :alt: Data collection with a recurrent policy
+       * How to incorporate an RNN in an actor in TorchRL
+       * How to use that memory-based policy with a replay buffer and a loss module

-As this figure shows, our env populates the tensordict with zeroed recurrent
-states which are read by the policy together with the observation to produce an
-action, and recurrent states that will be used for the next step.
-When the :func:`torchrl.envs.step_mdp` function is called, the recurrent states
-from the next state are brought to the current tensordict. Let's see how this
-is implemented in practice.
+    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites

+       * PyTorch v2.0.0
+       * gym[mujoco]
+       * tqdm
 """
+
+#########################################################################
+# Overview
+# --------
+#
+# Memory-based policies are crucial not only when the observations are partially
+# observable but also when the time dimension must be taken into account to
+# make informed decisions.
+#
+# Recurrent neural networks have long been a popular tool for memory-based
+# policies. The idea is to keep a recurrent state in memory between two
+# consecutive steps, and use this as an input to the policy along with the
+# current observation.
+#
+# This tutorial shows how to incorporate an RNN in a policy using TorchRL.
+#
+# Key learnings:
+#
+# - Incorporating an RNN in an actor in TorchRL;
+# - Using that memory-based policy with a replay buffer and a loss module.
+#
+# The core idea of using RNNs in TorchRL is to use TensorDict as a data carrier
+# for the hidden states from one step to another.
We'll build a policy that +# reads the previous recurrent state from the current TensorDict, and writes the +# current recurrent states in the TensorDict of the next state: +# +# .. figure:: /_static/img/rollout_recurrent.png +# :alt: Data collection with a recurrent policy +# +# As this figure shows, our environment populates the TensorDict with zeroed recurrent +# states which are read by the policy together with the observation to produce an +# action, and recurrent states that will be used for the next step. +# When the :func:`~torchrl.envs.utils.step_mdp` function is called, the recurrent states +# from the next state are brought to the current TensorDict. Let's see how this +# is implemented in practice. + ###################################################################### # If you are running this in Google Colab, make sure you install the following dependencies: # # .. code-block:: bash # -# !pip3 install torchrl-nightly +# !pip3 install torchrl # !pip3 install gym[mujoco] # !pip3 install tqdm # @@ -87,18 +104,18 @@ # 84x84, scaling down the rewards and normalizing the observations. # # .. note:: -# The :class:`torchrl.envs.StepCounter` transform is accessory. Since the CartPole +# The :class:`~torchrl.envs.transforms.StepCounter` transform is accessory. Since the CartPole # task goal is to make trajectories as long as possible, counting the steps # can help us track the performance of our policy. # # Two transforms are important for the purpose of this tutorial: # -# - :class:`torchrl.envs.InitTracker` will stamp the -# calls to :meth:`torchrl.envs.EnvBase.reset` by adding a ``"is_init"`` -# boolean mask in the tensordict that will track which steps require a reset +# - :class:`~torchrl.envs.transforms.InitTracker` will stamp the +# calls to :meth:`~torchrl.envs.EnvBase.reset` by adding a ``"is_init"`` +# boolean mask in the TensorDict that will track which steps require a reset # of the RNN hidden states. -# - The :class:`torchrl.envs.TensorDictPrimer` transform is a bit more -# technical: per se, it is not required to use RNN policies. However, it +# - The :class:`~torchrl.envs.transforms.TensorDictPrimer` transform is a bit more +# technical. It is not required to use RNN policies. However, it # instructs the environment (and subsequently the collector) that some extra # keys are to be expected. Once added, a call to `env.reset()` will populate # the entries indicated in the primer with zeroed tensors. Knowing that @@ -110,7 +127,7 @@ # the training of our policy, but it will make the recurrent keys disappear # from the collected data and the replay buffer, which will in turn lead to # a slightly less optimal training. -# Fortunately, the :class:`torchrl.modules.LSTMModule` we propose is +# Fortunately, the :class:`~torchrl.modules.LSTMModule` we propose is # equipped with a helper method to build just that transform for us, so # we can wait until we build it! 
# @@ -127,6 +144,7 @@ ObservationNorm(standard_normal=True, in_keys=["pixels"]), ), ) + ###################################################################### # As always, we need to initialize manually our normalization constants: # @@ -137,16 +155,16 @@ # Policy # ------ # -# Our policy will have 3 components: a :class:`torchrl.modules.ConvNet` -# backbone, an :class:`torchrl.modules.LSTMModule` memory layer and a shallow -# :class:`torchrl.modules.MLP` block that will map the LSTM output onto the +# Our policy will have 3 components: a :class:`~torchrl.modules.ConvNet` +# backbone, an :class:`~torchrl.modules.LSTMModule` memory layer and a shallow +# :class:`~torchrl.modules.MLP` block that will map the LSTM output onto the # action values. # # Convolutional network # ~~~~~~~~~~~~~~~~~~~~~ # -# We build a convolutional network flanked with a :class:torch.nn.AdaptiveAvgPool2d` -# that will squash the output in a vector of size 64. The :class:`torchrl.modules.ConvNet` +# We build a convolutional network flanked with a :class:`torch.nn.AdaptiveAvgPool2d` +# that will squash the output in a vector of size 64. The :class:`~torchrl.modules.ConvNet` # can assist us with this: # @@ -171,11 +189,11 @@ # LSTM Module # ~~~~~~~~~~~ # -# TorchRL provides a specialized :class:`torchrl.modules.LSTMModule` class -# to incorporate LSTMs in your code-base. It is a :class:`tensordict.nn.TensorDictModuleBase` +# TorchRL provides a specialized :class:`~torchrl.modules.LSTMModule` class +# to incorporate LSTMs in your code-base. It is a :class:`~tensordict.nn.TensorDictModuleBase` # subclass: as such, it has a set of ``in_keys`` and ``out_keys`` that indicate # what values should be expected to be read and written/updated during the -# execution of the module. The class comes with customizable pre-defined +# execution of the module. The class comes with customizable predefined # values for these attributes to facilitate its construction. # # .. note:: @@ -183,8 +201,8 @@ # dropout or multi-layered LSTMs. # However, to respect TorchRL's conventions, this LSTM must have the ``batch_first`` # attribute set to ``True`` which is **not** the default in PyTorch. However, -# our :class:`torchrl.modules.LSTMModule` changes this default -# behaviour so we're good with a native call. +# our :class:`~torchrl.modules.LSTMModule` changes this default +# behavior, so we're good with a native call. # # Also, the LSTM cannot have a ``bidirectional`` attribute set to ``True`` as # this wouldn't be usable in online settings. In this case, the default value @@ -200,28 +218,28 @@ ) ###################################################################### -# Let us look at the lstm class, specifically its in and out_keys: +# Let us look at the LSTM Module class, specifically its in and out_keys: print("in_keys", lstm.in_keys) print("out_keys", lstm.out_keys) ###################################################################### # We can see that these values contain the key we indicated as the in_key (and out_key) # as well as recurrent key names. The out_keys are preceded by a "next" prefix -# that indicates that they will need to be written in the "next" tensordict. +# that indicates that they will need to be written in the "next" TensorDict. 
# We use this convention (which can be overridden by passing the in_keys/out_keys -# arguments) to make sure that a call to :func:`torchrl.envs.step_mdp` will -# move the recurrent state to the root tensordict, making it available to the +# arguments) to make sure that a call to :func:`~torchrl.envs.utils.step_mdp` will +# move the recurrent state to the root TensorDict, making it available to the # RNN during the following call (see figure in the intro). # # As mentioned earlier, we have one more optional transform to add to our # environment to make sure that the recurrent states are passed to the buffer. -# The :meth:`torchrl.modules.LSTMModule.make_tensordict_primer` method does +# The :meth:`~torchrl.modules.LSTMModule.make_tensordict_primer` method does # exactly that: # env.append_transform(lstm.make_tensordict_primer()) ###################################################################### -# and that's it! We can print the env to check that everything looks good now +# and that's it! We can print the environment to check that everything looks good now # that we have added the primer: print(env) @@ -249,7 +267,8 @@ # Using the Q-Values to select an action # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# The last part of our policy is the Q-Value Module. The Q-Value module :class:`torchrl.modules.QValueModule` +# The last part of our policy is the Q-Value Module. +# The Q-Value module :class:`~torchrl.modules.tensordict_module.QValueModule` # will read the ``"action_values"`` key that is produced by our MLP and # from it, gather the action that has the maximum value. # The only thing we need to do is to specify the action space, which can be done @@ -261,19 +280,20 @@ ###################################################################### # .. note:: # TorchRL also provides a wrapper class :class:`torchrl.modules.QValueActor` that -# wraps a module in a Sequential together with a :class:`torchrl.modules.QValueModule` +# wraps a module in a Sequential together with a :class:`~torchrl.modules.tensordict_module.QValueModule` # like we are doing explicitly here. There is little advantage to do this # and the process is less transparent, but the end results will be similar to # what we do here. # -# We can now put things together in a :class:`tensordict.nn.TensorDictSequential` +# We can now put things together in a :class:`~tensordict.nn.TensorDictSequential` # stoch_policy = Seq(feature, lstm, mlp, qval) ###################################################################### # DQN being a deterministic algorithm, exploration is a crucial part of it. # We'll be using an :math:`\epsilon`-greedy policy with an epsilon of 0.2 decaying -# progressively to 0. This decay is achieved via a call to :meth:`torchrl.modules.EGreedyWrapper.step` +# progressively to 0. +# This decay is achieved via a call to :meth:`~torchrl.modules.EGreedyWrapper.step` # (see training loop below). # stoch_policy = EGreedyWrapper( @@ -291,7 +311,7 @@ # To use it, we just need to tell the LSTM module to run on "recurrent-mode" # when used by the loss. # As we'll usually want to have two copies of the LSTM module, we do this by -# calling a :meth:`torchrl.modules.LSTMModule.set_recurrent_mode` method that +# calling a :meth:`~torchrl.modules.LSTMModule.set_recurrent_mode` method that # will return a new instance of the LSTM (with shared weights) that will # assume that the input data is sequential in nature. # @@ -309,7 +329,7 @@ # # Out DQN loss requires us to pass the policy and, again, the action-space. 
# While this may seem redundant, it is important as we want to make sure that -# the :class:`torchrl.objectives.DQNLoss` and the :class:`torchrl.modules.QValueModule` +# the :class:`~torchrl.objectives.DQNLoss` and the :class:`~torchrl.modules.tensordict_module.QValueModule` # classes are compatible, but aren't strongly dependent on each other. # # To use the Double-DQN, we ask for a ``delay_value`` argument that will @@ -319,7 +339,7 @@ ###################################################################### # Since we are using a double DQN, we need to update the target parameters. -# We'll use a :class:`torchrl.objectives.SoftUpdate` instance to carry out +# We'll use a :class:`~torchrl.objectives.SoftUpdate` instance to carry out # this work. # updater = SoftUpdate(loss_fn, eps=0.95) @@ -335,7 +355,7 @@ # will be designed to store 20 thousands trajectories of 50 steps each. # At each optimization step (16 per data collection), we'll collect 4 items # from our buffer, for a total of 200 transitions. -# We'll use a :class:`torchrl.data.LazyMemmapStorage` storage to keep the data +# We'll use a :class:`~torchrl.data.replay_buffers.LazyMemmapStorage` storage to keep the data # on disk. # # .. note:: @@ -374,7 +394,7 @@ # it is important to pass data that is not flattened rb.extend(data.unsqueeze(0).to_tensordict().cpu()) for _ in range(utd): - s = rb.sample().to(device) + s = rb.sample().to(device, non_blocking=True) loss_vals = loss_fn(s) loss_vals["loss"].backward() optim.step() @@ -386,10 +406,9 @@ stoch_policy.step(data.numel()) updater.step() - if i % 50 == 0: - with set_exploration_type(ExplorationType.MODE), torch.no_grad(): - rollout = env.rollout(10000, stoch_policy) - traj_lens.append(rollout.get(("next", "step_count")).max().item()) + with set_exploration_type(ExplorationType.MODE), torch.no_grad(): + rollout = env.rollout(10000, stoch_policy) + traj_lens.append(rollout.get(("next", "step_count")).max().item()) ###################################################################### # Let's plot our results: @@ -405,14 +424,18 @@ # Conclusion # ---------- # -# We have seen how an RNN can be incorporated in a policy in torchrl. +# We have seen how an RNN can be incorporated in a policy in TorchRL. # You should now be able: # -# - To create an LSTM module that acts as a TensorDictModule; -# - How to indicate to the LSTMModule that a reset is needed via an :class:`torchrl.envs.InitTracker` -# transform. -# - Incorporate this module in a policy and in a loss module; +# - Create an LSTM module that acts as a :class:`~tensordict.nn.TensorDictModule` +# - Indicate to the LSTM module that a reset is needed via an :class:`~torchrl.envs.transforms.InitTracker` +# transform +# - Incorporate this module in a policy and in a loss module # - Make sure that the collector is made aware of the recurrent state entries # such that they can be stored in the replay buffer along with the rest of -# the data. +# the data +# +# Further Reading +# --------------- # +# - The TorchRL documentation can be found `here `_. diff --git a/tutorials/sphinx-tutorials/pendulum.py b/tutorials/sphinx-tutorials/pendulum.py index 2190ff9f4b8..84fdd6b6539 100644 --- a/tutorials/sphinx-tutorials/pendulum.py +++ b/tutorials/sphinx-tutorials/pendulum.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- + """ Pendulum: Writing your environment and transforms with TorchRL ============================================================== @@ -9,34 +10,35 @@ is an integrative part of reinforcement learning and control engineering. 
TorchRL provides a set of tools to do this in multiple contexts. -This tutorial demonstrates how to use PyTorch and ``torchrl`` code a pendulum +This tutorial demonstrates how to use PyTorch and TorchRL code a pendulum simulator from the ground up. It is freely inspired by the Pendulum-v1 implementation from `OpenAI-Gym/Farama-Gymnasium control library `__. .. figure:: /_static/img/pendulum.gif :alt: Pendulum + :align: center Simple Pendulum Key learnings: - How to design an environment in TorchRL: - - Writing specs (input, observation and reward); - - Implementing behaviour: seeding, reset and step. + - Implementing behavior: seeding, reset and step. - Transforming your environment inputs and outputs, and writing your own transforms; -- How to use :class:`tensordict.TensorDict` to carry arbitrary data structures - from step to step. +- How to use :class:`~tensordict.TensorDict` to carry arbitrary data structures + through the ``codebase``. -In the process, we will touch three crucial components of TorchRL: + In the process, we will touch three crucial components of TorchRL: * `environments `__ * `transforms `__ * `models (policy and value function) `__ """ + ###################################################################### # To give a sense of what can be achieved with TorchRL's environments, we will # be designing a *stateless* environment. While stateful environments keep track of @@ -44,32 +46,32 @@ # transition, stateless environments expect the current state to be provided to # them at each step, along with the action undertaken. TorchRL supports both # types of environments, but stateless environments are more generic and hence -# cover a broader range of features of the environment API in torchrl. +# cover a broader range of features of the environment API in TorchRL. # -# Modelling stateless environments gives users full control over the input and -# outputs of the simulator: one can reset an experiment at any stage. It also -# assumes that we have some control over a task, which may not always be the -# case: solving a problem where we cannot control the current state is more -# challenging but has a much wider set of applications. +# Modeling stateless environments gives users full control over the input and +# outputs of the simulator: one can reset an experiment at any stage or actively +# modify the dynamics from the outside. However, it assumes that we have some control +# over a task, which may not always be the case: solving a problem where we cannot +# control the current state is more challenging but has a much wider set of applications. # # Another advantage of stateless environments is that they can enable # batched execution of transition simulations. If the backend and the # implementation allow it, an algebraic operation can be executed seamlessly on -# scalars, vectors or tensors. This tutorial gives such examples. +# scalars, vectors, or tensors. This tutorial gives such examples. # # This tutorial will be structured as follows: # # * We will first get acquainted with the environment properties: -# its shape (``batch_size``), its methods (mainly :meth:`EnvBase.step`, -# :meth:`EnvBase.reset` and :meth:`EnvBase.set_seed`) +# its shape (``batch_size``), its methods (mainly :meth:`~torchrl.envs.EnvBase.step`, +# :meth:`~torchrl.envs.EnvBase.reset` and :meth:`~torchrl.envs.EnvBase.set_seed`) # and finally its specs. # * After having coded our simulator, we will demonstrate how it can be used # during training with transforms. 
# * We will explore new avenues that follow from the TorchRL's API, # including: the possibility of transforming inputs, the vectorized execution -# of the simulation and the possibility of backpropagating through the +# of the simulation and the possibility of backpropagation through the # simulation graph. -# * Finally, will train a simple policy to solve the system we implemented. +# * Finally, we will train a simple policy to solve the system we implemented. # from collections import defaultdict from typing import Optional @@ -96,7 +98,7 @@ DEFAULT_Y = 1.0 ###################################################################### -# There are four things one must take care of when designing a new environment +# There are four things you must take care of when designing a new environment # class: # # * :meth:`EnvBase._reset`, which codes for the resetting of the simulator @@ -106,7 +108,7 @@ # * the environment specs. # # Let us first describe the problem at hand: we would like to model a simple -# pendulum, over which we can control the torque applied on its fixed point. +# pendulum over which we can control the torque applied on its fixed point. # Our goal is to place the pendulum in upward position (angular position at 0 # by convention) and having it standing still in that position. # To design our dynamic system, we need to define two equations: the motion @@ -147,25 +149,25 @@ # method that receives a :class:`tensordict.TensorDict` # instance with an ``"action"`` entry indicating what action is to be taken. # -# To facilitate the reading and writing from that tensordict and to make sure +# To facilitate the reading and writing from that ``tensordict`` and to make sure # that the keys are consistent with what's expected from the library, the # simulation part has been delegated to a private abstract method :meth:`_step` -# which reads input data from a tensordict, and writes a *new* tensordict +# which reads input data from a ``tensordict``, and writes a *new* ``tensordict`` # with the output data. # # The :func:`_step` method should do the following: # -# 1. read the input keys (such as ``"action"``) and execute the simulation +# 1. Read the input keys (such as ``"action"``) and execute the simulation # based on these; -# 2. retrieve observations, done state and reward; -# 3. write the set of observation value along with the reward and done state +# 2. Retrieve observations, done state and reward; +# 3. Write the set of observation values along with the reward and done state # at the corresponding entries in a new :class:`TensorDict`. # # Next, the :meth:`~torchrl.envs.EnvBase.step` method will merge the output -# of :meth:`~torchrl.envs.EnvBase.step` in the input tensordict to enforce +# of :meth:`~torchrl.envs.EnvBase.step` in the input ``tensordict`` to enforce # input/output consistency. # -# Typically, for stateful environments, this will look like +# Typically, for stateful environments, this will look like this: # # .. code-block:: # @@ -198,11 +200,11 @@ # device=cpu, # is_shared=False) # -# Notice that the root tensordict has not changed, the only modification is the +# Notice that the root ``tensordict`` has not changed, the only modification is the # appearance of a new ``"next"`` entry that contains the new information. 
# # In the Pendulum example, our :meth:`_step` method will read the relevant -# entries from the input tensordict and compute the position and velocity of +# entries from the input ``tensordict`` and compute the position and velocity of # the pendulum after the force encoded by the ``"action"`` key has been applied # onto it. We compute the new angular position of the pendulum # ``"new_th"`` as the result of the previous position ``"th"`` plus the new @@ -219,7 +221,6 @@ # needed as the state needs to be read from the environment. # - def _step(tensordict): th, thdot = tensordict["th"], tensordict["thdot"] # th := theta @@ -265,24 +266,24 @@ def angle_normalize(x): # The second method we need to care about is the # :meth:`~torchrl.envs.EnvBase._reset` method. Like # :meth:`~torchrl.envs.EnvBase._step`, it should write the observation entries -# and possibly a done state in the tensordict it outputs (if the done state is +# and possibly a done state in the ``tensordict`` it outputs (if the done state is # omitted, it will be filled as ``False`` by the parent method # :meth:`~torchrl.envs.EnvBase.reset`). In some contexts, it is required that # the ``_reset`` method receives a command from the function that called -# it (e.g. in multi-agent settings we may want to indicate which agents need +# it (for example, in multi-agent settings we may want to indicate which agents need # to be reset). This is why the :meth:`~torchrl.envs.EnvBase._reset` method -# also expects a tensordict as input, albeit it may perfectly be empty or +# also expects a ``tensordict`` as input, albeit it may perfectly be empty or # ``None``. # # The parent :meth:`EnvBase.reset` does some simple checks like the # :meth:`EnvBase.step` does, such as making sure that a ``"done"`` state -# is returned in the output tensordict and that the shapes match what is +# is returned in the output ``tensordict`` and that the shapes match what is # expected from the specs. # # For us, the only important thing to consider is whether # :meth:`EnvBase._reset` contains all the expected observations. Once more, # since we are working with a stateless environment, we pass the configuration -# of the pendulum in a nested tensordict named ``"params"``. +# of the pendulum in a nested ``tensordict`` named ``"params"``. # # In this example, we do not pass a done state as this is not mandatory # for :meth:`_reset` and our environment is non-terminating, so we always @@ -292,8 +293,8 @@ def angle_normalize(x): def _reset(self, tensordict): if tensordict is None or tensordict.is_empty(): - # if no tensordict is passed, we generate a single set of hyperparameters - # Otherwise, we assume that the input tensordict contains all the relevant + # if no ``tensordict`` is passed, we generate a single set of hyperparameters + # Otherwise, we assume that the input ``tensordict`` contains all the relevant # parameters to get started. tensordict = self.gen_params(batch_size=self.batch_size) @@ -302,7 +303,7 @@ def _reset(self, tensordict): low_th = -high_th low_thdot = -high_thdot - # for non batch-locked envs, the input tensordict shape dictates the number + # for non batch-locked environments, the input ``tensordict`` shape dictates the number # of simulators run simultaneously. In other contexts, the initial # random state's shape will depend upon the environment batch-size instead. th = ( @@ -344,12 +345,12 @@ def _reset(self, tensordict): # instance where each key is an observation (a :class:`CompositeSpec` can be # viewed as a dictionary of specs). 
# * :obj:`EnvBase.action_spec`: It can be any type of spec, but it is required -# that it corresponds to the ``"action"`` entry in the input tensordict; +# that it corresponds to the ``"action"`` entry in the input ``tensordict``; # * :obj:`EnvBase.reward_spec`: provides information about the reward space; # * :obj:`EnvBase.done_spec`: provides information about the space of the done # flag. # -# TorchRL specs are organised in two general containers: ``input_spec`` which +# TorchRL specs are organized in two general containers: ``input_spec`` which # contains the specs of the information that the step function reads (divided # between ``action_spec`` containing the action and ``state_spec`` containing # all the rest), and ``output_spec`` which encodes the specs that the @@ -357,7 +358,7 @@ def _reset(self, tensordict): # In general, you should not interact directly with ``output_spec`` and # ``input_spec`` but only with their content: ``observation_spec``, # ``reward_spec``, ``done_spec``, ``action_spec`` and ``state_spec``. -# The reason if that the specs are organised in a non-trivial way +# The reason if that the specs are organized in a non-trivial way # within ``output_spec`` and # ``input_spec`` and neither of these should be directly modified. # @@ -377,8 +378,8 @@ def _reset(self, tensordict): # the expected input and output shapes. This is something that should be # accurately coded in stateful settings. # -# For non batch-locked environments such as the one in our example (see below), -# this is irrelevant as the environment batch-size will most likely be empty. +# For non batch-locked environments, such as the one in our example (see below), +# this is irrelevant as the environment batch size will most likely be empty. # @@ -397,13 +398,13 @@ def _make_spec(self, td_params): shape=(), dtype=torch.float32, ), - # we need to add the "params" to the observation specs, as we want + # we need to add the ``params`` to the observation specs, as we want # to pass it at each step during a rollout params=make_composite_from_td(td_params["params"]), shape=(), ) # since the environment is stateless, we expect the previous output as input. - # For this, EnvBase expects some state_spec to be available + # For this, ``EnvBase`` expects some state_spec to be available self.state_spec = self.observation_spec.clone() # action-spec will be automatically wrapped in input_spec when # `self.action_spec = spec` will be called supported @@ -417,7 +418,7 @@ def _make_spec(self, td_params): def make_composite_from_td(td): - # custom funtion to convert a tensordict in a similar spec structure + # custom function to convert a ``tensordict`` in a similar spec structure # of unbounded values. composite = CompositeSpec( { @@ -438,8 +439,8 @@ def make_composite_from_td(td): # --------------------------------- # # Seeding an environment is a common operation when initializing an experiment. -# :func:`EnvBase._set_seed` only goal is to set the seed of the contained -# simulator. If possible, this operation should not call `reset()` or interact +# The only goal of :func:`EnvBase._set_seed` is to set the seed of the contained +# simulator. If possible, this operation should not call ``reset()`` or interact # with the environment execution. The parent :func:`EnvBase.set_seed` method # incorporates a mechanism that allows seeding multiple environments with a # different pseudo-random and reproducible seed. 
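The body of ``_set_seed`` is not reproduced in the hunks of this patch (only its signature appears in the next hunk header), so here is a minimal sketch of what such a method can look like for this stateless pendulum. It assumes that the only source of randomness is torch's default generator, and the attribute name ``self.rng`` is illustrative; ``torch`` and ``Optional`` are already imported earlier in the tutorial.

def _set_seed(self, seed: Optional[int]):
    # Seed torch's global generator and keep a handle on it. As explained
    # above, this should neither reset nor step the environment.
    rng = torch.manual_seed(seed)
    self.rng = rng
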
@@ -460,13 +461,13 @@ def _set_seed(self, seed: Optional[int]): # construction, so we must take care of calling the :func:`_make_spec` method # within :func:`PendulumEnv.__init__`. # -# We add a static method :func:`PendulumEnv.gen_params` which deterministically +# We add a static method :meth:`PendulumEnv.gen_params` which deterministically # generates a set of hyperparameters to be used during execution: # def gen_params(g=10.0, batch_size=None) -> TensorDictBase: - """Returns a tensordict containing the physical parameters such as gravitational force and torque or speed limits.""" + """Returns a ``tensordict`` containing the physical parameters such as gravitational force and torque or speed limits.""" if batch_size is None: batch_size = [] td = TensorDict( @@ -491,9 +492,9 @@ def gen_params(g=10.0, batch_size=None) -> TensorDictBase: ###################################################################### -# We define the environment as non-``batch_locked`` by turning the homonymous +# We define the environment as non-``batch_locked`` by turning the ``homonymous`` # attribute to ``False``. This means that we will **not** enforce the input -# tensordict to have a batch-size that matches the one of the environment. +# ``tensordict`` to have a ``batch-size`` that matches the one of the environment. # # The following code will just put together the pieces we have coded above. # @@ -557,8 +558,8 @@ def __init__(self, td_params=None, seed=None, device="cpu"): ###################################################################### # We can run the :func:`env.rand_step` to generate -# an action randomly from the ``action_spec`` domain. A tensordict containing -# the hyperparams and the current state **must** be passed since our +# an action randomly from the ``action_spec`` domain. A ``tensordict`` containing +# the hyperparameters and the current state **must** be passed since our # environment is stateless. In stateful contexts, ``env.rand_step()`` works # perfectly too. # @@ -572,18 +573,18 @@ def __init__(self, td_params=None, seed=None, device="cpu"): # Writing environment transforms for stateless simulators is slightly more # complicated than for stateful ones: transforming an output entry that needs # to be read at the following iteration requires to apply the inverse transform -# before calling :func:`env.step` at the next step. -# This is an ideal scenario to showcase all the features of torchrl's +# before calling :func:`meth.step` at the next step. +# This is an ideal scenario to showcase all the features of TorchRL's # transforms! # -# For instance, in the following transformed environment we unsqueeze the entries +# For instance, in the following transformed environment we ``unsqueeze`` the entries # ``["th", "thdot"]`` to be able to stack them along the last # dimension. We also pass them as ``in_keys_inv`` to squeeze them back to their # original shape once they are passed as input in the next iteration. # env = TransformedEnv( env, - # Unsqueezes the observations that we will concatenate + # ``Unsqueeze`` the observations that we will concatenate UnsqueezeTransform( unsqueeze_dim=-1, in_keys=["th", "thdot"], @@ -604,26 +605,32 @@ def __init__(self, td_params=None, seed=None, device="cpu"): # - Adapting the environment specs. # # A transform can be used in two settings: on its own, it can be used as a -# :class:`torch.nn.Module`. It can also be used appended to a -# :class:`~torchrl.envs.TransformedEnv`. 
The structure of the class allows to -# customize the behaviour in the different contexts. +# :class:`~torch.nn.Module`. It can also be used appended to a +# :class:`~torchrl.envs.transforms.TransformedEnv`. The structure of the class allows to +# customize the behavior in the different contexts. # -# A :class:`~torchrl.envs.Transform` skeleton can be summarized as follows: +# A :class:`~torchrl.envs.transforms.Transform` skeleton can be summarized as follows: # # .. code-block:: # # class Transform(nn.Module): # def forward(self, tensordict): +# ... # def _apply_transform(self, tensordict): +# ... # def _step(self, tensordict): +# ... # def _call(self, tensordict): +# ... # def inv(self, tensordict): +# ... # def _inv_apply_transform(self, tensordict): +# ... # # There are three entry points (:func:`forward`, :func:`_step` and :func:`inv`) # which all receive :class:`tensordict.TensorDict` instances. The first two -# will eventually go through the keys indicated by :obj:`Transform.in_keys` -# and call :func:`Transform._apply_transform` to each of these. The results will +# will eventually go through the keys indicated by :obj:`~tochrl.envs.transforms.Transform.in_keys` +# and call :meth:`~torchrl.envs.transforms.Transform._apply_transform` to each of these. The results will # be written in the entries pointed by :obj:`Transform.out_keys` if provided # (if not the ``in_keys`` will be updated with the transformed values). # If inverse transforms need to be executed, a similar data flow will be @@ -633,13 +640,12 @@ def __init__(self, td_params=None, seed=None, device="cpu"): # The following figure summarized this flow for environments and replay # buffers. # -# .. figure:: /_static/img/transforms.png # # Transform API # # In some cases, a transform will not work on a subset of keys in a unitary # manner, but will execute some operation on the parent environment or -# work with the entire input tensordict. +# work with the entire input ``tensordict``. # In those cases, the :func:`_call` and :func:`forward` methods should be # re-written, and the :func:`_apply_transform` method can be skipped. # @@ -703,7 +709,7 @@ def transform_observation_spec(self, observation_spec): ###################################################################### # Concatenates the observations onto an "observation" entry. -# del_keys=False ensures that we keep these values for the next +# ``del_keys=False`` ensures that we keep these values for the next # iteration. cat_transform = CatTensors( in_keys=["sin", "cos", "thdot"], dim=-1, out_key="observation", del_keys=False @@ -711,7 +717,7 @@ def transform_observation_spec(self, observation_spec): env.append_transform(cat_transform) ###################################################################### -# Once more, let us check that our env specs match what is received: +# Once more, let us check that our environment specs match what is received: check_env_specs(env) ###################################################################### @@ -726,11 +732,11 @@ def transform_observation_spec(self, observation_spec): # * compute an action given a policy # * execute a step given this action # * collect the data -# * make a MDP step +# * make a ``MDP`` step # # * gather the data and return # -# These operations have been convinently wrapped in the :func:`EnvBase.rollout` +# These operations have been conveniently wrapped in the :meth:`~torchrl.envs.EnvBase.rollout` # method, from which we provide a simplified version here below. 
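The full body of ``simple_rollout`` sits outside the hunks reproduced in this patch, so the following is a rough sketch of the loop described by the list above, not the tutorial's exact code. The helper name ``naive_rollout`` is made up for illustration; it assumes the ``env`` built earlier in the tutorial and ``step_mdp`` from ``torchrl.envs.utils``.

import torch
from torchrl.envs.utils import step_mdp

def naive_rollout(env, steps=100):
    # reset; for the stateless pendulum this also draws a fresh set of params
    _data = env.reset()
    stack = []
    for _ in range(steps):
        # pick an action (a policy module could be called here instead)
        _data["action"] = env.action_spec.rand()
        # one simulation step: the results are written under the "next" entry
        _data = env.step(_data)
        stack.append(_data.clone())
        # bring the "next" values (and the params we carry along) back to the
        # root so they are available at the following iteration
        _data = step_mdp(_data, keep_other=True)
    # stacking the TensorDicts gives the whole trajectory
    return torch.stack(stack)
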
@@ -758,7 +764,7 @@ def simple_rollout(steps=100): # make any assumptions regarding the input data shape, we can seamlessly # execute it over batches of data. Even better: for non-batch-locked # environments such as our Pendulum, we can change the batch size on the fly -# without recreating the env. +# without recreating the environment. # To do this, we just generate parameters with the desired shape. # @@ -769,9 +775,9 @@ def simple_rollout(steps=100): print("rand step (batch size of 10)", td) ###################################################################### -# executing a rollout with a batch of data requires us to reset the env +# Executing a rollout with a batch of data requires us to reset the environment # out of the rollout function, since we need to define the batch_size -# dynamically and this is not supported by :func:`EnvBase.rollout`: +# dynamically and this is not supported by :meth:`~torchrl.envs.EnvBase.rollout`: # rollout = env.rollout( @@ -787,12 +793,12 @@ def simple_rollout(steps=100): # ------------------------ # # In this example, we will train a simple policy using the reward as a -# differentiable objective (i.e. a negative loss). +# differentiable objective, such as a negative loss. # We will take advantage of the fact that our dynamic system is fully # differentiable to backpropagate through the trajectory return and adjust the -# weights of our policy to maximise this value directly. Of course, in many +# weights of our policy to maximize this value directly. Of course, in many # settings many of the assumptions we make do not hold, such as -# differentiability of the system and full access to the underlying mechanics. +# differentiable system and full access to the underlying mechanics. # # Still, this is a very simple example that showcases how a training loop can # be coded with a custom environment in TorchRL. @@ -886,6 +892,7 @@ def plot(): plot() + ###################################################################### # Conclusion # ---------- @@ -893,11 +900,11 @@ def plot(): # In this tutorial, we have learned how to code a stateless environment from # scratch. We touched the subjects of: # -# * the four essential components that need to be taken care of when coding -# an environment (:func:`step`, :func:`reset", seeding and building specs). +# * The four essential components that need to be taken care of when coding +# an environment (``step``, ``reset``, seeding and building specs). 
# We saw how these methods and classes interact with the -# :class:`tensordict.TensorDict` class; -# * how to test that an environment is properly coded using +# :class:`~tensordict.TensorDict` class; +# * How to test that an environment is properly coded using # :func:`~torchrl.envs.utils.check_env_specs`; # * How to append transforms in the context of stateless environments and how # to write custom transformations; From d4351d336af4937f1f6197e688a0c1a260c5c12c Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 9 Nov 2023 10:39:18 -0500 Subject: [PATCH 2/5] sphinx_design --- docs/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index 8bb409ff326..9c316eb0554 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,6 +2,7 @@ matplotlib numpy sphinx-copybutton sphinx-gallery +sphinx_design sphinx===5.0.0 Jinja2==3.1.2 sphinx-autodoc-typehints From c8b386eed251d5156d90a1462328f5829988a974 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 9 Nov 2023 15:26:39 -0500 Subject: [PATCH 3/5] lint --- tutorials/sphinx-tutorials/pendulum.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tutorials/sphinx-tutorials/pendulum.py b/tutorials/sphinx-tutorials/pendulum.py index 84fdd6b6539..889c9616a2b 100644 --- a/tutorials/sphinx-tutorials/pendulum.py +++ b/tutorials/sphinx-tutorials/pendulum.py @@ -221,6 +221,7 @@ # needed as the state needs to be read from the environment. # + def _step(tensordict): th, thdot = tensordict["th"], tensordict["thdot"] # th := theta From 171789a90d471305e00678e301b358d5ed7faee5 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 9 Nov 2023 15:28:49 -0500 Subject: [PATCH 4/5] extension += ['sphinx_design'] --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 00acf6b67ed..aef0044f0bb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -69,6 +69,7 @@ "sphinx_gallery.gen_gallery", "sphinxcontrib.aafig", "myst_parser", + 'sphinx_design' ] intersphinx_mapping = { From ca7aa4f7f9ceb00d904819b2a4eb372eff1d8ee2 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 9 Nov 2023 16:19:57 -0500 Subject: [PATCH 5/5] amend --- docs/requirements.txt | 2 +- docs/source/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 9c316eb0554..1b043c07daf 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,7 +2,6 @@ matplotlib numpy sphinx-copybutton sphinx-gallery -sphinx_design sphinx===5.0.0 Jinja2==3.1.2 sphinx-autodoc-typehints @@ -12,6 +11,7 @@ sphinxcontrib-htmlhelp -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme myst-parser docutils +sphinx_design torchvision dm_control diff --git a/docs/source/conf.py b/docs/source/conf.py index aef0044f0bb..f0821ede0bf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -69,7 +69,7 @@ "sphinx_gallery.gen_gallery", "sphinxcontrib.aafig", "myst_parser", - 'sphinx_design' + "sphinx_design", ] intersphinx_mapping = {