Skip to content

Commit 51bbf63

Browse files
Chillee authored and pytorchmergebot committed
Improved legalize_graph pass in FX (#82874)
Pull Request resolved: #82874 Approved by: https://github.com/jamesr66a
1 parent 4f255db commit 51bbf63

File tree

2 files changed

+31
-40
lines changed

2 files changed

+31
-40
lines changed

functorch/functorch/_src/partitioners.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,12 +277,12 @@ def classify_nodes(joint_module):
277277

278278
pointwise_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward] # noqa: E501
279279
if compiler == "inductor":
280-
pointwise_ops += [prims.div, prims.convert_element_type, aten.sign, aten.clone] # noqa: E501
280+
pointwise_ops += [prims.div, prims.convert_element_type, aten.sign, aten.clone, aten._to_copy] # noqa: E501
281281
misc_ops = [aten.to, aten.type_as, operator.getitem]
282282

283283
reduction_ops = [aten.softmax, aten._softmax, aten._softmax_backward_data, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax] # noqa: E501
284284
if compiler == "inductor":
285-
reduction_ops += [prims.var, prims.sum, aten.var]
285+
reduction_ops += [prims.var, prims.sum, aten.var, aten.std]
286286

287287
# not recomputed by default since these are kinda expensive/hard to fuse into
288288
# norm_ops = [aten.instance_norm, aten._batch_norm_impl_index, aten.native_batch_norm, aten.batch_norm, aten._batch_norm_impl_index_backward, aten.native_layer_norm, aten.layer_norm, aten.native_layer_norm_backward] # noqa: E501

torch/fx/passes/tools_common.py

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import List, Tuple, Union, Dict, Any, Set, Mapping
2+
import collections
23
from dataclasses import dataclass
34

45
import torch
@@ -209,7 +210,7 @@ def __call__(self) -> Dict[torch.fx.Node, NodeSet]:
209210

210211

211212
@compatibility(is_backward_compatible=False)
def legalize_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
    """
    Replace the graph of the given GraphModule with one that contains the same nodes as the
    original, but in topologically sorted order.

    This is used by the merge_matmul transformation below, which disturbs the topologically sorted
    order of its input GraphModule, so that this order is restored before further transformation.

    Arguments:
        gm: The graph module to topologically sort. It is modified in-place.

    Returns:
        The graph module in-place sorted
    """
    # Kahn's algorithm: count, for every node, how many of its inputs have
    # not yet been emitted into the sorted graph.
    pending_deps = {node: 0 for node in gm.graph.nodes}
    sorted_graph = torch.fx.Graph()
    for node in gm.graph.nodes:
        for consumer in node.users:
            pending_deps[consumer] += 1

    # Seed the worklist with every node that has no unfulfilled dependencies.
    ready: collections.deque = collections.deque(
        node for node in gm.graph.nodes if pending_deps[node] == 0
    )

    # Maps original nodes to their copies in the new graph, so copied nodes
    # can rewire their inputs to the already-copied producers.
    remapped: Dict[torch.fx.Node, torch.fx.Node] = {}

    # Emit ready nodes one at a time; each emission may unblock its consumers.
    while ready:
        original = ready.popleft()
        remapped[original] = sorted_graph.node_copy(original, lambda n: remapped[n])
        for consumer in original.users:
            pending_deps[consumer] -= 1
            if pending_deps[consumer] == 0:
                ready.append(consumer)

    # If the new graph's size is not as large as the old one, then there must be
    # a cycle (i.e. some node's dependencies were not satisfied.)
    if len(sorted_graph.nodes) < len(gm.graph.nodes):
        raise RuntimeError(f"Input graph has cycles, unable to add {[node for node in pending_deps if pending_deps[node] != 0]}")
    gm.graph = sorted_graph
    return gm

0 commit comments

Comments
 (0)