From 79594cddb372c117157b2ae0569fcf3086d6b056 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 09:47:27 +0200 Subject: [PATCH 01/29] update TODO --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f0f1a5c..77c1dd8 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ Currently Pyan always operates at the level of individual functions and methods; ## TODO + - Calling the name of a known class should add a uses edge to its `__init__()` - Make the analyzer understand `del name` (probably seen as `isinstance(node.ctx, ast.Del)` in `visit_Name()`, `visit_Attribute()`) - Prefix methods by class name in the graph; create a legend for annotations. See the discussion [here](https://github.com/johnyf/pyan/issues/4). - Improve the wildcard resolution mechanism, see discussion [here](https://github.com/johnyf/pyan/issues/5). From eb041a9a45e2b54cbf397d5be8f2f5f27f089281 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 09:48:50 +0200 Subject: [PATCH 02/29] bugfix: prevent crash if super() called, or inherited attributes looked up, with no known bases --- pyan/analyzer.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 91e15f0..d1e96a3 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -30,6 +30,16 @@ from .node import Node # # Namespaces also get a Node (with no associated AST node). +def head(lst): + if len(lst): + return lst[0] + +def tail(lst): + if len(lst) > 1: + return lst[1:] + else: + return [] + def get_module_name(filename): """Try to determine the full module name of a source file, by figuring out if its directory looks like a package (i.e. 
has an __init__.py file).""" @@ -206,15 +216,6 @@ class CallGraphVisitor(ast.NodeVisitor): # https://en.wikipedia.org/wiki/C3_linearization#Description - def head(lst): - if len(lst): - return lst[0] - def tail(lst): - if len(lst) > 1: - return lst[1:] - else: - return [] - class LinearizationImpossible(Exception): pass @@ -692,7 +693,7 @@ class CallGraphVisitor(ast.NodeVisitor): # after self.mro has been populated) # if obj_node in self.mro: - for base_node in self.mro[obj_node][1:]: # the first element is always obj itself + for base_node in tail(self.mro[obj_node]): # the first element is always obj itself ns = base_node.get_name() value_node = lookup(ns) if value_node is not None: @@ -996,9 +997,12 @@ class CallGraphVisitor(ast.NodeVisitor): # This is a limitation of pure lexical scope based static # code analysis. # - result = self.mro[class_node][1] - self.logger.debug("super of %s is %s" % (class_node, result)) - return result + if len(self.mro[class_node]) > 1: + result = self.mro[class_node][1] + self.logger.debug("super of %s is %s" % (class_node, result)) + return result + else: + self.logger.info("super called for %s, but no known bases" % (class_node)) # add implementations for other built-in funcnames here if needed def visit_Call(self, node): From 05eafab802f4a0a6bcd6c86181d98539e991c761 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 09:50:55 +0200 Subject: [PATCH 03/29] refactor: add get_parent_node(); also fix a crash from an improperly implemented local copy of the parent node getter --- pyan/analyzer.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index d1e96a3..3a44b95 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -1213,6 +1213,14 @@ class CallGraphVisitor(ast.NodeVisitor): return n + def get_parent_node(self, node): + """Get the parent node of the given Node. (Used in postprocessing.)""" + if '.' 
in node.namespace: + ns,name = node.namespace.rsplit('.', 1) + else: + ns,name = '',node.namespace + return self.get_node(ns, name, None) + def associate_node(self, graph_node, ast_node, filename=None): """Change the AST node (and optionally filename) mapping of a graph node. @@ -1414,16 +1422,8 @@ class CallGraphVisitor(ast.NodeVisitor): inherited = False for n3 in self.uses_edges[n]: if n3.name == n2.name and n2.namespace is not None and n3.namespace is not None and n3.namespace != n2.namespace: - if '.' in n2.namespace: - nsp2,p2 = n2.namespace.rsplit('.', 1) - else: - nsp2,p2 = '',n2.namespace - if '.' in n3.namespace: - nsp3,p3 = n3.namespace.rsplit('.', 1) - else: - nsp3,p3 = '',n3.namespace - pn2 = self.get_node(nsp2, p2, None) - pn3 = self.get_node(nsp3, p3, None) + pn2 = self.get_parent_node(n2) + pn3 = self.get_parent_node(n3) if pn2 in self.uses_edges and pn3 in self.uses_edges[pn2]: # remove the first edge W to X.name # if pn3 in self.uses_edges and pn2 in self.uses_edges[pn3]: # remove the second edge W to Y.name (TODO: add an option to choose this) inherited = True @@ -1451,9 +1451,9 @@ class CallGraphVisitor(ast.NodeVisitor): for name in self.nodes: if name in ('lambda', 'listcomp', 'setcomp', 'dictcomp', 'genexpr'): for n in self.nodes[name]: - nsp,p = n.namespace.rsplit('.', 1) # parent - pn = self.get_node(nsp, p, None) - for n2 in self.uses_edges[n]: # outgoing uses edges - self.logger.info("Collapsing inner from %s to %s, uses %s" % (n, pn, n2)) - self.add_uses_edge(pn, n2) + pn = self.get_parent_node(n) + if n in self.uses_edges: + for n2 in self.uses_edges[n]: # outgoing uses edges + self.logger.info("Collapsing inner from %s to %s, uses %s" % (n, pn, n2)) + self.add_uses_edge(pn, n2) n.defined = False From 7df34d4a17aae39115ec0ea4b58699a8a58ff048 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 09:52:05 +0200 Subject: [PATCH 04/29] enh: figure out which names correspond to arguments in FunctionDef, prevent leakage of 
matching names from the enclosing scope (fixes some spurious edges) --- pyan/analyzer.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 3a44b95..c995d24 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -428,11 +428,34 @@ class CallGraphVisitor(ast.NodeVisitor): # name representing "self" if this is a method definition. self_name = self.analyze_functiondef(node) + # Enter the function scope + # self.name_stack.append(node.name) inner_ns = self.get_current_namespace().get_name() self.scope_stack.append(self.scopes[inner_ns]) self.context_stack.append("FunctionDef %s" % (node.name)) + # Capture which names correspond to function args. + # + # In the function scope, set them to a nonsense Node, + # to prevent leakage of identifiers of matching name + # from the enclosing scope (due to the local value being None). + # + # As the name of the nonsense node, we can use any string that + # is not a valid Python identifier. + # + sc = self.scopes[inner_ns] + nonsense_node = self.get_node(inner_ns, '^^^argument^^^', None) + all_args = node.args # args, vararg (*args), kwonlyargs, kwarg (**kwargs) + for a in all_args.args: # positional + sc.defs[a.arg] = nonsense_node + if all_args.vararg is not None: # *args if present + sc.defs[all_args.vararg] = nonsense_node + for a in all_args.kwonlyargs: + sc.defs[a.arg] = nonsense_node + if all_args.kwarg is not None: # **kwargs if present + sc.defs[all_args.kwarg] = nonsense_node + # self_name is just an ordinary name in the method namespace, except # that its value is implicitly set by Python when the method is called. 
# @@ -454,6 +477,8 @@ class CallGraphVisitor(ast.NodeVisitor): for stmt in node.body: self.visit(stmt) + # Exit the function scope + # self.context_stack.pop() self.scope_stack.pop() self.name_stack.pop() From 5b5424ea589b47e01ff9a9ff975015170378ec3f Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 10:06:30 +0200 Subject: [PATCH 05/29] enh: analyze instantiation: add a uses edge to MyClass.__init__ for a call to MyClass() --- README.md | 1 - pyan/analyzer.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 77c1dd8..f0f1a5c 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,6 @@ Currently Pyan always operates at the level of individual functions and methods; ## TODO - - Calling the name of a known class should add a uses edge to its `__init__()` - Make the analyzer understand `del name` (probably seen as `isinstance(node.ctx, ast.Del)` in `visit_Name()`, `visit_Attribute()`) - Prefix methods by class name in the graph; create a legend for annotations. See the discussion [here](https://github.com/johnyf/pyan/issues/4). - Improve the wildcard resolution mechanism, see discussion [here](https://github.com/johnyf/pyan/issues/5). diff --git a/pyan/analyzer.py b/pyan/analyzer.py index c995d24..31b1283 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -1048,6 +1048,24 @@ class CallGraphVisitor(ast.NodeVisitor): # it will be left standing as self.last_value. self.visit(node.func) + # If self.last_value matches a known class i.e. the call was of the + # form MyClass(), add a uses edge to MyClass.__init__(). + # + # We need to do this manually, because there is no text "__init__" + # at the call site. + # + # In this lookup to self.class_base_ast_nodes we don't care about + # the AST nodes; the keys just conveniently happen to be the Nodes + # of known classes. 
+ # + if self.last_value in self.class_base_ast_nodes: + from_node = self.get_current_namespace() + class_node = self.last_value + to_node = self.get_node(class_node.get_name(), '__init__', None) + self.logger.debug("Use from %s to %s (call creates an instance)" % (from_node, to_node)) + if self.add_uses_edge(from_node, to_node): + self.logger.info("New edge added for Use from %s to %s (call creates an instance)" % (from_node, to_node)) + ########################################################################### # Scope analysis From c76ca1ea14f82945be86d0644686809f67670fb2 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 10:21:18 +0200 Subject: [PATCH 06/29] remove old bare-bones readme (pyan.txt) now that we have a proper README --- pyan.txt | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 pyan.txt diff --git a/pyan.txt b/pyan.txt deleted file mode 100644 index 0a1ad8e..0000000 --- a/pyan.txt +++ /dev/null @@ -1,7 +0,0 @@ -Original version by Edmund Horner, from: - -http://code.google.com/p/ejrh/source/browse/trunk/utils/pyan.py - -Explanation: - -http://ejrh.wordpress.com/2012/01/31/call-graphs-in-python-part-2/ From 2b4340634d7efd3e0400d17af5695da84803e83f Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 16:31:07 +0200 Subject: [PATCH 07/29] rename get_current_namespace to get_node_of_current_namespace (since this is what it actually does) --- pyan/analyzer.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 31b1283..b6fb744 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -354,7 +354,7 @@ class CallGraphVisitor(ast.NodeVisitor): def visit_ClassDef(self, node): self.logger.debug("ClassDef %s" % (node.name)) - from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() ns = from_node.get_name() to_node = self.get_node(ns, node.name, node) if self.add_defines_edge(from_node, 
to_node): @@ -376,7 +376,7 @@ class CallGraphVisitor(ast.NodeVisitor): self.class_stack.append(to_node) self.name_stack.append(node.name) - inner_ns = self.get_current_namespace().get_name() + inner_ns = self.get_node_of_current_namespace().get_name() self.scope_stack.append(self.scopes[inner_ns]) self.context_stack.append("ClassDef %s" % (node.name)) @@ -414,7 +414,7 @@ class CallGraphVisitor(ast.NodeVisitor): # self.visit(stmt) # return - from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() ns = from_node.get_name() to_node = self.get_node(ns, node.name, node) if self.add_defines_edge(from_node, to_node): @@ -431,7 +431,7 @@ class CallGraphVisitor(ast.NodeVisitor): # Enter the function scope # self.name_stack.append(node.name) - inner_ns = self.get_current_namespace().get_name() + inner_ns = self.get_node_of_current_namespace().get_name() self.scope_stack.append(self.scopes[inner_ns]) self.context_stack.append("FunctionDef %s" % (node.name)) @@ -545,7 +545,7 @@ class CallGraphVisitor(ast.NodeVisitor): # mark the use site # - from_node = self.get_current_namespace() # where it is being imported to, i.e. the **user** + from_node = self.get_node_of_current_namespace() # where it is being imported to, i.e. 
the **user** to_node = self.get_node('', tgt_name, node) # the thing **being used** (under the asname, if any) self.logger.debug("Use from %s to Import %s" % (from_node, to_node)) if self.add_uses_edge(from_node, to_node): @@ -566,7 +566,7 @@ class CallGraphVisitor(ast.NodeVisitor): self.logger.debug("ImportFrom: from %s import %s" % (node.module, [format_alias(x) for x in node.names])) tgt_name = node.module - from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() to_node = self.get_node('', tgt_name, node) # module, in top-level namespace self.logger.debug("Use from %s to ImportFrom %s" % (from_node, to_node)) if self.add_uses_edge(from_node, to_node): @@ -772,7 +772,7 @@ class CallGraphVisitor(ast.NodeVisitor): self.logger.info('getattr %s on %s returns %s' % (node.attr, objname, attr_node)) # add uses edge - from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() self.logger.debug("Use from %s to %s" % (from_node, attr_node)) if self.add_uses_edge(from_node, attr_node): self.logger.info("New edge added for Use from %s to %s" % (from_node, attr_node)) @@ -800,7 +800,7 @@ class CallGraphVisitor(ast.NodeVisitor): # elif isinstance(obj_node, Node) and obj_node.namespace is not None: tgt_name = node.attr - from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() ns = obj_node.get_name() # fully qualified namespace **of attr** to_node = self.get_node(ns, tgt_name, node) self.logger.debug("Use from %s to %s (target obj %s known but target attr %s not resolved; maybe fwd ref or unanalyzed import)" % (from_node, to_node, obj_node, node.attr)) @@ -815,7 +815,7 @@ class CallGraphVisitor(ast.NodeVisitor): # Object unknown, add uses edge to a wildcard by attr name. 
else: tgt_name = node.attr - from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() to_node = self.get_node(None, tgt_name, node) self.logger.debug("Use from %s to %s (target obj %s not resolved; maybe fwd ref, function argument, or unanalyzed import)" % (from_node, to_node, objname)) if self.add_uses_edge(from_node, to_node): @@ -846,7 +846,7 @@ class CallGraphVisitor(ast.NodeVisitor): if not isinstance(to_node, Node): to_node = self.get_node(None, tgt_name, node) # namespace=None means we don't know the namespace yet - from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() self.logger.debug("Use from %s to Name %s" % (from_node, to_node)) if self.add_uses_edge(from_node, to_node): self.logger.info("New edge added for Use from %s to Name %s" % (from_node, to_node)) @@ -1059,7 +1059,7 @@ class CallGraphVisitor(ast.NodeVisitor): # of known classes. # if self.last_value in self.class_base_ast_nodes: - from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() class_node = self.last_value to_node = self.get_node(class_node.get_name(), '__init__', None) self.logger.debug("Use from %s to %s (call creates an instance)" % (from_node, to_node)) @@ -1110,7 +1110,7 @@ class CallGraphVisitor(ast.NodeVisitor): # the Python 3 scoping rules correctly. self.name_stack.append(scopename) - inner_ns = self.get_current_namespace().get_name() + inner_ns = self.get_node_of_current_namespace().get_name() if inner_ns not in self.scopes: raise ValueError("Unknown scope '%s'" % (inner_ns)) self.scope_stack.append(self.scopes[inner_ns]) @@ -1128,7 +1128,7 @@ class CallGraphVisitor(ast.NodeVisitor): # current ns will be grouped into a single node, as they have no name. # We create a namespace-like node that has no associated AST node, # as it does not represent any unique AST node. 
- from_node = self.get_current_namespace() + from_node = self.get_node_of_current_namespace() ns = from_node.get_name() to_node = self.get_node(ns, scopename, None) if self.add_defines_edge(from_node, to_node): @@ -1140,7 +1140,7 @@ class CallGraphVisitor(ast.NodeVisitor): """Return the node representing the current class, or None if not inside a class definition.""" return self.class_stack[-1] if len(self.class_stack) else None - def get_current_namespace(self): + def get_node_of_current_namespace(self): """Return a node representing the current namespace, based on self.name_stack.""" # For a Node n representing a namespace: @@ -1165,7 +1165,7 @@ class CallGraphVisitor(ast.NodeVisitor): # # If we wanted to get rid of a separate scope stack, we could do this: # def find_scope(name): -# ns0 = self.get_current_namespace().get_name() +# ns0 = self.get_node_of_current_namespace().get_name() # for j in range(ns0.count('.')+1): # ns = ns0.rsplit(".",j)[0] # if ns in self.scopes: @@ -1196,7 +1196,7 @@ class CallGraphVisitor(ast.NodeVisitor): # # If we wanted to get rid of a separate scope stack, we could do this: # def find_scope(name): -# ns0 = self.get_current_namespace().get_name() +# ns0 = self.get_node_of_current_namespace().get_name() # for j in range(ns0.count('.')+1): # ns = ns0.rsplit(".",j)[0] # if ns in self.scopes: From 1a678ab241cf8ab04acf83aa1f692f7978f255f2 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 17:00:47 +0200 Subject: [PATCH 08/29] complete author info is in README, remove partial list from analyzer.py --- pyan/analyzer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index b6fb744..549a562 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -1,12 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -"""The AST visitor. - -Created on Mon Nov 13 03:33:00 2017 - -Original code by Edmund Horner. -Python 3 port by Juha Jeronen. 
-""" +"""The AST visitor.""" import os.path import logging From 724eb8e20a09d7c3434bdde5ea66559ec095bd11 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Fri, 17 Nov 2017 17:01:39 +0200 Subject: [PATCH 09/29] comment and docstring updates --- pyan/analyzer.py | 24 +++++++++++++++--------- pyan/node.py | 19 ++++++++++++++----- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 549a562..011a097 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -671,8 +671,9 @@ class CallGraphVisitor(ast.NodeVisitor): """Get value of an ast.Attribute. Supports inherited attributes. If the obj's own namespace has no match - for attr, the ancestors of obj are also tried recursively until one of - them matches or until all ancestors are exhausted. + for attr, the ancestors of obj are also tried, following the MRO based + on the static type of the object, until one of them matches or until + all ancestors are exhausted. Return pair of Node objects (obj,attr), where each item can be None on lookup failure. (Object not known, or no Node value assigned @@ -1135,12 +1136,14 @@ class CallGraphVisitor(ast.NodeVisitor): return self.class_stack[-1] if len(self.class_stack) else None def get_node_of_current_namespace(self): - """Return a node representing the current namespace, based on self.name_stack.""" + """Return a node representing the current namespace, based on self.name_stack. - # For a Node n representing a namespace: - # - n.namespace = parent namespaces (empty string if top level) - # - n.name = name of this namespace - # - no associated AST node. + For a Node n representing a namespace: + - n.namespace = fully qualified name of the parent namespace + (empty string if at top level) + - n.name = name of this namespace + - no associated AST node. 
+ """ assert len(self.name_stack) # name_stack should never be empty (always at least module name) @@ -1234,6 +1237,9 @@ class CallGraphVisitor(ast.NodeVisitor): # so the filenames should be trusted only after the analysis is # complete. # + # TODO: this is tentative. Add in filename only when sure? + # (E.g. in visit_ClassDef(), visit_FunctionDef()) + # if namespace in self.module_to_filename: # If the namespace is one of the modules being analyzed, # the the Node belongs to the correponding file. @@ -1271,8 +1277,8 @@ class CallGraphVisitor(ast.NodeVisitor): number is contained in the AST node). However, a graph Node must be created immediately when the function is first encountered, in order to have a Node that can act as a "uses" target (namespaced correctly, - to avoid the over-reaching unknowns expansion in cases where it is - not needed). + to avoid a wildcard and the over-reaching expand_unknowns() in cases + where they are not needed). This method re-associates the given graph Node with a different AST node, which allows updating the context when the definition diff --git a/pyan/node.py b/pyan/node.py index 182016e..378f755 100644 --- a/pyan/node.py +++ b/pyan/node.py @@ -12,11 +12,20 @@ def make_safe_label(label): class Node: - """A node is an object in the call graph. Nodes have names, and are in - namespaces. The full name of a node is its namespace, a dot, and its name. - If the namespace is None, it is rendered as *, and considered as an unknown - node. The meaning of this is that a use-edge to an unknown node is created - when the analysis cannot determine which actual node is being used.""" + """A node is an object in the call graph. + + Nodes have names, and reside in namespaces. + + The namespace is a dot-delimited string of names. It can be blank, '', + denoting the top level. + + The fully qualified name of a node is its namespace, a dot, and its name; + except at the top level, where the leading dot is omitted. 
+ + If the namespace has the special value None, it is rendered as *, and the + node is considered as an unknown node. A uses edge to an unknown node is + created when the analysis cannot determine which actual node is being used. + """ def __init__(self, namespace, name, ast_node, filename): self.namespace = namespace From b8c1738dd549f542ea3d1e437f6b84ad926b9709 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Sun, 19 Nov 2017 02:06:41 +0200 Subject: [PATCH 10/29] enh: record the node flavor in analysis --- README.md | 16 ++++- pyan/analyzer.py | 153 ++++++++++++++++++++++++++++++++++++----------- pyan/node.py | 61 ++++++++++++++++++- pyan/visgraph.py | 6 +- 4 files changed, 193 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index f0f1a5c..9afb9dc 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,17 @@ Currently Pyan always operates at the level of individual functions and methods; ## TODO + - Visualize Node flavors? (Framework already exists, both in `analyzer` and in `visgraph`.) + - Determine confidence of detected edges (probability that the edge is correct). Start with a binary system, with only values 1.0 and 0.0. + - A fully resolved reference to a name, based on lexical scoping, has confidence 1.0. + - A reference to an unknown name has confidence 0.0. + - Attributes: + - A fully resolved reference to a known attribute of a known object has confidence 1.0. + - A reference to an unknown attribute of a known object has confidence 1.0. These are mainly generated by imports, when the imported file is not in the analyzed set. (Does this need a third value, such as 0.5?) + - A reference to an attribute of an unknown object has confidence 0.0. + - A wildcard and its expansions have confidence 0.0. + - Effects of binding analysis? The system should not claim full confidence in a bound value, unless it fully understands both the binding syntax and the value. (Note that this is very restrictive. 
A function call or a list in the expression for the value will currently spoil the full analysis.) + - Confidence values may need updating in pass 2. - Make the analyzer understand `del name` (probably seen as `isinstance(node.ctx, ast.Del)` in `visit_Name()`, `visit_Attribute()`) - Prefix methods by class name in the graph; create a legend for annotations. See the discussion [here](https://github.com/johnyf/pyan/issues/4). - Improve the wildcard resolution mechanism, see discussion [here](https://github.com/johnyf/pyan/issues/5). @@ -105,7 +116,8 @@ Currently Pyan always operates at the level of individual functions and methods; The analyzer **does not currently support**: - - Tuples/lists as first-class values (will ignore any assignment of a tuple/list to a single name). + - Tuples/lists as first-class values (currently ignores any assignment of a tuple/list to a single name). + - Support empty lists, too (for resolving method calls to `.append()` and similar). - Starred assignment `a,*b,c = d,e,f,g,h` - Slicing and indexing in assignment (`ast.Subscript`) - Additional unpacking generalizations ([PEP 448](https://www.python.org/dev/peps/pep-0448/), Python 3.5+). @@ -116,6 +128,8 @@ The analyzer **does not currently support**: - Type hints ([PEP 484](https://www.python.org/dev/peps/pep-0484/), Python 3.5+). - Type inference for function arguments - Either of these two could be used to bind function argument names to the appropriate object types, avoiding the need for wildcard references (especially for attribute accesses on objects passed in as function arguments). + - Type inference could run as pass 3, using additional information from the state of the graph after pass 2 to connect call sites to function definitions. Alternatively, no additional pass; store the AST nodes in the earlier pass. Type inference would allow resolving some wildcards by finding the method of the actual object instance passed in. 
+ - Must understand, at the call site, whether the first positional argument in the function def is handled implicitly or not. This is found by looking at the flavor of the Node representing the call target. - Async definitions are detected, but passed through to the corresponding non-async analyzers; could be annotated. - Cython; could strip or comment out Cython-specific code as a preprocess step, then treat as Python (will need to be careful to get line numbers right). diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 011a097..0d981c4 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -7,7 +7,7 @@ import logging import ast import symtable -from .node import Node +from .node import Node, Flavor # TODO: add Cython support (strip type annotations in a preprocess step, then treat as Python) # TODO: built-in functions (range(), enumerate(), zip(), iter(), ...): @@ -326,14 +326,17 @@ class CallGraphVisitor(ast.NodeVisitor): ########################################################################### # visitor methods + # In visit_*(), the "node" argument refers to an AST node. + # Python docs: # https://docs.python.org/3/library/ast.html#abstract-grammar def visit_Module(self, node): self.logger.debug("Module") - # TODO: self.get_node() this too, and associate_node() to get the - # source file information for annotated output? + # Modules live in the top-level namespace, ''. 
+ module_node = self.get_node('', self.module_name, node, flavor=Flavor.MODULE) + self.associate_node(module_node, node, filename=self.filename) ns = self.module_name self.name_stack.append(ns) @@ -350,7 +353,7 @@ class CallGraphVisitor(ast.NodeVisitor): from_node = self.get_node_of_current_namespace() ns = from_node.get_name() - to_node = self.get_node(ns, node.name, node) + to_node = self.get_node(ns, node.name, node, flavor=Flavor.CLASS) if self.add_defines_edge(from_node, to_node): self.logger.info("Def from %s to Class %s" % (from_node, to_node)) @@ -408,20 +411,33 @@ class CallGraphVisitor(ast.NodeVisitor): # self.visit(stmt) # return + # To begin with: + # + # - Analyze decorators. They belong to the surrounding scope, + # so we must analyze them before entering the function scope. + # + # - Determine whether this definition is for a function, an (instance) + # method, a static method or a class method. + # + # - Grab the name representing "self", if this is either an instance + # method or a class method. (For a class method, it represents cls, + # but Pyan only cares about types, not instances.) + # + self_name,flavor = self.analyze_functiondef(node) + + # Now we can create the Node. + # from_node = self.get_node_of_current_namespace() ns = from_node.get_name() - to_node = self.get_node(ns, node.name, node) + to_node = self.get_node(ns, node.name, node, flavor=flavor) if self.add_defines_edge(from_node, to_node): self.logger.info("Def from %s to Function %s" % (from_node, to_node)) + # Same remarks as for ClassDef above. + # self.associate_node(to_node, node, self.filename) self.set_value(node.name, to_node) - # Decorators belong to the surrounding scope, so analyze them - # before entering the function scope. This also grabs the - # name representing "self" if this is a method definition. 
- self_name = self.analyze_functiondef(node) - # Enter the function scope # self.name_stack.append(node.name) @@ -433,11 +449,14 @@ class CallGraphVisitor(ast.NodeVisitor): # # In the function scope, set them to a nonsense Node, # to prevent leakage of identifiers of matching name - # from the enclosing scope (due to the local value being None). + # from the enclosing scope (due to the local value being None + # until we set it to this nonsense Node). # # As the name of the nonsense node, we can use any string that # is not a valid Python identifier. # + # It has no sensible flavor, so we leave its flavor unspecified. + # sc = self.scopes[inner_ns] nonsense_node = self.get_node(inner_ns, '^^^argument^^^', None) all_args = node.args # args, vararg (*args), kwonlyargs, kwarg (**kwargs) @@ -497,21 +516,34 @@ class CallGraphVisitor(ast.NodeVisitor): deco_names.append(deco_node.name) self.last_value = None + # Analyze flavor + in_class_ns = self.context_stack[-1].startswith("ClassDef") + if not in_class_ns: + flavor = Flavor.FUNCTION + else: + if "staticmethod" in deco_names: + flavor = Flavor.STATICMETHOD + elif "classmethod" in deco_names: + flavor = Flavor.CLASSMETHOD + else: # instance method + flavor = Flavor.METHOD + # Get the name representing "self", if applicable. # # - ignore static methods # - ignore functions defined inside methods (this new FunctionDef # must be directly in a class namespace) # - in_class_ns = self.context_stack[-1].startswith("ClassDef") - if in_class_ns and "staticmethod" not in deco_names: + if flavor in (Flavor.METHOD, Flavor.CLASSMETHOD): # We can treat instance methods and class methods the same, # since Pyan is only interested in object types, not instances. 
all_args = ast_node.args # args, vararg (*args), kwonlyargs, kwarg (**kwargs) posargs = all_args.args if len(posargs): self_name = posargs[0].arg - return self_name + return self_name, flavor + + return None, flavor def visit_AsyncFunctionDef(self, node): self.visit_FunctionDef(node) # TODO: alias for now; tag async functions in output in a future version? @@ -539,11 +571,17 @@ class CallGraphVisitor(ast.NodeVisitor): # mark the use site # - from_node = self.get_node_of_current_namespace() # where it is being imported to, i.e. the **user** - to_node = self.get_node('', tgt_name, node) # the thing **being used** (under the asname, if any) - self.logger.debug("Use from %s to Import %s" % (from_node, to_node)) - if self.add_uses_edge(from_node, to_node): - self.logger.info("New edge added for Use from %s to Import %s" % (from_node, to_node)) + # where it is being imported to, i.e. the **user** + from_node = self.get_node_of_current_namespace() + # the thing **being used** (under the asname, if any) + to_node = self.get_node('', tgt_name, node, flavor=Flavor.IMPORTEDITEM) + + is_new_edge = self.add_uses_edge(from_node, to_node) + + # TODO: e.g. "os.path" is not a MODULE; add logic to keep + # dot-limited names as IMPORTEDITEM. Or maybe even, + # treat namespaces properly, and create the MODULE and + # IMPORTEDITEM nodes (one of the latter for each level of nesting). # bind asname in the current namespace to the imported module # @@ -553,15 +591,27 @@ class CallGraphVisitor(ast.NodeVisitor): mod_name = self.module_names[src_name] else: mod_name = src_name - tgt_module = self.get_node('', mod_name, node) + tgt_module = self.get_node('', mod_name, node, flavor=Flavor.MODULE) + # XXX: if there is no asname, it may happen that mod_name == tgt_name, + # in which case these will be the same Node. They are semantically + # distinct (Python name at receiving end, vs. module), but currently + # Pyan has no way of retaining that information. 
+ if to_node is tgt_module: + to_node.flavor = Flavor.MODULE self.set_value(tgt_name, tgt_module) + # must do this after possibly munging flavor to avoid confusing + # the user reading the log + self.logger.debug("Use from %s to Import %s" % (from_node, to_node)) + if is_new_edge: + self.logger.info("New edge added for Use from %s to Import %s" % (from_node, to_node)) + def visit_ImportFrom(self, node): self.logger.debug("ImportFrom: from %s import %s" % (node.module, [format_alias(x) for x in node.names])) tgt_name = node.module from_node = self.get_node_of_current_namespace() - to_node = self.get_node('', tgt_name, node) # module, in top-level namespace + to_node = self.get_node('', tgt_name, node, flavor=Flavor.MODULE) # module, in top-level namespace self.logger.debug("Use from %s to ImportFrom %s" % (from_node, to_node)) if self.add_uses_edge(from_node, to_node): self.logger.info("New edge added for Use from %s to ImportFrom %s" % (from_node, to_node)) @@ -574,7 +624,8 @@ class CallGraphVisitor(ast.NodeVisitor): for import_item in node.names: name = import_item.name new_name = import_item.asname if import_item.asname is not None else name - tgt_id = self.get_node(mod_name, name, node) # we imported the identifier name from the module mod_name + # we imported the identifier name from the module mod_name + tgt_id = self.get_node(mod_name, name, node, flavor=Flavor.IMPORTEDITEM) self.set_value(new_name, tgt_id) self.logger.info("From setting name %s to %s" % (new_name, tgt_id)) @@ -582,6 +633,7 @@ class CallGraphVisitor(ast.NodeVisitor): # # https://ejrh.wordpress.com/2012/01/31/call-graphs-in-python-part-2/ # # # # Essentially, this should make '.'.join(...) see str.join. +# # Pyan3 currently handles that in resolve_attribute() and get_attribute(). # # # # Python 3.4 does not have ast.Constant, but 3.6 does. Disabling for now. # # TODO: revisit this part after upgrading Python. 
@@ -646,7 +698,11 @@ class CallGraphVisitor(ast.NodeVisitor): # frozen to the first constant of any matching type that # the analyzer encountered in the analyzed source code, # which is not useful. - obj_node = self.get_node('', tn, None) + # + # The CLASS flavor is the best match, as these constants + # are object types. + # + obj_node = self.get_node('', tn, None, flavor=Flavor.CLASS) # attribute of a function call. Detect cases like super().dostuff() elif isinstance(ast_node.value, ast.Call): @@ -695,7 +751,7 @@ class CallGraphVisitor(ast.NodeVisitor): # special handling, by design.) # if ns in ("Num", "Str"): # TODO: other types? - return obj_node, self.get_node(ns, attr_name, None) + return obj_node, self.get_node(ns, attr_name, None, flavor=Flavor.ATTRIBUTE) # look up attr_name in the given namespace, return Node or None def lookup(ns): @@ -720,7 +776,7 @@ class CallGraphVisitor(ast.NodeVisitor): break else: return None, None # not found - return base_node, value_node + return base_node, value_node # as obj, return the base class in which attr was found return obj_node, None # here obj_node is either None or unknown (namespace None) @@ -797,7 +853,7 @@ class CallGraphVisitor(ast.NodeVisitor): tgt_name = node.attr from_node = self.get_node_of_current_namespace() ns = obj_node.get_name() # fully qualified namespace **of attr** - to_node = self.get_node(ns, tgt_name, node) + to_node = self.get_node(ns, tgt_name, node, flavor=Flavor.ATTRIBUTE) self.logger.debug("Use from %s to %s (target obj %s known but target attr %s not resolved; maybe fwd ref or unanalyzed import)" % (from_node, to_node, obj_node, node.attr)) if self.add_uses_edge(from_node, to_node): self.logger.info("New edge added for Use from %s to %s (target obj %s known but target attr %s not resolved; maybe fwd ref or unanalyzed import)" % (from_node, to_node, obj_node, node.attr)) @@ -811,7 +867,7 @@ class CallGraphVisitor(ast.NodeVisitor): else: tgt_name = node.attr from_node = 
self.get_node_of_current_namespace() - to_node = self.get_node(None, tgt_name, node) + to_node = self.get_node(None, tgt_name, node, flavor=Flavor.UNKNOWN) self.logger.debug("Use from %s to %s (target obj %s not resolved; maybe fwd ref, function argument, or unanalyzed import)" % (from_node, to_node, objname)) if self.add_uses_edge(from_node, to_node): self.logger.info("New edge added for Use from %s to %s (target obj %s not resolved; maybe fwd ref, function argument, or unanalyzed import)" % (from_node, to_node, objname)) @@ -839,7 +895,8 @@ class CallGraphVisitor(ast.NodeVisitor): ###TODO if the name is a local variable (i.e. in the innermost scope), and ###has no known value, then don't try to create a Node for it. if not isinstance(to_node, Node): - to_node = self.get_node(None, tgt_name, node) # namespace=None means we don't know the namespace yet + # namespace=None means we don't know the namespace yet + to_node = self.get_node(None, tgt_name, node, flavor=Flavor.UNKNOWN) from_node = self.get_node_of_current_namespace() self.logger.debug("Use from %s to Name %s" % (from_node, to_node)) @@ -907,6 +964,10 @@ class CallGraphVisitor(ast.NodeVisitor): if len(node.targets) > 1: self.logger.debug("Assign (chained with %d outputs)" % (len(node.targets))) + # TODO: support lists, dicts, sets (so that we can recognize calls to their methods) + # TODO: begin with supporting empty lists, dicts, sets + # TODO: need to be more careful in sanitizing; currently destroys a bare list + values = sanitize_exprs(node.value) # values is the same for each set of targets for targets in node.targets: targets = sanitize_exprs(targets) @@ -1056,7 +1117,7 @@ class CallGraphVisitor(ast.NodeVisitor): if self.last_value in self.class_base_ast_nodes: from_node = self.get_node_of_current_namespace() class_node = self.last_value - to_node = self.get_node(class_node.get_name(), '__init__', None) + to_node = self.get_node(class_node.get_name(), '__init__', None, flavor=Flavor.METHOD) 
self.logger.debug("Use from %s to %s (call creates an instance)" % (from_node, to_node)) if self.add_uses_edge(from_node, to_node): self.logger.info("New edge added for Use from %s to %s (call creates an instance)" % (from_node, to_node)) @@ -1125,7 +1186,7 @@ class CallGraphVisitor(ast.NodeVisitor): # as it does not represent any unique AST node. from_node = self.get_node_of_current_namespace() ns = from_node.get_name() - to_node = self.get_node(ns, scopename, None) + to_node = self.get_node(ns, scopename, None, flavor=Flavor.NAMESPACE) if self.add_defines_edge(from_node, to_node): self.logger.info("Def from %s to %s %s" % (from_node, scopename, to_node)) self.last_value = to_node # Make this inner scope node assignable to track its uses. @@ -1136,7 +1197,8 @@ class CallGraphVisitor(ast.NodeVisitor): return self.class_stack[-1] if len(self.class_stack) else None def get_node_of_current_namespace(self): - """Return a node representing the current namespace, based on self.name_stack. + """Return the unique node representing the current namespace, + based on self.name_stack. For a Node n representing a namespace: - n.namespace = fully qualified name of the parent namespace @@ -1144,12 +1206,11 @@ class CallGraphVisitor(ast.NodeVisitor): - n.name = name of this namespace - no associated AST node. """ - assert len(self.name_stack) # name_stack should never be empty (always at least module name) namespace = '.'.join(self.name_stack[0:-1]) name = self.name_stack[-1] - return self.get_node(namespace, name, None) + return self.get_node(namespace, name, None, flavor=Flavor.NAMESPACE) def get_value(self, name): """Get the value of name in the current scope. 
Return the Node, or None if name is not set to a value.""" @@ -1215,18 +1276,37 @@ class CallGraphVisitor(ast.NodeVisitor): ########################################################################### # Graph creation - def get_node(self, namespace, name, ast_node=None): + def get_node(self, namespace, name, ast_node=None, flavor=Flavor.UNSPECIFIED): """Return the unique node matching the namespace and name. - Creates a new node if one doesn't already exist. + Create a new node if one doesn't already exist. + To associate the node with a syntax object in the analyzed source code, + an AST node can be passed in. This only takes effect if a new Node + is created. + + To associate an AST node to an existing graph node, + see associate_node(). + + Flavor describes the kind of object the node represents. + See the node.Flavor enum for currently supported values. + + For existing nodes, flavor overwrites, if the given flavor is + (strictly) more specific than the node's existing one. + See node.Flavor.specificity(). + + !!! In CallGraphVisitor, always use get_node() to create nodes, because it also sets some auxiliary information. Do not call the Node constructor directly. + !!! """ if name in self.nodes: for n in self.nodes[name]: if n.namespace == namespace: + if Flavor.specificity(flavor) > Flavor.specificity(n.flavor): + n.flavor = flavor + return n # Try to figure out which source file this Node belongs to @@ -1247,8 +1327,9 @@ class CallGraphVisitor(ast.NodeVisitor): else: # Assume the Node belongs to the current file. filename = self.filename - n = Node(namespace, name, ast_node, filename) + n = Node(namespace, name, ast_node, filename, flavor) + # Add to the list of nodes that have this short name. 
if name in self.nodes: self.nodes[name].append(n) else: diff --git a/pyan/node.py b/pyan/node.py index 378f755..eda1b9c 100644 --- a/pyan/node.py +++ b/pyan/node.py @@ -2,6 +2,8 @@ # -*- coding: utf-8 -*- """Abstract node representing data gathered from the analysis.""" +from enum import Enum + def make_safe_label(label): """Avoid name clashes with GraphViz reserved words such as 'graph'.""" unsafe_words = ("digraph", "graph", "cluster", "subgraph") @@ -10,6 +12,43 @@ def make_safe_label(label): out = out.replace(word, "%sX" % word) return out.replace('.', '__').replace('*', '') +class Flavor(Enum): + """Flavor describes the kind of object a node represents.""" + UNSPECIFIED = "---" # as it says on the tin + UNKNOWN = "???" # not determined by analysis (wildcard) + + NAMESPACE = "namespace" # node representing a namespace + ATTRIBUTE = "attribute" # attr of something, but not known if class or func. + + IMPORTEDITEM = "import" # imported item of unanalyzed type + + MODULE = "module" + CLASS = "class" + FUNCTION = "function" + METHOD = "method" # instance method + STATICMETHOD = "staticmethod" + CLASSMETHOD = "classmethod" + NAME = "name" # Python name (e.g. "x" in "x = 42") + + # Flavors have a partial ordering in specificness of the information. + # + # This sort key scores higher on flavors that are more specific, + # allowing selective overwriting (while defining the override rules + # here, where that information belongs). + # + @staticmethod + def specificity(flavor): + if flavor in (Flavor.UNSPECIFIED, Flavor.UNKNOWN): + return 0 + elif flavor in (Flavor.NAMESPACE, Flavor.ATTRIBUTE): + return 1 + elif flavor == Flavor.IMPORTEDITEM: + return 2 + else: + return 3 + + def __repr__(self): + return self.value class Node: """A node is an object in the call graph. @@ -25,13 +64,31 @@ class Node: If the namespace has the special value None, it is rendered as *, and the node is considered as an unknown node. 
A uses edge to an unknown node is created when the analysis cannot determine which actual node is being used. + + A graph node can be associated with an AST node from the analysis. + This identifies the syntax object the node represents, and as a bonus, + provides the line number at which the syntax object appears in the + analyzed code. The filename, however, must be given manually. + + Nodes can also represent namespaces. These namespace nodes do not have an + associated AST node. For a namespace node, the "namespace" argument is the + **parent** namespace, and the "name" argument is the (last component of + the) name of the namespace itself. For example, + + Node("mymodule", "main", None) + + represents the namespace "mymodule.main". + + Flavor describes the kind of object the node represents. + See the Flavor enum for currently supported values. """ - def __init__(self, namespace, name, ast_node, filename): + def __init__(self, namespace, name, ast_node, filename, flavor): self.namespace = namespace self.name = name self.ast_node = ast_node self.filename = filename + self.flavor = flavor self.defined = namespace is None # assume that unknown nodes are defined def get_short_name(self): @@ -118,4 +175,4 @@ class Node: return make_safe_label(self.namespace) def __repr__(self): - return '' % self.get_name() + return '' % (repr(self.flavor), self.get_name()) diff --git a/pyan/visgraph.py b/pyan/visgraph.py index bdf8660..3872843 100644 --- a/pyan/visgraph.py +++ b/pyan/visgraph.py @@ -71,16 +71,13 @@ class Colorizer: class VisualNode(object): """ A node in the output graph: colors, internal ID, human-readable label, ... - - flavor is meant to be used one day for things like 'source file', 'class', - 'function'... 
""" def __init__( self, id, label='', flavor='', fill_color='', text_color='', group=''): self.id = id # graphing software friendly label (no special chars) self.label = label # human-friendly label - self.flavor = '' + self.flavor = flavor self.fill_color = fill_color self.text_color = text_color self.group = group @@ -183,6 +180,7 @@ class VisualGraph(object): visual_node = VisualNode( id=node.get_label(), label=labeler(node), + flavor=repr(node.flavor), fill_color=fill_RGBA, text_color=text_RGB, group=idx) From 8ee6f7cdb655c3faffbda4d029ba673616804e52 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Sun, 19 Nov 2017 02:28:54 +0200 Subject: [PATCH 11/29] refactor remove_uses_edge --- pyan/analyzer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 0d981c4..963992a 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -1419,6 +1419,14 @@ class CallGraphVisitor(ast.NodeVisitor): return True + def remove_uses_edge(self, from_node, to_node): + """Remove a uses edge from the graph. (Used in postprocessing.)""" + + if from_node in self.uses_edges: + u = self.uses_edges[from_node] + if to_node in u: + u.remove(to_node) + def remove_wild(self, from_node, to_node, name): """Remove uses edge from from_node to wildcard *.name. 
@@ -1478,7 +1486,7 @@ class CallGraphVisitor(ast.NodeVisitor): if len(matching_wilds): wild_node = matching_wilds[0] self.logger.info("Use from %s to %s resolves %s; removing wildcard" % (from_node, to_node, wild_node)) - self.uses_edges[from_node].remove(wild_node) + self.remove_uses_edge(from_node, wild_node) ########################################################################### # Postprocessing @@ -1501,12 +1509,12 @@ class CallGraphVisitor(ast.NodeVisitor): self.add_uses_edge(from_node, to_node) for from_node, to_node in removed_uses_edges: - self.uses_edges[from_node].remove(to_node) + self.remove_uses_edge(from_node, to_node) def expand_unknowns(self): """For each unknown node *.name, replace all its incoming edges with edges to X.name for all possible Xs. - Also mark all unknown nodes as not defined.""" + Also mark all unknown nodes as not defined (so that they won't be visualized).""" new_defines_edges = [] for n in self.defines_edges: @@ -1557,7 +1565,7 @@ class CallGraphVisitor(ast.NodeVisitor): self.logger.info("Removing inherited edge from %s to %s" % (n, n2)) for from_node, to_node in removed_uses_edges: - self.uses_edges[from_node].remove(to_node) + self.remove_uses_edge(from_node, to_node) def collapse_inner(self): """Combine lambda and comprehension Nodes with their parent Nodes to reduce visual noise. From 7f086675f9e825a9e65b33d2df4e996d7d864075 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Sun, 19 Nov 2017 02:29:03 +0200 Subject: [PATCH 12/29] update todo in README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9afb9dc..6cfe112 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ The analyzer **does not currently support**: - Slicing and indexing in assignment (`ast.Subscript`) - Additional unpacking generalizations ([PEP 448](https://www.python.org/dev/peps/pep-0448/), Python 3.5+). 
- Any **uses** on the RHS *at the binding site* in all of the above are already detected by the name and attribute analyzers, but the binding information from assignments of these forms will not be recorded (at least not correctly). + - Enums; need to mark the use of any of their attributes as use of the Enum. Need to detect `Enum` in `bases` during analysis of ClassDef; then tag the class as an enum and handle differently. - Resolving results of function calls, except for a very limited special case for `super()`. - Any binding of a name to a result of a function (or method) call - provided that the binding itself is understood by Pyan - will instead show in the output as binding the name to that function (or method). (This may generate some unintuitive uses edges in the graph.) - Distinguishing between different Lambdas in the same namespace (to report uses of a particular `lambda` that has been stored in `self.something`). From 30d44c67a3fa75a4dc0529d980569c935e4b64d5 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Sun, 19 Nov 2017 02:35:54 +0200 Subject: [PATCH 13/29] upon closer inspection, remove nonsense TODO --- pyan/analyzer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 963992a..30d84bb 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -578,11 +578,6 @@ class CallGraphVisitor(ast.NodeVisitor): is_new_edge = self.add_uses_edge(from_node, to_node) - # TODO: e.g. "os.path" is not a MODULE; add logic to keep - # dot-limited names as IMPORTEDITEM. Or maybe even, - # treat namespaces properly, and create the MODULE and - # IMPORTEDITEM nodes (one of the latter for each level of nesting). 
- # bind asname in the current namespace to the imported module # # conversion: possible short name -> fully qualified name From 4e5fabb80411067d448cd3b0fb94c44db5ae88b0 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Sun, 19 Nov 2017 02:40:02 +0200 Subject: [PATCH 14/29] visualize_pyan_architecture.sh: change the options to get one clear graph of everything --- visualize_pyan_architecture.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/visualize_pyan_architecture.sh b/visualize_pyan_architecture.sh index 8ec28b2..f7471c4 100755 --- a/visualize_pyan_architecture.sh +++ b/visualize_pyan_architecture.sh @@ -1,6 +1,4 @@ #!/bin/bash -./pyan.py pyan/*.py --no-uses --defines --grouped --nested-groups --colored --dot --annotated >defines.dot -./pyan.py pyan/*.py --uses --no-defines --grouped --nested-groups --colored --dot --annotated >uses.dot -dot -Tsvg defines.dot >defines.svg -dot -Tsvg uses.dot >uses.svg -echo -ne "Pyan architecture: generated defines.svg and uses.svg\n" +echo -ne "Pyan architecture: generating architecture.{dot,svg}\n" +./pyan.py pyan/*.py --no-defines --uses --colored --annotate --dot -V >architecture.dot 2>architecture.log +dot -Tsvg architecture.dot >architecture.svg From a3f051bd6380f1955e42e5ec4d8b7c51d39346a6 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Mon, 20 Nov 2017 16:12:34 +0200 Subject: [PATCH 15/29] enh: include flavor into output graph with --annotate --- pyan/node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyan/node.py b/pyan/node.py index eda1b9c..cad6982 100644 --- a/pyan/node.py +++ b/pyan/node.py @@ -119,9 +119,9 @@ class Node: else: if self.get_level() >= 1: if self.ast_node is not None: - return "%s\\n\\n(%s:%d,\\nin %s)" % (self.name, self.filename, self.ast_node.lineno, self.namespace) + return "%s\\n\\n(%s:%d,\\n%s in %s)" % (self.name, self.filename, self.ast_node.lineno, repr(self.flavor), self.namespace) else: - return "%s\\n\\n(in %s)" % (self.name, 
self.namespace) + return "%s\\n\\n(%s in %s)" % (self.name, repr(self.flavor), self.namespace) else: return self.name From 89f71815087894c8e78978324b107d7a39db3b10 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Mon, 20 Nov 2017 22:32:16 +0200 Subject: [PATCH 16/29] use /usr/bin/env in hashbang --- pyan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyan.py b/pyan.py index 7ad8a26..fe3aa49 100755 --- a/pyan.py +++ b/pyan.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- import re From 4f43527bf61260ee71e95a6af7a75cad1781a082 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Mon, 20 Nov 2017 23:56:31 +0200 Subject: [PATCH 17/29] enh: don't add a wildcard when encountering an unresolved super() (likely reasons: it is still pass 1, or some relevant source file is not in the analyzed set); this fixes some spurious edges. --- pyan/analyzer.py | 55 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 30d84bb..323fa75 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -101,6 +101,9 @@ class Scope: def __repr__(self): return "" % (self.type, self.name) +class UnresolvedSuperCallError(Exception): + """For signaling an unresolved super() in attribute resolution.""" + pass # These tables were useful for porting the visitor to Python 3: # @@ -646,6 +649,9 @@ class CallGraphVisitor(ast.NodeVisitor): Return (obj,attrname), where obj is a Node (or None on lookup failure), and attrname is the attribute name. + + May pass through UnresolvedSuperCallError, if the attribute resolution + failed specifically due to an unresolved super() call. """ if not isinstance(ast_node, ast.Attribute): @@ -701,6 +707,8 @@ class CallGraphVisitor(ast.NodeVisitor): # attribute of a function call. 
Detect cases like super().dostuff() elif isinstance(ast_node.value, ast.Call): + # Note that resolve_builtins() will signal an unresolved + # super() by an exception, which we just pass through here. obj_node = self.resolve_builtins(ast_node.value) # can't resolve result of general function call @@ -729,6 +737,8 @@ class CallGraphVisitor(ast.NodeVisitor): Return pair of Node objects (obj,attr), where each item can be None on lookup failure. (Object not known, or no Node value assigned to its attr.) + + May pass through UnresolvedSuperCallError. """ if not isinstance(ast_node.ctx, ast.Load): @@ -778,7 +788,10 @@ class CallGraphVisitor(ast.NodeVisitor): def set_attribute(self, ast_node, new_value): """Assign the Node provided as new_value into the attribute described by the AST node ast_node. Return True if assignment was done, - False otherwise.""" + False otherwise. + + May pass through UnresolvedSuperCallError. + """ if not isinstance(ast_node.ctx, ast.Store): raise ValueError("Expected a store context, got %s" % (type(ast_node.ctx))) @@ -807,11 +820,21 @@ class CallGraphVisitor(ast.NodeVisitor): # if isinstance(node.ctx, ast.Store): new_value = self.last_value - if self.set_attribute(node, new_value): - self.logger.info('setattr %s on %s to %s' % (node.attr, objname, new_value)) + try: + if self.set_attribute(node, new_value): + self.logger.info('setattr %s on %s to %s' % (node.attr, objname, new_value)) + except UnresolvedSuperCallError: + # Trying to set something belonging to an unresolved super() + # of something; just ignore this attempt to setattr. + return elif isinstance(node.ctx, ast.Load): - obj_node,attr_node = self.get_attribute(node) + try: + obj_node,attr_node = self.get_attribute(node) + except UnresolvedSuperCallError: + # Avoid adding a wildcard if the lookup failed due to an + # unresolved super() in the attribute chain. + return # Both object and attr known. 
if isinstance(attr_node, Node): @@ -1050,7 +1073,15 @@ class CallGraphVisitor(ast.NodeVisitor): """Resolve those calls to built-in functions whose return values can be determined in a simple manner. - Currently, this supports only super(). This works only in pass 2.""" + Currently, this supports only super(), which works only in pass 2, + because the MRO is determined between passes. + + May raise UnresolvedSuperCallError, if the call is to super(), + but the result cannot be (currently) determined (usually because either + pass 1, or some relevant source file is not in the analyzed set). + + Returns the Node the call resolves to, or None if not determined. + """ if not isinstance(ast_node, ast.Call): raise TypeError("Expected ast.Call; got %s" % (type(ast_node))) @@ -1078,7 +1109,13 @@ class CallGraphVisitor(ast.NodeVisitor): self.logger.debug("super of %s is %s" % (class_node, result)) return result else: - self.logger.info("super called for %s, but no known bases" % (class_node)) + msg = "super called for %s, but no known bases" % (class_node) + self.logger.info(msg) + raise UnresolvedSuperCallError(msg) + else: + msg = "super called for %s, but MRO not determined for it (maybe still in pass 1?)" % (class_node) + self.logger.info(msg) + raise UnresolvedSuperCallError(msg) # add implementations for other built-in funcnames here if needed def visit_Call(self, node): @@ -1091,7 +1128,11 @@ class CallGraphVisitor(ast.NodeVisitor): self.visit(kw.value) # see if we can predict the result - result_node = self.resolve_builtins(node) + try: + result_node = self.resolve_builtins(node) + except UnresolvedSuperCallError: + result_node = None + if isinstance(result_node, Node): self.last_value = result_node else: # generic function call From 765b40816f19e9af8f54a362dafd764fa2858c8c Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 00:04:41 +0200 Subject: [PATCH 18/29] remove done TODO --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md 
b/README.md index 6cfe112..1cb2e29 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,6 @@ Currently Pyan always operates at the level of individual functions and methods; ## TODO - - Visualize Node flavors? (Framework already exists, both in `analyzer` and in `visgraph`.) - Determine confidence of detected edges (probability that the edge is correct). Start with a binary system, with only values 1.0 and 0.0. - A fully resolved reference to a name, based on lexical scoping, has confidence 1.0. - A reference to an unknown name has confidence 0.0. From 89048367be790773d379174502aca789cef9fc9b Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 00:07:51 +0200 Subject: [PATCH 19/29] document idea to improve wildcard resolution --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1cb2e29..0d10516 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,7 @@ Currently Pyan always operates at the level of individual functions and methods; - Make the analyzer understand `del name` (probably seen as `isinstance(node.ctx, ast.Del)` in `visit_Name()`, `visit_Attribute()`) - Prefix methods by class name in the graph; create a legend for annotations. See the discussion [here](https://github.com/johnyf/pyan/issues/4). - Improve the wildcard resolution mechanism, see discussion [here](https://github.com/johnyf/pyan/issues/5). + - Could record the namespace of the use site upon creating the wildcard, and check any possible resolutions against that (requiring that the resolved name is in scope at the use site)? - Add an option to visualize relations only between namespaces, useful for large projects. - Scan the nodes and edges, basically generate a new graph and visualize that. - Publish test cases. 
From 6aeb6306d8c49309009ae549594542de8d613592 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 01:37:37 +0200 Subject: [PATCH 20/29] refactor: split analyzer to the main class and anutils; reorder methods in analyzer main class to make the file more logical to read for humans --- pyan/analyzer.py | 913 ++++++++++++++++++----------------------------- pyan/anutils.py | 246 +++++++++++++ 2 files changed, 589 insertions(+), 570 deletions(-) create mode 100644 pyan/anutils.py diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 323fa75..20b47ed 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -2,12 +2,15 @@ # -*- coding: utf-8 -*- """The AST visitor.""" -import os.path import logging import ast import symtable from .node import Node, Flavor +from .anutils import tail, get_module_name, format_alias, \ + get_ast_node_name, sanitize_exprs, \ + resolve_method_resolution_order, \ + Scope, ExecuteInInnerScope, UnresolvedSuperCallError # TODO: add Cython support (strip type annotations in a preprocess step, then treat as Python) # TODO: built-in functions (range(), enumerate(), zip(), iter(), ...): @@ -24,87 +27,6 @@ from .node import Node, Flavor # # Namespaces also get a Node (with no associated AST node). -def head(lst): - if len(lst): - return lst[0] - -def tail(lst): - if len(lst) > 1: - return lst[1:] - else: - return [] - -def get_module_name(filename): - """Try to determine the full module name of a source file, by figuring out - if its directory looks like a package (i.e. has an __init__.py file).""" - - if os.path.basename(filename) == '__init__.py': - return get_module_name(os.path.dirname(filename)) - - init_path = os.path.join(os.path.dirname(filename), '__init__.py') - mod_name = os.path.basename(filename).replace('.py', '') - - if not os.path.exists(init_path): - return mod_name - - if not os.path.dirname(filename): - return mod_name - - return get_module_name(os.path.dirname(filename)) + '.' 
+ mod_name - -def format_alias(x): - """Return human-readable description of an ast.alias (used in Import and ImportFrom nodes).""" - if not isinstance(x, ast.alias): - raise TypeError("Can only format an ast.alias; got %s" % type(x)) - - if x.asname is not None: - return "%s as %s" % (x.name, x.asname) - else: - return "%s" % (x.name) - -def get_ast_node_name(x): - """Return human-readable name of ast.Attribute or ast.Name. Pass through anything else.""" - if isinstance(x, ast.Attribute): - # x.value might also be an ast.Attribute (think "x.y.z") - return "%s.%s" % (get_ast_node_name(x.value), x.attr) - elif isinstance(x, ast.Name): - return x.id - else: - return x - -# Helper for handling binding forms. -def sanitize_exprs(exprs): - """Convert ast.Tuples in exprs to Python tuples; wrap result in a Python tuple.""" - def process(expr): - if isinstance(expr, (ast.Tuple, ast.List)): - return expr.elts # .elts is a Python tuple - else: - return [expr] - if isinstance(exprs, (tuple, list)): - return [process(expr) for expr in exprs] - else: - return process(exprs) - -class Scope: - """Adaptor that makes scopes look somewhat like those from the Python 2 - compiler module, as far as Pyan's CallGraphVisitor is concerned.""" - - def __init__(self, table): - """table: SymTable instance from symtable.symtable()""" - name = table.get_name() - if name == 'top': - name = '' # Pyan defines the top level as anonymous - self.name = name - self.type = table.get_type() # useful for __repr__() - self.defs = {iden:None for iden in table.get_identifiers()} # name:assigned_value - - def __repr__(self): - return "" % (self.type, self.name) - -class UnresolvedSuperCallError(Exception): - """For signaling an unresolved super() in attribute resolution.""" - pass - # These tables were useful for porting the visitor to Python 3: # # https://docs.python.org/2/library/compiler.html#module-compiler.ast @@ -204,105 +126,10 @@ class CallGraphVisitor(ast.NodeVisitor): 
self.class_base_nodes[node].append(baseclass_node) self.logger.debug("All base classes (non-recursive, local level only): %s" % self.class_base_nodes) - self.mro = self._resolve_method_resolution_order() - - def _resolve_method_resolution_order(self): - """Compute the method resolution order (MRO) for each of the analyzed classes.""" self.logger.debug("Resolving method resolution order (MRO) for all analyzed classes") - - # https://en.wikipedia.org/wiki/C3_linearization#Description - - class LinearizationImpossible(Exception): - pass - - from functools import reduce - from operator import add - def C3_find_good_head(heads, tails): # find an element of heads which is not in any of the tails - flat_tails = reduce(add, tails, []) # flatten the outer level - for hd in heads: - if hd not in flat_tails: - break - else: # no break only if there are cyclic dependencies. - raise LinearizationImpossible("MRO linearization impossible; cyclic dependency detected. heads: %s, tails: %s" % (heads, tails)) - return hd - - def remove_all(elt, lst): # remove all occurrences of elt from lst, return a copy - return [x for x in lst if x != elt] - def remove_all_in(elt, lists): # remove elt from all lists, return a copy - return [remove_all(elt, lst) for lst in lists] - - def C3_merge(lists): - out = [] - while True: - self.logger.debug("MRO: C3 merge: out: %s, lists: %s" % (out, lists)) - heads = [head(lst) for lst in lists if head(lst) is not None] - if not len(heads): - break - tails = [tail(lst) for lst in lists] - self.logger.debug("MRO: C3 merge: heads: %s, tails: %s" % (heads, tails)) - hd = C3_find_good_head(heads, tails) - self.logger.debug("MRO: C3 merge: chose head %s" % (hd)) - out.append(hd) - lists = remove_all_in(hd, lists) - return out - - mro = {} # result - try: - memo = {} # caching/memoization - def C3_linearize(node): - self.logger.debug("MRO: C3 linearizing %s" % (node)) - seen.add(node) - if node not in memo: - # unknown class or no ancestors - if node not in 
self.class_base_nodes or not len(self.class_base_nodes[node]): - memo[node] = [node] - else: # known and has ancestors - lists = [] - # linearization of parents... - for baseclass_node in self.class_base_nodes[node]: - if baseclass_node not in seen: - lists.append(C3_linearize(baseclass_node)) - # ...and the parents themselves (in the order they appear in the ClassDef) - self.logger.debug("MRO: parents of %s: %s" % (node, self.class_base_nodes[node])) - lists.append(self.class_base_nodes[node]) - self.logger.debug("MRO: C3 merging %s" % (lists)) - memo[node] = [node] + C3_merge(lists) - self.logger.debug("MRO: C3 linearized %s, result %s" % (node, memo[node])) - return memo[node] - for node in self.class_base_nodes: - self.logger.debug("MRO: analyzing class %s" % (node)) - seen = set() # break cycles (separately for each class we start from) - mro[node] = C3_linearize(node) - except LinearizationImpossible as e: - self.logger.error(e) - - # generic fallback: depth-first search of lists of ancestors - # - # (so that we can try to draw *something* if the code to be - # analyzed is so badly formed that the MRO algorithm fails) - - memo = {} # caching/memoization - def lookup_bases_recursive(node): - seen.add(node) - if node not in memo: - out = [node] # first look up in obj itself... - if node in self.class_base_nodes: # known class? 
- for baseclass_node in self.class_base_nodes[node]: # ...then in its bases - if baseclass_node not in seen: - out.append(baseclass_node) - out.extend(lookup_bases_recursive(baseclass_node)) - memo[node] = out - return memo[node] - - mro = {} - for node in self.class_base_nodes: - self.logger.debug("MRO: generic fallback: analyzing class %s" % (node)) - seen = set() # break cycles (separately for each class we start from) - mro[node] = lookup_bases_recursive(node) - - self.logger.debug("Method resolution order (MRO) for all analyzed classes: %s" % mro) - return mro + self.mro = resolve_method_resolution_order(self.class_base_nodes, self.logger) + self.logger.debug("Method resolution order (MRO) for all analyzed classes: %s" % self.mro) def postprocess(self): """Finalize the analysis.""" @@ -398,22 +225,6 @@ class CallGraphVisitor(ast.NodeVisitor): def visit_FunctionDef(self, node): self.logger.debug("FunctionDef %s" % (node.name)) -# # Place instance members at class level in the call graph -# # TODO: brittle: breaks analysis if __init__ defines an internal helper class, -# # because then the scope lookup will fail. Disabled this special handling for now. -# # -# # Any assignments in __init__ to self.anything will still be picked up -# # correctly, because they use setattr. -# # -# if node.name == '__init__': -# for d in node.args.defaults: -# self.visit(d) -# for d in node.args.kw_defaults: -# self.visit(d) -# for stmt in node.body: -# self.visit(stmt) -# return - # To begin with: # # - Analyze decorators. They belong to the surrounding scope, @@ -499,67 +310,17 @@ class CallGraphVisitor(ast.NodeVisitor): self.scope_stack.pop() self.name_stack.pop() - def analyze_functiondef(self, ast_node): - """Helper for analyzing function definitions. - - Visit decorators, and if this is a method definition, capture the name - of the first positional argument to denote "self", like Python does. 
- Return the name representing self, or None if not applicable.""" - - if not isinstance(ast_node, ast.FunctionDef): - raise TypeError("Expected ast.FunctionDef; got %s" % (type(ast_node))) - - # Visit decorators - self.last_value = None - deco_names = [] - for deco in ast_node.decorator_list: - self.visit(deco) # capture function name of decorator (self.last_value hack) - deco_node = self.last_value - if isinstance(deco_node, Node): - deco_names.append(deco_node.name) - self.last_value = None - - # Analyze flavor - in_class_ns = self.context_stack[-1].startswith("ClassDef") - if not in_class_ns: - flavor = Flavor.FUNCTION - else: - if "staticmethod" in deco_names: - flavor = Flavor.STATICMETHOD - elif "classmethod" in deco_names: - flavor = Flavor.CLASSMETHOD - else: # instance method - flavor = Flavor.METHOD - - # Get the name representing "self", if applicable. - # - # - ignore static methods - # - ignore functions defined inside methods (this new FunctionDef - # must be directly in a class namespace) - # - if flavor in (Flavor.METHOD, Flavor.CLASSMETHOD): - # We can treat instance methods and class methods the same, - # since Pyan is only interested in object types, not instances. - all_args = ast_node.args # args, vararg (*args), kwonlyargs, kwarg (**kwargs) - posargs = all_args.args - if len(posargs): - self_name = posargs[0].arg - return self_name, flavor - - return None, flavor - def visit_AsyncFunctionDef(self, node): self.visit_FunctionDef(node) # TODO: alias for now; tag async functions in output in a future version? 
def visit_Lambda(self, node): self.logger.debug("Lambda") - def process(): + with ExecuteInInnerScope(self, "lambda"): for d in node.args.defaults: self.visit(d) for d in node.args.kw_defaults: self.visit(d) self.visit(node.body) # single expr - self.with_scope("lambda", process) def visit_Import(self, node): self.logger.debug("Import %s" % [format_alias(x) for x in node.names]) @@ -642,173 +403,6 @@ class CallGraphVisitor(ast.NodeVisitor): # tn = t.__name__ # self.last_value = self.get_node('', tn, node) - def resolve_attribute(self, ast_node): - """Resolve an ast.Attribute. - - Nested attributes (a.b.c) are automatically handled by recursion. - - Return (obj,attrname), where obj is a Node (or None on lookup failure), - and attrname is the attribute name. - - May pass through UnresolvedSuperCallError, if the attribute resolution - failed specifically due to an unresolved super() call. - """ - - if not isinstance(ast_node, ast.Attribute): - raise TypeError("Expected ast.Attribute; got %s" % (type(ast_node))) - - self.logger.debug("Resolve %s.%s in context %s" % (get_ast_node_name(ast_node.value), - ast_node.attr, type(ast_node.ctx))) - - # Resolve nested attributes - # - # In pseudocode, e.g. 
"a.b.c" is represented in the AST as: - # ast.Attribute(attr=c, value=ast.Attribute(attr=b, value=a)) - # - if isinstance(ast_node.value, ast.Attribute): - obj_node,attr_name = self.resolve_attribute(ast_node.value) - - if isinstance(obj_node, Node) and obj_node.namespace is not None: - ns = obj_node.get_name() # fully qualified namespace **of attr** - if ns in self.scopes: # imported modules not in the set of analyzed files are not seen by Pyan - sc = self.scopes[ns] - if attr_name in sc.defs: - self.logger.debug("Resolved to attr %s of %s" % (ast_node.attr, sc.defs[attr_name])) - return sc.defs[attr_name], ast_node.attr - - # It may happen that ast_node.value has no corresponding graph Node, - # if this is a forward-reference, or a reference to a file - # not in the analyzed set. - # - # In this case, return None for the object to let visit_Attribute() - # add a wildcard reference to *.attr. - # - self.logger.debug("Unresolved, returning attr %s of unknown" % (ast_node.attr)) - return None, ast_node.attr - else: - # detect str.join() and similar (attributes of constant literals) - if isinstance(ast_node.value, (ast.Num, ast.Str)): # TODO: other types? - t = type(ast_node.value) - tn = t.__name__ - # Create a namespace-like Node with no associated AST node. - # Constants are builtins, so they should live in the - # top-level namespace (same level as module names). - # - # Since get_node() creates only one node per unique - # (namespace,name) pair, the AST node would anyway be - # frozen to the first constant of any matching type that - # the analyzer encountered in the analyzed source code, - # which is not useful. - # - # The CLASS flavor is the best match, as these constants - # are object types. - # - obj_node = self.get_node('', tn, None, flavor=Flavor.CLASS) - - # attribute of a function call. 
Detect cases like super().dostuff() - elif isinstance(ast_node.value, ast.Call): - # Note that resolve_builtins() will signal an unresolved - # super() by an exception, which we just pass through here. - obj_node = self.resolve_builtins(ast_node.value) - - # can't resolve result of general function call - if not isinstance(obj_node, Node): - self.logger.debug("Unresolved function call as obj, returning attr %s of unknown" % (ast_node.attr)) - return None, ast_node.attr - else: - # Get the Node object corresponding to node.value in the current ns. - # - # (Using the current ns here is correct; this case only gets - # triggered when there are no more levels of recursion, - # and the leftmost name always resides in the current ns.) - obj_node = self.get_value(get_ast_node_name(ast_node.value)) # resolves "self" if needed - - self.logger.debug("Resolved to attr %s of %s" % (ast_node.attr, obj_node)) - return obj_node, ast_node.attr - - def get_attribute(self, ast_node): - """Get value of an ast.Attribute. - - Supports inherited attributes. If the obj's own namespace has no match - for attr, the ancestors of obj are also tried, following the MRO based - on the static type of the object, until one of them matches or until - all ancestors are exhausted. - - Return pair of Node objects (obj,attr), where each item can be None - on lookup failure. (Object not known, or no Node value assigned - to its attr.) - - May pass through UnresolvedSuperCallError. - """ - - if not isinstance(ast_node.ctx, ast.Load): - raise ValueError("Expected a load context, got %s" % (type(ast_node.ctx))) - - obj_node,attr_name = self.resolve_attribute(ast_node) - - if isinstance(obj_node, Node) and obj_node.namespace is not None: - ns = obj_node.get_name() # fully qualified namespace **of attr** - - # detect str.join() and similar (attributes of constant literals) - # - # Any attribute is considered valid for these special types, - # but only in a load context. 
(set_attribute() does not have this - # special handling, by design.) - # - if ns in ("Num", "Str"): # TODO: other types? - return obj_node, self.get_node(ns, attr_name, None, flavor=Flavor.ATTRIBUTE) - - # look up attr_name in the given namespace, return Node or None - def lookup(ns): - if ns in self.scopes: - sc = self.scopes[ns] - if attr_name in sc.defs: - return sc.defs[attr_name] - - # first try directly in object's ns (this works already in pass 1) - value_node = lookup(ns) - if value_node is not None: - return obj_node, value_node - - # next try ns of each ancestor (this works only in pass 2, - # after self.mro has been populated) - # - if obj_node in self.mro: - for base_node in tail(self.mro[obj_node]): # the first element is always obj itself - ns = base_node.get_name() - value_node = lookup(ns) - if value_node is not None: - break - else: - return None, None # not found - return base_node, value_node # as obj, return the base class in which attr was found - - return obj_node, None # here obj_node is either None or unknown (namespace None) - - def set_attribute(self, ast_node, new_value): - """Assign the Node provided as new_value into the attribute described - by the AST node ast_node. Return True if assignment was done, - False otherwise. - - May pass through UnresolvedSuperCallError. 
- """ - - if not isinstance(ast_node.ctx, ast.Store): - raise ValueError("Expected a store context, got %s" % (type(ast_node.ctx))) - - if not isinstance(new_value, Node): - return False - - obj_node,attr_name = self.resolve_attribute(ast_node) - - if isinstance(obj_node, Node) and obj_node.namespace is not None: - ns = obj_node.get_name() # fully qualified namespace **of attr** - if ns in self.scopes: - sc = self.scopes[ns] - sc.defs[attr_name] = new_value - return True - return False - # attribute access (node.ctx determines whether set (ast.Store) or get (ast.Load)) def visit_Attribute(self, node): objname = get_ast_node_name(node.value) @@ -923,58 +517,6 @@ class CallGraphVisitor(ast.NodeVisitor): self.last_value = to_node - def analyze_binding(self, targets, values): - """Generic handler for binding forms. Inputs must be sanitize_exprs()d.""" - - # Before we begin analyzing the assignment, clean up any leftover self.last_value. - # - # (e.g. from any Name in load context (including function names in a Call) - # that did not assign anything.) - # - self.last_value = None - - # TODO: properly support tuple unpacking - # - # - the problem is: - # a,*b,c = [1,2,3,4,5] --> Name,Starred,Name = List - # so a simple analysis of the AST won't get us far here. - # - # To fix this: - # - # - find the index of Starred on the LHS - # - unpack the RHS into a tuple/list (if possible) - # - unpack just one level; the items may be tuples/lists and that's just fine - # - if not possible to unpack directly (e.g. 
enumerate(foo) is a **call**), - # don't try to be too smart; just do some generic fallback handling (or give up) - # - if RHS unpack successful: - # - map the non-starred items directly (one-to-one) - # - map the remaining sublist of the RHS to the Starred term - # - requires support for tuples/lists of AST nodes as values of Nodes - # - but generally, we need that anyway: consider self.a = (f, g, h) - # --> any use of self.a should detect the possible use of f, g, and h; - # currently this is simply ignored. - # - # TODO: support Additional Unpacking Generalizations (Python 3.6+): - # https://www.python.org/dev/peps/pep-0448/ - - if len(targets) == len(values): # handle correctly the most common trivial case "a1,a2,... = b1,b2,..." - captured_values = [] - for value in values: - self.visit(value) # RHS -> set self.last_value - captured_values.append(self.last_value) - self.last_value = None - for tgt,val in zip(targets,captured_values): - self.last_value = val - self.visit(tgt) # LHS, name in a store context - self.last_value = None - else: # FIXME: for now, do the wrong thing in the non-trivial case - # old code, no tuple unpacking support - for value in values: - self.visit(value) # set self.last_value to **something** on the RHS and hope for the best - for tgt in targets: # LHS, name in a store context - self.visit(tgt) - self.last_value = None - def visit_Assign(self, node): # - chaining assignments like "a = b = c" produces multiple targets # - tuple unpacking works as a separate mechanism on top of that (see analyze_binding()) @@ -1031,35 +573,184 @@ class CallGraphVisitor(ast.NodeVisitor): def visit_ListComp(self, node): self.logger.debug("ListComp") - def process(): + with ExecuteInInnerScope(self, "listcomp"): self.visit(node.elt) self.analyze_generators(node.generators) - self.with_scope("listcomp", process) def visit_SetComp(self, node): self.logger.debug("SetComp") - def process(): + with ExecuteInInnerScope(self, "setcomp"): self.visit(node.elt) 
self.analyze_generators(node.generators) - self.with_scope("setcomp", process) def visit_DictComp(self, node): self.logger.debug("DictComp") - def process(): + with ExecuteInInnerScope(self, "dictcomp"): self.visit(node.key) self.visit(node.value) self.analyze_generators(node.generators) - self.with_scope("dictcomp", process) def visit_GeneratorExp(self, node): self.logger.debug("GeneratorExp") - def process(): + with ExecuteInInnerScope(self, "genexpr"): self.visit(node.elt) self.analyze_generators(node.generators) - self.with_scope("genexpr", process) + + def visit_Call(self, node): + self.logger.debug("Call %s" % (get_ast_node_name(node.func))) + + # visit args to detect uses + for arg in node.args: + self.visit(arg) + for kw in node.keywords: + self.visit(kw.value) + + # see if we can predict the result + try: + result_node = self.resolve_builtins(node) + except UnresolvedSuperCallError: + result_node = None + + if isinstance(result_node, Node): + self.last_value = result_node + else: # generic function call + # Visit the function name part last, so that inside a binding form, + # it will be left standing as self.last_value. + self.visit(node.func) + + # If self.last_value matches a known class i.e. the call was of the + # form MyClass(), add a uses edge to MyClass.__init__(). + # + # We need to do this manually, because there is no text "__init__" + # at the call site. + # + # In this lookup to self.class_base_ast_nodes we don't care about + # the AST nodes; the keys just conveniently happen to be the Nodes + # of known classes. 
+ # + if self.last_value in self.class_base_ast_nodes: + from_node = self.get_node_of_current_namespace() + class_node = self.last_value + to_node = self.get_node(class_node.get_name(), '__init__', None, flavor=Flavor.METHOD) + self.logger.debug("Use from %s to %s (call creates an instance)" % (from_node, to_node)) + if self.add_uses_edge(from_node, to_node): + self.logger.info("New edge added for Use from %s to %s (call creates an instance)" % (from_node, to_node)) + + ########################################################################### + # Analysis helpers + + def analyze_functiondef(self, ast_node): + """Analyze a function definition. + + Visit decorators, and if this is a method definition, capture the name + of the first positional argument to denote "self", like Python does. + + Return (self_name, flavor), where self_name the name representing self, + or None if not applicable; and flavor is a Flavor, specifically one of + FUNCTION, METHOD, STATICMETHOD or CLASSMETHOD.""" + + if not isinstance(ast_node, ast.FunctionDef): + raise TypeError("Expected ast.FunctionDef; got %s" % (type(ast_node))) + + # Visit decorators + self.last_value = None + deco_names = [] + for deco in ast_node.decorator_list: + self.visit(deco) # capture function name of decorator (self.last_value hack) + deco_node = self.last_value + if isinstance(deco_node, Node): + deco_names.append(deco_node.name) + self.last_value = None + + # Analyze flavor + in_class_ns = self.context_stack[-1].startswith("ClassDef") + if not in_class_ns: + flavor = Flavor.FUNCTION + else: + if "staticmethod" in deco_names: + flavor = Flavor.STATICMETHOD + elif "classmethod" in deco_names: + flavor = Flavor.CLASSMETHOD + else: # instance method + flavor = Flavor.METHOD + + # Get the name representing "self", if applicable. 
+ # + # - ignore static methods + # - ignore functions defined inside methods (this new FunctionDef + # must be directly in a class namespace) + # + if flavor in (Flavor.METHOD, Flavor.CLASSMETHOD): + # We can treat instance methods and class methods the same, + # since Pyan is only interested in object types, not instances. + all_args = ast_node.args # args, vararg (*args), kwonlyargs, kwarg (**kwargs) + posargs = all_args.args + if len(posargs): + self_name = posargs[0].arg + return self_name, flavor + + return None, flavor + + def analyze_binding(self, targets, values): + """Generic handler for binding forms. Inputs must be sanitize_exprs()d.""" + + # Before we begin analyzing the assignment, clean up any leftover self.last_value. + # + # (e.g. from any Name in load context (including function names in a Call) + # that did not assign anything.) + # + self.last_value = None + + # TODO: properly support tuple unpacking + # + # - the problem is: + # a,*b,c = [1,2,3,4,5] --> Name,Starred,Name = List + # so a simple analysis of the AST won't get us far here. + # + # To fix this: + # + # - find the index of Starred on the LHS + # - unpack the RHS into a tuple/list (if possible) + # - unpack just one level; the items may be tuples/lists and that's just fine + # - if not possible to unpack directly (e.g. enumerate(foo) is a **call**), + # don't try to be too smart; just do some generic fallback handling (or give up) + # - if RHS unpack successful: + # - map the non-starred items directly (one-to-one) + # - map the remaining sublist of the RHS to the Starred term + # - requires support for tuples/lists of AST nodes as values of Nodes + # - but generally, we need that anyway: consider self.a = (f, g, h) + # --> any use of self.a should detect the possible use of f, g, and h; + # currently this is simply ignored. 
+ # + # TODO: support Additional Unpacking Generalizations (Python 3.6+): + # https://www.python.org/dev/peps/pep-0448/ + + if len(targets) == len(values): # handle correctly the most common trivial case "a1,a2,... = b1,b2,..." + captured_values = [] + for value in values: + self.visit(value) # RHS -> set self.last_value + captured_values.append(self.last_value) + self.last_value = None + for tgt,val in zip(targets,captured_values): + self.last_value = val + self.visit(tgt) # LHS, name in a store context + self.last_value = None + else: # FIXME: for now, do the wrong thing in the non-trivial case + # old code, no tuple unpacking support + for value in values: + self.visit(value) # set self.last_value to **something** on the RHS and hope for the best + for tgt in targets: # LHS, name in a store context + self.visit(tgt) + self.last_value = None def analyze_generators(self, generators): - """Analyze the generators in a comprehension form.""" + """Analyze the generators in a comprehension form. + + Analyzes the binding part, and visits the "if" expressions (if any). + + generators: an iterable of ast.comprehension objects + """ + for gen in generators: # TODO: there's also an is_async field we might want to use in a future version. targets = sanitize_exprs(gen.target) @@ -1118,45 +809,89 @@ class CallGraphVisitor(ast.NodeVisitor): raise UnresolvedSuperCallError(msg) # add implementations for other built-in funcnames here if needed - def visit_Call(self, node): - self.logger.debug("Call %s" % (get_ast_node_name(node.func))) + def resolve_attribute(self, ast_node): + """Resolve an ast.Attribute. - # visit args to detect uses - for arg in node.args: - self.visit(arg) - for kw in node.keywords: - self.visit(kw.value) + Nested attributes (a.b.c) are automatically handled by recursion. 
- # see if we can predict the result - try: - result_node = self.resolve_builtins(node) - except UnresolvedSuperCallError: - result_node = None + Return (obj,attrname), where obj is a Node (or None on lookup failure), + and attrname is the attribute name. - if isinstance(result_node, Node): - self.last_value = result_node - else: # generic function call - # Visit the function name part last, so that inside a binding form, - # it will be left standing as self.last_value. - self.visit(node.func) + May pass through UnresolvedSuperCallError, if the attribute resolution + failed specifically due to an unresolved super() call. + """ - # If self.last_value matches a known class i.e. the call was of the - # form MyClass(), add a uses edge to MyClass.__init__(). + if not isinstance(ast_node, ast.Attribute): + raise TypeError("Expected ast.Attribute; got %s" % (type(ast_node))) + + self.logger.debug("Resolve %s.%s in context %s" % (get_ast_node_name(ast_node.value), + ast_node.attr, type(ast_node.ctx))) + + # Resolve nested attributes + # + # In pseudocode, e.g. "a.b.c" is represented in the AST as: + # ast.Attribute(attr=c, value=ast.Attribute(attr=b, value=a)) + # + if isinstance(ast_node.value, ast.Attribute): + obj_node,attr_name = self.resolve_attribute(ast_node.value) + + if isinstance(obj_node, Node) and obj_node.namespace is not None: + ns = obj_node.get_name() # fully qualified namespace **of attr** + if ns in self.scopes: # imported modules not in the set of analyzed files are not seen by Pyan + sc = self.scopes[ns] + if attr_name in sc.defs: + self.logger.debug("Resolved to attr %s of %s" % (ast_node.attr, sc.defs[attr_name])) + return sc.defs[attr_name], ast_node.attr + + # It may happen that ast_node.value has no corresponding graph Node, + # if this is a forward-reference, or a reference to a file + # not in the analyzed set. # - # We need to do this manually, because there is no text "__init__" - # at the call site. 
+ # In this case, return None for the object to let visit_Attribute() + # add a wildcard reference to *.attr. # - # In this lookup to self.class_base_ast_nodes we don't care about - # the AST nodes; the keys just conveniently happen to be the Nodes - # of known classes. - # - if self.last_value in self.class_base_ast_nodes: - from_node = self.get_node_of_current_namespace() - class_node = self.last_value - to_node = self.get_node(class_node.get_name(), '__init__', None, flavor=Flavor.METHOD) - self.logger.debug("Use from %s to %s (call creates an instance)" % (from_node, to_node)) - if self.add_uses_edge(from_node, to_node): - self.logger.info("New edge added for Use from %s to %s (call creates an instance)" % (from_node, to_node)) + self.logger.debug("Unresolved, returning attr %s of unknown" % (ast_node.attr)) + return None, ast_node.attr + else: + # detect str.join() and similar (attributes of constant literals) + if isinstance(ast_node.value, (ast.Num, ast.Str)): # TODO: other types? + t = type(ast_node.value) + tn = t.__name__ + # Create a namespace-like Node with no associated AST node. + # Constants are builtins, so they should live in the + # top-level namespace (same level as module names). + # + # Since get_node() creates only one node per unique + # (namespace,name) pair, the AST node would anyway be + # frozen to the first constant of any matching type that + # the analyzer encountered in the analyzed source code, + # which is not useful. + # + # The CLASS flavor is the best match, as these constants + # are object types. + # + obj_node = self.get_node('', tn, None, flavor=Flavor.CLASS) + + # attribute of a function call. Detect cases like super().dostuff() + elif isinstance(ast_node.value, ast.Call): + # Note that resolve_builtins() will signal an unresolved + # super() by an exception, which we just pass through here. 
+ obj_node = self.resolve_builtins(ast_node.value) + + # can't resolve result of general function call + if not isinstance(obj_node, Node): + self.logger.debug("Unresolved function call as obj, returning attr %s of unknown" % (ast_node.attr)) + return None, ast_node.attr + else: + # Get the Node object corresponding to node.value in the current ns. + # + # (Using the current ns here is correct; this case only gets + # triggered when there are no more levels of recursion, + # and the leftmost name always resides in the current ns.) + obj_node = self.get_value(get_ast_node_name(ast_node.value)) # resolves "self" if needed + + self.logger.debug("Resolved to attr %s of %s" % (ast_node.attr, obj_node)) + return obj_node, ast_node.attr ########################################################################### # Scope analysis @@ -1193,41 +928,6 @@ class CallGraphVisitor(ast.NodeVisitor): self.logger.debug("Scopes now: %s" % (self.scopes)) - def with_scope(self, scopename, thunk): - """Run thunk (0-argument function) with the scope stack augmented with an inner scope. - Used to analyze lambda, listcomp et al. (The scope must still be present in self.scopes.)""" - - # The inner scopes pollute the graph too much; we will need to collapse - # them in postprocessing. However, we must use them here to follow - # the Python 3 scoping rules correctly. - - self.name_stack.append(scopename) - inner_ns = self.get_node_of_current_namespace().get_name() - if inner_ns not in self.scopes: - raise ValueError("Unknown scope '%s'" % (inner_ns)) - self.scope_stack.append(self.scopes[inner_ns]) - self.context_stack.append(scopename) - thunk() - self.context_stack.pop() - self.scope_stack.pop() - self.name_stack.pop() - - # Add a defines edge, which will mark the inner scope as defined, - # allowing any uses to other objects from inside the lambda/listcomp/etc. - # body to be visualized. - # - # All inner scopes of the same scopename (lambda, listcomp, ...) 
in the - # current ns will be grouped into a single node, as they have no name. - # We create a namespace-like node that has no associated AST node, - # as it does not represent any unique AST node. - from_node = self.get_node_of_current_namespace() - ns = from_node.get_name() - to_node = self.get_node(ns, scopename, None, flavor=Flavor.NAMESPACE) - if self.add_defines_edge(from_node, to_node): - self.logger.info("Def from %s to %s %s" % (from_node, scopename, to_node)) - self.last_value = to_node # Make this inner scope node assignable to track its uses. - - def get_current_class(self): """Return the node representing the current class, or None if not inside a class definition.""" return self.class_stack[-1] if len(self.class_stack) else None @@ -1248,8 +948,12 @@ class CallGraphVisitor(ast.NodeVisitor): name = self.name_stack[-1] return self.get_node(namespace, name, None, flavor=Flavor.NAMESPACE) + ########################################################################### + # Value getter and setter + def get_value(self, name): - """Get the value of name in the current scope. Return the Node, or None if name is not set to a value.""" + """Get the value of name in the current scope. 
Return the Node, or None + if name is not set to a value.""" # get the innermost scope that has name **and where name has a value** def find_scope(name): @@ -1257,16 +961,6 @@ class CallGraphVisitor(ast.NodeVisitor): if name in sc.defs and sc.defs[name] is not None: return sc -# # If we wanted to get rid of a separate scope stack, we could do this: -# def find_scope(name): -# ns0 = self.get_node_of_current_namespace().get_name() -# for j in range(ns0.count('.')+1): -# ns = ns0.rsplit(".",j)[0] -# if ns in self.scopes: -# sc = self.scopes[ns] -# if name in sc.defs and sc.defs[name] is not None: -# return sc - sc = find_scope(name) if sc is not None: value = sc.defs[name] @@ -1288,16 +982,6 @@ class CallGraphVisitor(ast.NodeVisitor): if name in sc.defs: return sc -# # If we wanted to get rid of a separate scope stack, we could do this: -# def find_scope(name): -# ns0 = self.get_node_of_current_namespace().get_name() -# for j in range(ns0.count('.')+1): -# ns = ns0.rsplit(".",j)[0] -# if ns in self.scopes: -# sc = self.scopes[ns] -# if name in sc.defs: -# return sc - sc = find_scope(name) if sc is not None: if isinstance(value, Node): @@ -1309,6 +993,96 @@ class CallGraphVisitor(ast.NodeVisitor): else: self.logger.debug('Set: name %s not in scope' % (name)) + ########################################################################### + # Attribute getter and setter + + def get_attribute(self, ast_node): + """Get value of an ast.Attribute. + + Supports inherited attributes. If the obj's own namespace has no match + for attr, the ancestors of obj are also tried, following the MRO based + on the static type of the object, until one of them matches or until + all ancestors are exhausted. + + Return pair of Node objects (obj,attr), where each item can be None + on lookup failure. (Object not known, or no Node value assigned + to its attr.) + + May pass through UnresolvedSuperCallError. 
+ """ + + if not isinstance(ast_node, ast.Attribute): + raise TypeError("Expected ast.Attribute; got %s" % (type(ast_node))) + if not isinstance(ast_node.ctx, ast.Load): + raise ValueError("Expected a load context, got %s" % (type(ast_node.ctx))) + + obj_node,attr_name = self.resolve_attribute(ast_node) + + if isinstance(obj_node, Node) and obj_node.namespace is not None: + ns = obj_node.get_name() # fully qualified namespace **of attr** + + # detect str.join() and similar (attributes of constant literals) + # + # Any attribute is considered valid for these special types, + # but only in a load context. (set_attribute() does not have this + # special handling, by design.) + # + if ns in ("Num", "Str"): # TODO: other types? + return obj_node, self.get_node(ns, attr_name, None, flavor=Flavor.ATTRIBUTE) + + # look up attr_name in the given namespace, return Node or None + def lookup(ns): + if ns in self.scopes: + sc = self.scopes[ns] + if attr_name in sc.defs: + return sc.defs[attr_name] + + # first try directly in object's ns (this works already in pass 1) + value_node = lookup(ns) + if value_node is not None: + return obj_node, value_node + + # next try ns of each ancestor (this works only in pass 2, + # after self.mro has been populated) + # + if obj_node in self.mro: + for base_node in tail(self.mro[obj_node]): # the first element is always obj itself + ns = base_node.get_name() + value_node = lookup(ns) + if value_node is not None: + break + else: + return None, None # not found + return base_node, value_node # as obj, return the base class in which attr was found + + return obj_node, None # here obj_node is either None or unknown (namespace None) + + def set_attribute(self, ast_node, new_value): + """Assign the Node provided as new_value into the attribute described + by the AST node ast_node. Return True if assignment was done, + False otherwise. + + May pass through UnresolvedSuperCallError. 
+ """ + + if not isinstance(ast_node, ast.Attribute): + raise TypeError("Expected ast.Attribute; got %s" % (type(ast_node))) + if not isinstance(ast_node.ctx, ast.Store): + raise ValueError("Expected a store context, got %s" % (type(ast_node.ctx))) + + if not isinstance(new_value, Node): + return False + + obj_node,attr_name = self.resolve_attribute(ast_node) + + if isinstance(obj_node, Node) and obj_node.namespace is not None: + ns = obj_node.get_name() # fully qualified namespace **of attr** + if ns in self.scopes: + sc = self.scopes[ns] + sc.defs[attr_name] = new_value + return True + return False + ########################################################################### # Graph creation @@ -1332,8 +1106,8 @@ class CallGraphVisitor(ast.NodeVisitor): !!! In CallGraphVisitor, always use get_node() to create nodes, because it - also sets some auxiliary information. Do not call the Node constructor - directly. + also sets some important auxiliary information. Do not call the Node + constructor directly. !!! """ @@ -1342,7 +1116,6 @@ class CallGraphVisitor(ast.NodeVisitor): if n.namespace == namespace: if Flavor.specificity(flavor) > Flavor.specificity(n.flavor): n.flavor = flavor - return n # Try to figure out which source file this Node belongs to @@ -1373,12 +1146,12 @@ class CallGraphVisitor(ast.NodeVisitor): return n - def get_parent_node(self, node): + def get_parent_node(self, graph_node): """Get the parent node of the given Node. (Used in postprocessing.)""" - if '.' in node.namespace: - ns,name = node.namespace.rsplit('.', 1) + if '.' 
in graph_node.namespace: + ns,name = graph_node.namespace.rsplit('.', 1) else: - ns,name = '',node.namespace + ns,name = '',graph_node.namespace return self.get_node(ns, name, None) def associate_node(self, graph_node, ast_node, filename=None): diff --git a/pyan/anutils.py b/pyan/anutils.py new file mode 100644 index 0000000..55f31cc --- /dev/null +++ b/pyan/anutils.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Utilities for analyzer.""" + +import os.path +import ast +from .node import Flavor + +def head(lst): + if len(lst): + return lst[0] + +def tail(lst): + if len(lst) > 1: + return lst[1:] + else: + return [] + +def get_module_name(filename): + """Try to determine the full module name of a source file, by figuring out + if its directory looks like a package (i.e. has an __init__.py file).""" + + if os.path.basename(filename) == '__init__.py': + return get_module_name(os.path.dirname(filename)) + + init_path = os.path.join(os.path.dirname(filename), '__init__.py') + mod_name = os.path.basename(filename).replace('.py', '') + + if not os.path.exists(init_path): + return mod_name + + if not os.path.dirname(filename): + return mod_name + + return get_module_name(os.path.dirname(filename)) + '.' + mod_name + +def format_alias(x): + """Return human-readable description of an ast.alias (used in Import and ImportFrom nodes).""" + if not isinstance(x, ast.alias): + raise TypeError("Can only format an ast.alias; got %s" % type(x)) + + if x.asname is not None: + return "%s as %s" % (x.name, x.asname) + else: + return "%s" % (x.name) + +def get_ast_node_name(x): + """Return human-readable name of ast.Attribute or ast.Name. Pass through anything else.""" + if isinstance(x, ast.Attribute): + # x.value might also be an ast.Attribute (think "x.y.z") + return "%s.%s" % (get_ast_node_name(x.value), x.attr) + elif isinstance(x, ast.Name): + return x.id + else: + return x + +# Helper for handling binding forms. 
+def sanitize_exprs(exprs): + """Convert ast.Tuples in exprs to Python tuples; wrap result in a Python tuple.""" + def process(expr): + if isinstance(expr, (ast.Tuple, ast.List)): + return expr.elts # .elts is a Python tuple + else: + return [expr] + if isinstance(exprs, (tuple, list)): + return [process(expr) for expr in exprs] + else: + return process(exprs) + +def resolve_method_resolution_order(class_base_nodes, logger): + """Compute the method resolution order (MRO) for each of the analyzed classes. + + class_base_nodes: dict cls: [base1, base2, ..., baseN] + where dict and basej are all Node objects. + """ + + # https://en.wikipedia.org/wiki/C3_linearization#Description + + class LinearizationImpossible(Exception): + pass + + from functools import reduce + from operator import add + def C3_find_good_head(heads, tails): # find an element of heads which is not in any of the tails + flat_tails = reduce(add, tails, []) # flatten the outer level + for hd in heads: + if hd not in flat_tails: + break + else: # no break only if there are cyclic dependencies. + raise LinearizationImpossible("MRO linearization impossible; cyclic dependency detected. 
heads: %s, tails: %s" % (heads, tails)) + return hd + + def remove_all(elt, lst): # remove all occurrences of elt from lst, return a copy + return [x for x in lst if x != elt] + def remove_all_in(elt, lists): # remove elt from all lists, return a copy + return [remove_all(elt, lst) for lst in lists] + + def C3_merge(lists): + out = [] + while True: + logger.debug("MRO: C3 merge: out: %s, lists: %s" % (out, lists)) + heads = [head(lst) for lst in lists if head(lst) is not None] + if not len(heads): + break + tails = [tail(lst) for lst in lists] + logger.debug("MRO: C3 merge: heads: %s, tails: %s" % (heads, tails)) + hd = C3_find_good_head(heads, tails) + logger.debug("MRO: C3 merge: chose head %s" % (hd)) + out.append(hd) + lists = remove_all_in(hd, lists) + return out + + mro = {} # result + try: + memo = {} # caching/memoization + def C3_linearize(node): + logger.debug("MRO: C3 linearizing %s" % (node)) + seen.add(node) + if node not in memo: + # unknown class or no ancestors + if node not in class_base_nodes or not len(class_base_nodes[node]): + memo[node] = [node] + else: # known and has ancestors + lists = [] + # linearization of parents... 
+ for baseclass_node in class_base_nodes[node]: + if baseclass_node not in seen: + lists.append(C3_linearize(baseclass_node)) + # ...and the parents themselves (in the order they appear in the ClassDef) + logger.debug("MRO: parents of %s: %s" % (node, class_base_nodes[node])) + lists.append(class_base_nodes[node]) + logger.debug("MRO: C3 merging %s" % (lists)) + memo[node] = [node] + C3_merge(lists) + logger.debug("MRO: C3 linearized %s, result %s" % (node, memo[node])) + return memo[node] + for node in class_base_nodes: + logger.debug("MRO: analyzing class %s" % (node)) + seen = set() # break cycles (separately for each class we start from) + mro[node] = C3_linearize(node) + except LinearizationImpossible as e: + logger.error(e) + + # generic fallback: depth-first search of lists of ancestors + # + # (so that we can try to draw *something* if the code to be + # analyzed is so badly formed that the MRO algorithm fails) + + memo = {} # caching/memoization + def lookup_bases_recursive(node): + seen.add(node) + if node not in memo: + out = [node] # first look up in obj itself... + if node in class_base_nodes: # known class? 
+ for baseclass_node in class_base_nodes[node]: # ...then in its bases + if baseclass_node not in seen: + out.append(baseclass_node) + out.extend(lookup_bases_recursive(baseclass_node)) + memo[node] = out + return memo[node] + + mro = {} + for node in class_base_nodes: + logger.debug("MRO: generic fallback: analyzing class %s" % (node)) + seen = set() # break cycles (separately for each class we start from) + mro[node] = lookup_bases_recursive(node) + + return mro + +class UnresolvedSuperCallError(Exception): + """For specifically signaling an unresolved super().""" + pass + +class Scope: + """Adaptor that makes scopes look somewhat like those from the Python 2 + compiler module, as far as Pyan's CallGraphVisitor is concerned.""" + + def __init__(self, table): + """table: SymTable instance from symtable.symtable()""" + name = table.get_name() + if name == 'top': + name = '' # Pyan defines the top level as anonymous + self.name = name + self.type = table.get_type() # useful for __repr__() + self.defs = {iden:None for iden in table.get_identifiers()} # name:assigned_value + + def __repr__(self): + return "<Scope: %s %s>" % (self.type, self.name) + +# A context manager, sort of a friend of CallGraphVisitor (depends on implementation details) +class ExecuteInInnerScope: + """Execute a code block with the scope stack augmented with an inner scope. + + Used to analyze lambda, listcomp et al. The scope must still be present in + analyzer.scopes. + + !!! + Will add a defines edge from the current namespace to the inner scope, + marking both nodes as defined. + !!! + """ + + def __init__(self, analyzer, scopename): + """analyzer: CallGraphVisitor instance + scopename: name of the inner scope""" + self.analyzer = analyzer + self.scopename = scopename + + def __enter__(self): + # The inner scopes pollute the graph too much; we will need to collapse + # them in postprocessing. However, we must use them during analysis to + # follow the Python 3 scoping rules correctly. 
+ + analyzer = self.analyzer + scopename = self.scopename + + analyzer.name_stack.append(scopename) + inner_ns = analyzer.get_node_of_current_namespace().get_name() + if inner_ns not in analyzer.scopes: + analyzer.name_stack.pop() + raise ValueError("Unknown scope '%s'" % (inner_ns)) + analyzer.scope_stack.append(analyzer.scopes[inner_ns]) + analyzer.context_stack.append(scopename) + + def __exit__(self, errtype, errvalue, traceback): + # TODO: do we need some error handling here? + analyzer = self.analyzer + scopename = self.scopename + + analyzer.context_stack.pop() + analyzer.scope_stack.pop() + analyzer.name_stack.pop() + + # Add a defines edge, which will mark the inner scope as defined, + # allowing any uses to other objects from inside the lambda/listcomp/etc. + # body to be visualized. + # + # All inner scopes of the same scopename (lambda, listcomp, ...) in the + # current ns will be grouped into a single node, as they have no name. + # We create a namespace-like node that has no associated AST node, + # as it does not represent any unique AST node. + from_node = analyzer.get_node_of_current_namespace() + ns = from_node.get_name() + to_node = analyzer.get_node(ns, scopename, None, flavor=Flavor.NAMESPACE) + if analyzer.add_defines_edge(from_node, to_node): + analyzer.logger.info("Def from %s to %s %s" % (from_node, scopename, to_node)) + analyzer.last_value = to_node # Make this inner scope node assignable to track its uses. 
From d119a9b777561581d9d32a03f1fa6c9245cb91a7 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 02:06:30 +0200 Subject: [PATCH 21/29] oops, convention breakage: __enter__ should return self --- pyan/anutils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyan/anutils.py b/pyan/anutils.py index 55f31cc..47659f5 100644 --- a/pyan/anutils.py +++ b/pyan/anutils.py @@ -221,6 +221,8 @@ class ExecuteInInnerScope: analyzer.scope_stack.append(analyzer.scopes[inner_ns]) analyzer.context_stack.append(scopename) + return self + def __exit__(self, errtype, errvalue, traceback): # TODO: do we need some error handling here? analyzer = self.analyzer From 3d5cfe841538c6de30210bb687881d71a31a3eef Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 02:23:00 +0200 Subject: [PATCH 22/29] enh: analyzer now understands 'with' (binding analysis currently restricted to one target only, __enter__ is assumed to return self) --- README.md | 2 +- pyan/analyzer.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0d10516..63ba5ad 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ Currently Pyan always operates at the level of individual functions and methods; - MRO is (statically) respected in looking up inherited attributes and `super()` ☆ - Assignment tracking with lexical scoping - E.g. if `self.a = MyFancyClass()`, the analyzer knows that any references to `self.a` point to `MyFancyClass` - - All binding forms are supported (assign, augassign, for, comprehensions, generator expressions) ☆ + - All binding forms are supported (assign, augassign, for, comprehensions, generator expressions, with) ☆ - Name clashes between `for` loop counter variables and functions or classes defined elsewhere no longer confuse Pyan. - `self` is defined by capturing the name of the first argument of a method definition, like Python does. 
☆ - Simple item-by-item tuple assignments like `x,y,z = a,b,c` ☆ diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 20b47ed..21e0c52 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -636,6 +636,49 @@ class CallGraphVisitor(ast.NodeVisitor): if self.add_uses_edge(from_node, to_node): self.logger.info("New edge added for Use from %s to %s (call creates an instance)" % (from_node, to_node)) + def visit_With(self, node): + self.logger.debug("With (context manager)") + + def add_uses_enter_exit_of(graph_node): + # add uses edges to __enter__ and __exit__ methods of given Node + if isinstance(graph_node, Node): + from_node = self.get_node_of_current_namespace() + withed_obj_node = graph_node + + self.logger.debug("Use from %s to With %s" % (from_node, withed_obj_node)) + for methodname in ('__enter__', '__exit__'): + to_node = self.get_node(withed_obj_node.get_name(), methodname, None, flavor=Flavor.METHOD) + if self.add_uses_edge(from_node, to_node): + self.logger.info("New edge added for Use from %s to %s" % (from_node, to_node)) + + for withitem in node.items: + expr = withitem.context_expr + vars = withitem.optional_vars + + # XXX: we currently visit expr twice (again in analyze_binding()) if vars is not None + self.last_value = None + self.visit(expr) + add_uses_enter_exit_of(self.last_value) + self.last_value = None + + if vars is not None: + # bind optional_vars + # + # TODO: For now, we support only the following (most common) case: + # - only one binding target, vars is ast.Name + # (not ast.Tuple or something else) + # - the variable will point to the object that was with'd + # (i.e. 
we assume the object's __enter__() method + # to finish with "return self") + # + if isinstance(vars, ast.Name): + self.analyze_binding(sanitize_exprs(vars), sanitize_exprs(expr)) + else: + self.visit(vars) # just capture any uses on the With line itself + + for stmt in node.body: + self.visit(stmt) + ########################################################################### # Analysis helpers From 2d6e54b5a68fa8d7294435e44e0e0899ab252d59 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 13:03:38 +0200 Subject: [PATCH 23/29] bump version to 1.0.1 --- pyan/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyan/__init__.py b/pyan/__init__.py index f94fd76..43a2c10 100644 --- a/pyan/__init__.py +++ b/pyan/__init__.py @@ -3,4 +3,5 @@ from .main import main -__version__ = "1.0.0" +__version__ = "1.0.1" + From 416a14c4f3a7cee47d9c4fbaefd121c0eda0640c Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 15:12:37 +0200 Subject: [PATCH 24/29] enh: analyze str() and repr() if the argument is a Name or Attribute mapping to a known Node --- pyan/analyzer.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 21e0c52..0afa077 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -807,8 +807,12 @@ class CallGraphVisitor(ast.NodeVisitor): """Resolve those calls to built-in functions whose return values can be determined in a simple manner. - Currently, this supports only super(), which works only in pass 2, - because the MRO is determined between passes. + Currently, this supports: + + - str(obj), repr(obj) --> obj.__str__, obj.__repr__ + + - super() (any arguments ignored), which works only in pass 2, + because the MRO is determined between passes. 
May raise UnresolvedSuperCallError, if the call is to super(), but the result cannot be (currently) determined (usually because either @@ -850,6 +854,19 @@ class CallGraphVisitor(ast.NodeVisitor): msg = "super called for %s, but MRO not determined for it (maybe still in pass 1?)" % (class_node) self.logger.info(msg) raise UnresolvedSuperCallError(msg) + + if funcname in ("str", "repr"): + if len(ast_node.args) == 1: # these take only one argument + obj_astnode = ast_node.args[0] + if isinstance(obj_astnode, (ast.Name, ast.Attribute)): + self.logger.debug("Resolving %s() of %s" % (funcname, get_ast_node_name(obj_astnode))) + attrname = "__%s__" % (funcname) + # build a temporary ast.Attribute AST node so that we can use get_attribute() + tmp_astnode = ast.Attribute(value=obj_astnode, attr=attrname, ctx=obj_astnode.ctx) + obj_node, attr_node = self.get_attribute(tmp_astnode) + self.logger.debug("Resolve %s() of %s: returning attr node %s" % (funcname, get_ast_node_name(obj_astnode), attr_node)) + return attr_node + # add implementations for other built-in funcnames here if needed def resolve_attribute(self, ast_node): From 990c4864aa13b98779bbdc25d125c22709895f8f Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 15:13:23 +0200 Subject: [PATCH 25/29] fix regression: keep wildcard if the 'resolved' target is actually an unresolved function argument --- pyan/analyzer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 0afa077..774b244 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -1309,6 +1309,11 @@ class CallGraphVisitor(ast.NodeVisitor): if from_node not in self.uses_edges: # no uses edges to remove return + # Keep wildcard if the target is actually an unresolved argument + # (see visit_FunctionDef()) + if to_node.get_name().find("^^^argument^^^") != -1: + return + # Here we may prefer to err in one of two ways: # # a) A node seemingly referring to itself is actually referring From 
4d399ebf7340f5196ebe31bb1f5c5dc381092acf Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 15:15:38 +0200 Subject: [PATCH 26/29] enh: add --grouped-alt (-G) to make invisible defines edges to suggest grouping for GraphViz, without actually placing the nodes into GraphViz clusters. Overrides --no-defines. Overridden by --defines. --- pyan/main.py | 4 ++++ pyan/visgraph.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pyan/main.py b/pyan/main.py index 9c48fdd..e78e17c 100644 --- a/pyan/main.py +++ b/pyan/main.py @@ -57,6 +57,9 @@ def main(): parser.add_option("-c", "--colored", action="store_true", default=False, dest="colored", help="color nodes according to namespace [dot only]") + parser.add_option("-G", "--grouped-alt", + action="store_true", default=False, dest="grouped_alt", + help="suggest grouping by adding invisible defines edges [only useful with --no-defines]") parser.add_option("-g", "--grouped", action="store_true", default=False, dest="grouped", help="group nodes (create subgraphs) according to namespace [dot only]") @@ -85,6 +88,7 @@ def main(): 'draw_defines': options.draw_defines, 'draw_uses': options.draw_uses, 'colored': options.colored, + 'grouped_alt' : options.grouped_alt, 'grouped': options.grouped, 'nested_groups': options.nested_groups, 'annotated': options.annotated} diff --git a/pyan/visgraph.py b/pyan/visgraph.py index 3872843..5b27086 100644 --- a/pyan/visgraph.py +++ b/pyan/visgraph.py @@ -127,6 +127,7 @@ class VisualGraph(object): def from_visitor(cls, visitor, options=None, logger=None): colored = options.get('colored', False) nested = options.get('nested_groups', False) + grouped_alt = options.get('grouped_alt', False) grouped = nested or options.get('grouped', False) # nested -> grouped annotated = options.get('annotated', False) draw_defines = options.get('draw_defines', False) @@ -221,7 +222,7 @@ class VisualGraph(object): subgraph.nodes.append(visual_node) # Now add edges - if draw_defines 
or not grouped: + if draw_defines or grouped_alt: # If grouped, use gray lines so they won't visually obstruct # the "uses" lines. # From 2b8b9e40da3b244b1252fcde9258bfa60d2573e5 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Tue, 21 Nov 2017 15:38:13 +0200 Subject: [PATCH 27/29] enh: add uses edge to result from a resolved call to built-ins (important for str() and repr()) --- pyan/analyzer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pyan/analyzer.py b/pyan/analyzer.py index 774b244..0b8baad 100644 --- a/pyan/analyzer.py +++ b/pyan/analyzer.py @@ -611,8 +611,15 @@ class CallGraphVisitor(ast.NodeVisitor): except UnresolvedSuperCallError: result_node = None - if isinstance(result_node, Node): + if isinstance(result_node, Node): # resolved result self.last_value = result_node + + from_node = self.get_node_of_current_namespace() + to_node = result_node + self.logger.debug("Use from %s to %s (via resolved call to built-ins)" % (from_node, to_node)) + if self.add_uses_edge(from_node, to_node): + self.logger.info("New edge added for Use from %s to %s (via resolved call to built-ins)" % (from_node, to_node)) + else: # generic function call # Visit the function name part last, so that inside a binding form, # it will be left standing as self.last_value. 
From f99c60e870d3c7cdd27bf226abd7b15586969105 Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Wed, 3 Jan 2018 16:02:40 +0200 Subject: [PATCH 28/29] bump version to 1.0.2 --- pyan/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyan/__init__.py b/pyan/__init__.py index 43a2c10..3a589b8 100644 --- a/pyan/__init__.py +++ b/pyan/__init__.py @@ -3,5 +3,5 @@ from .main import main -__version__ = "1.0.1" +__version__ = "1.0.2" From 410d863717959d2db6ec6721a8951422c3ccc0bd Mon Sep 17 00:00:00 2001 From: Juha Jeronen Date: Wed, 3 Jan 2018 16:02:55 +0200 Subject: [PATCH 29/29] remove extra blank line in __init__.py --- pyan/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyan/__init__.py b/pyan/__init__.py index 3a589b8..d99f4f6 100644 --- a/pyan/__init__.py +++ b/pyan/__init__.py @@ -4,4 +4,3 @@ from .main import main __version__ = "1.0.2" -