"""A rudimentary Python source code scanner.

A regular expression based Python source code scanner. The `scan`
function runs through a file line by line and collects bits of
information not provided by object introspection.

Synopsis
--------

Consider the following source file (``example.py``):

>>> foo = 'FOO'
>>> def bar_function():
...     print foo
>>> class Bling:
...     foo = 'Bling\'s FOO'
...     def bar_method(self):
...         print self.foo

The `scan` function returns a `Token` instance representing the file
that was scanned:

>>> import pudge.scanner as scanner
>>> file_tok = scanner.scan('example.py')
>>> (file_tok.type, file_tok.name)
('file', 'example.py')

Traverse the token tree using `Token.find`:

>>> tok = file_tok.find('foo')
>>> (tok.type, tok.name, tok.line, tok.last_line)
('=', 'foo', 1, 2)

Line numbers are one piece of information not available via introspection.

You can traverse multiple levels of depth using dot notation:

>>> tok = file_tok.find('Bling.bar_method')
>>> (tok.type, tok.name, tok.line, tok.last_line)
('def', 'bar_method', 6, 8)

`Token` instances can be treated like dictionaries for syntactic
pleasure; this is just like calling find:

>>> tok = file_tok['Bling']
>>> (tok.type, tok.name, tok.children)
('class', 'Bling', [<Token('=', 'foo')>, <Token('def', 'bar_method')>])

Note also that the `Token.children` attribute contains a ``list`` containing
the immediate children of the token. This provides a source level order of
tokens which is not available via introspection.

"""
0054
0055import re
0056
0057try:
0058 from cStringIO import StringIO
0059except ImportError:
0060 from StringIO import StringIO
0061
class Token(object):
    """A Python syntax token.

    This class provides access to information about a named python object.
    Token objects are arranged into a hierarchy that *should* look exactly
    like the introspection object hierarchy.

    Token objects have six important attributes:

    * ``type`` - The token's type.
      This will be one of the following string values:

      * ``'file'`` - The token is a root file token. The `name` attribute
        contains the name of the file.
      * ``'def'`` - The token describes a function or method.
      * ``'class'`` - The token describes a class.
      * ``'='`` - The token describes an attribute

    * ``name`` - The name of the file, class, function, or attribute

    * ``indent`` - The indent level as an integer starting at ``0``.

    * ``line`` - The line number that the token appears on.

    * ``last_line`` - The line at which the token is no longer 'on the stack'

    * ``children`` - ``list`` of child tokens.

    """

    def __init__(self, type, name, line, indent):
        """Create a token with no children and an unset ``last_line``.

        Note the argument order is ``(type, name, line, indent)``.
        """
        self.type = type
        self.name = name
        self.indent = indent
        self.line = line
        self.last_line = None
        self.children = []

    def tuplize(self):
        """Return this token and its descendants as nested tuples.

        The layout is ``(type, name, indent, line, last_line, children)``
        where ``children`` is a list of tuples of the same shape.
        """
        return (self.type, self.name, self.indent, self.line, self.last_line,
                [ch.tuplize() for ch in self.children])

    def find(self, name):
        """Return the descendant token called `name`, or ``None``.

        `name` may use dot notation (``'Bling.bar_method'``) to descend
        several levels. Only the first child whose name matches a path
        component is followed.
        """
        components = name.split('.', 1)
        this = components[0]
        for c in self.children:
            if c.name == this:
                if len(components) > 1:
                    return c.find(components[1])
                return c
        return None

    def for_line(self, line):
        """Return the first direct child that starts on `line`, or ``None``."""
        for c in self.children:
            if c.line == line:
                return c
        return None

    def __getitem__(self, name):
        """Like `find`, but raise ``KeyError`` when nothing matches."""
        rslt = self.find(name)
        # Compare against None explicitly: `find` signals "missing" with
        # None, and a token must never be mistaken for a miss.
        if rslt is None:
            raise KeyError(name)
        return rslt

    def __iter__(self):
        """Iterate over the direct children in source order."""
        return iter(self.children)

    def __repr__(self):
        return '<Token(%r, %r)>' % (self.type, self.name)
0134
# Ordered ``(token type, compiled pattern)`` pairs tried against each
# candidate source line. Group 1 of every pattern captures the name being
# defined or assigned.
token_patterns = [
    ('def', re.compile(r'[ ]*def\s+([A-Za-z0-9_]+)\s*\(')),
    ('class', re.compile(r'[ ]*class\s+([A-Za-z0-9_]+)')),
    ('=', re.compile(r'[ ]*([A-Za-z0-9_]+)\s*=.*')),
]

# Captures the run of leading spaces on a line (tabs are not counted).
space = re.compile(r'([ ]*)')

# ``filename -> Token`` memo used by `scan`.
_cache = {}
0141
def empty_cache():
    """Discard every memoized ``filename -> Token`` entry.

    Worth calling now and then when `scan` is used with its cache
    argument, so the cached token trees can be garbage collected. The
    module-level cache is rebound to a brand new dictionary.
    """
    global _cache
    _cache = dict()
0151
def scan(filename, file=None, cache=0):
    """Scan a file and return collected bits

    `filename` is the name of the file to scan. If `file` is specified,
    it is a file object that responds to the ``readline`` method and is
    read instead of opening `filename`; it is left open for the caller.
    A file opened by this function is always closed before returning.

    When a truthy `cache` argument is provided, the result is memoized
    under `filename`. A result already stored for `filename` is returned
    whether or not `cache` is given for this call. *The cache is not
    thread safe.*

    A single `Token` object is returned that represents the root of the
    tree. The `Token`'s type will be ``'file'``
    """
    # `in` instead of dict.has_key(), which no longer exists on Python 3.
    if filename in _cache:
        return _cache[filename]

    opened_here = file is None
    if opened_here:
        file = open(filename, 'r')
    try:
        current = top = Token('file', filename, 0, -1)
        parents = []  # stack of enclosing tokens; `current` is not on it
        pos = 0       # 1-based number of the line most recently read
        while True:
            line = file.readline()
            if not line:
                break
            pos += 1
            stripped = line.strip()
            if stripped == '' or stripped.startswith('#'):
                # Blank and comment-only lines never open or close a scope.
                continue
            # Floor division keeps the indent level an integer on both
            # Python 2 and 3; one level is four leading spaces.
            indent = len(space.match(line).group(1)) // 4
            # Leaving one or more scopes: stamp each with the line that
            # ended it and step back out to its parent.
            while indent <= current.indent:
                current.last_line = pos
                current = parents.pop()
            # Only names defined directly in a file or class body are
            # recorded; the contents of functions are skipped.
            if (current.type in ('class', 'file')
                    and indent == current.indent + 1):
                for (t, p) in token_patterns:
                    m = p.match(line)
                    if m:
                        parents.append(current)
                        current = Token(t, m.group(1), pos, indent)
                        parents[-1].children.append(current)
                        # One token per line; stop at the first match.
                        break
    finally:
        if opened_here:
            file.close()

    # Everything still open at end of file ends on the last line read,
    # including the innermost token, which is not on the `parents` stack.
    for tok in [top] + parents + [current]:
        tok.last_line = pos
    if cache:
        _cache[filename] = top
    return top
0196
# Names exported by ``from pudge.scanner import *``.
__all__ = ['scan', 'Token', 'empty_cache', 'token_patterns']


# Module metadata. The ``$...$`` values look like Subversion keyword
# expansions filled in by the repository, not hand-edited.
__author__ = "Ryan Tomayko <rtomayko@gmail.com>"
__date__ = "$Date: 2005-05-25 23:16:24 -0400 (Wed, 25 May 2005) $"
__revision__ = "$Revision: 35 $"
__url__ = "$URL: svn://lesscode.org/pudge/trunk/pudge/scanner.py $"
__copyright__ = "Copyright 2005, Ryan Tomayko"
__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"