w4py-olde-docs/DocSupport/PyFontify.py at master · WebwareForPython/w4py-olde-docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""Module to analyze Python source code; for syntax coloring tools.

Interface:
    tags = fontify(pytext, searchfrom, searchto)

The 'pytext' argument is a string containing Python source code.
The (optional) arguments 'searchfrom' and 'searchto' may contain a slice in pytext.
The returned value is a list of tuples, formatted like this:
    [('keyword', 0, 6, None),
     ('keyword', 11, 17, None),
     ('comment', 23, 53, None), ...]
The tuple contents are always like this:
    (tag, startindex, endindex, sublist)
tag is one of ('comment', 'string', 'keyword', 'function', 'class')
sublist is not used, hence always None.

"""

# Based on FontText.py by Mitchell S. Chapman,
# which was modified by Zachary Roadhouse,
# then un-Tk'd by Just van Rossum.
# Many thanks for regular expression debugging & authoring are due to:
#    Tim (the-incredib-ly y'rs) Peters and Christian Tismer
# So, who owns the copyright? ;-) How about this:
# Copyright 1996-1997:
#    Mitchell S. Chapman,
#    Zachary Roadhouse,
#    Tim Peters,
#    Just van Rossum
#
# Version 0.4 - changes copyright (c) 2001 Mark Pilgrim (f8dy@diveintopython.org)
#   2001/02/05 - MAP - distinguish between class and function identifiers
#   2001/03/21 - MAP - get keywords from keyword module (instead of hard-coded list)
#   2001/03/22 - MAP - use re module instead of deprecated regex module
#
# 2005/09/09 - deprecated string functions removed by Christoph Zwerschke

__version__ = "0.4"

import re, keyword

# Build up a regular expression which will match anything interesting,
# including multi-line triple-quoted strings.
#
# All patterns are raw strings so that every backslash below is seen by the
# re module exactly as written.
commentPat = r"#.*"

# Single-line string literal; 'q' is a placeholder for the quote character.
# The "ordinary character" classes exclude the backslash, so a backslash can
# only be consumed together with the character it escapes.  Without that
# exclusion an escaped quote (as in 'a\'b') is taken for the closing quote,
# and the overlap between the two sub-patterns makes matching ambiguous.
# [\s\S] stands for "any character at all, newline included".
pat = r"q[^\\q\n]*(\\[\s\S][^\\q\n]*)*q"
quotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Way to go, Tim!
# Triple-quoted string literal: after the opening quotes, any run of
# ordinary characters, escaped characters, or one or two quotes that are
# followed by something other than a quote, up to the closing triple quote.
pat = r"""
qqq
[^\\q]*
(
    (   \\[\s\S]
    |   q
        (   \\[\s\S]
        |   [^\\q]
        |   q
            (   \\[\s\S]
            |   [^\\q]
            )
        )
    )
    [^\\q]*
)*
qqq
"""
pat = ''.join(pat.split())  # get rid of whitespace
tripleQuotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# Build up a regular expression which matches all and only Python keywords.
# This will let us skip the uninteresting identifier references.
# nonKeyPat identifies characters which may legally precede a keyword pattern.
nonKeyPat = r"""(^|[^a-zA-Z0-9_."'])"""
keywordsPat = '|'.join(keyword.kwlist)
# Groups of keyPat: 1 = preceding character (or empty at the start of the
# text), 2 = the keyword itself, 3 = following character.
keyPat = nonKeyPat + "(" + keywordsPat + ")" + nonKeyPat

# commentPat goes first: '#' is a legal character before a keyword, so with
# keyPat first a comment such as "#if 0" would match as the keyword "if"
# and the rest of the comment would be scanned as code.  commentPat has no
# groups, so groups 1-3 of matchRE are still the three groups of keyPat.
matchPat = commentPat + "|" + keyPat + "|" + tripleQuotePat + "|" + quotePat
matchRE = re.compile(matchPat)

idKeyPat = r"[ \t]*[A-Za-z_][A-Za-z_0-9.]*"  # ident with leading whitespace
idRE = re.compile(idKeyPat)


def fontify(pytext, searchfrom=0, searchto=None):
    """Return a list of syntax-coloring tags for Python source text.

    pytext is the source code as a string.  Only matches lying between the
    indexes searchfrom and searchto are reported; searchto defaults to the
    end of pytext.

    The result is a list of (tag, startindex, endindex, None) tuples in
    order of appearance, where tag is one of 'comment', 'string',
    'keyword', 'function' or 'class'.  A 'function' or 'class' tag covers
    the identifier following a 'def' or 'class' keyword, including the
    blanks and tabs in front of it (that is what idRE matches).

    A keyword is only recognized when it is followed by another character
    inside the searched range, because the keyword pattern requires one.
    """
    if searchto is None:
        searchto = len(pytext)
    tags = []
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    functionTag = 'function'
    classTag = 'class'

    pos = searchfrom
    while True:
        # Passing searchto as the end position keeps every match, and hence
        # every tag, inside the requested slice.
        matchObject = matchRE.search(pytext, pos, searchto)
        if not matchObject:
            break
        # Groups 1-3 of matchRE belong to the keyword alternative:
        # preceding character, keyword, following character.
        word = matchObject.group(2)
        if word is not None and matchObject.group(1) != '#':
            # Take the span from the keyword group itself instead of
            # guessing whether a preceding character was matched.
            start, pos = matchObject.span(2)
            tags.append((keywordTag, start, pos, None))
            # Scanning resumes right after the keyword, so the character
            # following it can still start (or precede) the next match.
            # If this was a defining keyword,
            # look ahead to the following identifier.
            if word in ('def', 'class'):
                # Anchored match: only an identifier that directly follows
                # the keyword counts, not one found somewhere further on.
                idMatchObject = idRE.match(pytext, pos, searchto)
                if idMatchObject:
                    start, pos = idMatchObject.span()
                    if word == 'def':
                        tag = functionTag
                    else:
                        tag = classTag
                    tags.append((tag, start, pos, None))
            continue
        start, pos = matchObject.span()
        if pytext[start] == '#':
            # A comment.  The keyword alternative can also match here with
            # '#' as its preceding character (e.g. "#if 0"), so the end of
            # the comment is always taken to be the end of the line.
            pos = pytext.find('\n', start, searchto)
            if pos < 0:
                pos = searchto
            tags.append((commentTag, start, pos, None))
        else:
            tags.append((stringTag, start, pos, None))
    return tags


def test(path):
    """Print the tags that fontify() finds in the file at path.

    One line is written per tag: the tag name, the repr of the tagged
    text, and the start and end index of the tag.
    """
    f = open(path)
    try:
        text = f.read()
    finally:
        # Close the file even if reading fails.
        f.close()
    tags = fontify(text)
    for tag, start, end, sublist in tags:
        # A single pre-formatted argument prints the same text whether
        # print is a statement (Python 2) or a function (Python 3).
        print('%s %r %d %d' % (tag, text[start:end], start, end))