-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Expand file tree
/
Copy pathtokenizer_template.py
More file actions
172 lines (163 loc) · 6.54 KB
/
tokenizer_template.py
File metadata and controls
172 lines (163 loc) · 6.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
'''
Lookup table based tokenizer with state popping and pushing capabilities.
The ability to push and pop state is required for handling parenthesised expressions,
indentation, and f-strings. We also use it for handling the different quotation mark types,
but it is not essential for that, merely convenient.
'''
class Tokenizer(object):
    '''Table-driven tokenizer template.

    This is a *template*: the ``#ACTIONS-HERE`` and ``#ACTION_TABLE_HERE``
    comments below are substitution anchors that the generator replaces with
    per-token action methods and the local ``action_table`` binding.
    ``START_SUPER_STATE``, ``CLASS_TABLE``, ``ID_INDEX``, ``ID_CHUNKS``,
    ``ERROR_CLASS`` and the token-type constants (``INDENT``, ``DEDENT``,
    ``ENDMARKER``, ...) are likewise expected to be injected alongside this
    class at template-expansion time.
    '''
    def __init__(self, text):
        # Full source text being tokenized.
        self.text = text
        # Absolute index of the next character to examine.
        self.index = 0
        # Absolute index of the first character of the current line.
        self.line_start_index = 0
        # Absolute index where the current token began.
        self.token_start_index = 0
        # (line, column) pair where the current token began.
        self.token_start = 1, 0
        # Current 1-based line number.
        self.line = 1
        # Active transition table; swapped when state is pushed/popped.
        self.super_state = START_SUPER_STATE
        # Stack of saved super-states (parens, f-strings, pending dedents...).
        self.state_stack = []
        # Indentation-column stack; the outermost level is always 0.
        self.indents = [0]
    #ACTIONS-HERE
    def tokens(self, debug=False):
        '''Generator yielding token tuples until ENDMARKER.

        End of input is detected by letting ``text[self.index]`` raise
        IndexError rather than bounds-checking every character; the
        handler below distinguishes genuine EOF from errors raised
        inside a transition function.

        When *debug* is true an instrumented copy of the scan loop is
        used that prints each state transition; otherwise the tight
        uninstrumented loop runs.  The two loops are deliberately
        duplicated so the hot path carries no per-character branch.
        '''
        # Hoist instance attributes and module tables to locals for speed
        # in the per-character loop.
        text = self.text
        cls_table = CLASS_TABLE
        id_index = ID_INDEX
        id_chunks = ID_CHUNKS
        # Highest codepoint covered by the identifier-class bitmap.
        max_id = len(id_index)*256
        #ACTION_TABLE_HERE
        state = 0
        try:
            if debug:
                while True:
                    c = ord(text[self.index])
                    if c < 128:
                        # ASCII: direct character-class lookup.
                        cls = cls_table[c]
                    elif c >= max_id:
                        # Beyond the bitmap: treat as an error class.
                        cls = ERROR_CLASS
                    else:
                        # Non-ASCII: unpack the 2-bit class from the
                        # chunked identifier bitmap (4 classes per byte).
                        b = id_chunks[id_index[c>>8]][(c>>2)&63]
                        cls = (b>>((c&3)*2))&3
                    prev_state = state
                    print("char = '%s', state=%d, cls=%d" % (text[self.index], state, cls))
                    state, transition = action_table[self.super_state[state][cls]]
                    print ("%s -> %s on %r in %s" % (prev_state, state, text[self.index], TRANSITION_STATE_NAMES[id(self.super_state)]))
                    if transition:
                        # Action fired: it may or may not produce a token,
                        # and is responsible for advancing self.index.
                        tkn = transition()
                        if tkn:
                            yield tkn
                    else:
                        # No action: just consume the character.
                        self.index += 1
            else:
                # Uninstrumented copy of the loop above.
                while True:
                    c = ord(text[self.index])
                    if c < 128:
                        cls = cls_table[c]
                    elif c >= max_id:
                        cls = ERROR_CLASS
                    else:
                        b = id_chunks[id_index[c>>8]][(c>>2)&63]
                        cls = (b>>((c&3)*2))&3
                    state, transition = action_table[self.super_state[state][cls]]
                    if transition:
                        tkn = transition()
                        if tkn:
                            yield tkn
                    else:
                        self.index += 1
        except IndexError as ex:
            if self.index != len(text):
                # Not EOF: the IndexError came from somewhere else.
                # Replay the lookup sequence; if one of these lines was
                # the culprit it re-raises the "real" IndexError here.
                #Reraise index error
                cls = cls_table[c]
                trans = self.super_state[state]
                action_index = trans[cls]
                action_table[action_index]
                # Not raised? Must have been raised in transition function.
                raise ex
            # Genuine EOF: flush any pending INDENT/DEDENT tokens...
            tkn = self.emit_indent()
            while tkn is not None:
                yield tkn
                tkn = self.emit_indent()
            # ...then emit the final ENDMARKER and finish the generator.
            end = self.line, self.index-self.line_start_index
            yield ENDMARKER, u"", self.token_start, end
            return
    def emit_indent(self):
        '''Compare the current line's leading whitespace with the indent
        stack and return the next INDENT/DEDENT/ILLEGALINDENT/ERRORTOKEN
        tuple, or None when the indentation level is unchanged.

        Called repeatedly (see the EOF path in tokens()) because a single
        line can unwind several indentation levels; multiple pending
        dedents are sequenced via the PENDING_DEDENT super-state.
        '''
        indent = 0
        index = self.line_start_index
        current = self.index
        here = self.line, current-self.line_start_index
        # Measure the indentation column of text[line_start_index:index].
        while index < current:
            if self.text[index] == ' ':
                indent += 1
            elif self.text[index] == '\t':
                # Tab advances to the next multiple-of-8 stop
                # ((indent+8) & -8 rounds up past the current column).
                indent = (indent+8) & -8
            elif self.text[index] == '\f':
                # Form feed resets the column, matching CPython's tokenizer.
                indent = 0
            else:
                #Unexpected state. Emit error token
                # Collapse the whole indent stack so no dedents follow.
                while len(self.indents) > 1:
                    self.indents.pop()
                result = ERRORTOKEN, self.text[self.token_start_index:self.index+1], self.token_start, here
                self.token_start = here
                self.line_start_index = self.index
                return result
            index += 1
        if indent == self.indents[-1]:
            # Same level: nothing to emit.
            self.token_start = here
            self.token_start_index = self.index
            return None
        elif indent > self.indents[-1]:
            # Deeper: push the new level and emit one INDENT.
            self.indents.append(indent)
            start = self.line, 0
            result = INDENT, self.text[self.line_start_index:current], start, here
            self.token_start = here
            self.token_start_index = current
            return result
        else:
            # Shallower: pop one level per call.
            self.indents.pop()
            if indent > self.indents[-1]:
                #Illegal indent
                # New column falls between stacked levels: inconsistent dedent.
                result = ILLEGALINDENT, u"", here, here
            else:
                result = DEDENT, u"", here, here
            if indent < self.indents[-1]:
                #More dedents to do
                # Park the scanner in PENDING_DEDENT so tokens() calls
                # back in here before consuming more input.
                self.state_stack.append(self.super_state)
                self.super_state = PENDING_DEDENT
            self.token_start = here
            self.token_start_index = self.index
            return result
# PEP 263 coding-cookie pattern, applied to the first two lines of a file.
ENCODING_RE = re.compile(br'.*coding[:=]\s*([-\w.]+).*')
NEWLINE_BYTES = b'\n'

def encoding_from_source(source):
    '''Return the encoding declared by *source* (bytes), plus the source
    stripped of any BOM marker.

    Detection order follows PEP 263: a UTF-8/UTF-16 BOM wins outright
    (and is stripped from the returned bytes); otherwise a
    ``coding: NAME`` cookie on the first or second line is honoured;
    otherwise UTF-8 is assumed and the source is returned unchanged.
    '''
    # A byte-order mark takes precedence over any coding cookie.
    if source.startswith(codecs.BOM_UTF8):
        return 'utf8', source[len(codecs.BOM_UTF8):]
    if source.startswith(codecs.BOM_UTF16_BE):
        return 'utf-16be', source[len(codecs.BOM_UTF16_BE):]
    if source.startswith(codecs.BOM_UTF16_LE):
        return 'utf-16le', source[len(codecs.BOM_UTF16_LE):]
    # partition() handles a missing newline cleanly.  The previous
    # find()-based slicing used the -1 returned at EOF as a slice bound,
    # silently truncating the last character of a file with no trailing
    # newline (e.g. b"# coding: latin-1" became "latin-").
    first_line, _, rest = source.partition(NEWLINE_BYTES)
    second_line = rest.partition(NEWLINE_BYTES)[0]
    match = ENCODING_RE.match(first_line) or ENCODING_RE.match(second_line)
    if match:
        try:
            encoding = match.group(1).decode("ascii")
        except UnicodeDecodeError:
            # Non-ASCII bytes in the cookie: no valid declaration,
            # fall back to the default.
            return 'utf-8', source
        # Normalise the non-standard spellings recognised by the
        # CPython interpreter to their canonical codec names.
        if encoding.startswith("utf-8-"):
            encoding = "utf-8"
        elif (encoding == "iso-latin-1" or
              encoding.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-"))):
            encoding = "iso-8859-1"
        return encoding, source
    return 'utf-8', source