-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Expand file tree
/
Copy pathtokenizer_template.py
More file actions
172 lines (163 loc) · 6.54 KB
/
tokenizer_template.py
File metadata and controls
172 lines (163 loc) · 6.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
'''
Lookup table based tokenizer with state popping and pushing capabilities.
The ability to push and pop state is required for handling parenthesised expressions,
indentation, and f-strings. We also use it for handling the different quotation mark types,
but it is not essential for that, merely convenient.
'''
class Tokenizer(object):
    '''Table-driven tokenizer template.

    This is a *template*: the ``#ACTIONS-HERE`` and ``#ACTION_TABLE_HERE``
    comments below are substitution anchors that the generator replaces with
    per-token action methods and the local ``action_table`` binding.
    ``START_SUPER_STATE``, ``CLASS_TABLE``, ``ID_INDEX``, ``ID_CHUNKS``,
    ``ERROR_CLASS`` and the token-type constants (``INDENT``, ``DEDENT``,
    ``ENDMARKER``, ...) are likewise expected to be injected alongside this
    class at template-expansion time.
    '''
    def __init__(self, text):
        # Full source text being tokenized.
        self.text = text
        # Absolute index of the next character to examine.
        self.index = 0
        # Absolute index of the first character of the current line.
        self.line_start_index = 0
        # Absolute index where the current token began.
        self.token_start_index = 0
        # (line, column) pair where the current token began.
        self.token_start = 1, 0
        # Current 1-based line number.
        self.line = 1
        # Active transition table; swapped when state is pushed/popped.
        self.super_state = START_SUPER_STATE
        # Stack of saved super-states (parens, f-strings, pending dedents...).
        self.state_stack = []
        # Indentation-column stack; the outermost level is always 0.
        self.indents = [0]
    #ACTIONS-HERE
    def tokens(self, debug=False):
        '''Generator yielding token tuples until ENDMARKER.

        End of input is detected by letting ``text[self.index]`` raise
        IndexError rather than bounds-checking every character; the
        handler below distinguishes genuine EOF from errors raised
        inside a transition function.

        When *debug* is true an instrumented copy of the scan loop is
        used that prints each state transition; otherwise the tight
        uninstrumented loop runs.  The two loops are deliberately
        duplicated so the hot path carries no per-character branch.
        '''
        # Hoist instance attributes and module tables to locals for speed
        # in the per-character loop.
        text = self.text
        cls_table = CLASS_TABLE
        id_index = ID_INDEX
        id_chunks = ID_CHUNKS
        # Highest codepoint covered by the identifier-class bitmap.
        max_id = len(id_index)*256
        #ACTION_TABLE_HERE
        state = 0
        try:
            if debug:
                while True:
                    c = ord(text[self.index])
                    if c < 128:
                        # ASCII: direct character-class lookup.
                        cls = cls_table[c]
                    elif c >= max_id:
                        # Beyond the bitmap: treat as an error class.
                        cls = ERROR_CLASS
                    else:
                        # Non-ASCII: unpack the 2-bit class from the
                        # chunked identifier bitmap (4 classes per byte).
                        b = id_chunks[id_index[c>>8]][(c>>2)&63]
                        cls = (b>>((c&3)*2))&3
                    prev_state = state
                    print("char = '%s', state=%d, cls=%d" % (text[self.index], state, cls))
                    state, transition = action_table[self.super_state[state][cls]]
                    print ("%s -> %s on %r in %s" % (prev_state, state, text[self.index], TRANSITION_STATE_NAMES[id(self.super_state)]))
                    if transition:
                        # Action fired: it may or may not produce a token,
                        # and is responsible for advancing self.index.
                        tkn = transition()
                        if tkn:
                            yield tkn
                    else:
                        # No action: just consume the character.
                        self.index += 1
            else:
                # Uninstrumented copy of the loop above.
                while True:
                    c = ord(text[self.index])
                    if c < 128:
                        cls = cls_table[c]
                    elif c >= max_id:
                        cls = ERROR_CLASS
                    else:
                        b = id_chunks[id_index[c>>8]][(c>>2)&63]
                        cls = (b>>((c&3)*2))&3
                    state, transition = action_table[self.super_state[state][cls]]
                    if transition:
                        tkn = transition()
                        if tkn:
                            yield tkn
                    else:
                        self.index += 1
        except IndexError as ex:
            if self.index != len(text):
                # Not EOF: the IndexError came from somewhere else.
                # Replay the lookup sequence; if one of these lines was
                # the culprit it re-raises the "real" IndexError here.
                #Reraise index error
                cls = cls_table[c]
                trans = self.super_state[state]
                action_index = trans[cls]
                action_table[action_index]
                # Not raised? Must have been raised in transition function.
                raise ex
            # Genuine EOF: flush any pending INDENT/DEDENT tokens...
            tkn = self.emit_indent()
            while tkn is not None:
                yield tkn
                tkn = self.emit_indent()
            # ...then emit the final ENDMARKER and finish the generator.
            end = self.line, self.index-self.line_start_index
            yield ENDMARKER, u"", self.token_start, end
            return
    def emit_indent(self):
        '''Compare the current line's leading whitespace with the indent
        stack and return the next INDENT/DEDENT/ILLEGALINDENT/ERRORTOKEN
        tuple, or None when the indentation level is unchanged.

        Called repeatedly (see the EOF path in tokens()) because a single
        line can unwind several indentation levels; multiple pending
        dedents are sequenced via the PENDING_DEDENT super-state.
        '''
        indent = 0
        index = self.line_start_index
        current = self.index
        here = self.line, current-self.line_start_index
        # Measure the indentation column of text[line_start_index:index].
        while index < current:
            if self.text[index] == ' ':
                indent += 1
            elif self.text[index] == '\t':
                # Tab advances to the next multiple-of-8 stop
                # ((indent+8) & -8 rounds up past the current column).
                indent = (indent+8) & -8
            elif self.text[index] == '\f':
                # Form feed resets the column, matching CPython's tokenizer.
                indent = 0
            else:
                #Unexpected state. Emit error token
                # Collapse the whole indent stack so no dedents follow.
                while len(self.indents) > 1:
                    self.indents.pop()
                result = ERRORTOKEN, self.text[self.token_start_index:self.index+1], self.token_start, here
                self.token_start = here
                self.line_start_index = self.index
                return result
            index += 1
        if indent == self.indents[-1]:
            # Same level: nothing to emit.
            self.token_start = here
            self.token_start_index = self.index
            return None
        elif indent > self.indents[-1]:
            # Deeper: push the new level and emit one INDENT.
            self.indents.append(indent)
            start = self.line, 0
            result = INDENT, self.text[self.line_start_index:current], start, here
            self.token_start = here
            self.token_start_index = current
            return result
        else:
            # Shallower: pop one level per call.
            self.indents.pop()
            if indent > self.indents[-1]:
                #Illegal indent
                # New column falls between stacked levels: inconsistent dedent.
                result = ILLEGALINDENT, u"", here, here
            else:
                result = DEDENT, u"", here, here
            if indent < self.indents[-1]:
                #More dedents to do
                # Park the scanner in PENDING_DEDENT so tokens() calls
                # back in here before consuming more input.
                self.state_stack.append(self.super_state)
                self.super_state = PENDING_DEDENT
            self.token_start = here
            self.token_start_index = self.index
            return result
# PEP 263 coding-cookie pattern, applied to the first two lines of a file.
ENCODING_RE = re.compile(br'.*coding[:=]\s*([-\w.]+).*')
NEWLINE_BYTES = b'\n'

def encoding_from_source(source):
    '''Return the encoding declared by *source* (bytes), plus the source
    stripped of any BOM marker.

    Detection order follows PEP 263: a UTF-8/UTF-16 BOM wins outright
    (and is stripped from the returned bytes); otherwise a
    ``coding: NAME`` cookie on the first or second line is honoured;
    otherwise UTF-8 is assumed and the source is returned unchanged.
    '''
    # A byte-order mark takes precedence over any coding cookie.
    if source.startswith(codecs.BOM_UTF8):
        return 'utf8', source[len(codecs.BOM_UTF8):]
    if source.startswith(codecs.BOM_UTF16_BE):
        return 'utf-16be', source[len(codecs.BOM_UTF16_BE):]
    if source.startswith(codecs.BOM_UTF16_LE):
        return 'utf-16le', source[len(codecs.BOM_UTF16_LE):]
    # partition() handles a missing newline cleanly.  The previous
    # find()-based slicing used the -1 returned at EOF as a slice bound,
    # silently truncating the last character of a file with no trailing
    # newline (e.g. b"# coding: latin-1" became "latin-").
    first_line, _, rest = source.partition(NEWLINE_BYTES)
    second_line = rest.partition(NEWLINE_BYTES)[0]
    match = ENCODING_RE.match(first_line) or ENCODING_RE.match(second_line)
    if match:
        try:
            encoding = match.group(1).decode("ascii")
        except UnicodeDecodeError:
            # Non-ASCII bytes in the cookie: no valid declaration,
            # fall back to the default.
            return 'utf-8', source
        # Normalise the non-standard spellings recognised by the
        # CPython interpreter to their canonical codec names.
        if encoding.startswith("utf-8-"):
            encoding = "utf-8"
        elif (encoding == "iso-latin-1" or
              encoding.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-"))):
            encoding = "iso-8859-1"
        return encoding, source
    return 'utf-8', source