forked from aboutcode-org/scancode-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathseq.py
More file actions
176 lines (142 loc) · 6.47 KB
/
seq.py
File metadata and controls
176 lines (142 loc) · 6.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from collections import namedtuple as _namedtuple
"""
Token sequences alignment and diffing based on the longest common substrings of
"high tokens". This essentially a non-optimal and reasonably fast single local
sequence alignment between two sequences of integers/token ids.
Based on and heavily modified from Python's difflib.py from the 3.X tip:
https://hg.python.org/cpython/raw-file/0a69b1e8b7fe/Lib/difflib.py
license: PSF. See seq.ABOUT file for details.
"""
Match = _namedtuple('Match', 'a b size')
def find_longest_match(a, b, alo, ahi, blo, bhi, b2j, len_good, matchables):
"""
Find longest matching block of a and b in a[alo:ahi] and b[blo:bhi].
`b2j` is a mapping of b high token ids -> list of position in b
`len_good` is such that token ids smaller than `_good_good` are treated as
good, non-junk tokens. `matchables` is a set of matchable positions.
Positions absent from this set are ignored.
Return (i,j,k) Match tuple where:
"i" in the start in "a"
"j" in the start in "b"
"k" in the size of the match
and such that a[i:i+k] is equal to b[j:j+k], where
alo <= i <= i+k <= ahi
blo <= j <= j+k <= bhi
and for all (i',j',k') matchable token positions meeting those conditions,
k >= k'
i <= i'
and if i == i', j <= j'
In other words, of all maximal matching blocks, return one that starts
earliest in a, and of all those maximal matching blocks that start earliest
in a, return the one that starts earliest in b.
First the longest matching block (aka contiguous substring) is determined
where no junk element appears in the block. Then that block is extended as
far as possible by matching other tokens including junk on both sides. So
the resulting block never matches on junk.
If no blocks match, return (alo, blo, 0).
"""
besti, bestj, bestsize = alo, blo, 0
b2j_get = b2j.get
# find longest matchable junk-free match
# during an iteration of the loop, j2len[j] = length of longest
# junk-free match ending with a[i-1] and b[j]
j2len = {}
j2lenget = j2len.get
nothing = []
for i in range(alo, ahi):
newj2len = {}
# we cannot do LCS on junk or non matchable
cura = a[i]
if cura < len_good and i in matchables:
# look at all instances of a[i] in b; note that because
# b2j has no junk token, the loop is skipped if a[i] is junk
for j in b2j_get(cura, nothing):
# a[i] matches b[j]
if j < blo:
continue
if j >= bhi:
break
k = newj2len[j] = j2lenget(j - 1, 0) + 1
if k > bestsize:
besti, bestj, bestsize = i - k + 1, j - k + 1, k
j2len = newj2len
j2lenget = j2len.get
return extend_match(besti, bestj, bestsize, a, b, alo, ahi, blo, bhi, matchables)
def extend_match(besti, bestj, bestsize, a, b, alo, ahi, blo, bhi, matchables):
"""
Extend a match identifier by (besti, bestj, bestsize) with any matching
tokens on each end. Return a new Match.
"""
if bestsize:
while (besti > alo and bestj > blo
and a[besti - 1] == b[bestj - 1]
and (besti - 1) in matchables):
besti -= 1
bestj -= 1
bestsize += 1
while (besti + bestsize < ahi and bestj + bestsize < bhi
and a[besti + bestsize] == b[bestj + bestsize]
and (besti + bestsize) in matchables):
bestsize += 1
return Match(besti, bestj, bestsize)
def match_blocks(a, b, a_start, a_end, b2j, len_good, matchables=frozenset(), *args, **kwargs):
"""
Return a list of matching block Match triples describing matching
subsequences of `a` in `b` starting from the `a_start` position in `a` up to
the `a_end` position in `a`.
`b2j` is a mapping of b "high" token ids -> list of positions in b, e.g. a
posting list.
`len_good` is such that token ids smaller than `len_good` are treated as
important, non-junk tokens.
`matchables` is a set of matchable positions. Positions absent from this set
are ignored.
Each triple is of the form (i, j, n), and means that a[i:i+n] == b[j:j+n].
The triples are monotonically increasing in i and in j. It is also
guaranteed that adjacent triples never describe adjacent equal blocks.
Instead adjacent blocks are merged and collapsed in a single block.
"""
# This non-recursive algorithm is using a list as a queue of blocks. We
# still need to look at and append partial results to matching_blocks in a
# loop. The matches are sorted at the end.
queue = [(a_start, a_end, 0, len(b))]
queue_append = queue.append
queue_pop = queue.pop
matching_blocks = []
matching_blocks_append = matching_blocks.append
while queue:
alo, ahi, blo, bhi = queue_pop()
i, j, k = x = find_longest_match(
a, b, alo, ahi, blo, bhi, b2j, len_good, matchables)
# a[alo:i] vs b[blo:j] unknown
# a[i:i+k] same as b[j:j+k]
# a[i+k:ahi] vs b[j+k:bhi] unknown
if k: # if k is 0, there was no matching block as the size is 0
matching_blocks_append(x)
if alo < i and blo < j:
# there is unprocessed things remaining to the left
queue_append((alo, i, blo, j))
if i + k < ahi and j + k < bhi:
# there is unprocessed things remaining to the right
queue_append((i + k, ahi, j + k, bhi))
matching_blocks.sort()
# collapse adjacent blocks
i1 = j1 = k1 = 0
non_adjacent = []
non_adjacent_append = non_adjacent.append
for i2, j2, k2 in matching_blocks:
# Is this block adjacent to i1, j1, k1?
if i1 + k1 == i2 and j1 + k1 == j2:
# Yes, so collapse them -- this just increases the length of the
# first block by the length of the second, and the first block so
# lengthened remains the block to compare against.
k1 += k2
else:
# Not adjacent: keep it unless this is the first block (k1==0 means
# it's the dummy we started with), and make the second block the new
# block to compare against.
if k1:
non_adjacent_append((i1, j1, k1))
i1, j1, k1 = i2, j2, k2
if k1:
non_adjacent_append((i1, j1, k1))
return [Match._make(na) for na in non_adjacent]