Coverage for pyquickhelper/texthelper/edit_text_diff.py: 99%
225 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Improves text comparison.
5"""
6import numpy
7try:
8 from cpyquickhelper.algorithms.edit_distance import (
9 edit_distance_string as edit_distance_string_fast)
10except ImportError:
11 edit_distance_string_fast = None
14def edit_distance_string(s1, s2, cmp_cost=1.):
15 """
16 Computes the edit distance between strings *s1* and *s2*.
18 :param s1: first string
19 :param s2: second string
20 :return: dist, list of tuples of aligned characters
22 Another version is implemented in module :epkg:`cpyquickhelper`.
23 It uses C++ to make it around 25 times faster than the python
24 implementation.
25 """
26 n1 = len(s1) + 1
27 n2 = len(s2) + 1
28 dist = numpy.full((n1, n2), n1 * n2, dtype=numpy.float64)
29 pred = numpy.full(dist.shape, 0, dtype=numpy.int32)
31 for i in range(0, n1):
32 dist[i, 0] = i
33 pred[i, 0] = 1
34 for j in range(1, n2):
35 dist[0, j] = j
36 pred[0, j] = 2
37 pred[0, 0] = -1
39 for i in range(1, n1):
40 for j in range(1, n2):
41 c = dist[i, j]
43 p = 0
44 if dist[i - 1, j] + 1 < c:
45 c = dist[i - 1, j] + 1
46 p = 1
47 if dist[i, j - 1] + 1 < c:
48 c = dist[i, j - 1] + 1
49 p = 2
50 d = 0 if s1[i - 1] == s2[j - 1] else cmp_cost
51 if dist[i - 1, j - 1] + d < c:
52 c = dist[i - 1, j - 1] + d
53 p = 3
54 if p == 0:
55 raise RuntimeError( # pragma: no cover
56 "Unexpected value for p=%d at position=%r." % (p, (i, j)))
58 dist[i, j] = c
59 pred[i, j] = p
61 d = dist[len(s1), len(s2)]
62 equals = []
63 i, j = len(s1), len(s2)
64 p = pred[i, j]
65 while p != -1:
66 if p == 3:
67 equals.append((i - 1, j - 1))
68 i -= 1
69 j -= 1
70 elif p == 2:
71 j -= 1
72 elif p == 1:
73 i -= 1
74 else:
75 raise RuntimeError( # pragma: no cover
76 "Unexpected value for p=%d at position=%r." % (p, (i, j)))
77 p = pred[i, j]
78 return d, list(reversed(equals))
81def edit_distance_text(rows1, rows2, strategy="full",
82 verbose=False, return_matrices=False,
83 **thresholds):
84 """
85 Computes an edit distance between lines of a text.
87 :param rows1: first set of rows
88 :param rows2: second set of rows
89 :param strategy: strategy to match lines (see below)
90 :param verbose: if True, show progress with tqdm
91 :param return_matrices: return distances and predecessor
92 matrices as well
93 :param thresholds: see below
94 :return: distance, list of tuples of aligned lines, distance and
95 alignment for each aligned lines, and finally an array
96 with aligned line number for both texts
98 Strategies:
99 * `'full'`: computes all edit distances between all lines
101 Thresholds:
102 * `'threshold'`: two lines can match if the edit distance is not too big,
103 a low threshold means no match (default is 0.5)
104 * `'insert_len'`: variable cost of insertion (default is 1.)
105 * `'insert_cst'`: fixed cost of insertion (default is 1.)
106 * `'weight_cmp'`: weight for comparison cost (default is 2.)
107 * '`cmp_cost'`: cost of a bad comparison, default is `2 * insert_len`
109 .. note::
111 The full python implementation is quite slow. Function
112 @see fn edit_distance_string is also implemented in module
113 :epkg:`cpyquickhelper`. If this module is installed and recent
114 enough, this function will use this version as it is 25 times
115 faster. The version in :epkg:`cpyquickhelper` is using C++.
116 """
117 if strategy != 'full':
118 raise NotImplementedError( # pragma: no cover
119 "No other strategy than 'full' was implemented.")
120 cached_distances = {}
122 insert_len = thresholds.get('insert_len', 1.)
123 insert_cst = thresholds.get('insert_cst', 1.)
124 threshold = thresholds.get('threshold', 0.5)
125 weight_cmp = thresholds.get('weight_cmp', 2.)
126 cmp_cost = thresholds.get('cmp_cost', 2. * insert_len)
128 fct_cmp = edit_distance_string_fast or edit_distance_string
130 def cost_insert(row):
131 s = row.strip("\t ")
132 return (len(s) + insert_cst) * insert_len
134 def cost_cmp(i, j, row1, row2, bypass=True):
135 s1 = row1.strip('\t ')
136 s2 = row2.strip('\t ')
137 c1 = cost_insert(s1)
138 c2 = cost_insert(s2)
139 if (bypass and len(s1) > 9 and len(s2) > 9 and
140 min(c1, c2) < threshold * max(c1, c2)):
141 if len(row1) < len(row2):
142 return cost_insert(row2[len(row1):]), []
143 return cost_insert(row1[len(row2):]), []
144 if (i, j) in cached_distances:
145 return cached_distances[i, j]
146 ed, equals = fct_cmp(row1, row2, cmp_cost=cmp_cost)
147 ed *= weight_cmp
148 cached_distances[i, j] = ed, equals
149 return ed, equals
151 if isinstance(rows1, str):
152 rows1 = rows1.split("\n")
153 if isinstance(rows2, str):
154 rows2 = rows2.split("\n")
156 n1 = len(rows1) + 1
157 n2 = len(rows2) + 1
158 t1 = sum(map(len, rows1)) + 1
159 t2 = sum(map(len, rows2)) + 1
160 dist = numpy.full((n1, n2), t1 * t2 + t1 + t2 + 10, dtype=numpy.float64)
161 pred = numpy.full(dist.shape, 0, dtype=numpy.int32)
163 dist[0, 0] = 0
164 for i in range(1, n1):
165 dist[i, 0] = cost_insert(rows1[i - 1]) + dist[i - 1, 0]
166 pred[i, 0] = 1
167 for j in range(1, n2):
168 dist[0, j] = cost_insert(rows2[j - 1]) + dist[0, j - 1]
169 pred[0, j] = 2
170 pred[0, 0] = -1
172 if verbose:
173 from tqdm import tqdm # pragma: no cover
174 loop = tqdm(range(1, n1)) # pragma: no cover
175 else:
176 loop = range(1, n1)
177 for i in loop:
178 for j in range(1, n2):
179 c = dist[i, j]
181 p = 0
182 d = dist[i - 1, j] + cost_insert(rows1[i - 1])
183 if d < c:
184 c = d
185 p = 1
186 d = dist[i, j - 1] + cost_insert(rows2[j - 1])
187 if d < c:
188 c = d
189 p = 2
190 if c < dist[i - 1, j - 1]:
191 dist[i, j] = c
192 pred[i, j] = p
193 continue
194 d = (dist[i - 1, j - 1] +
195 cost_cmp(i - 1, j - 1, rows1[i - 1], rows2[j - 1])[0])
196 if d < c:
197 c = d
198 p = 3
199 if p == 0:
200 raise RuntimeError( # pragma: no cover
201 "Unexpected value for p=%d at position=%r, c=%d, "
202 "dist[i, j]=%d, dist=\n%r." % (
203 p, (i, j), c, dist[i, j],
204 dist[i - 1:i + 1, j - 1: j + 1]))
206 dist[i, j] = c
207 pred[i, j] = p
209 cached_distances.clear()
210 d = dist[n1 - 1, n2 - 1]
211 equals = []
212 i, j = n1 - 1, n2 - 1
213 lines1 = {}
214 lines2 = {}
215 p = pred[i, j]
216 while p != -1:
217 if p == 3:
218 cd = cost_cmp(i - 1, j - 1, rows1[i - 1],
219 rows2[j - 1], bypass=False)
220 lines1[i - 1] = j - 1
221 lines2[j - 1] = i - 1
222 equals.append((i - 1, j - 1) + cd)
223 i -= 1
224 j -= 1
225 elif p == 2:
226 j -= 1
227 elif p == 1:
228 i -= 1
229 else:
230 raise RuntimeError( # pragma: no cover
231 "Unexpected value for p=%d at position=%r." % (p, (i, j)))
232 p = pred[i, j]
234 # final alignment
235 aligned = []
236 la = 0
237 lb = 0
238 while la < len(rows1) or lb < len(rows2):
239 if la in lines1 and lb in lines2:
240 aligned.append((la, lb))
241 la += 1
242 lb += 1
243 while la not in lines1 and la < len(rows1):
244 aligned.append((la, None))
245 la += 1
246 while lb not in lines2 and lb < len(rows2):
247 aligned.append((None, lb))
248 lb += 1
249 if return_matrices:
250 return d, list(reversed(equals)), aligned, (dist, pred)
251 return d, list(reversed(equals)), aligned
254def diff2html(rows1, rows2, equals, aligned, two_columns=False):
255 """
256 Produces a HTML report with differences between *rows1*
257 and *rows2*.
259 :param rows1: first set of rows
260 :param rows2: second set of rows
261 :param equals: third output of @see fn edit_distance_text
262 :param aligned: fourth output of @see fn edit_distance_text
263 :param two_columns: displays the differences on two columns
264 :return: HTML text
265 """
266 if isinstance(rows1, str):
267 rows1 = rows1.split("\n")
268 if isinstance(rows2, str):
269 rows2 = rows2.split("\n")
271 char_aligned = {}
272 for i, j, d, eq in equals:
273 char_aligned[i, j] = (d, eq)
275 tr = '<tr style="1px solid black;">'
276 tr_ = '</tr>'
277 tda = '<td style="background-color:#E59866;"><code style="background-color:#E59866;">'
278 tda_ = '</code></td>'
279 tdb = '<td style="background-color:#ABEBC6;"><code style="background-color:#ABEBC6;">'
280 tdb_ = '</code></td>'
281 tdc = '<td style="background-color:#E5E7E9;"><code style="background-color:#E5E7E9;">'
282 tdc_ = '</code></td>'
283 spana = '<span style="color:#BA4A00;">'
284 spanb = '<span style="color:#196F3D;">'
285 span_ = "</span>"
286 rows = []
287 rows.append(
288 '<table style="white-space: pre; 1px solid black; font-family:courier; text-align:left !important;">')
289 for a, b in aligned:
290 row = [tr]
292 if a is None:
293 row.append("<td></td>")
294 elif b is None:
295 row.extend([tda, str(a), tda_])
296 else:
297 al = char_aligned[a, b]
298 if al[0] == 0:
299 row.extend([
300 '<td style="background-color:#FFFFFF;">'
301 '<code style="background-color:#FFFFFF;">',
302 str(a), "</code></td>"])
303 else:
304 row.extend(["<td><code>", str(a), "</code></td>"])
306 if b is None:
307 row.append("<td></td>")
308 elif a is None:
309 row.extend([tdb, str(b), tdb_])
310 else:
311 al = char_aligned[a, b]
312 if al[0] == 0:
313 row.extend([
314 '<td style="background-color:#FFFFFF;">'
315 '<code style="background-color:#FFFFFF;">',
316 str(b), '</code></td>'])
317 else:
318 row.extend(["<td><code>", str(b), "</code></td>"])
320 if a is None:
321 if two_columns:
322 row.extend(["<td></td>", tdb, rows2[b], tdb_])
323 else:
324 row.extend([tdb, rows2[b], tdb_])
325 elif b is None:
326 if two_columns:
327 row.extend([tda, rows1[a], tda_, "<td></td>"])
328 else:
329 row.extend([tda, rows1[a], tda_])
330 else:
331 al = char_aligned[a, b]
332 if al[0] == 0:
333 if two_columns:
334 row.extend([
335 '<td style="background-color:#FFFFFF;">'
336 '<code style="background-color:#FFFFFF;">',
337 rows1[a], '</code></td>',
338 '<td style="background-color:#FFFFFF;">'
339 '<code style="background-color:#FFFFFF;">',
340 rows2[b], '</code></td>'])
341 else:
342 row.extend([
343 '<td style="background-color:#FFFFFF;">'
344 '<code style="background-color:#FFFFFF;">',
345 rows1[a], '</code></td>'])
346 else:
347 # Not equal
348 s1 = rows1[a]
349 s2 = rows2[b]
350 l1 = [spana + _ + span_ for _ in s1]
351 l2 = [spanb + _ + span_ for _ in s2]
352 for i, j in al[1]:
353 if i is None or j is None:
354 continue # pragma: no cover
355 if s1[i] == s2[j]:
356 l1[i] = s1[i]
357 l2[j] = s2[j]
358 if two_columns:
359 row.extend(
360 [tdc, "".join(l1), "</code>", tdc_,
361 tdc, "<code>", "".join(l2), tdc_])
362 else:
363 row.extend(
364 [tdc, "".join(l1), "</code><br /><code>",
365 "".join(l2), tdc_])
367 row.append(tr_)
368 rows.append("".join(row))
369 rows.append("</table>")
370 return "\n".join(rows)