Coverage for pyquickhelper/texthelper/edit_text_diff.py: 99%

225 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-03 02:21 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Improves text comparison. 

5""" 

6import numpy 

7try: 

8 from cpyquickhelper.algorithms.edit_distance import ( 

9 edit_distance_string as edit_distance_string_fast) 

10except ImportError: 

11 edit_distance_string_fast = None 

12 

13 

14def edit_distance_string(s1, s2, cmp_cost=1.): 

15 """ 

16 Computes the edit distance between strings *s1* and *s2*. 

17 

18 :param s1: first string 

19 :param s2: second string 

20 :return: dist, list of tuples of aligned characters 

21 

22 Another version is implemented in module :epkg:`cpyquickhelper`. 

23 It uses C++ to make it around 25 times faster than the python 

24 implementation. 

25 """ 

26 n1 = len(s1) + 1 

27 n2 = len(s2) + 1 

28 dist = numpy.full((n1, n2), n1 * n2, dtype=numpy.float64) 

29 pred = numpy.full(dist.shape, 0, dtype=numpy.int32) 

30 

31 for i in range(0, n1): 

32 dist[i, 0] = i 

33 pred[i, 0] = 1 

34 for j in range(1, n2): 

35 dist[0, j] = j 

36 pred[0, j] = 2 

37 pred[0, 0] = -1 

38 

39 for i in range(1, n1): 

40 for j in range(1, n2): 

41 c = dist[i, j] 

42 

43 p = 0 

44 if dist[i - 1, j] + 1 < c: 

45 c = dist[i - 1, j] + 1 

46 p = 1 

47 if dist[i, j - 1] + 1 < c: 

48 c = dist[i, j - 1] + 1 

49 p = 2 

50 d = 0 if s1[i - 1] == s2[j - 1] else cmp_cost 

51 if dist[i - 1, j - 1] + d < c: 

52 c = dist[i - 1, j - 1] + d 

53 p = 3 

54 if p == 0: 

55 raise RuntimeError( # pragma: no cover 

56 "Unexpected value for p=%d at position=%r." % (p, (i, j))) 

57 

58 dist[i, j] = c 

59 pred[i, j] = p 

60 

61 d = dist[len(s1), len(s2)] 

62 equals = [] 

63 i, j = len(s1), len(s2) 

64 p = pred[i, j] 

65 while p != -1: 

66 if p == 3: 

67 equals.append((i - 1, j - 1)) 

68 i -= 1 

69 j -= 1 

70 elif p == 2: 

71 j -= 1 

72 elif p == 1: 

73 i -= 1 

74 else: 

75 raise RuntimeError( # pragma: no cover 

76 "Unexpected value for p=%d at position=%r." % (p, (i, j))) 

77 p = pred[i, j] 

78 return d, list(reversed(equals)) 

79 

80 

81def edit_distance_text(rows1, rows2, strategy="full", 

82 verbose=False, return_matrices=False, 

83 **thresholds): 

84 """ 

85 Computes an edit distance between lines of a text. 

86 

87 :param rows1: first set of rows 

88 :param rows2: second set of rows 

89 :param strategy: strategy to match lines (see below) 

90 :param verbose: if True, show progress with tqdm 

91 :param return_matrices: return distances and predecessor 

92 matrices as well 

93 :param thresholds: see below 

94 :return: distance, list of tuples of aligned lines, distance and 

95 alignment for each aligned lines, and finally an array 

96 with aligned line number for both texts 

97 

98 Strategies: 

99 * `'full'`: computes all edit distances between all lines 

100 

101 Thresholds: 

102 * `'threshold'`: two lines can match if the edit distance is not too big, 

103 a low threshold means no match (default is 0.5) 

104 * `'insert_len'`: variable cost of insertion (default is 1.) 

105 * `'insert_cst'`: fixed cost of insertion (default is 1.) 

106 * `'weight_cmp'`: weight for comparison cost (default is 2.) 

107 * '`cmp_cost'`: cost of a bad comparison, default is `2 * insert_len` 

108 

109 .. note:: 

110 

111 The full python implementation is quite slow. Function 

112 @see fn edit_distance_string is also implemented in module 

113 :epkg:`cpyquickhelper`. If this module is installed and recent 

114 enough, this function will use this version as it is 25 times 

115 faster. The version in :epkg:`cpyquickhelper` is using C++. 

116 """ 

117 if strategy != 'full': 

118 raise NotImplementedError( # pragma: no cover 

119 "No other strategy than 'full' was implemented.") 

120 cached_distances = {} 

121 

122 insert_len = thresholds.get('insert_len', 1.) 

123 insert_cst = thresholds.get('insert_cst', 1.) 

124 threshold = thresholds.get('threshold', 0.5) 

125 weight_cmp = thresholds.get('weight_cmp', 2.) 

126 cmp_cost = thresholds.get('cmp_cost', 2. * insert_len) 

127 

128 fct_cmp = edit_distance_string_fast or edit_distance_string 

129 

130 def cost_insert(row): 

131 s = row.strip("\t ") 

132 return (len(s) + insert_cst) * insert_len 

133 

134 def cost_cmp(i, j, row1, row2, bypass=True): 

135 s1 = row1.strip('\t ') 

136 s2 = row2.strip('\t ') 

137 c1 = cost_insert(s1) 

138 c2 = cost_insert(s2) 

139 if (bypass and len(s1) > 9 and len(s2) > 9 and 

140 min(c1, c2) < threshold * max(c1, c2)): 

141 if len(row1) < len(row2): 

142 return cost_insert(row2[len(row1):]), [] 

143 return cost_insert(row1[len(row2):]), [] 

144 if (i, j) in cached_distances: 

145 return cached_distances[i, j] 

146 ed, equals = fct_cmp(row1, row2, cmp_cost=cmp_cost) 

147 ed *= weight_cmp 

148 cached_distances[i, j] = ed, equals 

149 return ed, equals 

150 

151 if isinstance(rows1, str): 

152 rows1 = rows1.split("\n") 

153 if isinstance(rows2, str): 

154 rows2 = rows2.split("\n") 

155 

156 n1 = len(rows1) + 1 

157 n2 = len(rows2) + 1 

158 t1 = sum(map(len, rows1)) + 1 

159 t2 = sum(map(len, rows2)) + 1 

160 dist = numpy.full((n1, n2), t1 * t2 + t1 + t2 + 10, dtype=numpy.float64) 

161 pred = numpy.full(dist.shape, 0, dtype=numpy.int32) 

162 

163 dist[0, 0] = 0 

164 for i in range(1, n1): 

165 dist[i, 0] = cost_insert(rows1[i - 1]) + dist[i - 1, 0] 

166 pred[i, 0] = 1 

167 for j in range(1, n2): 

168 dist[0, j] = cost_insert(rows2[j - 1]) + dist[0, j - 1] 

169 pred[0, j] = 2 

170 pred[0, 0] = -1 

171 

172 if verbose: 

173 from tqdm import tqdm # pragma: no cover 

174 loop = tqdm(range(1, n1)) # pragma: no cover 

175 else: 

176 loop = range(1, n1) 

177 for i in loop: 

178 for j in range(1, n2): 

179 c = dist[i, j] 

180 

181 p = 0 

182 d = dist[i - 1, j] + cost_insert(rows1[i - 1]) 

183 if d < c: 

184 c = d 

185 p = 1 

186 d = dist[i, j - 1] + cost_insert(rows2[j - 1]) 

187 if d < c: 

188 c = d 

189 p = 2 

190 if c < dist[i - 1, j - 1]: 

191 dist[i, j] = c 

192 pred[i, j] = p 

193 continue 

194 d = (dist[i - 1, j - 1] + 

195 cost_cmp(i - 1, j - 1, rows1[i - 1], rows2[j - 1])[0]) 

196 if d < c: 

197 c = d 

198 p = 3 

199 if p == 0: 

200 raise RuntimeError( # pragma: no cover 

201 "Unexpected value for p=%d at position=%r, c=%d, " 

202 "dist[i, j]=%d, dist=\n%r." % ( 

203 p, (i, j), c, dist[i, j], 

204 dist[i - 1:i + 1, j - 1: j + 1])) 

205 

206 dist[i, j] = c 

207 pred[i, j] = p 

208 

209 cached_distances.clear() 

210 d = dist[n1 - 1, n2 - 1] 

211 equals = [] 

212 i, j = n1 - 1, n2 - 1 

213 lines1 = {} 

214 lines2 = {} 

215 p = pred[i, j] 

216 while p != -1: 

217 if p == 3: 

218 cd = cost_cmp(i - 1, j - 1, rows1[i - 1], 

219 rows2[j - 1], bypass=False) 

220 lines1[i - 1] = j - 1 

221 lines2[j - 1] = i - 1 

222 equals.append((i - 1, j - 1) + cd) 

223 i -= 1 

224 j -= 1 

225 elif p == 2: 

226 j -= 1 

227 elif p == 1: 

228 i -= 1 

229 else: 

230 raise RuntimeError( # pragma: no cover 

231 "Unexpected value for p=%d at position=%r." % (p, (i, j))) 

232 p = pred[i, j] 

233 

234 # final alignment 

235 aligned = [] 

236 la = 0 

237 lb = 0 

238 while la < len(rows1) or lb < len(rows2): 

239 if la in lines1 and lb in lines2: 

240 aligned.append((la, lb)) 

241 la += 1 

242 lb += 1 

243 while la not in lines1 and la < len(rows1): 

244 aligned.append((la, None)) 

245 la += 1 

246 while lb not in lines2 and lb < len(rows2): 

247 aligned.append((None, lb)) 

248 lb += 1 

249 if return_matrices: 

250 return d, list(reversed(equals)), aligned, (dist, pred) 

251 return d, list(reversed(equals)), aligned 

252 

253 

254def diff2html(rows1, rows2, equals, aligned, two_columns=False): 

255 """ 

256 Produces a HTML report with differences between *rows1* 

257 and *rows2*. 

258 

259 :param rows1: first set of rows 

260 :param rows2: second set of rows 

261 :param equals: third output of @see fn edit_distance_text 

262 :param aligned: fourth output of @see fn edit_distance_text 

263 :param two_columns: displays the differences on two columns 

264 :return: HTML text 

265 """ 

266 if isinstance(rows1, str): 

267 rows1 = rows1.split("\n") 

268 if isinstance(rows2, str): 

269 rows2 = rows2.split("\n") 

270 

271 char_aligned = {} 

272 for i, j, d, eq in equals: 

273 char_aligned[i, j] = (d, eq) 

274 

275 tr = '<tr style="1px solid black;">' 

276 tr_ = '</tr>' 

277 tda = '<td style="background-color:#E59866;"><code style="background-color:#E59866;">' 

278 tda_ = '</code></td>' 

279 tdb = '<td style="background-color:#ABEBC6;"><code style="background-color:#ABEBC6;">' 

280 tdb_ = '</code></td>' 

281 tdc = '<td style="background-color:#E5E7E9;"><code style="background-color:#E5E7E9;">' 

282 tdc_ = '</code></td>' 

283 spana = '<span style="color:#BA4A00;">' 

284 spanb = '<span style="color:#196F3D;">' 

285 span_ = "</span>" 

286 rows = [] 

287 rows.append( 

288 '<table style="white-space: pre; 1px solid black; font-family:courier; text-align:left !important;">') 

289 for a, b in aligned: 

290 row = [tr] 

291 

292 if a is None: 

293 row.append("<td></td>") 

294 elif b is None: 

295 row.extend([tda, str(a), tda_]) 

296 else: 

297 al = char_aligned[a, b] 

298 if al[0] == 0: 

299 row.extend([ 

300 '<td style="background-color:#FFFFFF;">' 

301 '<code style="background-color:#FFFFFF;">', 

302 str(a), "</code></td>"]) 

303 else: 

304 row.extend(["<td><code>", str(a), "</code></td>"]) 

305 

306 if b is None: 

307 row.append("<td></td>") 

308 elif a is None: 

309 row.extend([tdb, str(b), tdb_]) 

310 else: 

311 al = char_aligned[a, b] 

312 if al[0] == 0: 

313 row.extend([ 

314 '<td style="background-color:#FFFFFF;">' 

315 '<code style="background-color:#FFFFFF;">', 

316 str(b), '</code></td>']) 

317 else: 

318 row.extend(["<td><code>", str(b), "</code></td>"]) 

319 

320 if a is None: 

321 if two_columns: 

322 row.extend(["<td></td>", tdb, rows2[b], tdb_]) 

323 else: 

324 row.extend([tdb, rows2[b], tdb_]) 

325 elif b is None: 

326 if two_columns: 

327 row.extend([tda, rows1[a], tda_, "<td></td>"]) 

328 else: 

329 row.extend([tda, rows1[a], tda_]) 

330 else: 

331 al = char_aligned[a, b] 

332 if al[0] == 0: 

333 if two_columns: 

334 row.extend([ 

335 '<td style="background-color:#FFFFFF;">' 

336 '<code style="background-color:#FFFFFF;">', 

337 rows1[a], '</code></td>', 

338 '<td style="background-color:#FFFFFF;">' 

339 '<code style="background-color:#FFFFFF;">', 

340 rows2[b], '</code></td>']) 

341 else: 

342 row.extend([ 

343 '<td style="background-color:#FFFFFF;">' 

344 '<code style="background-color:#FFFFFF;">', 

345 rows1[a], '</code></td>']) 

346 else: 

347 # Not equal 

348 s1 = rows1[a] 

349 s2 = rows2[b] 

350 l1 = [spana + _ + span_ for _ in s1] 

351 l2 = [spanb + _ + span_ for _ in s2] 

352 for i, j in al[1]: 

353 if i is None or j is None: 

354 continue # pragma: no cover 

355 if s1[i] == s2[j]: 

356 l1[i] = s1[i] 

357 l2[j] = s2[j] 

358 if two_columns: 

359 row.extend( 

360 [tdc, "".join(l1), "</code>", tdc_, 

361 tdc, "<code>", "".join(l2), tdc_]) 

362 else: 

363 row.extend( 

364 [tdc, "".join(l1), "</code><br /><code>", 

365 "".join(l2), tdc_]) 

366 

367 row.append(tr_) 

368 rows.append("".join(row)) 

369 rows.append("</table>") 

370 return "\n".join(rows)