Coverage for src/ensae_teaching_cs/homeblog/buildkeywords.py: 86%

311 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-04-28 06:23 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Contains the main function to publish my blog (http://www.xavierdupre.fr/blog). 

5executed: 

6""" 

7import re 

8import os 

9import xml.dom.minidom 

10from pyquickhelper.loghelper import fLOG 

11from .modifypost import load_and_modify_xml_dom 

12from .filefunction import find_all_blogs_function 

13 

14 

def removeAccent(s):
    """
    Removes every character which does not belong to the allowed set.

    The allowed set is made of ASCII letters, digits, the space and the
    characters ``~ + ' . , & ; -``. Anything else, accented letters
    included, is dropped (not replaced).

    @param      s       string to clean
    @return             *s* without the rejected characters
    """
    rejected = re.compile("([^~+'.0-9,ea-zA-Z&; -])")
    return rejected.sub("", s)

17 

18 

def removeAccent_debug(s):
    """
    Same filter as @see fn removeAccent with a larger allowed set.

    On top of ASCII letters, digits, the space and ``~ + ' . , & ; -``,
    the character ``#`` and the accented letters listed in the pattern
    are kept. Every other character is dropped.

    @param      s       string to clean
    @return             *s* without the rejected characters
    """
    rejected = re.compile("([^~+'.#çôéàèâû0-9,ea-zA-Z&; -])")
    return rejected.sub("", s)

21 

22 

def removeHtmlAccent(s):
    """
    Applies a fixed list of replacements, one after the other, on a string.

    NOTE(review): in this listing both members of every pair are displayed
    as the same character, which makes each replacement a no-op. Presumably
    the first member was an HTML entity before the listing was rendered --
    confirm against the original source file.

    @param      s       string to process
    @return             the string once every replacement was applied
    """
    pairs = (
        ("é", "é"),
        ("à", "à"),
        ("â", "â"),
        ("ê", "ê"),
        ("ô", "ô"),
        ("è", "è"),
        ("ç", "ç"),
        ("û", "û"),
    )
    for before, after in pairs:
        s = s.replace(before, after)
    return s

33 

34 

def FixIssuesWithAccent(text):
    """
    Replaces wrongly decoded accented characters by the expected ones.

    The function walks through a fixed table of (broken sequence,
    replacement) pairs, in the order of the table. For each pair, the broken
    sequence is replaced when it is preceded by the character 0xC3 or 0xE3,
    then when it stands alone.

    See http://migo.sixbit.org/more/html-entities.html and
    http://www.thesauruslex.com/typo/eng/enghtml.htm

    Table kept as found in the source (some columns may have been altered
    by a re-encoding of the listing)::

        é = é = é
        è = è = è
        à = Ã  = à
        ï = ï = ï
        ô = ô = ô
        ç = ç = ç
        ê = ê = ê
        ù = ù = ù
        æ = æ = æ
        œ = Å“ = œ
        ë = ë = ë
        ü = ü = ü
        â = â = â
        € = € = €
        © = © = ©
        ¤ = ¤ = ¤

    @param      text        string to fix
    @return                 fixed string
    @raise      ValueError  if the fixed string is shorter than 50 characters
                            and still contains a character rejected by
                            @see fn removeAccent_debug
    """
    # the untouched input is only kept for the log messages below
    o = text

    # (broken sequence, replacement) pairs, the order matters because
    # the shortest sequences (a single character) come last
    correspondance = [
        ("ã©", "é"),
        ("ô", "ô"),
        ("â", "â"),
        ("î", "î"),
        ("è", "è"),
        ("ê", "ê"),
        ("â", "â"),
        ("ç", "ç"),
        ("Ã ", "à "),
        ("\xE9", "é"),
        ("\xE0", "à"),
        ("\xA0", "à"),
        ("\xE8", "è"),
        ("\xA8", "è"),
        ("\xF4", "ô"),
        ("\xB4", "ô"),
        ("\xFB", "û"),
        ("\xC3\xAA", "ê"),
        ("\xC3\xAE", "î"),
        ("\xAE", "î"),
        ("\xEE", "î"),
        ("\xEA", "ê"),
        ("\xAA", "ê"),
        ("Ã", "à"),
    ]

    for k, v in correspondance:
        # first the sequence prefixed by \xC3 or \xE3, then the sequence alone
        text = text.replace("\xC3" + k, v).replace("\xE3" + k, v)
        text = text.replace(k, v)

    # removeAccent_debug drops every character outside its allowed set:
    # a different length means an unexpected character is still there,
    # the check is only done for short strings (less than 50 characters)
    if len(removeAccent_debug(text)) != len(text) and len(text) < 50:
        fLOG("FixIssuesWithAccent", o.encode("utf8"), text.encode("utf8"))
        fLOG("FixIssuesWithAccent", o, text)
        raise ValueError("unable to deal with " +
                         str([text, [text], removeAccent_debug(text), text.encode("utf8")]))
    return text

98 

99 

def modify_all_blogs_list_in_place(folder=".",
                                   mainpage=os.path.join(
                                       "blog", "xd_blog.html"),
                                   outmainpage=os.path.join(
                                       "blog", "xd_blog.html"),
                                   allow_temp=False):
    """
    Rewrites the list of blog posts stored in the main page.

    The main page must contain a separator made of slashes exactly twice.
    The text between both separators is replaced by the names of the posts
    returned by ``find_all_blogs_function`` (file name without folder nor
    ``.html``), quoted, sorted in decreasing order, one per line.

    @param      folder          folder given to ``find_all_blogs_function``
    @param      mainpage        page to read
    @param      outmainpage     page to write (can be the same as *mainpage*)
    @param      allow_temp      forwarded to ``find_all_blogs_function``
    @raise      ValueError      if the page does not contain exactly
                                two separators
    """
    separator = "//////////////////////////////////////////"

    found = find_all_blogs_function(folder, allow_temp=allow_temp)
    names = [os.path.split(_)[-1].replace(".html", "") for _ in found]

    # a context manager closes the file even if the reading fails
    with open(mainpage, "r", encoding="utf8") as f:
        cont = f.read()

    trois = cont.split(separator)
    if len(trois) != 3:
        # explicit exception: an assert disappears with option -O
        raise ValueError(
            f"expected exactly two separators in {mainpage!r}, "
            f"found {len(trois) - 1}")

    names.sort(reverse=True)
    trois[1] = "\n" + ",\n".join([f'"{_}"' for _ in names]) + "\n"
    cont = separator.join(trois)

    with open(outmainpage, "w", encoding="utf8") as f:
        f.write(cont)

119 

120 

def file_all_keywords(folder=".",
                      mainpage=os.path.join("blog", "xd_blog.html"),
                      outmainpage=os.path.join("blog", "xd_blog.html"),
                      exclude=None, allow_temp=False):
    """
    Collects the keywords of every blog post and updates the main page.

    The keywords of a post are read from its tag ``<meta name="keywords">``
    (attribute *content*, comma separated). Three sections of the main page,
    each of them delimited by its own marker, are then rewritten with the
    keywords used by more than one post:

    * the tags: displayed keyword with its count and the cleaned keyword,
    * the documents: cleaned keyword and the posts using it,
    * the reverse keywords: cleaned keyword and displayed keyword.

    The function finally calls @see fn modify_all_blogs_list_in_place
    on the page it has just written.

    @param      folder          folder given to ``find_all_blogs_function``
    @param      mainpage        page to read
    @param      outmainpage     page to write
    @param      exclude         forwarded to ``find_all_blogs_function``
    @param      allow_temp      forwarded to ``find_all_blogs_function``
    @return                     dictionary ``{ file: sorted list of keywords }``
    @raise      RuntimeError    if no post was found
    @raise      IndexError      if a post has no keywords or if a marker is
                                missing from the main page
    """
    keepfile = find_all_blogs_function(folder, exclude, allow_temp=allow_temp)
    if len(keepfile) == 0:
        raise RuntimeError("no found file")
    hist = {}
    store_keywords = {}

    for f in keepfile:
        dom = load_and_modify_xml_dom(f, None)
        meta = dom.documentElement.getElementsByTagName("meta")
        node = [_ for _ in meta if "name" in _.attributes and
                _.attributes["name"].value == "keywords"]
        # keywords are already stripped here, no need to do it again later
        keywords = [_.strip()
                    for _ in node[0].attributes["content"].value.split(",")]
        keywords.sort()
        store_keywords[f] = keywords
        for keyword in keywords:
            hist[keyword] = hist.get(keyword, 0) + 1

    # (count, keyword) for every keyword used more than once,
    # the most frequent first
    res = [(count, keyword) for keyword, count in hist.items() if count > 1]
    res.sort(reverse=True)

    # a context manager closes the file even if the reading fails
    with open(mainpage, "r", encoding="utf8") as fh:
        cont = fh.read()

    # tag
    trois = cont.split("////////////###########")
    trois[1] = "\n" + ",\n".join(
        ['["%s (%d)","%s"]' % (FixIssuesWithAccent(keyword), count,
                               removeAccent(keyword))
         for count, keyword in res]) + "\n"
    cont = "////////////###########".join(trois)

    # documents
    trois = cont.split("////////////---------------------")
    rows = []
    for _, keyword in res:
        text = f'"{removeAccent(keyword)}":'
        posts = [f for f in keepfile if keyword in store_keywords[f]]
        posts = [os.path.split(_)[-1].replace(".html", "") for _ in posts]
        posts.sort(reverse=True)
        posts = [f'"{_}"' for _ in posts]
        text += f"[ {', '.join(posts)} ] "
        rows.append(text)
    trois[1] = "\n" + ",\n".join(rows) + "\n"
    cont = "////////////---------------------".join(trois)

    # rev keywords
    trois = cont.split("////////////+++++++++++++++++")
    rows = []
    for _, keyword in res:
        text = removeAccent(keyword)
        rows.append(f'"{text}":"{FixIssuesWithAccent(keyword)}"')
    trois[1] = "\n" + ",\n".join(rows) + "\n"
    cont = "////////////+++++++++++++++++".join(trois)

    with open(outmainpage, "w", encoding="utf8") as fh:
        fh.write(cont)

    modify_all_blogs_list_in_place(
        folder, outmainpage, outmainpage, allow_temp=allow_temp)
    return store_keywords

191 

192 

def build_bloc_keywords(res, frequence_threshold, rootfile):
    """
    Builds the HTML bloc which lists the keywords.

    @param      res                     dictionary ``{ file: list of keywords }``
    @param      frequence_threshold     a keyword gets a line in the bloc only if
                                        it appears at least that many times
    @param      rootfile                prefix of the pages the keywords link to
                                        (``<rootfile>_<cleaned keyword>.html``)
    @return                             tuple (HTML text, list of (count, keyword)
                                        sorted in decreasing order)
    """
    counts = {}
    for words in res.values():
        for word in words:
            counts[word] = counts.get(word, 0) + 1

    keywords = sorted(((nb, word) for word, nb in counts.items()),
                      reverse=True)

    pattern = '<p class="keywordtitle"><a href="%s_%s.html" target="_parent">%s</a> (%d)</p>'
    text = [pattern % (rootfile, removeAccent(word), FixIssuesWithAccent(word), nb)
            for nb, word in keywords if nb >= frequence_threshold]
    return "\n".join(text), keywords

213 

214 

def build_bloc_months(res, rootfile):
    """
    Builds the HTML bloc which lists the months
    (the page names are assumed to look like ``YYYY-MM-DD-something.html``).

    The month of a post is the first seven characters of its file name.
    Months are listed in decreasing order and a small separator is
    inserted every time the year changes.

    @param      res         dictionary whose keys are the blog files
    @param      rootfile    prefix of the pages the months link to
                            (``<rootfile>_<month>.html``)
    @return                 tuple (HTML text, list of (count as a string, month))
    """
    counts = {}
    for path in res:
        key = os.path.split(path)[-1][:7]
        counts[key] = counts.get(key, 0) + 1

    ordered = sorted(((month, str(nb)) for month, nb in counts.items()),
                     reverse=True)

    pattern = '<p class="monthtitle"><a href="%s_%s.html" target="_parent">%s</a> (%s)</p>'
    lines = []
    previous = None
    for month, nb in ordered:
        current = month[:4]
        if previous is not None and current != previous:
            # the year has changed
            lines.append('<p class="smallspace">.</p>')
        lines.append(pattern % (rootfile, month, month, nb))
        previous = current

    return "\n".join(lines), [(nb, month) for month, nb in ordered]

239 

240 

def replace_xml_in_template_using_dom_dirty(dom, node, newvalue):
    """
    Replaces a node by a piece of text and parses the result again.

    The replacement is done on strings: every occurrence of the serialized
    node found in the serialized root element is replaced by *newvalue*.

    @param      dom         document (DOM) which contains the node
    @param      node        node to replace
    @param      newvalue    text which takes the place of the node
    @return                 new document parsed from the modified text
    @raise      ValueError  if the serialized node cannot be found
    """
    whole = dom.documentElement.toxml()
    piece = node.toxml()
    if whole.find(piece) == -1:
        raise ValueError("unable to replace")
    return xml.dom.minidom.parseString(whole.replace(piece, newvalue))

250 

251 

def get_node_div(template, cl):
    """
    Returns the only ``div`` of a document which has a given class.

    @param      template    document (DOM)
    @param      cl          expected value of attribute *class*
    @return                 the matching node
    @raise      ValueError  if the number of matching ``div`` is not one
    """
    matches = []
    for div in template.documentElement.getElementsByTagName("div"):
        if "class" in div.attributes and div.attributes["class"].value == cl:
            matches.append(div)
    if len(matches) == 1:
        return matches[0]
    raise ValueError("issue with HTML format: " +
                     cl + ", " + str(len(matches)))

261 

262 

def generate_html_article(res,
                          templateFile,
                          toFolder,
                          overwrite=False,
                          aggregatedFile=None,
                          maxAggregrate=15,
                          keywordsText=None,
                          otherLayer=None):
    """
    Builds static HTML pages by inserting blog posts into a template.

    Without *aggregatedFile*, every post produces its own page
    ``<post name>_nojs.html``. With *aggregatedFile*, posts are sorted in
    decreasing order and grouped by *maxAggregrate*: the first group is
    written in *aggregatedFile*, the following ones in
    ``<aggregatedFile without .html>_<counter>.html`` where the counter is
    the number of posts stored in the previous groups.

    The template must contain the placeholders replaced below
    (``<!-- article here -->``, ``<!-- javascript here -->``,
    ``<!-- article keywords -->``, ``### KEYWORDS ###``, ...).

    NOTE(review): the nesting of the statements was rebuilt from a listing
    without indentation -- confirm against the original file, in particular
    that a page is added to the returned list only when it is written.

    @param      res             dictionary ``{ file: keywords }``
    @param      templateFile    template of a page
    @param      toFolder        existing folder where pages are written
    @param      overwrite       write a page even if its content did not change
    @param      aggregatedFile  name of the aggregated page or None to get
                                one page per post
    @param      maxAggregrate   number of posts per aggregated page
    @param      keywordsText    if not None, replaces the keywords of the page
    @param      otherLayer      only used by a branch which is disabled
    @return                     list of written files
    @raise      FileNotFoundError   if *toFolder* does not exist
    @raise      ValueError          if a post title contains ``XD blog`` or
                                    if a post has an empty body
    @raise      RuntimeError        if string ``### keywords ###`` (any case)
                                    is still in the page about to be written
    """

    fileToReturn = []

    if not os.path.exists(toFolder):
        raise FileNotFoundError("not found " + toFolder)

    # group files or not
    toprocess = []
    if aggregatedFile is not None:
        counter = 0
        stackFile = []

        for file in sorted(res, reverse=True):
            stackFile.append(file)
            if len(stackFile) == maxAggregrate:
                # the first page keeps the given name, the others get a suffix
                fileOutName = "%s_%04d.html" % (aggregatedFile.replace(".html", ""), counter) if counter > 0 \
                    else aggregatedFile
                fileOutName = os.path.join(toFolder, fileOutName)
                stackFile.sort(reverse=True)
                toprocess.append((stackFile, fileOutName))
                counter += len(stackFile)
                stackFile = []

        # the remaining posts (less than maxAggregrate) make the last page
        if len(stackFile) > 0:
            fileOutName = "%s_%04d.html" % (aggregatedFile.replace(".html", ""), counter) if counter > 0 \
                else aggregatedFile
            fileOutName = os.path.join(toFolder, fileOutName)
            stackFile.sort(reverse=True)
            toprocess.append((stackFile, fileOutName))
    else:
        # we process all files, each of them gives a file
        for file in sorted(res, reverse=True):
            filename = os.path.split(file)[-1].replace(".html", "_nojs.html")
            filename = os.path.join(toFolder, filename)
            toprocess.append(([file], filename))

    # updating the sidebar
    template = load_and_modify_xml_dom(templateFile, None, False)
    templateText = template.documentElement.toxml()
    # title of the template, replaced later if a page holds a single post
    title_to_rep = template.documentElement.getElementsByTagName("title")[
        0].toxml()

    # all files to process are now in the list
    for indexProcess, couple in enumerate(toprocess):
        files, filename = couple
        stackContent = []
        scripthtml = ""
        replacetitle = None

        for file in files:
            dom = load_and_modify_xml_dom(file, None)
            # the first ten characters of the file name
            date = os.path.split(file)[-1][:10]

            title = dom.documentElement.getElementsByTagName("title")[
                0].toxml()
            if "XD blog" in title:
                raise ValueError("a blog contains a bad title: " + file)
            if len(files) == 1:
                # in that case, we want to change the page title
                replacetitle = title

            # the title of the post becomes a header starting with a link
            title = title.replace("title>", "h2>")
            link = f'<a href="{date}_nojs.html"><b>{date}</b></a>'
            title = title.replace("<h2>", "<h2>" + link + " ")

            # scripts are copied only if the post contains more than one
            scripts = dom.documentElement.getElementsByTagName("script")
            if len(scripts) > 1:
                scr = [""] + [_.toxml() for _ in scripts]
                scripthtml += "\n".join(scr)

            b = dom.documentElement.getElementsByTagName("body")[0]
            body = b.toxml()

            # drops the first 6 and the last 7 characters, which are <body>
            # and </body> when the tag has no attribute
            body = body[6:]
            body = body[:-7]

            if len(files) > 1 and '<!-- CUT PAGE HERE -->' in body:
                # here we deal with shortcuts except if we process a single
                # document
                body = body.split('<!-- CUT PAGE HERE -->')[0]
                body += "<br />" + \
                    f"<a href=\"{date}_nojs.html\">{'more...'}</a>"

            if len(body.strip()) == 0:
                raise ValueError("empty body for " + file)
            stackContent.append(title + "\n" + body)
            keywords = res[file]

        # keywords of the last post of the page, those starting with ~
        # are not displayed
        uniqueKeys = [_ for _ in set(keywords) if not _.startswith("~")]
        uniqueKeys.sort()
        keystext = ", ".join(uniqueKeys)

        # links to the previous and the next pages of the list
        nextPage = ""
        if indexProcess > 0:
            nextPage += '<a href="%s"><i>&lt;--</i></a> ' % (
                os.path.split(toprocess[indexProcess - 1][1])[-1])
        if indexProcess < len(toprocess) - 1:
            nextPage += '<a href="%s"><i>--&gt;</i></a> ' % (
                os.path.split(toprocess[indexProcess + 1][1])[-1])

        if keywordsText is not None:
            keystext = keywordsText

        # inside

        post = templateText.replace(
            "<!-- article here -->", "\n".join(stackContent))
        post = post.replace(
            '<a href="xd_blog_nojs_DDD.html"><i>suite</i></a>', nextPage)
        post = post.replace("<!-- javascript here -->", scripthtml)
        post = post.replace("<!-- article keywords -->", keystext)
        post = post.replace("### KEYWORDS ###", keystext)
        post = post.replace("### keywords ###", keystext)

        # this branch is disabled, otherLayer has no effect
        enabled = False
        if enabled:
            olayer = f'<p class="keywordtitle"><a href="xd_blog.html?date={date}">Other Layer</a></p>' \
                if otherLayer is None else \
                f'<p class="keywordtitle"><a href="{otherLayer}">Other Layer</a></p>'
            post = post.replace("<!-- other layer -->", olayer)
            # it does not work (pages too big)

        post = '<?xml version="1.0" encoding="utf-8"?>\n' + post
        # a self-closed script tag is replaced by an explicit closing tag
        post = post.replace('type="text/javascript"/>',
                            'type="text/javascript"></script>')

        post = FixIssuesWithAccent(post)

        if replacetitle is not None:
            # there was only one document, we replace it
            post = post.replace(title_to_rep, replacetitle)

        # we save the results

        # the existing content is read to detect a change
        if os.path.exists(filename):
            try:
                f = open(filename, "r", encoding="utf8")
                hist = f.read()
                f.close()
            except UnicodeDecodeError as e:
                fLOG("issue with file ", filename)
                content = open(filename, "r").read()
                fLOG(content[170:])
                raise e
        else:
            hist = ""

        if post != hist or overwrite:
            if "\xC3" in post:
                #raise RuntimeError("forbidden character ")
                pass
            if not overwrite:
                fLOG(" writing ", filename)
            # a placeholder which was not replaced must not be published
            if "### keywords ###" in post.lower():
                raise RuntimeError(
                    "unable to release that document with this string ### KEYWORDS ###,\nkeywords should be " + str(keystext))
            f = open(filename, "w", encoding="utf8")
            f.write(post)
            f.close()
            fileToReturn.append(filename)

    return fileToReturn

434 

435 

def build_process_all_pages(res,
                            keywordsHTML="frame_keywords.html",
                            siteFolder="../site/blog",
                            xd_blog_template_nojs=os.path.join(
                                "blog", "xd_blog_template_nojs.html"),
                            xd_blog_nojs="xd_blog_nojs.html",
                            frequence_keywords=3,
                            monthsHTML="frame_months.html"
                            ):
    """
    Builds every static page of the blog: the keywords frame, the months
    frame, the aggregated pages, one set of pages per keyword, one set of
    pages per month and one page per post.

    @param      res                     output from function file_all_keywords
    @param      keywordsHTML            name of the keywords frame to write
                                        (None to skip it)
    @param      siteFolder              folder of the blog (the one to be published)
    @param      xd_blog_template_nojs   template for blog (static text, less javascript)
    @param      xd_blog_nojs            main page (static text, less javascript)
    @param      frequence_keywords      a keyword appears in the keywords frame only
                                        if its frequency reaches that threshold
    @param      monthsHTML              name of the months frame to write
                                        (None to skip it)
    @return                             all created pages
    """

    add = []

    fLOG("processing keywords")
    htmlkey, keywords = build_bloc_keywords(
        res, frequence_keywords, "xd_blog_key")
    if keywordsHTML is not None:
        file = os.path.join(siteFolder, keywordsHTML)
        fLOG("writing ", file)
        # a context manager closes the file even if a writing fails
        with open(file, "w", encoding="utf8") as f:
            f.write("""<?xml version="1.0" encoding="utf-8"?>\n""")
            f.write("<html>\n")
            f.write("<head>\n")
            f.write(
                """<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n""")
            f.write("""<link href="pMenu.css" rel="stylesheet" type="text/css"/>\n""")
            f.write("</head>\n")
            f.write("<body>\n")
            f.write("""<div class="sidebarfull">\n""")
            f.write("""<p class="keywordtitle"><b>Keywords</b></p>\n""")
            f.write(htmlkey)
            f.write("\n</div>\n")
            f.write("\n</body></html>\n")
        add.append(file)

    fLOG("processing months")
    htmlkeym, monthsp = build_bloc_months(res, "xd_blog_month")
    if monthsHTML is not None:
        file = os.path.join(siteFolder, monthsHTML)
        fLOG("writing ", file)
        with open(file, "w", encoding="utf8") as f:
            f.write("""<?xml version="1.0" encoding="utf-8"?>\n""")
            f.write("<html>\n")
            f.write("<head>\n")
            f.write(
                """<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n""")
            f.write("""<link href="pMenu.css" rel="stylesheet" type="text/css"/>\n""")
            f.write("</head>\n")
            f.write("<body>\n")
            f.write("""<div class="sidebarfullleft">\n<hr />\n""")
            f.write("""<p class="monthtitle"><b>Months</b></p>\n""")
            f.write(htmlkeym)
            f.write("\n</div>\n")
            f.write("\n</body></html>\n")
        add.append(file)

    # build keyword pages
    fLOG("building aggregated page for keywords")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        True,
        xd_blog_nojs,
        keywordsText="",
        otherLayer="xd_blog.html")

    # process all pages for each keyword, every keyword gets its pages
    # (the threshold only applies to the keywords frame)
    for _, b in keywords:
        fixed = FixIssuesWithAccent(b)
        fLOG("building page for keyword", fixed)
        bb = removeAccent(b)
        # only the keys matter, generate_html_article is called
        # with keywordsText which replaces the stored keywords
        tempres = {k: "" for k, v in res.items() if b in v}
        add += generate_html_article(
            tempres,
            xd_blog_template_nojs,
            siteFolder,
            True,
            f"xd_blog_key_{bb}.html",
            keywordsText=fixed,
            otherLayer=f"xd_blog.html?tag={fixed}")

    # build months pages
    # (same call as the aggregated page for keywords, kept as it was:
    # the pages are written again and added again to the returned list)
    fLOG("building aggregated page for months")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        True,
        xd_blog_nojs,
        keywordsText="",
        otherLayer="xd_blog.html")

    # process all pages for each month
    for _, b in monthsp:
        fLOG("building page for months", b)
        bb = removeAccent(b)
        tempres = {k: "" for k in res if os.path.split(k)[-1].startswith(b)}
        fixed = FixIssuesWithAccent(b)
        add += generate_html_article(
            tempres,
            xd_blog_template_nojs,
            siteFolder,
            True,
            f"xd_blog_month_{bb}.html",
            keywordsText=fixed,
            otherLayer=f"xd_blog.html?tag={fixed}")

    # build all pages (one per blog)
    fLOG("building all pages")
    add += generate_html_article(
        res,
        xd_blog_template_nojs,
        siteFolder,
        overwrite=True,
        otherLayer=None)
    return add