Coverage for src/ensae_teaching_cs/homeblog/buildrss.py: 92%

77 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-01-27 05:44 +0100

1# coding:utf-8 

2""" 

3@file 

4@brief About RSS 

5""" 

6import datetime 

7import os 

8import re 

9from pyquickhelper.loghelper import fLOG 

10from .filefunction import find_all_blogs_function 

11 

12 

# Opening part of the RSS document: the <rss> and <channel> headers of the feed.
# The text is written without indentation so that no post-processing is needed:
# stripping spaces with str.replace would also remove the meaningful spaces
# inside the markup (``<rss version=...>``, ``XD blog``).
modelForARSSFeed = """<rss version="2.0">
<channel>
<title>XD blog</title>
<link>http://www.xavierdupre.fr/blog/xd_blog_nojs.html</link>
<description>new posts from XD blog</description>
"""

19 

# Template of one <item> entry of the feed. It is filled with the ``%``
# operator and expects five values, in this order: title, short file name
# (used in <link>), short file name again (used in <guid>), description,
# publication date.
modelForARSSRow = """
 <item>
 <title>%s</title>
 <link>http://www.xavierdupre.fr/blog/%s_nojs.html</link>
 <guid isPermaLink="true">http://www.xavierdupre.fr/blog/%s_nojs.html</guid>
 <description>%s</description>
 <pubDate>%s</pubDate>
 </item>"""

# Closing part of the RSS document: closes <channel> and <rss>.
modelForARSSChannel = """\n</channel>\n</rss>\n"""

30 

31 

def file_build_rss(folder=".", outfile="blog/xdbrss.xml", now=None,
                   model_feed=modelForARSSFeed, model_row=modelForARSSRow,
                   model_channel=modelForARSSChannel, months_delay=6):
    """
    Build a RSS file from the recent blog posts (HTML format) found in a folder.
    A post is kept if its date (read from the first characters of its file name,
    ``YYYY?MM?DD...``) is more recent than *now* minus ``months_delay * 30`` days.

    The description of a post is taken, in this order, from:

    * the tag ``<meta name="description" content="..." />`` if it is not empty,
    * the text between the two following comments if both are present
      (markup is escaped and ``suite...`` is appended):

    ::

        <!-- SUMMARY BEGINS -->
        <!-- SUMMARY ENDS -->

    * the content of the tag ``<body>`` (markup is escaped).

    @param      folder          folder where the blog post can be found
    @param      outfile         final file to produce
    @param      now             date to use as a final date, None means the current time
                                (evaluated when the function is called)
    @param      model_feed      beginning of the RSS stream (header of the channel)
    @param      model_row       template for one post, it must contain five ``%s``:
                                title, short name (twice), summary, date
    @param      model_channel   end of the RSS stream, the final content is the concatenation of:

                                ::

                                    model_feed
                                    model_row (one per kept post)
                                    model_channel

                                You should see the default value to see how you can replace them.
    @param      months_delay    keep posts written a couple of months ago: *months_delay* months
    @return                     2-uple: outfile and the list of kept blog post

    The function raises *ValueError* if a kept post has no title or an empty summary,
    *Exception* if no post was kept.
    """
    # The default date is computed at call time, a default value in the
    # signature would be frozen when the module is imported.
    if now is None:
        now = datetime.datetime.now()
    limit = now - datetime.timedelta(days=months_delay * 30)
    fLOG("now - month ", limit)

    file = find_all_blogs_function(folder)
    nbfile = len(file)
    exp = re.compile('<meta +name=\\"description\\" +content=\\"(.*?)\\" */>')
    expt = re.compile('<title>(.*?)</title>')
    begins = "<!-- SUMMARY BEGINS -->"
    ends = "<!-- SUMMARY ENDS -->"

    keepfiles = []
    rss = []
    for f in file:
        # the file name starts with the date of the post
        temp = os.path.split(f)[-1].lower().replace(".html", "")
        day = datetime.datetime(int(temp[:4]), int(temp[5:7]), int(temp[8:10]))
        if day <= limit:
            continue
        keepfiles.append(f)

        with open(f, "r", encoding="utf8") as ff:
            t = ff.read().replace("\n", " ").replace("\r", " ")
        check_encoding(f)

        summary = exp.search(t)
        title = expt.search(t)

        if not title:
            raise ValueError("unable to find title in " + f)
        fLOG("getting summary for ", f)

        title = title.groups()[0]
        summary = None if summary is None else summary.groups()[0]
        adddots = False

        if not summary and begins in t and ends in t:
            p0 = t.find(begins)
            p1 = t.find(ends)
            summary = t[p0 + len(begins):p1].strip(" \n\r\t")
            summary = summary.replace("<", "&lt;")
            summary = summary.replace(">", "&gt;")
            adddots = True

        if not summary:
            p0 = t.find("<body>")
            p1 = t.find("</body>")
            summary = t[p0 + len("<body>"):p1].strip(" \n\r\t")
            summary = summary.replace("<", "&lt;")
            summary = summary.replace(">", "&gt;")

        if not summary:
            raise ValueError("summary is empty for blog " + f)

        summary = re.sub(r"\s+", " ", summary)
        # adddots is stored with the post: it only applies to the post
        # whose summary was extracted from the SUMMARY comments
        rss.append((day, f, summary, temp, title, adddots))

    if not rss:
        raise Exception(
            f"No found file in '{folder}' (raw count {nbfile}).")

    rows = ["<?xml version=\"1.0\" encoding=\"utf-8\"?>"]
    rows.append(model_feed)

    # most recent posts first
    rss.sort(reverse=True)
    for day, f, summary, short, title, adddots in rss:
        if adddots and not summary.endswith("..."):
            summary += " suite..."
        rows.append(model_row % (title, short, short, summary, str(day)))

    rows.append(model_channel)
    content = "\n".join(rows)
    with open(outfile, "w", encoding='utf-8') as rssf:
        rssf.write(content)

    return outfile, keepfiles

142 

143 

def check_encoding(file):
    """
    Check that a file can be read as text with the default encoding
    (no encoding is given to *open*), the function reads the whole file
    and does not return anything.

    @param      file        file to check

    The function raises *Exception* (chained to the original error)
    if the content cannot be read, the message contains the size of the file
    and its location.
    """
    # the context manager closes the file even if the reading fails
    with open(file, "r") as f:
        try:
            f.read()
        except Exception as e:
            size = os.stat(file).st_size
            raise Exception(
                f"issue with file (size {size})\n File \"{file}\", line 1") from e