Coverage for src/ensae_teaching_cs/td_1a/discours_politique.py: 13%

123 statements  


# -*- coding: utf-8 -*-
"""
@file
@brief Retrieves political speeches from the Internet.
"""
import re
import html.parser
import html.entities as htmlentitydefs
import warnings
from pyquickhelper.loghelper import get_url_content


def xmlParsingLongestDiv(text):
    """
    Extracts the longest div section.

    @param      text        text of the HTML page
    @return                 text
    """
    class MyHTMLParser(html.parser.HTMLParser):
        """
        Gets rid of paragraphs and bolded text.
        """

        def __init__(self):
            html.parser.HTMLParser.__init__(self, convert_charrefs=True)
            self.mtag = []
            self.mvalue = []
            self.mall = []

        def handle_starttag(self, tag, attrs):
            if tag == "div":
                self.mtag.append(tag)
                self.mvalue.append([])
            elif len(self.mtag) > 0:
                self.mvalue[-1].append(" ")

        def handle_endtag(self, tag):
            if tag == "div":
                self.mall.append((self.mtag[-1], "".join(self.mvalue[-1])))
                self.mtag.pop()
                self.mvalue.pop()
            elif len(self.mtag) > 0:
                if tag in ("p", "br"):
                    self.mvalue[-1].append("\n")
                else:
                    self.mvalue[-1].append(" ")

        def handle_data(self, data):
            if len(self.mtag) > 0:
                self.mvalue[-1].append(data)

    parser = MyHTMLParser()
    text = text.replace(" -g8\" ", " ")
    parser.feed(text)

    best = ""
    for tag, value in parser.mall:
        if tag == "div" and len(value) > len(best):
            best = value

    endLine = "\n"
    res = (best.replace("<p>", "").replace("</p>", endLine)
               .replace("\r", "")
               .replace("<br />", endLine).replace("<br>", endLine))
    exp = re.compile("[|]((.|\n){5,50}) ")
    nb = exp.findall(res)
    if (len(nb) == 0 or len(res) > 10000) and "if (window.xtparam!=null)" not in res:
        return res
    return ""
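
# A minimal usage sketch (the HTML string below is illustrative, not taken
# from the package):
page = ("<html><body><div>short</div>"
        "<div>a much longer speech body kept as the result</div></body></html>")
print(xmlParsingLongestDiv(page))  # prints the text of the longest <div>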


def html_unescape(text):
    """
    Removes :epkg:`HTML` or :epkg:`XML`
    character references and entities from a text string,
    but keeps ``&amp;``, ``&gt;``, ``&lt;`` escaped in the source code.
    From `Fredrik Lundh <http://effbot.org/zone/re-sub.htm#unescape-html>`_.

    @param      text        text
    @return                 cleaned text
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                return chr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                if text[1:-1] == "amp":
                    text = "&amp;"
                elif text[1:-1] == "gt":
                    text = "&gt;"
                elif text[1:-1] == "lt":
                    text = "&lt;"
                else:
                    text = chr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub("&#?\\w+;", fixup, text)
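
# A quick sketch of the behavior (outputs worked out by hand, not taken
# from the package's tests):
print(html_unescape("caf&eacute; &#233;lys&#xe9;e"))  # -> 'café élysée'
print(html_unescape("a &amp; b &lt; c"))  # kept escaped: 'a &amp; b &lt; c'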


def force_unicode(text):
    """
    Replaces common mojibake sequences until the text
    contains only ascii characters.

    @param      text        text
    @return                 text, or None if the replacements do not converge
    """
    turn = 0
    while True:
        try:
            # raises UnicodeEncodeError as long as non-ascii characters remain
            text = text.encode("ascii").decode("ascii")
            break
        except UnicodeEncodeError:
            # the first pattern maps "Ã" to "a", which is why the following
            # patterns look like "a§", "a¹", ...: they match what remains of
            # "Ã§", "Ã¹", ... after that first replacement
            text = text.replace("ô", "o").replace("é", "e").replace("Ã", "a")
            text = text.replace(" ", " ").replace("’", "'").replace("a§", "c")
            text = text.replace("a¹", "u").replace("a¨", "e").replace("a‰", "E")
            text = text.replace("a¢", "a").replace("aª", "e").replace("aƒÂ´", "o")
            text = text.replace("aƒÂ©", "e").replace("aƒÂ", "e").replace("©", "e")
            text = text.replace("a»", "u").replace("€", "E").replace("a®", "i")
            text = text.replace('\xa0', " ").replace("Å“", "oe").replace("«", " ")
            text = text.replace("»", " ").replace("e¹ ", "ei").replace("‚Â", " ")
            turn += 1
            if turn > 100:
                # too many iterations, gives up
                return None
    return text
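
# A small check of the loop, assuming the repaired exception handling above:
print(force_unicode("rôle décisif"))  # -> 'role decisif' after one pass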


def remove_accent(text):
    """
    Replaces French accented letters by their unaccented form.

    @param      text        text
    @return                 cleaned text
    """
    # each group starts with the bare letter followed by its accented variants
    for c in ["aàâä", "eéèêë", "iîï", "oöô", "uùüû"]:
        for d in c[1:]:
            text = text.replace(d, c[0])
    return text
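
# With the replacement direction fixed, every accented variant maps to the
# leading letter of its group; uppercase accents such as 'É' are not listed
# and pass through unchanged:
print(remove_accent("à côté du général"))  # -> 'a cote du general'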


def get_elysee_speech_from_elysees(title, url="https://www.elysee.fr/"):
    """
    Retrieves the text of one speech from the :epkg:`Elysees` website.

    @param      title       title of the document, or a full url
    @param      url         website
    @return                 html page, or None if the page cannot be retrieved

    The function builds something like::

        url + title.replace(" ", "-")
    """
    if title.startswith("http"):
        full = title
    else:
        if not url.endswith("/"):
            raise RuntimeError("url should end with /: " + url)
        link = remove_accent(title.lower()).replace(
            " ", "-").replace("'", "-").replace('"', "")
        # url already ends with "/", so no extra slash is inserted
        full = url + link + "/"
    try:
        text = get_url_content(full)
    except Exception as e:
        warnings.warn(f"Unable to retrieve '{full}' - {e}")
        return None
    return xmlParsingLongestDiv(text)
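
# A sketch of a call; it needs network access and the url below is
# hypothetical, for illustration only:
page = get_elysee_speech_from_elysees(
    "https://www.elysee.fr/emmanuel-macron/2023/01/01/voeux")
if page is not None:
    print(page[:200])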


def enumerate_speeches_from_elysees(url="agenda", skip=0):
    """
    Enumerates speeches from the :epkg:`Elysees`.

    @param      url         subaddress, the url source will be
                            ``'https://www.elysee.fr/' + url``
    @param      skip        skips the first *skip* links in the list
    @return                 enumerator over dictionaries

    .. exref::
        :title: Retrieve speeches of the French President
        :tag: Exercice

        ::

            for i, disc in enumerate(enumerate_speeches_from_elysees()):
                print(disc)

    Other links can be used such as
    ``https://www.elysee.fr/recherche?query=discours``.
    The website changed in 2018 and no longer supports xml or json
    streams.
    """
    base = "https://www.elysee.fr/"
    if not url.startswith("http"):
        url = base + url
    xml = get_url_content(url)
    reg = re.compile(
        "href=\\\"(.+?/[0-9]{4}/[0-9]{2}/[0-9]{2}/.+?)\\\" class=")
    links = reg.findall(xml)
    if len(links) == 0:
        raise ValueError(
            "Unable to extract links from url='{0}'\npattern='{1}'\n-----\n{2}".format(
                url, reg, xml))
    for i, link in enumerate(links):
        if i < skip:
            continue
        if link.startswith("/"):
            # base ends with "/", strip the leading one to avoid "//"
            link = base + link.lstrip("/")
        content = get_elysee_speech_from_elysees(link)
        if content is not None:
            yield dict(link=link, text=content)
276 url, reg, xml))