Coverage for src/ensae_teaching_cs/td_1a/discours_politique.py: 13%

123 statements  


# -*- coding: utf-8 -*-
"""
@file
@brief Retrieves political speeches from the Internet.
"""
import re
import html.parser
import html.entities as htmlentitydefs
import warnings
from pyquickhelper.loghelper import get_url_content


def xmlParsingLongestDiv(text):
    """
    Extracts the longest div section.

    @param      text        text of the HTML page
    @return                 text
    """
    class MyHTMLParser(html.parser.HTMLParser):
        """
        Gets rid of paragraphs and bolded text.
        """

        def __init__(self):
            html.parser.HTMLParser.__init__(self, convert_charrefs=True)
            self.mtag = []
            self.mvalue = []
            self.mall = []

        def handle_starttag(self, tag, attrs):
            if tag == "div":
                self.mtag.append(tag)
                self.mvalue.append([])
            elif len(self.mtag) > 0:
                self.mvalue[-1].append(" ")

        def handle_endtag(self, tag):
            if tag == "div":
                self.mall.append((self.mtag[-1], "".join(self.mvalue[-1])))
                self.mtag.pop()
                self.mvalue.pop()
            elif len(self.mtag) > 0:
                if tag in ("p", "br"):
                    self.mvalue[-1].append("\n")
                else:
                    self.mvalue[-1].append(" ")

        def handle_data(self, data):
            if len(self.mtag) > 0:
                self.mvalue[-1].append(data)

    parser = MyHTMLParser()
    text = text.replace(" -g8\" ", " ")
    parser.feed(text)

    best = ""
    for tag, value in parser.mall:
        if tag == "div" and len(value) > len(best):
            best = value

    endLine = "\n"
    res = (best.replace("<p>", "").replace("</p>", endLine)
               .replace("\r", "")
               .replace("<br />", endLine).replace("<br>", endLine))
    exp = re.compile("[|]((.|\n){5,50}) ")
    nb = exp.findall(res)
    if (len(nb) == 0 or len(res) > 10000) and "if (window.xtparam!=null)" not in res:
        return res
    return ""
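
# A minimal usage sketch (the HTML string below is illustrative, not taken
# from the package):
page = ("<html><body><div>short</div>"
        "<div>a much longer speech body kept as the result</div></body></html>")
print(xmlParsingLongestDiv(page))  # prints the text of the longest <div>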


def html_unescape(text):
    """
    Removes :epkg:`HTML` or :epkg:`XML`
    character references and entities from a text string,
    but keeps ``&amp;``, ``&gt;``, ``&lt;`` escaped in the source code.
    From `Fredrik Lundh <http://effbot.org/zone/re-sub.htm#unescape-html>`_.

    @param      text        text
    @return                 cleaned text
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # numeric character reference
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                return chr(int(text[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                if text[1:-1] == "amp":
                    text = "&amp;"
                elif text[1:-1] == "gt":
                    text = "&gt;"
                elif text[1:-1] == "lt":
                    text = "&lt;"
                else:
                    text = chr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is
    return re.sub("&#?\\w+;", fixup, text)
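
# A quick sketch of the behavior (outputs worked out by hand, not taken
# from the package's tests):
print(html_unescape("caf&eacute; &#233;lys&#xe9;e"))  # -> 'café élysée'
print(html_unescape("a &amp; b &lt; c"))  # kept escaped: 'a &amp; b &lt; c'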


def force_unicode(text):
    """
    Replaces common mojibake sequences until the text
    contains only ascii characters.

    @param      text        text
    @return                 text, or None if the replacements do not converge
    """
    turn = 0
    while True:
        try:
            # raises UnicodeEncodeError as long as non-ascii characters remain
            text = text.encode("ascii").decode("ascii")
            break
        except UnicodeEncodeError:
            # the first pattern maps "Ã" to "a", which is why the following
            # patterns look like "a§", "a¹", ...: they match what remains of
            # "Ã§", "Ã¹", ... after that first replacement
            text = text.replace("ô", "o").replace("é", "e").replace("Ã", "a")
            text = text.replace(" ", " ").replace("’", "'").replace("a§", "c")
            text = text.replace("a¹", "u").replace("a¨", "e").replace("a‰", "E")
            text = text.replace("a¢", "a").replace("aª", "e").replace("aƒÂ´", "o")
            text = text.replace("aƒÂ©", "e").replace("aƒÂ", "e").replace("©", "e")
            text = text.replace("a»", "u").replace("€", "E").replace("a®", "i")
            text = text.replace('\xa0', " ").replace("Å“", "oe").replace("«", " ")
            text = text.replace("»", " ").replace("e¹ ", "ei").replace("‚Â", " ")
            turn += 1
            if turn > 100:
                # too many iterations, gives up
                return None
    return text
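
# A small check of the loop, assuming the repaired exception handling above:
print(force_unicode("rôle décisif"))  # -> 'role decisif' after one pass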


def remove_accent(text):
    """
    Replaces French accented letters by their unaccented form.

    @param      text        text
    @return                 cleaned text
    """
    # each group starts with the bare letter followed by its accented variants
    for c in ["aàâä", "eéèêë", "iîï", "oöô", "uùüû"]:
        for d in c[1:]:
            text = text.replace(d, c[0])
    return text
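
# With the replacement direction fixed, every accented variant maps to the
# leading letter of its group; uppercase accents such as 'É' are not listed
# and pass through unchanged:
print(remove_accent("à côté du général"))  # -> 'a cote du general'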


def get_elysee_speech_from_elysees(title, url="https://www.elysee.fr/"):
    """
    Retrieves the text of one speech from the :epkg:`Elysees` website.

    @param      title       title of the document, or a full url
    @param      url         website
    @return                 html page, or None if the page cannot be retrieved

    The function builds something like::

        url + title.replace(" ", "-")
    """
    if title.startswith("http"):
        full = title
    else:
        if not url.endswith("/"):
            raise RuntimeError("url should end with /: " + url)
        link = remove_accent(title.lower()).replace(
            " ", "-").replace("'", "-").replace('"', "")
        # url already ends with "/", so no extra slash is inserted
        full = url + link + "/"
    try:
        text = get_url_content(full)
    except Exception as e:
        warnings.warn(f"Unable to retrieve '{full}' - {e}")
        return None
    return xmlParsingLongestDiv(text)
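
# A sketch of a call; it needs network access and the url below is
# hypothetical, for illustration only:
page = get_elysee_speech_from_elysees(
    "https://www.elysee.fr/emmanuel-macron/2023/01/01/voeux")
if page is not None:
    print(page[:200])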


def enumerate_speeches_from_elysees(url="agenda", skip=0):
    """
    Enumerates speeches from the :epkg:`Elysees`.

    @param      url         subaddress, the url source will be
                            ``'https://www.elysee.fr/' + url``
    @param      skip        skips the first *skip* links in the list
    @return                 enumerator over dictionaries

    .. exref::
        :title: Retrieve speeches of the French President
        :tag: Exercice

        ::

            for i, disc in enumerate(enumerate_speeches_from_elysees()):
                print(disc)

    Other links can be used such as
    ``https://www.elysee.fr/recherche?query=discours``.
    The website changed in 2018 and no longer supports xml or json
    streams.
    """
    base = "https://www.elysee.fr/"
    if not url.startswith("http"):
        url = base + url
    xml = get_url_content(url)
    reg = re.compile(
        "href=\\\"(.+?/[0-9]{4}/[0-9]{2}/[0-9]{2}/.+?)\\\" class=")
    links = reg.findall(xml)
    if len(links) == 0:
        raise ValueError(
            "Unable to extract links from url='{0}'\npattern='{1}'\n-----\n{2}".format(
                url, reg, xml))
    for i, link in enumerate(links):
        if i < skip:
            continue
        if link.startswith("/"):
            # base ends with "/", strip the leading one to avoid "//"
            link = base + link.lstrip("/")
        content = get_elysee_speech_from_elysees(link)
        if content is not None:
            yield dict(link=link, text=content)
276 url, reg, xml))