Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Functions get_page_wheel 

5""" 

6 

7import sys 

8from ssl import SSLEOFError 

9from .install_memoize import install_memoize 

10from .internet_settings import default_user_agent 

11 

12if sys.version_info[0] == 2: 

13 import urllib2 as urllib_request 

14 from codecs import open 

15 from HTMLParser import HTMLParser 

16else: 

17 import urllib.request as urllib_request 

18 from html.parser import HTMLParser 

19 from urllib.error import URLError 

20 

21 

class InternalJsException(Exception):
    """
    Exception raised when a javascript url cannot be decrypted.
    """

27 

28 

@install_memoize
def get_page_wheel(page, sele=True):
    """
    Downloads a page and returns its content once cleaned
    by ``_clean_page_wheel``.

    The page is first requested with *urllib*. If that request fails with
    ``SSLEOFError`` or ``URLError`` and *sele* is true, the page is fetched
    again with selenium (Chrome).

    @param      page        location (url)
    @param      sele        if True, falls back on selenium when *urllib*
                            fails with ``SSLEOFError`` or ``URLError``,
                            otherwise the exception is raised again
    @return                 page content (string)

    @raise      ValueError  if selenium returns less than 1000 characters
    @raise      Exception   if *urllib* fails for any other reason
                            (the original exception is chained)
    """
    req = urllib_request.Request(
        page,
        headers={
            'User-agent': default_user_agent})
    try:
        u = urllib_request.urlopen(req)
    except (SSLEOFError, URLError):
        # This usually happens on Windows.
        # ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:749)
        if not sele:
            raise
        from ..installcustom.install_custom_chromedriver import install_chromedriver
        import selenium.webdriver
        install_chromedriver(fLOG=None)
        browser = selenium.webdriver.Chrome()
        try:
            browser.get(page)
            text = browser.page_source
        finally:
            # The browser window is closed even if the page cannot be loaded.
            browser.close()
        if len(text) < 1000:
            raise ValueError(
                "Unable to retrieve information from '{0}' with selenium len={1}".format(page, len(text)))
    except Exception as e:
        raise Exception(
            "unable to get '{0}' '{1}'".format(page, type(e))) from e
    else:
        # urllib succeeded: the response is closed even if reading fails.
        try:
            text = u.read()
        finally:
            u.close()
        text = text.decode("utf8")

    return _clean_page_wheel(text)

72 

73 

def _clean_page_wheel(text):
    """
    Replaces a few HTML entities by plain characters.

    @param      text        string
    @return                 string
    """
    # Applied one after the other, in this order.
    substitutions = (
        ("&quot;", "'"),
        ("&#8209;", "-"),
        ("&#46;", "."),
        (" &middot; ", "-"),
        ("&ndash;", "-"),
    )
    for entity, plain in substitutions:
        text = text.replace(entity, plain)
    return text

87 

88 

def save_page_wheel(filename, content):
    """
    Caches a HTML page: writes *content* into *filename*
    with encoding utf8 (an existing file is overwritten).

    @param      filename        filename
    @param      content         content (string)
    @return                     filename
    """
    with open(filename, "w", encoding="utf8") as f:
        f.write(content)
    # The documented contract is to return the filename.
    return filename

99 

100 

def read_page_wheel(filename):
    """
    Reads a cached HTML page (encoding utf8) and cleans it
    with ``_clean_page_wheel``.

    @param      filename        filename
    @return                     cleaned content of the file
    """
    with open(filename, "r", encoding="utf8") as handle:
        raw = handle.read()
    cleaned = _clean_page_wheel(raw)
    return cleaned

111 

112 

def _cg_dl1(ml, mi):
    """
    Decodes string *mi* with the table *ml*: every character ``c`` of *mi*
    is replaced by ``chr(ml[ord(c) - 48])``
    (python version of the javascript function ``dl1``).

    @param      ml      sequence of character codes (integers)
    @param      mi      string, each character is an index in *ml*
                        shifted by 48 (``'0'`` means index 0)
    @return             decoded string
    """
    return "".join(chr(ml[ord(c) - 48]) for c in mi)

118 

119 

def _cg_dl(ml, mi, fLOG=None):
    """
    Python version of the javascript function ``dl`` shown below:
    a few HTML entities are replaced in *mi*, then *mi* is decoded
    with ``_cg_dl1``.

    @param      ml      sequence of character codes
    @param      mi      encoded string
    @param      fLOG    logging function, called with *ml* then *mi*
                        before decoding (nothing is logged if None)
    @return             decoded string

    compressed::

        if (top.location!=location) top.location.href=location.href;
        function dc(ml,mi){var ot="";for(var j=0;j<mi.length;j++)ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
        document.write(ot);}function dl1(ml,mi){var ot="";for(var j=0;j<mi.length;j++)ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
        location.href=ot;}function dl(ml,mi){mi=mi.replace('&lt;','<');mi=mi.replace('&#62;','>');mi=mi.replace('&#38;','&');
        setTimeout(function(){dl1(ml,mi)},1500);}

    source::

        <script type="text/javascript">
        // <![CDATA[
        if (top.location!=location)
            top.location.href=location.href;
        function dc(ml,mi)
        {
            var ot="";
            for(var j=0;j<mi.length;j++)
                ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
            document.write(ot);
        }
        function dl1(ml,mi)
        {
            var ot="";
            for(var j=0;j<mi.length;j++)
                ot+=String.fromCharCode(ml[mi.charCodeAt(j)-48]);
            location.href=ot;
        }
        function dl(ml,mi)
        {
            mi=mi.replace('&lt;','<');
            mi=mi.replace('&#62;','>');
            mi=mi.replace('&#38;','&');
            setTimeout(function(){dl1(ml,mi)},1500);
        }
        // ]]>
        </script>
    """
    if fLOG:
        for logged in (ml, mi):
            fLOG("[pymy] decode", logged)
    # Entities are replaced one after the other, in this order.
    entities = (
        ('&lt;', '<'),
        ('&#62;', '>'),
        ('&gt;', '>'),
        ('&#38;', '&'),
    )
    for entity, char in entities:
        mi = mi.replace(entity, char)
    return _cg_dl1(ml, mi)

168 

169 

class HTMLParser4Links(HTMLParser):
    """
    Extracts all links in a HTML page.

    After calling ``feed``, attribute *links* contains one tuple
    ``(text, attributes)`` per tag ``<a>`` whose text is not empty,
    *attributes* is the list of pairs ``(name, value)`` of the tag.
    Non-breaking hyphens and non-breaking spaces are replaced
    by ``-`` and a space in the text and in the attributes.
    """

    def __init__(self):
        """
        constructor
        """
        if sys.version_info[0] == 2:
            HTMLParser.__init__(self)
        else:
            HTMLParser.__init__(self, convert_charrefs=True)
        # collected links
        self.links = []
        # text of the tag <a> being parsed, None outside of a tag <a>
        self.current = None
        # attributes of the last opened tag <a>
        self.attrs = []

    @staticmethod
    def _clean_dashes(st):
        """
        Replaces non-breaking hyphens (U+2011) by ``-`` and
        non-breaking spaces (U+00A0) by a space.

        @param      st      string or None (attribute without any value)
        @return             cleaned string or None
        """
        if st is None:
            # HTMLParser gives None for attributes such as <a download>.
            return None
        return st.replace("\u2011", "-").replace("\xa0", " ")

    def handle_starttag(self, tag, attrs):
        """
        enters a tag, starts collecting the text if it is a tag ``<a>``
        """
        if tag == "a":
            self.current = ""
            self.attrs = attrs

    def handle_endtag(self, tag):
        """
        ends of a tag, stores the link if the tag ``<a>`` has some text
        """
        if tag != "a":
            return
        if self.current:
            clean = self._clean_dashes
            app = (clean(self.current),
                   [(clean(name), clean(link)) for name, link in self.attrs])
            self.links.append(app)
        self.current = None

    def handle_data(self, data):
        """
        stores data if inside a link
        """
        if self.current is not None:
            self.current += data

216 

217 

def extract_all_links(text):
    """
    Parses a HTML page with ``HTMLParser4Links``
    and returns the links it collected.

    @param      text        HTML page
    @return                 list of links
    """
    links_parser = HTMLParser4Links()
    links_parser.feed(text)
    found = links_parser.links
    return found

228 

229 

def enumerate_links_module(name, alls, version, plat):
    """
    Selects the links for a specific module.

    A link is selected if its lower-cased text starts with the module name
    followed by ``-`` (the name is also tried with ``-`` replaced by ``_``),
    contains *plat* and contains ``cpXY`` or ``pyXY`` where ``XY`` is built
    from *version*. Selected links without any attribute ``onclick``
    are skipped.

    @param      name        module name
    @param      alls        all links from @see fn extract_all_links,
                            every item is ``(text, [(attribute, value), ...])``
    @param      version     python version, only the first two items are used
                            (major, minor)
    @param      plat        platform, searched in the lower-cased link text
    @return                 iterator on tuples
                            ``(link text, javascript expression, decoded value)``

    @raise      SyntaxError             if the javascript expression cannot be evaluated
    @raise      InternalJsException     if attribute ``onclick`` does not start with
                                        ``javascript:`` or if nothing was decoded
    """
    import re

    version = "%d%d" % tuple(version[:2])
    vers = ("cp" + version, "py" + version)
    lname = name.lower()
    starts = (lname + "-", lname.replace("-", "_") + "-")
    # Accepts "javascript:" with any whitespace before the colon.
    js_scheme = re.compile(r"javascript\s*:")
    suf = '"javascript:dl("'

    for a in alls:
        n = a[0]
        ln = n.lower()
        if not ln.startswith(starts) or plat not in ln:
            continue
        if not any(v in ln for v in vers):
            continue

        # The last attribute onclick wins.
        js = None
        for at, val in a[1]:
            if at == "onclick" and val is not None:
                js = val.lstrip()

        if not js:
            continue

        js0 = js
        res = None
        match = js_scheme.match(js)
        if match is not None:
            js = js[match.end():]
            if js.endswith(suf):
                js = js[:-len(suf) - 2]
            if "javascript:" in js:
                # Addition: 207-08-24 (sic, presumably 2017-08-24)
                js = js[:js.index('javascript:')]
            js = js.strip('" \t ;\'')
            # SECURITY: js comes from a downloaded HTML page and is evaluated
            # as a python expression expected to look like dl([...], "...").
            # The namespace only exposes dl, this is not a real sandbox,
            # do not use this function on pages which cannot be trusted.
            try:
                res = eval(js, {"__builtins__": {}}, {"dl": _cg_dl})
            except SyntaxError as e:
                raise SyntaxError(
                    "Unable to evaluate '{0}'\njs0='{1}'.".format(js, js0)) from e
        if res is None:
            raise InternalJsException(
                "Unable to decode js '{0}'".format(js))
        yield n, js, res