Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief A function to download the content of a url.
4"""
5import os
6from datetime import datetime
7import socket
8import gzip
9import warnings
10import hashlib
11import urllib.error as urllib_error
12import urllib.request as urllib_request
13import http.client as http_client
14try:
15 from http.client import InvalidURL
16except ImportError:
17 InvalidURL = ValueError
class InternetException(Exception):
    """
    Raised by @see fn get_url_content_timeout when the content
    of a url cannot be retrieved.
    """
def get_url_content_timeout(url, timeout=10, output=None, encoding="utf8",
                            raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads a file from internet (by default, it assumes
    it is text information, otherwise, encoding should be None).

    @param      url                 (str) url
    @param      timeout             (int) in seconds, -1 to call *urlopen* without
                                    any explicit timeout
    @param      output              (str) if not None, the content is stored in that file
    @param      encoding            (str) utf8 by default, if it is None,
                                    the returned information is binary
    @param      raise_exception     (bool) True to raise an exception,
                                    False to emit a warning and return None
    @param      chunk               (int|None) if not None, the data is read and
                                    written to *output* by pieces of *chunk* bytes
                                    (*output* must then be specified)
    @param      fLOG                logging function (only applies when chunk is not None)
    @return                         content of the url (*str* if *encoding* is not None,
                                    *bytes* otherwise), None if *chunk* is not None
                                    or if the download failed and *raise_exception* is False

    If *chunk* is None and the downloaded data starts with the gzip
    magic number, it is decompressed. No gzip detection happens
    when *chunk* is specified.

    If *chunk* is None and the data cannot be decoded with *encoding*,
    the function tries ``iso-8859-1`` and ``latin-1`` and raises
    *ValueError* if every attempt fails.

    The function raises the exception @see cl InternetException
    when the download fails and *raise_exception* is True.
    An exception which does not belong to the explicitly handled
    ones is raised again after the warning when *raise_exception* is False.
    """
    # Local import: only needed to decode text chunk by chunk.
    import codecs

    def save_content(content, append=False):
        "Writes *content* into *output*, as text if *encoding* is not None, as bytes otherwise."
        mode = "a" if append else "w"
        if encoding is not None:
            with open(output, mode, encoding=encoding) as f:
                f.write(content)
        else:
            with open(output, mode + "b") as f:
                f.write(content)

    def _local_loop(ur):
        "Copies the response *ur* into *output*, *chunk* bytes at a time."
        # An incremental decoder keeps the bytes of a character split
        # across two chunks instead of failing on the first half.
        decoder = None
        if encoding is not None:
            decoder = codecs.getincrementaldecoder(encoding)()
        append = False  # the first write truncates a pre-existing file
        size = 0
        while True:
            data = ur.read(chunk)
            size += len(data)
            if fLOG is not None:
                fLOG("[get_url_content_timeout] downloaded", size, "bytes")
            if len(data) == 0:
                break
            if decoder is not None:
                data = decoder.decode(data)
            save_content(data, append)
            append = True
        if decoder is not None:
            # Raises UnicodeDecodeError if the stream ends in the
            # middle of a character.
            tail = decoder.decode(b"", final=True)
            if tail:
                save_content(tail, append)

    # -1 means: do not give any timeout to urlopen.
    open_kwargs = {} if timeout == -1 else {"timeout": timeout}

    try:
        if chunk is not None:
            if output is None:
                raise ValueError(
                    "output cannot be None if chunk is not None")
            with urllib_request.urlopen(url, **open_kwargs) as ur:
                _local_loop(ur)
        else:
            with urllib_request.urlopen(url, **open_kwargs) as ur:
                res = ur.read()
    except (urllib_error.HTTPError, urllib_error.URLError,
            ConnectionRefusedError) as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}'".format(url)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' exc: {1}".format(url, e), ResourceWarning)
        return None
    except socket.timeout as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}'".format(url)) from e
        warnings.warn("unable to retrieve content from {0} because of timeout {1}: {2}".format(
            url, timeout, e), ResourceWarning)
        return None
    except ConnectionResetError as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}'".format(url)) from e
        warnings.warn(
            "unable to retrieve content from {0} because of ConnectionResetError: {1}".format(url, e), ResourceWarning)
        return None
    except http_client.BadStatusLine as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}'".format(url)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' because of http.client.BadStatusLine: {1}".format(url, e), ResourceWarning)
        return None
    except http_client.IncompleteRead as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content url='{0}'".format(url)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' because of http.client.IncompleteRead: {1}".format(url, e), ResourceWarning)
        return None
    except (ValueError, InvalidURL) as e:
        # Also catches the ValueError raised above when output is None
        # and any UnicodeDecodeError raised while decoding chunks.
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content url='{0}'".format(url)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' because of {1}".format(url, e), ResourceWarning)
        return None
    except Exception as e:
        if raise_exception:
            raise InternetException(
                "Unable to retrieve content, url='{0}', exc={1}".format(url, e)) from e
        warnings.warn(
            "Unable to retrieve content from '{0}' because of unknown exception: {1}".format(url, e), ResourceWarning)
        # Unknown failures are not silenced, only announced.
        raise

    if chunk is not None:
        # Everything was already written to *output*, nothing is kept in memory.
        return None

    if res[:2] == b"\x1f\x8b":
        # gzip format
        res = gzip.decompress(res)

    if encoding is None:
        content = res
    else:
        try:
            content = res.decode(encoding)
        except UnicodeDecodeError as e:
            # it tries different encodings
            failures = [e]
            othenc = ["iso-8859-1", "latin-1"]
            content = None
            for candidate in othenc:
                try:
                    content = res.decode(candidate)
                    break
                except UnicodeDecodeError as exc:
                    failures.append(exc)

            if content is None:
                mes = ["Unable to parse text from '{0}'.".format(url)]
                mes.append("tried:" + str([encoding] + othenc))
                mes.append("beginning:\n" + str([res])[:50])
                for failure in failures:
                    mes.append("Exception: " + str(failure))
                raise ValueError("\n".join(mes))

    if output is not None:
        save_content(content)

    return content
def _hash_url(url):
    """
    Returns the first 25 hexadecimal characters of the SHA-256
    digest of *url* encoded in utf-8.
    """
    return hashlib.sha256(url.encode('utf-8')).hexdigest()[:25]
def get_urls_content_timeout(urls, timeout=10, folder=None, encoding=None,
                             raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads data from urls into a folder (by default, it assumes
    it is binary information, otherwise, encoding should be specified).
    Every url is saved into ``<folder>/<hash of the url>.bin`` and
    the list of downloaded urls is stored in ``<folder>/summary.csv``.
    A url whose hash is already in the summary is not downloaded again.

    :param urls: list of urls
    :param timeout: in seconds, forwarded to :func:`get_url_content_timeout`
        (-1 to give no explicit timeout)
    :param folder: folder where the downloaded files and ``summary.csv``
        are stored, it cannot be None and the function does not create it
    :param encoding: None by default (binary content),
        otherwise the encoding used to decode the content
    :param raise_exception: True to raise an exception, False to send a warning
    :param chunk: forwarded to :func:`get_url_content_timeout`
    :param fLOG: logging function, called once per recorded download
    :return: list of dictionaries with keys *url*, *size*, *date*, *dest*,
        rows already in the summary come first

    :raises TypeError: if *urls* is not a list
    :raises ValueError: if *folder* is None

    The summary is written even if a download raises an exception,
    so that the files already downloaded are not fetched again
    by the next call.

    NOTE: :func:`get_url_content_timeout` returns None when *chunk*
    is not None, such downloads are therefore neither logged nor
    added to the summary.
    """
    import pandas
    import pandas.errors
    if not isinstance(urls, list):
        raise TypeError("urls must be a list")
    if folder is None:
        raise ValueError("folder should not be None")
    summary = os.path.join(folder, "summary.csv")

    df = None
    if os.path.exists(summary):
        try:
            df = pandas.read_csv(summary)
        except pandas.errors.EmptyDataError:
            # An empty summary is what this function writes
            # when nothing was downloaded.
            df = None

    if df is not None:
        all_obs = [dict(url=df.loc[i, 'url'],  # pylint: disable=E1101
                        size=df.loc[i, 'size'],  # pylint: disable=E1101
                        date=df.loc[i, 'date'],  # pylint: disable=E1101
                        dest=df.loc[i, 'dest'])  # pylint: disable=E1101
                   for i in range(df.shape[0])]  # pylint: disable=E1101
        done = {d['dest'] for d in all_obs}
    else:
        all_obs = []
        done = set()

    try:
        for i, url in enumerate(urls):
            dest = _hash_url(url)
            if dest in done:
                continue
            full_dest = os.path.join(folder, dest + '.bin')
            content = get_url_content_timeout(url, timeout=timeout, output=full_dest,
                                              encoding=encoding, chunk=chunk,
                                              raise_exception=raise_exception)
            if content is None:
                continue
            if fLOG is not None:
                fLOG("{}/{} downloaded {} bytes from '{}' to '{}'.".format(
                    i + 1, len(urls), len(content), url, dest + '.bin'))

            obs = dict(url=url, size=len(content), date=datetime.now(),
                       dest=dest)
            all_obs.append(obs)
            done.add(dest)
    finally:
        # Saved in all cases: a failing url must not lose the record
        # of the downloads which succeeded before it.
        new_df = pandas.DataFrame(all_obs)
        new_df.to_csv(summary, index=False)
    return all_obs
def local_url(url, folder=None, envvar='REPO_LOCAL_URLS'):
    """
    Returns the local copy of a url if there is one.
    The copy is searched in *folder* or, if *folder* is None,
    in the folder stored in environment variable *envvar*.
    The expected file name is the hash of the url
    followed by ``.bin``.

    :param url: url to replace
    :param folder: local folder
    :param envvar: environment variable
    :return: the local file if it exists, the url otherwise
    :raises FileNotFoundError: if *folder* is None and
        the environment variable is not defined
    """
    root = folder if folder is not None else os.environ.get(envvar)
    if root is None:
        raise FileNotFoundError(
            "Unable to find local folder '{}' or environment variable '{}'.".format(
                root, envvar))
    candidate = os.path.join(root, _hash_url(url) + '.bin')
    return candidate if os.path.exists(candidate) else url