Coverage for pyquickhelper/filehelper/download_helper.py: 91%
117 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
1"""
2@file
3@brief A function to download the content of a url.
4"""
5import os
6from datetime import datetime
7import socket
8import gzip
9import warnings
10import hashlib
11import urllib.error as urllib_error
12import urllib.request as urllib_request
13import http.client as http_client
14try:
15 from http.client import InvalidURL
16except ImportError: # pragma: no cover
17 InvalidURL = ValueError
class InternetException(Exception):
    """
    Raised by @see fn get_url_content_timeout when the content
    of a url cannot be retrieved.
    """
    pass
def get_url_content_timeout(url, timeout=10, output=None, encoding="utf8",
                            raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads a file from internet (by default, it assumes
    it is text information, otherwise, encoding should be None).

    @param      url                 (str) url
    @param      timeout             (int) in seconds, after this time, the function
                                    drops and returns None, -1 for forever
    @param      output              (str) if not None, the content is stored in that file
    @param      encoding            (str) utf8 by default, but if it is None,
                                    the returned information is binary
    @param      raise_exception     (bool) True to raise an exception, False to send a warning
    @param      chunk               (int|None) save data every chunk
                                    (only if output is not None), in that case
                                    the function returns None
    @param      fLOG                logging function (only applies when chunk is not None)
    @return                         content of the url, None when downloaded chunk by chunk

    If the function automatically detects that the downloaded data is in gzip
    format, it will decompress it (only when *chunk* is None).

    The function raises the exception @see cl InternetException.
    """
    def save_content(content, append=False):
        "Writes or appends *content* to *output*, in text or binary mode."
        app = "a" if append else "w"
        if encoding is not None:
            with open(output, app, encoding=encoding) as f:
                f.write(content)
        else:
            with open(output, app + "b") as f:
                f.write(content)

    try:
        if chunk is not None:
            if output is None:
                raise ValueError(
                    "output cannot be None if chunk is not None")
            # mutable cells so that the nested closure can update them
            app = [False]
            size = [0]

            def _local_loop(ur):
                "Reads *ur* chunk by chunk and saves every piece to *output*."
                while True:
                    res = ur.read(chunk)
                    size[0] += len(res)  # pylint: disable=E1137
                    if fLOG is not None:
                        fLOG("[get_url_content_timeout] downloaded",
                             size[0], "bytes")
                    if len(res) > 0:
                        if encoding is not None:
                            # NOTE(review): decoding chunk by chunk may fail if
                            # a multi-byte character is split across two chunks.
                            res = res.decode(encoding=encoding)
                        # The first chunk overwrites the file, the following
                        # ones are appended. The previous code passed the list
                        # *app* itself (always truthy), so the file was always
                        # opened in append mode and a pre-existing output file
                        # was never overwritten.
                        save_content(res, app[0])
                        app[0] = True  # pylint: disable=E1137
                    else:
                        break

            if timeout != -1:
                with urllib_request.urlopen(url, timeout=timeout) as ur:
                    _local_loop(ur)
            else:
                with urllib_request.urlopen(url) as ur:
                    _local_loop(ur)
        else:
            if timeout != -1:
                with urllib_request.urlopen(url, timeout=timeout) as ur:
                    res = ur.read()
            else:
                with urllib_request.urlopen(url) as ur:
                    res = ur.read()
    except (urllib_error.HTTPError, urllib_error.URLError, ConnectionRefusedError,
            socket.timeout, ConnectionResetError, http_client.BadStatusLine,
            http_client.IncompleteRead, ValueError, InvalidURL) as e:
        if raise_exception:
            raise InternetException(
                f"Unable to retrieve content url='{url}'") from e
        warnings.warn(
            f"Unable to retrieve content from '{url}' because of {e}", ResourceWarning)
        return None
    except Exception as e:
        # unexpected exception: re-raised even when raise_exception is False
        if raise_exception:  # pragma: no cover
            raise InternetException(
                f"Unable to retrieve content, url='{url}', exc={e}") from e
        warnings.warn(
            f"Unable to retrieve content from '{url}' because of unknown exception: {e}", ResourceWarning)
        raise e

    if chunk is None:
        if len(res) >= 2 and res[:2] == b"\x1f\x8B":
            # gzip magic number: the data is decompressed before decoding
            res = gzip.decompress(res)

        if encoding is not None:
            try:
                content = res.decode(encoding)
            except UnicodeDecodeError as e:  # pragma: no cover
                # the expected encoding failed, fallback encodings are tried
                laste = [e]
                othenc = ["iso-8859-1", "latin-1"]

                for encode in othenc:
                    try:
                        content = res.decode(encode)
                        break
                    except UnicodeDecodeError as ee:
                        laste.append(ee)
                        content = None

                if content is None:
                    mes = [f"Unable to parse text from '{url}'."]
                    mes.append("tried:" + str([encoding] + othenc))
                    mes.append("beginning:\n" + str([res])[:50])
                    for e in laste:
                        mes.append("Exception: " + str(e))
                    raise ValueError("\n".join(mes))
        else:
            content = res
    else:
        # everything was already written to *output* chunk by chunk
        content = None

    if output is not None and chunk is None:
        save_content(content)

    return content
153def _hash_url(url):
154 m = hashlib.sha256()
155 m.update(url.encode('utf-8'))
156 return m.hexdigest()[:25]
def get_urls_content_timeout(urls, timeout=10, folder=None, encoding=None,
                             raise_exception=True, chunk=None, fLOG=None):
    """
    Downloads data from a list of urls (by default, it assumes
    it is binary information, otherwise, encoding should be set).

    :param urls: list of urls
    :param timeout: in seconds, after this time, the function gives up on a url,
        -1 for forever
    :param folder: destination folder (cannot be None), downloads are cached there
        in a ``summary.csv`` file
    :param encoding: None by default (binary content), set it to store text instead
    :param raise_exception: True to raise an exception, False to send a warning
    :param chunk: save data every chunk (only if output is not None)
    :param fLOG: logging function (only applies when chunk is not None)
    :return: list of dictionaries, one per downloaded url
        (keys *url*, *size*, *date*, *dest*)

    If the function automatically detects that the downloaded data is in gzip
    format, it will decompress it.

    The function raises the exception @see cl InternetException.
    """
    import pandas
    import pandas.errors
    if not isinstance(urls, list):
        raise TypeError("urls must be a list")
    if folder is None:
        raise ValueError("folder should not be None")

    # loads the summary of previous downloads when it exists
    summary = os.path.join(folder, "summary.csv")
    df = None
    if os.path.exists(summary):
        try:
            df = pandas.read_csv(summary)
        except pandas.errors.EmptyDataError:
            df = None

    if df is None:
        all_obs = []
        done = set()
    else:
        all_obs = [dict(url=df.loc[i, 'url'],  # pylint: disable=E1101
                        size=df.loc[i, 'size'],  # pylint: disable=E1101
                        date=df.loc[i, 'date'],  # pylint: disable=E1101
                        dest=df.loc[i, 'dest'])  # pylint: disable=E1101
                   for i in range(df.shape[0])]  # pylint: disable=E1101
        done = {obs['dest'] for obs in all_obs}

    for index, url in enumerate(urls):
        dest = _hash_url(url)
        if dest in done:
            # already downloaded in a previous call
            continue
        full_dest = os.path.join(folder, dest + '.bin')
        content = get_url_content_timeout(url, timeout=timeout, output=full_dest,
                                          encoding=encoding, chunk=chunk,
                                          raise_exception=raise_exception)
        if content is None:
            # NOTE(review): get_url_content_timeout always returns None when
            # *chunk* is not None, so chunked downloads are never recorded
            # in the summary — confirm this is intended.
            continue
        if fLOG is not None:
            fLOG("{}/{} downloaded {} bytes from '{}' to '{}'.".format(
                index + 1, len(urls), len(content), url, dest + '.bin'))

        done.add(dest)
        all_obs.append(dict(url=url, size=len(content), date=datetime.now(),
                            dest=dest))

    pandas.DataFrame(all_obs).to_csv(summary, index=False)
    return all_obs
def local_url(url, folder=None, envvar='REPO_LOCAL_URLS'):
    """
    Replaces the url by a local file in a folder
    or an environment variable
    if *folder* is None.

    :param url: url to replace
    :param folder: local folder
    :param envvar: environment variable
    :return: local file or url
    """
    if folder is None:
        # falls back on the folder stored in the environment variable
        folder = os.environ.get(envvar, None)  # pragma: no cover
    if folder is None:
        raise FileNotFoundError(
            f"Unable to find local folder '{folder}' or environment variable '{envvar}'.")
    candidate = os.path.join(folder, _hash_url(url) + '.bin')
    return candidate if os.path.exists(candidate) else url