Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Series of functions related to folder, explore, synchronize, remove (recursively). 

5""" 

6import re 

7from .synchelper import explore_folder_iterfile 

8from .download_helper import get_urls_content_timeout 

9 

10 

def download_urls_in_folder_content(folder, pattern=".+[.]((py)|(ipynb))", neg_pattern=None,
                                    recursive=True, timeout=10, folder_dest=None,
                                    encoding='utf-8', raise_exception=False, chunk=None,
                                    fLOG=None):
    """
    Iterates on the files of a folder, reads each of them, extracts every
    url found in its content and downloads these urls with
    ``get_urls_content_timeout``.

    :param folder: folder to explore
    :param pattern: if None, get all files, otherwise, it is a regular expression
        the filename must verify (the check is done on the full name,
        the files are enumerated with ``fullname=True``)
    :param neg_pattern: negative pattern to exclude files,
        an empty string is interpreted as None
    :param recursive: look into subfolders
    :param timeout: in seconds, forwarded to ``get_urls_content_timeout``
    :param folder_dest: forwarded to ``get_urls_content_timeout`` as its
        ``folder`` argument (presumably where the downloaded content is
        stored, to be confirmed against that function)
    :param encoding: encoding used to read the explored files,
        characters which cannot be decoded are ignored
    :param raise_exception: forwarded to ``get_urls_content_timeout``
    :param chunk: forwarded to ``get_urls_content_timeout``, an empty string is
        interpreted as None, any other string is converted into an integer
    :param fLOG: logging function, called once for every file which contains
        at least one url and also forwarded to ``get_urls_content_timeout``
    :return: list which concatenates what ``get_urls_content_timeout``
        returns for every file containing at least one url
    """
    # Empty strings and string integers are normalized; this looks intended
    # for callers which only provide strings (e.g. a command line) -- assumption.
    if neg_pattern == '':
        neg_pattern = None  # pragma: no cover
    if chunk == '':
        chunk = None  # pragma: no cover
    if isinstance(chunk, str):
        chunk = int(chunk)  # pragma: no cover

    url_pattern = ("(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+"
                   "[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+"
                   "(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:"
                   "'\\\".,<>?]))")
    reg = re.compile(url_pattern)

    res = []
    for name in explore_folder_iterfile(folder, pattern=pattern, neg_pattern=neg_pattern,
                                        fullname=True, recursive=recursive):
        # The file is closed as soon as it is read so that no handle
        # remains open while the urls are being downloaded.
        with open(name, "r", encoding=encoding, errors='ignore') as handle:
            content = handle.read()

        matches = reg.findall(content)
        if not matches:
            continue
        if fLOG is not None:
            fLOG(  # pragma: no cover
                "[download_urls_in_folder_content] explore '{}'".format(name))

        # The pattern has several groups, findall returns tuples
        # and the first group holds the whole url.
        urls = [match[0] for match in matches]
        res.extend(get_urls_content_timeout(urls, folder=folder_dest, timeout=timeout,
                                            raise_exception=raise_exception, chunk=chunk,
                                            fLOG=fLOG))
    return res