Coverage for src/ensae_teaching_cs/faq/faq_web.py: 57%

100 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-04-28 06:23 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief A few functions about scraping 

5 

6""" 

7import sys 

8import os 

9import datetime 

10import warnings 

11from pyquickhelper.loghelper import noLOG 

12from pymyinstall.installcustom import where_in_path, install_chromedriver, install_operadriver 

13 

14 

# Name of the browser driver used when a caller does not pass ``navigator``.
15default_driver = "opera" 

16 

17 

def webshot(img, url, navigator=default_driver, add_date=False,
            size=None, fLOG=noLOG):
    """
    Uses the module :epkg:`selenium`
    to take a picture of a website.
    If *url* and *img* are lists, the function goes
    through all the urls and saves one webshot per url.

    @param      img         image filename or list of image filenames
    @param      url         url or list of urls (as many as image filenames)
    @param      navigator   firefox, chrome, opera, edge, (ie: does not work well)
    @param      add_date    add a date to the image filename
    @param      size        to resize the webshot (if not None),
                            a pair *(width, height)*
    @param      fLOG        logging function
    @return                 list of [ ( url, image name) ]

    The function raises ``RuntimeError`` if *url* and *img*
    do not have the same number of elements. The check happens
    before the browser is started. The browser is always closed,
    even if one of the pages fails to load.

    Check the list of available webdriver at
    `selenium/webdriver <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.

    Chrome requires the `chromedriver <http://chromedriver.storage.googleapis.com/index.html>`_.
    See function `install_chromedriver <http://www.xavierdupre.fr/app/pymyinstall/helpsphinx/pymyinstall/
    installcustom/install_custom_chromedriver.html?highlight=chromedriver
    #pymyinstall.installcustom.install_custom_chromedriver.install_chromedriver>`_.
    """
    if not isinstance(url, list):
        url = [url]
    if not isinstance(img, list):
        img = [img]
    # Validate the arguments before launching a browser so that a bad call
    # does not leave a driver process running.
    if len(url) != len(img):
        raise RuntimeError("different number of urls and images")

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        if size is not None:
            fLOG("set size", size)
            browser.set_window_size(size[0], size[1])

        for u, i in zip(url, img):
            if add_date:
                # Insert a timestamp between the base name and the
                # extension; ':' and '/' are replaced because they are
                # not valid in filenames on every platform.
                dt = datetime.datetime.now()
                a, b = os.path.splitext(i)
                i = f"{a}.{str(dt).replace(':', '-').replace('/', '-')}{b}"
            # Log the current pair, not the whole lists.
            fLOG("url", u, " into ", i)
            browser.get(u)
            browser.get_screenshot_as_file(i)
            res.append((u, i))
    finally:
        # Always release the browser, even if a page fails to load.
        browser.quit()
    return res

67 

68 

def _get_selenium_browser(navigator, fLOG=noLOG):
    """
    Returns the associated driver with some custom settings.

    The function automatically gets chromedriver if not present (:epkg:`Windows` only).
    On :epkg:`Linux`, package *chromium-driver* should be installed:
    ``apt-get install chromium-driver``.

    @param      navigator   one of ``'firefox'``, ``'chrome'``, ``'ie'``,
                            ``'opera'``, ``'edge'``
    @param      fLOG        logging function
    @return                 a started :epkg:`selenium` webdriver

    The function raises ``FileNotFoundError`` on Windows if the chrome or
    opera driver cannot be found nor installed, ``RuntimeError`` if
    *navigator* is not one of the supported values.

    .. faqref::
        :tag: web
        :title: Issue with Selenium and Firefox
        :lid: faq-web-selenium

        Firefox >= v47 does not work on Windows.
        See `Selenium WebDriver and Firefox 47 <http://www.theautomatedtester.co.uk/blog/2016/selenium-webdriver-and-firefox-47.html>`_.

        See `ChromeDriver download <http://chromedriver.storage.googleapis.com/index.html>`_,
        `Error message: 'chromedriver' executable needs to be available in the path
        <http://stackoverflow.com/questions/29858752/error-message-chromedriver-executable-needs-to-be-available-in-the-path>`_.

        See `Selenium - Remote WebDriver example
        <https://sauceclient.readthedocs.io/en/latest/selenium_on_sauce.html#selenium-remote-webdriver-example>`_,
        see also `Running the remote driver with Selenium and python <https://gist.github.com/alfredo/1962031>`_.
    """
    # selenium is imported lazily, ImportWarning raised while importing it
    # are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ImportWarning)
        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    fLOG("[webshot] navigator=", navigator)
    if navigator == "firefox":
        firefox_capabilities = DesiredCapabilities.FIREFOX.copy()
        firefox_capabilities['marionette'] = True
        if sys.platform.startswith("win"):
            # This location only exists on Windows; on other platforms
            # the driver is left to locate Firefox by itself.
            firefox_capabilities[
                'binary'] = r"C:\Program Files (x86)\Mozilla Firefox\firefox.exe"
        browser = webdriver.Firefox(capabilities=firefox_capabilities)
    elif navigator == "chrome":
        if sys.platform.startswith("win"):
            chromed = where_in_path("chromedriver.exe")
            if chromed is None:
                # Not in PATH: try to install it, then look again.
                install_chromedriver(fLOG=fLOG)
                chromed = where_in_path("chromedriver.exe")
                if chromed is None:
                    raise FileNotFoundError(
                        "unable to install 'chromedriver.exe'")
            else:
                fLOG("[_get_selenium_browser] found chromedriver:", chromed)
        else:
            # Expected to be available in PATH.
            chromed = 'chromedriver'

        fLOG("[_get_selenium_browser] start", navigator)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--verbose')
        browser = webdriver.Chrome(executable_path=chromed,
                                   chrome_options=chrome_options)
    elif navigator == "ie":
        browser = webdriver.Ie()
    elif navigator == "opera":
        if sys.platform.startswith("win"):
            operad = where_in_path("operadriver.exe")
            if operad is None:
                # Not in PATH: try to install it, then look again.
                install_operadriver(fLOG=fLOG)
                operad = where_in_path("operadriver.exe")
                if operad is None:
                    raise FileNotFoundError(
                        "unable to install operadriver.exe")
            else:
                fLOG("[_get_selenium_browser] found operadriver:", operad)
        else:
            # Expected to be available in PATH.
            operad = 'operadriver'
        browser = webdriver.Opera(operad)  # pylint: disable=E1101
    elif navigator == "edge":
        browser = webdriver.Edge()
    else:
        raise RuntimeError(
            f"unable to interpret the navigator '{navigator}'")
    fLOG("[_get_selenium_browser] navigator is started")
    return browser

162 

163 

def webhtml(url, navigator=default_driver, fLOG=noLOG):
    """
    Uses the module `selenium <http://selenium-python.readthedocs.io/>`_
    to retrieve the html content of a website.

    @param      url         url or list of urls
    @param      navigator   firefox, chrome, opera, edge, (ie: does not work well)
    @param      fLOG        logging function
    @return                 list of [ ( url, html) ]

    The browser is always closed, even if one of the pages fails to load.

    Check the list of available webdriver at
    `selenium/webdriver
    <https://github.com/SeleniumHQ/selenium/tree/master/py/selenium/webdriver>`_
    and add one to the code if needed.
    """
    if not isinstance(url, list):
        url = [url]

    res = []
    browser = _get_selenium_browser(navigator, fLOG=fLOG)
    try:
        for u in url:
            # Log the url being fetched, not the whole list.
            fLOG(f"[webhtml] get url '{u}'")
            browser.get(u)
            res.append((u, browser.page_source))
    finally:
        # Always release the browser, even if a page fails to load.
        browser.quit()
    return res