Coverage for src/pyrsslocal/helper/search_engine.py: 90%

39 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2023-02-02 02:59 +0100

1""" 

2@file 

3 

4@brief various functions to get the content of a page, of a search page... 

5""" 

6 

7import urllib 

8import urllib.request 

9import time 

10import random 

11import re 

12import os 

13 

14from pyquickhelper.loghelper import noLOG 

15 

16 

def extract_bing_result(searchPage, filter_=lambda u: True):
    """
    Extracts the first result urls from the content of a
    `Bing <http://www.bing.com>`_ search page.

    Only the first ten links found in the page are considered. They are
    ordered by increasing length (ties broken alphabetically) before
    the filter is applied.

    @param      searchPage  content of a `Bing <http://www.bing.com>`_ search page
    @param      filter_     predicate called as ``filter_(u)``,
                            an url is kept only when it returns True
    @return                 list of the kept urls, or None if the page
                            contains no result link

    Raises ValueError if the filter rejects every candidate or
    if the first kept url is a known bad result.
    """
    pattern = re.compile("""<h2><a href="(.*?)" h="ID=SERP,""")
    found = pattern.findall(searchPage)
    if not found:
        return None

    # pairs (length, url): shortest urls come first, maybe not the best idea
    ranked = [(len(link), link) for link in found[:10]]
    ranked.sort()

    kept = []
    for pair in ranked:
        if filter_(pair[1]):
            kept.append(pair)

    if not kept:
        mes = "\n".join(map(str, ranked))  # pragma: no cover
        raise ValueError(  # pragma: no cover
            "unable to find a proper url\n" + mes)

    best = kept[0][1]
    if best in ["http://chrome.angrybirds.com/"]:
        join = "\n".join(map(str, kept))  # pragma: no cover
        raise ValueError(  # pragma: no cover
            "bad result\n{0}".format(join))

    return [link for _, link in kept]

44 

45 

def query_bing(query, folderCache="cacheSearchPage",
               filter_=lambda u: True, fLOG=noLOG):
    """
    Returns the urls found on the search page from
    `Bing <http://www.bing.com>`_ for a specific query.

    The page is downloaded once and stored in *folderCache*;
    later calls with the same query read the cached page instead
    of hitting the network.

    @param      query           search query
    @param      folderCache     folder used to store the result page or
                                to retrieve a page if the query was already
                                searched for (created if missing)
    @param      filter_         remove some urls if this function is False
                                ``filter_(u)`` --> True or False
    @param      fLOG            logging function
    @return                     what @see fn extract_bing_result returns
                                for the page
    """
    # local import: the module only imports urllib and urllib.request
    from urllib.parse import quote

    # makedirs also creates intermediate folders and does not fail
    # if the folder appears between the check and the creation
    os.makedirs(folderCache, exist_ok=True)

    # a path separator in the query would point to a missing sub-folder
    # (or outside the cache folder), it is replaced in the file name only
    cache_name = query
    for sep in (os.sep, os.altsep):
        if sep:
            cache_name = cache_name.replace(sep, "_")
    cache = os.path.join(folderCache, "%s.bing.html" % cache_name)

    if os.path.exists(cache):
        with open(cache, "r", encoding="utf8") as f:
            text = f.read()
    else:
        fLOG("    downloading results for ", query)
        # random pause between 1 and 2 seconds before every download
        time.sleep(1. + random.random())
        # percent-encode the whole query: spaces still become %20 and
        # characters such as '&', '#' or accents no longer break the url
        url = "http://www.bing.com/search?q=" + quote(query, safe="")
        with urllib.request.urlopen(url) as uur:
            text = uur.read().decode("utf8")

        fLOG("    caching results for ", query, " in ", cache)
        with open(cache, "w", encoding="utf8") as f:
            f.write(text)

    return extract_bing_result(text, filter_)