Coverage for src/actuariat_python/data/wolf.py: 93%

41 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-02 07:38 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Various functions to download data about population 

5""" 

6import os 

7import re 

8from pyquickhelper.loghelper import noLOG 

9from pymyinstall.installcustom import download_page 

10from pyensae.datasource import download_data 

11from pyensae.datasource.http_retrieve import DownloadDataException 

12from pyrsslocal.xmlhelper import xml_filter_iterator 

13from .data_exceptions import LinkNotFoundError 

14 

15 

def wolf_xml(url="http://pauillac.inria.fr/~sagot/index.html", temp_folder=".", fLOG=noLOG):
    """
    The `WOLF <http://alpage.inria.fr/~sagot/wolf-en.html>`_
    (Wordnet Libre du Français, Free French Wordnet) is a free semantic
    lexical resource (wordnet) for French.

    This data is licensed under `Cecill-C license
    <http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html>`_.
    Language is French.

    The function downloads the page *url*, takes the first ``href`` which
    starts with ``https``, contains ``wolf`` and ends with ``.bz2``, then
    tries to download that archive as well as ``debvisdic-strict.dtd``.
    If the archive cannot be downloaded or ``wolf-1.0b4.xml`` is not found
    in *temp_folder* afterwards, it falls back on ``wolf-1.0b4.xml.zip``.

    @param      url             url of the page which contains the link to the archive
    @param      fLOG            logging function
    @param      temp_folder     where to download
    @return                     list of files (the DTD is only included when
                                it was downloaded); in the fallback case, the
                                value returned by ``download_data`` is returned
                                unchanged
    @raise      LinkNotFoundError   if no matching link is found on the page
    @raise      FileNotFoundError   if ``wolf-1.0b4.xml`` is still missing
                                    after the fallback download
    """
    page = download_page(url)
    reg = re.compile("href=\\\"(https.*?wolf.*?[.]bz2)\\\"")
    alls = reg.findall(page)
    if not alls:
        raise LinkNotFoundError(  # pragma: no cover
            "unable to find a link on a .bz2 file on page\n{}".format(page))

    # Only the first matching link is used.
    spl = alls[0].split("/")
    # Folder which holds the archive.
    url = "/".join(spl[:-1]) + "/"
    # The DTD is looked for in the sibling folder "31718".
    url2 = "/".join(spl[:-2]) + "/31718/"
    try:
        dtd = download_data("debvisdic-strict.dtd", url=[url2, "xd"],
                            fLOG=fLOG, whereTo=temp_folder)
    except DownloadDataException:
        # The DTD is optional, the function continues without it.
        dtd = None
    name = spl[-1].strip('.')

    try:
        local = download_data(
            name, url=[url, "xd"], fLOG=fLOG, whereTo=temp_folder)
    except DownloadDataException:
        local = None
    if isinstance(local, str):
        local = [local]

    # We check the file was downloaded.
    expected = os.path.join(temp_folder, "wolf-1.0b4.xml")
    if local is None or not os.path.exists(expected):  # pragma: no cover
        res = download_data("wolf-1.0b4.xml.zip",
                            whereTo=temp_folder, fLOG=fLOG)
        if not os.path.exists(expected):
            raise FileNotFoundError(expected)
        return res

    if dtd is None:
        # The DTD could not be retrieved: do not insert None in a list
        # documented as a list of files.
        return list(local)
    if isinstance(dtd, list):
        return local + dtd
    return local + [dtd]  # pragma: no cover

68 

69 

def enumerate_wolf_xml_row(filename, fLOG=noLOG, xmlformat=False, encoding="utf-8", errors=None):
    """
    Iterates over the elements of an XML file returned by function
    @see fn wolf_xml. Every argument is forwarded to
    ``xml_filter_iterator`` and its items are yielded unchanged.

    @param      filename    filename
    @param      fLOG        logging function
    @param      xmlformat   if True, return the xml, otherwise return the node,
                            see `XMLHandlerDictNode <http://www.xavierdupre.fr/app/pyrsslocal/
                            helpsphinx/pyrsslocal/xmlhelper/xml_tree_node.html#
                            module-pyrsslocal.xmlhelper.xml_tree_node>`_
    @param      encoding    encoding
    @param      errors      what to do with errors
    @return                 elements
    """
    options = dict(xmlformat=xmlformat, fLOG=fLOG,
                   encoding=encoding, errors=errors)
    yield from xml_filter_iterator(filename, **options)

87 

88 

def enumerate_wolf_synonyms(filename, fLOG=noLOG, encoding="utf-8", errors=None):
    """
    Enumerates lists of synonyms.
    Language is French.

    For every row produced by @see fn enumerate_wolf_xml_row, the function
    collects the values of the fields whose key is
    ``SYNSET/SYNONYM/LITERAL/_`` and yields them only if there are at
    least two of them.

    @param      filename    xml file
    @param      fLOG        logging function
    @param      encoding    encoding
    @param      errors      what to do with errors
    @return                 iterator on list of words
    """
    literal_key = "SYNSET/SYNONYM/LITERAL/_"
    nodes = enumerate_wolf_xml_row(
        filename, fLOG=fLOG, encoding=encoding, errors=errors)
    for node in nodes:
        words = []
        for field, value in node.iterfields():
            if field == literal_key:
                words.append(value)
        # A single word has no synonym, the row is skipped.
        if len(words) >= 2:
            yield words