Coverage for src/actuariat_python/data/wolf.py: 93%
41 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-02 07:38 +0200
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-02 07:38 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Various functions to download the WOLF data (free French wordnet)
5"""
6import os
7import re
8from pyquickhelper.loghelper import noLOG
9from pymyinstall.installcustom import download_page
10from pyensae.datasource import download_data
11from pyensae.datasource.http_retrieve import DownloadDataException
12from pyrsslocal.xmlhelper import xml_filter_iterator
13from .data_exceptions import LinkNotFoundError
def wolf_xml(url="http://pauillac.inria.fr/~sagot/index.html", temp_folder=".", fLOG=noLOG):
    """
    The `WOLF <http://alpage.inria.fr/~sagot/wolf-en.html>`_
    (Wordnet Libre du Français, Free French Wordnet) is a free semantic
    lexical resource (wordnet) for French.

    This data is licensed under `Cecill-C license
    <http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html>`_.
    Language is French.

    The function reads the page *url*, keeps the first link matching
    ``https...wolf...bz2`` and downloads that archive together with
    ``debvisdic-strict.dtd``. If the archive cannot be downloaded or if
    ``wolf-1.0b4.xml`` is not in *temp_folder* afterwards, it falls back on
    downloading ``wolf-1.0b4.xml.zip``.

    @param      url             url of the page which contains the link to the archive
    @param      temp_folder     where to download
    @param      fLOG            logging function
    @return                     list of files, the DTD is only part of it
                                if it could be downloaded; in the fallback case,
                                the function returns what *download_data* returns
    @raise      LinkNotFoundError   no link to a ``.bz2`` file was found on the page
    @raise      FileNotFoundError   ``wolf-1.0b4.xml`` is still missing after the fallback
    """
    page = download_page(url)
    reg = re.compile("href=\\\"(https.*?wolf.*?[.]bz2)\\\"")
    alls = reg.findall(page)
    if not alls:
        raise LinkNotFoundError(  # pragma: no cover
            "unable to find a link on a .bz2 file on page\n{}".format(page))

    # Only the first matching link is used. The archive is fetched from its
    # own folder, the DTD from a sibling folder named 31718.
    spl = alls[0].split("/")
    archive_url = "/".join(spl[:-1]) + "/"
    dtd_url = "/".join(spl[:-2]) + "/31718/"

    try:
        # NOTE(review): "xd" is presumably an alias for a mirror known to
        # download_data, to be confirmed against pyensae.
        dtd = download_data("debvisdic-strict.dtd", url=[dtd_url, "xd"],
                            fLOG=fLOG, whereTo=temp_folder)
    except DownloadDataException:
        # The DTD is optional, the data is still returned without it.
        dtd = None

    name = spl[-1].strip('.')
    try:
        local = download_data(
            name, url=[archive_url, "xd"], fLOG=fLOG, whereTo=temp_folder)
    except DownloadDataException:
        local = None
    if isinstance(local, str):
        local = [local]

    # We check the file was downloaded.
    expected = os.path.join(temp_folder, "wolf-1.0b4.xml")
    if local is None or not os.path.exists(expected):  # pragma: no cover
        res = download_data("wolf-1.0b4.xml.zip",
                            whereTo=temp_folder, fLOG=fLOG)
        if not os.path.exists(expected):
            raise FileNotFoundError(expected)
        return res

    if dtd is None:  # pragma: no cover
        # The DTD download failed: do not insert None in a list of filenames.
        return local
    if isinstance(dtd, list):
        return local + dtd
    return local + [dtd]  # pragma: no cover
def enumerate_wolf_xml_row(filename, fLOG=noLOG, xmlformat=False, encoding="utf-8", errors=None):
    """
    Iterates over the elements of an XML file such as the one
    returned by function @see fn wolf_xml.

    @param      filename    filename
    @param      fLOG        logging function
    @param      xmlformat   if True, return the xml, otherwise return the node,
                            see `XMLHandlerDictNode <http://www.xavierdupre.fr/app/pyrsslocal/
                            helpsphinx/pyrsslocal/xmlhelper/xml_tree_node.html#
                            module-pyrsslocal.xmlhelper.xml_tree_node>`_
    @param      encoding    encoding
    @param      errors      what to do with errors
    @return                 elements
    """
    # Every argument is forwarded unchanged to the XML iterator.
    yield from xml_filter_iterator(
        filename, xmlformat=xmlformat, fLOG=fLOG,
        encoding=encoding, errors=errors)
def enumerate_wolf_synonyms(filename, fLOG=noLOG, encoding="utf-8", errors=None):
    """
    Enumerates lists of synonyms.
    Language is French.

    @param      filename    xml file
    @param      fLOG        logging function
    @param      encoding    encoding
    @param      errors      what to do with errors
    @return                 iterator on list of words
    """
    literal_key = "SYNSET/SYNONYM/LITERAL/_"
    nodes = enumerate_wolf_xml_row(
        filename, fLOG=fLOG, encoding=encoding, errors=errors)
    for node in nodes:
        words = []
        for field, value in node.iterfields():
            if field == literal_key:
                words.append(value)
        # A row with fewer than two literals is skipped.
        if len(words) <= 1:
            continue
        yield words