Coverage for src/actuariat_python/data/wolf.py : 97%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Various functions to download data about population
5"""
6import os
7import re
8from pyquickhelper.loghelper import noLOG
9from pymyinstall.installcustom import download_page
10from pyensae.datasource import download_data
11from pyrsslocal.xmlhelper import xml_filter_iterator
12from .data_exceptions import LinkNotFoundError
def wolf_xml(url="http://pauillac.inria.fr/~sagot/index.html", temp_folder=".", fLOG=noLOG):
    """
    Downloads the `WOLF <http://alpage.inria.fr/~sagot/wolf-en.html>`_
    (Wordnet Libre du Français, Free French Wordnet), a free semantic
    lexical resource (wordnet) for French.

    The data is licensed under `Cecill-C license
    <http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html>`_.
    Language is French.

    The function scans the page at *url* for the first ``https`` link
    whose target contains ``wolf`` and ends with ``.bz2``. It then
    downloads ``debvisdic-strict.dtd`` and that archive into
    *temp_folder*. If ``wolf-1.0b4.xml`` is not found in *temp_folder*
    afterwards, ``wolf-1.0b4.xml.zip`` is requested from
    ``download_data`` without an explicit url and the result of that
    call is returned as is.

    @param      url             page which contains the link to the archive
    @param      temp_folder     where to download
    @param      fLOG            logging function
    @return                     list of files

    Raises ``LinkNotFoundError`` if no matching link is found on the page,
    ``FileNotFoundError`` if ``wolf-1.0b4.xml`` is still missing after
    the fallback download.
    """
    html = download_page(url)
    pattern = re.compile("href=\\\"(https.*?wolf.*?[.]bz2)\\\"")
    links = pattern.findall(html)
    if not links:
        raise LinkNotFoundError(  # pragma: no cover
            "unable to find a link on a .bz2 file on page\n{}".format(html))

    # Only the first link is used; the archive folder and the DTD folder
    # are both derived from it.
    parts = links[0].split("/")
    archive_folder = "/".join(parts[:-1]) + "/"
    # The DTD is fetched from a sibling folder named 31718.
    dtd_folder = "/".join(parts[:-2]) + "/31718/"
    dtd = download_data("debvisdic-strict.dtd", url=[dtd_folder, "xd"],
                        fLOG=fLOG, whereTo=temp_folder)

    archive_name = parts[-1].strip('.')
    local = download_data(
        archive_name, url=[archive_folder, "xd"], fLOG=fLOG, whereTo=temp_folder)
    if isinstance(local, str):
        local = [local]

    # We check the file was downloaded
    # (presumably download_data uncompresses the archive, to be confirmed).
    expected = os.path.join(temp_folder, "wolf-1.0b4.xml")
    if not os.path.exists(expected):  # pragma: no cover
        fallback = download_data("wolf-1.0b4.xml.zip",
                                 whereTo=temp_folder, fLOG=fLOG)
        if not os.path.exists(expected):
            raise FileNotFoundError(expected)
        return fallback

    if isinstance(dtd, list):
        return local + dtd
    return local + [dtd]  # pragma: no cover
def enumerate_wolf_xml_row(filename, fLOG=noLOG, xmlformat=False, encoding="utf-8", errors=None):
    """
    Iterates over the elements of an XML file returned by function
    @see fn wolf_xml. Every argument is forwarded unchanged to
    ``xml_filter_iterator``.

    @param      filename    filename
    @param      fLOG        logging function
    @param      xmlformat   if True, return the xml, otherwise return the node,
                            see `XMLHandlerDictNode <http://www.xavierdupre.fr/app/pyrsslocal/
                            helpsphinx/pyrsslocal/xmlhelper/xml_tree_node.html#
                            module-pyrsslocal.xmlhelper.xml_tree_node>`_
    @param      encoding    encoding
    @param      errors      what to do with errors
    @return                 elements
    """
    yield from xml_filter_iterator(
        filename, xmlformat=xmlformat, fLOG=fLOG,
        encoding=encoding, errors=errors)
def enumerate_wolf_synonyms(filename, fLOG=noLOG, encoding="utf-8", errors=None):
    """
    Enumerates lists of synonyms found in the XML file.
    Language is French.

    For every row produced by @see fn enumerate_wolf_xml_row, the values
    of the fields named ``SYNSET/SYNONYM/LITERAL/_`` are collected.
    Rows with fewer than two such values are skipped.

    @param      filename    xml file
    @param      fLOG        logging function
    @param      encoding    encoding
    @param      errors      what to do with errors
    @return                 iterator on list of words
    """
    literal_key = "SYNSET/SYNONYM/LITERAL/_"
    rows = enumerate_wolf_xml_row(
        filename, fLOG=fLOG, encoding=encoding, errors=errors)
    for node in rows:
        words = [value for key, value in node.iterfields()
                 if key == literal_key]
        if len(words) <= 1:
            # A single word (or none) has no synonym to report.
            continue
        yield words