Coverage for src/pyrsslocal/xmlhelper/xml_utils.py: 34%
38 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-09-26 08:41 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-09-26 08:41 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief parsing XML
5"""
7import re
8from xml.sax.saxutils import escape as sax_escape
9from html.entities import name2codepoint
def escape(s):
    """
    Escapes the characters ``&``, ``<`` and ``>`` of a string
    with ``xml.sax.saxutils.escape``.

    @param      s       string to escape, or a list of such values
                        (every element is escaped recursively)
    @return             escaped string, or a list of escaped values
    """
    if isinstance(s, list):
        return [escape(item) for item in s]
    # NOTE(review): the previous code followed this call with
    # ``s.replace("&", "&")``, which replaces a string by itself and
    # therefore did nothing. It looks like residue of a replacement whose
    # arguments lost their entity encoding -- confirm the intended
    # behaviour for ampersands before relying on it.
    return sax_escape(s)
def html_unescape(text):
    """
    Removes :epkg:`HTML` or :epkg:`XML` character references
    and entities from a text string.
    The entities ``&amp;``, ``&gt;`` and ``&lt;`` are kept as they are,
    and any reference which cannot be decoded is left untouched.
    Adapted from `Fredrik Lundh
    <http://effbot.org/zone/re-sub.htm#unescape-html>`_.

    @param      text    string to process
    @return             string where numeric references and named
                        entities are replaced by their character
    """
    # These names exist in name2codepoint but must survive the conversion.
    kept_entities = ("amp", "gt", "lt")

    def fixup(m):
        ref = m.group(0)
        if ref.startswith("&#"):
            # numeric character reference, hexadecimal (&#x..;) or decimal
            try:
                if ref.startswith("&#x"):
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: digits cannot be parsed or the code point is
                # outside the Unicode range; OverflowError: the number is
                # too large for chr() to handle at all.
                return ref  # leave as is

        # named entity
        name = ref[1:-1]
        if name in kept_entities:
            return ref
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            return ref  # unknown entity, leave as is
        return chr(codepoint)

    return re.sub(r"&#?\w+;", fixup, text)
# Maps a character to the HTML entity which replaces it.
character_to_escape = {
    "é": "&eacute;",
    # NOTE(review): this entry is never matched by html_escape;
    # presumably it stands for the non-breaking space -- confirm.
    " ": "&nbsp;",
    "è": "&egrave;",
    "à": "&agrave;",
    "â": "&acirc;",
    "ä": "&auml;",
    "ê": "&ecirc;",
    "ë": "&euml;",
    "î": "&icirc;",
    "ï": "&iuml;",
    "ù": "&ugrave;",
    "ü": "&uuml;",
    "ô": "&ocirc;",
    "ö": "&ouml;",
    "œ": "&oelig;",
}


def html_escape(text):
    """
    Escapes French accented characters: every character matched below
    is replaced by its HTML entity taken from ``character_to_escape``.

    @param      text    string to escape
    @return             string with the accented characters replaced,
                        any other character is left unchanged
    """
    def fixup(m):
        char = m.group(0)
        # a matched character without any entry in the table stays as it is
        return character_to_escape.get(char, char)

    # The character class lists the accented letters of the table only,
    # the space entry is deliberately left out.
    return re.sub("[àâäéèêëîïôöùüœ]", fixup, text)