Coverage for src/pyrsslocal/xmlhelper/xml_utils.py: 34%

38 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2023-02-02 02:59 +0100

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief parsing XML 

5""" 

6 

7import re 

8from xml.sax.saxutils import escape as sax_escape 

9from html.entities import name2codepoint 

10 

11 

def escape(s):
    """
    Escapes a string, or every element of a list, with
    ``xml.sax.saxutils.escape``.

    @param      s       string to escape, or a list of such values
                        (lists are processed recursively)
    @return             escaped string, or a list of escaped values
                        when *s* is a list
    """
    if not isinstance(s, list):
        escaped = sax_escape(s)
        # NOTE(review): as written, this call replaces "&" with "&" and
        # therefore leaves the string unchanged. It is kept verbatim;
        # confirm the intended arguments against the original module.
        return escaped.replace("&", "&")
    return [escape(item) for item in s]

23 

24 

def html_unescape(text):
    """
    Replaces :epkg:`HTML` or :epkg:`XML` character references
    and named entities in a text string by the characters they denote.
    The named entities ``amp``, ``gt`` and ``lt`` are not decoded,
    they are rewritten to fixed replacement strings instead.
    Adapted from `Fredrik Lundh
    <http://effbot.org/zone/re-sub.htm#unescape-html>`_.

    @param      text    string which may contain numeric references
                        (decimal or ``&#x`` hexadecimal) or named entities
    @return             string with every recognised reference replaced,
                        anything which cannot be decoded is left as is
    """
    # Named entities which are rewritten instead of being decoded.
    kept = {
        "amp": "&amp;amp;",
        "gt": "&amp;gt;",
        "lt": "&amp;lt;",
    }

    def fixup(m):
        ref = m.group(0)
        if ref.startswith("&#"):
            # numeric character reference
            try:
                if ref.startswith("&#x"):
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number or outside the Unicode range,
                # OverflowError: number too large for chr(),
                # in both cases the reference is left as is
                return ref
        # named entity
        name = ref[1:-1]
        if name in kept:
            return kept[name]
        try:
            return chr(name2codepoint[name])
        except KeyError:
            return ref  # unknown entity, leave as is

    return re.sub(r"&#?\w+;", fixup, text)

58 

59 

# Maps a character to the HTML entity used by html_escape.
character_to_escape = {
    "é": "&eacute;",
    # NOTE(review): this key is reproduced as it appears in the listing;
    # confirm whether a non-breaking space (U+00A0) was intended.
    # html_escape does not match this key.
    " ": "&nbsp;",
    "è": "&egrave;",
    "à": "&agrave;",
    "â": "&acirc;",
    "ê": "&ecirc;",
    "ë": "&euml;",
    "î": "&icirc;",
    "ù": "&ugrave;",
    "ü": "&uuml;",
    "ô": "&ocirc;",
    "œ": "&oelig;",
}


def html_escape(text):
    """
    Escapes French characters with an accent, and the ligature ``œ``,
    by replacing them with the entity found in ``character_to_escape``.
    Characters matched by the pattern but missing from
    ``character_to_escape`` (``ä``, ``ï``, ``ö``) are returned unchanged.

    @param      text    string to escape
    @return             string where every matched character which has
                        an entry in ``character_to_escape`` is replaced
                        by its entity
    """
    def fixup(m):
        char = m.group(0)
        return character_to_escape.get(char, char)

    # The pattern is written out explicitly instead of being built from
    # the dictionary keys: the dictionary also holds a space-like key which
    # was never matched, matching it would rewrite the spaces of the text.
    return re.sub("[àâäéèêëîïôöùüœ]", fixup, text)