Coverage for src/pyrsslocal/xmlhelper/html_parser_json.py: 94%

77 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2024-04-23 08:45 +0200

1""" 

2@file 

3@brief parsing HTML to convert it into JSON 

4""" 

5import html.parser 

6 

7 

def iterate_on_json(json_structure, prefix="", keep_dictionaries=False,
                    skip=("__parent__",)):
    """
    Iterates on every field contained in the :epkg:`JSON` structure.

    Keys are visited in sorted order. The path of a field is built as
    ``prefix + "/" + key``; every element of a list shares the path of
    the key holding the list.

    @param      json_structure      json structure (a dictionary)
    @param      prefix              prefix to add
    @param      keep_dictionaries   if True, also yields ``(path, v)`` where
                                    *v* is a JSON dictionary, before
                                    entering it
    @param      skip                keys which are ignored (neither yielded
                                    nor entered)
    @return                         iterator of ``(path, value)``
    @raise      TypeError           if a value is not a string, ``None``,
                                    a dictionary or a list of those
    """
    for k, v in sorted(json_structure.items()):
        if k in skip:
            continue
        p = prefix + "/" + k
        # None is accepted as a leaf: html.parser reports an attribute
        # without a value (<input disabled>) as None.
        if v is None or isinstance(v, str):
            yield (p, v)
        elif isinstance(v, dict):
            if keep_dictionaries:
                yield (p, v)
            yield from iterate_on_json(v, p, keep_dictionaries, skip)
        elif isinstance(v, list):
            for el in v:
                if isinstance(el, dict):
                    if keep_dictionaries:
                        yield (p, el)
                    yield from iterate_on_json(
                        el, p, keep_dictionaries, skip)
                elif el is None or isinstance(el, str):
                    # A leaf stored directly in a list has no items()
                    # to recurse into, it is yielded as is.
                    yield (p, el)
                else:
                    raise TypeError(  # pragma: no cover
                        "Unexpected type, the json was altered at path '{0}'".format(
                            p))
        else:
            raise TypeError(  # pragma: no cover
                "Unexpected type, the json was altered at path '{0}'".format(
                    p))

40 

41 

class HTMLtoJSONParser(html.parser.HTMLParser):
    """
    Parses :epkg:`HTML` and outputs a :epkg:`JSON` structure.

    Every tag becomes a dictionary stored in its parent under the tag
    name; a tag repeated under the same parent becomes a list of
    dictionaries. Inside a tag dictionary, an attribute ``a`` is stored
    under key ``"#a"`` and the text of the tag under key ``""``.

    Example:

    ::

        file = ...
        with open(file, "r", encoding="utf8") as f:
            content = f.read()
        parser = HTMLtoJSONParser()
        parser.feed(content)
        js = parser.json

    Or:

    ::

        js = HTMLtoJSONParser.to_json(content)

    To iterate on paths:

    ::

        all = [(k, v) for k, v in HTMLtoJSONParser.iterate(js)]
    """

    def __init__(self, raise_exception=True):
        """
        @param      raise_exception     if True, raises an exception if the
                                        HTML is malformed, otherwise does
                                        what it can
        """
        html.parser.HTMLParser.__init__(self, convert_charrefs=True)
        self.doc = {}          # root of the JSON structure
        self.path = []         # stack of the tags currently opened
        self.cur = self.doc    # dictionary of the tag being filled
        self.line = 0          # number of line breaks seen in text data
        self.raise_exception = raise_exception

    @property
    def json(self):
        """
        Returns the :epkg:`JSON` structure.

        @return         json
        """
        return self.doc

    @staticmethod
    def to_json(content, raise_exception=True):
        """
        Converts :epkg:`HTML` into :epkg:`JSON`.

        @param      content             :epkg:`HTML` content to parse
        @param      raise_exception     if True, raises an exception if the
                                        HTML is malformed, otherwise does
                                        what it can
        @return                         json
        """
        parser = HTMLtoJSONParser(raise_exception=raise_exception)
        parser.feed(content)
        return parser.json

    @staticmethod
    def iterate(json_structure, prefix="", keep_dictionaries=False,
                skip=("__parent__",)):
        """
        Iterates on every field contained in the :epkg:`JSON` structure,
        it delegates to function *iterate_on_json*.

        @param      json_structure      json structure
        @param      prefix              prefix to add
        @param      keep_dictionaries   if True, also yields ``(k, v)``
                                        where *v* is a JSON dictionary
        @param      skip                do not enter the following tags
        @return                         iterator of ``(path, value)``
        """
        yield from iterate_on_json(
            json_structure, prefix, keep_dictionaries, skip)

    def handle_starttag(self, tag, attrs):
        """
        What to do for a new tag: creates a dictionary for it, attaches it
        to the current one and makes it the current one.

        @param      tag         tag name
        @param      attrs       list of ``(name, value)`` given by the parser
        """
        self.path.append(tag)
        # The link to the parent is temporary, it lets handle_endtag
        # go back up; method clean removes it once the tag is closed.
        node = {"__parent__": self.cur}
        if tag in self.cur:
            siblings = self.cur[tag]
            if not isinstance(siblings, list):
                # Second occurrence of the tag: switch to a list.
                siblings = [siblings]
                self.cur[tag] = siblings
            siblings.append(node)
        else:
            self.cur[tag] = node
        self.cur = node

        for a, v in dict(attrs).items():
            self.cur["#" + a] = v
        # Key "" receives the text of the tag (see handle_data).
        self.cur[""] = ""

    def handle_endtag(self, tag):
        """
        What to do for the end of a tag: goes back to the parent and
        cleans the dictionary of the closed tag.

        @param      tag         tag name
        @raise      ValueError  if *raise_exception* is True and the closing
                                tag does not match the last opened tag
        """
        if not self.path:
            # Closing tag without any opened tag: there is nothing to
            # close and the root has no parent to go back to.
            if self.raise_exception:
                raise ValueError(
                    "html is malformed around line: {0} (closing tag "
                    "</{1}> has no matching opening tag)".format(
                        self.line, tag))
            return
        if tag != self.path[-1] and self.raise_exception:
            raise ValueError(  # pragma: no cover
                "html is malformed around line: {0} (it might be because "
                "of a tag <br>, <hr>, <img .. > not closed)".format(
                    self.line))
        del self.path[-1]
        memo = self.cur
        self.cur = self.cur["__parent__"]
        self.clean(memo)

    def handle_data(self, data):
        """
        What to do with data: appends it to the text of the current tag.
        Data met outside of any tag is ignored.

        @param      data        text
        """
        self.line += data.count("\n")
        if "" in self.cur:
            self.cur[""] += data

    def clean(self, values):
        """
        Cleans a dictionary of values in place: strips spaces around
        strings, removes strings only made of spaces, removes empty
        containers and removes key ``__parent__``.

        @param      values      dictionary to clean
        """
        for k in list(values):
            v = values[k]
            if isinstance(v, str):
                c = v.strip(" \n\r\t")
                if c != v:
                    if c:
                        values[k] = c
                    else:
                        del values[k]
            elif v is not None and len(v) == 0:
                # None is the value of an attribute without value
                # (<input disabled>), it has no length and is kept.
                del values[k]
        values.pop("__parent__", None)