Coverage for src/pyrsslocal/xmlhelper/xmlfilewalk.py: 26%

96 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2023-02-02 02:59 +0100

1""" 

2@file 

3 

4@brief functions related to XML files representing objects 

5""" 

6 

7from pyquickhelper.loghelper.flog import GetSepLine 

8from .xml_tree import XMLHandlerDict, XMLIterParser 

9 

10 

11def _iteration_values(values): 

12 """ 

13 Iterators on all possible tuple of values taken into a list. 

14 Let's assume you have two rows: 

15 

16 :: 

17 

18 a1 a2 a3 

19 b1 b2 

20 

21 The function will produce: 

22 

23 :: 

24 

25 a1 b1 

26 a1 b2 

27 a2 b1 

28 a2 b2 

29 a3 b1 

30 a3 b2 

31 

32 

33 The function is used by @see fn table_extraction_from_xml_files_iterator. 

34 

35 @param values list of rows 

36 @return iterator on rows 

37 """ 

38 co = [] 

39 for v in values: 

40 if isinstance(v, list): 

41 co.append(v) 

42 else: 

43 co.append([v]) 

44 

45 ind = [0 for _ in co] 

46 while ind[0] < len(co[0]): 

47 line = [c[i] for c, i in zip(co, ind)] 

48 yield line 

49 

50 ind[-1] += 1 

51 i = len(ind) - 1 

52 while i > 0: 

53 if ind[i] >= len(co[i]): 

54 ind[i] = 0 

55 ind[i - 1] += 1 

56 i -= 1 

57 

58 

def table_extraction_from_xml_files_iterator(file, fields, log=False, fLOG=None, encoding="utf-8", errors=None):
    """
    Goes through a XML file, extracts values and put
    them into an iterator.

    @param      file        a file name (string) or an already opened file object
    @param      fields      list of fields to get from the XML files (see below)
    @param      log         do logs if True (and if *fLOG* is not None)
    @param      fLOG        logging function
    @param      encoding    encoding, used only when *file* is a string
    @param      errors      sent to function ``open``, used only when *file* is a string
    @return                 iterator on lines, every line is a string
                            whose values are separated by tabulations

    One example for fields:

    ::

        [ ("tag1/tag2", "all"),
          ("tag1/tag2/tag3/_", "one"),
          ...
        ]

    With ``"one"``, the first value found for the path is kept
    (``None`` becomes an empty string) and an exception is raised
    if the path has no value. With ``"all"``, every value found is kept
    and one line is produced for every combination of values.

    The file is closed by this function only if it was opened by it,
    even if an exception is raised or the iteration is interrupted.
    """
    # fLOG defaults to None: only log when a logging function was given.
    do_log = log and fLOG is not None
    own_file = isinstance(file, str)
    fileh = open(file, "r", encoding=encoding, errors=errors) if own_file else file

    try:
        parser = XMLIterParser()
        handler = XMLHandlerDict(no_content=True)
        parser.setContentHandler(handler)

        fields = [(a.split("/"), b) for a, b in fields]
        if do_log:
            fLOG("table_extraction_from_xml_files: begin")

        for i_, o in enumerate(parser.parse(fileh)):

            values = []
            nb = 0  # number of fields holding more than one value
            for look, typ in fields:
                path = o.find_node_value(look)

                if typ == "one":
                    if len(path) == 0:
                        if do_log:
                            fLOG(o.get_xml_content())
                        raise Exception(
                            "unable to find a value for path %s" %
                            "/".join(look))
                    val = path[0]
                    if val is None:
                        val = ""
                elif typ == "all":
                    if len(path) == 1:
                        val = path[0]
                    elif len(path) == 0:
                        val = ""
                    else:
                        val = path
                        nb += 1
                else:
                    raise Exception(
                        "the type must in (one, all) %s,%s" %
                        (look, typ))
                values.append(val)

            if nb == 0:
                yield "\t".join(values)
            else:
                # at least one field has several values:
                # one line per combination
                for v in _iteration_values(values):
                    yield "\t".join(v)

            if do_log and (i_ + 1) % 1000 == 0:
                fLOG("table_extraction_from_xml_files reading ", i_)
    finally:
        # Runs on normal end, on error and when the generator is closed
        # before being exhausted.
        if own_file:
            fileh.close()

    if do_log:
        fLOG("table_extraction_from_xml_files: end")

139 

140 

def table_extraction_from_xml_files(file, output, fields, log=False, encoding="utf-8", errors=None, fLOG=None):
    """
    Goes through a :epkg:`XML` file, extracts values and
    put them into a flat file.

    @param      file        a file
    @param      output      output file, string or file object,
    @param      fields      list of fields to get from the XML files
    @param      log         do logs if True
    @param      encoding    encoding, used to open *output* when it is a string
                            and forwarded to the function reading *file*
    @param      errors      sent to function ``open`` when *output* is a string
                            and forwarded to the function reading *file*
    @param      fLOG        logging function, ``print`` is used if *log* is True
                            and this parameter is None

    One example for fields:

    ::

        [ ("tag1/tag2", "all"),
          ("tag1/tag2/tag3/_", "one"),
          ...
        ]

    Every extracted line is written followed by the separator returned
    by ``GetSepLine()``. The output is closed by this function only
    if it was opened by it.
    """
    if log and fLOG is None:
        # The iterator needs a callable to log with; without one,
        # log=True could not work.
        fLOG = print

    own_output = isinstance(output, str)
    outputh = open(output, "w", encoding=encoding,
                   errors=errors) if own_output else output
    try:
        for line in table_extraction_from_xml_files_iterator(
                file, fields, log=log, fLOG=fLOG,
                encoding=encoding, errors=errors):
            outputh.write(line)
            outputh.write(GetSepLine())
    finally:
        if own_output:
            outputh.close()

169 

170 

def xml_filter_iterator(file, filter_=None, log=False, xmlformat=True,
                        fLOG=None, encoding="utf-8", errors=None):
    """
    Goes through a :epkg:`XML` file,
    returns :epkg:`XML` content if a condition is verified,
    the result is an iterator.

    @param      file        a file name (string) or an already opened file object
    @param      filter_     a function which takes a node and returns a boolean, if None, accepts everything
    @param      log         do logs if True (and if *fLOG* is not None)
    @param      xmlformat   if True, return the xml, otherwise return the node
    @param      fLOG        logging function
    @param      encoding    encoding, used only when *file* is a string
    @param      errors      sent to function ``open``, used only when *file* is a string
    @return                 the xml format or a node depending on the value of xmlformat

    The file is closed by this function only if it was opened by it,
    even if an exception is raised or the iteration is interrupted.
    """
    if filter_ is None:
        def filter__(node):
            return True

        filter_ = filter__

    do_log = bool(fLOG) and log
    own_file = isinstance(file, str)
    fileh = open(file, "r", encoding=encoding, errors=errors) if own_file else file

    try:
        parser = XMLIterParser()
        handler = XMLHandlerDict()
        parser.setContentHandler(handler)

        for i_, o in enumerate(parser.parse(fileh)):

            if filter_(o):
                yield o.get_xml_content() if xmlformat else o

            if do_log and (i_ + 1) % 1000 == 0:
                # The message used to name another function (copy/paste).
                fLOG("xml_filter_iterator reading ", i_)
    finally:
        # Runs on normal end, on error and when the generator is closed
        # before being exhausted.
        if own_file:
            fileh.close()

    if do_log:
        fLOG("xml_filter_iterator: end")

216 

217 

def xml_filter(file, output, filter_, log=False, xmlformat=True, encoding="utf-8", errors=None, fLOG=None):
    """
    Goes through a :epkg:`XML` file, returns :epkg:`XML` content
    if a condition is verified, the result is put into a stream.

    @param      file        a file
    @param      output      output file, string or file object
    @param      filter_     a function which takes a node and returns a boolean
    @param      log         do logs if True (and if *fLOG* is not None)
    @param      xmlformat   if True, return the xml, otherwise return the node
    @param      encoding    encoding, used to open *output* when it is a string
                            and forwarded to the function reading *file*
    @param      errors      sent to function ``open`` when *output* is a string
                            and forwarded to the function reading *file*
    @param      fLOG        logging function

    Every selected item is written followed by the separator returned
    by ``GetSepLine()``. The output is closed by this function only
    if it was opened by it.
    """
    # NOTE(review): with xmlformat=False the iterator yields nodes, which are
    # passed as-is to output.write -- confirm the output object accepts them.
    own_output = isinstance(output, str)
    # The output is written to, it must be opened in write mode
    # (it used to be opened with "r", which made every write fail).
    outputh = open(output, "w", encoding=encoding,
                   errors=errors) if own_output else output
    try:
        for line in xml_filter_iterator(
                file, filter_, log=log, xmlformat=xmlformat,
                fLOG=fLOG, encoding=encoding, errors=errors):
            outputh.write(line)
            outputh.write(GetSepLine())
    finally:
        if own_output:
            outputh.close()