Coverage for src/ensae_projects/hackathon/json_helper.py: 93%

119 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-07-20 04:37 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Helpers for the hackathon 2017 (Label Emmaüs). 

5""" 

6import os 

7from io import BytesIO 

8import ijson 

9from pyquickhelper.loghelper import noLOG 

10 

11 

def enumerate_json_items(filename, encoding=None, fLOG=noLOG):
    """
    Enumerates items from a JSON file or string.

    @param      filename        filename or string or stream to parse
    @param      encoding        encoding of the file when *filename* is a path,
                                a JSON string is always encoded as utf-8 and
                                a stream is handed over to :epkg:`ijson` as is
    @param      fLOG            logging function
    @return                     iterator on records at first level.

    It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``.
    A string which contains no ``{`` and is the name of an existing file
    is opened as a file, any other string is parsed as JSON content.

    .. exref::
        :title: Processes a json file by streaming.

        The module :epkg:`ijson` can read a JSON file by streaming.
        This module is needed because a record can be written on multiple lines.
        This function leverages it and produces the following results.

        .. runpython::
            :showcode:

            from ensae_projects.hackathon import enumerate_json_items

            text_json = '''
                [
                {
                    "glossary": {
                        "title": "example glossary",
                        "GlossDiv": {
                            "title": "S",
                            "GlossList": [{
                                "GlossEntry": {
                                    "ID": "SGML",
                                    "SortAs": "SGML",
                                    "GlossTerm": "Standard Generalized Markup Language",
                                    "Acronym": "SGML",
                                    "Abbrev": "ISO 8879:1986",
                                    "GlossDef": {
                                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                                        "GlossSeeAlso": ["GML", "XML"]
                                    },
                                    "GlossSee": "markup"
                                }
                            }]
                        }
                    }
                },
                {
                    "glossary": {
                        "title": "example glossary",
                        "GlossDiv": {
                            "title": "S",
                            "GlossList": {
                                "GlossEntry": [{
                                    "ID": "SGML",
                                    "SortAs": "SGML",
                                    "GlossTerm": "Standard Generalized Markup Language",
                                    "Acronym": "SGML",
                                    "Abbrev": "ISO 8879:1986",
                                    "GlossDef": {
                                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                                        "GlossSeeAlso": ["GML", "XML"]
                                    },
                                    "GlossSee": "markup"
                                }]
                            }
                        }
                    }
                }
                ]
            '''

            for item in enumerate_json_items(text_json):
                print('------------')
                print(item)
    """
    if isinstance(filename, str):
        if "{" not in filename and os.path.exists(filename):
            # Local import: codecs is only needed for this branch.
            import codecs
            # Binary mode does not accept an encoding argument, the file is
            # opened as bytes and transcoded to utf-8 only when needed.
            with open(filename, "rb") as f:
                stream = f
                if (encoding is not None and
                        codecs.lookup(encoding).name != "utf-8"):
                    stream = codecs.EncodedFile(f, "utf-8", encoding)
                yield from enumerate_json_items(
                    stream, encoding=encoding, fLOG=fLOG)
        else:
            # Any other string is considered as JSON content.
            st = BytesIO(filename.encode('utf-8'))
            yield from enumerate_json_items(st, encoding=encoding, fLOG=fLOG)
        return

    parser = ijson.parse(filename)
    current = None   # container (list or dict) being filled
    curkey = None    # pending key when the container is a dict
    stack = []       # opened containers, the outermost one comes first
    nbyield = 0
    for i, (_, event, value) in enumerate(parser):
        if i % 1000000 == 0:
            fLOG("[enumerate_json_items] i={0} yielded={1}".format(
                i, nbyield))
        if event == "start_array":
            c = []
            if curkey is not None:
                if not isinstance(current, dict):
                    raise RuntimeError(
                        "Type issue {0}".format(type(current)))
                current[curkey] = c  # pylint: disable=E1137
                curkey = None
            elif isinstance(current, list):
                # An array nested in another array must be attached
                # to its parent, otherwise its content is lost.
                current.append(c)
            current = c
            stack.append(c)
        elif event == "end_array":
            stack.pop()
            # An empty stack means the outermost array was closed.
            current = stack[-1] if stack else None
        elif event == "start_map":
            c = {}
            if curkey is None:
                current.append(c)
            else:
                current[curkey] = c  # pylint: disable=E1137
            stack.append(c)
            current = c
            curkey = None
        elif event == "end_map":
            stack.pop()
            current = stack[-1]
            if len(stack) == 1:
                # Only the outermost container remains opened:
                # a first level record was just completed.
                nbyield += 1
                yield current[-1]
                # We clear the memory.
                current.clear()
        elif event == "map_key":
            curkey = value
        elif event in {"string", "number", "boolean"}:
            if curkey is None:
                current.append(value)
            else:
                current[curkey] = value  # pylint: disable=E1137
            curkey = None
        elif event == "null":
            if curkey is None:
                current.append(None)
            else:
                current[curkey] = None  # pylint: disable=E1137
            curkey = None
        else:
            raise ValueError("Unknown event '{0}'".format(event))

162 

163 

def extract_images_from_json_2017(filename, encoding=None, fLOG=noLOG):
    """
    Extracts fields from a JSON files such as images.

    @param      filename        filename
    @param      encoding        encoding
    @param      fLOG            logging function
    @return                     iterator on images

    A record is skipped if it has no *best_offer* or if this offer
    has no *product*. One dictionary is produced for every distinct,
    non empty *image_path* found in the images assigned to the offer
    and to the record.

    .. note::

        Every enumerated dictionary is a new object, it can be stored
        without being copied.

    One example on dummy data implementing a subset of the fields
    the JSON contains. This can be easily converted into a dataframe.

    .. runpython::
        :showcode:

        from ensae_projects.hackathon import extract_images_from_json_2017

        text_json = '''
            [
            {"assigned_images": [],
            "best_offer": {"created_on": "2016-11-04T23:20:53+01:00", "images": [], "offer_longitude": null, "availability": "in_stock",
            "start_selling_date": null, "delay_before_shipping": 0.00, "free_return": null, "free_shipping": null,
            "assigned_images": [{"image_path": "https://coucou.JPEG"}],
            "id": 1306501, "eco_tax": 0.000000, "keywords": ["boutique", "test"],
            "sku": "AAAA27160018",
            "product": {"pk": 2550, "external_id": null, "id": 2580},
            "description": "livre l", "last_modified": "2016-11-04T23:27:01+01:00",
            "name": "les names", "language": "fr"}, "id": 25540,
            "description": "livre 2", "slug": "les-l",
            "application_categories": [280, 283], "product_type": "physical",
            "name": "les l n", "language": "fr", "popularity": 99, "gender": null
            }
            ]
        '''

        items = []
        for item in extract_images_from_json_2017(text_json):
            print(item)
            items.append(item)

        from pandas import DataFrame
        df = DataFrame(items)
        print(df)
    """
    for record in enumerate_json_items(filename, encoding=encoding, fLOG=fLOG):
        best = record.get("best_offer")
        if not best:
            # No offer, nothing to extract.
            continue
        images = []
        if best.get("assigned_images"):
            images.extend(best["assigned_images"])
        product = best.get("product")
        if product is None:
            continue
        if record.get("assigned_images"):
            images.extend(record["assigned_images"])

        # NOTE(review): "id2", "created_on" and "keywords" are read from the
        # record whereas the sample in the docstring has "id" at record level
        # and "created_on", "keywords" inside "best_offer" -- confirm the
        # intended source of these fields.
        res = {}
        res["product_pk"] = product.get("pk")
        res["product_id"] = product.get("id")
        res["id2"] = record.get("id2")
        res["sku"] = best.get("sku")
        res["created_on"] = record.get("created_on")
        res["keywords"] = record.get("keywords")
        if isinstance(res["keywords"], list):
            # map(str, ...) keeps the join working with non string keywords.
            res['keywords'] = ";".join(map(str, res['keywords']))
        for key in ("availability", "eco_tax", "restock_date", "status",
                    "number_of_items", "price_with_vat", "price_without_vat",
                    "previous_price_without_vat", "max_order_quantity",
                    "stock", "start_selling_date"):
            res[key] = best.get(key)
        res["description"] = record.get("description")
        if isinstance(res["description"], str):
            # The description is kept on a single line.
            res["description"] = res["description"].replace(
                "\n", "\\n").replace("\t", "\\t").replace("\r", "")
        res["last_modified"] = best.get("last_modified")
        for key in ("name", "product_type", "gender", "popularity"):
            res[key] = record.get(key)
        res["application_categories"] = record.get("application_categories")
        if isinstance(res["application_categories"], list):
            res["application_categories"] = ",".join(
                map(str, res["application_categories"]))
        res["language"] = record.get("language")

        paths = [im.get("image_path") for im in images]
        done = set()
        for p in paths:
            if p and p not in done:
                done.add(p)
                # A new dictionary is built for every image, the caller
                # can store it, the next iteration does not modify it.
                yield dict(res, image_path=p)