Coverage for src/ensae_projects/hackathon/ 93%

119 statements  

« prev     ^ index     » next v7.1.0, created at 2023-07-20 04:37 +0200

1# -*- coding: utf-8 -*- 



4@brief Helpers for the hackathon 2017 (Label Emmaüs). 


6import os 

7from io import BytesIO 

8import ijson 

9from pyquickhelper.loghelper import noLOG 



def enumerate_json_items(filename, encoding=None, fLOG=noLOG):
    """
    Enumerates items from a JSON file or string.

    @param      filename        filename or string or stream to parse
    @param      encoding        encoding of the file when *filename* is a path;
                                *None* or any UTF-8 alias means the bytes are
                                handed to :epkg:`ijson` untouched, any other
                                encoding is transcoded to UTF-8 on the fly,
                                it is ignored for strings and streams
    @param      fLOG            logging function
    @return                     iterator on records at first level.

    It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``.

    A string is considered as a filename if it does not contain ``{``
    and the path exists, otherwise it is parsed as JSON content.
    The stream is expected to deliver bytes.

    .. exref::
        :title: Processes a json file by streaming.

        The module :epkg:`ijson` can read a JSON file by streaming.
        This module is needed because a record can be written on multiple lines.
        This function leverages it produces the following results.

        .. runpython::
            :showcode:

            from ensae_projects.hackathon import enumerate_json_items

            text_json = '''
                [
                {
                    "glossary": {
                        "title": "example glossary",
                        "GlossDiv": {
                            "title": "S",
                            "GlossList": [{
                                "GlossEntry": {
                                    "ID": "SGML",
                                    "SortAs": "SGML",
                                    "GlossTerm": "Standard Generalized Markup Language",
                                    "Acronym": "SGML",
                                    "Abbrev": "ISO 8879:1986",
                                    "GlossDef": {
                                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                                        "GlossSeeAlso": ["GML", "XML"]
                                    },
                                    "GlossSee": "markup"
                                }
                            }]
                        }
                    }
                },
                {
                    "glossary": {
                        "title": "example glossary",
                        "GlossDiv": {
                            "title": "S",
                            "GlossList": {
                                "GlossEntry": [{
                                    "ID": "SGML",
                                    "SortAs": "SGML",
                                    "GlossTerm": "Standard Generalized Markup Language",
                                    "Acronym": "SGML",
                                    "Abbrev": "ISO 8879:1986",
                                    "GlossDef": {
                                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                                        "GlossSeeAlso": ["GML", "XML"]
                                    },
                                    "GlossSee": "markup"
                                }]
                            }
                        }
                    }
                }
                ]
            '''

            for item in enumerate_json_items(text_json):
                print('------------')
                print(item)
    """
    if isinstance(filename, str):
        if "{" not in filename and os.path.exists(filename):
            # The parser consumes bytes. ``open`` refuses an ``encoding``
            # argument in binary mode, so the file is always opened as raw
            # bytes and transcoded to UTF-8 only when another encoding is
            # explicitly requested.
            import codecs
            with open(filename, "rb") as f:
                stream = f
                if (encoding is not None and
                        codecs.lookup(encoding).name != "utf-8"):
                    stream = codecs.EncodedFile(f, "utf-8", encoding)
                yield from enumerate_json_items(
                    stream, encoding=encoding, fLOG=fLOG)
        else:
            # The string holds the JSON content itself.
            st = BytesIO(filename.encode('utf-8'))
            yield from enumerate_json_items(st, encoding=encoding, fLOG=fLOG)
        return

    # Scalar events carrying a value to store as is. "integer" and "double"
    # are listed by ijson next to "number"; accepting them is harmless if the
    # active backend never emits them.
    scalar_events = {"string", "number", "integer", "double", "boolean"}

    parser = ijson.parse(filename)
    current = None   # container being filled (list or dict)
    curkey = None    # pending key when the current container is a dict
    stack = []       # opened containers, the top-level array comes first
    nbyield = 0
    for i, (_, event, value) in enumerate(parser):
        if i % 1000000 == 0:
            fLOG("[enumerate_json_items] i={0} yielded={1}".format(
                i, nbyield))
        if event == "start_array":
            c = []
            if curkey is not None:
                if not isinstance(current, dict):
                    raise RuntimeError(
                        "Type issue {0}".format(type(current)))
                current[curkey] = c
                curkey = None
            elif isinstance(current, list):
                # Array nested directly inside another array: it must be
                # attached to its parent or its content is lost.
                current.append(c)
            current = c
            stack.append(c)
        elif event == "end_array":
            stack.pop()
            # An empty stack means the top-level array is closed,
            # we should be done.
            current = stack[-1] if stack else None
        elif event == "start_map":
            c = {}
            if curkey is None:
                current.append(c)
            else:
                current[curkey] = c  # pylint: disable=E1137
            stack.append(c)
            current = c
            curkey = None
        elif event == "end_map":
            stack.pop()
            current = stack[-1]
            if len(stack) == 1:
                # Only the top-level array remains opened:
                # a first level record is complete.
                nbyield += 1
                yield current[-1]
                # We clear the memory.
                current.clear()
        elif event == "map_key":
            curkey = value
        elif event in scalar_events or event == "null":
            item = None if event == "null" else value
            if curkey is None:
                current.append(item)
            else:
                current[curkey] = item  # pylint: disable=E1137
                curkey = None
        else:
            raise ValueError("Unknown event '{0}'".format(event))



def extract_images_from_json_2017(filename, encoding=None, fLOG=noLOG):
    """
    Extracts fields from a JSON files such as images.

    @param      filename        filename
    @param      encoding        encoding
    @param      fLOG            logging function
    @return                     iterator on images

    Records without a non-empty ``best_offer``, without images in
    ``best_offer`` or without a ``product`` in ``best_offer`` are skipped.
    One dictionary is produced for every distinct, non-empty ``image_path``
    found in the images of the offer and of the record.

    .. warning:: One dictionary per image

        Every enumerated dictionary is a new object, it can be stored
        without being copied even when a record holds several images.

    One example on dummy data implementing a subset of the fields
    the JSON contains. This can be easily converted into a dataframe.

    .. runpython::
        :showcode:

        from ensae_projects.hackathon import extract_images_from_json_2017

        text_json = '''
            [
            {"assigned_images": [],
            "best_offer": {"created_on": "2016-11-04T23:20:53+01:00", "images": [], "offer_longitude": null, "availability": "in_stock",
            "start_selling_date": null, "delay_before_shipping": 0.00, "free_return": null, "free_shipping": null,
            "assigned_images": [{"image_path": "https://coucou.JPEG"}],
            "id": 1306501, "eco_tax": 0.000000, "keywords": ["boutique", "test"],
            "sku": "AAAA27160018",
            "product": {"pk": 2550, "external_id": null, "id": 2580},
            "description": "livre l", "last_modified": "2016-11-04T23:27:01+01:00",
            "name": "les names", "language": "fr"}, "id": 25540,
            "description": "livre 2", "slug": "les-l",
            "application_categories": [280, 283], "product_type": "physical",
            "name": "les l n", "language": "fr", "popularity": 99, "gender": null
            }
            ]
        '''

        items = []
        for item in extract_images_from_json_2017(text_json):
            print(item)
            items.append(item)

        from pandas import DataFrame
        df = DataFrame(items)
        print(df)
    """
    for record in enumerate_json_items(filename, encoding=encoding, fLOG=fLOG):
        best = record.get("best_offer")
        if not best:
            continue
        offer_images = best.get("assigned_images")
        if not offer_images:
            continue
        product = best.get("product")
        if product is None:
            continue

        # Images of the offer come first, then the images of the record.
        images = list(offer_images)
        record_images = record.get("assigned_images")
        if record_images:
            images.extend(record_images)

        keywords = record.get("keywords")
        if isinstance(keywords, list):
            keywords = ";".join(keywords)

        description = record.get("description")
        if isinstance(description, str):
            # Keeps the description on a single line.
            description = description.replace(
                "\n", "\\n").replace("\t", "\\t").replace("\r", "")

        categories = record.get("application_categories")
        if isinstance(categories, list):
            categories = ",".join(map(str, categories))

        # The order of the keys is the order of the columns
        # once converted into a dataframe.
        res = {
            "product_pk": product.get("pk"),
            "product_id": product.get("id"),
            "id2": record.get("id2"),
            "sku": best.get("sku"),
            "created_on": record.get("created_on"),
            "keywords": keywords,
            "availability": best.get("availability"),
            "eco_tax": best.get("eco_tax"),
            "restock_date": best.get("restock_date"),
            "status": best.get("status"),
            "number_of_items": best.get("number_of_items"),
            "price_with_vat": best.get("price_with_vat"),
            "price_without_vat": best.get("price_without_vat"),
            "previous_price_without_vat": best.get(
                "previous_price_without_vat"),
            "max_order_quantity": best.get("max_order_quantity"),
            "stock": best.get("stock"),
            "start_selling_date": best.get("start_selling_date"),
            "description": description,
            "last_modified": best.get("last_modified"),
            "name": record.get("name"),
            "product_type": record.get("product_type"),
            "gender": record.get("gender"),
            "popularity": record.get("popularity"),
            "application_categories": categories,
            "language": record.get("language"),
        }

        done = set()
        for im in images:
            p = im.get("image_path")
            if p and p not in done:
                done.add(p)
                # A new dictionary for every image: a consumer storing the
                # results never sees a row modified by the next iteration.
                yield dict(res, image_path=p)