Coverage for src/ensae_projects/hackathon/json_helper.py: 93%
119 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Helpers for the hackathon 2017 (Label Emmaüs).
5"""
6import os
7from io import BytesIO
8import ijson
9from pyquickhelper.loghelper import noLOG
def enumerate_json_items(filename, encoding=None, fLOG=noLOG):
    """
    Enumerates items from a JSON file or string.

    @param      filename        filename or string or stream to parse
    @param      encoding        encoding of the file when *filename* is a path,
                                None or UTF-8 means the bytes are given to the
                                parser unchanged, any other encoding is converted
                                into UTF-8 on the fly, it is ignored for strings
                                and streams
    @param      fLOG            logging function
    @return                     iterator on records at first level.

    It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``.
    Only the dictionaries found at the first level of the top-level array
    are yielded, other top-level shapes are not supported.

    .. exref::
        :title: Processes a json file by streaming.

        The module :epkg:`ijson` can read a JSON file by streaming.
        This module is needed because a record can be written on multiple lines.
        This function leverages it and produces the following results.

        .. runpython::
            :showcode:

            from ensae_projects.hackathon import enumerate_json_items

            text_json = '''
                [
                {
                    "glossary": {
                        "title": "example glossary",
                        "GlossDiv": {
                            "title": "S",
                            "GlossList": [{
                                "GlossEntry": {
                                    "ID": "SGML",
                                    "SortAs": "SGML",
                                    "GlossTerm": "Standard Generalized Markup Language",
                                    "Acronym": "SGML",
                                    "Abbrev": "ISO 8879:1986",
                                    "GlossDef": {
                                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                                        "GlossSeeAlso": ["GML", "XML"]
                                    },
                                    "GlossSee": "markup"
                                }
                            }]
                        }
                    }
                },
                {
                    "glossary": {
                        "title": "example glossary",
                        "GlossDiv": {
                            "title": "S",
                            "GlossList": {
                                "GlossEntry": [{
                                    "ID": "SGML",
                                    "SortAs": "SGML",
                                    "GlossTerm": "Standard Generalized Markup Language",
                                    "Acronym": "SGML",
                                    "Abbrev": "ISO 8879:1986",
                                    "GlossDef": {
                                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                                        "GlossSeeAlso": ["GML", "XML"]
                                    },
                                    "GlossSee": "markup"
                                }]
                            }
                        }
                    }
                }
                ]
            '''

            for item in enumerate_json_items(text_json):
                print('------------')
                print(item)
    """
    if isinstance(filename, str):
        if "{" not in filename and os.path.exists(filename):
            # Local import: only this branch needs it.
            import codecs

            # The file is opened in binary mode *without* an encoding:
            # ``open(..., "rb", encoding=...)`` raises ValueError as soon as
            # an encoding is specified.
            with open(filename, "rb") as f:
                stream = f
                if (encoding is not None and
                        codecs.lookup(encoding).name != "utf-8"):
                    # The parser receives bytes: they are converted from the
                    # requested encoding into UTF-8 while reading.
                    stream = codecs.EncodedFile(f, "utf-8", encoding)
                yield from enumerate_json_items(
                    stream, encoding=encoding, fLOG=fLOG)
        else:
            # Any other string is considered as JSON content.
            st = BytesIO(filename.encode('utf-8'))
            yield from enumerate_json_items(st, encoding=encoding, fLOG=fLOG)
        return

    # *filename* is a stream, the events produced by ijson are used
    # to rebuild every first-level record.
    parser = ijson.parse(filename)
    current = None  # container being filled
    curkey = None   # pending key when *current* is a dictionary
    stack = []      # opened containers, stack[0] is the top-level array
    nbyield = 0
    for i, (_, event, value) in enumerate(parser):
        if i % 1000000 == 0:
            fLOG("[enumerate_json_items] i={0} yielded={1}".format(
                i, nbyield))
        if event == "start_array":
            c = []
            if curkey is not None:
                if not isinstance(current, dict):
                    raise RuntimeError(
                        "Type issue {0}".format(type(current)))
                current[curkey] = c  # pylint: disable=E1137
                curkey = None
            elif isinstance(current, list):
                # An array directly nested in another array must be
                # attached to its parent, otherwise its content is lost.
                current.append(c)
            current = c
            stack.append(c)
        elif event == "end_array":
            stack.pop()
            # An empty stack means the top-level array is closed,
            # we should be done.
            current = stack[-1] if stack else None
        elif event == "start_map":
            c = {}
            if curkey is None:
                current.append(c)
            else:
                current[curkey] = c  # pylint: disable=E1137
            stack.append(c)
            current = c
            curkey = None
        elif event == "end_map":
            stack.pop()
            current = stack[-1]
            if len(stack) == 1:
                # Only the top-level array remains opened:
                # a first-level record is complete.
                nbyield += 1
                yield current[-1]
                # We clear the memory.
                current.clear()
        elif event == "map_key":
            curkey = value
        elif event in {"string", "number", "boolean", "null"}:
            scalar = None if event == "null" else value
            if curkey is None:
                current.append(scalar)
            else:
                current[curkey] = scalar  # pylint: disable=E1137
            curkey = None
        else:
            raise ValueError("Unknown event '{0}'".format(event))
def extract_images_from_json_2017(filename, encoding=None, fLOG=noLOG):
    """
    Extracts fields from a JSON files such as images.

    @param      filename        filename
    @param      encoding        encoding
    @param      fLOG            logging function
    @return                     iterator on images

    The function yields one dictionary per distinct image path of every
    record which has a non empty ``best_offer`` with a ``product``.
    Every yielded dictionary is a new object, it can be stored
    without being copied.

    One example on dummy data implementing a subset of the fields
    the JSON contains. This can be easily converted into a dataframe.

    .. runpython::
        :showcode:

        from ensae_projects.hackathon import extract_images_from_json_2017

        text_json = '''
            [
            {"assigned_images": [],
            "best_offer": {"created_on": "2016-11-04T23:20:53+01:00", "images": [], "offer_longitude": null, "availability": "in_stock",
            "start_selling_date": null, "delay_before_shipping": 0.00, "free_return": null, "free_shipping": null,
            "assigned_images": [{"image_path": "https://coucou.JPEG"}],
            "id": 1306501, "eco_tax": 0.000000, "keywords": ["boutique", "test"],
            "sku": "AAAA27160018",
            "product": {"pk": 2550, "external_id": null, "id": 2580},
            "description": "livre l", "last_modified": "2016-11-04T23:27:01+01:00",
            "name": "les names", "language": "fr"}, "id": 25540,
            "description": "livre 2", "slug": "les-l",
            "application_categories": [280, 283], "product_type": "physical",
            "name": "les l n", "language": "fr", "popularity": 99, "gender": null
            }
            ]
        '''

        items = []
        for item in extract_images_from_json_2017(text_json):
            print(item)
            items.append(item)

        from pandas import DataFrame
        df = DataFrame(items)
        print(df)
    """
    for record in enumerate_json_items(filename, encoding=encoding, fLOG=fLOG):
        # Records without any offer or without any product are skipped.
        best = record.get("best_offer")
        if not best:
            continue
        product = best.get("product")
        if product is None:
            continue

        # Images of the offer come first, then the images of the record.
        images = []
        if best.get("assigned_images"):
            images.extend(best["assigned_images"])
        if record.get("assigned_images"):
            images.extend(record["assigned_images"])

        res = {}
        res["product_pk"] = product.get("pk")
        res["product_id"] = product.get("id")
        # NOTE(review): the example in the docstring stores "created_on" and
        # "keywords" in "best_offer" and has no "id2" key, these three fields
        # are read from the record; to be confirmed against the real data.
        res["id2"] = record.get("id2")
        res["sku"] = best.get("sku")
        res["created_on"] = record.get("created_on")
        res["keywords"] = record.get("keywords")
        if isinstance(res["keywords"], list):
            res['keywords'] = ";".join(res['keywords'])
        res["availability"] = best.get("availability")
        res["eco_tax"] = best.get("eco_tax")
        res["restock_date"] = best.get("restock_date")
        res["status"] = best.get("status")
        res["number_of_items"] = best.get("number_of_items")
        res["price_with_vat"] = best.get("price_with_vat")
        res["price_without_vat"] = best.get("price_without_vat")
        res["previous_price_without_vat"] = best.get(
            "previous_price_without_vat")
        res["max_order_quantity"] = best.get("max_order_quantity")
        res["stock"] = best.get("stock")
        res["start_selling_date"] = best.get("start_selling_date")
        res["description"] = record.get("description")
        if isinstance(res["description"], str):
            # The description is kept on a single line.
            res["description"] = res["description"].replace(
                "\n", "\\n").replace("\t", "\\t").replace("\r", "")
        res["last_modified"] = best.get("last_modified")
        res["name"] = record.get("name")
        res["product_type"] = record.get("product_type")
        res["gender"] = record.get("gender")
        res["popularity"] = record.get("popularity")
        res["application_categories"] = record.get("application_categories")
        if isinstance(res["application_categories"], list):
            res["application_categories"] = ",".join(
                map(str, res["application_categories"]))
        res["language"] = record.get("language")

        paths = [im.get("image_path") for im in images]
        done = set()
        for p in paths:
            if p and p not in done:
                done.add(p)
                # A new dictionary is yielded for every image: yielding
                # *res* itself would let the next image overwrite
                # "image_path" in the rows the caller already stored.
                row = dict(res)
                row["image_path"] = p
                yield row