Coverage for src/ensae_projects/hackathon/json_helper.py

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Helpers for the hackathon 2017 (Label Emmaüs).

5"""

6import os

7from io import BytesIO

8import ijson

9from pyquickhelper.loghelper import noLOG

def enumerate_json_items(filename, encoding=None, fLOG=noLOG):
    """
    Enumerates items from a JSON file or string.

    @param      filename        filename or string or stream to parse
    @param      encoding        encoding of the file when *filename* is the
                                path of an existing file, the content is then
                                transcoded to UTF-8 before being parsed;
                                it is not used for strings and streams
    @param      fLOG            logging function
    @return                     iterator on records at first level.

    It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``.
    A string is considered as a filename if it does not contain ``{``
    and if the file exists, otherwise it is parsed as JSON content.
    The function raises *ValueError* if the first level is not an array
    or if the parser emits an unknown event, and *RuntimeError*
    if an array must be stored under a key of something which is not
    a dictionary.

    .. exref::
        :title: Processes a json file by streaming.

        The module :epkg:`ijson` can read a JSON file by streaming.
        This module is needed because a record can be written on multiple lines.
        This function leverages it and produces the following results.

        .. runpython::
            :showcode:

            from ensae_projects.hackathon import enumerate_json_items

            text_json = '''
                [
                {
                    "glossary": {
                        "title": "example glossary",
                        "GlossDiv": {
                            "title": "S",
                            "GlossList": [{
                                "GlossEntry": {
                                    "ID": "SGML",
                                    "SortAs": "SGML",
                                    "GlossTerm": "Standard Generalized Markup Language",
                                    "Acronym": "SGML",
                                    "Abbrev": "ISO 8879:1986",
                                    "GlossDef": {
                                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                                        "GlossSeeAlso": ["GML", "XML"]
                                    },
                                    "GlossSee": "markup"
                                }
                            }]
                        }
                    }
                },
                {
                    "glossary": {
                        "title": "example glossary",
                        "GlossDiv": {
                            "title": "S",
                            "GlossList": {
                                "GlossEntry": [{
                                    "ID": "SGML",
                                    "SortAs": "SGML",
                                    "GlossTerm": "Standard Generalized Markup Language",
                                    "Acronym": "SGML",
                                    "Abbrev": "ISO 8879:1986",
                                    "GlossDef": {
                                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                                        "GlossSeeAlso": ["GML", "XML"]
                                    },
                                    "GlossSee": "markup"
                                }]
                            }
                        }
                    }
                }
                ]
            '''

            for item in enumerate_json_items(text_json):
                print('------------')
                print(item)
    """
    if isinstance(filename, str):
        if "{" not in filename and os.path.exists(filename):
            # The parser is fed with bytes. open() refuses an encoding in
            # binary mode, so a user-specified encoding is handled by
            # transcoding the content to UTF-8 while it is being read.
            with open(filename, "rb") as f:
                if encoding is None:
                    stream = f
                else:
                    import codecs
                    stream = codecs.EncodedFile(f, "utf-8", encoding)
                yield from enumerate_json_items(
                    stream, encoding=encoding, fLOG=fLOG)
        else:
            # Any other string is considered as JSON content.
            st = BytesIO(filename.encode('utf-8'))
            yield from enumerate_json_items(st, encoding=encoding, fLOG=fLOG)
        return

    parser = ijson.parse(filename)
    current = None  # container being filled (list or dict)
    curkey = None   # pending key when the container is a dictionary
    stack = []      # opened containers, stack[0] is the first level array
    nbyield = 0
    for i, (_, event, value) in enumerate(parser):
        if i % 1000000 == 0:
            fLOG("[enumerate_json_items] i={0} yielded={1}".format(
                i, nbyield))
        if event == "start_array":
            c = []
            if curkey is not None:
                if not isinstance(current, dict):
                    raise RuntimeError(
                        "Type issue {0}".format(type(current)))
                current[curkey] = c  # pylint: disable=E1137
                curkey = None
            elif current is not None:
                # Array nested into another array: it must be attached
                # to its parent otherwise its content is lost.
                current.append(c)
            current = c
            stack.append(c)
        elif event == "end_array":
            stack.pop()
            if len(stack) == 0:
                # We should be done.
                current = None
            else:
                current = stack[-1]
        elif event == "start_map":
            c = {}
            if curkey is not None:
                current[curkey] = c  # pylint: disable=E1137
            elif current is None:
                raise ValueError(
                    "The first level of the JSON content must be an array, "
                    "unexpected event '{0}'".format(event))
            else:
                current.append(c)
            stack.append(c)
            current = c
            curkey = None
        elif event == "end_map":
            stack.pop()
            current = stack[-1]
            if len(stack) == 1:
                # Back to the first level array: a record is complete.
                nbyield += 1
                yield current[-1]
                # We clear the memory.
                current.clear()
        elif event == "map_key":
            curkey = value
        elif event in {"string", "number", "boolean", "null"}:
            item = None if event == "null" else value
            if curkey is not None:
                current[curkey] = item  # pylint: disable=E1137
                curkey = None
            elif current is None:
                raise ValueError(
                    "The first level of the JSON content must be an array, "
                    "unexpected event '{0}'".format(event))
            else:
                current.append(item)
        else:
            raise ValueError("Unknown event '{0}'".format(event))

162

163

def extract_images_from_json_2017(filename, encoding=None, fLOG=noLOG):
    """
    Extracts fields from a JSON files such as images.

    @param      filename        filename
    @param      encoding        encoding
    @param      fLOG            logging function
    @return                     iterator on images

    The function yields one dictionary per distinct, non-empty
    ``image_path`` found in the key ``assigned_images`` of the best offer
    and of the record itself. Records with no (or an empty) ``best_offer``
    and records whose best offer has no ``product`` are skipped.

    .. note:: Every yielded dictionary is a new object.

        Dictionaries can be stored as they are (in a list for example),
        the function does not modify them after they are yielded.

    One example on dummy data implementing a subset of the fields
    the JSON contains. This can be easily converted into a dataframe.

    .. runpython::
        :showcode:

        from ensae_projects.hackathon import extract_images_from_json_2017

        text_json = '''
            [
            {"assigned_images": [],
            "best_offer": {"created_on": "2016-11-04T23:20:53+01:00", "images": [], "offer_longitude": null, "availability": "in_stock",
                           "start_selling_date": null, "delay_before_shipping": 0.00, "free_return": null, "free_shipping": null,
                           "assigned_images": [{"image_path": "https://coucou.JPEG"}],
                           "id": 1306501, "eco_tax": 0.000000, "keywords": ["boutique", "test"],
                           "sku": "AAAA27160018",
                           "product": {"pk": 2550, "external_id": null, "id": 2580},
                           "description": "livre l", "last_modified": "2016-11-04T23:27:01+01:00",
                           "name": "les names", "language": "fr"}, "id": 25540,
            "description": "livre 2", "slug": "les-l",
            "application_categories": [280, 283], "product_type": "physical",
            "name": "les l n", "language": "fr", "popularity": 99, "gender": null
            }
            ]
        '''

        items = []
        for item in extract_images_from_json_2017(text_json):
            print(item)
            items.append(item)

        from pandas import DataFrame
        df = DataFrame(items)
        print(df)
    """
    for record in enumerate_json_items(filename, encoding=encoding, fLOG=fLOG):
        # Guard clauses: best and product are always bound to the
        # current record before being used.
        best = record.get("best_offer")
        if not best:
            continue
        product = best.get("product")
        if product is None:
            continue

        # Images of the best offer come first, then the images of the record.
        images = []
        images.extend(best.get("assigned_images") or [])
        images.extend(record.get("assigned_images") or [])

        # NOTE(review): the record of the example above has "created_on"
        # and "keywords" inside "best_offer" and an "id" but no "id2",
        # the three fields are read from the record here,
        # to be confirmed against real data.
        keywords = record.get("keywords")
        if isinstance(keywords, list):
            keywords = ";".join(keywords)

        description = record.get("description")
        if isinstance(description, str):
            # End of lines and tabulations are escaped to keep the
            # description on a single line.
            description = description.replace(
                "\n", "\\n").replace("\t", "\\t").replace("\r", "")

        categories = record.get("application_categories")
        if isinstance(categories, list):
            categories = ",".join(map(str, categories))

        # The order of the keys defines the order of the columns
        # if the results are converted into a dataframe.
        res = {
            "product_pk": product.get("pk"),
            "product_id": product.get("id"),
            "id2": record.get("id2"),
            "sku": best.get("sku"),
            "created_on": record.get("created_on"),
            "keywords": keywords,
            "availability": best.get("availability"),
            "eco_tax": best.get("eco_tax"),
            "restock_date": best.get("restock_date"),
            "status": best.get("status"),
            "number_of_items": best.get("number_of_items"),
            "price_with_vat": best.get("price_with_vat"),
            "price_without_vat": best.get("price_without_vat"),
            "previous_price_without_vat": best.get(
                "previous_price_without_vat"),
            "max_order_quantity": best.get("max_order_quantity"),
            "stock": best.get("stock"),
            "start_selling_date": best.get("start_selling_date"),
            "description": description,
            "last_modified": best.get("last_modified"),
            "name": record.get("name"),
            "product_type": record.get("product_type"),
            "gender": record.get("gender"),
            "popularity": record.get("popularity"),
            "application_categories": categories,
            "language": record.get("language"),
        }

        paths = [im.get("image_path") for im in images]
        done = set()
        for p in paths:
            if p and p not in done:
                done.add(p)
                # A new dictionary for every image: the caller can store
                # it without any risk of seeing it modified later.
                yield dict(res, image_path=p)

Coverage for src/ensae_projects/hackathon/json_helper.py: 93%

119 statements