Coverage for src/ensae_projects/hackathon/image_helper.py: 75%

303 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-07-20 04:37 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Helpers for the hackathon 2017 (Label Emmaüs). 

5""" 

6import os 

7from io import BytesIO 

8from collections import Counter 

9import hashlib 

10import pickle 

11import warnings 

12import shutil 

13import numpy 

14from numpy.random import RandomState 

15import pandas 

16from PIL import Image 

17from sklearn.model_selection import train_test_split 

18 

19 

def resize_image(filename_or_bytes, maxdim=512, dest=None, format=None):  # pylint: disable=W0622
    """
    Resizes an image by dividing both dimensions by two as many times
    as needed for the smallest dimension to be not greater than *maxdim*.

    @param      filename_or_bytes   filename (*str*) or image content (*bytes*)
    @param      maxdim              maximum value allowed for the smallest
                                    dimension, must be at least 1
    @param      dest                destination filename, only used if
                                    *filename_or_bytes* is a filename,
                                    the source file is overwritten if None
    @param      format              saved image format, only used if
                                    *filename_or_bytes* is bytes, if it is
                                    None or empty, the format detected by
                                    :epkg:`PIL` when opening the image is used
    @return                         None if *filename_or_bytes* is a filename
                                    (the result is written on disk),
                                    bytes otherwise (the input itself if the
                                    image does not need to be resized)
    @raises     ValueError          if *maxdim* is lower than 1
    @raises     TypeError           if *filename_or_bytes* is neither
                                    *str* nor *bytes*
    """
    if maxdim < 1:
        # A non positive threshold can never be reached by halving:
        # the loop below would end with an empty image or never stop.
        raise ValueError("maxdim must be >= 1, not {0}".format(maxdim))

    if isinstance(filename_or_bytes, str):
        ext = os.path.splitext(filename_or_bytes)[-1][1:]
        with open(filename_or_bytes, "rb") as f:
            r = resize_image(f.read(), maxdim=maxdim, format=ext)
        if dest is None:
            dest = filename_or_bytes
        with open(dest, "wb") as f:
            f.write(r)
        return None

    if isinstance(filename_or_bytes, bytes):
        img = Image.open(BytesIO(filename_or_bytes))
        new_size = img.size
        while min(new_size) > maxdim:
            new_size = (new_size[0] // 2, new_size[1] // 2)
        if new_size == img.size:
            return filename_or_bytes

        # The detected format must be read before resizing: the image
        # returned by resize is a new object which is not bound to a format.
        fmt = format or img.format
        if fmt is not None:
            fmt = {'jpg': 'jpeg'}.get(fmt.lower(), fmt)
        st = BytesIO()
        img.resize(new_size).save(st, format=fmt)
        return st.getvalue()

    raise TypeError("Unexpected type '{0}'".format(
        type(filename_or_bytes)))

59 

60 

def read_image(filename_or_bytes):
    """
    Loads an image with :epkg:`PIL`.

    @param      filename_or_bytes   filename (*str*) or image content (*bytes*)
    @return                         *Image* from :epkg:`PIL`

    A *TypeError* is raised for any other type.
    """
    if isinstance(filename_or_bytes, bytes):
        return Image.open(BytesIO(filename_or_bytes))
    if not isinstance(filename_or_bytes, str):
        raise TypeError("Unexpected type '{0}'".format(
            type(filename_or_bytes)))
    # A filename: the whole content is read, then decoded from memory.
    with open(filename_or_bytes, "rb") as stream:
        content = stream.read()
    return read_image(content)

77 

78 

def enumerate_image_class(folder, abspath=True, ext={'.jpg', '.png'}):  # pylint: disable=W0102
    """
    Walks through a folder and enumerates the images it contains,
    the name of the subfolder holding an image is used as its class.

    @param      folder      folder to explore (recursively)
    @param      abspath     if True, filenames are prefixed by *folder*,
                            otherwise they are relative to *folder*
    @param      ext         allowed extensions (with the dot, case sensitive)
    @return                 iterator on (filename, class)

    A *FileNotFoundError* is raised if *folder* does not exist.
    """
    if not os.path.exists(folder):
        raise FileNotFoundError("Unable to find '{0}'".format(folder))
    for current, _, filenames in os.walk(folder, topdown=False):
        for filename in filenames:
            if os.path.splitext(filename)[-1] in ext:
                prefix = current if abspath else os.path.relpath(
                    current, folder)
                path = os.path.join(prefix, filename)
                # the class is the last folder before the filename
                yield path, os.path.basename(os.path.dirname(path))

103 

104 

def histogram_image_size(folder, ext={'.jpg', '.png'}):  # pylint: disable=W0102
    """
    Counts how many images of a folder share the same size.

    @param      folder      folder explored with @see fn enumerate_image_class
    @param      ext         allowed extensions
    @return                 *Counter* mapping every image size
                            (attribute *size* of the image) to the number
                            of images having that size
    """
    sizes = (read_image(path).size
             for path, _ in enumerate_image_class(folder, ext=ext))
    return Counter(sizes)

118 

119 

def img2gray(img, mode='L'):
    """
    Converts an image (:epkg:`PIL`) into gray scale
    by calling its method *convert*.

    @param      img     see *Image* from :epkg:`PIL`
    @param      mode    conversion mode, ``'L'`` or ``'LA'``
    @return             see *Image* from :epkg:`PIL`
    """
    converted = img.convert(mode)
    return converted

129 

130 

def stream_apply_image_transform(src_folder, dest_folder, transform,  # pylint: disable=W0102
                                 ext={'.png', '.jpg'}, fLOG=None):
    """
    Applies a transform on every image found in a folder and
    saves the result in another folder with the same subfolders.

    @param      src_folder      source folder
    @param      dest_folder     destination folder, created if missing
    @param      transform       function, ``trans(img) -> img``
    @param      ext             image extensions to consider
    @param      fLOG            logging function, called every 1000 images
    @return                     iterator on the created filenames

    The function is a generator: an image is processed
    only when the next filename is requested.

    Example::

        list(stream_apply_image_transform(src, dest, lambda im: img2gray(im)))
    """
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    images = enumerate_image_class(src_folder, ext=ext, abspath=False)
    for index, (relname, label) in enumerate(images):
        if index % 1000 == 0 and fLOG is not None:
            fLOG(
                "[apply_image_transform] processing image {0}: '{1}' - class '{2}'".format(index, relname, label))
        loaded = read_image(os.path.join(src_folder, relname))
        subfolder, basename = os.path.split(relname)
        transformed = transform(loaded)
        target_folder = os.path.join(dest_folder, subfolder)
        if not os.path.exists(target_folder):
            os.makedirs(target_folder)
        target = os.path.join(target_folder, basename)
        transformed.save(target)
        yield target

166 

167 

def image_zoom(img, new_size, **kwargs):
    """
    Resizes an *image* (from :epkg:`PIL`) by calling its method *resize*.

    @param      img         :epkg:`PIL.Image`
    @param      new_size    size after zoom
    @param      kwargs      additional arguments sent to *resize*
    @return                 new image
    """
    zoomed = img.resize(new_size, **kwargs)
    return zoomed

179 

180 

def stream_image2features(src_folder, dest_folder, transform, batch_size=1000,  # pylint: disable=W0102
                          prefix="batch", ext={'.png', '.jpg'}, fLOG=None):
    """
    Considers all images in a folder, transforms them into
    features (function *transform*) and saves them
    with :epkg:`pickle` into :epkg:`numpy` arrays by batch.

    @param      src_folder      folder which contains images
    @param      dest_folder     destination of the batches, created if missing
    @param      transform       from image to features,
                                function, ``trans(img) -> numpy.array``
    @param      batch_size      number of images to save together
    @param      prefix          prefix name for the batch files
    @param      ext             list of extensions to process
    @param      fLOG            logging function, called once per saved batch
    @return                     iterator on the written files
                                (relative to *dest_folder*)

    The function yields a batch file when one is ready. It does
    not wait the end before returning all of them. The saved files
    contain a list of two arrays, first one for the features,
    second one for the classes. Features are merged
    with ``numpy.vstack``.

    Example::

        list(stream_image2features(this, temp,
                    lambda im: numpy.array(image_zoom(img2gray(im), (10, 12)))))
    """
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    def save_batch(nb, features, subs):
        "Pickles one batch and returns the name of the created file."
        conc = numpy.vstack(features)

        name = "%s%d.pkl" % (os.path.join(dest_folder, prefix), nb)
        # prefix may contain a subfolder
        fold = os.path.dirname(name)
        if not os.path.exists(fold):
            os.makedirs(fold)

        with open(name, "wb") as f:
            pickle.dump([conc, numpy.array(subs)], f)
        return name

    features = []
    subs = []
    nbatch = 0
    for ii, (img, sub) in enumerate(enumerate_image_class(src_folder, ext=ext, abspath=False)):
        i = read_image(os.path.join(src_folder, img))
        features.append(transform(i))
        subs.append(sub)

        if len(features) >= batch_size:
            filename = save_batch(nbatch, features, subs)
            # Logging happens before yielding, otherwise the message is
            # delayed until the consumer asks for the next batch.
            # ii is a 0-based index, ii + 1 images have been seen.
            if fLOG:
                fLOG(
                    "[stream_image2features] save file '{0}' - {1} seen images.".format(filename, ii + 1))
            yield os.path.relpath(filename, dest_folder)
            nbatch += 1
            features.clear()
            subs.clear()

    # last batch, possibly smaller than batch_size
    if len(features) > 0:
        filename = save_batch(nbatch, features, subs)
        if fLOG:
            fLOG("[stream_image2features] save file '{0}'.".format(filename))
        yield os.path.relpath(filename, dest_folder)

247 

248 

def load_batch_features(batch_file):
    """
    Reads back a batch file with :epkg:`pickle`,
    such as those saved by @see fn stream_image2features.

    @param      batch_file      batch file
    @return                     unpickled content (features, classes)

    :epkg:`pickle` can execute arbitrary code while loading,
    the function should only be used on trusted files.
    """
    with open(batch_file, "rb") as stream:
        content = pickle.load(stream)
    return content

258 

259 

def enumerate_batch_features(folder, batch_or_image=False):
    """
    Enumerates all batches (files ``*.pkl``) saved in a folder.

    @param      folder          folder where to find the batches
    @param      batch_or_image  False to enumerate the batch filenames
                                (relative to *folder*),
                                True to load every batch and
                                enumerate couples (features, class)
    @return                     enumerator

    Batches are processed in the alphabetical order of their names
    to make the enumeration deterministic.
    """
    for b in sorted(os.listdir(folder)):
        if os.path.splitext(b)[-1] != '.pkl':
            continue
        if batch_or_image:
            feat, cl = load_batch_features(os.path.join(folder, b))
            # cl is a sequence of classes, not a count:
            # features and classes are walked through together.
            for couple in zip(feat, cl):
                yield couple
        else:
            yield b

279 

280 

def stream_download_images(urls, dest_folder, fLOG=None, use_request=None,
                           skipif_done=True, dummys=None, skip=0):
    """
    Downloads images based on their urls.

    @param      urls            filename (one url per line) or list of urls
    @param      dest_folder     destination folder, created if missing
    @param      fLOG            logging function
    @param      use_request     None or False to use :epkg:`urllib3`,
                                True to use :epkg:`*py:urllib:request`
    @param      skipif_done     skip if the image was already downloaded
    @param      dummys          some website returns a dummy image to tell there is no
                                image at this specific address, if an image is part
                                of this set of images (flattened arrays),
                                it is ignored, if the value is None,
                                it is replaced by a default set of images
    @param      skip            skip the first images
    @return                     enumerator on created files
                                (names relative to *dest_folder*)

    The function continues if a download fails or if the downloaded
    content cannot be read as an image.
    Use ``fLOG=print`` to see which url failed.
    Parameter *dummys* can be set to avoid images like
    the following:

    .. image:: empty.jpg
        :width: 100

    The function does not download an image already downloaded
    but still yields it. An *ImportError* is raised if
    :epkg:`urllib3` is requested but not installed.
    """
    if isinstance(urls, str):
        with open(urls, "r", encoding='utf-8') as f:
            urls = [line.strip("\n\r\t ") for line in f]
        urls = [u for u in urls if u]

    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    # connection pool to release at the end (urllib3 only)
    http = None

    if use_request:
        from urllib.request import Request, urlopen
        from urllib.error import URLError

        def download(url, fLOG):
            "Returns the content of the url or None if it failed."
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    with urlopen(Request(url), timeout=10) as f:
                        try:
                            return f.read()
                        except Exception as e:  # pylint: disable=W0703
                            if fLOG:
                                fLOG(
                                    "[stream_download_images] error {0} for url '{1}'.".format(e, url))
                            return None
                except URLError as e:
                    if fLOG:
                        fLOG(
                            "[stream_download_images] fails for url '{0}' due to {1}.".format(url, e))
                    return None
    else:
        import urllib3
        from urllib3.exceptions import HTTPError

        http = urllib3.PoolManager(
            timeout=urllib3.Timeout(connect=2.0, read=8.0))

        def download(url, fLOG):
            "Returns the content of the url or None if it failed."
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    r = http.request('GET', url)
                    if r.status == 200:
                        return r.data
                    if fLOG:
                        fLOG("[stream_download_images] error {0} for url '{1}'.".format(
                            r.status, url))
                    return None
                except HTTPError as e:
                    if fLOG:
                        fLOG(
                            "[stream_download_images] fails for url '{0}' due to {1}.".format(url, e))
                    return None

    try:
        if dummys is None:
            this = os.path.dirname(__file__)
            dummys = [
                numpy.array(read_image(os.path.join(this, "empty.jpg"))).ravel(),
            ]

        for i, url in enumerate(urls):
            if i < skip:
                continue
            if fLOG and i % 100 == 0:
                fLOG(
                    "[stream_download_images] ... {0}/{1}: '{2}'".format(i + 1, len(urls), url))

            # local name: last part of the url without unwanted characters
            name = url.split('/')[-1].replace("..", ".")
            for c in "?:%":
                name = name.replace(c, "")
            ext = os.path.splitext(name)[-1]
            if len(ext) not in (4, 5) or ext[0] != '.':
                if fLOG:
                    fLOG(
                        "[stream_download_images] wrong filename for url '{0}'.".format(url))
                continue

            dest = os.path.join(dest_folder, name)
            if skipif_done and os.path.exists(dest):
                yield name
                continue

            try:
                data = download(url, fLOG)
            except UnicodeEncodeError as e:
                if fLOG:
                    fLOG(
                        "[stream_download_images] fails for url '{0}' due to {1}.".format(url, e))
                continue
            if data is None:
                continue

            try:
                img = read_image(data)
            except Exception as e:  # pylint: disable=W0703
                if fLOG:
                    fLOG("[stream_download_images] cannot load image for url '{0}' due to {1}".format(
                        url, e))
                continue

            # the image is dropped if it is equal to one of the dummy images
            imgr = numpy.array(img).ravel()
            ok = True
            for idu, dummy in enumerate(dummys):
                if imgr.shape != dummy.shape:
                    continue
                if numpy.max(numpy.abs(dummy - imgr)) == 0:
                    ok = False
                    if fLOG:
                        fLOG("[stream_download_images] empty image '{0}' equal to dummy {1}".format(
                            url, idu))
                    break

            if ok:
                with open(dest, "wb") as f:
                    f.write(data)
                yield name
    finally:
        # Runs as well when the generator is closed before the end
        # or when an exception interrupts the enumeration.
        if http is not None:
            http.clear()

450 

451 

def stream_copy_images(src_folder, dest_folder, valid, ext={'.jpg', '.png'}, fLOG=None):  # pylint: disable=W0102
    """
    Copies the images of *src_folder* into *dest_folder*
    if *valid(name)* is True, subfolders are preserved.

    @param      src_folder      source folder
    @param      dest_folder     destination folder
    @param      valid           function ``valid(name) -> bool``,
                                *name* is relative to *src_folder*
    @param      ext             allowed extensions
    @param      fLOG            logging function, called every 1000 images
    @return                     iterator on copied files
    """
    candidates = enumerate_image_class(src_folder, ext=ext, abspath=False)
    for position, (relname, label) in enumerate(candidates):
        if position % 1000 == 0 and fLOG is not None:
            fLOG(
                "[stream_copy_images] copy image {0}: '{1}' - class '{2}'".format(position, relname, label))
        if valid(relname):
            target = os.path.join(dest_folder, relname)
            parent = os.path.dirname(target)
            if not os.path.exists(parent):
                os.makedirs(parent)
            shutil.copy(os.path.join(src_folder, relname), target)
            yield target

476 

477 

def stream_random_sample(folder, n=1000, seed=None, abspath=True,  # pylint: disable=W0102
                         ext={'.jpg', '.png'}):
    """
    Extracts a random sample from a folder which contains many images.
    Relies on function @see fn enumerate_image_class.

    @param      folder      folder
    @param      n           number of requested images
    @param      seed        seed
    @param      abspath     use absolute paths
    @param      ext         allowed extensions
    @return                 iterator on the successive states of the sample,
                            a list of (filename, class)

    The function is a streaming function, it yields the current
    state of a sample drawn with the :epkg:`reservoir sampling`
    algorithm after every new image. The same list is yielded
    and updated all along, it must be copied to keep an intermediate state.
    Function @see fn last_element returns the final sample.

    .. runpython::
        :showcode:

        import os
        from ensae_projects.hackathon.image_helper import stream_random_sample, last_element

        this = os.path.join(os.path.dirname(__file__), '..')
        res = last_element(stream_random_sample(this, abspath=False, ext={'.py', '.rst', '.pyc'}))
        print(res)
    """
    sample = []
    state = RandomState(seed=seed)
    for i, (img, sub) in enumerate(enumerate_image_class(folder, abspath=abspath, ext=ext)):
        if len(sample) < n:
            sample.append((img, sub))
        else:
            # The index must be uniformly drawn in [0, i] (i included)
            # for every image to be kept with the same probability,
            # the upper bound of randint is excluded.
            j = state.randint(0, i + 1)
            if j < n:
                sample[j] = (img, sub)
        yield sample

515 

516 

def last_element(iter):  # pylint: disable=W0622
    """
    Consumes an iterator or a generator
    and returns the last element it produced.

    @param      iter        iterator or generator
    @return                 last element, None if there is none

    .. runpython::
        :showcode:

        from ensae_projects.hackathon.image_helper import last_element

        def gen():
            for i in range(10):
                yield "A%d" % i

        print(last_element(gen()))
    """
    result = None
    for item in iter:
        result = item
    return result

540 

541 

def plot_gallery_random_images(folder, n=12, seed=None,  # pylint: disable=W0102
                               ext={'.jpg', '.png'}, **kwargs):
    """
    Plots a gallery of images using :epkg:`matplotlib`.
    Draws a random sample of images from a folder
    with @see fn stream_random_sample, then
    calls :epkg:`plot_gallery_images` to build the gallery.

    @param      folder      folder
    @param      n           number of requested images
    @param      seed        seed
    @param      ext         allowed extensions
    @param      kwargs      arguments sent to :epkg:`plot_gallery_images`
    @return                 tuple (ax, random sample), every element of
                            the sample is (text, filename, class)

    The text displayed for an image is the first four characters
    of the md5 hash of its filename.
    """
    from mlinsights.plotting import plot_gallery_images

    def short_hash(text):
        "First four hexadecimal characters of the md5 hash."
        return hashlib.md5(text.encode('utf-8')).hexdigest()[:4]

    sample = last_element(stream_random_sample(
        folder, n=n, seed=seed, abspath=False, ext=ext))
    paths = [os.path.join(folder, item[0]) for item in sample]
    if isinstance(paths[0], str):
        labels = [short_hash(path) for path in paths]
    else:
        labels = [str(k) for k in range(len(paths))]
    ax = plot_gallery_images(paths, labels, **kwargs)
    return ax, [(label,) + item for label, item in zip(labels, sample)]

580 

581 

def folder_split_train_test(src_folder, dest_train, dest_test, seed=None,  # pylint: disable=W0102
                            ext={'.jpg', '.png'}, test_size=0.25):
    """
    Splits the images of a folder into a train set and a test set
    and copies them into two separate folders.

    @param      src_folder  source folder
    @param      dest_train  destination folder for the train set
    @param      dest_test   destination folder for the test set
    @param      seed        random seed
    @param      ext         desired extensions
    @param      test_size   test ratio
    @return                 list of copied files in a 2-uple (train, test)

    The function relies on @see fn enumerate_image_class
    to extract the images from folder *src_folder*.
    The subfolder is used to perform a stratified split.
    """
    def copy_all(names, target_root):
        "Copies images into *target_root* and keeps the subfolders."
        done = []
        for name in names:
            target = os.path.join(target_root, name)
            parent = os.path.dirname(target)
            if not os.path.exists(parent):
                os.makedirs(parent)
            shutil.copy(os.path.join(src_folder, name), target)
            done.append(target)
        return done

    listing = pandas.DataFrame(
        data=list(enumerate_image_class(src_folder, abspath=False, ext=ext)),
        columns=["name", "sub"])
    train_part, test_part = train_test_split(
        listing, test_size=test_size, random_state=seed,
        shuffle=True, stratify=listing['sub'])
    return (copy_all(train_part["name"], dest_train),
            copy_all(test_part["name"], dest_test))