Coverage for src/ensae_projects/hackathon/image_helper.py: 75%
303 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Helpers for the hackathon 2017 (Label Emmaüs).
5"""
6import os
7from io import BytesIO
8from collections import Counter
9import hashlib
10import pickle
11import warnings
12import shutil
13import numpy
14from numpy.random import RandomState
15import pandas
16from PIL import Image
17from sklearn.model_selection import train_test_split
def resize_image(filename_or_bytes, maxdim=512, dest=None, format=None):  # pylint: disable=W0622
    """
    Resizes an image by dividing both dimensions by two until the
    smallest one is not greater than *maxdim*.

    @param      filename_or_bytes   filename or bytes
    @param      maxdim              upper bound for the smallest dimension,
                                    must be at least 1
    @param      dest                destination file if *filename_or_bytes* is a filename,
                                    the source file is overwritten if None
    @param      format              saved image format (if *filename_or_bytes* is bytes),
                                    if None or empty, the format detected by :epkg:`PIL`
                                    while opening the image is used
    @return                         None for a filename, bytes for bytes
                                    (the input itself if no resizing was needed)

    The function raises *TypeError* if *filename_or_bytes* is neither
    a string nor bytes, *ValueError* if *maxdim* is lower than 1.
    """
    if not isinstance(filename_or_bytes, (str, bytes)):
        raise TypeError("Unexpected type '{0}'".format(
            type(filename_or_bytes)))
    if maxdim < 1:
        # Halving never goes below 0: the loop would either never stop
        # or request an image with a null dimension.
        raise ValueError(
            "maxdim must be at least 1, not {0}".format(maxdim))

    if isinstance(filename_or_bytes, str):
        ext = os.path.splitext(filename_or_bytes)[-1][1:]
        with open(filename_or_bytes, "rb") as f:
            content = f.read()
        resized = resize_image(content, maxdim=maxdim, format=ext)
        if dest is None:
            dest = filename_or_bytes
        with open(dest, "wb") as f:
            f.write(resized)
        return None

    img = Image.open(BytesIO(filename_or_bytes))
    new_size = img.size
    while min(new_size) > maxdim:
        new_size = (new_size[0] // 2, new_size[1] // 2)
    if new_size == img.size:
        return filename_or_bytes

    # No explicit format (default value or file without extension):
    # keep the format of the source image.
    fmt = format or img.format
    mapping = {'jpg': 'jpeg'}
    st = BytesIO()
    img.resize(new_size).save(st, format=mapping.get(fmt.lower(), fmt))
    return st.getvalue()
def read_image(filename_or_bytes):
    """
    Opens an image given as a filename or as raw bytes.

    @param      filename_or_bytes   filename or bytes
    @return                         *Image* from :epkg:`PIL`

    A file is entirely read in memory before the image is opened.
    The function raises *TypeError* for any other type.
    """
    if isinstance(filename_or_bytes, bytes):
        return Image.open(BytesIO(filename_or_bytes))
    if isinstance(filename_or_bytes, str):
        with open(filename_or_bytes, "rb") as stream:
            content = stream.read()
        return read_image(content)
    raise TypeError("Unexpected type '{0}'".format(
        type(filename_or_bytes)))
def enumerate_image_class(folder, abspath=True, ext={'.jpg', '.png'}):  # pylint: disable=W0102
    """
    Walks through a folder and enumerates its images, the name of
    the subfolder holding an image is considered as its class.

    @param      folder      folder
    @param      abspath     if True, filenames are prefixed by the walked
                            directory (absolute only if *folder* is),
                            otherwise they are relative to *folder*
    @param      ext         allowed extensions (case sensitive, with the dot)
    @return                 iterator on (filename, class)

    The function raises *FileNotFoundError* if *folder* does not exist,
    the exception is raised when the first element is requested.
    """
    if not os.path.exists(folder):
        raise FileNotFoundError("Unable to find '{0}'".format(folder))
    for root, _, files in os.walk(folder, topdown=False):
        prefix = root if abspath else os.path.relpath(root, folder)
        for filename in files:
            if os.path.splitext(filename)[-1] not in ext:
                continue
            path = os.path.join(prefix, filename)
            # the class is the last component of the directory of the image
            label = os.path.split(os.path.split(path)[0])[-1]
            yield path, label
def histogram_image_size(folder, ext={'.jpg', '.png'}):  # pylint: disable=W0102
    """
    Counts how many images of a folder share the same size.

    @param      folder      folder
    @param      ext         allowed extensions
    @return                 *Counter* mapping the size of an image
                            (attribute *size* of the opened image)
                            to the number of images having that size
    """
    sizes = Counter()
    for filename, _ in enumerate_image_class(folder, ext=ext):
        sizes[read_image(filename).size] += 1
    return sizes
def img2gray(img, mode='L'):
    """
    Converts an image (:epkg:`PIL`) to gray scale by calling
    its method *convert*.

    @param      img         see *Image* from :epkg:`PIL`
    @param      mode        ``'L'`` or ``'LA'``
    @return                 see *Image* from :epkg:`PIL`
    """
    gray = img.convert(mode)
    return gray
def stream_apply_image_transform(src_folder, dest_folder, transform,  # pylint: disable=W0102
                                 ext={'.png', '.jpg'}, fLOG=None):
    """
    Applies a transform on every image of a folder and saves
    the result in another folder with the same subfolders.

    @param      src_folder      source folder
    @param      dest_folder     destination folder, created if missing
    @param      transform       function, ``trans(img) -> img``
    @param      ext             image extensions to consider
    @param      fLOG            logging function, called every 1000 images
    @return                     iterator on the created filenames

    The function is a generator: nothing happens until it is iterated,
    every created filename is yielded as soon as the image is saved.

    Example::

        list(stream_apply_image_transform(src, dest, lambda im: img2gray(im)))
    """
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    images = enumerate_image_class(src_folder, ext=ext, abspath=False)
    for index, (relname, label) in enumerate(images):
        if fLOG is not None and index % 1000 == 0:
            fLOG(
                "[apply_image_transform] processing image {0}: '{1}' - class '{2}'".format(index, relname, label))
        source = read_image(os.path.join(src_folder, relname))
        subfolder, basename = os.path.split(relname)
        transformed = transform(source)
        # mirrors the subfolder of the source image
        target_folder = os.path.join(dest_folder, subfolder)
        if not os.path.exists(target_folder):
            os.makedirs(target_folder)
        target = os.path.join(target_folder, basename)
        transformed.save(target)
        yield target
def image_zoom(img, new_size, **kwargs):
    """
    Resizes an *image* (from :epkg:`PIL`) by calling its method *resize*.

    @param      img         :epkg:`PIL.Image`
    @param      new_size    size after zoom
    @param      kwargs      additional arguments forwarded to *resize*
    @return                 new image
    """
    zoomed = img.resize(new_size, **kwargs)
    return zoomed
def stream_image2features(src_folder, dest_folder, transform, batch_size=1000,  # pylint: disable=W0102
                          prefix="batch", ext={'.png', '.jpg'}, fLOG=None):
    """
    Considers all images in a folder, transforms them into
    features (function *transform*) and saves them
    with :epkg:`pickle` into :epkg:`numpy` arrays by batch.

    @param      src_folder      folder which contains images
    @param      dest_folder     destination of the batches, created if missing
    @param      transform       from image to features,
                                function, ``trans(img) -> numpy.array``
    @param      batch_size      number of images to save together
    @param      prefix          prefix name for the batch files
    @param      ext             list of extensions to process
    @param      fLOG            logging function
    @return                     list of written files (iterator),
                                relative to *dest_folder*

    The function yields a batch file when one is ready. It does
    not wait the end before returning all of them. A saved file
    contains a list of two arrays, the features of the batch
    stacked with ``numpy.vstack``, then the classes.

    Example::

        list(stream_image2features(this, temp,
                    lambda im: numpy.array(image_zoom(img2gray(im), (10, 12)))))
    """
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    def save_batch(nb, features, subs):
        "Pickles one batch ``[features, classes]`` and returns the filename."
        conc = numpy.vstack(features)

        name = "%s%d.pkl" % (os.path.join(dest_folder, prefix), nb)
        # prefix may contain a subfolder
        fold = os.path.dirname(name)
        if not os.path.exists(fold):
            os.makedirs(fold)

        with open(name, "wb") as f:
            pickle.dump([conc, numpy.array(subs)], f)
        return name

    features = []
    subs = []
    nbatch = 0
    images = enumerate_image_class(src_folder, ext=ext, abspath=False)
    # *seen* starts at 1 so that the log tells the number of processed images
    for seen, (img, sub) in enumerate(images, start=1):
        features.append(transform(read_image(os.path.join(src_folder, img))))
        subs.append(sub)

        if len(features) >= batch_size:
            filename = save_batch(nbatch, features, subs)
            # logs before yielding, the consumer may never resume the generator
            if fLOG:
                fLOG(
                    "[stream_image2features] save file '{0}' - {1} seen images.".format(filename, seen))
            nbatch += 1
            features.clear()
            subs.clear()
            yield os.path.relpath(filename, dest_folder)

    if features:
        # remaining images, fewer than batch_size
        filename = save_batch(nbatch, features, subs)
        if fLOG:
            fLOG("[stream_image2features] save file '{0}'.".format(filename))
        yield os.path.relpath(filename, dest_folder)
def load_batch_features(batch_file):
    """
    Loads a batch file saved by @see fn stream_image2features.

    @param      batch_file      batch file
    @return                     unpickled content of the file
                                (features, classes for a file
                                written by @see fn stream_image2features)

    The file is read with :epkg:`pickle` which can execute arbitrary
    code: only load files coming from a trusted source.
    """
    with open(batch_file, "rb") as stream:
        content = pickle.load(stream)
    return content
def enumerate_batch_features(folder, batch_or_image=False):
    """
    Enumerates all batches saved in a folder.

    @param      folder          folder where to find the batches,
                                only files with extension ``.pkl`` are considered
    @param      batch_or_image  False to enumerate filenames (relative to *folder*),
                                True for couple (features, class)
    @return                     enumerator

    Batches are processed in the order given by ``os.listdir``.
    """
    for b in os.listdir(folder):
        if os.path.splitext(b)[-1] != '.pkl':
            continue
        if not batch_or_image:
            yield b
            continue
        feat, cl = load_batch_features(os.path.join(folder, b))
        # pairs one row of features with one class
        # NOTE(review): assumes the batch holds one row of features
        # per class, confirm against the transform used to build it
        for row, label in zip(feat, cl):
            yield row, label
def stream_download_images(urls, dest_folder, fLOG=None, use_request=None,
                           skipif_done=True, dummys=None, skip=0):
    """
    Downloads images based on their urls.

    @param      urls            filename (one url per line, utf-8) or list of urls
    @param      dest_folder     destination folder, created if missing
    @param      fLOG            logging function
    @param      use_request     True to use :epkg:`*py:urllib:request`,
                                False or None to use :epkg:`urllib3`
    @param      skipif_done     skip if the image was already downloaded
    @param      dummys          some website returns a dummy image to tell there is no
                                image at this specific address, if an image is part
                                of this set of images, it is ignored,
                                if the value is None, it is replaced by a default set
                                of images
    @param      skip            skip the first images
    @return                     enumerator on created files
                                (names relative to *dest_folder*)

    The function continue if an error occurs.
    Use ``fLOG=print`` to see which url failed.
    Parameter *dummys* can be set to avoid images like
    the following:

    .. image:: empty.jpg
        :width: 100

    The function does not download an image already downloaded
    but still yields it. Warnings raised while downloading are ignored.
    The function raises *ImportError* if :epkg:`urllib3` is requested
    but not installed.
    """
    if isinstance(urls, str):
        with open(urls, "r", encoding='utf-8') as f:
            urls = [line.strip("\n\r\t ") for line in f]
        urls = [u for u in urls if u]

    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    pool = None
    if use_request:
        from urllib.request import Request, urlopen
        from urllib.error import URLError

        def download(url, fLOG):
            "Returns the content of *url* or None if it failed."
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    with urlopen(Request(url), timeout=10) as f:
                        try:
                            return f.read()
                        except Exception as e:  # pylint: disable=W0703
                            if fLOG:
                                fLOG(
                                    "[stream_download_images] error {0} for url '{1}'.".format(e, url))
                            return None
                except URLError as e:
                    if fLOG:
                        fLOG(
                            "[stream_download_images] fails for url '{0}' due to {1}.".format(url, e))
                    return None
    else:
        import urllib3
        from urllib3.exceptions import HTTPError

        timeout = urllib3.Timeout(connect=2.0, read=8.0)
        pool = urllib3.PoolManager(timeout=timeout)

        def download(url, fLOG):
            "Returns the content of *url* or None if it failed."
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    r = pool.request('GET', url)
                    if r.status == 200:
                        return r.data
                    if fLOG:
                        fLOG("[stream_download_images] error {0} for url '{1}'.".format(
                            r.status, url))
                    return None
                except HTTPError as e:
                    if fLOG:
                        fLOG(
                            "[stream_download_images] fails for url '{0}' due to {1}.".format(url, e))
                    return None

    if dummys is None:
        this = os.path.dirname(__file__)
        dummys = [
            numpy.array(read_image(os.path.join(this, "empty.jpg"))).ravel(),
        ]

    try:
        for i, url in enumerate(urls):
            if i < skip:
                continue
            if fLOG and i % 100 == 0:
                fLOG(
                    "[stream_download_images] ... {0}/{1}: '{2}'".format(i + 1, len(urls), url))

            # the filename is the last part of the url without
            # the characters '?', ':', '%'
            name = url.split('/')[-1].replace("..", ".")
            for c in "?:%":
                name = name.replace(c, "")
            ext = os.path.splitext(name)[-1]
            if len(ext) not in (4, 5) or ext[0] != '.':
                if fLOG:
                    fLOG(
                        "[stream_download_images] wrong filename for url '{0}'.".format(url))
                continue

            dest = os.path.join(dest_folder, name)
            if skipif_done and os.path.exists(dest):
                yield name
                continue

            try:
                data = download(url, fLOG)
            except UnicodeEncodeError as e:
                if fLOG:
                    fLOG(
                        "[stream_download_images] fails for url '{0}' due to {1}.".format(url, e))
                continue
            if data is None:
                continue

            try:
                img = read_image(data)
            except Exception as e:  # pylint: disable=W0703
                if fLOG:
                    fLOG("[stream_download_images] cannot load image for url '{0}' due to {1}".format(
                        url, e))
                continue

            # an image equal to one of the dummy images is not saved
            imgr = numpy.array(img).ravel()
            ok = True
            for idu, dummy in enumerate(dummys):
                if imgr.shape != dummy.shape:
                    continue
                if numpy.array_equal(dummy, imgr):
                    ok = False
                    if fLOG:
                        fLOG("[stream_download_images] empty image '{0}' equal to dummy {1}".format(
                            url, idu))
                    break

            if ok:
                with open(dest, "wb") as f:
                    f.write(data)
                yield name
    finally:
        # releases the connections even if the generator is closed
        # before the last url
        if pool is not None:
            pool.clear()
def stream_copy_images(src_folder, dest_folder, valid, ext={'.jpg', '.png'}, fLOG=None):  # pylint: disable=W0102
    """
    Copies every image from *src_folder* to *dest_folder*
    if *valid(name)* is True, subfolders are preserved.

    @param      src_folder      source folder
    @param      dest_folder     destination folder
    @param      valid           function ``valid(name) -> bool``,
                                *name* is relative to *src_folder*
    @param      ext             allowed extensions
    @param      fLOG            logging function, called every 1000 images
    @return                     iterator on copied files
    """
    images = enumerate_image_class(src_folder, ext=ext, abspath=False)
    for index, (relname, label) in enumerate(images):
        if fLOG is not None and index % 1000 == 0:
            fLOG(
                "[stream_copy_images] copy image {0}: '{1}' - class '{2}'".format(index, relname, label))
        if valid(relname):
            target = os.path.join(dest_folder, relname)
            parent = os.path.dirname(target)
            if not os.path.exists(parent):
                os.makedirs(parent)
            shutil.copy(os.path.join(src_folder, relname), target)
            yield target
def stream_random_sample(folder, n=1000, seed=None, abspath=True,  # pylint: disable=W0102
                         ext={'.jpg', '.png'}):
    """
    Extracts a random sample from a folder which contains many images.
    Relies on fonction @see fn enumerate_image_class.

    @param      folder      folder
    @param      n           number of requested images
    @param      seed        seed
    @param      abspath     use absolute paths
    @param      ext         allowed extensions
    @return                 list of (filename, class)

    The function is a streaming function, it yields the current
    state of a sample drawn with the :epkg:`reservoir sampling`
    algorithm after every image. The same list is yielded
    and updated in place every time. It can be combined with
    @see fn last_element to only get the final sample.

    .. runpython::
        :showcode:

        import os
        from ensae_projects.hackathon.image_helper import stream_random_sample, last_element

        this = os.path.join(os.path.dirname(__file__), '..')
        res = last_element(stream_random_sample(this, abspath=False, ext={'.py', '.rst', '.pyc'}))
        print(res)
    """
    sample = []
    state = RandomState(seed=seed)
    for i, (img, sub) in enumerate(enumerate_image_class(folder, abspath=abspath, ext=ext)):
        if len(sample) < n:
            sample.append((img, sub))
        else:
            # The upper bound of randint is excluded: i + 1 draws j
            # uniformly in [0, i], the current image (the (i+1)-th one)
            # then enters the sample with probability n / (i + 1).
            j = state.randint(0, i + 1)
            if j < n:
                sample[j] = (img, sub)
        yield sample
def last_element(iter):  # pylint: disable=W0622
    """
    Consumes an iterator or a generator and returns
    the last element it produced.

    @param      iter        iterator or generator
    @return                 last element, None if nothing was produced

    .. runpython::
        :showcode:

        from ensae_projects.hackathon.image_helper import last_element

        def gen():
            for i in range(10):
                yield "A%d" % i

        print(last_element(gen()))
    """
    result = None
    for item in iter:
        result = item
    return result
def plot_gallery_random_images(folder, n=12, seed=None,  # pylint: disable=W0102
                               ext={'.jpg', '.png'}, **kwargs):
    """
    Plots a gallery of images using :epkg:`matplotlib`.
    Extracts a random sample from a folder which contains many images.
    Relies on fonction @see fn stream_random_sample.
    Calls :epkg:`plot_gallery_images` to build the gallery.

    @param      folder      folder
    @param      n           number of requested images
    @param      seed        seed
    @param      ext         allowed extensions
    @param      kwargs      arguments forwarded to :epkg:`plot_gallery_images`
    @return                 tuple (ax, random sample), every element of
                            the sample is a tuple (text, filename, class),
                            *filename* is relative to *folder*

    Every image is labelled with the first four characters of the md5
    hash of its path. The function raises *ValueError* if no image
    was sampled.
    """
    def hash_md5(name):
        "Short label for an image, not used for any security purpose."
        return hashlib.md5(name.encode('utf-8')).hexdigest()[:4]

    from mlinsights.plotting import plot_gallery_images
    rnd = last_element(stream_random_sample(
        folder, n=n, seed=seed, abspath=False, ext=ext))
    if not rnd:
        # None if the folder has no image, empty if n is 0
        raise ValueError(
            "No image to plot from folder '{0}' (n={1}).".format(folder, n))
    imgs = [os.path.join(folder, item[0]) for item in rnd]
    texts = [hash_md5(nm) for nm in imgs]
    ax = plot_gallery_images(imgs, texts, **kwargs)
    rnd = [(t,) + cu for t, cu in zip(texts, rnd)]
    return ax, rnd
def folder_split_train_test(src_folder, dest_train, dest_test, seed=None,  # pylint: disable=W0102
                            ext={'.jpg', '.png'}, test_size=0.25):
    """
    Splits the images of a folder into a train set and a test set,
    every set is copied into its own folder.

    @param      src_folder  source folder
    @param      dest_train  destination folder for the train set
    @param      dest_test   destination folder for the test set
    @param      seed        random seed
    @param      ext         desired extensions
    @param      test_size   test ratio
    @return                 list of copied files in a 2-uple (train, test)

    The function relies on @see fn enumerate_image_class
    to extract the images from folder *src_folder*.
    The subfolder is used to perform a stratified split,
    subfolders are preserved in both destinations.
    """
    listing = list(enumerate_image_class(src_folder, abspath=False, ext=ext))
    frame = pandas.DataFrame(data=listing, columns=["name", "sub"])
    train_part, test_part = train_test_split(
        frame, test_size=test_size, random_state=seed,
        shuffle=True, stratify=frame['sub'])

    def copy_all(names, target_root):
        "Copies the images into *target_root*, returns the created files."
        written = []
        for rel in names:
            target = os.path.join(target_root, rel)
            parent = os.path.dirname(target)
            if not os.path.exists(parent):
                os.makedirs(parent)
            shutil.copy(os.path.join(src_folder, rel), target)
            written.append(target)
        return written

    train_files = copy_all(train_part["name"], dest_train)
    test_files = copy_all(test_part["name"], dest_test)
    return train_files, test_files