import mermaid from ''; mermaid.initialize({ startOnLoad: true });
Material for the hackathon ENSAE / BRGM / 2018. Les images sont extraites de tweets, mais une même image est souvent republiée sans que le tweet soit un retweet : le jeu de données contient donc des doublons.
%matplotlib inline
import matplotlib.pyplot as plt
from jyquickhelper import add_notebook_menu
add_notebook_menu()
Pour le challenge, il faut repérer les doublons dans les images. Pour cela, je réduis chaque image à un carré 50x50 en noir et blanc, puis j'applique une ACP suivie des k plus proches voisins pour détecter les doublons.
from ensae_projects.hackathon.image_helper import apply_image_transform, image_zoom, img2gray

# Input images and the folder receiving their transformed copies.
folder = "c:/temp/suricatenat_images"
dest_folder = "img5050"


def _to_thumbnail(img):
    """Apply img2gray to *img*, then image_zoom with a (50, 50) target."""
    return image_zoom(img2gray(img), (50, 50))


# list() consumes whatever apply_image_transform returns
# (presumably a lazy stream - confirm in image_helper).
list(apply_image_transform(folder, dest_folder, _to_thumbnail, fLOG=print))
Pas utilisé par la suite.
from ensae_projects.hackathon.image_helper import stream_image2features
import numpy
dest_folder = "img5050"
dest_batch = "batch"

# Exhaust the stream: the yielded values are ignored, only the work done
# while iterating matters here.
for _batch in stream_image2features(dest_folder, dest_batch, numpy.array, fLOG=print):
    pass
%matplotlib inline
from ensae_projects.hackathon.image_knn import ImageNearestNeighbors
# Fit the nearest-neighbours model on the images stored in `folder`;
# progress messages are sent to print through fLOG.
folder = "img5050"
knn = ImageNearestNeighbors()
knn.fit(folder, fLOG=print)
[ImageNearestNeighbors] processing image 0: 'inondation_2016\735614357036519425_CjVtTTrUoAAUUZp.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 1000: 'inondation_2016\737596119933321217_Cjx3w1FVAAAyyY1.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 2000: 'inondation_2016\737891662077255685_Cj2EjjXWUAA8Dhq.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 3000: 'inondation_2016\738050337521709056_Cj4UpFDUoAIR2gD.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 4000: 'inondation_2016\738283056302313472_Cj7oe7VWkAAPwAT.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 5000: 'inondation_2016\738366585526718464_Cj80fFNXEAAx9T2.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 6000: 'inondation_2016\738439428159377408_Cj92vvAUYAARP2A.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 7000: 'inondation_2016\738629637845221376_CkAjvUFVAAErbJF.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 8000: 'inondation_2016\738695722296614912_CkBf1CbXIAAonp1.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 9000: 'inondation_2016\738766013416787968_CkCfqR3XIAEuX8m.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 10000: 'inondation_2016\738894521304526849_CkEUnRhW0AEh1e1.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 11000: 'inondation_2016\739101985295728640_CkHRVZ-WUAAyCrp.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 12000: 'inondation_2016\739400457899114496_CkLgzBAWkAE9hCa.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 13000: 'inondation_2016\739732522427424768_CkQOztKWYAAlOul.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 14000: 'inondation_2016\740054590863859712_CkUzuikWgAAJUTK.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 15000: 
'inondation_2016\740416207296299008_CkZ8nAnWYAAG7cC.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 16000: 'inondation_2016\740833843914153985_Ckf4dIFWEAANSOT.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 17000: 'inondation_2016\742361701924937728_Ck1mBsLWkAE6EQX.jpg' - class 'inondation_2016' [ImageNearestNeighbors] processing image 18000: 'inondation_2018\955391968712019968_DUI76ywW4AA2J1b.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 19000: 'inondation_2018\956216357934325761_LKmRQ9hLmVxOkWtm.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 20000: 'inondation_2018\957254473604268032_DUjZ2vSWkAAdzd2.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 21000: 'inondation_2018\959020320320565248_DU8fYlpX4AAZIRV.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 22000: 'inondation_2018\964034081381109761_DWDv4vHWsAAMQIS.jpg' - class 'inondation_2018' [ImageNearestNeighbors] processing image 23000: 'seisme_Amatrice\768290329543995392_MwkGcfSrCBzWbxwK.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 24000: 'seisme_Amatrice\768326333034364928_CqmktbfXEAAw2RU.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 25000: 'seisme_Amatrice\768345861646581760_Cqm2eUjWYAAWS68.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 26000: 'seisme_Amatrice\768361403522646016_CqnEgFrWcAANqdO.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 27000: 'seisme_Amatrice\768374709645967361_CqnQt96XEAAew2V.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 28000: 'seisme_Amatrice\768387852862455810_CqncoxLWYAAulnb.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 29000: 'seisme_Amatrice\768401257769865216_CqnlYItWAAAbc7p.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 30000: 
'seisme_Amatrice\768417849652027394_Cqnz5_gXgAAx967.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 31000: 'seisme_Amatrice\768433724564377600_CqoGZC4WAAEh8zG.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 32000: 'seisme_Amatrice\768451168372662272_CqoWQCGW8AQIbHV.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 33000: 'seisme_Amatrice\768468307288743936_Cqol1cDXgAEychm.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 34000: 'seisme_Amatrice\768488406091386880_Cqo4H6GWIAAr4YP.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 35000: 'seisme_Amatrice\768511762429800448_CqpNXTxXYAATvNk.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 36000: 'seisme_Amatrice\768543842845032448_CqplczAWIAAhINz.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 37000: 'seisme_Amatrice\768647190260518912_CqrIhKnUkAAyvf6.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 38000: 'seisme_Amatrice\768716815279063040_CqsH3mqUEAA6gXD.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 39000: 'seisme_Amatrice\768743738634080256_CqsgWwgWcAE0rWO.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 40000: 'seisme_Amatrice\768772807568351232_Cqs6ORfWIAAXlf8.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 41000: 'seisme_Amatrice\768804543748575232_CqtXniSXYAAp7Tt.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 42000: 'seisme_Amatrice\768843712357076993_Cqt7R_1WYAE6tr6.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 43000: 'seisme_Amatrice\768901703898771456_Cquv7mKWgAAn6ZX.jpg' - class 'seisme_Amatrice' [ImageNearestNeighbors] processing image 44000: 'suricatenat_inondation_aude\1052220109740228608_Dpo8nOhXgAYLNEm.jpg' - class 'suricatenat_inondation_aude'
from itertools import islice

from ensae_projects.hackathon.image_helper import enumerate_image_class

folder = "img5050"

# Upper bound on the number of images kept in memory.
max_images = 1000000

# islice takes at most max_images items from the stream without
# rebinding the builtin name `iter`.
imgs = list(islice(enumerate_image_class(folder), max_images))
len(imgs)
44053
# Scan the images in order and stop at the first one whose second-ranked
# neighbour is within distance 10. Column 0 is presumably the query image
# itself (distance 0) - confirm against ImageNearestNeighbors.kneighbors.
# `img`, `dist` and `ind` are left bound to the match for later inspection.
for i, img in enumerate(imgs):
    dist, ind = knn.kneighbors(img[0])
    second_best = dist[0, 1]
    if not second_best <= 10:
        continue
    print("dist =", dist)
    print("ind =", ind)
    break
dist = [[ 0. 0. 7.93725393 366.16662874 380.73481585]] ind = [[ 12 3 10 21464 8684]]
# Plot the neighbours (`ind`, `dist`) found by the search loop above for the
# image it stopped on; the trailing semicolon hides the cell's return value.
knn.plot_neighbors(ind, dist, obs=img[0], folder_or_images=folder);
# Collect (image index, neighbour index) for every neighbour within
# distance 10 of each image. An image matching itself is kept at this stage.
pairs = []
total = len(imgs)
for i, img in enumerate(imgs):
    if i % 1000 == 0:
        print("{0}/{1} done".format(i, total))
    dist, ind = knn.kneighbors(img[0])
    close = ind.ravel()[dist.ravel() <= 10]
    pairs.extend((i, j) for j in close)
0/44053 done 1000/44053 done 2000/44053 done 3000/44053 done 4000/44053 done 5000/44053 done 6000/44053 done 7000/44053 done 8000/44053 done 9000/44053 done 10000/44053 done 11000/44053 done 12000/44053 done 13000/44053 done 14000/44053 done 15000/44053 done 16000/44053 done 17000/44053 done 18000/44053 done 19000/44053 done 20000/44053 done 21000/44053 done 22000/44053 done 23000/44053 done 24000/44053 done 25000/44053 done 26000/44053 done 27000/44053 done 28000/44053 done 29000/44053 done 30000/44053 done 31000/44053 done 32000/44053 done 33000/44053 done 34000/44053 done 35000/44053 done 36000/44053 done 37000/44053 done 38000/44053 done 39000/44053 done 40000/44053 done 41000/44053 done 42000/44053 done 43000/44053 done 44000/44053 done
pairs[:10]
[(0, 0), (1, 1), (2, 2), (3, 12), (3, 3), (3, 10), (4, 4), (5, 133), (5, 1549), (5, 158)]
# Drop the trivial pairs where an image is matched with itself.
pairs2 = [pair for pair in pairs if pair[0] != pair[1]]
len(pairs), len(pairs2)
(75725, 33675)
pairs2[:10]
[(3, 12), (3, 10), (5, 133), (5, 1549), (5, 158), (5, 5632), (5, 16784), (8, 14699), (8, 23), (8, 35)]
# Look up and plot the neighbours of the image at index 5.
query = imgs[5][0]
dist, ind = knn.kneighbors(query)
knn.plot_neighbors(ind, dist, obs=query, folder_or_images=folder);
# Every image index involved in at least one near-duplicate pair.
distincts = {k for pair in pairs2 for k in pair}

# Connected components by label propagation: each index starts with its own
# label, and each pair pulls both ends down to the smaller label.
connex = {k: k for k in distincts}

# Repeat until a full pass changes nothing. Labels only ever decrease, so the
# loop terminates, and at that point both ends of every pair share a label
# (the smallest index of their component).
n = 0
while True:
    modif = 0
    for i, j in pairs2:
        a = min(connex[i], connex[j])
        if a != connex[i] or a != connex[j]:
            modif += 1
            connex[i] = connex[j] = a
    print(n, modif)
    n += 1
    if modif == 0:
        break
0 9096 1 6 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0
len(connex), len(set(connex.values()))
(13271, 4185)
# Image names stored on the fitted model; `names[i]` is used below to turn
# an index into a file name.
names = knn.image_names_
names[:2]
['inondation_2016/735614357036519425_CjVtTTrUoAAUUZp.jpg', 'inondation_2016/735616090261184512_CjVu73ZVEAAlWmu.jpg']
# An index whose component label differs from itself is not the
# representative of its group: its image is flagged as a duplicate.
dups = [names[idx] for idx, label in connex.items() if idx != label]
len(dups)
9086
# Find the first image whose second-ranked neighbour lies in the distance
# band ]10, 30], i.e. just above the duplicate threshold used earlier.
for i, img in enumerate(imgs):
    dist, ind = knn.kneighbors(img[0])
    second_best = dist[0, 1]
    if not (10 < second_best <= 30):
        continue
    print("dist =", dist)
    print("ind =", ind)
    break
dist = [[ 0. 21.97726098 21.97726098 21.97726098 161.13348504]] ind = [[ 285 308 351 311 3005]]
# Use the image at the first returned index as the observation to plot
# (presumably the query image itself, since its distance is 0 - confirm).
obs = imgs[ind[0, 0]][0]
knn.plot_neighbors(ind, dist, obs=obs, folder_or_images=folder);
# Set of duplicate image names, for fast membership tests when filtering.
not_allowed = set(dups)
len(not_allowed)
9086
# sorted() already returns a list, so no extra list() is needed.
sorted(not_allowed)[:5]
['inondation_2016/735805396657397762_CjYbG-DUgAQTu19.jpg', 'inondation_2016/735829559329853440_CjYxFcrXEAAvjlH.jpg', 'inondation_2016/735870604038045696_CjZWafAXEAA3sOb.jpg', 'inondation_2016/735892072960512000_CjZp8CoWsAIOhL5.jpg', 'inondation_2016/735892650583306240_CjZqdvoXAAEaSRM.jpg']
from ensae_projects.hackathon.image_helper import stream_copy_images
src_folder = "c:/temp/suricatenat_images/"
dest_folder = "c:/temp/suricatenat_clean/"
def valid(name):
    """Tell whether the image *name* must be copied.

    The path is reduced to the part following ``suricatenat_images``,
    backslashes are turned into forward slashes and surrounding separators
    are stripped, so that the result can be compared with the entries of
    the module-level set ``not_allowed``.

    :param name: image path, using either kind of slash
    :return: True when the normalised name is not in ``not_allowed``
    """
    spl = name.split("suricatenat_images")[-1].replace("\\", "/").strip("/\\")
    # The duplicates are stored in `not_allowed`; no `allowed` name exists.
    return spl not in not_allowed
# Exhaust the copy stream, passing `valid` as the filter; the yielded
# values themselves are not needed.
for _copied in stream_copy_images(src_folder, dest_folder, valid, fLOG=print):
    pass
[stream_copy_images] copy image 0: 'bing\01-9.jpg' - class 'bing' [stream_copy_images] copy image 1000: 'imagenet1\3271012508_955158b073.jpg' - class 'imagenet1' [stream_copy_images] copy image 2000: 'imagenet2\3287016043_987800dc67.jpg' - class 'imagenet2' [stream_copy_images] copy image 3000: 'imagenet4\106994_5349_big_200907_voyager11.jpg' - class 'imagenet4' [stream_copy_images] copy image 4000: 'imagenet5\532346050_dafb11ec86.jpg' - class 'imagenet5' [stream_copy_images] copy image 5000: 'inondation_2016\736966968138473472_Cjo7jTrXAAAeffo.jpg' - class 'inondation_2016' [stream_copy_images] copy image 6000: 'inondation_2016\737629970399252480_CjySiGiUkAUr8TC.jpg' - class 'inondation_2016' [stream_copy_images] copy image 7000: 'inondation_2016\737923554407256064_Cj2hYNwWUAElsOP.jpg' - class 'inondation_2016' [stream_copy_images] copy image 8000: 'inondation_2016\738072076880347136_Cj4opLHXEAAuuGK.jpg' - class 'inondation_2016' [stream_copy_images] copy image 9000: 'inondation_2016\738298504267730945_Cj72k1kUoAAIKUA.jpg' - class 'inondation_2016' [stream_copy_images] copy image 10000: 'inondation_2016\738378724442296321_Cj8_iRUXEAEIYex.jpg' - class 'inondation_2016' [stream_copy_images] copy image 11000: 'inondation_2016\738456441082793984_Cj-GNbPWkAAecmj.jpg' - class 'inondation_2016' [stream_copy_images] copy image 12000: 'inondation_2016\738642491671379968_CkAvbhyVAAQdhnl.jpg' - class 'inondation_2016' [stream_copy_images] copy image 13000: 'inondation_2016\738708144893927424_CkBrBsFXIAAesMt.jpg' - class 'inondation_2016' [stream_copy_images] copy image 14000: 'inondation_2016\738775822753013760_CkCosKRXEAAL3QS.jpg' - class 'inondation_2016' [stream_copy_images] copy image 15000: 'inondation_2016\738983572388913152_CkFlodbW0AAjH1A.jpg' - class 'inondation_2016' [stream_copy_images] copy image 16000: 'inondation_2016\739133036877467649_CkHtiX3XEAAQ5qt.jpg' - class 'inondation_2016' [stream_copy_images] copy image 17000: 
'inondation_2016\739435820709519360_CkMA9WNXAAEBBwW.jpg' - class 'inondation_2016' [stream_copy_images] copy image 18000: 'inondation_2016\739759634534141958_CkQnd1TUUAQli3i.jpg' - class 'inondation_2016' [stream_copy_images] copy image 19000: 'inondation_2016\740101248225935361_CkVVPYDWUAAc8U3.jpg' - class 'inondation_2016' [stream_copy_images] copy image 20000: 'inondation_2016\740462147130556416_CkamZeeXAAIf6ru.jpg' - class 'inondation_2016' [stream_copy_images] copy image 21000: 'inondation_2016\740924772062769152_CkhLHExW0AIpwYC.jpg' - class 'inondation_2016' [stream_copy_images] copy image 22000: 'inondation_2016\742979124050964480_Ck-XkQfXEAE46Wh.jpg' - class 'inondation_2016' [stream_copy_images] copy image 23000: 'inondation_2018\955500762070769664_DUKe4P3WAAEBJFC.jpg' - class 'inondation_2018' [stream_copy_images] copy image 24000: 'inondation_2018\956447069216165890_DUX7giCXUAANfkI.jpg' - class 'inondation_2018' [stream_copy_images] copy image 25000: 'inondation_2018\957555126931279872_DUnrT9aXUAARFxJ.jpg' - class 'inondation_2018' [stream_copy_images] copy image 26000: 'inondation_2018\959394452564598784_DVB0KQsWkAA4Bta.jpg' - class 'inondation_2018' [stream_copy_images] copy image 27000: 'inondation_2018\965549350599487488_DWZSA7cWsAEWGaK.jpg' - class 'inondation_2018' [stream_copy_images] copy image 28000: 'seisme_Amatrice\768296828550819841_CqmJ4k4UsAEcTaF.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 29000: 'seisme_Amatrice\768330792049205248_CqmooXdXgAAJ2b3.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 30000: 'seisme_Amatrice\768348574694408192_Cqm4itvWcAAsv_s.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 31000: 'seisme_Amatrice\768363756728516608_CqnGo90WIAAfA1I.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 32000: 'seisme_Amatrice\768376884677738496_CqnOR17WIAAP2Hn.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 33000: 
'seisme_Amatrice\768390411228422144_Cqne_UWWYAAnY6V.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 34000: 'seisme_Amatrice\768404063755141120_CqnrVy8XYAAYIGO.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 35000: 'seisme_Amatrice\768420565745106944_Cqn6bjbWIAEAbck.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 36000: 'seisme_Amatrice\768436635444908032_CqoI-OfWIAEpe5T.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 37000: 'seisme_Amatrice\768453842098880512_CqoYsPEXEAARM5o.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 38000: 'seisme_Amatrice\768471447140458496_CqoosvJW8AIjpEA.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 39000: 'seisme_Amatrice\768492129882517506_Cqo7hBpW8AA64OU.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 40000: 'seisme_Amatrice\768516668515577856_CqpR0mDWIAAEGkL.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 41000: 'seisme_Amatrice\768550981206441984_Cqpw-qOWAAAVGTB.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 42000: 'seisme_Amatrice\768679088013778944_CqrlaXlVUAAcqVG.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 43000: 'seisme_Amatrice\768721000015892480_CqsLrL6UkAAofwM.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 44000: 'seisme_Amatrice\768749206500741120_CqslU7hWEAA_Cyn.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 45000: 'seisme_Amatrice\768777504609931264_Cqs_DzcWAAAZ2_h.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 46000: 'seisme_Amatrice\768810730250461184_CqtdRvNWAAAE_HJ.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 47000: 'seisme_Amatrice\768850688487022592_Cqsp-GyWgAEJ6Kp.jpg' - class 'seisme_Amatrice' [stream_copy_images] copy image 48000: 'seisme_Amatrice\768916332322648064_Cqu9OPSWgAEzaux.jpg' - class 'seisme_Amatrice'
# List the images of the original folder (l1) and of the cleaned folder (l2)
# to compare their sizes.
l1 = list(enumerate_image_class("c:/temp/suricatenat_images/"))
l2 = list(enumerate_image_class("c:/temp/suricatenat_clean/"))
len(l1), len(l2)
(48884, 39798)
from ensae_projects.hackathon.image_helper import stream_random_sample, last_element
# Keep only the last element produced by the sampling stream
# (presumably the final random sample - confirm in image_helper).
rnd = last_element(stream_random_sample("c:/temp/suricatenat_clean/", abspath=False))
rnd[:5]
[('imagenet2\\2611787731_6b65bdaf6a.jpg', 'imagenet2'), ('inondation_2016\\740608740169224192_CkcruUEXIAEsWUl.jpg', 'inondation_2016'), ('inondation_2016\\738614580658606080_CkAWBegUgAA5Z9l.jpg', 'inondation_2016'), ('inondation_2018\\956548703552245760_DUZX5TRWsAAyDqH.jpg', 'inondation_2018'), ('inondation_2018\\956925376936148993_DUeuiGQX4AAocq-.jpg', 'inondation_2018')]
import os
import shutil
src_folder = "c:/temp/suricatenat_clean/"
dest_folder = "c:/temp/suricatenat_sample/"

# Copy every sampled image, keeping its path relative to the source folder.
for img, _label in rnd:
    src = os.path.join(src_folder, img)
    dst = os.path.join(dest_folder, img)
    # exist_ok=True creates the class sub-folder on first use and is a no-op
    # afterwards, without a separate existence check.
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    shutil.copy(src, dst)