L'idée est de recoder une fonction en C. On prend comme exemple la fonction de prédiction de la régression linéaire de scikit-learn et de prévoir le gain de temps qu'on obtient en recodant la fonction dans un langage plus rapide.
from jyquickhelper import add_notebook_menu
add_notebook_menu()
memo_time = []
import timeit
def unit(x):
    """Format a duration *x* given in seconds with a human-readable unit.

    Picks the largest unit (s, ms, µs, ns) such that the scaled value is
    at least 1; falls back to scientific notation below a nanosecond.
    """
    scales = ((1, "%1.2f s", 1),
              (1e-3, "%1.2f ms", 1000),
              (1e-6, "%1.2f µs", 1000 ** 2),
              (1e-9, "%1.2f ns", 1000 ** 3))
    for threshold, fmt, factor in scales:
        if x >= threshold:
            return fmt % (x * factor)
    return "%1.2g s" % x
def timeexe(legend, code, number=100, repeat=1000):
    """Time *code* with :func:`timeit.repeat` and print a one-line summary.

    :param legend: label stored in the returned record
    :param code: source string executed against the module globals
    :param number: executions per measurement
    :param repeat: number of measurements
    :return: dict with average, deviation, first/first3/last3 timings,
        5th/95th percentile bounds, the code string and the run counts
    """
    measures = timeit.repeat(code, number=number, repeat=repeat,
                             globals=globals())
    average = sum(measures) / (number * repeat)
    deviation = (sum((m / number - average) ** 2
                     for m in measures) / repeat) ** 0.5
    first = measures[0] / number
    first3 = sum(measures[:3]) / (3 * number)
    last3 = sum(measures[-3:]) / (3 * number)
    # percentile bounds: 5th and 95th of the sorted per-call times
    measures.sort()
    low5 = measures[len(measures) // 20] / number
    high5 = measures[-len(measures) // 20] / number
    print("Moyenne: %s Ecart-type %s (with %d runs) in [%s, %s]" % (
        unit(average), unit(deviation), number, unit(low5), unit(high5)))
    return dict(legend=legend, average=average, deviation=deviation,
                first=first, first3=first3, last3=last3, repeat=repeat,
                min5=low5, max5=high5, code=code, run=number)
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
diabetes = load_diabetes()
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(diabetes.data, diabetes.target)
from sklearn.linear_model import LinearRegression
clr = LinearRegression()
clr.fit(diabetes_X_train, diabetes_y_train)
LinearRegression()
clr.coef_
array([ -35.81159278, -267.39308261, 503.56121841, 337.87944184, -577.27255236, 373.62939477, -99.69779327, 78.39842094, 656.54309153, 80.3383998 ])
clr.intercept_
152.69613239933642
z = diabetes_X_test[0:1,:]
memo_time.append(timeexe("sklearn.predict", "clr.predict(z)"))
Moyenne: 45.50 µs Ecart-type 6.34 µs (with 100 runs) in [40.87 µs, 52.95 µs]
%timeit clr.predict(z)
45.2 µs ± 744 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
On s'inspire de l'exemple Purely for performance (API level, out-of-line).
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("int linreg(int, double *, double *, double, double *);")
ffibuilder.set_source("_linear_regression",
r"""
static int linreg(int dimension, double * x, double *coef, double intercept, double * out)
{
for(; dimension > 0; --dimension, ++x, ++coef)
intercept += *x * *coef;
*out = intercept;
return 1;
}
""")
ffibuilder.compile(verbose=True)
generating .\_linear_regression.c (already up-to-date) the current directory is 'C:\\xavierdupre\\__home_\\GitHub\\ensae_teaching_cs\\_doc\\notebooks\\2a' running build_ext building '_linear_regression' extension C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:\Python395_x64\include -IC:\Python395_x64\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\ATLMFC\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\include -IC:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\include\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\ucrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\shared -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\winrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\cppwinrt /Tc_linear_regression.c /Fo.\Release\_linear_regression.obj C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\bin\HostX86\x64\link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:\Python395_x64\libs /LIBPATH:C:\Python395_x64\PCbuild\amd64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\ATLMFC\lib\x64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\lib\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\lib\um\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\ucrt\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\um\x64 /EXPORT:PyInit__linear_regression .\Release\_linear_regression.obj /OUT:.\_linear_regression.cp39-win_amd64.pyd /IMPLIB:.\Release\_linear_regression.cp39-win_amd64.lib
'C:\\xavierdupre\\__home_\\GitHub\\ensae_teaching_cs\\_doc\\notebooks\\2a\\_linear_regression.cp39-win_amd64.pyd'
La fonction compilée est accessible comme suit.
from _linear_regression import ffi, lib
lib.linreg
<function _linear_regression.Lib.linreg>
On s'inspire de l'exemple How to pass a Numpy array into a cffi function and how to get one back out?.
import numpy
# Output buffer for a single prediction.
out = numpy.zeros(1)
# Raw data pointer of the coefficient array, cast to a C double*.
ptr_coef = clr.coef_.__array_interface__['data'][0]
cptr_coef = ffi.cast ( "double*" , ptr_coef )
# One observation (shape (1, n_features)) and its data pointer.
x = diabetes_X_test[0:1,:]
ptr_x = x.__array_interface__['data'][0]
cptr_x = ffi.cast ( "double*" , ptr_x )
# Pointer to the output buffer.
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "double*" , ptr_out )
# Dimension passed to the C code = number of features.
n = len(clr.coef_)
lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)
1
out
array([214.72477745])
On vérifie qu'on obtient bien la même chose.
clr.predict(x)
array([214.72477745])
Et on mesure le temps d'exécution :
memo_time.append(timeexe("cffi-linreg", "lib.linreg(n, cptr_x, cptr_coef, clr.intercept_, cptr_out)"))
Moyenne: 831.37 ns Ecart-type 708.08 ns (with 100 runs) in [416.00 ns, 1.52 µs]
C'est beaucoup plus rapide. Pour être totalement honnête, il faut mesurer les étapes qui consistent à extraire les pointeurs.
def predict_clr(x, clr):
    """Predict with the compiled C ``linreg`` for a single observation.

    :param x: 2D float64 numpy array of shape (1, n_features)
    :param clr: fitted ``LinearRegression`` (reads ``coef_``, ``intercept_``)
    :return: 1-element numpy array holding the prediction

    Bug fix: the previous version passed ``len(x)`` — the number of ROWS,
    i.e. 1 — as the dimension, so the C loop used only the first
    coefficient. The dimension must be the number of features.
    """
    out = numpy.zeros(1)
    ptr_coef = clr.coef_.__array_interface__['data'][0]
    cptr_coef = ffi.cast("double*", ptr_coef)
    ptr_x = x.__array_interface__['data'][0]
    cptr_x = ffi.cast("double*", ptr_x)
    ptr_out = out.__array_interface__['data'][0]
    cptr_out = ffi.cast("double*", ptr_out)
    # number of features, not number of rows
    lib.linreg(len(clr.coef_), cptr_x, cptr_coef, clr.intercept_, cptr_out)
    return out
predict_clr(x, clr)
array([154.32457426])
memo_time.append(timeexe("cffi-linreg-wrapped", "predict_clr(x, clr)"))
Moyenne: 7.52 µs Ecart-type 2.34 µs (with 100 runs) in [6.20 µs, 10.42 µs]
Cela reste plus rapide.
Comme on construit la fonction en dynamique (le code est connu lors de l'exécution), on peut facilement se passer de la boucle et écrire le code sans boucle et avec les coefficients.
res = " + ".join("{0}*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'-35.81159277952622*x[0] + -267.39308260812277*x[1] + 503.56121841083586*x[2] + 337.87944183803455*x[3] + -577.2725523621144*x[4] + 373.6293947654621*x[5] + -99.69779326605845*x[6] + 78.39842093764699*x[7] + 656.5430915289373*x[8] + 80.33839980437061*x[9]'
code = """
static int linreg_custom(double * x, double * out)
{{
out[0] = {0} + {1};
}}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom(double * x, double * out) { out[0] = 152.69613239933642 + -35.81159277952622*x[0] + -267.39308260812277*x[1] + 503.56121841083586*x[2] + 337.87944183803455*x[3] + -577.2725523621144*x[4] + 373.6293947654621*x[5] + -99.69779326605845*x[6] + 78.39842093764699*x[7] + 656.5430915289373*x[8] + 80.33839980437061*x[9]; }
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("int linreg_custom(double *, double *);")
ffibuilder.set_source("_linear_regression_custom", code)
ffibuilder.compile(verbose=True)
generating .\_linear_regression_custom.c the current directory is 'C:\\xavierdupre\\__home_\\GitHub\\ensae_teaching_cs\\_doc\\notebooks\\2a' running build_ext building '_linear_regression_custom' extension C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:\Python395_x64\include -IC:\Python395_x64\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\ATLMFC\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\include -IC:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\include\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\ucrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\shared -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\winrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\cppwinrt /Tc_linear_regression_custom.c /Fo.\Release\_linear_regression_custom.obj C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\bin\HostX86\x64\link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:\Python395_x64\libs /LIBPATH:C:\Python395_x64\PCbuild\amd64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\ATLMFC\lib\x64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\lib\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\lib\um\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\ucrt\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\um\x64 /EXPORT:PyInit__linear_regression_custom .\Release\_linear_regression_custom.obj /OUT:.\_linear_regression_custom.cp39-win_amd64.pyd /IMPLIB:.\Release\_linear_regression_custom.cp39-win_amd64.lib
'C:\\xavierdupre\\__home_\\GitHub\\ensae_teaching_cs\\_doc\\notebooks\\2a\\_linear_regression_custom.cp39-win_amd64.pyd'
from _linear_regression_custom.lib import linreg_custom
linreg_custom(cptr_x, cptr_out)
out
array([214.72477745])
memo_time.append(timeexe("cffi-linreg-custom", "linreg_custom(cptr_x, cptr_out)"))
Moyenne: 466.52 ns Ecart-type 851.96 ns (with 100 runs) in [315.00 ns, 715.00 ns]
On a gagné un facteur 2.
def predict_clr_custom(x):
    """Wrap the generated C function (coefficients baked in at build time).

    :param x: float64 numpy array with the model's features
    :return: 1-element numpy array containing the prediction
    """
    result = numpy.zeros(1)
    # raw data pointers of the numpy buffers, cast to C double*
    input_ptr = ffi.cast("double*", x.__array_interface__['data'][0])
    output_ptr = ffi.cast("double*", result.__array_interface__['data'][0])
    linreg_custom(input_ptr, output_ptr)
    return result
predict_clr_custom(x)
array([214.72477745])
memo_time.append(timeexe("cffi-linreg-custom wrapped", "predict_clr_custom(x)"))
Moyenne: 5.27 µs Ecart-type 1.82 µs (with 100 runs) in [4.42 µs, 7.77 µs]
C'est un peu plus rapide.
L'ordinateur fait la distinction entre les double codés sur 64 bits et les float codés sur 32 bits. La précision est meilleure dans le premier cas et les calculs sont plus rapides dans le second. Dans le cas du machine learning, on préfère la rapidité à une perte en précision qui est souvent compensée par l'optimisation inhérente à tout problème de machine learning. Ce qu'on perd sur une observation, on le retrouve sur une autre.
res = " + ".join("{0}f*x[{1}]".format(c, i) for i, c in enumerate(clr.coef_))
res
'-35.81159277952622f*x[0] + -267.39308260812277f*x[1] + 503.56121841083586f*x[2] + 337.87944183803455f*x[3] + -577.2725523621144f*x[4] + 373.6293947654621f*x[5] + -99.69779326605845f*x[6] + 78.39842093764699f*x[7] + 656.5430915289373f*x[8] + 80.33839980437061f*x[9]'
code = """
static int linreg_custom_float(float * x, float * out)
{{
out[0] = {0}f + {1};
}}
""".format(clr.intercept_, res)
print(code)
static int linreg_custom_float(float * x, float * out) { out[0] = 152.69613239933642f + -35.81159277952622f*x[0] + -267.39308260812277f*x[1] + 503.56121841083586f*x[2] + 337.87944183803455f*x[3] + -577.2725523621144f*x[4] + 373.6293947654621f*x[5] + -99.69779326605845f*x[6] + 78.39842093764699f*x[7] + 656.5430915289373f*x[8] + 80.33839980437061f*x[9]; }
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("int linreg_custom_float(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float", code)
ffibuilder.compile(verbose=True)
generating .\_linear_regression_custom_float.c the current directory is 'C:\\xavierdupre\\__home_\\GitHub\\ensae_teaching_cs\\_doc\\notebooks\\2a' running build_ext building '_linear_regression_custom_float' extension C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:\Python395_x64\include -IC:\Python395_x64\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\ATLMFC\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\include -IC:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\include\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\ucrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\shared -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\winrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\cppwinrt /Tc_linear_regression_custom_float.c /Fo.\Release\_linear_regression_custom_float.obj C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\bin\HostX86\x64\link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:\Python395_x64\libs /LIBPATH:C:\Python395_x64\PCbuild\amd64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\ATLMFC\lib\x64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\lib\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\lib\um\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\ucrt\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\um\x64 /EXPORT:PyInit__linear_regression_custom_float .\Release\_linear_regression_custom_float.obj /OUT:.\_linear_regression_custom_float.cp39-win_amd64.pyd 
/IMPLIB:.\Release\_linear_regression_custom_float.cp39-win_amd64.lib
'C:\\xavierdupre\\__home_\\GitHub\\ensae_teaching_cs\\_doc\\notebooks\\2a\\_linear_regression_custom_float.cp39-win_amd64.pyd'
from _linear_regression_custom_float.lib import linreg_custom_float
def predict_clr_custom_float(x):
    """Wrap the float32 variant of the generated C function.

    :param x: float32 numpy array with the model's features
    :return: 1-element float32 numpy array containing the prediction
    """
    result = numpy.zeros(1, dtype=numpy.float32)
    # raw data pointers of the numpy buffers, cast to C float*
    input_ptr = ffi.cast("float*", x.__array_interface__['data'][0])
    output_ptr = ffi.cast("float*", result.__array_interface__['data'][0])
    linreg_custom_float(input_ptr, output_ptr)
    return result
Avant d'appeler la fonction, on doit transformer le vecteur initial en float32.
# Convert the observation to float32 then call the float32 wrapper.
# Bug fix: the original cell called predict_clr_custom — the float64
# version — on float32 data, reinterpreting the bytes and producing a
# garbage prediction (~1.27e+31).
x32 = x.astype(numpy.float32)
predict_clr_custom_float(x32)
memo_time.append(timeexe("cffi-linreg-custom-float wrapped", "predict_clr_custom_float(x32)"))
Moyenne: 5.12 µs Ecart-type 1.60 µs (with 100 runs) in [4.48 µs, 6.44 µs]
La différence n'est pas flagrante. Mesurons le code C uniquement même si la partie Python ne peut pas être complètement évitée.
out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )
memo_time.append(timeexe("cffi-linreg-custom-float32", "linreg_custom_float(cptr_x, cptr_out)"))
Moyenne: 389.19 ns Ecart-type 226.75 ns (with 100 runs) in [317.00 ns, 577.00 ns]
La différence n'est pas significative.
C'est un ensemble d'instructions processeur pour faire des opérations terme à terme sur 4 float32 aussi rapidement qu'une seule. Le processeur ne peut faire des opérations que les nombres sont copiés dans ses registres. Le programme passe alors son temps à copier des nombres depuis la mémoire vers les registres du processeur puis à faire la copie dans le chemin inverse pour le résultat. Les instructions SIMD font gagner du temps du niveau du calcul. Au lieu de faire 4 opérations de multiplication terme à terme, il n'en fait plus qu'une. Il suffit de savoir comment utiliser ces instructions. Avec Visual Studio, elles sont accessible via ces fonctions Memory and Initialization Using Streaming SIMD Extensions. Le code suivant n'est probablement pas optimal mais il n'est pas trop compliqué à suivre.
code = """
#include <xmmintrin.h>
static int linreg_custom_float_simd(float * x, float * out)
{
__m128 c1 = _mm_set_ps(0.3034995490664121f, -237.63931533353392f, 510.5306054362245f, 327.7369804093466f);
__m128 c2 = _mm_set_ps(-814.1317093725389f, 492.81458798373245f, 102.84845219168025f, 184.60648905984064f);
__m128 r1 = _mm_set_ss(152.76430691633442f);
r1 = _mm_add_ss(r1, _mm_mul_ps(c1, _mm_load_ps(x)));
r1 = _mm_add_ss(r1, _mm_mul_ps(c2, _mm_load_ps(x+4)));
float r[4];
_mm_store_ps(r, r1);
out[0] = r[0] + r[1] + r[2] + r[3] + 743.5196167505419f * x[8] + 76.095172216624f * x[9];
return 1;
}
"""
from cffi import FFI
ffibuilder = FFI()
ffibuilder.cdef("int linreg_custom_float_simd(float *, float *);")
ffibuilder.set_source("_linear_regression_custom_float_simd", code)
ffibuilder.compile(verbose=True)
generating .\_linear_regression_custom_float_simd.c (already up-to-date) the current directory is 'C:\\xavierdupre\\__home_\\GitHub\\ensae_teaching_cs\\_doc\\notebooks\\2a' running build_ext building '_linear_regression_custom_float_simd' extension C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -IC:\Python395_x64\include -IC:\Python395_x64\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\ATLMFC\include -IC:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\include -IC:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\include\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\ucrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\shared -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\um -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\winrt -IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\cppwinrt /Tc_linear_regression_custom_float_simd.c /Fo.\Release\_linear_regression_custom_float_simd.obj C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\bin\HostX86\x64\link.exe /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO /LIBPATH:C:\Python395_x64\libs /LIBPATH:C:\Python395_x64\PCbuild\amd64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\ATLMFC\lib\x64 /LIBPATH:C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\lib\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\lib\um\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\ucrt\x64 /LIBPATH:C:\Program Files (x86)\Windows Kits\10\lib\10.0.19041.0\um\x64 /EXPORT:PyInit__linear_regression_custom_float_simd .\Release\_linear_regression_custom_float_simd.obj 
/OUT:.\_linear_regression_custom_float_simd.cp39-win_amd64.pyd /IMPLIB:.\Release\_linear_regression_custom_float_simd.cp39-win_amd64.lib
'C:\\xavierdupre\\__home_\\GitHub\\ensae_teaching_cs\\_doc\\notebooks\\2a\\_linear_regression_custom_float_simd.cp39-win_amd64.pyd'
from _linear_regression_custom_float_simd.lib import linreg_custom_float_simd
out = numpy.zeros(1, dtype=numpy.float32)
ptr_x = x32.__array_interface__['data'][0]
cptr_x = ffi.cast ( "float*" , ptr_x )
ptr_out = out.__array_interface__['data'][0]
cptr_out = ffi.cast ( "float*" , ptr_out )
linreg_custom_float_simd(cptr_x, cptr_out)
out
array([172.00832], dtype=float32)
memo_time.append(timeexe("cffi-linreg-custom-float32-simd", "linreg_custom_float_simd(cptr_x, cptr_out)"))
Moyenne: 418.99 ns Ecart-type 387.18 ns (with 100 runs) in [299.00 ns, 631.00 ns]
C'est légèrement mieux, quelques références :
Les processeurs évoluent au fil du temps, 4 float, 8 float, SIMD2, FMA4 Intrinsics Added for Visual Studio 2010 SP1, AVX.
On continue avec uniquement du Python sans numpy.
coef = clr.coef_
list(coef)
[-35.81159277952622, -267.39308260812277, 503.56121841083586, 337.87944183803455, -577.2725523621144, 373.6293947654621, -99.69779326605845, 78.39842093764699, 656.5430915289373, 80.33839980437061]
code = str(clr.intercept_) + "+" + "+".join("x[{0}]*({1})".format(i, c) for i, c in enumerate(coef))
code
'152.69613239933642+x[0]*(-35.81159277952622)+x[1]*(-267.39308260812277)+x[2]*(503.56121841083586)+x[3]*(337.87944183803455)+x[4]*(-577.2725523621144)+x[5]*(373.6293947654621)+x[6]*(-99.69779326605845)+x[7]*(78.39842093764699)+x[8]*(656.5430915289373)+x[9]*(80.33839980437061)'
def predict_clr_python(x):
    """Pure-Python prediction with hard-coded coefficients.

    NOTE(review): these constants differ from ``clr.coef_`` printed earlier
    in the notebook — they appear to come from a different train/test
    split, so the result differs from ``clr.predict``. TODO confirm.
    """
    weights = (0.3034995490664121, -237.63931533353392, 510.5306054362245,
               327.7369804093466, -814.1317093725389, 492.81458798373245,
               102.84845219168025, 184.60648905984064, 743.5196167505419,
               76.095172216624)
    # same left-to-right accumulation order as the original expression
    prediction = 152.764306916
    for i, w in enumerate(weights):
        prediction += w * x[i]
    return prediction
predict_clr_python(x[0])
211.03463170273153
z = list(x[0])
memo_time.append(timeexe("python-linreg-custom", "predict_clr_python(z)"))
Moyenne: 2.02 µs Ecart-type 670.45 ns (with 100 runs) in [1.70 µs, 2.73 µs]
De façon assez surprenante, c'est plutôt rapide. Et si on y mettait une boucle.
def predict_clr_python_loop(x, coef, intercept):
    """Pure-Python prediction: intercept plus the dot product <x, coef>."""
    dot = sum(feature * weight for feature, weight in zip(x, coef))
    return intercept + dot
predict_clr_python_loop(x[0], list(clr.coef_), clr.intercept_)
214.72477744760596
coef = list(clr.coef_)
intercept = clr.intercept_
memo_time.append(timeexe("python-linreg", "predict_clr_python_loop(z, coef, intercept)"))
Moyenne: 3.54 µs Ecart-type 1.31 µs (with 100 runs) in [2.68 µs, 6.16 µs]
A peine plus long.
def predict_clr_numpy(x, coef, intercept):
    """Prediction via ``numpy.dot`` (``.sum()`` is a no-op on the scalar)."""
    dot_product = numpy.dot(coef, x).sum()
    return intercept + dot_product
predict_clr_numpy(x[0], clr.coef_, clr.intercept_)
214.72477744760596
memo_time.append(timeexe("numpy-linreg-numpy", "predict_clr_numpy(z, coef, clr.intercept_)"))
Moyenne: 8.08 µs Ecart-type 3.44 µs (with 100 runs) in [6.44 µs, 12.16 µs]
Les dimensions des tableaux sont trop petites pour que le calcul matriciel apporte une différence. On se retrouve dans le cas cffi où les échanges Python - C grignotent tout le temps de calcul.
numba essaye de compiler à la volée des bouts de codes écrits en Python. On indique quelle fonction optimiser en faisant précéder la fonction de @jit
. Toutes les écritures ne fonctionnent pas : typiquement, certaines listes en compréhension soulèvent une exception. Il faut donc écrire son code en Python d'une façon assez proche de ce qu'il serait en C.
from numba import jit
@jit
def predict_clr_numba(x, coef, intercept):
    """Linear prediction compiled lazily by numba (no explicit signature).

    ``x`` and ``coef`` are indexable sequences of equal length; returns
    ``intercept + sum(coef[i] * x[i])``.
    """
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s
predict_clr_numba(z, clr.coef_, clr.intercept_)
C:\Python395_x64\lib\site-packages\numba\core\ir_utils.py:2152: NumbaPendingDeprecationWarning: Encountered the use of a type that is scheduled for deprecation: type 'reflected list' found for argument 'x' of function 'predict_clr_numba'. For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types File "<ipython-input-50-3bf9efb9c9c6>", line 2: @jit def predict_clr_numba(x, coef, intercept): ^ warnings.warn(NumbaPendingDeprecationWarning(msg, loc=loc))
214.724777447606
memo_time.append(timeexe("numba-linreg-notype", "predict_clr_numba(z, clr.coef_, clr.intercept_)"))
Moyenne: 23.77 µs Ecart-type 7.36 µs (with 100 runs) in [19.99 µs, 37.64 µs]
Plutôt rapide !
@jit('double(double[:], double[:], double)')
def predict_clr_numba_cast(x, coef, intercept):
    """Same loop as ``predict_clr_numba`` but with an explicit signature.

    Only float64 numpy arrays are accepted; the fixed types let numba
    generate fully typed native code ahead of the first call.
    """
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s
# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)
214.724777447606
memo_time.append(timeexe("numba-linreg-type", "predict_clr_numba_cast(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 948.20 ns Ecart-type 411.47 ns (with 100 runs) in [759.00 ns, 1.68 µs]
On voit que plus on donne d'information au compilateur, plus il est capable d'optimiser.
@jit('float32(float32[:], float32[:], float32)')
def predict_clr_numba_cast_float(x, coef, intercept):
    """float32 variant of ``predict_clr_numba_cast``.

    All inputs must be float32 (arrays and scalar); the computation is
    done entirely in single precision.
    """
    s = intercept
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s
# La fonction ne fonctionne qu'avec un numpy.array car le langage C est fortement typé.
x32 = x[0].astype(numpy.float32)
c32 = clr.coef_.astype(numpy.float32)
i32 = numpy.float32(clr.intercept_)
predict_clr_numba_cast_float(x32, c32, i32)
214.7247772216797
memo_time.append(timeexe("numba-linreg-type-float32", "predict_clr_numba_cast_float(x32, c32, i32)"))
Moyenne: 707.08 ns Ecart-type 268.64 ns (with 100 runs) in [565.00 ns, 1.25 µs]
On essaye avec les coefficients dans la fonction.
@jit('double(double[:])')
def predict_clr_numba_cast_custom(x):
    """Typed numba version with the coefficients embedded in the function.

    NOTE(review): these constants differ from ``clr.coef_`` printed earlier
    in the notebook — they appear to come from a different train/test
    split, so the result differs from ``clr.predict``. TODO confirm.
    """
    # Python list of coefficients; numba reflects it on each call, which
    # triggers the "reflected list" deprecation warning seen at first use.
    coef = [ 3.03499549e-01, -2.37639315e+02, 5.10530605e+02, 3.27736980e+02,
             -8.14131709e+02, 4.92814588e+02, 1.02848452e+02, 1.84606489e+02,
             7.43519617e+02, 7.60951722e+01]
    s = 152.76430691633442
    for i in range(0, len(x)):
        s += coef[i] * x[i]
    return s
predict_clr_numba_cast_custom(x[0])
211.034631692416
memo_time.append(timeexe("numba-linreg-type-custom", "predict_clr_numba_cast_custom(x[0])"))
Moyenne: 824.35 ns Ecart-type 371.36 ns (with 100 runs) in [652.00 ns, 1.56 µs]
On se rapproche des temps obtenus avec cffi sans wrapping, cela signifie que numba fait un bien meilleur travail à ce niveau que le wrapper rapidement créé.
@jit('double(double[:], double[:], double)')
def predict_clr_numba_numpy(x, coef, intercept):
    """Numba-decorated numpy dot product.

    NOTE(review): numba fails to type ``numpy.dot(...).sum()`` here (the
    compilation warning shows a fall-back to object mode), so this version
    is not actually compiled to native code.
    """
    return intercept + numpy.dot(coef, x).sum()
predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)
<ipython-input-58-7020de83c055>:1: NumbaWarning: Compilation is falling back to object mode WITH looplifting enabled because Function "predict_clr_numba_numpy" failed type inference due to: Unknown attribute 'sum' of type float64 File "<ipython-input-58-7020de83c055>", line 3: def predict_clr_numba_numpy(x, coef, intercept): return intercept + numpy.dot(coef, x).sum() ^ During: typing of get attribute at <ipython-input-58-7020de83c055> (3) File "<ipython-input-58-7020de83c055>", line 3: def predict_clr_numba_numpy(x, coef, intercept): return intercept + numpy.dot(coef, x).sum() ^ @jit('double(double[:], double[:], double)') C:\Python395_x64\lib\site-packages\numba\core\object_mode_passes.py:151: NumbaWarning: Function "predict_clr_numba_numpy" was compiled in object mode without forceobj=True. File "<ipython-input-58-7020de83c055>", line 2: @jit('double(double[:], double[:], double)') def predict_clr_numba_numpy(x, coef, intercept): ^ warnings.warn(errors.NumbaWarning(warn_msg, C:\Python395_x64\lib\site-packages\numba\core\object_mode_passes.py:161: NumbaDeprecationWarning: Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour. For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit File "<ipython-input-58-7020de83c055>", line 2: @jit('double(double[:], double[:], double)') def predict_clr_numba_numpy(x, coef, intercept): ^ warnings.warn(errors.NumbaDeprecationWarning(msg,
214.72477744760596
memo_time.append(timeexe("numba-linreg-type-numpy", "predict_clr_numba_numpy(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 5.15 µs Ecart-type 1.78 µs (with 100 runs) in [4.37 µs, 6.00 µs]
numba est moins performant quand numpy est impliqué car le code de numpy n'est pas réécrit, il est appelé.
cython permet de créer des extensions C de plus grande envergure que numba. C'est l'option choisie par scikit-learn. Il vaut mieux connaître le C pour s'en servir et là encore, l'objectif est de réduire les échanges Python / C qui coûtent cher.
%load_ext cython
%%cython
def predict_clr_cython(x, coef, intercept):
    """Untyped Cython prediction: intercept plus the dot product <x, coef>.

    Without type declarations Cython keeps Python-object semantics, so the
    speed-up over plain Python is limited.
    """
    total = intercept
    for i, value in enumerate(x):
        total += coef[i] * value
    return total
predict_clr_cython(x[0], clr.coef_, clr.intercept_)
214.724777447606
memo_time.append(timeexe("cython-linreg", "predict_clr_cython(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 2.71 µs Ecart-type 1.60 µs (with 100 runs) in [1.92 µs, 7.19 µs]
Cython fait moins bien que numba dans notre cas et l'optimisation proposée est assez proche du temps déjà obtenu avec le langage Python seul. Cela est dû au fait que la plupart des objets tels que du code associé aux listes ou aux dictionnaires ont été réécrits en C.
%%cython
cimport numpy as npc
# Typed Cython version: declaring the buffers as contiguous 1-D float64
# arrays and the accumulator as a C double removes the Python-object
# overhead from the inner loop.
def predict_clr_cython_type(npc.ndarray[double, ndim=1, mode='c'] x,
                            npc.ndarray[double, ndim=1, mode='c'] coef,
                            double intercept):
    cdef double s = intercept
    for i in range(0, x.shape[0]):
        s += coef[i] * x[i]
    return s
predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)
214.724777447606
memo_time.append(timeexe(
"cython-linreg-type", "predict_clr_cython_type(x[0], clr.coef_, clr.intercept_)"))
Moyenne: 721.31 ns Ecart-type 399.10 ns (with 100 runs) in [533.00 ns, 1.44 µs]
Le temps est quasi identique avec un écart type moins grand de façon significative.
ONNX est un format de sérialisation qui permet de décrire un modèle de machine learning ou de deep learning. Cela permet de dissocier le modèle de la librairie qui a servi à le produire (voir ML.net and ONNX).
# Optional-dependency guard: the ONNX part of the notebook only runs when
# onnx, skl2onnx and onnxruntime are all importable; ok_onnx records it.
try:
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import FloatTensorType
    import onnxruntime
    import onnx
    ok_onnx = True
    print("onnx, skl2onnx, onnxruntime sont disponibles.")
    def save_model(onnx_model, filename):
        # Serialize an ONNX model (protobuf) to a binary file.
        with open(filename, "wb") as f:
            f.write(onnx_model.SerializeToString())
except ImportError as e:
    print("La suite requiert onnx, skl2onnx et onnxruntime.")
    print(e)
    ok_onnx = False
Error in sys.excepthook: Traceback (most recent call last): File "C:\Python395_x64\lib\site-packages\IPython\core\interactiveshell.py", line 1934, in showtraceback stb = value._render_traceback_() AttributeError: 'RuntimeError' object has no attribute '_render_traceback_' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "C:\Python395_x64\lib\site-packages\IPython\core\interactiveshell.py", line 1936, in showtraceback stb = self.InteractiveTB.structured_traceback(etype, File "C:\Python395_x64\lib\site-packages\IPython\core\ultratb.py", line 1105, in structured_traceback return FormattedTB.structured_traceback( File "C:\Python395_x64\lib\site-packages\IPython\core\ultratb.py", line 999, in structured_traceback return VerboseTB.structured_traceback( File "C:\Python395_x64\lib\site-packages\IPython\core\ultratb.py", line 851, in structured_traceback assert etb is not None AssertionError Original exception was: RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe
onnx, skl2onnx, onnxruntime sont disponibles.
On convertit le modèle au format ONNX.
if ok_onnx:
    # Convert the fitted scikit-learn model to ONNX; the input is declared
    # as a float32 tensor with a free batch dimension and n_features columns.
    onnx_model = convert_sklearn(
        clr, 'model', [('input', FloatTensorType([None, clr.coef_.shape[0]]))],
        target_opset=11)
    # Pin an older IR version so older runtimes can still load the file.
    onnx_model.ir_version = 6
    save_model(onnx_model, 'model.onnx')
    # Reload and display the serialized graph.
    model_onnx = onnx.load('model.onnx')
    print("Modèle sérialisé au format ONNX")
    print(model_onnx)
else:
    # Bug fix: the original message said the packages WERE available in
    # the branch where they are missing.
    print("onnx, skl2onnx, onnxruntime ne sont pas disponibles.")
Modèle sérialisé au format ONNX ir_version: 6 producer_name: "skl2onnx" producer_version: "1.10.4" domain: "ai.onnx" model_version: 0 doc_string: "" graph { node { input: "input" output: "variable" name: "LinearRegressor" op_type: "LinearRegressor" attribute { name: "coefficients" floats: -35.81159210205078 floats: -267.3930969238281 floats: 503.56121826171875 floats: 337.87945556640625 floats: -577.2725219726562 floats: 373.62939453125 floats: -99.69779205322266 floats: 78.39842224121094 floats: 656.5430908203125 floats: 80.3384017944336 type: FLOATS } attribute { name: "intercepts" floats: 152.69613647460938 type: FLOATS } domain: "ai.onnx.ml" } name: "model" input { name: "input" type { tensor_type { elem_type: 1 shape { dim { } dim { dim_value: 10 } } } } } output { name: "variable" type { tensor_type { elem_type: 1 shape { dim { } dim { dim_value: 1 } } } } } } opset_import { domain: "ai.onnx.ml" version: 1 } opset_import { domain: "" version: 11 }
On calcule les prédictions. Le module [onnxruntime](https://docs.microsoft.com/en-us/python/api/overview/azure/onnx/intro?view=azure-onnx-py) optimise les calculs pour des modèles de deep learning. Cela explique pourquoi tous les calculs sont réalisés avec des réels représentés sur 4 octets numpy.float32.
if ok_onnx:
    # Load the serialized model into an onnxruntime inference session and
    # inspect its input/output signatures.
    sess = onnxruntime.InferenceSession("model.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)
    def predict_onnxrt(x):
        # Run the graph; returns a list holding one float32 array
        # (the output named "variable").
        return sess.run(["variable"], {'input': x})
    # The session expects float32 input, hence the astype.
    print("Prediction:", predict_onnxrt(x.astype(numpy.float32)))
Input: NodeArg(name='input', type='tensor(float)', shape=[None, 10]) Output: NodeArg(name='variable', type='tensor(float)', shape=[None, 1]) Prediction: [array([[214.72478]], dtype=float32)]
if ok_onnx:
    # float32 case: the cast is done once, outside the measured code.
    x32 = x.astype(numpy.float32)
    memo_time.append(timeexe("onnxruntime-float32", "predict_onnxrt(x32)"))
    # float64 case: the float32 cast is included in the measured code,
    # so this also measures the conversion cost.
    memo_time.append(timeexe("onnxruntime-float64", "predict_onnxrt(x.astype(numpy.float32))"))
Moyenne: 13.00 µs Ecart-type 7.69 µs (with 100 runs) in [9.71 µs, 23.64 µs] Moyenne: 12.69 µs Ecart-type 1.93 µs (with 100 runs) in [11.29 µs, 16.23 µs]
import pandas

# Gather every recorded timing in one table, fastest implementation first.
df = pandas.DataFrame(data=memo_time).set_index("legend").sort_values("average")
df
average | deviation | first | first3 | last3 | repeat | min5 | max5 | code | run | |
---|---|---|---|---|---|---|---|---|---|---|
legend | ||||||||||
cffi-linreg-custom-float32 | 3.891910e-07 | 2.267541e-07 | 5.608000e-06 | 2.196000e-06 | 3.763333e-07 | 1000 | 3.170000e-07 | 5.770000e-07 | linreg_custom_float(cptr_x, cptr_out) | 100 |
cffi-linreg-custom-float32-simd | 4.189890e-07 | 3.871792e-07 | 1.091200e-05 | 3.949667e-06 | 2.996667e-07 | 1000 | 2.990000e-07 | 6.310000e-07 | linreg_custom_float_simd(cptr_x, cptr_out) | 100 |
cffi-linreg-custom | 4.665150e-07 | 8.519581e-07 | 2.679900e-05 | 9.352667e-06 | 3.256667e-07 | 1000 | 3.150000e-07 | 7.150000e-07 | linreg_custom(cptr_x, cptr_out) | 100 |
numba-linreg-type-float32 | 7.070790e-07 | 2.686359e-07 | 1.162000e-06 | 1.083333e-06 | 5.663333e-07 | 1000 | 5.650000e-07 | 1.249000e-06 | predict_clr_numba_cast_float(x32, c32, i32) | 100 |
cython-linreg-type | 7.213150e-07 | 3.991047e-07 | 1.252000e-06 | 8.300000e-07 | 5.513333e-07 | 1000 | 5.330000e-07 | 1.443000e-06 | predict_clr_cython_type(x[0], clr.coef_, clr.i... | 100 |
numba-linreg-type-custom | 8.243540e-07 | 3.713608e-07 | 7.940000e-07 | 7.156667e-07 | 6.543333e-07 | 1000 | 6.520000e-07 | 1.558000e-06 | predict_clr_numba_cast_custom(x[0]) | 100 |
cffi-linreg | 8.313670e-07 | 7.080831e-07 | 6.414000e-06 | 3.244000e-06 | 4.170000e-07 | 1000 | 4.160000e-07 | 1.519000e-06 | lib.linreg(n, cptr_x, cptr_coef, clr.intercept... | 100 |
numba-linreg-type | 9.482040e-07 | 4.114651e-07 | 9.350000e-07 | 8.663333e-07 | 7.596667e-07 | 1000 | 7.590000e-07 | 1.678000e-06 | predict_clr_numba_cast(x[0], clr.coef_, clr.in... | 100 |
python-linreg-custom | 2.018942e-06 | 6.704544e-07 | 5.511000e-06 | 4.254667e-06 | 1.703667e-06 | 1000 | 1.696000e-06 | 2.731000e-06 | predict_clr_python(z) | 100 |
cython-linreg | 2.706254e-06 | 1.597806e-06 | 5.083000e-06 | 5.419333e-06 | 2.126000e-06 | 1000 | 1.920000e-06 | 7.194000e-06 | predict_clr_cython(x[0], clr.coef_, clr.interc... | 100 |
python-linreg | 3.539523e-06 | 1.306156e-06 | 8.761000e-06 | 7.510000e-06 | 2.779667e-06 | 1000 | 2.681000e-06 | 6.164000e-06 | predict_clr_python_loop(z, coef, intercept) | 100 |
cffi-linreg-custom-float wrapped | 5.123886e-06 | 1.598363e-06 | 1.200400e-05 | 1.176767e-05 | 4.483000e-06 | 1000 | 4.477000e-06 | 6.436000e-06 | predict_clr_custom(x32) | 100 |
numba-linreg-type-numpy | 5.147404e-06 | 1.775723e-06 | 1.874100e-05 | 1.572433e-05 | 4.474333e-06 | 1000 | 4.374000e-06 | 5.996000e-06 | predict_clr_numba_numpy(x[0], clr.coef_, clr.i... | 100 |
cffi-linreg-custom wrapped | 5.274568e-06 | 1.823247e-06 | 2.166200e-05 | 2.268700e-05 | 5.626667e-06 | 1000 | 4.422000e-06 | 7.773000e-06 | predict_clr_custom(x) | 100 |
cffi-linreg-wrapped | 7.519599e-06 | 2.343424e-06 | 1.580000e-05 | 2.028933e-05 | 6.263333e-06 | 1000 | 6.201000e-06 | 1.041900e-05 | predict_clr(x, clr) | 100 |
numpy-linreg-numpy | 8.081947e-06 | 3.442724e-06 | 3.679000e-05 | 3.075167e-05 | 6.525667e-06 | 1000 | 6.442000e-06 | 1.216200e-05 | predict_clr_numpy(z, coef, clr.intercept_) | 100 |
onnxruntime-float64 | 1.269215e-05 | 1.926911e-06 | 1.742200e-05 | 1.337233e-05 | 1.133667e-05 | 1000 | 1.129500e-05 | 1.623200e-05 | predict_onnxrt(x.astype(numpy.float32)) | 100 |
onnxruntime-float32 | 1.299773e-05 | 7.686900e-06 | 2.281400e-05 | 1.689933e-05 | 1.009533e-05 | 1000 | 9.713000e-06 | 2.363700e-05 | predict_onnxrt(x32) | 100 |
numba-linreg-notype | 2.376539e-05 | 7.362380e-06 | 3.079800e-05 | 2.445400e-05 | 3.723367e-05 | 1000 | 1.998900e-05 | 3.763900e-05 | predict_clr_numba(z, clr.coef_, clr.intercept_) | 100 |
sklearn.predict | 4.550096e-05 | 6.337585e-06 | 7.724200e-05 | 6.447133e-05 | 4.143867e-05 | 1000 | 4.087300e-05 | 5.295400e-05 | clr.predict(z) | 100 |
On enlève quelques colonnes et on rappelle :
# Keep only the most informative columns for the recap.
cols = ["average", "deviation", "min5", "max5", "run", "code"]
df[cols]
average | deviation | min5 | max5 | run | code | |
---|---|---|---|---|---|---|
legend | ||||||
cffi-linreg-custom-float32 | 3.891910e-07 | 2.267541e-07 | 3.170000e-07 | 5.770000e-07 | 100 | linreg_custom_float(cptr_x, cptr_out) |
cffi-linreg-custom-float32-simd | 4.189890e-07 | 3.871792e-07 | 2.990000e-07 | 6.310000e-07 | 100 | linreg_custom_float_simd(cptr_x, cptr_out) |
cffi-linreg-custom | 4.665150e-07 | 8.519581e-07 | 3.150000e-07 | 7.150000e-07 | 100 | linreg_custom(cptr_x, cptr_out) |
numba-linreg-type-float32 | 7.070790e-07 | 2.686359e-07 | 5.650000e-07 | 1.249000e-06 | 100 | predict_clr_numba_cast_float(x32, c32, i32) |
cython-linreg-type | 7.213150e-07 | 3.991047e-07 | 5.330000e-07 | 1.443000e-06 | 100 | predict_clr_cython_type(x[0], clr.coef_, clr.i... |
numba-linreg-type-custom | 8.243540e-07 | 3.713608e-07 | 6.520000e-07 | 1.558000e-06 | 100 | predict_clr_numba_cast_custom(x[0]) |
cffi-linreg | 8.313670e-07 | 7.080831e-07 | 4.160000e-07 | 1.519000e-06 | 100 | lib.linreg(n, cptr_x, cptr_coef, clr.intercept... |
numba-linreg-type | 9.482040e-07 | 4.114651e-07 | 7.590000e-07 | 1.678000e-06 | 100 | predict_clr_numba_cast(x[0], clr.coef_, clr.in... |
python-linreg-custom | 2.018942e-06 | 6.704544e-07 | 1.696000e-06 | 2.731000e-06 | 100 | predict_clr_python(z) |
cython-linreg | 2.706254e-06 | 1.597806e-06 | 1.920000e-06 | 7.194000e-06 | 100 | predict_clr_cython(x[0], clr.coef_, clr.interc... |
python-linreg | 3.539523e-06 | 1.306156e-06 | 2.681000e-06 | 6.164000e-06 | 100 | predict_clr_python_loop(z, coef, intercept) |
cffi-linreg-custom-float wrapped | 5.123886e-06 | 1.598363e-06 | 4.477000e-06 | 6.436000e-06 | 100 | predict_clr_custom(x32) |
numba-linreg-type-numpy | 5.147404e-06 | 1.775723e-06 | 4.374000e-06 | 5.996000e-06 | 100 | predict_clr_numba_numpy(x[0], clr.coef_, clr.i... |
cffi-linreg-custom wrapped | 5.274568e-06 | 1.823247e-06 | 4.422000e-06 | 7.773000e-06 | 100 | predict_clr_custom(x) |
cffi-linreg-wrapped | 7.519599e-06 | 2.343424e-06 | 6.201000e-06 | 1.041900e-05 | 100 | predict_clr(x, clr) |
numpy-linreg-numpy | 8.081947e-06 | 3.442724e-06 | 6.442000e-06 | 1.216200e-05 | 100 | predict_clr_numpy(z, coef, clr.intercept_) |
onnxruntime-float64 | 1.269215e-05 | 1.926911e-06 | 1.129500e-05 | 1.623200e-05 | 100 | predict_onnxrt(x.astype(numpy.float32)) |
onnxruntime-float32 | 1.299773e-05 | 7.686900e-06 | 9.713000e-06 | 2.363700e-05 | 100 | predict_onnxrt(x32) |
numba-linreg-notype | 2.376539e-05 | 7.362380e-06 | 1.998900e-05 | 3.763900e-05 | 100 | predict_clr_numba(z, clr.coef_, clr.intercept_) |
sklearn.predict | 4.550096e-05 | 6.337585e-06 | 4.087300e-05 | 5.295400e-05 | 100 | clr.predict(z) |
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(14,6))
df[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
ax.grid(b=True, which="major")
ax.grid(b=True, which="minor");
<ipython-input-73-4cda8b1055aa>:7: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later. ax.grid(b=True, which="major") <ipython-input-73-4cda8b1055aa>:8: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later. ax.grid(b=True, which="minor");
Le graphique précédent montre que la fonction predict
de scikit-learn est la plus lente. La première raison est que ce code est valable pour toutes les régressions linéaires alors que toutes les autres fonctions sont spécialisées pour un seul modèle. La seconde raison est que le code de scikit-learn est optimisé pour le calcul de plusieurs prédictions à la fois alors que toutes les autres fonctions n'en calculent qu'une seule (scénario dit one-off). On compare à ce que donnerait une version purement python et numpy.
def predict_clr_python_loop_multi(x, coef, intercept):
    """Pure-Python batch prediction: one dot product per row.

    Expects a 2D array *x*; returns an (n, 1) column of predictions
    intercept + x[i] . coef.
    """
    n_rows = x.shape[0]
    res = numpy.zeros((n_rows, 1))
    res[:, 0] = intercept
    for row in range(n_rows):
        acc = 0.0
        for a, b in zip(x[row, :], coef):
            acc += a * b
        res[row, 0] += acc
    return res
# Sanity check on two rows of the test set.
predict_clr_python_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[214.72477745], [175.29091463]])
def predict_clr_numpy_loop_multi(x, coef, intercept):
    """Vectorised batch prediction.

    Computes intercept + x @ coef for a 2D input and returns the result
    as an (n, 1) column vector.
    """
    w = coef.reshape((-1, 1))
    base = numpy.ones((x.shape[0], 1)) * intercept
    return base + (x @ w)
# Same two rows: must match the pure-Python version above.
predict_clr_numpy_loop_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
array([[214.72477745], [175.29091463]])
def predict_clr_numba_cast_multi(X, coef, intercept):
    """Batch prediction by calling the single-row numba kernel on each row."""
    preds = []
    for row in X:
        preds.append(predict_clr_numba_cast(row, coef, intercept))
    return preds
# Returns a plain list of floats rather than an array.
predict_clr_numba_cast_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[214.724777447606, 175.29091463098356]
def predict_clr_cython_type_multi(X, coef, intercept):
    """Batch prediction by calling the single-row cython kernel on each row."""
    preds = []
    for row in X:
        preds.append(predict_clr_cython_type(row, coef, intercept))
    return preds
# Returns a plain list of floats rather than an array.
predict_clr_cython_type_multi(diabetes_X_test[:2], clr.coef_, clr.intercept_)
[214.724777447606, 175.29091463098356]
# Benchmark each implementation over increasing batch sizes to see how the
# per-observation cost evolves. The slow pure-Python/cython/numba variants
# are only run for small batches.
memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
for i in batch:
    # Build a batch of i rows, tiling the test set when i exceeds its size.
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]
    print("batch", "=", i)
    # Fewer repetitions for large batches to keep total runtime reasonable.
    repeat=20 if i >= 5000 else 100
    memo.append(timeexe("sklearn.predict %d" % i, "clr.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"
    if i <= 1000:
        # very slow
        memo.append(timeexe("python %d" % i, "predict_clr_python_loop_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=20, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "python"
    memo.append(timeexe("numpy %d" % i, "predict_clr_numpy_loop_multi(mx, clr.coef_, clr.intercept_)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "numpy"
    if i <= 10000:
        # very slow
        memo.append(timeexe("numba %d" % i, "predict_clr_numba_cast_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "numba"
    if i <= 1000:
        # very slow
        memo.append(timeexe("cython %d" % i, "predict_clr_cython_type_multi(mx, clr.coef_, clr.intercept_)",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "cython"
    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i, "predict_onnxrt(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"
batch = 1 Moyenne: 62.45 µs Ecart-type 33.39 µs (with 10 runs) in [41.71 µs, 119.15 µs] Moyenne: 6.43 µs Ecart-type 496.70 ns (with 10 runs) in [6.26 µs, 8.59 µs] Moyenne: 12.10 µs Ecart-type 8.26 µs (with 10 runs) in [8.20 µs, 26.70 µs] Moyenne: 1.99 µs Ecart-type 259.55 ns (with 10 runs) in [1.95 µs, 1.99 µs] Moyenne: 1.61 µs Ecart-type 103.91 ns (with 10 runs) in [1.59 µs, 1.63 µs] Moyenne: 19.38 µs Ecart-type 11.88 µs (with 10 runs) in [13.14 µs, 43.26 µs] batch = 10 Moyenne: 73.96 µs Ecart-type 41.93 µs (with 10 runs) in [43.50 µs, 116.26 µs] Moyenne: 101.06 µs Ecart-type 3.73 µs (with 10 runs) in [98.16 µs, 111.41 µs] Moyenne: 19.06 µs Ecart-type 31.16 µs (with 10 runs) in [11.86 µs, 25.57 µs] Moyenne: 10.84 µs Ecart-type 5.26 µs (with 10 runs) in [8.34 µs, 22.70 µs] Moyenne: 10.46 µs Ecart-type 5.49 µs (with 10 runs) in [5.69 µs, 20.84 µs] Moyenne: 19.66 µs Ecart-type 25.05 µs (with 10 runs) in [12.23 µs, 34.34 µs] batch = 100 Moyenne: 68.65 µs Ecart-type 26.04 µs (with 10 runs) in [46.99 µs, 119.00 µs] Moyenne: 740.30 µs Ecart-type 156.38 µs (with 10 runs) in [512.14 µs, 1.02 ms] Moyenne: 10.75 µs Ecart-type 3.89 µs (with 10 runs) in [8.72 µs, 16.79 µs] Moyenne: 94.32 µs Ecart-type 14.42 µs (with 10 runs) in [72.11 µs, 124.00 µs] Moyenne: 67.23 µs Ecart-type 31.05 µs (with 10 runs) in [43.72 µs, 135.94 µs] Moyenne: 91.28 µs Ecart-type 164.49 µs (with 10 runs) in [15.53 µs, 481.48 µs] batch = 200 Moyenne: 68.82 µs Ecart-type 38.95 µs (with 10 runs) in [46.58 µs, 152.87 µs] Moyenne: 1.59 ms Ecart-type 497.08 µs (with 10 runs) in [1.09 ms, 2.98 ms] Moyenne: 11.66 µs Ecart-type 2.01 µs (with 10 runs) in [9.79 µs, 16.71 µs] Moyenne: 167.67 µs Ecart-type 37.37 µs (with 10 runs) in [133.64 µs, 240.53 µs] Moyenne: 102.09 µs Ecart-type 25.18 µs (with 10 runs) in [86.07 µs, 162.09 µs] Moyenne: 18.04 µs Ecart-type 8.31 µs (with 10 runs) in [15.00 µs, 34.57 µs] batch = 500 Moyenne: 63.53 µs Ecart-type 20.92 µs (with 10 runs) in [50.94 µs, 116.69 µs] Moyenne: 3.22 ms 
Ecart-type 296.30 µs (with 10 runs) in [2.84 ms, 3.80 ms] Moyenne: 13.91 µs Ecart-type 4.58 µs (with 10 runs) in [11.80 µs, 26.71 µs] Moyenne: 410.88 µs Ecart-type 73.68 µs (with 10 runs) in [333.06 µs, 523.19 µs] Moyenne: 263.08 µs Ecart-type 117.22 µs (with 10 runs) in [211.75 µs, 444.83 µs] Moyenne: 22.28 µs Ecart-type 12.93 µs (with 10 runs) in [19.16 µs, 37.56 µs] batch = 1000 Moyenne: 153.47 µs Ecart-type 43.85 µs (with 10 runs) in [125.94 µs, 229.51 µs] Moyenne: 5.52 ms Ecart-type 389.98 µs (with 10 runs) in [4.99 ms, 6.18 ms] Moyenne: 83.03 µs Ecart-type 22.52 µs (with 10 runs) in [73.67 µs, 95.91 µs] Moyenne: 702.77 µs Ecart-type 76.26 µs (with 10 runs) in [661.35 µs, 888.11 µs] Moyenne: 445.87 µs Ecart-type 53.38 µs (with 10 runs) in [420.78 µs, 548.59 µs] Moyenne: 27.48 µs Ecart-type 6.31 µs (with 10 runs) in [26.53 µs, 29.87 µs] batch = 2000 Moyenne: 147.73 µs Ecart-type 19.47 µs (with 10 runs) in [132.10 µs, 187.47 µs] Moyenne: 83.71 µs Ecart-type 4.68 µs (with 10 runs) in [79.25 µs, 93.14 µs] Moyenne: 1.58 ms Ecart-type 216.25 µs (with 10 runs) in [1.32 ms, 1.97 ms] Moyenne: 47.31 µs Ecart-type 20.65 µs (with 10 runs) in [37.00 µs, 97.03 µs] batch = 3000 Moyenne: 179.79 µs Ecart-type 45.35 µs (with 10 runs) in [144.02 µs, 310.74 µs] Moyenne: 92.27 µs Ecart-type 7.05 µs (with 10 runs) in [84.71 µs, 106.64 µs] Moyenne: 2.37 ms Ecart-type 267.39 µs (with 10 runs) in [1.99 ms, 2.91 ms] Moyenne: 50.69 µs Ecart-type 6.32 µs (with 10 runs) in [48.65 µs, 52.22 µs] batch = 4000 Moyenne: 193.02 µs Ecart-type 28.74 µs (with 10 runs) in [173.71 µs, 211.52 µs] Moyenne: 100.06 µs Ecart-type 22.27 µs (with 10 runs) in [85.61 µs, 133.38 µs] Moyenne: 3.13 ms Ecart-type 296.38 µs (with 10 runs) in [2.73 ms, 3.54 ms] Moyenne: 64.67 µs Ecart-type 7.43 µs (with 10 runs) in [59.90 µs, 68.08 µs] batch = 5000 Moyenne: 215.06 µs Ecart-type 46.52 µs (with 10 runs) in [196.06 µs, 411.19 µs] Moyenne: 110.91 µs Ecart-type 8.08 µs (with 10 runs) in [90.36 µs, 122.94 µs] Moyenne: 
3.49 ms Ecart-type 212.13 µs (with 10 runs) in [3.30 ms, 4.04 ms] Moyenne: 78.86 µs Ecart-type 5.47 µs (with 10 runs) in [77.15 µs, 102.21 µs] batch = 10000 Moyenne: 248.75 µs Ecart-type 64.14 µs (with 10 runs) in [192.57 µs, 425.01 µs] Moyenne: 116.55 µs Ecart-type 17.05 µs (with 10 runs) in [100.13 µs, 152.60 µs] Moyenne: 7.18 ms Ecart-type 420.77 µs (with 10 runs) in [6.62 ms, 8.15 ms] Moyenne: 153.30 µs Ecart-type 13.69 µs (with 10 runs) in [149.03 µs, 211.69 µs] batch = 20000 Moyenne: 293.81 µs Ecart-type 19.49 µs (with 10 runs) in [283.46 µs, 364.31 µs] Moyenne: 147.12 µs Ecart-type 8.23 µs (with 10 runs) in [135.43 µs, 160.67 µs] Moyenne: 215.69 µs Ecart-type 14.46 µs (with 10 runs) in [204.68 µs, 262.99 µs] batch = 50000 Moyenne: 1.00 ms Ecart-type 44.28 µs (with 10 runs) in [967.01 µs, 1.13 ms] Moyenne: 503.33 µs Ecart-type 13.21 µs (with 10 runs) in [491.66 µs, 551.81 µs] Moyenne: 1.86 ms Ecart-type 1.14 ms (with 10 runs) in [1.13 ms, 4.90 ms] batch = 75000 Moyenne: 1.75 ms Ecart-type 153.53 µs (with 10 runs) in [1.56 ms, 1.94 ms] Moyenne: 663.38 µs Ecart-type 20.47 µs (with 10 runs) in [630.15 µs, 700.62 µs] Moyenne: 1.88 ms Ecart-type 173.99 µs (with 10 runs) in [1.65 ms, 2.14 ms] batch = 100000 Moyenne: 2.56 ms Ecart-type 204.42 µs (with 10 runs) in [2.27 ms, 2.85 ms] Moyenne: 1.21 ms Ecart-type 113.75 µs (with 10 runs) in [1.04 ms, 1.44 ms] Moyenne: 2.98 ms Ecart-type 934.23 µs (with 10 runs) in [2.22 ms, 6.31 ms] batch = 150000 Moyenne: 4.00 ms Ecart-type 188.08 µs (with 10 runs) in [3.78 ms, 4.46 ms] Moyenne: 2.92 ms Ecart-type 344.26 µs (with 10 runs) in [2.54 ms, 3.93 ms] Moyenne: 3.76 ms Ecart-type 308.56 µs (with 10 runs) in [3.26 ms, 4.52 ms] batch = 200000 Moyenne: 5.73 ms Ecart-type 424.36 µs (with 10 runs) in [5.17 ms, 6.72 ms] Moyenne: 4.00 ms Ecart-type 606.67 µs (with 10 runs) in [3.50 ms, 6.04 ms] Moyenne: 5.44 ms Ecart-type 742.52 µs (with 10 runs) in [4.57 ms, 7.38 ms] batch = 300000 Moyenne: 8.36 ms Ecart-type 1.26 ms (with 10 runs) 
in [7.78 ms, 13.52 ms] Moyenne: 5.37 ms Ecart-type 352.34 µs (with 10 runs) in [5.08 ms, 6.64 ms] Moyenne: 7.18 ms Ecart-type 680.24 µs (with 10 runs) in [6.69 ms, 8.83 ms] batch = 400000 Moyenne: 11.49 ms Ecart-type 1.16 ms (with 10 runs) in [10.36 ms, 15.15 ms] Moyenne: 7.87 ms Ecart-type 709.04 µs (with 10 runs) in [7.18 ms, 9.70 ms] Moyenne: 10.51 ms Ecart-type 900.27 µs (with 10 runs) in [9.41 ms, 13.22 ms] batch = 500000 Moyenne: 15.01 ms Ecart-type 1.90 ms (with 10 runs) in [12.99 ms, 20.81 ms] Moyenne: 11.02 ms Ecart-type 889.69 µs (with 10 runs) in [9.64 ms, 13.29 ms] Moyenne: 17.02 ms Ecart-type 2.13 ms (with 10 runs) in [14.72 ms, 22.19 ms] batch = 600000 Moyenne: 21.19 ms Ecart-type 1.93 ms (with 10 runs) in [18.32 ms, 26.29 ms] Moyenne: 12.47 ms Ecart-type 964.03 µs (with 10 runs) in [11.00 ms, 14.31 ms] Moyenne: 18.04 ms Ecart-type 2.80 ms (with 10 runs) in [13.37 ms, 24.63 ms]
# One row per batch size, one column per implementation.
dfb = pandas.DataFrame(memo)[["average", "lib", "batch"]]
# DataFrame.pivot arguments are keyword-only since pandas 2.0; the
# positional form used here previously now raises a TypeError.
piv = dfb.pivot(index="batch", columns="lib", values="average")
piv
lib | cython | numba | numpy | onnxruntime | python | sklearn |
---|---|---|---|---|---|---|
batch | ||||||
1 | 0.000002 | 0.000002 | 0.000012 | 0.000019 | 0.000006 | 0.000062 |
10 | 0.000010 | 0.000011 | 0.000019 | 0.000020 | 0.000101 | 0.000074 |
100 | 0.000067 | 0.000094 | 0.000011 | 0.000091 | 0.000740 | 0.000069 |
200 | 0.000102 | 0.000168 | 0.000012 | 0.000018 | 0.001590 | 0.000069 |
500 | 0.000263 | 0.000411 | 0.000014 | 0.000022 | 0.003225 | 0.000064 |
1000 | 0.000446 | 0.000703 | 0.000083 | 0.000027 | 0.005516 | 0.000153 |
2000 | NaN | 0.001580 | 0.000084 | 0.000047 | NaN | 0.000148 |
3000 | NaN | 0.002371 | 0.000092 | 0.000051 | NaN | 0.000180 |
4000 | NaN | 0.003125 | 0.000100 | 0.000065 | NaN | 0.000193 |
5000 | NaN | 0.003490 | 0.000111 | 0.000079 | NaN | 0.000215 |
10000 | NaN | 0.007181 | 0.000117 | 0.000153 | NaN | 0.000249 |
20000 | NaN | NaN | 0.000147 | 0.000216 | NaN | 0.000294 |
50000 | NaN | NaN | 0.000503 | 0.001863 | NaN | 0.001000 |
75000 | NaN | NaN | 0.000663 | 0.001879 | NaN | 0.001749 |
100000 | NaN | NaN | 0.001209 | 0.002980 | NaN | 0.002557 |
150000 | NaN | NaN | 0.002923 | 0.003762 | NaN | 0.004001 |
200000 | NaN | NaN | 0.004001 | 0.005440 | NaN | 0.005731 |
300000 | NaN | NaN | 0.005366 | 0.007180 | NaN | 0.008365 |
400000 | NaN | NaN | 0.007872 | 0.010510 | NaN | 0.011489 |
500000 | NaN | NaN | 0.011016 | 0.017021 | NaN | 0.015013 |
600000 | NaN | NaN | 0.012468 | 0.018040 | NaN | 0.021193 |
# Per-observation time: divide each timing column by the batch size (index).
for col in list(piv.columns):
    piv["ave_" + col] = piv[col] / piv.index
piv
lib | cython | numba | numpy | onnxruntime | python | sklearn | ave_cython | ave_numba | ave_numpy | ave_onnxruntime | ave_python | ave_sklearn |
---|---|---|---|---|---|---|---|---|---|---|---|---|
batch | ||||||||||||
1 | 0.000002 | 0.000002 | 0.000012 | 0.000019 | 0.000006 | 0.000062 | 1.614900e-06 | 1.990900e-06 | 1.209790e-05 | 1.938460e-05 | 0.000006 | 6.245340e-05 |
10 | 0.000010 | 0.000011 | 0.000019 | 0.000020 | 0.000101 | 0.000074 | 1.046460e-06 | 1.084320e-06 | 1.906010e-06 | 1.965810e-06 | 0.000010 | 7.396440e-06 |
100 | 0.000067 | 0.000094 | 0.000011 | 0.000091 | 0.000740 | 0.000069 | 6.722760e-07 | 9.431990e-07 | 1.075410e-07 | 9.127790e-07 | 0.000007 | 6.865190e-07 |
200 | 0.000102 | 0.000168 | 0.000012 | 0.000018 | 0.001590 | 0.000069 | 5.104525e-07 | 8.383455e-07 | 5.827850e-08 | 9.019900e-08 | 0.000008 | 3.440995e-07 |
500 | 0.000263 | 0.000411 | 0.000014 | 0.000022 | 0.003225 | 0.000064 | 5.261610e-07 | 8.217592e-07 | 2.781740e-08 | 4.455220e-08 | 0.000006 | 1.270610e-07 |
1000 | 0.000446 | 0.000703 | 0.000083 | 0.000027 | 0.005516 | 0.000153 | 4.458687e-07 | 7.027674e-07 | 8.303090e-08 | 2.747640e-08 | 0.000006 | 1.534708e-07 |
2000 | NaN | 0.001580 | 0.000084 | 0.000047 | NaN | 0.000148 | NaN | 7.899395e-07 | 4.185515e-08 | 2.365645e-08 | NaN | 7.386540e-08 |
3000 | NaN | 0.002371 | 0.000092 | 0.000051 | NaN | 0.000180 | NaN | 7.902492e-07 | 3.075760e-08 | 1.689707e-08 | NaN | 5.992867e-08 |
4000 | NaN | 0.003125 | 0.000100 | 0.000065 | NaN | 0.000193 | NaN | 7.813673e-07 | 2.501480e-08 | 1.616818e-08 | NaN | 4.825388e-08 |
5000 | NaN | 0.003490 | 0.000111 | 0.000079 | NaN | 0.000215 | NaN | 6.979748e-07 | 2.218220e-08 | 1.577170e-08 | NaN | 4.301210e-08 |
10000 | NaN | 0.007181 | 0.000117 | 0.000153 | NaN | 0.000249 | NaN | 7.180820e-07 | 1.165535e-08 | 1.533050e-08 | NaN | 2.487490e-08 |
20000 | NaN | NaN | 0.000147 | 0.000216 | NaN | 0.000294 | NaN | NaN | 7.356025e-09 | 1.078465e-08 | NaN | 1.469057e-08 |
50000 | NaN | NaN | 0.000503 | 0.001863 | NaN | 0.001000 | NaN | NaN | 1.006655e-08 | 3.725768e-08 | NaN | 2.000188e-08 |
75000 | NaN | NaN | 0.000663 | 0.001879 | NaN | 0.001749 | NaN | NaN | 8.845087e-09 | 2.505991e-08 | NaN | 2.331396e-08 |
100000 | NaN | NaN | 0.001209 | 0.002980 | NaN | 0.002557 | NaN | NaN | 1.208690e-08 | 2.980086e-08 | NaN | 2.556766e-08 |
150000 | NaN | NaN | 0.002923 | 0.003762 | NaN | 0.004001 | NaN | NaN | 1.948814e-08 | 2.508106e-08 | NaN | 2.667062e-08 |
200000 | NaN | NaN | 0.004001 | 0.005440 | NaN | 0.005731 | NaN | NaN | 2.000416e-08 | 2.720136e-08 | NaN | 2.865267e-08 |
300000 | NaN | NaN | 0.005366 | 0.007180 | NaN | 0.008365 | NaN | NaN | 1.788538e-08 | 2.393301e-08 | NaN | 2.788189e-08 |
400000 | NaN | NaN | 0.007872 | 0.010510 | NaN | 0.011489 | NaN | NaN | 1.967972e-08 | 2.627497e-08 | NaN | 2.872169e-08 |
500000 | NaN | NaN | 0.011016 | 0.017021 | NaN | 0.015013 | NaN | NaN | 2.203297e-08 | 3.404131e-08 | NaN | 3.002589e-08 |
600000 | NaN | NaN | 0.012468 | 0.018040 | NaN | 0.021193 | NaN | NaN | 2.077927e-08 | 3.006664e-08 | NaN | 3.532122e-08 |
# Plot only the per-observation averages (columns prefixed with "ave_").
# List comprehension instead of list(generator) — same result, clearer idiom.
libs = [c for c in piv.columns if "ave_" in c]
ax = piv.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch")
ax.grid(True);
Le minimum obtenu est pour $10^{-8} s$ soit 10 ns. Cela montre que la comparaison précédente était incomplète voire biaisée. Tout dépend de l'usage qu'on fait de la fonction de prédiction même s'il sera toujours possible d'écrire un code spécialisé plus rapide que toute autre fonction générique. En général, plus on reste du côté Python, plus le programme est lent. Le nombre de passages de l'un à l'autre, selon la façon dont ils sont faits, ralentit aussi le programme. En tenant compte de cela, le programme rouge sera plus lent que le vert.
# Display an illustration stored alongside the notebook (Python/C boundary
# diagram); pyquickhelper is a notebook helper package.
from pyquickhelper.helpgen import NbImage
NbImage("pycpp.png")
Ces résultats sont d'une façon générale assez volatile car le temps de calcul est enrobé dans plusieurs fonctions Python qui rendent une mesure précise difficile. Il reste néanmoins une bonne idée des ordres de grandeurs.
On reproduit les mêmes résultats pour une random forest mais la réécriture n'est plus aussi simple qu'une régression linéaire.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# Reload the dataset and hold out the last 20 rows for testing.
diabetes = load_diabetes()
split = -20
diabetes_X_train = diabetes.data[:split]
diabetes_X_test = diabetes.data[split:]
diabetes_y_train = diabetes.target[:split]
diabetes_y_test = diabetes.target[split:]

# A small forest (10 trees) keeps training and prediction times manageable.
rf = RandomForestRegressor(n_estimators=10)
rf.fit(diabetes_X_train, diabetes_y_train)
RandomForestRegressor(n_estimators=10)
# Time a single one-off prediction with the random forest.
memo_time = []
x = diabetes_X_test[:1]
memo_time.append(timeexe("sklearn-rf", "rf.predict(x)", repeat=100, number=20))
Moyenne: 980.23 µs Ecart-type 60.93 µs (with 20 runs) in [937.55 µs, 1.11 ms]
C'est beaucoup plus long que la régression linéaire. On essaye avec onnx.
if ok_onnx:
    # Convert the random forest to ONNX; the input signature reuses the
    # linear model's coefficient count (clr.coef_.shape[0]) as the number
    # of features — both models were trained on the same 10 features.
    onnxrf_model = convert_sklearn(
        rf, 'model', [('input', FloatTensorType([None, clr.coef_.shape[0]]))],
        target_opset=11)
    # Pin ir_version for compatibility with the installed onnxruntime.
    onnxrf_model.ir_version = 6
    save_model(onnxrf_model, 'model_rf.onnx')
    model_onnx = onnx.load('model_rf.onnx')
if ok_onnx:
    # Load the serialized forest and show its input/output signature.
    sess = onnxruntime.InferenceSession("model_rf.onnx")
    for i in sess.get_inputs():
        print('Input:', i)
    for o in sess.get_outputs():
        print('Output:', o)

    def predict_onnxrt_rf(x):
        # 'variable' is the single output declared in the ONNX graph.
        return sess.run(["variable"], {'input': x})

    print(predict_onnxrt_rf(x.astype(numpy.float32)))
    memo_time.append(timeexe("onnx-rf", "predict_onnxrt_rf(x.astype(numpy.float32))",
                             repeat=100, number=20))
Input: NodeArg(name='input', type='tensor(float)', shape=[None, 10]) Output: NodeArg(name='variable', type='tensor(float)', shape=[None, 1]) [array([[243.00002]], dtype=float32)] Moyenne: 14.36 µs Ecart-type 4.18 µs (with 20 runs) in [11.75 µs, 22.22 µs]
C'est beaucoup plus rapide.
import pandas

# Summarise the forest timings in one table, fastest first.
df2 = pandas.DataFrame(data=memo_time).set_index("legend").sort_values("average")
df2
average | deviation | first | first3 | last3 | repeat | min5 | max5 | code | run | |
---|---|---|---|---|---|---|---|---|---|---|
legend | ||||||||||
onnx-rf | 0.000014 | 0.000004 | 0.000047 | 0.000027 | 0.000014 | 100 | 0.000012 | 0.000022 | predict_onnxrt_rf(x.astype(numpy.float32)) | 20 |
sklearn-rf | 0.000980 | 0.000061 | 0.001308 | 0.001087 | 0.001075 | 100 | 0.000938 | 0.001106 | rf.predict(x) | 20 |
# Horizontal bar chart of the forest timings (log scale) with error bars.
fig, ax = plt.subplots(1, 1, figsize=(14,4))
df2[["average", "deviation"]].plot(kind="barh", logx=True, ax=ax, xerr="deviation",
                                   legend=False, fontsize=12, width=0.8)
ax.set_ylabel("")
# 'b' was renamed 'visible' in Matplotlib 3.5; the old name emitted a
# MatplotlibDeprecationWarning and is removed in later releases.
ax.grid(visible=True, which="major")
ax.grid(visible=True, which="minor");
<ipython-input-89-8d560b1fbb3b>:5: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later. ax.grid(b=True, which="major") <ipython-input-89-8d560b1fbb3b>:6: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later. ax.grid(b=True, which="minor");
# Same batch-size benchmark for the random forest (sklearn vs onnxruntime),
# restricted to the first 15 batch sizes (up to 100000) to bound runtime.
memo = []
batch = [1, 10, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000,
         20000, 50000, 75000, 100000, 150000, 200000, 300000, 400000,
         500000, 600000]
number = 10
repeat = 10
for i in batch[:15]:
    # Build a batch of i rows, tiling the test set when i exceeds its size.
    if i <= diabetes_X_test.shape[0]:
        mx = diabetes_X_test[:i]
    else:
        mxs = [diabetes_X_test] * (i // diabetes_X_test.shape[0] + 1)
        mx = numpy.vstack(mxs)
        mx = mx[:i]
    print("batch", "=", i)
    memo.append(timeexe("sklearn.predict %d" % i, "rf.predict(mx)",
                        repeat=repeat, number=number))
    memo[-1]["batch"] = i
    memo[-1]["lib"] = "sklearn"
    if ok_onnx:
        memo.append(timeexe("onnxruntime %d" % i,
                            "predict_onnxrt_rf(mx.astype(numpy.float32))",
                            repeat=repeat, number=number))
        memo[-1]["batch"] = i
        memo[-1]["lib"] = "onnxruntime"
batch = 1 Moyenne: 1.11 ms Ecart-type 145.19 µs (with 10 runs) in [1.03 ms, 1.54 ms] Moyenne: 15.70 µs Ecart-type 13.36 µs (with 10 runs) in [11.20 µs, 55.77 µs] batch = 10 Moyenne: 1.14 ms Ecart-type 162.36 µs (with 10 runs) in [952.57 µs, 1.51 ms] Moyenne: 25.55 µs Ecart-type 9.43 µs (with 10 runs) in [17.37 µs, 42.15 µs] batch = 100 Moyenne: 1.09 ms Ecart-type 80.51 µs (with 10 runs) in [1.01 ms, 1.31 ms] Moyenne: 38.04 µs Ecart-type 17.20 µs (with 10 runs) in [32.02 µs, 89.62 µs] batch = 200 Moyenne: 1.42 ms Ecart-type 126.30 µs (with 10 runs) in [1.15 ms, 1.71 ms] Moyenne: 82.17 µs Ecart-type 56.27 µs (with 10 runs) in [43.86 µs, 213.17 µs] batch = 500 Moyenne: 1.79 ms Ecart-type 543.34 µs (with 10 runs) in [1.31 ms, 3.18 ms] Moyenne: 130.31 µs Ecart-type 30.45 µs (with 10 runs) in [85.15 µs, 190.08 µs] batch = 1000 Moyenne: 1.53 ms Ecart-type 93.12 µs (with 10 runs) in [1.42 ms, 1.70 ms] Moyenne: 249.60 µs Ecart-type 23.96 µs (with 10 runs) in [232.24 µs, 312.27 µs] batch = 2000 Moyenne: 2.09 ms Ecart-type 149.23 µs (with 10 runs) in [1.89 ms, 2.33 ms] Moyenne: 393.37 µs Ecart-type 165.01 µs (with 10 runs) in [283.40 µs, 734.87 µs] batch = 3000 Moyenne: 2.77 ms Ecart-type 921.32 µs (with 10 runs) in [2.24 ms, 5.40 ms] Moyenne: 432.57 µs Ecart-type 16.08 µs (with 10 runs) in [422.71 µs, 479.76 µs] batch = 4000 Moyenne: 2.96 ms Ecart-type 331.99 µs (with 10 runs) in [2.63 ms, 3.69 ms] Moyenne: 1.04 ms Ecart-type 485.53 µs (with 10 runs) in [598.92 µs, 2.38 ms] batch = 5000 Moyenne: 3.27 ms Ecart-type 348.48 µs (with 10 runs) in [3.00 ms, 4.16 ms] Moyenne: 996.95 µs Ecart-type 207.84 µs (with 10 runs) in [767.12 µs, 1.47 ms] batch = 10000 Moyenne: 5.26 ms Ecart-type 404.81 µs (with 10 runs) in [4.96 ms, 6.34 ms] Moyenne: 1.75 ms Ecart-type 317.18 µs (with 10 runs) in [1.34 ms, 2.13 ms] batch = 20000 Moyenne: 10.52 ms Ecart-type 1.11 ms (with 10 runs) in [9.21 ms, 13.42 ms] Moyenne: 4.40 ms Ecart-type 522.54 µs (with 10 runs) in [3.52 ms, 5.43 ms] batch = 50000 
Moyenne: 24.33 ms Ecart-type 2.90 ms (with 10 runs) in [21.27 ms, 29.83 ms] Moyenne: 8.21 ms Ecart-type 1.31 ms (with 10 runs) in [7.32 ms, 11.74 ms] batch = 75000 Moyenne: 31.54 ms Ecart-type 251.81 µs (with 10 runs) in [31.19 ms, 32.06 ms] Moyenne: 12.22 ms Ecart-type 908.21 µs (with 10 runs) in [11.38 ms, 14.24 ms] batch = 100000 Moyenne: 42.05 ms Ecart-type 745.44 µs (with 10 runs) in [41.22 ms, 43.35 ms] Moyenne: 16.17 ms Ecart-type 1.01 ms (with 10 runs) in [14.98 ms, 17.54 ms]
# One row per batch size, one column per implementation.
dfbrf = pandas.DataFrame(memo)[["average", "lib", "batch"]]
# DataFrame.pivot arguments are keyword-only since pandas 2.0; the
# positional form used here previously now raises a TypeError.
pivrf = dfbrf.pivot(index="batch", columns="lib", values="average")
# Per-observation time: divide each timing column by the batch size.
for c in pivrf.columns:
    pivrf["ave_" + c] = pivrf[c] / pivrf.index
# List comprehension instead of list(generator) — same result, clearer idiom.
libs = [c for c in pivrf.columns if "ave_" in c]
ax = pivrf.plot(y=libs, logy=True, logx=True, figsize=(10, 5))
ax.set_title("Evolution du temps de prédiction selon la taille du batch\nrandom forest")
ax.grid(True);