Benchmark operator LeakyRelu#

The operator LeakyRelu is equivalent to the function: LeakyRelu(x) = \begin{cases} x & \text{if } x > 0 \\
\alpha x & \text{otherwise} \end{cases}. But it can be rewritten into the following decomposition LeakyRelu(x) = x (\mathbb{1}_{x > 0} + \alpha (1 - \mathbb{1}_{x > 0})) =
x ((1 - \alpha) \mathbb{1}_{x > 0} + \alpha). Let's compare the two implementations with onnx runtimes.

The ONNX graphs for both implementations of LeakyRelu#

import numpy
from numpy.testing import assert_almost_equal
import matplotlib.pyplot as plt
from pandas import DataFrame
from onnx import TensorProto
from onnxruntime import InferenceSession, get_device
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import (
    OnnxLeakyRelu, OnnxSign, OnnxMul, OnnxAdd, OnnxDiv,
    OnnxGreater, OnnxCast)
from cpyquickhelper.numbers.speed_measure import measure_time
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
from mlprodict.plotting.plotting import onnx_simple_text_plot
from onnxcustom.plotting.plotting_onnx import plot_onnxs
from tqdm import tqdm


# Show the available CPU optimisations and the device onnxruntime runs on.
print([code_optimisation(), get_device()])

Out:

['AVX-omp=8', 'CPU']

First implementation: the operator LeakyRelu.

def build_leaky_relu(alpha=0.5, target_opset=15):
    """Builds an ONNX model made of a single LeakyRelu node.

    :param alpha: slope applied to negative inputs
    :param target_opset: opset the resulting model targets
    :return: an ONNX ModelProto mapping input 'X' to output 'Y'
    """
    node = OnnxLeakyRelu(
        'X', alpha=alpha, output_names=['Y'], op_version=target_opset)
    model = node.to_onnx(
        {'X': FloatTensorType()},
        outputs={'Y': FloatTensorType()},
        target_opset=target_opset)
    return model


# Build the single-node model and print a compact text view of the graph.
onx_leaky = build_leaky_relu()
print(onnx_simple_text_plot(onx_leaky))

Out:

opset: domain='' version=15
input: name='X' type=dtype('float32') shape=()
LeakyRelu(X, alpha=0.50) -> Y
output: name='Y' type=dtype('float32') shape=()

Second option: the formula introduced above must be adapted because the ONNX operator Sign returns -1 if x is negative, not 0.

def build_leaky_relu_decomposed(alpha=0.5, target_opset=15):
    """Builds LeakyRelu decomposed as x * ((1 - alpha) * step(x) + alpha).

    step(x) = (Sign(x) + 1) / 2 maps negative inputs to 0 and positive
    inputs to 1, since Sign returns -1 for negatives (not 0).

    :param alpha: slope applied to negative inputs
    :param target_opset: opset the resulting model targets
    :return: an ONNX ModelProto mapping input 'X' to output 'Y'
    """
    one = numpy.array([1], dtype=numpy.float32)
    two = numpy.array([2], dtype=numpy.float32)
    raw_sign = OnnxSign('X', op_version=target_opset)
    # Remap sign from {-1, 1} to {0, 1}.
    step = OnnxDiv(
        OnnxAdd(raw_sign, one, op_version=target_opset),
        two, op_version=target_opset)
    # factor = (1 - alpha) * step + alpha, i.e. 1 for x > 0, alpha otherwise.
    factor = OnnxAdd(
        OnnxMul(step, numpy.array([1 - alpha], dtype=numpy.float32),
                op_version=target_opset),
        numpy.array([alpha], dtype=numpy.float32),
        op_version=target_opset)
    result = OnnxMul('X', factor, op_version=target_opset,
                     output_names=['Y'])
    return result.to_onnx({'X': FloatTensorType()},
                          outputs={'Y': FloatTensorType()},
                          target_opset=target_opset)


# Build the Sign-based decomposition and print its graph.
onx_leaky_dec = build_leaky_relu_decomposed()
print(onnx_simple_text_plot(onx_leaky_dec))

Out:

opset: domain='' version=15
input: name='X' type=dtype('float32') shape=()
init: name='Ad_Addcst' type=dtype('float32') shape=(1,) -- array([1.], dtype=float32)
init: name='Di_Divcst' type=dtype('float32') shape=(1,) -- array([2.], dtype=float32)
init: name='Mu_Mulcst' type=dtype('float32') shape=(1,) -- array([0.5], dtype=float32)
Identity(Mu_Mulcst) -> Ad_Addcst1
Sign(X) -> Si_output0
  Add(Si_output0, Ad_Addcst) -> Ad_C01
    Div(Ad_C01, Di_Divcst) -> Di_C0
      Mul(Di_C0, Mu_Mulcst) -> Mu_C0
  Add(Mu_C0, Ad_Addcst1) -> Ad_C0
    Mul(X, Ad_C0) -> Y
output: name='Y' type=dtype('float32') shape=()

Third option: use of operator Greater.

def build_leaky_relu_decomposed_greater(alpha=0.5, target_opset=15):
    """Builds LeakyRelu decomposed with Greater instead of Sign.

    step(x) = Cast(x > 0) is directly 0 for non-positive inputs and 1
    otherwise, so no rescaling is needed.

    :param alpha: slope applied to negative inputs
    :param target_opset: opset the resulting model targets
    :return: an ONNX ModelProto mapping input 'X' to output 'Y'
    """
    zero = numpy.array([0], dtype=numpy.float32)
    mask = OnnxGreater('X', zero, op_version=target_opset)
    # Greater outputs booleans; cast them back to float32.
    step = OnnxCast(mask, to=TensorProto.FLOAT, op_version=target_opset)
    # factor = (1 - alpha) * step + alpha, i.e. 1 for x > 0, alpha otherwise.
    factor = OnnxAdd(
        OnnxMul(step, numpy.array([1 - alpha], dtype=numpy.float32),
                op_version=target_opset),
        numpy.array([alpha], dtype=numpy.float32),
        op_version=target_opset)
    result = OnnxMul('X', factor, op_version=target_opset,
                     output_names=['Y'])
    return result.to_onnx({'X': FloatTensorType()},
                          outputs={'Y': FloatTensorType()},
                          target_opset=target_opset)


# Build the Greater-based decomposition and print its graph.
onx_leaky_dec_greater = build_leaky_relu_decomposed_greater()
print(onnx_simple_text_plot(onx_leaky_dec_greater))

Out:

opset: domain='' version=15
input: name='X' type=dtype('float32') shape=()
init: name='Gr_Greatercst' type=dtype('float32') shape=(1,) -- array([0.], dtype=float32)
init: name='Mu_Mulcst' type=dtype('float32') shape=(1,) -- array([0.5], dtype=float32)
Greater(X, Gr_Greatercst) -> Gr_C0
  Cast(Gr_C0, to=1) -> Ca_output0
    Mul(Ca_output0, Mu_Mulcst) -> Mu_C0
Identity(Mu_Mulcst) -> Ad_Addcst
  Add(Mu_C0, Ad_Addcst) -> Ad_C0
    Mul(X, Ad_C0) -> Y
output: name='Y' type=dtype('float32') shape=()

Visually

# Draw the three ONNX graphs side by side.
plot_onnxs(onx_leaky, onx_leaky_dec, onx_leaky_dec_greater,
           title=["One operator", "Decomposed\nLeakyRelu",
                  "Decomposed\nLeakyRelu Greater"])
One operator, Decomposed LeakyRelu, Decomposed LeakyRelu Greater

Out:

array([<AxesSubplot:title={'center':'One operator'}>,
       <AxesSubplot:title={'center':'Decomposed\nLeakyRelu'}>,
       <AxesSubplot:title={'center':'Decomposed\nLeakyRelu Greater'}>],
      dtype=object)

Check that all graphs return equivalent results#

# One onnxruntime session per model, all forced onto CPU so the
# comparison is not affected by provider differences.
sess1 = InferenceSession(onx_leaky.SerializeToString(),
                         providers=['CPUExecutionProvider'])
sess_dec = InferenceSession(onx_leaky_dec.SerializeToString(),
                            providers=['CPUExecutionProvider'])
sess_dec_greater = InferenceSession(onx_leaky_dec_greater.SerializeToString(),
                                    providers=['CPUExecutionProvider'])

# The three implementations must agree on random inputs of various ranks.
for shape in [(1, ), (10, ), (5, 5), (7, 2, 4)]:
    rnd = numpy.random.randn(*shape).astype(numpy.float32)
    res1 = sess1.run(None, {'X': rnd})[0]
    res_dec = sess_dec.run(None, {'X': rnd})[0]
    res_dec_greater = sess_dec_greater.run(None, {'X': rnd})[0]
    assert_almost_equal(res1, res_dec)
    assert_almost_equal(res1, res_dec_greater)

Benchmark#

# Benchmark the three sessions over several input shapes.
fcts = [('leakyrelu', sess1), ('dec', sess_dec),
        ('dec_greater', sess_dec_greater)]

N = 100
data = []
# Each dim is measured twice: once as the second axis (N, dim) and once
# as the first axis (dim, N).
for dim in tqdm([10, 128, 256, 512, 1000, 2000]):
    for shape in [(N, dim), (dim, N)]:
        rnd = numpy.random.randn(*shape).astype(numpy.float32)
        for name, sess in fcts:
            # Fewer repetitions for larger inputs to bound total runtime.
            repeat = int(4001 / dim)
            obs = measure_time(
                lambda: sess.run(None, {'X': rnd}),
                context=dict(rnd=rnd, sess=sess),
                div_by_number=True, repeat=repeat, number=200)
            obs['name'] = name
            obs['N'] = N
            obs['dim'] = dim
            # True when the first axis is the fixed dimension N.
            obs['orient'] = shape[0] == N
            obs['shape'] = "%dx%d" % shape
            data.append(obs)

# Gather the measurements into a dataframe and show the averages.
df = DataFrame(data)
df[['name', 'N', 'dim', 'average', 'deviation']]

print(df[['name', 'N', 'dim', 'average']])

Out:

  0%|          | 0/6 [00:00<?, ?it/s]
 17%|#6        | 1/6 [00:41<03:26, 41.24s/it]
 33%|###3      | 2/6 [00:48<01:24, 21.08s/it]
 50%|#####     | 3/6 [00:53<00:42, 14.01s/it]
 67%|######6   | 4/6 [00:59<00:21, 10.54s/it]
 83%|########3 | 5/6 [01:03<00:08,  8.43s/it]
100%|##########| 6/6 [01:08<00:00,  7.03s/it]
100%|##########| 6/6 [01:08<00:00, 11.34s/it]
           name    N   dim   average
0     leakyrelu  100    10  0.000071
1           dec  100    10  0.000099
2   dec_greater  100    10  0.000089
3     leakyrelu  100    10  0.000070
4           dec  100    10  0.000098
5   dec_greater  100    10  0.000089
6     leakyrelu  100   128  0.000115
7           dec  100   128  0.000278
8   dec_greater  100   128  0.000172
9     leakyrelu  100   128  0.000114
10          dec  100   128  0.000275
11  dec_greater  100   128  0.000168
12    leakyrelu  100   256  0.000152
13          dec  100   256  0.000494
14  dec_greater  100   256  0.000283
15    leakyrelu  100   256  0.000152
16          dec  100   256  0.000494
17  dec_greater  100   256  0.000285
18    leakyrelu  100   512  0.000234
19          dec  100   512  0.001017
20  dec_greater  100   512  0.000604
21    leakyrelu  100   512  0.000239
22          dec  100   512  0.001022
23  dec_greater  100   512  0.000602
24    leakyrelu  100  1000  0.000492
25          dec  100  1000  0.001394
26  dec_greater  100  1000  0.001016
27    leakyrelu  100  1000  0.000534
28          dec  100  1000  0.001396
29  dec_greater  100  1000  0.000998
30    leakyrelu  100  2000  0.000902
31          dec  100  2000  0.002670
32  dec_greater  100  2000  0.001786
33    leakyrelu  100  2000  0.000908
34          dec  100  2000  0.002656
35  dec_greater  100  2000  0.001679

Another way to look at it.

def speedup(piv):
    """Converts a pivot of average timings into speedups relative to
    the ``leakyrelu`` column, in place.

    :param piv: dataframe with one column per implementation,
        including ``leakyrelu``
    :return: the same dataframe, where every column holds
        ``leakyrelu_time / column_time`` and ``leakyrelu`` itself is 1
    """
    baseline = piv['leakyrelu']
    for col in [c for c in piv.columns if c != 'leakyrelu']:
        piv[col] = baseline / piv[col]
    piv['leakyrelu'] = 1
    return piv


# ``DataFrame.pivot`` only accepts keyword arguments since pandas 2.0;
# positional arguments raise a TypeError on recent versions.
piv = speedup(df.pivot(index='shape', columns='name', values='average'))
piv
name dec dec_greater leakyrelu
shape
1000x100 0.382746 0.535237 1
100x10 0.715840 0.797699 1
100x1000 0.353166 0.484377 1
100x128 0.413386 0.665806 1
100x2000 0.337779 0.505036 1
100x256 0.307795 0.537700 1
100x512 0.230077 0.387769 1
10x100 0.715538 0.792875 1
128x100 0.415119 0.677408 1
2000x100 0.341906 0.540911 1
256x100 0.306918 0.531903 1
512x100 0.233977 0.397312 1


Graph.

# Plot the speedups for both orientations of the input shape.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# ``DataFrame.pivot`` only accepts keyword arguments since pandas 2.0;
# positional arguments raise a TypeError on recent versions.
speedup(df[df.orient].pivot(
    index='dim', columns='name', values='average')).plot(ax=ax[0])
ax[0].set_title("LeakyRelu speedup, shape=(%d,dim)"
                "\nThe higher the better" % N)
speedup(df[~df.orient].pivot(
    index='dim', columns='name', values='average')).plot(ax=ax[1])
ax[1].set_title("LeakyRelu speedup, shape=(dim,%d)"
                "\nThe higher the better" % N)
LeakyRelu speedup, shape=(100,dim) The higher the better, LeakyRelu speedup, shape=(dim,100) The higher the better

Out:

Text(0.5, 1.0, 'LeakyRelu speedup, shape=(dim,100)\nThe higher the better')

This kind of benchmark helps finding better implementation of operator runtime.

# plt.show()

Total running time of the script: ( 1 minutes 11.264 seconds)

Gallery generated by Sphinx-Gallery