Benchmark operator Slice#

This short code compares the execution of the operator Slice on CPU and GPU in three configurations.

A simple example#

import numpy
from numpy.testing import assert_almost_equal
from pandas import DataFrame, pivot_table
from onnxruntime import InferenceSession, get_device
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    OrtValue as C_OrtValue)
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxSlice, OnnxAdd, OnnxMul
from cpyquickhelper.numbers.speed_measure import measure_time
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
from mlprodict.onnxrt import OnnxInference
from mlprodict.plotting.plotting_onnx import plot_onnx
from onnxcustom.utils.onnxruntime_helper import get_ort_device
from tqdm import tqdm


print([code_optimisation(), get_device()])

Out:

['AVX-omp=8', 'CPU']

The graph to compare.

def build_ort_op(op_version=14, save=None, slices=None):  # opset=13, 14, ...
    if slices is None:
        starts = numpy.array([1, 1], dtype=numpy.int64)
        ends = numpy.array([-1, -1], dtype=numpy.int64)
        axes = None
    else:
        starts, ends = slices
        if starts[0] is None:
            indexes = [i for i in range(len(starts)) if starts[i] is not None]
            starts = numpy.array(
                [n for n in starts if n is not None], dtype=numpy.int64)
            ends = numpy.array(
                [n for n in ends if n is not None], dtype=numpy.int64)
            axes = numpy.array(indexes, dtype=numpy.int64)
        else:
            starts = numpy.array(starts, dtype=numpy.int64)
            ends = numpy.array(ends, dtype=numpy.int64)
            axes = None

    if axes is None:
        node1 = OnnxSlice('X', starts, ends, op_version=op_version)
    else:
        node1 = OnnxSlice('X', starts, ends, axes, op_version=op_version)
    node2 = OnnxAdd(node1, numpy.array([1], dtype=numpy.float32),
                    op_version=op_version)
    if axes is None:
        node3 = OnnxSlice(node2, starts, ends, op_version=op_version)
    else:
        node3 = OnnxSlice(node2, starts, ends, axes, op_version=op_version)
    node4 = OnnxMul(node3, numpy.array([2], dtype=numpy.float32),
                    op_version=op_version, output_names=['Y'])
    onx = node4.to_onnx(inputs=[('X', FloatTensorType([None, None]))],
                        target_opset=op_version)
    return onx


onx = build_ort_op()
plot_onnx(onx)
plot benchmark op short

Out:

<AxesSubplot:>

Execution on CPU#

x = numpy.random.rand(50, 50).astype(numpy.float32)

oinf = OnnxInference(onx)
oinf.run({'X': x}, verbose=1, fLOG=print)

Out:

+ki='Sl_Slicecst': (2,) (dtype=int64 min=1 max=1)
+ki='Sl_Slicecst1': (2,) (dtype=int64 min=-1 max=-1)
+ki='Ad_Addcst': (1,) (dtype=float32 min=1.0 max=1.0)
+ki='Mu_Mulcst': (1,) (dtype=float32 min=2.0 max=2.0)
-- OnnxInference: run 6 nodes
Onnx-Identity(Sl_Slicecst1) -> Sl_Slicecst3    (name='Sl_Slicecst3_op')
+kr='Sl_Slicecst3': (2,) (dtype=int64 min=-1 max=-1)
Onnx-Slice(X, Sl_Slicecst, Sl_Slicecst1) -> Sl_output01    (name='Sl_Slice')
+kr='Sl_output01': (48, 48) (dtype=float32 min=9.288798901252449e-05 max=0.9997351169586182)
Onnx-Identity(Sl_Slicecst) -> Sl_Slicecst2    (name='Sl_Slicecst2_op')
+kr='Sl_Slicecst2': (2,) (dtype=int64 min=1 max=1)
Onnx-Add(Sl_output01, Ad_Addcst) -> Ad_C0    (name='Ad_Add')
+kr='Ad_C0': (48, 48) (dtype=float32 min=1.00009286403656 max=1.9997351169586182)
Onnx-Slice(Ad_C0, Sl_Slicecst2, Sl_Slicecst3) -> Sl_output0    (name='Sl_Slice1')
+kr='Sl_output0': (46, 46) (dtype=float32 min=1.00009286403656 max=1.9997351169586182)
Onnx-Mul(Sl_output0, Mu_Mulcst) -> Y    (name='Mu_Mul')
+kr='Y': (46, 46) (dtype=float32 min=2.00018572807312 max=3.9994702339172363)

{'Y': array([[2.159975 , 2.688986 , 3.5766497, ..., 3.4398422, 3.865087 ,
        2.9408178],
       [2.3866725, 3.8495946, 3.3857317, ..., 3.8949046, 3.9811583,
        2.0954792],
       [3.0558975, 3.834601 , 2.156315 , ..., 3.7635875, 3.3348975,
        2.2004378],
       ...,
       [3.903675 , 2.4898612, 3.0952504, ..., 3.2075324, 2.7862916,
        2.701054 ],
       [2.2475376, 3.5546653, 3.0427   , ..., 3.9084997, 3.4153483,
        2.0248337],
       [3.2491202, 3.7794542, 2.139915 , ..., 3.2050679, 2.0153677,
        3.824234 ]], dtype=float32)}

With onnxruntime.

sess = InferenceSession(onx.SerializeToString(),
                        providers=["CPUExecutionProvider"])
y_cpu = sess.run(None, {'X': x})[0]

Execution on GPU#

If available…

if get_device().upper() == 'GPU':
    dev = get_ort_device('cuda:0')
    try:
        gx = C_OrtValue.ortvalue_from_numpy(x, dev)
        cuda = True
    except RuntimeError as e:
        print(e)
        cuda = False
else:
    cuda = False

if cuda:
    sessg = InferenceSession(onx.SerializeToString(),
                             providers=["CUDAExecutionProvider"])

    io_binding = sessg.io_binding()._iobinding
    io_binding.bind_input(
        'X', dev, numpy.float32, gx.shape(), gx.data_ptr())
    io_binding.bind_output('Y', dev)
    sessg._sess.run_with_iobinding(io_binding, None)
    y_gpu = io_binding.copy_outputs_to_cpu()[0]
    assert_almost_equal(y_cpu, y_gpu)

Benchmark#

data = []
shapes = ([(n, n) for n in [10, 100, 1000]] +
          [(n, 100) for n in [10, 100, 1000, 10000]] +
          [(100, n) for n in [10, 100, 1000, 10000]])
slices = [([1, 1], [-1, -1]), ([1], [-1]), ([None, 1], [None, -1])]
shape_slices = [(sh, sl) for sh in shapes for sl in slices]

for shape, slices in tqdm(shape_slices):
    onx = build_ort_op(slices=slices)
    x = numpy.random.rand(*shape).astype(numpy.float32)

    number = 100
    if x.size >= 100000:
        number = 10

    sess = InferenceSession(
        onx.SerializeToString(),
        providers=["CPUExecutionProvider"])
    sess.run(None, {'X': x})

    obs = dict(
        shape=str(shape).replace(
            " ", ""), slice=str(slices).replace(
            " ", ""))
    r = measure_time(lambda: sess.run(None, {'X': x}),
                     number=number, div_by_number=True,
                     context={})
    obs.update(r)
    obs['provider'] = 'CPU'
    data.append(obs)

    if cuda:
        def sess_run(sess, io_binding, x, dev):
            io_binding.bind_input(
                'X', dev, numpy.float32, gx.shape(), gx.data_ptr())
            io_binding.bind_output('Y', dev)
            sess._sess.run_with_iobinding(io_binding)

        io_binding = sess.io_binding()._iobinding
        sess = InferenceSession(
            onx.SerializeToString(),
            providers=["CUDAExecutionProvider"])
        dev = get_ort_device('cuda:0')
        gx = C_OrtValue.ortvalue_from_numpy(x, dev)
        sess_run(sess, io_binding, gx, dev)
        obs = dict(
            shape=str(shape).replace(
                " ", ""), slice=str(slices).replace(
                " ", ""))
        r = measure_time(
            lambda: sess_run(sess, io_binding, io_binding, gx, dev),
            number=number,
            div_by_number=True,
            context={
                'sess': sess, 'gx': gx, 'io_binding': io_binding,
                'dev': dev, 'sess_run': sess_run})
        obs.update(r)
        obs['provider'] = 'GPU'
        data.append(obs)

df = DataFrame(data)
print(df)

Out:

  0%|          | 0/33 [00:00<?, ?it/s]
  3%|3         | 1/33 [00:00<00:04,  7.89it/s]
  6%|6         | 2/33 [00:00<00:03,  8.40it/s]
  9%|9         | 3/33 [00:00<00:03,  8.42it/s]
 12%|#2        | 4/33 [00:00<00:03,  7.49it/s]
 15%|#5        | 5/33 [00:00<00:03,  7.08it/s]
 18%|#8        | 6/33 [00:00<00:04,  6.73it/s]
 21%|##1       | 7/33 [00:01<00:08,  3.04it/s]
 24%|##4       | 8/33 [00:02<00:11,  2.25it/s]
 27%|##7       | 9/33 [00:02<00:12,  1.93it/s]
 30%|###       | 10/33 [00:03<00:09,  2.53it/s]
 33%|###3      | 11/33 [00:03<00:06,  3.23it/s]
 36%|###6      | 12/33 [00:03<00:05,  3.94it/s]
 39%|###9      | 13/33 [00:03<00:04,  4.46it/s]
 42%|####2     | 14/33 [00:03<00:03,  4.93it/s]
 45%|####5     | 15/33 [00:03<00:03,  5.25it/s]
 48%|####8     | 16/33 [00:03<00:02,  5.88it/s]
 52%|#####1    | 17/33 [00:03<00:02,  6.44it/s]
 55%|#####4    | 18/33 [00:04<00:02,  6.81it/s]
 58%|#####7    | 19/33 [00:04<00:04,  3.06it/s]
 61%|######    | 20/33 [00:05<00:05,  2.19it/s]
 64%|######3   | 21/33 [00:06<00:06,  1.82it/s]
 67%|######6   | 22/33 [00:06<00:04,  2.37it/s]
 70%|######9   | 23/33 [00:06<00:03,  2.99it/s]
 73%|#######2  | 24/33 [00:06<00:02,  3.66it/s]
 76%|#######5  | 25/33 [00:06<00:01,  4.20it/s]
 79%|#######8  | 26/33 [00:07<00:01,  4.69it/s]
 82%|########1 | 27/33 [00:07<00:01,  5.04it/s]
 85%|########4 | 28/33 [00:07<00:00,  5.74it/s]
 88%|########7 | 29/33 [00:07<00:00,  6.36it/s]
 91%|######### | 30/33 [00:07<00:00,  6.77it/s]
 94%|#########3| 31/33 [00:08<00:00,  3.32it/s]
 97%|#########6| 32/33 [00:08<00:00,  2.45it/s]
100%|##########| 33/33 [00:09<00:00,  2.02it/s]
100%|##########| 33/33 [00:09<00:00,  3.43it/s]
          shape                 slice  ...  context_size  provider
0       (10,10)       ([1,1],[-1,-1])  ...            64       CPU
1       (10,10)            ([1],[-1])  ...            64       CPU
2       (10,10)  ([None,1],[None,-1])  ...            64       CPU
3     (100,100)       ([1,1],[-1,-1])  ...            64       CPU
4     (100,100)            ([1],[-1])  ...            64       CPU
5     (100,100)  ([None,1],[None,-1])  ...            64       CPU
6   (1000,1000)       ([1,1],[-1,-1])  ...            64       CPU
7   (1000,1000)            ([1],[-1])  ...            64       CPU
8   (1000,1000)  ([None,1],[None,-1])  ...            64       CPU
9      (10,100)       ([1,1],[-1,-1])  ...            64       CPU
10     (10,100)            ([1],[-1])  ...            64       CPU
11     (10,100)  ([None,1],[None,-1])  ...            64       CPU
12    (100,100)       ([1,1],[-1,-1])  ...            64       CPU
13    (100,100)            ([1],[-1])  ...            64       CPU
14    (100,100)  ([None,1],[None,-1])  ...            64       CPU
15   (1000,100)       ([1,1],[-1,-1])  ...            64       CPU
16   (1000,100)            ([1],[-1])  ...            64       CPU
17   (1000,100)  ([None,1],[None,-1])  ...            64       CPU
18  (10000,100)       ([1,1],[-1,-1])  ...            64       CPU
19  (10000,100)            ([1],[-1])  ...            64       CPU
20  (10000,100)  ([None,1],[None,-1])  ...            64       CPU
21     (100,10)       ([1,1],[-1,-1])  ...            64       CPU
22     (100,10)            ([1],[-1])  ...            64       CPU
23     (100,10)  ([None,1],[None,-1])  ...            64       CPU
24    (100,100)       ([1,1],[-1,-1])  ...            64       CPU
25    (100,100)            ([1],[-1])  ...            64       CPU
26    (100,100)  ([None,1],[None,-1])  ...            64       CPU
27   (100,1000)       ([1,1],[-1,-1])  ...            64       CPU
28   (100,1000)            ([1],[-1])  ...            64       CPU
29   (100,1000)  ([None,1],[None,-1])  ...            64       CPU
30  (100,10000)       ([1,1],[-1,-1])  ...            64       CPU
31  (100,10000)            ([1],[-1])  ...            64       CPU
32  (100,10000)  ([None,1],[None,-1])  ...            64       CPU

[33 rows x 11 columns]

Better display#

piv = pivot_table(
    df, index=["shape", "slice"], columns="provider", values="average")
if 'GPU' in piv.columns:
    piv['ratio'] = piv['GPU'] / piv['CPU']
print(piv)

Out:

provider                               CPU
shape       slice
(10,10)     ([1,1],[-1,-1])       0.000089
            ([1],[-1])            0.000090
            ([None,1],[None,-1])  0.000090
(10,100)    ([1,1],[-1,-1])       0.000094
            ([1],[-1])            0.000094
            ([None,1],[None,-1])  0.000096
(100,10)    ([1,1],[-1,-1])       0.000104
            ([1],[-1])            0.000104
            ([None,1],[None,-1])  0.000102
(100,100)   ([1,1],[-1,-1])       0.000132
            ([1],[-1])            0.000130
            ([None,1],[None,-1])  0.000134
(100,1000)  ([1,1],[-1,-1])       0.000894
            ([1],[-1])            0.000897
            ([None,1],[None,-1])  0.000922
(100,10000) ([1,1],[-1,-1])       0.005912
            ([1],[-1])            0.005992
            ([None,1],[None,-1])  0.006151
(1000,100)  ([1,1],[-1,-1])       0.000944
            ([1],[-1])            0.000924
            ([None,1],[None,-1])  0.000944
(1000,1000) ([1,1],[-1,-1])       0.006292
            ([1],[-1])            0.006287
            ([None,1],[None,-1])  0.006166
(10000,100) ([1,1],[-1,-1])       0.006828
            ([1],[-1])            0.006926
            ([None,1],[None,-1])  0.006932

Graphs.

piv.plot()
plot benchmark op short

Out:

<AxesSubplot:xlabel='shape,slice'>

Total running time of the script: ( 0 minutes 11.703 seconds)

Gallery generated by Sphinx-Gallery