Benchmark and profile of operator Slice#

This short code compares the execution of the operator Slice between numpy and onnxruntime for three configurations.

A simple example#

import json
import numpy
from numpy.testing import assert_almost_equal
import pandas
from pandas import DataFrame
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession, get_device, SessionOptions
# Low-level OrtValue is needed to place input tensors directly on a device.
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    OrtValue as C_OrtValue)
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import OnnxSlice, OnnxAdd, OnnxMul
from cpyquickhelper.numbers import measure_time
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import (
    code_optimisation)
from mlprodict.onnxrt.ops_whole.session import OnnxWholeSession
from onnxcustom.utils.onnxruntime_helper import get_ort_device

# Shows the compilation options (e.g. 'AVX-omp=8') and the available device.
print([code_optimisation(), get_device()])

Out:

['AVX-omp=8', 'CPU']

The functions to compare.

def build_ort_op(op_version=14, save=None, **kwargs):  # opset=13, 14, ...
    """
    Builds the ONNX graph ``Slice -> Add(+1) -> Slice -> Mul(*2)`` and
    returns callables running it with numpy, with onnxruntime on CPU
    and, when a GPU is available, with onnxruntime on GPU.

    :param op_version: target opset of the ONNX graph
    :param save: if not None, the serialized model is written to this file
    :param kwargs: must contain *slices*, a pair whose items are either
        ``(start, end)`` tuples or None (meaning keep the whole axis)
    :return: tuple *(onnx model, ort cpu runner, numpy runner,
        ort gpu runner or None)*
    """
    slices = kwargs['slices']
    spec1, spec2 = slices
    slice1 = slice(*spec1) if spec1 is not None else slice(0, None)
    slice2 = slice(*spec2) if spec2 is not None else slice(0, None)

    # Translate the python slice specifications into the
    # starts/ends/axes inputs expected by operator Slice.
    axes, starts, ends = [], [], []
    for axis, spec in enumerate(slices):
        if spec is not None:
            axes.append(axis)
            starts.append(spec[0])
            ends.append(spec[1])
    starts = numpy.array(starts, dtype=numpy.int64)
    ends = numpy.array(ends, dtype=numpy.int64)
    axes = numpy.array(axes, dtype=numpy.int64)

    # Graph: X -> Slice -> Add(+1) -> Slice -> Mul(*2) -> Y
    sliced = OnnxSlice('X', starts, ends, axes, op_version=op_version)
    added = OnnxAdd(sliced, numpy.array([1], dtype=numpy.float32),
                    op_version=op_version)
    sliced_again = OnnxSlice(added, starts, ends, axes,
                             op_version=op_version)
    doubled = OnnxMul(sliced_again, numpy.array([2], dtype=numpy.float32),
                      op_version=op_version, output_names=['Y'])
    onx = doubled.to_onnx(inputs=[('X', FloatTensorType([None, None]))],
                          target_opset=op_version)
    sess = InferenceSession(onx.SerializeToString(),
                            providers=["CPUExecutionProvider"])
    if save is not None:
        with open(save, "wb") as f:
            f.write(onx.SerializeToString())

    def npy_fct(x):
        # numpy counterpart of the ONNX graph above
        return ((x[slice1, slice2] + 1)[slice1, slice2] * 2).copy()

    # Sanity check: numpy and onnxruntime must agree on a random input.
    rnd = numpy.random.randn(10, 10).astype(numpy.float32)
    expected = npy_fct(rnd)
    got = sess.run(None, {'X': rnd})[0]
    try:
        assert_almost_equal(expected, got)
    except AssertionError as e:
        raise AssertionError(
            "kwargs=%r slice1=%r slice2=%r shapes=%r ? %r "
            "(x[slice1, slice2].shape)=%r" % (
                kwargs, slice1, slice2, expected.shape,
                got.shape, rnd[slice1, slice2].shape)) from e

    if get_device().upper() != 'GPU':
        # No GPU: the last returned runner is None.
        return onx, lambda x: sess.run(None, {'X': x}), npy_fct, None

    sessg = InferenceSession(onx.SerializeToString(),
                             providers=["CUDAExecutionProvider"])
    io_binding = sessg.io_binding()._iobinding
    device = get_ort_device('cuda:0')

    def run_gpu(x):
        # x is a C_OrtValue already sitting on the GPU device.
        io_binding.bind_input(
            'X', device, numpy.float32, x.shape(), x.data_ptr())
        io_binding.bind_output('Y', device)
        return sessg._sess.run_with_iobinding(io_binding, None)

    return onx, lambda x: sess.run(None, {'X': x}), npy_fct, run_gpu

The benchmark.

def loop_fct(fct, xs):
    """Apply *fct* to every element of *xs*, discarding the results."""
    for item in xs:
        fct(item)


def benchmark_op(repeat=10, number=10, name="Slice", shape_slice_fct=None,
                 save=None, opset=14, repeat_profile=1500, verbose=1):
    """
    Benchmarks the Slice graph built by :func:`build_ort_op` over a set
    of growing dimensions, comparing numpy, onnxruntime on CPU and,
    when available, onnxruntime on GPU, then profiles the last model.

    :param repeat: number of repetitions given to *measure_time*
    :param number: number of executions per repetition
    :param name: label used in the plot titles
    :param shape_slice_fct: callable ``dim -> (shape, slices)`` giving the
        input shape and the pair of slice specifications to benchmark
    :param save: if not None, the ONNX model is saved into this file
    :param opset: opset used to build the ONNX graph
    :param repeat_profile: number of inference calls run while profiling
    :param verbose: prints progress messages if > 0
    :return: tuple *(dfprof, dfprofgpu, df, rs, ax)* — CPU profiling
        dataframe, GPU profiling dataframe or None, raw timings,
        speedup table, matplotlib axes
    """
    if verbose:
        print("[benchmark_op] start repeat=%d number=%d repeat_profile=%d"
              " opset=%d." % (repeat, number, repeat_profile, opset))
    res = []
    for dim in tqdm([8, 16, 32, 64, 100, 128, 200,
                     256, 400, 512, 600, 784, 800,
                     1000, 1024, 1200]):
        shape, slices = shape_slice_fct(dim)
        onx, ort_fct, npy_fct, ort_fct_gpu = build_ort_op(
            save=save, op_version=opset, slices=slices)

        # Fewer arrays for large dimensions to keep the run time bounded.
        n_arrays = 20
        if dim >= 512:
            n_arrays = 10
        xs = [numpy.random.rand(*shape).astype(numpy.float32)
              for _ in range(n_arrays)]
        info = dict(shape=shape)

        ctx = dict(xs=xs, loop_fct=loop_fct)

        # numpy
        ctx['fct'] = npy_fct
        obs = measure_time(
            lambda: loop_fct(npy_fct, xs),
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'numpy'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        # onnxruntime
        ctx['fct'] = ort_fct
        obs = measure_time(
            lambda: loop_fct(ort_fct, xs),
            div_by_number=True, context=ctx, repeat=repeat, number=number)
        obs['dim'] = dim
        obs['fct'] = 'ort'
        obs['shape'] = ",".join(map(str, shape))
        obs['slices'] = str(slices)
        obs.update(info)
        res.append(obs)

        if ort_fct_gpu is not None:

            # onnxruntime on GPU: inputs are moved to the device first so
            # only the inference itself is measured.
            dev = get_ort_device('cuda:0')
            ctx['xs'] = [
                C_OrtValue.ortvalue_from_numpy(x, dev)
                for x in xs]
            ctx['fct'] = ort_fct_gpu
            obs = measure_time(
                lambda: loop_fct(ort_fct_gpu, ctx['xs']),
                div_by_number=True, context=ctx, repeat=repeat, number=number)
            obs['dim'] = dim
            obs['fct'] = 'ort_gpu'
            obs['shape'] = ",".join(map(str, shape))
            obs['slices'] = str(slices)
            obs.update(info)
            res.append(obs)

    # profiling CPU
    # NOTE: onx, xs, shape, slices, ctx, ort_fct_gpu below deliberately
    # refer to the values of the LAST loop iteration (largest dim).
    if verbose:
        print("[benchmark_op] done.")
        print("[benchmark_op] profile CPU.")
    so = SessionOptions()
    so.enable_profiling = True
    sess = InferenceSession(onx.SerializeToString(), so,
                            providers=["CPUExecutionProvider"])
    for i in range(0, repeat_profile):
        sess.run(None, {'X': xs[-1]}, )
    prof = sess.end_profiling()
    with open(prof, "r") as f:
        js = json.load(f)
    dfprof = DataFrame(OnnxWholeSession.process_profiling(js))
    dfprof['shape'] = ",".join(map(str, shape))
    dfprof['slices'] = str(slices)
    if verbose:
        print("[benchmark_op] done.")

    # profiling GPU (same protocol, inputs bound through io_binding)
    if ort_fct_gpu is not None:
        if verbose:
            print("[benchmark_op] profile GPU.")
        so = SessionOptions()
        so.enable_profiling = True
        sess = InferenceSession(onx.SerializeToString(), so,
                                providers=["CUDAExecutionProvider"])
        io_binding = sess.io_binding()._iobinding
        device = get_ort_device('cpu')

        for i in range(0, repeat_profile):
            x = ctx['xs'][-1]
            io_binding.bind_input(
                'X', device, numpy.float32, x.shape(), x.data_ptr())
            io_binding.bind_output('Y', device)
            sess._sess.run_with_iobinding(io_binding, None)

        prof = sess.end_profiling()
        with open(prof, "r") as f:
            js = json.load(f)
        dfprofgpu = DataFrame(OnnxWholeSession.process_profiling(js))
        dfprofgpu['shape'] = ",".join(map(str, shape))
        dfprofgpu['slices'] = str(slices)
        if verbose:
            print("[benchmark_op] profile done.")
    else:
        dfprofgpu = None

    # Dataframes
    # e.g. "(256, 1200)" -> "(256, N)" for the plot titles.
    shape_name = str(shape).replace(str(dim), "N")
    df = pandas.DataFrame(res)
    piv = df.pivot('shape', 'fct', 'average')

    # Speedup over numpy; the trivial numpy/numpy column is dropped.
    rs = piv.copy()
    for c in ['numpy', 'ort', 'ort_gpu']:
        if c in rs.columns:
            rs["numpy/%s" % c] = rs['numpy'] / rs[c]
    rs = rs[[c for c in rs.columns if "/numpy" not in c]].copy()

    # Graphs: raw timings (left) and speedups over numpy (right).
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    piv.plot(logx=True, logy=True, ax=ax[0],
             title="%s benchmark\n%r"
                   " lower better" % (name, shape_name))
    ax[0].legend(prop={"size": 9})
    rs.plot(logx=True, logy=True, ax=ax[1],
            title="%s Speedup, baseline=numpy\n%r"
                  " higher better" % (name, shape_name))
    # Reference lines at 0.5x and 2x speedup.
    ax[1].plot([min(rs.index), max(rs.index)], [0.5, 0.5], 'g--')
    ax[1].plot([min(rs.index), max(rs.index)], [2., 2.], 'g--')
    ax[1].legend(prop={"size": 9})
    return dfprof, dfprofgpu, df, rs, ax

The results.

# Number of OpenMP threads, parsed from a string such as 'AVX-omp=8'.
nth = int(code_optimisation().split('=')[1])
# Columns used to aggregate the profiling dataframes.
cols_profile = ["shape", "slices", "args_op_name", 'args_provider']

shape = (256, N) - slice = [1:-1], :#

# First configuration: shape (256, N), slicing [1:-1] on axis 0 only.
dfs = []
dfprof, dfprofgpu, df, piv, ax = benchmark_op(
    shape_slice_fct=lambda dim: ((256, dim), ((1, -1), None)),
    save="bslice.onnx", number=nth * 4, repeat=8, repeat_profile=100 * nth)

dfs.append(df)
piv2 = df.pivot("fct", "shape", "average")
print("slices = [1:-1], :")
print(piv.to_markdown())
# Aggregated CPU profiling per operator and provider.
print(dfprof.drop(['pid', 'tid', 'ts'], axis=1).groupby(
    cols_profile).sum().to_markdown())
if dfprofgpu is not None:
    print(dfprofgpu.drop(['pid', 'tid'], axis=1).groupby(
        cols_profile).sum().to_markdown())
Slice benchmark '(256, N)' lower better, Slice Speedup, baseline=numpy '(256, N)' higher better

Out:

[benchmark_op] start repeat=8 number=32 repeat_profile=800 opset=14.

  0%|          | 0/16 [00:00<?, ?it/s]
  6%|6         | 1/16 [00:00<00:14,  1.04it/s]
 12%|#2        | 2/16 [00:02<00:14,  1.02s/it]
 19%|#8        | 3/16 [00:03<00:14,  1.09s/it]
 25%|##5       | 4/16 [00:04<00:14,  1.24s/it]
 31%|###1      | 5/16 [00:06<00:16,  1.46s/it]
 38%|###7      | 6/16 [00:08<00:17,  1.72s/it]
 44%|####3     | 7/16 [00:12<00:20,  2.29s/it]
 50%|#####     | 8/16 [00:16<00:23,  2.89s/it]
 56%|#####6    | 9/16 [00:23<00:30,  4.29s/it]
 62%|######2   | 10/16 [00:28<00:26,  4.43s/it]
 69%|######8   | 11/16 [00:33<00:23,  4.72s/it]
 75%|#######5  | 12/16 [00:40<00:20,  5.19s/it]
 81%|########1 | 13/16 [00:46<00:16,  5.58s/it]
 88%|########7 | 14/16 [00:54<00:12,  6.26s/it]
 94%|#########3| 15/16 [01:02<00:06,  6.77s/it]
100%|##########| 16/16 [01:11<00:00,  7.51s/it]
100%|##########| 16/16 [01:11<00:00,  4.48s/it]
[benchmark_op] done.
[benchmark_op] profile CPU.
[benchmark_op] done.
slices = [1:-1], :
| shape       |      numpy |        ort |   numpy/ort |
|:------------|-----------:|-----------:|------------:|
| (256, 8)    | 0.0012342  | 0.00239926 |    0.514407 |
| (256, 16)   | 0.00140168 | 0.002609   |    0.537249 |
| (256, 32)   | 0.00159413 | 0.0028315  |    0.562997 |
| (256, 64)   | 0.00197222 | 0.00366633 |    0.537927 |
| (256, 100)  | 0.0023845  | 0.00462276 |    0.515817 |
| (256, 128)  | 0.00281461 | 0.00563747 |    0.499269 |
| (256, 200)  | 0.00440235 | 0.00893159 |    0.492896 |
| (256, 256)  | 0.00465045 | 0.0113058  |    0.411333 |
| (256, 400)  | 0.00883246 | 0.0195775  |    0.451154 |
| (256, 512)  | 0.00700593 | 0.0112226  |    0.62427  |
| (256, 600)  | 0.00825281 | 0.0124378  |    0.663525 |
| (256, 784)  | 0.00940551 | 0.0146032  |    0.644073 |
| (256, 800)  | 0.00999317 | 0.0149651  |    0.667764 |
| (256, 1000) | 0.0121003  | 0.0179254  |    0.675039 |
| (256, 1024) | 0.0120951  | 0.0184837  |    0.654367 |
| (256, 1200) | 0.0148708  | 0.0207549  |    0.716498 |
|                                                                  |              dur |
|:-----------------------------------------------------------------|-----------------:|
| ('256,1200', '((1, -1), None)', 'Add', 'CPUExecutionProvider')   | 244040           |
| ('256,1200', '((1, -1), None)', 'Mul', 'CPUExecutionProvider')   | 244545           |
| ('256,1200', '((1, -1), None)', 'Slice', 'CPUExecutionProvider') |      1.00289e+06 |

shape = (256, N) - slice = :, [1:-1]#

# Second configuration: shape (256, N), slicing [1:-1] on axis 1 only.
dfs = []
dfprof, dfprofgpu, df, piv, ax = benchmark_op(
    shape_slice_fct=lambda dim: ((256, dim), (None, (1, -1))),
    save="bslice.onnx", number=nth * 4, repeat=8, repeat_profile=100 * nth)

dfs.append(df)
piv2 = df.pivot("fct", "shape", "average")
print("slices = :, [1:-1]")
print(piv.to_markdown())
# Aggregated CPU profiling per operator and provider.
print(dfprof.drop(['pid', 'tid', 'ts'], axis=1).groupby(
    cols_profile).sum().to_markdown())
if dfprofgpu is not None:
    print(dfprofgpu.drop(['pid', 'tid'], axis=1).groupby(
        cols_profile).sum().to_markdown())
Slice benchmark '(256, N)' lower better, Slice Speedup, baseline=numpy '(256, N)' higher better

Out:

[benchmark_op] start repeat=8 number=32 repeat_profile=800 opset=14.

  0%|          | 0/16 [00:00<?, ?it/s]
  6%|6         | 1/16 [00:01<00:17,  1.18s/it]
 12%|#2        | 2/16 [00:02<00:17,  1.25s/it]
 19%|#8        | 3/16 [00:03<00:17,  1.33s/it]
 25%|##5       | 4/16 [00:05<00:19,  1.60s/it]
 31%|###1      | 5/16 [00:08<00:20,  1.88s/it]
 38%|###7      | 6/16 [00:11<00:22,  2.24s/it]
 44%|####3     | 7/16 [00:15<00:27,  3.04s/it]
 50%|#####     | 8/16 [00:21<00:31,  3.89s/it]
 56%|#####6    | 9/16 [00:31<00:39,  5.65s/it]
 62%|######2   | 10/16 [00:37<00:34,  5.72s/it]
 69%|######8   | 11/16 [00:43<00:29,  5.91s/it]
 75%|#######5  | 12/16 [00:50<00:25,  6.33s/it]
 81%|########1 | 13/16 [00:58<00:20,  6.70s/it]
 88%|########7 | 14/16 [01:07<00:14,  7.41s/it]
 94%|#########3| 15/16 [01:16<00:07,  7.98s/it]
100%|##########| 16/16 [01:27<00:00,  8.79s/it]
100%|##########| 16/16 [01:27<00:00,  5.45s/it]
[benchmark_op] done.
[benchmark_op] profile CPU.
[benchmark_op] done.
slices = :, [1:-1]
| shape       |      numpy |        ort |   numpy/ort |
|:------------|-----------:|-----------:|------------:|
| (256, 8)    | 0.00203436 | 0.00244519 |    0.831987 |
| (256, 16)   | 0.0023793  | 0.00255901 |    0.929771 |
| (256, 32)   | 0.00263454 | 0.00280389 |    0.939599 |
| (256, 64)   | 0.00398038 | 0.00370694 |    1.07376  |
| (256, 100)  | 0.00469887 | 0.00439478 |    1.0692   |
| (256, 128)  | 0.00579809 | 0.0054333  |    1.06714  |
| (256, 200)  | 0.00923359 | 0.00887462 |    1.04045  |
| (256, 256)  | 0.0105318  | 0.0114353  |    0.920987 |
| (256, 400)  | 0.0170741  | 0.0197054  |    0.866467 |
| (256, 512)  | 0.0114206  | 0.0112651  |    1.01381  |
| (256, 600)  | 0.0120928  | 0.0123732  |    0.977339 |
| (256, 784)  | 0.0133372  | 0.0147364  |    0.905052 |
| (256, 800)  | 0.0141837  | 0.0148913  |    0.952485 |
| (256, 1000) | 0.0166793  | 0.0182041  |    0.916234 |
| (256, 1024) | 0.0174357  | 0.0184857  |    0.943203 |
| (256, 1200) | 0.0202868  | 0.0208993  |    0.970689 |
|                                                                  |              dur |
|:-----------------------------------------------------------------|-----------------:|
| ('256,1200', '(None, (1, -1))', 'Add', 'CPUExecutionProvider')   | 244121           |
| ('256,1200', '(None, (1, -1))', 'Mul', 'CPUExecutionProvider')   | 245184           |
| ('256,1200', '(None, (1, -1))', 'Slice', 'CPUExecutionProvider') |      1.00975e+06 |

shape = (256, N) - slice = [1:-1], [1:-1]#

# Third configuration: shape (256, N), slicing [1:-1] on both axes.
dfs = []
dfprof, dfprofgpu, df, piv, ax = benchmark_op(
    shape_slice_fct=lambda dim: ((256, dim), ((1, -1), (1, -1))),
    save="bslice.onnx", number=nth * 4, repeat=8, repeat_profile=100 * nth)

dfs.append(df)
piv2 = df.pivot("fct", "shape", "average")
print("slices = [1:-1], [1:-1]")
print(piv.to_markdown())
# Aggregated CPU profiling per operator and provider.
print(dfprof.drop(['pid', 'tid', 'ts'], axis=1).groupby(
    cols_profile).sum().to_markdown())
if dfprofgpu is not None:
    print(dfprofgpu.drop(['pid', 'tid'], axis=1).groupby(
        cols_profile).sum().to_markdown())
Slice benchmark '(256, N)' lower better, Slice Speedup, baseline=numpy '(256, N)' higher better

Out:

[benchmark_op] start repeat=8 number=32 repeat_profile=800 opset=14.

  0%|          | 0/16 [00:00<?, ?it/s]
  6%|6         | 1/16 [00:01<00:17,  1.16s/it]
 12%|#2        | 2/16 [00:02<00:16,  1.20s/it]
 19%|#8        | 3/16 [00:03<00:16,  1.30s/it]
 25%|##5       | 4/16 [00:05<00:18,  1.53s/it]
 31%|###1      | 5/16 [00:07<00:19,  1.80s/it]
 38%|###7      | 6/16 [00:10<00:21,  2.14s/it]
 44%|####3     | 7/16 [00:15<00:25,  2.84s/it]
 50%|#####     | 8/16 [00:20<00:30,  3.76s/it]
 56%|#####6    | 9/16 [00:30<00:38,  5.55s/it]
 62%|######2   | 10/16 [00:36<00:33,  5.62s/it]
 69%|######8   | 11/16 [00:42<00:29,  5.82s/it]
 75%|#######5  | 12/16 [00:50<00:25,  6.39s/it]
 81%|########1 | 13/16 [00:57<00:20,  6.69s/it]
 88%|########7 | 14/16 [01:06<00:14,  7.33s/it]
 94%|#########3| 15/16 [01:15<00:07,  7.82s/it]
100%|##########| 16/16 [01:25<00:00,  8.63s/it]
100%|##########| 16/16 [01:25<00:00,  5.35s/it]
[benchmark_op] done.
[benchmark_op] profile CPU.
[benchmark_op] done.
slices = [1:-1], [1:-1]
| shape       |      numpy |        ort |   numpy/ort |
|:------------|-----------:|-----------:|------------:|
| (256, 8)    | 0.0020187  | 0.00238914 |    0.84495  |
| (256, 16)   | 0.00223667 | 0.00246948 |    0.905725 |
| (256, 32)   | 0.00261896 | 0.00275258 |    0.951458 |
| (256, 64)   | 0.00372098 | 0.00353096 |    1.05381  |
| (256, 100)  | 0.00456468 | 0.00415776 |    1.09787  |
| (256, 128)  | 0.00545945 | 0.00528378 |    1.03325  |
| (256, 200)  | 0.0081073  | 0.00837941 |    0.967527 |
| (256, 256)  | 0.0111545  | 0.0109034  |    1.02303  |
| (256, 400)  | 0.0183303  | 0.0183348  |    0.999755 |
| (256, 512)  | 0.0111766  | 0.0110651  |    1.01008  |
| (256, 600)  | 0.011812   | 0.0124068  |    0.95206  |
| (256, 784)  | 0.0145341  | 0.0151284  |    0.96072  |
| (256, 800)  | 0.0139486  | 0.014525   |    0.960321 |
| (256, 1000) | 0.0164508  | 0.0175113  |    0.939437 |
| (256, 1024) | 0.0166651  | 0.0178007  |    0.936201 |
| (256, 1200) | 0.019451   | 0.0211321  |    0.920445 |
|                                                                     |    dur |
|:--------------------------------------------------------------------|-------:|
| ('256,1200', '((1, -1), (1, -1))', 'Add', 'CPUExecutionProvider')   | 256741 |
| ('256,1200', '((1, -1), (1, -1))', 'Mul', 'CPUExecutionProvider')   | 256854 |
| ('256,1200', '((1, -1), (1, -1))', 'Slice', 'CPUExecutionProvider') | 994580 |

Total running time of the script: ( 4 minutes 19.786 seconds)

Gallery generated by Sphinx-Gallery