Benchmark onnxruntime optimization#

onnxruntime optimizes the ONNX graph before running inference. For example, it tries to fuse a matrix multiplication that follows or is followed by a transpose, choosing the most efficient path.

One ONNX file#

This section creates an ONNX graph if one does not already exist.

import os
from collections import OrderedDict, Counter
import numpy
import onnx
from cpyquickhelper.numbers.speed_measure import measure_time
import pandas
from onnxruntime import InferenceSession, SessionOptions, get_device
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    SessionIOBinding, OrtDevice as C_OrtDevice, OrtValue as C_OrtValue,
    GraphOptimizationLevel)
from sklearn.neighbors import RadiusNeighborsRegressor
from skl2onnx import to_onnx
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation

Available optimisations on this machine.

# Show what code_optimisation() reports on this machine.
available_optimisation = code_optimisation()
print(available_optimisation)

Out:

AVX-omp=8

Building the model#

filename = "onnx_to_profile.onnx"

# The model is only trained and converted when the file is missing,
# an existing file is reused as is.
if not os.path.exists(filename):
    print("Generate a graph for %r." % filename)
    features = numpy.random.randn(1000, 10).astype(numpy.float64)
    # The target is the sum of every row, kept as a single column.
    target = features.sum(axis=1, keepdims=True)

    regressor = RadiusNeighborsRegressor()
    regressor.fit(features, target)
    onx = to_onnx(regressor, features, options={'optim': 'cdist'})

    serialized = onx.SerializeToString()
    with open(filename, "wb") as stream:
        stream.write(serialized)

Functions#

We need to generate random inputs to test the graph.

def random_input(typ, shape, batch):
    """
    Creates a random array matching the description of an ONNX input.

    :param typ: ONNX type string, ``'tensor(double)'`` or
        ``'tensor(float)'``
    :param shape: sequence of dimensions, the first one may be unknown
        (``None`` or a symbolic name given as a string)
    :param batch: size used in place of an unknown first dimension
    :return: array of normally distributed values with the requested
        shape and dtype
    :raises NotImplementedError: if *typ* is not one of the supported types
    """
    if typ == 'tensor(double)':
        dtype = numpy.float64
    elif typ == 'tensor(float)':
        dtype = numpy.float32
    else:
        raise NotImplementedError(
            "Unable to guess dtype from %r." % typ)

    # An unknown leading dimension stands for the batch size whatever
    # the rank, otherwise randn would receive None or a string.
    new_shape = list(shape)
    if new_shape and (new_shape[0] is None or
                      isinstance(new_shape[0], str)):
        new_shape[0] = batch
    # randn() without argument returns a python float, asarray makes sure
    # astype is available for an empty shape as well.
    return numpy.asarray(numpy.random.randn(*new_shape)).astype(dtype)


def random_feed(sess, batch=10):
    """
    Builds one random array for every input of a session.

    :param sess: object exposing ``get_inputs()``, every returned input
        provides the attributes ``name``, ``type`` and ``shape``
    :param batch: dimension to use as batch dimension if unknown
    :return: ordered dictionary ``{input name: random array}``,
        in the order returned by ``get_inputs()``
    """
    return OrderedDict(
        (inp.name, random_input(inp.type, inp.shape, batch))
        for inp in sess.get_inputs())

A function which calls the API for any device.

def run_with_iobinding(sess, bind, ort_device, feed_ort_value, outputs):
    """
    Binds inputs and outputs, runs the session and returns the results.

    :param sess: session, its attribute ``_sess`` executes the binding
    :param bind: IO binding receiving the inputs and the outputs
    :param ort_device: device given to every binding call
    :param feed_ort_value: dictionary ``{input name: (ort value, dtype)}``
    :param outputs: names of the outputs to bind
    :return: list with the result of ``numpy()`` for every bound output
    """
    for input_name, pair in feed_ort_value.items():
        ort_value, elem_type = pair
        bind.bind_input(
            input_name, ort_device, elem_type,
            ort_value.shape(), ort_value.data_ptr())
    for output_name in outputs:
        bind.bind_output(output_name, ort_device)
    sess._sess.run_with_iobinding(bind, None)
    return [result.numpy() for result in bind.get_outputs()]

Benchmark#

Let’s choose the device available on this machine. The batch dimension is set to 200.

batch = 200

# CUDA is selected only when onnxruntime reports a GPU device,
# everything else runs on CPU.
on_gpu = get_device().upper() == 'GPU'
device_type = C_OrtDevice.cuda() if on_gpu else C_OrtDevice.cpu()
ort_device = C_OrtDevice(device_type, C_OrtDevice.default_memory(), 0)
provider = 'CUDAExecutionProvider' if on_gpu else 'CPUExecutionProvider'
print("provider = %r" % provider)

Out:

provider = 'CPUExecutionProvider'

We load the graph.

# Read the model back from the file created or found earlier.
with open(filename, 'rb') as stream:
    onx = onnx.load(stream)

Creation of the sessions, one per optimization level.

data = []
files = []
legend = []

# Optimization levels to compare, each one with the label used
# in the file names and in the results.
levels = [
    (GraphOptimizationLevel.ORT_DISABLE_ALL, "ORT_DISABLE_ALL"),
    (GraphOptimizationLevel.ORT_ENABLE_BASIC, "ORT_ENABLE_BASIC"),
    (GraphOptimizationLevel.ORT_ENABLE_EXTENDED, "ORT_ENABLE_EXTENDED"),
    (GraphOptimizationLevel.ORT_ENABLE_ALL, "ORT_ENABLE_ALL"),
]

# Both values are the same for every level.
base_name = os.path.split(filename)[-1]
model_bytes = onx.SerializeToString()

for graph_opt, name_opt in tqdm(levels):

    # One session per level, the optimized graph gets its own file name
    # so that the graphs can be compared afterwards.
    so = SessionOptions()
    so.graph_optimization_level = graph_opt
    so.optimized_model_filepath = (
        base_name + ".optimized.%s.onnx" % name_opt)
    files.append(so.optimized_model_filepath)
    legend.append(name_opt)
    sess = InferenceSession(model_bytes, so, providers=[provider])
    bind = SessionIOBinding(sess._sess)

    #####################################
    # Random inputs for this session.
    feed = random_feed(sess, batch)

    #####################################
    # Every array becomes an OrtValue on the selected device (CPU or GPU),
    # stored with its dtype.
    feed_ort_value = OrderedDict()
    for input_name, array in feed.items():
        feed_ort_value[input_name] = (
            C_OrtValue.ortvalue_from_numpy(array, ort_device), array.dtype)
    outputs = [out.name for out in sess.get_outputs()]

    #######################################
    # Time measurement.
    context = {
        'run_with_iobinding': run_with_iobinding,
        'feed_ort_value': feed_ort_value,
        'outputs': outputs,
        'sess': sess,
        'bind': bind,
        'ort_device': ort_device,
    }
    obs = measure_time(
        lambda: run_with_iobinding(
            sess, bind, ort_device, feed_ort_value, outputs),
        context=context, repeat=10, number=10, div_by_number=True)
    obs['name'] = name_opt
    data.append(obs)


df = pandas.DataFrame(data)
df

Out:

  0%|          | 0/4 [00:00<?, ?it/s]
 25%|##5       | 1/4 [00:02<00:08,  2.92s/it]
 50%|#####     | 2/4 [00:05<00:05,  2.89s/it]
 75%|#######5  | 3/4 [00:08<00:02,  2.88s/it]
100%|##########| 4/4 [00:11<00:00,  2.88s/it]
100%|##########| 4/4 [00:11<00:00,  2.88s/it]
average deviation min_exec max_exec repeat number ttime context_size name
0 0.029124 0.000085 0.029031 0.029351 10 10 0.291235 360 ORT_DISABLE_ALL
1 0.028541 0.000046 0.028466 0.028604 10 10 0.285413 360 ORT_ENABLE_BASIC
2 0.028503 0.000035 0.028466 0.028563 10 10 0.285028 360 ORT_ENABLE_EXTENDED
3 0.028527 0.000035 0.028496 0.028617 10 10 0.285272 360 ORT_ENABLE_ALL


Graph#

df = df.set_index('name')
# The deviations are relabelled 'average', the name of the plotted column,
# before being used as error bars.
dev = df[['deviation']].rename(columns={'deviation': 'average'})
averages = df[['average']]
ax = averages.plot.bar(yerr=dev)
title = os.path.split(filename)[-1]
ax.set_title(title)
ax.tick_params(axis='x', labelrotation=15)
onnx_to_profile.onnx

The results are similar because the optimized models are very similar.

# One counter of operator types for every optimized graph saved earlier.
data = []
for path in files:
    with open(path, "rb") as stream:
        onx = onnx.load(stream)
    data.append(Counter(node.op_type for node in onx.graph.node))

# Operators as rows, one column per optimization level.
df = pandas.DataFrame(data).T
df.columns = legend
df
ORT_DISABLE_ALL ORT_ENABLE_BASIC ORT_ENABLE_EXTENDED ORT_ENABLE_ALL
CDist 1 1 1 1
Shape 2 2 2 2
Less 1 1 1 1
Cast 3 2 2 2
ConstantOfShape 1 1 1 1
ReduceSum 2 2 2 2
CumSum 1 1 1 1
Neg 1 1 1 1
Add 1 1 1 1
Where 1 1 1 1
Flatten 1 1 1 1
ArrayFeatureExtractor 1 1 1 1
Reshape 3 3 3 3
Mul 1 1 1 1
Div 1 1 1 1


Graph.

# No error bars here: the table holds operator counts per optimization
# level, timing deviations do not apply to it.
ax = df.plot.barh()
ax.set_title(os.path.split(filename)[-1])

# import matplotlib.pyplot as plt
# plt.show()
onnx_to_profile.onnx

Out:

Text(0.5, 1.0, 'onnx_to_profile.onnx')

Total running time of the script: ( 0 minutes 12.767 seconds)

Gallery generated by Sphinx-Gallery