Profile onnxruntime execution#

The following example converts a model into ONNX and runs it with onnxruntime. The execution is then profiled by looking at the time spent in each operator. This analysis gives some hints on how to optimize the processing time by looking at the nodes consuming most of the resources.

Nearest Neighbours#

import json
import numpy
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.axes_divider import make_axes_area_auto_adjustable
import pandas
from onnxruntime import InferenceSession, SessionOptions, get_device
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    SessionIOBinding, OrtDevice as C_OrtDevice, OrtValue as C_OrtValue)
from sklearn.neighbors import RadiusNeighborsRegressor
from skl2onnx import to_onnx
from tqdm import tqdm
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation
from mlprodict.plotting.plotting import onnx_simple_text_plot, plot_onnx
from mlprodict.onnxrt.ops_whole.session import OnnxWholeSession

Optimisations available on this machine.

# Prints the compiled optimisations mlprodict can use on this machine
# (the output below reports AVX and 8 OpenMP threads).
print(code_optimisation())

Out:

AVX-omp=8

Building the model#

# Synthetic regression data: 1000 samples with 10 features,
# the target being the plain sum of the features.
X = numpy.random.randn(1000, 10).astype(numpy.float64)
y = X.sum(axis=1, keepdims=True)

model = RadiusNeighborsRegressor()
model.fit(X, y)

Out:

RadiusNeighborsRegressor()

Conversion to ONNX#

# Convert the fitted estimator to ONNX. Option 'cdist' makes the converter
# emit a single CDist node (com.microsoft domain) for the pairwise
# distances, visible in the text plot below.
onx = to_onnx(model, X, options={'optim': 'cdist'})

print(onnx_simple_text_plot(onx))

Out:

opset: domain='com.microsoft' version=1
opset: domain='' version=15
opset: domain='ai.onnx.ml' version=1
input: name='X' type=dtype('float64') shape=(0, 10)
init: name='knny_ArrayFeatureExtractorcst' type=dtype('float64') shape=(1000,)
init: name='cond_CDistcst' type=dtype('float64') shape=(10000,)
init: name='cond_Lesscst' type=dtype('float64') shape=(1,) -- array([1.])
init: name='arange_CumSumcst' type=dtype('int64') shape=(1,) -- array([1])
init: name='knny_Reshapecst' type=dtype('int64') shape=(2,) -- array([  -1, 1000])
init: name='Re_Reshapecst' type=dtype('int64') shape=(2,) -- array([-1,  1])
CDist(X, cond_CDistcst) -> cond_dist
  Less(cond_dist, cond_Lesscst) -> cond_C0
    Cast(cond_C0, to=11) -> nnbin_output0
      ReduceSum(nnbin_output0, arange_CumSumcst, keepdims=0) -> norm_reduced0
  Shape(cond_dist) -> arange_shape0
    ConstantOfShape(arange_shape0) -> arange_output01
      Cast(arange_output01, to=7) -> arange_output0
        CumSum(arange_output0, arange_CumSumcst) -> arange_y0
          Neg(arange_y0) -> arange_Y0
        Add(arange_Y0, arange_output0) -> arange_C0
    Where(cond_C0, arange_C0, arange_output0) -> nnind_output0
      Flatten(nnind_output0) -> knny_output0
        ArrayFeatureExtractor(knny_ArrayFeatureExtractorcst, knny_output0) -> knny_Z0
          Reshape(knny_Z0, knny_Reshapecst, allowzero=0) -> knny_reshaped0
            Cast(knny_reshaped0, to=11) -> final_output0
      Mul(final_output0, nnbin_output0) -> final_C0
        ReduceSum(final_C0, arange_CumSumcst, keepdims=0) -> final_reduced0
          Shape(final_reduced0) -> normr_shape0
        Reshape(norm_reduced0, normr_shape0, allowzero=0) -> normr_reshaped0
          Div(final_reduced0, normr_reshaped0) -> Di_C0
            Reshape(Di_C0, Re_Reshapecst, allowzero=0) -> variable
output: name='variable' type=dtype('float64') shape=(0, 1)

The ONNX graph looks like the following.

# Draw the full ONNX graph on a tall figure so every node stays legible.
fig, ax = plt.subplots(1, 1, figsize=(8, 15))
plot_onnx(onx, ax=ax)
plot profile ort

Out:

<AxesSubplot:>

Profiling#

The profiling is enabled by setting attribute enable_profiling in SessionOptions. Method end_profiling collects all the results and stores them on disk in JSON format.

# Enable profiling before the session is created: onnxruntime then
# records timestamps for every executed node.
so = SessionOptions()
so.enable_profiling = True
sess = InferenceSession(onx.SerializeToString(), so,
                        providers=['CPUExecutionProvider'])
feeds = {'X': X[:100]}

# Run several times so the profile aggregates multiple executions.
for _ in tqdm(range(10)):
    sess.run(None, feeds)

# end_profiling dumps the collected events to a JSON file on disk
# and returns that file name.
prof = sess.end_profiling()
print(prof)

Out:

  0%|          | 0/10 [00:00<?, ?it/s]
 60%|######    | 6/10 [00:00<00:00, 57.58it/s]
100%|##########| 10/10 [00:00<00:00, 58.26it/s]
onnxruntime_profile__2022-04-05_01-44-38.json

Better rendering#

# Load the JSON profile produced by end_profiling and flatten it into a
# dataframe, one row per recorded event.
with open(prof) as f:
    records = json.load(f)
df = pandas.DataFrame(OnnxWholeSession.process_profiling(records))
df
cat pid tid dur ts ph name args_op_name args_thread_scheduling_stats args_input_type_shape args_activation_size args_parameter_size args_graph_index args_output_size args_provider args_output_type_shape args_exec_plan_index
0 Session 17007 17007 2154 10 X model_loading_array NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Session 17007 17007 9565 2279 X session_initialization NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 Node 17007 17007 3 14552 X cond_CDist_fence_before CDist NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 Node 17007 17007 3979 14567 X cond_CDist_kernel_time CDist {'main_thread': {'thread_pool_name': 'session-... [{'double': [100, 10]}, {'double': [1000, 10]}] 8000 80000 0 800000 CPUExecutionProvider [{'double': [100, 1000]}] 0
4 Node 17007 17007 1 18614 X cond_CDist_fence_after CDist NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
617 Node 17007 17007 1 185626 X Re_Reshape_fence_before Reshape NaN NaN NaN NaN NaN NaN NaN NaN NaN
618 Node 17007 17007 56 185631 X Re_Reshape_kernel_time Reshape {'main_thread': {'thread_pool_name': 'session-... [{'double': [100]}, {'int64': [2]}] 800 16 20 800 CPUExecutionProvider [{'double': [100, 1]}] 20
619 Node 17007 17007 1 185707 X Re_Reshape_fence_after Reshape NaN NaN NaN NaN NaN NaN NaN NaN NaN
620 Session 17007 17007 16133 169584 X SequentialExecutor::Execute NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
621 Session 17007 17007 16164 169569 X model_run NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

622 rows × 17 columns



Graphs#

First graph is by operator type.

# Aggregate the profile per operator type: total duration on the left,
# number of recorded events on the right, both aligned on the same order.
by_op = df[['dur', "args_op_name"]].groupby("args_op_name")
gr_dur = by_op.sum().sort_values('dur')
gr_n = by_op.count().loc[gr_dur.index, :]

fig, ax = plt.subplots(1, 2, figsize=(8, 4))
gr_dur.plot.barh(ax=ax[0])
gr_n.plot.barh(ax=ax[1])
ax[0].set_title("duration")
ax[1].set_title("n occurences")
fig.suptitle(model.__class__.__name__)
RadiusNeighborsRegressor, duration, n occurences

Out:

Text(0.5, 0.98, 'RadiusNeighborsRegressor')

Second graph is by operator name.

# Aggregate per node name, keeping the operator type alongside
# for readability in the index.
keys = ["args_op_name", "name"]
gr_dur = df[['dur'] + keys].groupby(keys).sum().sort_values('dur')
gr_dur.head(n=5)
dur
args_op_name name
ReduceSum norm_ReduceSum_fence_before 0
Reshape Re_Reshape_fence_before 6
Flatten knny_Flatten_fence_after 9
ReduceSum norm_ReduceSum_fence_after 9
Cast nnbin_Cast_fence_before 9


And the graph.

# One horizontal bar per node; the figure grows with the node count
# so the tick labels remain readable.
_, ax = plt.subplots(1, 1, figsize=(8, gr_dur.shape[0] // 2))
gr_dur.plot.barh(ax=ax)
ax.set_title("duration per node")
for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(7)
make_axes_area_auto_adjustable(ax)
duration per node

The model spends most of its time in CumSum operator. Operator Shape gets called the highest number of times.

# plt.show()

GPU or CPU#

# Pick the device: onnxruntime reports 'GPU' when built with CUDA support,
# otherwise everything stays on CPU.
if get_device().upper() == 'GPU':
    device_kind = C_OrtDevice.cuda()
else:
    device_kind = C_OrtDevice.cpu()
ort_device = C_OrtDevice(device_kind, C_OrtDevice.default_memory(), 0)

# session (profiling is still enabled through the same SessionOptions)
sess = InferenceSession(onx.SerializeToString(), so,
                        providers=['CPUExecutionProvider',
                                   'CUDAExecutionProvider'])
bind = SessionIOBinding(sess._sess)

# moving the data on CPU or GPU
ort_value = C_OrtValue.ortvalue_from_numpy(X, ort_device)

Out:

somewhere/workspace/onnxcustom/onnxcustom_UT_39_std/_venv/lib/python3.9/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py:55: UserWarning: Specified provider 'CUDAExecutionProvider' is not in available provider names.Available providers: 'CPUExecutionProvider'
  warnings.warn("Specified provider '{}' is not in available provider names."

A function which calls the API for any device.

def run_with_iobinding(sess, bind, ort_device, ort_value, dtype):
    """Runs the session through the low level IO binding API.

    Binds input 'X' and output 'variable' on *ort_device*, executes the
    model and returns the first output converted to a numpy array.
    """
    bind.bind_input('X', ort_device, dtype, ort_value.shape(),
                    ort_value.data_ptr())
    bind.bind_output('variable', ort_device)
    sess._sess.run_with_iobinding(bind, None)
    return bind.get_outputs()[0].numpy()

The profiling.

# Profile again, this time going through the IO binding API.
for _ in tqdm(range(10)):
    run_with_iobinding(sess, bind, ort_device, ort_value, X.dtype)

# Collect the new profile and flatten it into a dataframe as before.
prof = sess.end_profiling()
with open(prof) as f:
    records = json.load(f)
df = pandas.DataFrame(OnnxWholeSession.process_profiling(records))
df

Out:

  0%|          | 0/10 [00:00<?, ?it/s]
 10%|#         | 1/10 [00:00<00:01,  6.32it/s]
 20%|##        | 2/10 [00:00<00:01,  6.51it/s]
 30%|###       | 3/10 [00:00<00:01,  6.58it/s]
 40%|####      | 4/10 [00:00<00:00,  6.60it/s]
 50%|#####     | 5/10 [00:00<00:00,  6.62it/s]
 60%|######    | 6/10 [00:00<00:00,  6.63it/s]
 70%|#######   | 7/10 [00:01<00:00,  6.65it/s]
 80%|########  | 8/10 [00:01<00:00,  6.65it/s]
 90%|######### | 9/10 [00:01<00:00,  6.65it/s]
100%|##########| 10/10 [00:01<00:00,  6.65it/s]
100%|##########| 10/10 [00:01<00:00,  6.62it/s]
cat pid tid dur ts ph name args_op_name args_thread_scheduling_stats args_input_type_shape args_activation_size args_parameter_size args_graph_index args_output_size args_provider args_output_type_shape args_exec_plan_index
0 Session 17007 17007 1328 5 X model_loading_array NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Session 17007 17007 8873 1441 X session_initialization NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 Node 17007 17007 2 19133 X cond_CDist_fence_before CDist NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 Node 17007 17007 26113 19146 X cond_CDist_kernel_time CDist {'main_thread': {'thread_pool_name': 'session-... [{'double': [1000, 10]}, {'double': [1000, 10]}] 80000 80000 0 8000000 CPUExecutionProvider [{'double': [1000, 1000]}] 0
4 Node 17007 17007 1 45296 X cond_CDist_fence_after CDist NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
617 Node 17007 17007 0 1528887 X Re_Reshape_fence_before Reshape NaN NaN NaN NaN NaN NaN NaN NaN NaN
618 Node 17007 17007 58 1528892 X Re_Reshape_kernel_time Reshape {'main_thread': {'thread_pool_name': 'session-... [{'double': [1000]}, {'int64': [2]}] 8000 16 20 8000 CPUExecutionProvider [{'double': [1000, 1]}] 20
619 Node 17007 17007 1 1528971 X Re_Reshape_fence_after Reshape NaN NaN NaN NaN NaN NaN NaN NaN NaN
620 Session 17007 17007 148715 1380266 X SequentialExecutor::Execute NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
621 Session 17007 17007 148753 1380244 X model_run NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

622 rows × 17 columns



First graph is by operator type.

# Same per-operator-type aggregation as above, on the IO binding profile.
by_op = df[['dur', "args_op_name"]].groupby("args_op_name")
gr_dur = by_op.sum().sort_values('dur')
gr_n = by_op.count().loc[gr_dur.index, :]

fig, ax = plt.subplots(1, 2, figsize=(8, 4))
gr_dur.plot.barh(ax=ax[0])
gr_n.plot.barh(ax=ax[1])
ax[0].set_title("duration")
ax[1].set_title("n occurences")
fig.suptitle(model.__class__.__name__)
RadiusNeighborsRegressor, duration, n occurences

Out:

Text(0.5, 0.98, 'RadiusNeighborsRegressor')

Second graph is by operator name.

# Same per-node aggregation as above, on the IO binding profile.
keys = ["args_op_name", "name"]
gr_dur = df[['dur'] + keys].groupby(keys).sum().sort_values('dur')
gr_dur.head(n=5)
dur
args_op_name name
ReduceSum norm_ReduceSum_fence_before 0
Reshape Re_Reshape_fence_before 2
Shape arange_Shape_fence_after 6
Reshape normr_Reshape_fence_after 7
knny_Reshape_fence_after 7


And the graph.

# Same per-node duration chart as above, for the IO binding profile.
_, ax = plt.subplots(1, 1, figsize=(8, gr_dur.shape[0] // 2))
gr_dur.plot.barh(ax=ax)
ax.set_title("duration per node")
for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(7)
make_axes_area_auto_adjustable(ax)
duration per node

It shows the same results.

# plt.show()

Total running time of the script: ( 0 minutes 23.126 seconds)

Gallery generated by Sphinx-Gallery