Dataframe as an input#
A pipeline usually ingests data as a matrix. A dataframe can only be converted into a matrix if all its columns share the same type, but data held in a dataframe usually mix several types: float, integer, or string for categories. ONNX supports that case as well.
A dataset with categories#
from mlinsights.plotting import pipeline2dot
import numpy
import pprint
from mlprodict.onnx_conv import guess_schema_from_data
from onnxruntime import InferenceSession
from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
from mlprodict.onnxrt import OnnxInference
from mlprodict.onnx_conv import to_onnx as to_onnx_ext
from skl2onnx import to_onnx
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
data = DataFrame([
    dict(CAT1='a', CAT2='c', num1=0.5, num2=0.6, y=0),
    dict(CAT1='b', CAT2='d', num1=0.4, num2=0.8, y=1),
    dict(CAT1='a', CAT2='d', num1=0.5, num2=0.56, y=0),
    dict(CAT1='a', CAT2='d', num1=0.55, num2=0.56, y=1),
    dict(CAT1='a', CAT2='c', num1=0.35, num2=0.86, y=0),
    dict(CAT1='a', CAT2='c', num1=0.5, num2=0.68, y=1),
])
cat_cols = ['CAT1', 'CAT2']
train_data = data.drop('y', axis=1)
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_cols)],
    remainder='passthrough')
pipe = Pipeline([('preprocess', preprocessor),
                 ('rf', RandomForestClassifier())])
pipe.fit(train_data, data['y'])
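Note that the columns do not share a single dtype: the categories are strings and the numerical features are floats, so casting the whole dataframe into one matrix would collapse everything into an object array. A quick check, a minimal sketch relying only on the objects defined above.
print(train_data.dtypes)
# one dtype per column: object for CAT1/CAT2, float64 for num1/num2
print(train_data.to_numpy().dtype)
# the mixed columns collapse into a single object dtype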
Display.
dot = pipeline2dot(pipe, train_data)
ax = plot_graphviz(dot)
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)

Conversion to ONNX#
The function to_onnx from skl2onnx does not handle dataframes.
try:
    onx = to_onnx(pipe, train_data[:1])
except NotImplementedError as e:
    print(e)
But it is possible to use the extended version from mlprodict.
onx = to_onnx_ext(
    pipe, train_data[:1],
    options={RandomForestClassifier: {'zipmap': False}})
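The option zipmap set to False tells the converter to return the probabilities as a plain tensor rather than a list of dictionaries, which is easier to post-process. To verify that every dataframe column became a separate ONNX input, one can inspect the protobuf. A minimal sketch using the model just created:
print([i.name for i in onx.graph.input])
# one input per column: CAT1, CAT2, num1, num2
print([o.name for o in onx.graph.output])
# label and probabilities once zipmap is disabled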
Graph#
oinf = OnnxInference(onx)
ax = plot_graphviz(oinf.to_dot())
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)

Prediction with ONNX#
onnxruntime does not support dataframes.
sess = InferenceSession(onx.SerializeToString(),
                        providers=['CPUExecutionProvider'])
try:
    sess.run(None, train_data)
except Exception as e:
    print(e)
run(): incompatible function arguments. The following argument types are supported:
1. (self: onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession, arg0: List[str], arg1: Dict[str, object], arg2: onnxruntime.capi.onnxruntime_pybind11_state.RunOptions) -> List[object]
Invoked with: <onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession object at 0x7faf8eea20b0>, ['label', 'probabilities'], CAT1 CAT2 num1 num2
0 a c 0.50 0.60
1 b d 0.40 0.80
2 a d 0.50 0.56
3 a d 0.55 0.56
4 a c 0.35 0.86
5 a c 0.50 0.68, None
Let’s use a shortcut: OnnxInference accepts dataframes.
oinf = OnnxInference(onx)
got = oinf.run(train_data)
print(pipe.predict(train_data))
print(got['label'])
[0 1 0 1 0 1]
[0 1 0 1 0 1]
And the probabilities.
print(pipe.predict_proba(train_data))
print(got['probabilities'])
[[0.78 0.22]
[0.26 0.74]
[0.69 0.31]
[0.28 0.72]
[0.73 0.27]
[0.27 0.73]]
[[0.78 0.22 ]
[0.2600001 0.7399999 ]
[0.69000006 0.30999997]
[0.28000015 0.71999985]
[0.73 0.26999995]
[0.2700001 0.7299999 ]]
It looks fine. Let’s dig into the details to use onnxruntime directly.
Unhiding the conversion logic for a dataframe#
A dataframe can be seen as a set of columns, each with its own type. That is exactly what ONNX should see: a list of inputs where the input name is the column name and the input type is the column type.
init = guess_schema_from_data(train_data)
pprint.pprint(init)
[('CAT1', StringTensorType(shape=[None, 1])),
('CAT2', StringTensorType(shape=[None, 1])),
('num1', DoubleTensorType(shape=[None, 1])),
('num2', DoubleTensorType(shape=[None, 1]))]
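The same list could be built by hand from the dataframe dtypes. This is a minimal sketch assuming skl2onnx's standard tensor types; the helper build_schema is written for illustration only and is not part of any library.
from skl2onnx.common.data_types import (
    DoubleTensorType, FloatTensorType, StringTensorType)


def build_schema(df):
    # map every pandas dtype onto a tensor type, one ONNX input per column
    init = []
    for name, dtype in zip(df.columns, df.dtypes):
        if dtype == numpy.float32:
            init.append((name, FloatTensorType([None, 1])))
        elif dtype == numpy.float64:
            init.append((name, DoubleTensorType([None, 1])))
        else:
            # categorical columns are stored as strings (dtype object)
            init.append((name, StringTensorType([None, 1])))
    return init


pprint.pprint(build_schema(train_data))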
Let’s use float32 instead.
for c in train_data.columns:
    if c not in cat_cols:
        train_data[c] = train_data[c].astype(numpy.float32)
init = guess_schema_from_data(train_data)
pprint.pprint(init)
[('CAT1', StringTensorType(shape=[None, 1])),
('CAT2', StringTensorType(shape=[None, 1])),
('num1', FloatTensorType(shape=[None, 1])),
('num2', FloatTensorType(shape=[None, 1]))]
Let’s convert with skl2onnx only.
onx2 = to_onnx(
    pipe, initial_types=init,
    options={RandomForestClassifier: {'zipmap': False}})
Let’s run it with onnxruntime. The dataframe must be converted into a dictionary mapping each column name to its values reshaped as a 2D array.
inputs = {c: train_data[c].values.reshape((-1, 1))
          for c in train_data.columns}
pprint.pprint(inputs)
{'CAT1': array([['a'],
['b'],
['a'],
['a'],
['a'],
['a']], dtype=object),
'CAT2': array([['c'],
['d'],
['d'],
['d'],
['c'],
['c']], dtype=object),
'num1': array([[0.5 ],
[0.4 ],
[0.5 ],
[0.55],
[0.35],
[0.5 ]], dtype=float32),
'num2': array([[0.6 ],
[0.8 ],
[0.56],
[0.56],
[0.86],
[0.68]], dtype=float32)}
Inference.
sess2 = InferenceSession(onx2.SerializeToString(),
                         providers=['CPUExecutionProvider'])
got2 = sess2.run(None, inputs)
print(pipe.predict(train_data))
print(got2[0])
[0 1 0 1 0 1]
[0 1 0 1 0 1]
And the probabilities.
print(pipe.predict_proba(train_data))
print(got2[1])
[[0.78 0.22]
[0.26 0.74]
[0.69 0.31]
[0.28 0.72]
[0.73 0.27]
[0.27 0.73]]
[[0.78 0.22000003]
[0.2600004 0.7399996 ]
[0.69000006 0.30999997]
[0.2800004 0.7199996 ]
[0.73 0.27 ]
[0.2700004 0.7299996 ]]
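The small differences come from the single precision (float32) arithmetic used inside the ONNX graph. To check programmatically that both models agree, the probability arrays can be compared with a tolerance. A minimal sketch relying on the objects defined above.
numpy.testing.assert_allclose(
    pipe.predict_proba(train_data), got2[1], atol=1e-5)
print('sklearn and onnxruntime probabilities are equal up to 1e-5')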