PyTorch and onnxruntime#

This example compares the training between PyTorch and ORTModule.

Functions#

The first function creates the neural network.

import time
import copy
import numpy
import onnx  # noqa
from pandas import DataFrame
from onnxruntime import get_device
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from mlprodict.plotting.plotting_onnx import plot_onnx
from tqdm import tqdm
import torch
import torch.nn.functional as F
from onnxruntime.training import ORTModule
from deeponnxcustom.tools.onnx_helper import save_as_onnx


def from_numpy(v, device=None, requires_grad=False):
    """
    Converts a numpy array into a torch tensor and
    sets *device* and *requires_grad*.

    :param v: numpy array to convert
    :param device: if not None, moves the tensor to this device
    :param requires_grad: enables gradient tracking on the result
    :return: torch tensor (shares memory with *v* unless moved
        to another device)
    """
    v = torch.from_numpy(v)
    if device is not None:
        v = v.to(device)
    v.requires_grad_(requires_grad)
    return v


def build_class_model():
    """Returns the class *CustomNet*, a small fully connected
    network applying a ReLU activation after every layer."""

    class CustomNet(torch.nn.Module):
        def __init__(self, n_features, hidden_layer_sizes, n_output):
            super(CustomNet, self).__init__()
            # Keep a plain python list of the layers; each layer is
            # also assigned as an attribute (hid0, hid1, ..., predict)
            # so that torch registers its parameters.
            self.hidden = []
            dims = [n_features] + list(hidden_layer_sizes)
            for index in range(len(hidden_layer_sizes)):
                layer = torch.nn.Linear(dims[index], dims[index + 1])
                self.hidden.append(layer)
                setattr(self, "hid%d" % index, layer)
            last = torch.nn.Linear(dims[-1], n_output)
            self.hidden.append(last)
            self.predict = last

        def forward(self, x):
            # NOTE(review): ReLU is applied after the last layer too,
            # which constrains the output to be non-negative.
            for layer in self.hidden:
                x = F.relu(layer(x))
            return x

    return CustomNet


# Instantiate the network (5 input features, hidden layers of sizes
# 4 and 3, one output), export it to ONNX and plot the ONNX graph.
cls = build_class_model()
model = cls(5, (4, 3), 1)
# The third argument (5) presumably matches the number of input
# features used to build the dummy input for the export — TODO
# confirm against save_as_onnx's signature.
save_as_onnx(model, "plot_ortmodule.onnx", 5)
plot_onnx("plot_ortmodule.onnx", temp_dot="plot_ortmodule.dot")
plot ortmodule

Out:

<AxesSubplot:>

Training#

The function train_model works with a torch.nn.Module and ORTModule.

def train_model(nn, X_train, y_train, max_iter=25,
                learning_rate_init=1e-5, batch_size=10,
                device='cpu', opset=12, verbose=False,
                use_ortmodule=False):
    """
    Compares :epkg:`onnxruntime-training` to :epkg:`scikit-learn` for
    training. Training algorithm is SGD.

    :param nn: model to train
    :param X_train: training features (numpy array or torch tensor)
    :param y_train: training targets (numpy array or torch tensor)
    :param max_iter: number of iterations
    :param learning_rate_init: initial learning rate
    :param batch_size: batch size
    :param device: `'cpu'` or `'cuda'`
    :param opset: opset to choose for the conversion (currently unused)
    :param use_ortmodule: use :epkg:`ORTModule`
    :param verbose: displays intermediate information
    :return: tuple *(trained model, losses per epoch, duration in seconds)*

    .. note::
        With *verbose=True* the function prints the module-level
        variables *N*, *n_features* and *hidden_layer_sizes*, so it can
        only run inside this script.
    """
    max_iter = int(max_iter)
    learning_rate_init = float(learning_rate_init)
    batch_size = int(batch_size)

    if verbose:
        print("N=%d" % N)
        print("n_features=%d" % n_features)
        print("hidden_layer_sizes=%r" % (hidden_layer_sizes, ))
        print("max_iter=%d" % max_iter)
        print("learning_rate_init=%f" % learning_rate_init)
        print("batch_size=%d" % batch_size)
        print("opset=%r (unused)" % opset)
        print("device=%r" % device)

    # Map the string alias to a real torch device, keeping the original
    # string around to decide between nn.cpu() and nn.cuda().
    device0 = device
    device = torch.device(
        "cuda:0" if device in ('cuda', 'cuda:0', 'gpu') else "cpu")

    if verbose:
        print("fixed device=%r" % device)
        print('------------------')

    # Move the model to the requested device before wrapping it.
    if device0 == 'cpu':
        nn.cpu()
    else:
        nn.cuda(device=device)

    if verbose:
        print("n_parameters=%d, n_layers=%d" % (
            len(list(nn.parameters())), len(nn.hidden)))
        for i, p in enumerate(nn.parameters()):
            print("  p[%d].shape=%r" % (i, p.shape))

    if use_ortmodule:
        nn = ORTModule(nn)

    optimizer = torch.optim.SGD(nn.parameters(), lr=learning_rate_init)
    # reduction='sum' is the modern equivalent of the deprecated
    # size_average=False (the old spelling raised a UserWarning).
    criterion = torch.nn.MSELoss(reduction='sum')
    batch_no = X_train.shape[0] // batch_size

    # Warm-up call before timing — presumably to trigger ORTModule's
    # graph build outside the measured loop; confirm against the
    # ORTModule documentation.
    inputs = torch.tensor(
        X_train[:1], requires_grad=True, device=device)
    nn(inputs)

    def train_torch():
        if verbose:
            loop = tqdm(range(max_iter))
        else:
            loop = range(max_iter)

        losses = []
        for epoch in loop:
            running_loss = 0.0
            # Reshuffle the training set at every epoch.
            x, y = shuffle(X_train, y_train)
            for i in range(batch_no):
                start = i * batch_size
                end = start + batch_size
                inputs = torch.tensor(
                    x[start:end], requires_grad=True, device=device)
                labels = torch.tensor(
                    y[start:end], requires_grad=True, device=device)

                def step_torch():
                    optimizer.zero_grad()
                    outputs = nn(inputs)
                    loss = criterion(outputs, torch.unsqueeze(labels, dim=1))
                    loss.backward()
                    optimizer.step()
                    return loss

                loss = step_torch()
                running_loss += loss.item()
            # Loss averaged over all training samples for this epoch.
            losses.append(running_loss / X_train.shape[0])
        return losses

    begin = time.perf_counter()
    losses = train_torch()
    dur = time.perf_counter() - begin

    if verbose:
        print("time_torch=%r, running_loss=%r" % (dur, losses[-1]))

    return nn, losses, dur

Some data.

# Synthetic regression problem: N samples with n_features features,
# cast to float32 as expected by torch and onnxruntime.
N = 1000
n_features = 5

X, y = make_regression(N, n_features=n_features, bias=2)
X = X.astype(numpy.float32)
y = y.astype(numpy.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y)

And one model

# Two hidden layers of sizes 5 and 3, one output for the regression.
hidden_layer_sizes = (5, 3)

cls_net = build_class_model()
nn = cls_net(n_features, hidden_layer_sizes, 1)

Training with PyTorch#

device = "cuda" if get_device() == 'GPU' else 'cpu'

t_x_train = from_numpy(X_train, device=device)
t_y_train = from_numpy(y_train, device=device)

trained_nn_tch, losses_tch, duration = train_model(
    copy.deepcopy(nn), t_x_train, t_y_train, verbose=True,
    device=device, learning_rate_init=1e-4)

print("Torch, %d iterations, final loss=%f, duration=%f" % (
    len(losses_tch), losses_tch[-1], duration))
print('type(trained_nn_tch):', type(trained_nn_tch))

Out:

N=1000
n_features=5
hidden_layer_sizes=(5, 3)
max_iter=25
learning_rate_init=0.000100
batch_size=10
opset=12 (unused)
device='cpu'
fixed device=device(type='cpu')
------------------
n_parameters=6, n_layers=3
  p[0].shape=torch.Size([5, 5])
  p[1].shape=torch.Size([5])
  p[2].shape=torch.Size([3, 5])
  p[3].shape=torch.Size([3])
  p[4].shape=torch.Size([1, 3])
  p[5].shape=torch.Size([1])
somewhere/workspace/deeponnxcustom/deeponnxcustom_UT_39_std/_venv/lib/python3.9/site-packages/torch/nn/_reduction.py:42: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))
somewhere/workspace/deeponnxcustom/deeponnxcustom_UT_39_std/_doc/examples/plot_ortmodule.py:143: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  inputs = torch.tensor(

  0%|          | 0/25 [00:00<?, ?it/s]somewhere/workspace/deeponnxcustom/deeponnxcustom_UT_39_std/_doc/examples/plot_ortmodule.py:160: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  inputs = torch.tensor(
somewhere/workspace/deeponnxcustom/deeponnxcustom_UT_39_std/_doc/examples/plot_ortmodule.py:162: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  labels = torch.tensor(

  4%|4         | 1/25 [00:00<00:03,  6.06it/s]
  8%|8         | 2/25 [00:00<00:03,  6.21it/s]
 12%|#2        | 3/25 [00:00<00:03,  6.18it/s]
 16%|#6        | 4/25 [00:00<00:03,  6.19it/s]
 20%|##        | 5/25 [00:00<00:03,  6.23it/s]
 24%|##4       | 6/25 [00:00<00:03,  6.25it/s]
 28%|##8       | 7/25 [00:01<00:02,  6.27it/s]
 32%|###2      | 8/25 [00:01<00:02,  6.24it/s]
 36%|###6      | 9/25 [00:01<00:02,  6.23it/s]
 40%|####      | 10/25 [00:01<00:02,  6.26it/s]
 44%|####4     | 11/25 [00:01<00:02,  6.24it/s]
 48%|####8     | 12/25 [00:01<00:02,  6.22it/s]
 52%|#####2    | 13/25 [00:02<00:01,  6.22it/s]
 56%|#####6    | 14/25 [00:02<00:01,  6.21it/s]
 60%|######    | 15/25 [00:02<00:01,  6.21it/s]
 64%|######4   | 16/25 [00:02<00:01,  6.25it/s]
 68%|######8   | 17/25 [00:02<00:01,  6.23it/s]
 72%|#######2  | 18/25 [00:02<00:01,  6.22it/s]
 76%|#######6  | 19/25 [00:03<00:00,  6.25it/s]
 80%|########  | 20/25 [00:03<00:00,  6.27it/s]
 84%|########4 | 21/25 [00:03<00:00,  6.25it/s]
 88%|########8 | 22/25 [00:03<00:00,  6.23it/s]
 92%|#########2| 23/25 [00:03<00:00,  6.26it/s]
 96%|#########6| 24/25 [00:03<00:00,  6.28it/s]
100%|##########| 25/25 [00:04<00:00,  6.29it/s]
100%|##########| 25/25 [00:04<00:00,  6.24it/s]
time_torch=4.009416713990504, running_loss=23255.341375
Torch, 25 iterations, final loss=23255.341375, duration=4.009417
type(trained_nn_tch): <class '__main__.build_class_model.<locals>.CustomNet'>

Training with ORTModule#

# Same training but wrapped into ORTModule; starts from the same
# initial weights (deepcopy of *nn*) and receives the raw numpy
# arrays instead of torch tensors.
trained_nn_ort, losses_ort, duration = train_model(
    copy.deepcopy(nn), X_train, y_train, verbose=True,
    device=device, learning_rate_init=1e-4, use_ortmodule=True)

print("ORT, %d iterations, final loss=%f, duration=%f" % (
    len(losses_ort), losses_ort[-1], duration))
print('type(trained_nn_ort):', type(trained_nn_ort))

Out:

N=1000
n_features=5
hidden_layer_sizes=(5, 3)
max_iter=25
learning_rate_init=0.000100
batch_size=10
opset=12 (unused)
device='cpu'
fixed device=device(type='cpu')
------------------
n_parameters=6, n_layers=3
  p[0].shape=torch.Size([5, 5])
  p[1].shape=torch.Size([5])
  p[2].shape=torch.Size([3, 5])
  p[3].shape=torch.Size([3])
  p[4].shape=torch.Size([1, 3])
  p[5].shape=torch.Size([1])
somewhere/workspace/deeponnxcustom/deeponnxcustom_UT_39_std/_venv/lib/python3.9/site-packages/torch/nn/_reduction.py:42: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))
somewhere/workspace/deeponnxcustom/deeponnxcustom_UT_39_std/_venv/lib/python3.9/site-packages/onnxruntime/training/ortmodule/_training_manager.py:190: UserWarning: Fast path enabled - skipping checks. Rebuild graph: True, Execution agent: True, Device check: True
  warnings.warn(

  0%|          | 0/25 [00:00<?, ?it/s]
  4%|4         | 1/25 [00:00<00:04,  5.32it/s]
  8%|8         | 2/25 [00:00<00:04,  5.46it/s]
 12%|#2        | 3/25 [00:00<00:03,  5.51it/s]
 16%|#6        | 4/25 [00:00<00:03,  5.52it/s]
 20%|##        | 5/25 [00:00<00:03,  5.53it/s]
 24%|##4       | 6/25 [00:01<00:03,  5.54it/s]
 28%|##8       | 7/25 [00:01<00:03,  5.55it/s]
 32%|###2      | 8/25 [00:01<00:03,  5.56it/s]
 36%|###6      | 9/25 [00:01<00:02,  5.57it/s]
 40%|####      | 10/25 [00:01<00:02,  5.57it/s]
 44%|####4     | 11/25 [00:01<00:02,  5.58it/s]
 48%|####8     | 12/25 [00:02<00:02,  5.59it/s]
 52%|#####2    | 13/25 [00:02<00:02,  5.59it/s]
 56%|#####6    | 14/25 [00:02<00:01,  5.59it/s]
 60%|######    | 15/25 [00:02<00:01,  5.59it/s]
 64%|######4   | 16/25 [00:02<00:01,  5.60it/s]
 68%|######8   | 17/25 [00:03<00:01,  5.60it/s]
 72%|#######2  | 18/25 [00:03<00:01,  5.60it/s]
 76%|#######6  | 19/25 [00:03<00:01,  5.61it/s]
 80%|########  | 20/25 [00:03<00:00,  5.61it/s]
 84%|########4 | 21/25 [00:03<00:00,  5.61it/s]
 88%|########8 | 22/25 [00:03<00:00,  5.61it/s]
 92%|#########2| 23/25 [00:04<00:00,  5.61it/s]
 96%|#########6| 24/25 [00:04<00:00,  5.61it/s]
100%|##########| 25/25 [00:04<00:00,  5.60it/s]
100%|##########| 25/25 [00:04<00:00,  5.58it/s]
time_torch=4.484404376999009, running_loss=23255.341333333334
ORT, 25 iterations, final loss=23255.341333, duration=4.484404
type(trained_nn_ort): <class 'onnxruntime.training.ortmodule.ortmodule.ORTModule'>

Visualisation#

# Plot both loss curves side by side to compare the trainings.
df = DataFrame(dict(torch=losses_tch, ort=losses_ort))
df.plot(title="Training loss / iterations")
Training loss / iterations

Out:

<AxesSubplot:title={'center':'Training loss / iterations'}>

Performance#

# Mean squared error of both trained models on the held-out test set.
# Note: this rebinds the module-level N to the test-set size.
N = X_test.shape[0]

t_x_test = from_numpy(X_test, device=device)
t_y_test = from_numpy(y_test, device=device)

pred_tch = trained_nn_tch(t_x_test)
pred_ort = trained_nn_ort(t_x_test)

error_tch = ((pred_tch.ravel() - t_y_test.ravel()) ** 2).sum() / N
error_ort = ((pred_ort.ravel() - t_y_test.ravel()) ** 2).sum() / N

print("error torch: %f" % error_tch)
print("error ort: %f" % error_ort)

# import matplotlib.pyplot as plt
# plt.show()

Out:

error torch: 23460.455078
error ort: 23460.455078

Total running time of the script: ( 0 minutes 10.295 seconds)

Gallery generated by Sphinx-Gallery