Forward backward on a neural network on GPU (Nesterov) and penalty
This example does the same as Forward backward on a neural network on GPU but updates the weights using Nesterov momentum.
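The Nesterov update keeps a velocity term that accumulates past gradients and applies a lookahead correction. A minimal numpy sketch of one update step, assuming the formulation used by scikit-learn's SGD solver (the variable names are illustrative):

import numpy

momentum, learning_rate = 0.9, 5e-5
weights = numpy.zeros(10, dtype=numpy.float32)
velocity = numpy.zeros_like(weights)
gradient = numpy.ones_like(weights)  # placeholder gradient

# the velocity accumulates past gradients...
velocity = momentum * velocity - learning_rate * gradient
# ...and the lookahead step corrects the weights with the updated velocity
weights += momentum * velocity - learning_rate * gradient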
A neural network with scikit-learn
import warnings
import numpy
import onnx
from pandas import DataFrame
from onnxruntime import get_device
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from onnxcustom.plotting.plotting_onnx import plot_onnxs
from mlprodict.onnx_conv import to_onnx
from mlprodict.plotting.text_plot import onnx_simple_text_plot
from onnxcustom.utils.orttraining_helper import get_train_initializer
from onnxcustom.utils.onnx_helper import onnx_rename_weights
from onnxcustom.training.optimizers_partial import (
    OrtGradientForwardBackwardOptimizer)
from onnxcustom.training.sgd_learning_rate import LearningRateSGDNesterov
from onnxcustom.training.sgd_learning_penalty import ElasticLearningPenalty
X, y = make_regression(1000, n_features=10, bias=2)
X = X.astype(numpy.float32)
y = y.astype(numpy.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y)
nn = MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=100,
                  solver='sgd', learning_rate_init=5e-5,
                  n_iter_no_change=1000, batch_size=10, alpha=0,
                  momentum=0.9, nesterovs_momentum=True)
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    nn.fit(X_train, y_train)
print(nn.loss_curve_)
Out:
[13689.16526570638, 91.7557507443428, 4.649383627573649, 2.5948190772533417, 1.7974506411949793, 1.2746547999978066, 0.8853114599982898, 0.7218076496322949, 0.6351874788602193, 0.5119553685684999, 0.4288527140766382, 0.38650360981623333, 0.36558753594756127, 0.3098751598596573, 0.2760043459633986, 0.24989255496611198, 0.22109427648286026, 0.2174149859075745, 0.20371884312480687, 0.17956425132850806, 0.17394759574284155, 0.1533257754519582, 0.16605487483243148, 0.1402435441315174, 0.1360019328817725, 0.12801098541046182, 0.11994707521051168, 0.11173540908843278, 0.11873933079342047, 0.10600779481853048, 0.10563321474939585, 0.10062460877622167, 0.09315281248341004, 0.0988034976584216, 0.08800906119247277, 0.09059339290484786, 0.08555087928970655, 0.08111379235672454, 0.0786132145114243, 0.08379405161365866, 0.0754975994117558, 0.07838810011744499, 0.07541584953665734, 0.06973540396119157, 0.06646330761412779, 0.06717006959021092, 0.06358123812203606, 0.06679932817816735, 0.06430457157393296, 0.06468540085479617, 0.06309005339940389, 0.060976478575418394, 0.059586922780921064, 0.05699720985566576, 0.058460857709869744, 0.05800200457374255, 0.05783799896637599, 0.05946166674296061, 0.0547905424858133, 0.054829340009018776, 0.05512160603577892, 0.053804953054835396, 0.05130224403614799, 0.05159920582547784, 0.05158132251352072, 0.04974819915369153, 0.050237851353983086, 0.04863181858944396, 0.04957028084124128, 0.047807619171217085, 0.048763600445042056, 0.049349545184522864, 0.04866982779155175, 0.047656926562388735, 0.04616892060264945, 0.047021689740940925, 0.04410986665015419, 0.04561400677698354, 0.04263253377129634, 0.04437223461767038, 0.04321897224833568, 0.04216633066069335, 0.043267434056227405, 0.042707170359790325, 0.04074323491503795, 0.04279285515348116, 0.04207661844479541, 0.04092800240808477, 0.03897427704806129, 0.038144353556757174, 0.04032668794815739, 0.039126982853437464, 0.040249466576303045, 0.03883387777333458, 0.039946185710529485, 0.03867558625061065, 0.03737739828104774, 0.03682649721391499, 0.03755189497023821, 0.038114412079254784]
Score:
print("mean_squared_error=%r" % mean_squared_error(y_test, nn.predict(X_test)))
Out:
mean_squared_error=0.34603828
Conversion to ONNX
onx = to_onnx(nn, X_train[:1].astype(numpy.float32), target_opset=15)
plot_onnxs(onx)
weights = list(sorted(get_train_initializer(onx)))
print(weights)

Out:
['coefficient', 'coefficient1', 'coefficient2', 'intercepts', 'intercepts1', 'intercepts2']
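The names match the coefficients and intercepts of the three layers. Their shapes can be checked directly on the ONNX graph with the official onnx API (a short sketch, reusing the onx model created above):

from onnx import numpy_helper

for init in onx.graph.initializer:
    print(init.name, numpy_helper.to_array(init).shape)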
Training graph with forward backward
device = "cuda" if get_device().upper() == 'GPU' else 'cpu'
print("device=%r get_device()=%r" % (device, get_device()))
onx = onnx_rename_weights(onx)
train_session = OrtGradientForwardBackwardOptimizer(
    onx, device=device, verbose=1,
    learning_rate=LearningRateSGDNesterov(1e-4, nesterov=True, momentum=0.9),
    warm_start=False, max_iter=100, batch_size=10)
train_session.fit(X, y)
Out:
device='cpu' get_device()='CPU'
  0%|          | 0/100 [00:00<?, ?it/s]
...
100%|##########| 100/100 [00:22<00:00, 4.41it/s]
OrtGradientForwardBackwardOptimizer(model_onnx='ir_version...', weights_to_train="['I0_coeff...", loss_output_name='loss', max_iter=100, training_optimizer_name='SGDOptimizer', batch_size=10, learning_rate=LearningRateSGDNesterov(eta0=0.0001, alpha=0.0001, power_t=0.25, learning_rate='invscaling', momentum=0.9, nesterov=True), value=3.1622776601683795e-05, device='cpu', warm_start=False, verbose=1, validation_every=10, learning_loss=SquareLearningLoss(), enable_logging=False, weight_name=None, learning_penalty=NoLearningPenalty(), exc=True)
Let’s see the weights.
state_tensors = train_session.get_state()
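A short sketch to look at them, assuming get_state returns one numpy array per trained initializer, in the same order as the attribute weights_to_train shown in the repr above:

for name, tensor in zip(train_session.weights_to_train, state_tensors):
    print(name, tensor.shape)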
And the loss.
print(train_session.train_losses_)
df = DataFrame({'ort losses': train_session.train_losses_,
                'skl losses': nn.loss_curve_})
df.plot(title="Train loss against iterations (Nesterov)", logy=True)

Out:
[3130.368, 18.732786, 8.782759, 4.662987, 3.5358355, 2.2379062, 2.0110888, 1.4150627, 1.3563325, 1.0484885, 1.0823491, 0.7789048, 0.6522911, 0.64664274, 0.47164518, 0.55573845, 0.49858397, 0.504501, 0.477184, 0.39480156, 0.36533645, 0.33328384, 0.31991947, 0.29673743, 0.27256164, 0.29752114, 0.22641943, 0.2335147, 0.2719339, 0.26593828, 0.18966302, 0.22315162, 0.18326737, 0.18820351, 0.18388198, 0.21446659, 0.19665642, 0.17781933, 0.15517798, 0.17332165, 0.17328648, 0.16581976, 0.1642249, 0.15336338, 0.14892954, 0.14062904, 0.15288725, 0.16279854, 0.17945854, 0.11618008, 0.12503687, 0.17332283, 0.13908692, 0.13753873, 0.13397205, 0.12734419, 0.13572647, 0.12536834, 0.12060101, 0.12844919, 0.11359391, 0.14157191, 0.1421669, 0.14367269, 0.1083925, 0.09146446, 0.11235092, 0.11523097, 0.10359235, 0.11370159, 0.12755904, 0.114748985, 0.10519836, 0.11907223, 0.10175072, 0.09999031, 0.11850316, 0.12765878, 0.0916389, 0.09053176, 0.0857964, 0.09085918, 0.09788595, 0.08369264, 0.11704714, 0.07964846, 0.10061881, 0.09443776, 0.06586561, 0.098216526, 0.094929844, 0.105257705, 0.080187045, 0.120990746, 0.097052984, 0.100940645, 0.08430198, 0.1023317, 0.09304747, 0.084736824]
<AxesSubplot:title={'center':'Train loss against iterations (Nesterov)'}>
The convergence rate differs, but the two implementations do not update the learning rate in exactly the same way.
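Both rely on an inverse-scaling schedule. A sketch of the decay, assuming the same formula scikit-learn applies for learning_rate='invscaling' and the parameters shown in the repr above (eta0=1e-4, power_t=0.25):

eta0, power_t = 1e-4, 0.25
for t in [1, 10, 100, 1000]:
    print("t=%d eta=%g" % (t, eta0 / t ** power_t))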
Regularization
The default parameters of MLPRegressor penalize the weights during training: alpha=1e-4.
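onnxcustom expresses the same idea with ElasticLearningPenalty, which combines L1 and L2 terms. A minimal numpy sketch of what the penalty adds to the loss and how it shrinks the weights; the constant 0.9998 = 1 - 2 * l2 appearing in the penalty_grad graph at the end of this page corresponds to the last line:

import numpy

l1, l2 = 0.0, 1e-4
w = numpy.random.randn(10).astype(numpy.float32)
# term added to the training loss
penalty = l1 * numpy.abs(w).sum() + l2 * (w ** 2).sum()
# shrinkage applied to the weights by the penalty gradient
w = (1 - 2 * l2) * w - l1 * numpy.sign(w)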
nn = MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=100,
                  solver='sgd', learning_rate_init=5e-5,
                  n_iter_no_change=1000, batch_size=10, alpha=1e-4,
                  momentum=0.9, nesterovs_momentum=True)
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    nn.fit(X_train, y_train)
print(nn.loss_curve_)
Out:
[16925.227527803556, 133.0551626022662, 5.3601373485844945, 2.8570205609372783, 1.8681195500288008, 1.352136299099668, 1.1213359523994448, 0.8786676367611571, 0.7795861063898085, 0.6842196166339557, 0.618143480872774, 0.5092322058792431, 0.5214818535900275, 0.43350286077299127, 0.3747708899494489, 0.3729585092393398, 0.3674547356944084, 0.3112230614336332, 0.2919397004592579, 0.2730200425107638, 0.28718040407601986, 0.25925811573967933, 0.22987443296599386, 0.22644076932239535, 0.2107776988305728, 0.19227200695352556, 0.18815035757013956, 0.17692219021575048, 0.18523538126141234, 0.1517408892923435, 0.1613010879564285, 0.1524273388816992, 0.14226737867110575, 0.14102457750843364, 0.1339809631915887, 0.1220833397352735, 0.1321089367071628, 0.12727790340774064, 0.1264002723717849, 0.11620328127149741, 0.10991683337531878, 0.12169686444644134, 0.10350258639145693, 0.10792311866116922, 0.11235495794891112, 0.09648877592075267, 0.09774628427893324, 0.09804847498800752, 0.09064238116018375, 0.09432376242547429, 0.09317767655677399, 0.08397823389888605, 0.0890420351388812, 0.0812699030062159, 0.0798033143543641, 0.07714706405753295, 0.0767479731821696, 0.07129691830141739, 0.0774604360090415, 0.0672014956052979, 0.06975962093914745, 0.06516785950627323, 0.06355210312581458, 0.06667021294324799, 0.06303195333296059, 0.05952659788588682, 0.05929840160451731, 0.05884053621215423, 0.05976430059884787, 0.05503657083478769, 0.053907410404109964, 0.05188659416247807, 0.05157012472287813, 0.05354192620701791, 0.05225116883513528, 0.04745761615337633, 0.04782393656477728, 0.050142081270647065, 0.045592689022850984, 0.04563582799879509, 0.044798845451024166, 0.04477875623816152, 0.043829500478303446, 0.042947176072833944, 0.03938143999367954, 0.04368404039475322, 0.04043318261856039, 0.03825321433705887, 0.03775531529172263, 0.03899387021829585, 0.036648529332029826, 0.0377433476645112, 0.03680462592760721, 0.033720300496949754, 0.03423753929695288, 0.035463142327976226, 0.03711934005187649, 0.03412673814655345, 0.032764130491079395, 0.03254216458246608]
Let’s do the same with onnxruntime.
train_session = OrtGradientForwardBackwardOptimizer(
    onx, device=device, verbose=1,
    learning_rate=LearningRateSGDNesterov(1e-4, nesterov=True, momentum=0.9),
    learning_penalty=ElasticLearningPenalty(l1=0, l2=1e-4),
    warm_start=False, max_iter=100, batch_size=10)
train_session.fit(X, y)
Out:
  0%|          | 0/100 [00:00<?, ?it/s]
...
100%|##########| 100/100 [00:31<00:00, 3.13it/s]
OrtGradientForwardBackwardOptimizer(model_onnx='ir_version...', weights_to_train="['I0_coeff...", loss_output_name='loss', max_iter=100, training_optimizer_name='SGDOptimizer', batch_size=10, learning_rate=LearningRateSGDNesterov(eta0=0.0001, alpha=0.0001, power_t=0.25, learning_rate='invscaling', momentum=0.9, nesterov=True), value=3.1622776601683795e-05, device='cpu', warm_start=False, verbose=1, validation_every=10, learning_loss=SquareLearningLoss(), enable_logging=False, weight_name=None, learning_penalty=ElasticLearningPenalty(l1=0, l2=0.0001), exc=True)
Let’s see the weights.
state_tensors = train_session.get_state()
And the loss.
print(train_session.train_losses_)
df = DataFrame({'ort losses': train_session.train_losses_,
                'skl losses': nn.loss_curve_})
df.plot(title="Train loss against iterations (Nesterov + penalty)", logy=True)

Out:
[2376.8562, 18.777214, 6.261327, 3.7860122, 2.116756, 1.9259775, 1.2176373, 1.0647117, 0.84909034, 0.87173456, 0.60593724, 0.5714514, 0.5858055, 0.6806195, 0.51168406, 0.4000029, 0.4199983, 0.38385007, 0.32245404, 0.33961573, 0.28147948, 0.22554532, 0.27522147, 0.25831553, 0.28255552, 0.25820896, 0.24750453, 0.21620429, 0.19236444, 0.26671842, 0.2671131, 0.26723152, 0.25716183, 0.26160723, 0.23834701, 0.24007009, 0.1853468, 0.21914482, 0.22316913, 0.24479637, 0.20455456, 0.22006026, 0.23460646, 0.18471348, 0.25287777, 0.22262591, 0.2235146, 0.2196551, 0.21143629, 0.19294795, 0.20023958, 0.20165259, 0.17378624, 0.19911997, 0.21006893, 0.18072456, 0.16020702, 0.18161526, 0.15229107, 0.15785366, 0.14939909, 0.17135319, 0.1426875, 0.15112786, 0.14876057, 0.14915729, 0.13350922, 0.13337027, 0.1207417, 0.115688965, 0.1260909, 0.11945328, 0.121438496, 0.12557794, 0.11982714, 0.1163248, 0.11425857, 0.10942742, 0.106703356, 0.09725315, 0.104726516, 0.10828836, 0.097253434, 0.09400714, 0.10321168, 0.098187916, 0.09755767, 0.10881713, 0.1050384, 0.09307868, 0.09833063, 0.10899615, 0.08859904, 0.097545795, 0.102440655, 0.07907492, 0.08571978, 0.09271612, 0.10559986, 0.09473409]
<AxesSubplot:title={'center':'Train loss against iterations (Nesterov + penalty)'}>
All ONNX graphs
The method save_onnx_graph exports to disk all the ONNX graphs used by the model.
def print_graph(d):
    # walk the (possibly nested) dictionary of file names
    # returned by save_onnx_graph and print every graph
    for k, v in sorted(d.items()):
        if isinstance(v, dict):
            print_graph(v)
        else:
            print("\n++++++", v.replace("\\", "/"), "\n")
            with open(v, "rb") as f:
                print(onnx_simple_text_plot(onnx.load(f)))
all_files = train_session.save_onnx_graph('.')
print_graph(all_files)
# import matplotlib.pyplot as plt
# plt.show()
Out:
++++++ ./SquareLLoss.learning_loss.loss_grad_onnx_.onnx
opset: domain='' version=14
input: name='X1' type=dtype('float32') shape=(0, 0)
input: name='X2' type=dtype('float32') shape=(0, 0)
init: name='Mu_Mulcst' type=dtype('float32') shape=(1,) -- array([0.5], dtype=float32)
init: name='Re_Reshapecst' type=dtype('int64') shape=(1,) -- array([-1])
init: name='Mu_Mulcst1' type=dtype('float32') shape=(1,) -- array([-1.], dtype=float32)
Sub(X1, X2) -> Su_C0
ReduceSumSquare(Su_C0) -> Re_reduced0
Mul(Re_reduced0, Mu_Mulcst) -> Mu_C0
Reshape(Mu_C0, Re_Reshapecst) -> Y
Mul(Su_C0, Mu_Mulcst1) -> Y_grad
output: name='Y' type=dtype('float32') shape=()
output: name='Y_grad' type=dtype('float32') shape=()
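This graph returns the loss and its gradient in a single call. In numpy, it amounts to the following, assuming X1 holds the expected values and X2 the predictions (an interpretation consistent with the sign of the gradient):

import numpy

X1 = numpy.array([1., 2.], dtype=numpy.float32)   # expected
X2 = numpy.array([1.5, 1.], dtype=numpy.float32)  # predicted
Y = 0.5 * ((X1 - X2) ** 2).sum()  # loss
Y_grad = -(X1 - X2)               # gradient of the loss with respect to X2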
++++++ ./SquareLLoss.learning_loss.loss_score_onnx_.onnx
opset: domain='' version=14
input: name='X1' type=dtype('float32') shape=(0, 0)
input: name='X2' type=dtype('float32') shape=(0, 0)
Sub(X1, X2) -> Su_C0
Mul(Su_C0, Su_C0) -> Y
output: name='Y' type=dtype('float32') shape=(0, 1)
++++++ ./ElasticLPenalty.learning_penalty.penalty_grad_onnx_.onnx
opset: domain='' version=14
input: name='X' type=dtype('float32') shape=()
init: name='Mu_Mulcst' type=dtype('float32') shape=(1,) -- array([0.9998], dtype=float32)
init: name='Mu_Mulcst1' type=dtype('float32') shape=(1,) -- array([0.], dtype=float32)
Mul(X, Mu_Mulcst) -> Mu_C0
Sign(X) -> Si_output0
Mul(Si_output0, Mu_Mulcst1) -> Mu_C02
Sub(Mu_C0, Mu_C02) -> Y
output: name='Y' type=dtype('float32') shape=()
++++++ ./ElasticLPenalty.learning_penalty.penalty_onnx_.onnx
opset: domain='' version=14
input: name='loss' type=dtype('float32') shape=()
input: name='W0' type=dtype('float32') shape=()
input: name='W1' type=dtype('float32') shape=()
input: name='W2' type=dtype('float32') shape=()
input: name='W3' type=dtype('float32') shape=()
input: name='W4' type=dtype('float32') shape=()
input: name='W5' type=dtype('float32') shape=()
init: name='Mu_Mulcst' type=dtype('float32') shape=(1,) -- array([0.], dtype=float32)
init: name='Mu_Mulcst1' type=dtype('float32') shape=(1,) -- array([1.e-04], dtype=float32)
init: name='Re_Reshapecst' type=dtype('int64') shape=(1,) -- array([-1])
Abs(W0) -> Ab_Y0
ReduceSum(Ab_Y0) -> Re_reduced0
Mul(Re_reduced0, Mu_Mulcst) -> Mu_C0
Identity(Mu_Mulcst1) -> Mu_Mulcst3
ReduceSumSquare(W0) -> Re_reduced02
Mul(Re_reduced02, Mu_Mulcst1) -> Mu_C02
Add(Mu_C0, Mu_C02) -> Ad_C06
Abs(W1) -> Ab_Y02
ReduceSum(Ab_Y02) -> Re_reduced03
ReduceSumSquare(W1) -> Re_reduced04
Mul(Re_reduced04, Mu_Mulcst3) -> Mu_C04
Identity(Mu_Mulcst) -> Mu_Mulcst2
Mul(Re_reduced03, Mu_Mulcst2) -> Mu_C03
Add(Mu_C03, Mu_C04) -> Ad_C07
Add(Ad_C06, Ad_C07) -> Ad_C05
Abs(W2) -> Ab_Y03
ReduceSum(Ab_Y03) -> Re_reduced05
ReduceSumSquare(W2) -> Re_reduced06
ReduceSumSquare(W5) -> Re_reduced012
Identity(Mu_Mulcst1) -> Mu_Mulcst11
Mul(Re_reduced012, Mu_Mulcst11) -> Mu_C012
Identity(Mu_Mulcst) -> Mu_Mulcst4
Mul(Re_reduced05, Mu_Mulcst4) -> Mu_C05
ReduceSumSquare(W3) -> Re_reduced08
Identity(Mu_Mulcst1) -> Mu_Mulcst5
Mul(Re_reduced06, Mu_Mulcst5) -> Mu_C06
Add(Mu_C05, Mu_C06) -> Ad_C08
Add(Ad_C05, Ad_C08) -> Ad_C04
Abs(W3) -> Ab_Y04
ReduceSum(Ab_Y04) -> Re_reduced07
Identity(Mu_Mulcst) -> Mu_Mulcst6
Mul(Re_reduced07, Mu_Mulcst6) -> Mu_C07
ReduceSumSquare(W4) -> Re_reduced010
Identity(Mu_Mulcst1) -> Mu_Mulcst7
Mul(Re_reduced08, Mu_Mulcst7) -> Mu_C08
Add(Mu_C07, Mu_C08) -> Ad_C09
Add(Ad_C04, Ad_C09) -> Ad_C03
Abs(W4) -> Ab_Y05
ReduceSum(Ab_Y05) -> Re_reduced09
Identity(Mu_Mulcst) -> Mu_Mulcst8
Mul(Re_reduced09, Mu_Mulcst8) -> Mu_C09
Identity(Mu_Mulcst1) -> Mu_Mulcst9
Mul(Re_reduced010, Mu_Mulcst9) -> Mu_C010
Add(Mu_C09, Mu_C010) -> Ad_C010
Add(Ad_C03, Ad_C010) -> Ad_C02
Abs(W5) -> Ab_Y06
ReduceSum(Ab_Y06) -> Re_reduced011
Identity(Mu_Mulcst) -> Mu_Mulcst10
Mul(Re_reduced011, Mu_Mulcst10) -> Mu_C011
Add(Mu_C011, Mu_C012) -> Ad_C011
Add(Ad_C02, Ad_C011) -> Ad_C01
Add(loss, Ad_C01) -> Ad_C0
Reshape(Ad_C0, Re_Reshapecst) -> Y
output: name='Y' type=dtype('float32') shape=(0,)
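Despite its size, this graph only accumulates the elastic penalty of the six weight tensors and adds it to the loss. A compact numpy equivalent (random tensors stand in for the actual weights):

import numpy

l1, l2 = 0.0, 1e-4
ws = [numpy.random.randn(10).astype(numpy.float32) for _ in range(6)]
loss = numpy.float32(0.5)
penalized_loss = loss + sum(
    l1 * numpy.abs(w).sum() + l2 * (w ** 2).sum() for w in ws)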
++++++ ./LRateSGDNesterov.learning_rate.axpyw_onnx_.onnx
opset: domain='' version=14
input: name='X1' type=dtype('float32') shape=()
input: name='X2' type=dtype('float32') shape=()
input: name='G' type=dtype('float32') shape=()
input: name='alpha' type=dtype('float32') shape=(1,)
input: name='beta' type=dtype('float32') shape=(1,)
Mul(X1, alpha) -> Mu_C0
Mul(G, beta) -> Mu_C03
Add(Mu_C0, Mu_C03) -> Z
Mul(Z, beta) -> Mu_C02
Add(Mu_C0, Mu_C02) -> Ad_C0
Add(Ad_C0, X2) -> Y
output: name='Y' type=dtype('float32') shape=()
output: name='Z' type=dtype('float32') shape=()
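This is the Nesterov update expressed with ONNX operators. Transcribed literally into numpy, with X1 the previous velocity, G the gradient and X2 the current weights; how alpha and beta derive from the momentum and the learning rate is internal to LearningRateSGDNesterov, so the values below are illustrative:

import numpy

alpha = numpy.array([0.9], dtype=numpy.float32)   # assumed: momentum
beta = numpy.array([-1e-4], dtype=numpy.float32)  # assumed: -learning rate
X1 = numpy.zeros(5, dtype=numpy.float32)  # previous velocity
X2 = numpy.ones(5, dtype=numpy.float32)   # current weights
G = numpy.ones(5, dtype=numpy.float32)    # gradient

Z = alpha * X1 + beta * G       # output Z: updated velocity
Y = alpha * X1 + beta * Z + X2  # output Y: updated weights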
++++++ ./GradFBOptimizer.model_onnx.onnx
opset: domain='' version=14
input: name='X' type=dtype('float32') shape=(0, 10)
init: name='I0_coefficient' type=dtype('float32') shape=(100,)
init: name='I1_intercepts' type=dtype('float32') shape=(10,)
init: name='I2_coefficient1' type=dtype('float32') shape=(100,)
init: name='I3_intercepts1' type=dtype('float32') shape=(10,)
init: name='I4_coefficient2' type=dtype('float32') shape=(10,)
init: name='I5_intercepts2' type=dtype('float32') shape=(1,) -- array([0.9706432], dtype=float32)
init: name='I6_shape_tensor' type=dtype('int64') shape=(2,) -- array([-1, 1])
Cast(X, to=1) -> r0
MatMul(r0, I0_coefficient) -> r1
Add(r1, I1_intercepts) -> r2
Relu(r2) -> r3
MatMul(r3, I2_coefficient1) -> r4
Add(r4, I3_intercepts1) -> r5
Relu(r5) -> r6
MatMul(r6, I4_coefficient2) -> r7
Add(r7, I5_intercepts2) -> r8
Reshape(r8, I6_shape_tensor) -> variable
output: name='variable' type=dtype('float32') shape=(0, 1)
++++++ ./OrtGradientForwardBackwardFunction_140271354529584.train_function_._optimized_pre_grad_model.onnx
opset: domain='' version=14
opset: domain='com.microsoft.experimental' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='ai.onnx.training' version=1
opset: domain='com.ms.internal.nhwc' version=1
opset: domain='org.pytorch.aten' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='ai.onnx.ml' version=3
opset: domain='com.microsoft' version=1
input: name='X' type=dtype('float32') shape=(0, 10)
input: name='I0_coefficient' type=dtype('float32') shape=(10, 10)
input: name='I1_intercepts' type=dtype('float32') shape=(1, 10)
input: name='I2_coefficient1' type=dtype('float32') shape=(10, 10)
input: name='I3_intercepts1' type=dtype('float32') shape=(1, 10)
input: name='I4_coefficient2' type=dtype('float32') shape=(10, 1)
input: name='I5_intercepts2' type=dtype('float32') shape=(1, 1)
init: name='I6_shape_tensor' type=dtype('int64') shape=(2,) -- array([-1, 1])
MatMul(X, I0_coefficient) -> r1
Add(r1, I1_intercepts) -> r2
Relu(r2) -> r3
MatMul(r3, I2_coefficient1) -> r4
Add(r4, I3_intercepts1) -> r5
Relu(r5) -> r6
MatMul(r6, I4_coefficient2) -> r7
Add(r7, I5_intercepts2) -> r8
Reshape(r8, I6_shape_tensor, allowzero=0) -> variable
output: name='variable' type=dtype('float32') shape=(0, 1)
++++++ ./OrtGradientForwardBackwardFunction_140271354529584.train_function_._trained_onnx.onnx
opset: domain='' version=14
opset: domain='com.microsoft.experimental' version=1
opset: domain='ai.onnx.preview.training' version=1
opset: domain='ai.onnx.training' version=1
opset: domain='com.ms.internal.nhwc' version=1
opset: domain='org.pytorch.aten' version=1
opset: domain='com.microsoft.nchwc' version=1
opset: domain='ai.onnx.ml' version=3
opset: domain='com.microsoft' version=1
input: name='X' type=dtype('float32') shape=(0, 10)
input: name='I0_coefficient' type=dtype('float32') shape=(10, 10)
input: name='I1_intercepts' type=dtype('float32') shape=(1, 10)
input: name='I2_coefficient1' type=dtype('float32') shape=(10, 10)
input: name='I3_intercepts1' type=dtype('float32') shape=(1, 10)
input: name='I4_coefficient2' type=dtype('float32') shape=(10, 1)
input: name='I5_intercepts2' type=dtype('float32') shape=(1, 1)
init: name='I6_shape_tensor' type=dtype('int64') shape=(2,) -- array([-1, 1])
init: name='n1_Grad/A_target_shape' type=dtype('int64') shape=(2,) -- array([-1, 10])
init: name='n1_Grad/dY_target_shape' type=dtype('int64') shape=(2,) -- array([-1, 10])
init: name='n4_Grad/A_target_shape' type=dtype('int64') shape=(2,) -- array([-1, 10])
init: name='n4_Grad/dY_target_shape' type=dtype('int64') shape=(2,) -- array([-1, 10])
init: name='n7_Grad/A_target_shape' type=dtype('int64') shape=(2,) -- array([-1, 10])
init: name='n7_Grad/dY_target_shape' type=dtype('int64') shape=(2,) -- array([-1, 1])
MatMul(X, I0_coefficient) -> r1
Add(r1, I1_intercepts) -> r2
Relu(r2) -> r3
MatMul(r3, I2_coefficient1) -> r4
Add(r4, I3_intercepts1) -> r5
Relu(r5) -> r6
MatMul(r6, I4_coefficient2) -> r7
Add(r7, I5_intercepts2) -> r8
Reshape(r8, I6_shape_tensor, allowzero=0) -> variable
YieldOp(variable) -> variable_grad
Shape(r8) -> n9_Grad/x_shape
Reshape(variable_grad, n9_Grad/x_shape, allowzero=0) -> r8_grad
Shape(I5_intercepts2) -> n8_Grad/Shape_I5_intercepts2
Shape(r7) -> n8_Grad/Shape_r7
BroadcastGradientArgs(n8_Grad/Shape_r7, n8_Grad/Shape_I5_intercepts2) -> n8_Grad/ReduceAxes_r7, n8_Grad/ReduceAxes_I5_intercepts2
ReduceSum(r8_grad, n8_Grad/ReduceAxes_I5_intercepts2, noop_with_empty_axes=1, keepdims=1) -> n8_Grad/ReduceSum_r8_grad_for_I5_intercepts2
Reshape(n8_Grad/ReduceSum_r8_grad_for_I5_intercepts2, n8_Grad/Shape_I5_intercepts2, allowzero=0) -> I5_intercepts2_grad
ReduceSum(r8_grad, n8_Grad/ReduceAxes_r7, noop_with_empty_axes=1, keepdims=1) -> n8_Grad/ReduceSum_r8_grad_for_r7
Reshape(n8_Grad/ReduceSum_r8_grad_for_r7, n8_Grad/Shape_r7, allowzero=0) -> r7_grad
Reshape(r7_grad, n7_Grad/dY_target_shape, allowzero=0) -> n7_Grad/dY_reshape_2d
Reshape(r6, n7_Grad/A_target_shape, allowzero=0) -> n7_Grad/A_reshape_2d
Gemm(n7_Grad/A_reshape_2d, n7_Grad/dY_reshape_2d, beta=1.00, transB=0, transA=1, alpha=1.00) -> I4_coefficient2_grad
FusedMatMul(r7_grad, I4_coefficient2, transB=1, alpha=1.00, transA=0) -> n7_Grad/PreReduceGrad0
Shape(n7_Grad/PreReduceGrad0) -> n7_Grad/Shape_n7_Grad/PreReduceGrad0
Shape(r6) -> n7_Grad/Shape_r6
BroadcastGradientArgs(n7_Grad/Shape_r6, n7_Grad/Shape_n7_Grad/PreReduceGrad0) -> n7_Grad/ReduceAxes_r6_for_r6,
ReduceSum(n7_Grad/PreReduceGrad0, n7_Grad/ReduceAxes_r6_for_r6, noop_with_empty_axes=1, keepdims=1) -> n7_Grad/ReduceSum_n7_Grad/PreReduceGrad0_for_r6
Reshape(n7_Grad/ReduceSum_n7_Grad/PreReduceGrad0_for_r6, n7_Grad/Shape_r6, allowzero=0) -> r6_grad
ReluGrad(r6_grad, r6) -> r5_grad
Shape(I3_intercepts1) -> n5_Grad/Shape_I3_intercepts1
Shape(r4) -> n5_Grad/Shape_r4
BroadcastGradientArgs(n5_Grad/Shape_r4, n5_Grad/Shape_I3_intercepts1) -> n5_Grad/ReduceAxes_r4, n5_Grad/ReduceAxes_I3_intercepts1
ReduceSum(r5_grad, n5_Grad/ReduceAxes_I3_intercepts1, noop_with_empty_axes=1, keepdims=1) -> n5_Grad/ReduceSum_r5_grad_for_I3_intercepts1
Reshape(n5_Grad/ReduceSum_r5_grad_for_I3_intercepts1, n5_Grad/Shape_I3_intercepts1, allowzero=0) -> I3_intercepts1_grad
ReduceSum(r5_grad, n5_Grad/ReduceAxes_r4, noop_with_empty_axes=1, keepdims=1) -> n5_Grad/ReduceSum_r5_grad_for_r4
Reshape(n5_Grad/ReduceSum_r5_grad_for_r4, n5_Grad/Shape_r4, allowzero=0) -> r4_grad
Reshape(r4_grad, n4_Grad/dY_target_shape, allowzero=0) -> n4_Grad/dY_reshape_2d
Reshape(r3, n4_Grad/A_target_shape, allowzero=0) -> n4_Grad/A_reshape_2d
Gemm(n4_Grad/A_reshape_2d, n4_Grad/dY_reshape_2d, beta=1.00, transB=0, transA=1, alpha=1.00) -> I2_coefficient1_grad
FusedMatMul(r4_grad, I2_coefficient1, transB=1, alpha=1.00, transA=0) -> n4_Grad/PreReduceGrad0
Shape(n4_Grad/PreReduceGrad0) -> n4_Grad/Shape_n4_Grad/PreReduceGrad0
Shape(r3) -> n4_Grad/Shape_r3
BroadcastGradientArgs(n4_Grad/Shape_r3, n4_Grad/Shape_n4_Grad/PreReduceGrad0) -> n4_Grad/ReduceAxes_r3_for_r3,
ReduceSum(n4_Grad/PreReduceGrad0, n4_Grad/ReduceAxes_r3_for_r3, noop_with_empty_axes=1, keepdims=1) -> n4_Grad/ReduceSum_n4_Grad/PreReduceGrad0_for_r3
Reshape(n4_Grad/ReduceSum_n4_Grad/PreReduceGrad0_for_r3, n4_Grad/Shape_r3, allowzero=0) -> r3_grad
ReluGrad(r3_grad, r3) -> r2_grad
Shape(I1_intercepts) -> n2_Grad/Shape_I1_intercepts
Shape(r1) -> n2_Grad/Shape_r1
BroadcastGradientArgs(n2_Grad/Shape_r1, n2_Grad/Shape_I1_intercepts) -> n2_Grad/ReduceAxes_r1, n2_Grad/ReduceAxes_I1_intercepts
ReduceSum(r2_grad, n2_Grad/ReduceAxes_I1_intercepts, noop_with_empty_axes=1, keepdims=1) -> n2_Grad/ReduceSum_r2_grad_for_I1_intercepts
Reshape(n2_Grad/ReduceSum_r2_grad_for_I1_intercepts, n2_Grad/Shape_I1_intercepts, allowzero=0) -> I1_intercepts_grad
ReduceSum(r2_grad, n2_Grad/ReduceAxes_r1, noop_with_empty_axes=1, keepdims=1) -> n2_Grad/ReduceSum_r2_grad_for_r1
Reshape(n2_Grad/ReduceSum_r2_grad_for_r1, n2_Grad/Shape_r1, allowzero=0) -> r1_grad
Reshape(r1_grad, n1_Grad/dY_target_shape, allowzero=0) -> n1_Grad/dY_reshape_2d
Reshape(X, n1_Grad/A_target_shape, allowzero=0) -> n1_Grad/A_reshape_2d
Gemm(n1_Grad/A_reshape_2d, n1_Grad/dY_reshape_2d, beta=1.00, transB=0, transA=1, alpha=1.00) -> I0_coefficient_grad
FusedMatMul(r1_grad, I0_coefficient, transB=1, alpha=1.00, transA=0) -> n1_Grad/PreReduceGrad0
Shape(n1_Grad/PreReduceGrad0) -> n1_Grad/Shape_n1_Grad/PreReduceGrad0
Shape(X) -> n1_Grad/Shape_X
BroadcastGradientArgs(n1_Grad/Shape_X, n1_Grad/Shape_n1_Grad/PreReduceGrad0) -> n1_Grad/ReduceAxes_X_for_X,
ReduceSum(n1_Grad/PreReduceGrad0, n1_Grad/ReduceAxes_X_for_X, noop_with_empty_axes=1, keepdims=1) -> n1_Grad/ReduceSum_n1_Grad/PreReduceGrad0_for_X
Reshape(n1_Grad/ReduceSum_n1_Grad/PreReduceGrad0_for_X, n1_Grad/Shape_X, allowzero=0) -> X_grad
output: name='X_grad' type=dtype('float32') shape=(0, 10)
output: name='I0_coefficient_grad' type=dtype('float32') shape=(10, 10)
output: name='I1_intercepts_grad' type=dtype('float32') shape=(1, 10)
output: name='I2_coefficient1_grad' type=dtype('float32') shape=(10, 10)
output: name='I3_intercepts1_grad' type=dtype('float32') shape=(1, 10)
output: name='I4_coefficient2_grad' type=dtype('float32') shape=(10, 1)
output: name='I5_intercepts2_grad' type=dtype('float32') shape=(1, 1)
++++++ ./GradFBOptimizer.zero_onnx_.onnx
opset: domain='' version=14
input: name='X' type=dtype('float32') shape=()
init: name='Mu_Mulcst' type=dtype('float32') shape=(1,) -- array([0.], dtype=float32)
Mul(X, Mu_Mulcst) -> Y
output: name='Y' type=dtype('float32') shape=()
Total running time of the script: (1 minute 28.352 seconds)