Coverage for mlprodict/cli/convert_validate.py: 100%

115 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1""" 

2@file 

3@brief Command line about validation of prediction runtime. 

4""" 

5import os 

6import pickle 

7from logging import getLogger 

8import warnings 

9from pandas import read_csv 

10from ..onnx_conv import to_onnx 

11from ..onnxrt import OnnxInference 

12from ..onnx_tools.optim import onnx_optimisations 

13from ..onnxrt.validate.validate_difference import measure_relative_difference 

14from ..onnx_conv import guess_schema_from_data, guess_schema_from_model 

15 

16 

def convert_validate(pkl, data=None, schema=None,
                     method="predict", name='Y',
                     target_opset=None,
                     outonnx="model.onnx",
                     runtime='python', metric="l1med",
                     use_double=None, noshape=False,
                     optim='onnx', rewrite_ops=True,
                     options=None, fLOG=print, verbose=1,
                     register=True):
    """
    Converts a model stored in *pkl* file and measure the differences
    between the model and the ONNX predictions.

    :param pkl: pickle file
    :param data: data file, loaded with pandas,
        converted to a single array, the data is used to guess
        the schema if *schema* not specified, if the file is
        not specified or does not exist, the model is only converted
        and no prediction is compared
    :param schema: initial type of the model
    :param method: method to call, several methods can be
        specified as a comma separated list
    :param name: output name, several names can be specified as
        a comma separated list, there must be as many names as methods
    :param target_opset: target opset
    :param outonnx: produced ONNX model
    :param runtime: runtime to use to compute predictions,
        'python', 'python_compiled',
        'onnxruntime1' or 'onnxruntime2'
    :param metric: the metric 'l1med' is given by function
        :func:`measure_relative_difference
        <mlprodict.onnxrt.validate.validate_difference.measure_relative_difference>`
    :param noshape: run the conversion with no shape information
    :param use_double: use double for the runtime if possible,
        two possible options, ``"float64"`` or ``'switch'``,
        the first option produces an ONNX file with doubles,
        the second option loads an ONNX file (float or double)
        and replaces matrices in ONNX with the matrices coming from
        the model, this second way is just for testing purposes
    :param optim: applies optimisations on the first ONNX graph,
        use 'onnx' to reduce the number of node Identity and
        redundant subgraphs
    :param rewrite_ops: rewrites some converters from :epkg:`sklearn-onnx`
    :param options: additional options for conversion,
        dictionary as a string
    :param verbose: verbose level
    :param register: registers additional converters implemented by this package
    :param fLOG: logging function
    :return: a dictionary with the results, key ``'onnx'`` holds the
        serialized ONNX model, if data was loaded, keys ``'skl_pred'``,
        ``'ort_pred'`` and ``'metrics'`` hold one item per method
        (model predictions, ONNX predictions, measured difference)
    :raises ValueError: if *use_double* or *metric* has an unexpected
        value or if the number of methods and names do not match
    :raises FileNotFoundError: if *pkl* does not exist
    :raises KeyError: if one name in *name* is not an output of
        the ONNX model

    .. cmdref::
        :title: Converts and compares an ONNX file
        :cmd: -m mlprodict convert_validate --help
        :lid: l-cmd-convert_validate

        The command converts and validates a :epkg:`scikit-learn` model.
        An example to check the prediction of a logistic regression.

        ::

            import os
            import pickle
            import pandas
            from sklearn.datasets import load_iris
            from sklearn.model_selection import train_test_split
            from sklearn.linear_model import LogisticRegression
            from mlprodict.__main__ import main
            from mlprodict.cli import convert_validate

            iris = load_iris()
            X, y = iris.data, iris.target
            X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
            clr = LogisticRegression()
            clr.fit(X_train, y_train)

            pandas.DataFrame(X_test).to_csv("data.csv", index=False)
            with open("model.pkl", "wb") as f:
                pickle.dump(clr, f)

        And the command line to check the predictions
        using a command line.

        ::

            convert_validate --pkl model.pkl --data data.csv
                             --method predict,predict_proba
                             --name output_label,output_probability
                             --verbose 1
    """
    from skl2onnx.common.data_types import FloatTensorType, DoubleTensorType  # delayed
    if fLOG is None:
        # nothing can be logged without a logging function
        verbose = 0  # pragma: no cover
    if use_double not in (None, 'float64', 'switch'):
        raise ValueError(  # pragma: no cover
            "use_double must be either None, 'float64' or 'switch'")
    # empty strings come from the command line and mean "not specified"
    if optim == '':
        optim = None  # pragma: no cover
    if target_opset == '':
        target_opset = None  # pragma: no cover
    if verbose == 0:
        logger = getLogger('skl2onnx')
        logger.disabled = True
    if not os.path.exists(pkl):
        raise FileNotFoundError(  # pragma: no cover
            f"Unable to find model '{pkl}'.")
    if os.path.exists(outonnx):
        warnings.warn(f"File '{outonnx}' will be overwritten.")
    if verbose > 0:
        fLOG(f"[convert_validate] load model '{pkl}'")
    # SECURITY: unpickling executes arbitrary code, *pkl* must come
    # from a trusted source.
    with open(pkl, "rb") as f:
        model = pickle.load(f)

    if use_double == 'float64':
        tensor_type = DoubleTensorType
    else:
        tensor_type = FloatTensorType
    if options in (None, ''):
        options = None
    else:
        from ..onnxrt.validate.validate_scenarios import (
            interpret_options_from_string)
        options = interpret_options_from_string(options)
        if verbose > 0:
            fLOG(f"[convert_validate] options={repr(options)}")

    if register:
        from ..onnx_conv import (
            register_converters, register_rewritten_operators,
            register_new_operators)
        register_converters()
        register_rewritten_operators()
        register_new_operators()

    # data and schema
    # A missing data file is handled like no data at all: the schema is
    # guessed from the model and no prediction is compared.
    if data is None or not os.path.exists(data):
        if schema is None:
            schema = guess_schema_from_model(model, tensor_type)
        if verbose > 0:
            fLOG(f"[convert_validate] model schema={schema}")
        df = None
    else:
        if verbose > 0:
            fLOG(f"[convert_validate] load data '{data}'")
        df = read_csv(data)
        if verbose > 0:
            fLOG("[convert_validate] convert data into matrix")
        if schema is None:
            schema = guess_schema_from_data(df, tensor_type)
        if schema is None:
            schema = [  # pragma: no cover
                ('X', tensor_type([None, df.shape[1]]))]
        if len(schema) == 1:
            # a single input: the dataframe is replaced by its array
            df = df.values  # pylint: disable=E1101
        if verbose > 0:
            fLOG(f"[convert_validate] data schema={schema}")

    if noshape:
        if verbose > 0:
            fLOG(  # pragma: no cover
                "[convert_validate] convert the model with no shape information")
        # same input names and tensor classes, every dimension unknown
        schema = [(input_name, col.__class__([None, None]))
                  for input_name, col in schema]
        onx = to_onnx(
            model, initial_types=schema, rewrite_ops=rewrite_ops,
            target_opset=target_opset, options=options)
    else:
        if verbose > 0:
            fLOG("[convert_validate] convert the model with shapes")
        onx = to_onnx(
            model, initial_types=schema, target_opset=target_opset,
            rewrite_ops=rewrite_ops, options=options)

    if optim is not None:
        if verbose > 0:
            fLOG(f"[convert_validate] run optimisations '{optim}'")
        onx = onnx_optimisations(onx, optim=optim)
    if verbose > 0:
        fLOG(f"[convert_validate] saves to '{outonnx}'")
    memory = onx.SerializeToString()
    with open(outonnx, 'wb') as f:
        f.write(memory)

    if verbose > 0:
        fLOG("[convert_validate] creates OnnxInference session")
    sess = OnnxInference(
        onx, runtime=runtime, runtime_options=dict(
            log_severity_level=3))
    if use_double == "switch":
        if verbose > 0:
            fLOG("[convert_validate] switch to double")
        sess.switch_initializers_dtype(model)

    if verbose > 0:
        fLOG("[convert_validate] compute prediction from model")

    # str.split returns a one-element list when there is no comma
    methods = method.split(',')
    names = name.split(',')

    if len(names) != len(methods):
        raise ValueError(
            f"Number of methods and outputs do not match: {names}, {methods}")

    if metric != 'l1med':
        raise ValueError(  # pragma: no cover
            f"Unknown metric '{metric}'")

    if df is None:
        # no test on data
        return dict(onnx=memory)

    if verbose > 0:
        fLOG(
            f"[convert_validate] compute predictions from ONNX with name '{name}'")

    # NOTE(review): the ONNX input name is hard-coded to 'X' whatever the
    # schema says -- presumably only single-input models named 'X' are
    # supported here, to be confirmed.
    ort_preds = sess.run(
        {'X': df}, verbose=max(verbose - 1, 0), fLOG=fLOG)

    metrics = []
    out_skl_preds = []
    out_ort_preds = []
    for method_, name_ in zip(methods, names):
        if verbose > 0:
            fLOG(
                f"[convert_validate] compute predictions with method '{method_}'")
        meth = getattr(model, method_)
        skl_pred = meth(df)
        # stores the model predictions (not the input data)
        out_skl_preds.append(skl_pred)

        if name_ not in ort_preds:
            raise KeyError(
                f"Unable to find output name '{name_}' in {list(sorted(ort_preds))}")

        ort_pred = ort_preds[name_]
        out_ort_preds.append(ort_pred)
        diff = measure_relative_difference(skl_pred, ort_pred)
        if verbose > 0:
            fLOG(f"[convert_validate] {metric}={diff}")
        metrics.append(diff)

    return dict(skl_pred=out_skl_preds, ort_pred=out_ort_preds,
                metrics=metrics, onnx=memory)