Coverage for mlprodict/onnx_tools/optim/sklearn_helper.py: 94%
129 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
1"""
2@file
3@brief Helpers to manipulate :epkg:`scikit-learn` models.
4"""
5import inspect
6import multiprocessing
7import numpy
8from sklearn.base import (
9 TransformerMixin, ClassifierMixin, RegressorMixin, BaseEstimator)
10from sklearn.pipeline import Pipeline, FeatureUnion
11from sklearn.compose import ColumnTransformer
def enumerate_pipeline_models(pipe, coor=None, vs=None):
    """
    Enumerates all the models within a pipeline.

    @param      pipe    *scikit-learn* pipeline
    @param      coor    current coordinate
    @param      vs      subset of variables for the model, None for all
    @return             iterator on models ``tuple(coordinate, model, variables)``

    The object itself is yielded first, then every nested model is
    visited recursively, each one tagged with its position in the
    pipeline (a tuple of indices).
    """
    if coor is None:
        coor = (0,)
    yield coor, pipe, vs
    if hasattr(pipe, 'transformer_and_mapper_list') and len(pipe.transformer_and_mapper_list):
        # azureml DataTransformer
        raise NotImplementedError(  # pragma: no cover
            "Unable to handle this specific case.")
    elif hasattr(pipe, 'mapper') and pipe.mapper:
        # azureml DataTransformer
        yield from enumerate_pipeline_models(  # pragma: no cover
            pipe.mapper, coor + (0,))
    elif hasattr(pipe, 'built_features'):  # pragma: no cover
        # sklearn_pandas.dataframe_mapper.DataFrameMapper
        for idx, (columns, transformers, _) in enumerate(pipe.built_features):
            if isinstance(columns, str):
                columns = (columns,)
            if transformers is None:
                yield (coor + (idx,)), None, columns
            else:
                yield from enumerate_pipeline_models(
                    transformers, coor + (idx,), columns)
    elif isinstance(pipe, Pipeline):
        for idx, (_, model) in enumerate(pipe.steps):
            yield from enumerate_pipeline_models(model, coor + (idx,))
    elif isinstance(pipe, ColumnTransformer):
        for idx, (_, fitted_transformer, column) in enumerate(pipe.transformers):
            yield from enumerate_pipeline_models(
                fitted_transformer, coor + (idx,), column)
    elif isinstance(pipe, FeatureUnion):
        for idx, (_, model) in enumerate(pipe.transformer_list):
            yield from enumerate_pipeline_models(model, coor + (idx,))
    elif isinstance(pipe, (TransformerMixin, ClassifierMixin, RegressorMixin)):
        # standalone transformer/classifier/regressor: a leaf
        pass
    elif isinstance(pipe, BaseEstimator):
        # any other estimator: a leaf
        pass
    elif isinstance(pipe, (list, numpy.ndarray)):
        for idx, sub_model in enumerate(pipe):
            yield from enumerate_pipeline_models(sub_model, coor + (idx,))
    else:
        raise TypeError(  # pragma: no cover
            f"pipe is not a scikit-learn object: {type(pipe)}\n{pipe}")
def enumerate_fitted_arrays(model):
    """
    Enumerate all fitted arrays included in a
    :epkg:`scikit-learn` object.

    @param      model   :epkg:`scikit-learn` object
    @return             enumerator

    Every model found by @see fn enumerate_pipeline_models is walked
    recursively and each :epkg:`numpy:ndarray` reachable through lists,
    tuples, dictionaries or instance attributes is reported together
    with the chain of containers leading to it.
    """
    def _walk(obj):
        # Recursively yields tuples describing every numpy array
        # reachable from *obj*, keeping track of the containers
        # traversed on the way.
        if isinstance(obj, (tuple, list)):
            for item in obj:
                for found in _walk(item):
                    yield (obj, item, found)
        elif isinstance(obj, dict):
            for key, value in obj.items():
                for found in _walk(value):
                    yield (obj, key, value, found)
        elif hasattr(obj, '__dict__'):
            for key, value in obj.__dict__.items():
                # keep only fitted attributes (trailing '_')
                # or private ones (leading '_')
                if key[-1] != '_' and key[0] != '_':
                    continue
                if isinstance(value, numpy.ndarray):
                    yield (obj, key, value)
                else:
                    yield from _walk(value)

    for row in enumerate_pipeline_models(model):
        coord = row[:-1]
        sub = row[1]
        last = row[2:]
        for sub_row in _walk(sub):
            yield coord + (sub, sub_row) + last
def pairwise_array_distances(l1, l2, metric='l1med'):
    """
    Computes pairwise distances between two lists of arrays
    *l1* and *l2*. The distance is 1e9 if shapes are not equal.

    @param      l1      first list of arrays
    @param      l2      second list of arrays
    @param      metric  metric to use, `'l1med'` computes
                        the average absolute error divided
                        by the absolute median; it is currently
                        the only implemented metric and the
                        parameter is kept for API compatibility
    @return             matrix of shape ``(len(l1), len(l2))``
    """
    dist = numpy.full((len(l1), len(l2)), 1e9)
    for i, a1 in enumerate(l1):
        if not isinstance(a1, numpy.ndarray):
            continue  # pragma: no cover
        # median of |a1| is loop-invariant for the inner loop, hoist it;
        # a zero median is replaced by 1 to avoid a division by zero
        med = numpy.median(numpy.abs(a1))
        if med == 0:
            med = 1
        for j, a2 in enumerate(l2):
            if not isinstance(a2, numpy.ndarray):
                continue  # pragma: no cover
            if a1.shape != a2.shape:
                continue
            # BUG FIX: the previous code computed ``diff / diff.size``
            # where ``diff`` was a scalar (``diff.size`` is always 1),
            # so the promised *average* was never taken; divide by the
            # number of elements of the compared arrays instead.
            dist[i, j] = numpy.sum(numpy.abs(a1 - a2)) / med / a1.size
    return dist
def max_depth(estimator):
    """
    Retrieves the max depth assuming the estimator
    is a decision tree.
    """
    tree = estimator.tree_
    left = tree.children_left
    right = tree.children_right
    # iterative DFS keeping a running maximum; the root has depth 0
    deepest = 0
    stack = [(0, 0)]  # (node id, node depth)
    while stack:
        node_id, depth = stack.pop()
        if depth > deepest:
            deepest = depth
        # internal node: identical child ids mean a leaf
        if left[node_id] != right[node_id]:
            stack.append((left[node_id], depth + 1))
            stack.append((right[node_id], depth + 1))
    return deepest
def inspect_sklearn_model(model, recursive=True):
    """
    Inspects a :epkg:`scikit-learn` model and produces
    some figures which tries to represent the complexity of it.

    @param      model       model
    @param      recursive   recursive look
    @return                 dictionary

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        import pprint
        from sklearn.linear_model import LogisticRegression
        from sklearn.datasets import load_iris
        from mlprodict.onnx_tools.optim.sklearn_helper import inspect_sklearn_model

        iris = load_iris()
        lr = LogisticRegression()
        lr.fit(iris.data, iris.target)
        pprint.pprint((lr, inspect_sklearn_model(lr)))
    """
    def update(sts, st):
        # Merges statistics *st* into accumulator *sts*:
        # 'max_*' entries keep the maximum, the others are summed.
        for k, v in st.items():
            if k in sts:
                if 'max_' in k:
                    sts[k] = max(v, sts[k])
                else:
                    sts[k] += v
            else:
                sts[k] = v

    def insmodel(m):
        # Statistics for a single estimator.
        st = {'nop': 1}
        if hasattr(m, 'tree_') and hasattr(m.tree_, 'node_count'):
            st['nnodes'] = m.tree_.node_count
            st['ntrees'] = 1
            st['max_depth'] = max_depth(m)
        try:
            if hasattr(m, 'coef_'):
                st['ncoef'] = len(m.coef_)
                st['nlin'] = 1
        except KeyError:  # pragma: no cover
            # added to deal with xgboost 1.0 (KeyError: 'weight')
            pass
        if hasattr(m, 'estimators_'):
            # ensembles: aggregate the statistics of every sub-estimator
            for est in m.estimators_:
                st_ = inspect_sklearn_model(est, recursive=recursive)
                update(st, st_)
        return st

    if recursive:
        sts = {}
        for __, m, _ in enumerate_pipeline_models(model):
            # BUG FIX: the previous implementation merged ``insmodel(m)``
            # twice per model (once directly, once through the
            # non-recursive call, which computes the same dictionary)
            # and then returned the statistics of the *last* model only
            # instead of the accumulated dictionary.
            update(sts, insmodel(m))
        return sts
    return insmodel(model)
def set_n_jobs(model, params, n_jobs=None):
    """
    Looks into model signature and add parameter *n_jobs*
    if available. The function does not overwrite the parameter.

    @param      model   model class
    @param      params  current set of parameters
    @param      n_jobs  number of CPU or *n_jobs* if specified or 0
    @return             new set of parameters

    When *n_jobs* is left unspecified (or 0), the number of processors
    returned by :epkg:`multiprocessing` is used instead.
    """
    # caller already chose a value: keep it untouched
    if params is not None and 'n_jobs' in params:
        return params
    # the model constructor does not accept n_jobs: nothing to do
    signature = inspect.signature(model.__init__)
    if 'n_jobs' not in signature.parameters:
        return params
    effective = None if n_jobs == 0 else n_jobs
    new_params = params.copy() if params else {}
    new_params['n_jobs'] = effective or multiprocessing.cpu_count()
    return new_params