Coverage for mlprodict/onnx_tools/optim/sklearn_helper.py: 94%

129 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1""" 

2@file 

3@brief Helpers to manipulate :epkg:`scikit-learn` models. 

4""" 

5import inspect 

6import multiprocessing 

7import numpy 

8from sklearn.base import ( 

9 TransformerMixin, ClassifierMixin, RegressorMixin, BaseEstimator) 

10from sklearn.pipeline import Pipeline, FeatureUnion 

11from sklearn.compose import ColumnTransformer 

12 

13 

def enumerate_pipeline_models(pipe, coor=None, vs=None):
    """
    Enumerates all the models within a pipeline.

    @param      pipe    *scikit-learn* pipeline
    @param      coor    current coordinate
    @param      vs      subset of variables for the model, None for all
    @return             iterator on models ``tuple(coordinate, model)``

    Example:

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        from sklearn.datasets import load_iris
        from sklearn.decomposition import PCA
        from sklearn.linear_model import LogisticRegression
        from sklearn.pipeline import make_pipeline
        from sklearn.model_selection import train_test_split
        from mlprodict.onnx_tools.optim.sklearn_helper import enumerate_pipeline_models

        iris = load_iris()
        X, y = iris.data, iris.target
        X_train, __, y_train, _ = train_test_split(X, y, random_state=11)
        clr = make_pipeline(PCA(n_components=2),
                            LogisticRegression(solver="liblinear"))
        clr.fit(X_train, y_train)

        for a in enumerate_pipeline_models(clr):
            print(a)
    """
    if coor is None:
        coor = (0,)
    # the current object is yielded first, then its children
    yield coor, pipe, vs
    if hasattr(pipe, 'transformer_and_mapper_list') and len(
            pipe.transformer_and_mapper_list):
        # azureml DataTransformer
        raise NotImplementedError(  # pragma: no cover
            "Unable to handle this specific case.")
    elif hasattr(pipe, 'mapper') and pipe.mapper:
        # azureml DataTransformer
        yield from enumerate_pipeline_models(  # pragma: no cover
            pipe.mapper, coor + (0,))
    elif hasattr(pipe, 'built_features'):  # pragma: no cover
        # sklearn_pandas.dataframe_mapper.DataFrameMapper
        for pos, (columns, transformers, _) in enumerate(pipe.built_features):
            if isinstance(columns, str):
                columns = (columns,)
            if transformers is None:
                yield (coor + (pos,)), None, columns
            else:
                yield from enumerate_pipeline_models(
                    transformers, coor + (pos,), columns)
    elif isinstance(pipe, Pipeline):
        for pos, (_, sub_model) in enumerate(pipe.steps):
            yield from enumerate_pipeline_models(sub_model, coor + (pos,))
    elif isinstance(pipe, ColumnTransformer):
        for pos, (_, fitted_transformer, column) in enumerate(pipe.transformers):
            yield from enumerate_pipeline_models(
                fitted_transformer, coor + (pos,), column)
    elif isinstance(pipe, FeatureUnion):
        for pos, (_, sub_model) in enumerate(pipe.transformer_list):
            yield from enumerate_pipeline_models(sub_model, coor + (pos,))
    elif isinstance(pipe, (TransformerMixin, ClassifierMixin, RegressorMixin)):
        # a leaf model: nothing below it
        pass
    elif isinstance(pipe, BaseEstimator):
        # a leaf estimator: nothing below it
        pass
    elif isinstance(pipe, (list, numpy.ndarray)):
        for pos, sub_model in enumerate(pipe):
            yield from enumerate_pipeline_models(sub_model, coor + (pos,))
    else:
        raise TypeError(  # pragma: no cover
            f"pipe is not a scikit-learn object: {type(pipe)}\n{pipe}")

93 

94 

def enumerate_fitted_arrays(model):
    """
    Enumerates all fitted arrays included in a
    :epkg:`scikit-learn` object.

    @param      model   :epkg:`scikit-learn` object
    @return             enumerator

    One example:

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        from sklearn.datasets import load_iris
        from sklearn.decomposition import PCA
        from sklearn.linear_model import LogisticRegression
        from sklearn.pipeline import make_pipeline
        from sklearn.model_selection import train_test_split
        from mlprodict.onnx_tools.optim.sklearn_helper import enumerate_fitted_arrays

        iris = load_iris()
        X, y = iris.data, iris.target
        X_train, __, y_train, _ = train_test_split(X, y, random_state=11)
        clr = make_pipeline(PCA(n_components=2),
                            LogisticRegression(solver="liblinear"))
        clr.fit(X_train, y_train)

        for a in enumerate_fitted_arrays(clr):
            print(a)
    """
    def _walk(obj):
        # Recursively looks for numpy arrays inside containers and
        # inside object attributes starting or ending with '_'
        # (scikit-learn convention for fitted/private attributes).
        if isinstance(obj, (tuple, list)):
            for item in obj:
                for found in _walk(item):
                    yield (obj, item, found)
        elif isinstance(obj, dict):
            for key, value in obj.items():
                for found in _walk(value):
                    yield (obj, key, value, found)
        elif hasattr(obj, '__dict__'):
            for key, value in obj.__dict__.items():
                if key[-1] != '_' and key[0] != '_':
                    # regular attribute, not a fitted one: skipped
                    continue
                if isinstance(value, numpy.ndarray):
                    yield (obj, key, value)
                else:
                    yield from _walk(value)

    for row in enumerate_pipeline_models(model):
        coord = row[:-1]
        sub = row[1]
        last = row[2:]
        for sub_row in _walk(sub):
            yield coord + (sub, sub_row) + last

151 

152 

def pairwise_array_distances(l1, l2, metric='l1med'):
    """
    Computes pairwise distances between two lists of arrays
    *l1* and *l2*. The distance is 1e9 if shapes are not equal.

    @param      l1      first list of arrays
    @param      l2      second list of arrays
    @param      metric  metric to use, `'l1med'` computes
                        the average absolute error divided
                        by the absolute median
    @return             matrix
    """
    dist = numpy.full((len(l1), len(l2)), 1e9)
    for i, a1 in enumerate(l1):
        if not isinstance(a1, numpy.ndarray):
            continue  # pragma: no cover
        for j, a2 in enumerate(l2):
            if not isinstance(a2, numpy.ndarray):
                continue  # pragma: no cover
            if a1.shape != a2.shape:
                # incomparable shapes keep the default 1e9 distance
                continue
            med = numpy.median(numpy.abs(a1))
            if med == 0:
                # avoids a division by zero for all-zero arrays
                med = 1
            diff = numpy.sum(numpy.abs(a1 - a2)) / med
            # BUG FIX: the previous code divided by ``diff.size`` which is
            # always 1 since ``diff`` is a scalar; dividing by the number
            # of elements gives the average promised by the docstring.
            dist[i, j] = diff / a1.size
    return dist

180 

181 

def max_depth(estimator):
    """
    Retrieves the max depth assuming the estimator
    is a decision tree.
    """
    tree = estimator.tree_
    left = tree.children_left
    right = tree.children_right

    node_depth = numpy.zeros(shape=tree.node_count, dtype=numpy.int64)
    is_leaves = numpy.zeros(shape=tree.node_count, dtype=bool)
    # depth-first traversal: stack holds (node id, depth of its parent),
    # seeded with the root node (id 0) whose parent depth is -1
    pending = [(0, -1)]
    while pending:
        node_id, parent_depth = pending.pop()
        node_depth[node_id] = parent_depth + 1
        if left[node_id] == right[node_id]:
            # both children identical (usually -1): this is a leaf
            is_leaves[node_id] = True
        else:
            pending.append((left[node_id], parent_depth + 1))
            pending.append((right[node_id], parent_depth + 1))
    return max(node_depth)

205 

206 

def inspect_sklearn_model(model, recursive=True):
    """
    Inspects a :epkg:`scikit-learn` model and produces
    some figures which tries to represent the complexity of it.

    @param      model       model
    @param      recursive   recursive look
    @return                 dictionary

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        import pprint
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.datasets import load_iris
        from mlprodict.onnx_tools.optim.sklearn_helper import inspect_sklearn_model

        iris = load_iris()
        X = iris.data
        y = iris.target
        lr = LogisticRegression()
        lr.fit(X, y)
        pprint.pprint((lr, inspect_sklearn_model(lr)))


        iris = load_iris()
        X = iris.data
        y = iris.target
        rf = RandomForestClassifier()
        rf.fit(X, y)
        pprint.pprint((rf, inspect_sklearn_model(rf)))
    """
    def update(sts, st):
        # Merges statistics *st* into accumulator *sts*:
        # 'max_*' entries keep the maximum, other entries are summed.
        for k, v in st.items():
            if k in sts:
                if 'max_' in k:
                    sts[k] = max(v, sts[k])
                else:
                    sts[k] += v
            else:
                sts[k] = v

    def insmodel(m):
        # Statistics for one single estimator.
        st = {'nop': 1}
        if hasattr(m, 'tree_') and hasattr(m.tree_, 'node_count'):
            st['nnodes'] = m.tree_.node_count
            st['ntrees'] = 1
            st['max_depth'] = max_depth(m)
        try:
            if hasattr(m, 'coef_'):
                st['ncoef'] = len(m.coef_)
                st['nlin'] = 1
        except KeyError:  # pragma: no cover
            # added to deal with xgboost 1.0 (KeyError: 'weight')
            pass
        if hasattr(m, 'estimators_'):
            for est in m.estimators_:
                st_ = inspect_sklearn_model(est, recursive=recursive)
                update(st, st_)
        return st

    if recursive:
        sts = {}
        for __, m, _ in enumerate_pipeline_models(model):
            # BUG FIX: the previous code updated *sts* twice per model
            # (``inspect_sklearn_model(m, recursive=False)`` is exactly
            # ``insmodel(m)``) and then returned the statistics of the
            # *last* enumerated model only.  Aggregating once per model
            # and returning *sts* keeps the result unchanged for a
            # single model and returns the whole aggregate for pipelines.
            update(sts, insmodel(m))
        return sts
    return insmodel(model)

279 

280 

def set_n_jobs(model, params, n_jobs=None):
    """
    Looks into model signature and add parameter *n_jobs*
    if available. The function does not overwrite the parameter.

    @param      model       model class
    @param      params      current set of parameters
    @param      n_jobs      number of CPU or *n_jobs* if specified or 0
    @return                 new set of parameters

    On this machine, the default value is the following.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        import multiprocessing
        print(multiprocessing.cpu_count())
    """
    if params is not None and 'n_jobs' in params:
        # the caller already chose a value: do not overwrite it
        return params
    if 'n_jobs' not in inspect.signature(model.__init__).parameters:
        # the constructor does not accept n_jobs
        return params
    if n_jobs == 0:
        n_jobs = None
    new_params = dict(params) if params else {}
    new_params['n_jobs'] = n_jobs or multiprocessing.cpu_count()
    return new_params