Pipelines can be big with scikit-learn, so let's dig into a visual way to look at them.
# Insert a clickable table of contents at the top of the notebook.
from jyquickhelper import add_notebook_menu
add_notebook_menu()
# IPython magic (notebook-only syntax): render matplotlib figures inline.
%matplotlib inline
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.simplefilter("ignore")
Let's visualize a simple pipeline, a single model not even trained.
import pandas
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

# Load the iris dataset, keep its four numeric features and name them
# X1..X4 so the graph below shows readable column labels.
iris = datasets.load_iris()
X = iris.data[:, :4]
df = pandas.DataFrame(X, columns=["X1", "X2", "X3", "X4"])

# An untrained classifier is enough: we only look at the structure.
clf = LogisticRegression()
clf
LogisticRegression()
The trick consists in converting the pipeline in a graph through the DOT language.
# Convert the estimator into a graph description written in the DOT
# language; the DataFrame supplies the input column names for the graph.
from mlinsights.plotting import pipeline2dot
dot = pipeline2dot(clf, df)
print(dot)
digraph{ orientation=portrait; nodesep=0.05; ranksep=0.25; sch0[label="<f0> X1|<f1> X2|<f2> X3|<f3> X4",shape=record,fontsize=8]; node1[label="union",shape=box,style="filled,rounded",color=cyan,fontsize=12]; sch0:f0 -> node1; sch0:f1 -> node1; sch0:f2 -> node1; sch0:f3 -> node1; sch1[label="<f0> -v-0",shape=record,fontsize=8]; node1 -> sch1:f0; node2[label="LogisticRegression",shape=box,style="filled,rounded",color=yellow,fontsize=12]; sch1:f0 -> node2; sch2[label="<f0> PredictedLabel|<f1> Probabilities",shape=record,fontsize=8]; node2 -> sch2:f0; node2 -> sch2:f1; }
It is a lot better with an image.
# Persist the DOT description so the Graphviz ``dot`` binary can render it.
dot_file = "graph.dot"
with open(dot_file, "w", encoding="utf-8") as handle:
    handle.write(dot)
# might be needed on windows: Graphviz installs outside PATH by default,
# so append its standard install location when it is missing.
import sys
import os
if sys.platform.startswith("win") and "Graphviz" not in os.environ.get("PATH", ""):
    # ``os.environ.get`` avoids a KeyError when PATH is not defined at all.
    os.environ['PATH'] = os.environ.get('PATH', '') + r';C:\Program Files (x86)\Graphviz2.38\bin'
from pyquickhelper.loghelper import run_cmd

# Render the DOT file to PNG. ``-Gdpi=300`` sets the graph ``dpi``
# attribute; the previous ``-G=300`` was malformed since ``-G`` expects
# a ``name=value`` pair.
cmd = "dot -Gdpi=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True, fLOG=print);
[run_cmd] execute dot -G=300 -Tpng graph.dot -ograph.dot.png end of execution dot -G=300 -Tpng graph.dot -ograph.dot.png
# Load the rendered PNG so the notebook displays it as the cell output.
from PIL import Image
img = Image.open("graph.dot.png")
img
scikit-learn introduced a couple of transforms to play with features in a single pipeline. The following example is taken from Column Transformer with Mixed Types.
# Imports consolidated: the original cell imported StandardScaler and
# LogisticRegression twice. LinearRegression and datasets are kept even
# though unused here because later cells rely on them.
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column names of the Titanic dataset the transformers pick from.
columns = ['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',
           'cabin', 'embarked', 'boat', 'body', 'home.dest']

# Numeric columns: impute missing values with the median, then standardize.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill missing entries with 'missing', then one-hot
# encode; unknown categories at predict time are ignored.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column subset through its dedicated transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])
clf
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), ['age', 'fare']), ('cat', Pipeline(steps=[('imputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['embarked', 'sex', 'pclass'])])), ('classifier', LogisticRegression())])
Let's see it first as a simplified text.
# Text rendering of the pipeline: one line per step, indentation
# reflecting the nesting of the transformers.
from mlinsights.plotting import pipeline2str
print(pipeline2str(clf))
Pipeline ColumnTransformer Pipeline(age,fare) SimpleImputer StandardScaler Pipeline(embarked,sex,pclass) SimpleImputer OneHotEncoder LogisticRegression
# Convert the full pipeline to DOT, using the raw column names this time.
dot = pipeline2dot(clf, columns)
dot_file = "graph2.dot"
with open(dot_file, "w", encoding="utf-8") as f:
    f.write(dot)
# ``-Gdpi=300`` (``name=value``) fixes the malformed ``-G=300`` option.
cmd = "dot -Gdpi=300 -Tpng {0} -o{0}.png".format(dot_file)
run_cmd(cmd, wait=True, fLOG=print);
[run_cmd] execute dot -G=300 -Tpng graph2.dot -ograph2.dot.png end of execution dot -G=300 -Tpng graph2.dot -ograph2.dot.png
# Display the rendered pipeline graph.
img = Image.open("graph2.dot.png")
img
# Render the DOT graph directly in the notebook with viz.js, which does
# not require a local Graphviz installation.
from jyquickhelper import RenderJsDot
RenderJsDot(dot)
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

# Two scalers running side by side: their outputs are concatenated
# by the FeatureUnion after the polynomial expansion.
union = FeatureUnion([
    ('scaler2', MinMaxScaler()),
    ('scaler3', StandardScaler()),
])
model = Pipeline([('poly', PolynomialFeatures()),
                  ('union', union)])
dot = pipeline2dot(model, columns)
RenderJsDot(dot)
It is difficult to access intermediate outputs with scikit-learn but it may be interesting to do so. The method http://www.xavierdupre.fr/app/mlinsights/helpsphinx/mlinsights/helpers/pipeline.html#mlinsights.helpers.pipeline.alter_pipeline_for_debugging modifies the pipeline to intercept intermediate outputs.
from numpy.random import randn

# Small pipeline fitted on random data, only used to demonstrate the
# debugging helpers below.
model = Pipeline([
    ('scaler1', StandardScaler()),
    ('union', FeatureUnion([
        ('scaler2', StandardScaler()),
        ('scaler3', MinMaxScaler()),
    ])),
    ('lr', LinearRegression()),
])
X = randn(4, 5)
y = randn(4)
model.fit(X, y)
Pipeline(steps=[('scaler1', StandardScaler()), ('union', FeatureUnion(transformer_list=[('scaler2', StandardScaler()), ('scaler3', MinMaxScaler())])), ('lr', LinearRegression())])
# Summarize the fitted pipeline structure as indented text.
print(pipeline2str(model))
Pipeline StandardScaler FeatureUnion StandardScaler MinMaxScaler LinearRegression
Let's now modify the pipeline to get the intermediate outputs.
# Patch every estimator of the pipeline in place so that its inputs and
# outputs are recorded in a ``_debug`` member during subsequent calls.
from mlinsights.helpers.pipeline import alter_pipeline_for_debugging
alter_pipeline_for_debugging(model)
The function adds a member _debug
which stores inputs and outputs in every piece of the pipeline.
# The first step now exposes ``_debug`` (still empty: nothing ran yet).
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler)
# Run a prediction: every patched step records its inputs and outputs.
model.predict(X)
array([ 0.73619378, 0.87936142, -0.56528874, -0.2675163 ])
The member was populated with inputs and outputs.
# After predict, ``_debug`` shows the captured inputs and outputs.
model.steps[0][1]._debug
BaseEstimatorDebugInformation(StandardScaler) transform( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 1.22836841 2.35164607 -0.37367786 0.61490475 -0.45377634] [-0.77187962 0.43540786 0.20465106 0.8910651 -0.23104796] [-0.36750208 0.35154324 1.78609517 -1.59325463 1.51595267] [ 1.37547609 1.59470748 -0.5932628 0.57822003 0.56034736]] ) -> ( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 0.90946066 1.4000516 -0.67682808 0.49311806 -1.03765861] [-1.20030006 -0.89626498 -0.05514595 0.76980985 -0.7493565 ] [-0.77378303 -0.99676381 1.64484777 -1.71929067 1.51198065] [ 1.06462242 0.49297719 -0.91287374 0.45636275 0.27503446]] )
Every piece behaves the same way.
from mlinsights.helpers.pipeline import enumerate_pipeline_models

# Walk every estimator of the pipeline and print its recorded debug
# information. Loop variables are renamed: the original rebound the
# outer ``model`` on each iteration and shadowed the builtin ``vars``,
# breaking any later cell that expects ``model`` to be the pipeline.
for coor, submodel, subvars in enumerate_pipeline_models(model):
    print(coor)
    print(submodel._debug)
(0,) BaseEstimatorDebugInformation(Pipeline) predict( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 1.22836841 2.35164607 -0.37367786 0.61490475 -0.45377634] [-0.77187962 0.43540786 0.20465106 0.8910651 -0.23104796] [-0.36750208 0.35154324 1.78609517 -1.59325463 1.51595267] [ 1.37547609 1.59470748 -0.5932628 0.57822003 0.56034736]] ) -> ( shape=(4,) type=<class 'numpy.ndarray'> [ 0.73619378 0.87936142 -0.56528874 -0.2675163 ] ) (0, 0) BaseEstimatorDebugInformation(StandardScaler) transform( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 1.22836841 2.35164607 -0.37367786 0.61490475 -0.45377634] [-0.77187962 0.43540786 0.20465106 0.8910651 -0.23104796] [-0.36750208 0.35154324 1.78609517 -1.59325463 1.51595267] [ 1.37547609 1.59470748 -0.5932628 0.57822003 0.56034736]] ) -> ( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 0.90946066 1.4000516 -0.67682808 0.49311806 -1.03765861] [-1.20030006 -0.89626498 -0.05514595 0.76980985 -0.7493565 ] [-0.77378303 -0.99676381 1.64484777 -1.71929067 1.51198065] [ 1.06462242 0.49297719 -0.91287374 0.45636275 0.27503446]] ) (0, 1) BaseEstimatorDebugInformation(FeatureUnion) transform( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 0.90946066 1.4000516 -0.67682808 0.49311806 -1.03765861] [-1.20030006 -0.89626498 -0.05514595 0.76980985 -0.7493565 ] [-0.77378303 -0.99676381 1.64484777 -1.71929067 1.51198065] [ 1.06462242 0.49297719 -0.91287374 0.45636275 0.27503446]] ) -> ( shape=(4, 10) type=<class 'numpy.ndarray'> [[ 0.90946066 1.4000516 -0.67682808 0.49311806 -1.03765861 0.93149357 1. 0.09228748 0.88883864 0. ] [-1.20030006 -0.89626498 -0.05514595 0.76980985 -0.7493565 0. 0.04193015 0.33534839 1. 0.11307564] [-0.77378303 -0.99676381 1.64484777 -1.71929067 1.51198065 0.18831419 ... 
) (0, 1, 0) BaseEstimatorDebugInformation(StandardScaler) transform( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 0.90946066 1.4000516 -0.67682808 0.49311806 -1.03765861] [-1.20030006 -0.89626498 -0.05514595 0.76980985 -0.7493565 ] [-0.77378303 -0.99676381 1.64484777 -1.71929067 1.51198065] [ 1.06462242 0.49297719 -0.91287374 0.45636275 0.27503446]] ) -> ( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 0.90946066 1.4000516 -0.67682808 0.49311806 -1.03765861] [-1.20030006 -0.89626498 -0.05514595 0.76980985 -0.7493565 ] [-0.77378303 -0.99676381 1.64484777 -1.71929067 1.51198065] [ 1.06462242 0.49297719 -0.91287374 0.45636275 0.27503446]] ) (0, 1, 1) BaseEstimatorDebugInformation(MinMaxScaler) transform( shape=(4, 5) type=<class 'numpy.ndarray'> [[ 0.90946066 1.4000516 -0.67682808 0.49311806 -1.03765861] [-1.20030006 -0.89626498 -0.05514595 0.76980985 -0.7493565 ] [-0.77378303 -0.99676381 1.64484777 -1.71929067 1.51198065] [ 1.06462242 0.49297719 -0.91287374 0.45636275 0.27503446]] ) -> ( shape=(4, 5) type=<class 'numpy.ndarray'> [[0.93149357 1. 0.09228748 0.88883864 0. ] [0. 0.04193015 0.33534839 1. 0.11307564] [0.18831419 0. 1. 0. 1. ] [1. 0.62155016 0. 0.87407214 0.51485443]] ) (0, 2) BaseEstimatorDebugInformation(LinearRegression) predict( shape=(4, 10) type=<class 'numpy.ndarray'> [[ 0.90946066 1.4000516 -0.67682808 0.49311806 -1.03765861 0.93149357 1. 0.09228748 0.88883864 0. ] [-1.20030006 -0.89626498 -0.05514595 0.76980985 -0.7493565 0. 0.04193015 0.33534839 1. 0.11307564] [-0.77378303 -0.99676381 1.64484777 -1.71929067 1.51198065 0.18831419 ... ) -> ( shape=(4,) type=<class 'numpy.ndarray'> [ 0.73619378 0.87936142 -0.56528874 -0.2675163 ] )