.. _decisiontreelogregrst:

=====================================
Decision Tree and Logistic Regression
=====================================

.. only:: html

    **Links:** :download:`notebook `, :downloadlink:`html `,
    :download:`PDF `, :download:`python `, :downloadlink:`slides `,
    :githublink:`GitHub|_doc/notebooks/sklearn/decision_tree_logreg.ipynb|*`

The notebook demonstrates the model *DecisionTreeLogisticRegression*,
a decision tree in which the decision at each node, usually a threshold
on a single variable, is replaced by a logistic regression.

.. code:: ipython3

    from jyquickhelper import add_notebook_menu
    add_notebook_menu()

.. contents::
    :local:

.. code:: ipython3

    %matplotlib inline

.. code:: ipython3

    import warnings
    warnings.simplefilter("ignore")

Iris dataset and logistic regression
------------------------------------

The following code shows the decision boundaries produced by two
machine learning models on the `Iris dataset `__.

.. code:: ipython3

    import numpy
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split


    def plot_classifier_decision_zone(clf, X, y, title=None, ax=None):
        if ax is None:
            ax = plt.gca()
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        dhx = (x_max - x_min) / 100
        dhy = (y_max - y_min) / 100
        xx, yy = numpy.meshgrid(numpy.arange(x_min, x_max, dhx),
                                numpy.arange(y_min, y_max, dhy))
        Z = clf.predict(numpy.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, alpha=0.5)
        ax.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor='k', lw=0.5)
        if title is not None:
            ax.set_title(title)


    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    y = y % 2
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.6, shuffle=True)

.. code:: ipython3

    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train, y_train)

    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    plot_classifier_decision_zone(lr, X_test, y_test, ax=ax[0],
                                  title="LogisticRegression")
    plot_classifier_decision_zone(dt, X_test, y_test, ax=ax[1],
                                  title="DecisionTreeClassifier")

.. image:: decision_tree_logreg_6_0.png

The logistic regression is not very stable on this sort of problem:
taking the label modulo 2 merges classes 0 and 2, so the positive class
(versicolor) lies between two clusters of the negative class and no
linear separator can work on this dataset, while the decision tree
handles it easily. Let's dig into it.

DecisionTreeLogisticRegression
------------------------------
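
Before using the implementation from *mlinsights*, here is the idea in
a nutshell: every node of the tree fits a logistic regression on the
samples it receives and routes them to its two children according to
the predicted class, and each child repeats the operation on its
subset. The sketch below is a simplified illustration written for this
notebook, not the library's code; the class ``SimpleLogRegTree`` and
its parameters are made up for the example and only handle binary
labels.

.. code:: ipython3

    # simplified sketch of the algorithm (illustration only, binary labels 0/1)
    from sklearn.linear_model import LogisticRegression


    class SimpleLogRegTree:
        def __init__(self, max_depth=2, min_samples=10):
            self.max_depth = max_depth
            self.min_samples = min_samples

        def fit(self, X, y, depth=0):
            # every node owns a logistic regression trained on its subset
            self.model_ = LogisticRegression(solver='liblinear').fit(X, y)
            self.below_ = self.above_ = None
            side = self.model_.predict(X)
            if depth < self.max_depth:
                for value, attr in ((0, 'below_'), (1, 'above_')):
                    mask = side == value
                    # recurse only if the child is big enough and still mixes both classes
                    if mask.sum() >= self.min_samples and len(set(y[mask])) > 1:
                        child = SimpleLogRegTree(self.max_depth, self.min_samples)
                        setattr(self, attr, child.fit(X[mask], y[mask], depth + 1))
            return self

        def predict(self, X):
            # the node's regression decides which child refines the prediction
            pred = self.model_.predict(X)
            for value, child in ((0, self.below_), (1, self.above_)):
                if child is not None:
                    mask = pred == value
                    if mask.any():
                        pred[mask] = child.predict(X[mask])
            return pred


    SimpleLogRegTree(max_depth=2).fit(X_train, y_train).predict(X_test)[:10]

The class implemented in *mlinsights* follows the same scheme but adds
pruning options and the *fit_improve_algo* strategies used below.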

.. code:: ipython3

    from mlinsights.mlmodel import DecisionTreeLogisticRegression

    dtlr = DecisionTreeLogisticRegression(
        estimator=LogisticRegression(solver='liblinear'),
        min_samples_leaf=10, min_samples_split=10,
        max_depth=1, fit_improve_algo='none')
    dtlr.fit(X_train, y_train)

    dtlr2 = DecisionTreeLogisticRegression(
        estimator=LogisticRegression(solver='liblinear'),
        min_samples_leaf=4, min_samples_split=4,
        max_depth=10, fit_improve_algo='intercept_sort_always')
    dtlr2.fit(X_train, y_train)

    fig, ax = plt.subplots(2, 2, figsize=(10, 8))
    plot_classifier_decision_zone(
        dtlr, X_train, y_train, ax=ax[0, 0],
        title="DecisionTreeLogisticRegression\ndepth=%d - train" % dtlr.tree_depth_)
    plot_classifier_decision_zone(
        dtlr2, X_train, y_train, ax=ax[0, 1],
        title="DecisionTreeLogisticRegression\ndepth=%d - train" % dtlr2.tree_depth_)
    plot_classifier_decision_zone(
        dtlr, X_test, y_test, ax=ax[1, 0],
        title="DecisionTreeLogisticRegression\ndepth=%d - test" % dtlr.tree_depth_)
    plot_classifier_decision_zone(
        dtlr2, X_test, y_test, ax=ax[1, 1],
        title="DecisionTreeLogisticRegression\ndepth=%d - test" % dtlr2.tree_depth_)

.. image:: decision_tree_logreg_9_0.png

.. code:: ipython3

    from pandas import DataFrame

    rows = []
    for model in [lr, dt, dtlr, dtlr2]:
        val = (" - depth=%d" % model.tree_depth_) if hasattr(model, 'tree_depth_') else ""
        obs = dict(name="%s%s" % (model.__class__.__name__, val),
                   score=model.score(X_test, y_test))
        rows.append(obs)

    DataFrame(rows)

========================================  ========
name                                      score
========================================  ========
LogisticRegression                        0.644444
DecisionTreeClassifier                    0.933333
DecisionTreeLogisticRegression - depth=1  0.700000
DecisionTreeLogisticRegression - depth=5  0.855556
========================================  ========
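
The depth acts as the usual capacity parameter. A quick way to see it
is to train the same model for several depths and record the test
accuracy; the loop below is a small addition for this write-up and only
reuses the arguments already shown above (the exact scores depend on
the random train/test split).

.. code:: ipython3

    # test accuracy of DecisionTreeLogisticRegression for increasing depth
    # (the numbers change with the random train/test split)
    scores = []
    for depth in range(1, 6):
        model = DecisionTreeLogisticRegression(
            estimator=LogisticRegression(solver='liblinear'),
            min_samples_leaf=4, min_samples_split=4,
            max_depth=depth, fit_improve_algo='intercept_sort_always')
        model.fit(X_train, y_train)
        scores.append(dict(max_depth=depth, tree_depth=model.tree_depth_,
                           score=model.score(X_test, y_test)))
    DataFrame(scores)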

A first example
---------------

.. code:: ipython3

    import numpy
    from scipy.spatial.distance import cdist


    def random_set_simple(n):
        X = numpy.random.rand(n, 2)
        y = ((X[:, 0] ** 2 + X[:, 1] ** 2) <= 1).astype(numpy.int32).ravel()
        return X, y


    X, y = random_set_simple(2000)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    dt = DecisionTreeClassifier(max_depth=3)
    dt.fit(X_train, y_train)

    dt8 = DecisionTreeClassifier(max_depth=10)
    dt8.fit(X_train, y_train)

    fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
    plot_classifier_decision_zone(
        dt, X_test, y_test, ax=ax[0],
        title="DecisionTree - max_depth=%d\nacc=%1.2f" % (
            dt.max_depth, dt.score(X_test, y_test)))
    plot_classifier_decision_zone(
        dt8, X_test, y_test, ax=ax[1],
        title="DecisionTree - max_depth=%d\nacc=%1.2f" % (
            dt8.max_depth, dt8.score(X_test, y_test)))
    ax[0].set_xlim([0, 1])
    ax[1].set_xlim([0, 1])
    ax[0].set_ylim([0, 1]);

.. image:: decision_tree_logreg_12_0.png

.. code:: ipython3

    dtlr = DecisionTreeLogisticRegression(
        max_depth=3, fit_improve_algo='intercept_sort_always', verbose=1)
    dtlr.fit(X_train, y_train)

    dtlr8 = DecisionTreeLogisticRegression(
        max_depth=10, min_samples_split=4,
        fit_improve_algo='intercept_sort_always')
    dtlr8.fit(X_train, y_train)

    fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
    plot_classifier_decision_zone(
        dtlr, X_test, y_test, ax=ax[0],
        title="DecisionTreeLogReg - depth=%d\nacc=%1.2f" % (
            dtlr.tree_depth_, dtlr.score(X_test, y_test)))
    plot_classifier_decision_zone(
        dtlr8, X_test, y_test, ax=ax[1],
        title="DecisionTreeLogReg - depth=%d\nacc=%1.2f" % (
            dtlr8.tree_depth_, dtlr8.score(X_test, y_test)))
    ax[0].set_xlim([0, 1])
    ax[1].set_xlim([0, 1])
    ax[0].set_ylim([0, 1]);

.. parsed-literal::

    [DTLR ] trained acc 0.96 N=1500
    [DTLRI] change intercept 11.677031 --> 10.877451 in [0.278070, 16.549686]
    [DTLR*] above: n_class=2 N=1500 - 1106/1500
    [DTLR ] trained acc 0.99 N=1106
    [DTLRI] change intercept 6.021739 --> 1.840312 in [0.063825, 2.640076]
    [DTLR*] above: n_class=1 N=1106 - 743/1500
    [DTLR*] below: n_class=2 N=1106 - 363/1500
    [DTLR ] trained acc 0.96 N=363
    [DTLRI] change intercept 3.970377 --> 0.770538 in [0.461779, 0.985259]
    [DTLR*] below: n_class=2 N=1500 - 394/1500
    [DTLR ] trained acc 0.80 N=394
    [DTLRI] change intercept 4.763873 --> 5.983343 in [5.225083, 8.055335]
    [DTLR*] above: n_class=2 N=394 - 162/1500
    [DTLR ] trained acc 0.54 N=162
    [DTLRI] change intercept 1.289949 --> 1.351619 in [1.036507, 1.533679]
    [DTLR*] below: n_class=1 N=394 - 232/1500

.. image:: decision_tree_logreg_13_1.png
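
The ``[DTLRI]`` lines of the verbose output report that the intercept
of the logistic regression trained at a node is adjusted after fitting.
The snippet below is **not** the library's implementation of
*fit_improve_algo*; it is a generic illustration of that kind of
post-processing: scan the sorted decision scores and move the intercept
to the cut which maximises accuracy on the training subset. The helper
``tune_intercept`` is made up for this example.

.. code:: ipython3

    # generic intercept tuning (illustration only, not mlinsights' code)
    def tune_intercept(logreg, X, y):
        # scores without the intercept: decision_function(X) = raw + intercept
        raw = logreg.decision_function(X) - logreg.intercept_[0]
        best_acc, best_b = -1.0, logreg.intercept_[0]
        # candidate intercepts put the boundary at each observed score
        for t in numpy.sort(raw):
            b = -t
            acc = ((raw + b >= 0).astype(int) == y).mean()
            if acc > best_acc:
                best_acc, best_b = acc, b
        logreg.intercept_[:] = best_b
        return logreg


    tuned = tune_intercept(
        LogisticRegression(solver='liblinear').fit(X_train, y_train),
        X_train, y_train)
    tuned.score(X_train, y_train)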

.. code:: ipython3

    from mlinsights.mltree import predict_leaves


    def draw_border(clr, X, y, fct=None, incx=0.1, incy=0.1, figsize=None,
                    border=True, ax=None, s=10., linewidths=0.1):
        h = .02
        x_min, x_max = X[:, 0].min() - incx, X[:, 0].max() + incx
        y_min, y_max = X[:, 1].min() - incy, X[:, 1].max() + incy
        xx, yy = numpy.meshgrid(numpy.arange(x_min, x_max, h),
                                numpy.arange(y_min, y_max, h))
        if fct is None:
            Z = clr.predict(numpy.c_[xx.ravel(), yy.ravel()])
        else:
            Z = fct(clr, numpy.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        cmap = plt.cm.tab20
        Z = Z.reshape(xx.shape)
        if ax is None:
            fig, ax = plt.subplots(1, 1, figsize=figsize or (4, 3))
        ax.pcolormesh(xx, yy, Z, cmap=cmap)

        # Plot also the training points
        ax.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=cmap,
                   s=s, linewidths=linewidths)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        return ax


    fig, ax = plt.subplots(1, 2, figsize=(14, 4))
    draw_border(dt, X_test, y_test, border=False, ax=ax[0])
    ax[0].set_title("Iris")
    draw_border(dt, X, y, border=False, ax=ax[1],
                fct=lambda m, x: predict_leaves(m, x))
    ax[1].set_title("DecisionTree");

.. image:: decision_tree_logreg_14_0.png

.. code:: ipython3

    from tqdm import tqdm

    fig, ax = plt.subplots(6, 4, figsize=(12, 16))

    for i, depth in tqdm(enumerate((1, 2, 3, 4, 5, 6))):
        dtl = DecisionTreeLogisticRegression(
            max_depth=depth, fit_improve_algo='intercept_sort_always',
            min_samples_leaf=2)
        dtl.fit(X_train, y_train)
        draw_border(dtl, X_test, y_test, border=False, ax=ax[i, 0], s=4.)
        draw_border(dtl, X, y, border=False, ax=ax[i, 1],
                    fct=lambda m, x: predict_leaves(m, x), s=4.)
        ax[i, 0].set_title("Depth=%d nodes=%d score=%1.2f" % (
            dtl.tree_depth_, dtl.n_nodes_, dtl.score(X_test, y_test)))
        ax[i, 1].set_title("DTLR Leaves zones");

        dtl = DecisionTreeClassifier(max_depth=depth)
        dtl.fit(X_train, y_train)
        draw_border(dtl, X_test, y_test, border=False, ax=ax[i, 2], s=4.)
        draw_border(dtl, X, y, border=False, ax=ax[i, 3],
                    fct=lambda m, x: predict_leaves(m, x), s=4.)
        ax[i, 2].set_title("Depth=%d nodes=%d score=%1.2f" % (
            dtl.max_depth, dtl.tree_.node_count, dtl.score(X_test, y_test)))
        ax[i, 3].set_title("DT Leaves zones");

        for k in range(ax.shape[1]):
            ax[i, k].get_xaxis().set_visible(False)

.. parsed-literal::

    6it [00:02, 2.92it/s]

.. image:: decision_tree_logreg_15_1.png

Another example designed to fail
--------------------------------

This example is designed to be difficult for a regular decision tree.

.. code:: ipython3

    from scipy.spatial.distance import cdist


    def random_set(n):
        X = numpy.random.rand(n, 2)
        y = (cdist(X, numpy.array([[0.5, 0.5]]), metric='minkowski', p=1) <= 0.5
             ).astype(numpy.int32).ravel()
        return X, y


    X, y = random_set(2000)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    dt = DecisionTreeClassifier(max_depth=3)
    dt.fit(X_train, y_train)

    dt8 = DecisionTreeClassifier(max_depth=10)
    dt8.fit(X_train, y_train)

    fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
    plot_classifier_decision_zone(
        dt, X_test, y_test, ax=ax[0],
        title="DecisionTree - max_depth=%d\nacc=%1.2f" % (
            dt.max_depth, dt.score(X_test, y_test)))
    plot_classifier_decision_zone(
        dt8, X_test, y_test, ax=ax[1],
        title="DecisionTree - max_depth=%d\nacc=%1.2f" % (
            dt8.max_depth, dt8.score(X_test, y_test)))
    ax[0].set_xlim([0, 1])
    ax[1].set_xlim([0, 1])
    ax[0].set_ylim([0, 1]);

.. image:: decision_tree_logreg_17_0.png

The example is a square rotated by 45 degrees: every sample inside the
square is positive, every sample outside is negative. A decision tree
can only approximate this oblique border with horizontal and vertical
lines.
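
To make the geometry explicit, the positive class is the L1 ball of
radius 0.5 centered on (0.5, 0.5), i.e. the intersection of four
half-planes with oblique borders. The small check below was added for
this write-up and verifies that this description matches the labels
returned by *random_set*; each of these oblique borders is a single
linear separator, exactly what a logistic regression placed in a node
can learn and what an axis-aligned split cannot.

.. code:: ipython3

    # the rotated square is the intersection of four oblique half-planes:
    # 0.5 <= x + y <= 1.5 and -0.5 <= x - y <= 0.5
    inside = ((X[:, 0] + X[:, 1] >= 0.5) & (X[:, 0] + X[:, 1] <= 1.5) &
              (X[:, 0] - X[:, 1] >= -0.5) & (X[:, 0] - X[:, 1] <= 0.5))
    # should be 0: both definitions label every sample the same way
    int(numpy.abs(inside.astype(numpy.int32) - y).sum())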

.. code:: ipython3

    dtlr = DecisionTreeLogisticRegression(
        max_depth=3, fit_improve_algo='intercept_sort_always', verbose=1)
    dtlr.fit(X_train, y_train)

    dtlr8 = DecisionTreeLogisticRegression(
        max_depth=10, min_samples_split=4,
        fit_improve_algo='intercept_sort_always')
    dtlr8.fit(X_train, y_train)

    fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
    plot_classifier_decision_zone(
        dtlr, X_test, y_test, ax=ax[0],
        title="DecisionTreeLogReg - depth=%d\nacc=%1.2f" % (
            dtlr.tree_depth_, dtlr.score(X_test, y_test)))
    plot_classifier_decision_zone(
        dtlr8, X_test, y_test, ax=ax[1],
        title="DecisionTreeLogReg - depth=%d\nacc=%1.2f" % (
            dtlr8.tree_depth_, dtlr8.score(X_test, y_test)))
    ax[0].set_xlim([0, 1])
    ax[1].set_xlim([0, 1])
    ax[0].set_ylim([0, 1]);

.. parsed-literal::

    [DTLR ] trained acc 0.50 N=1500
    [DTLRI] change intercept 0.001126 --> 0.019908 in [0.001172, 0.038195]
    [DTLR*] above: n_class=2 N=1500 - 749/1500
    [DTLR ] trained acc 0.64 N=749
    [DTLRI] change intercept -1.972404 --> -2.003562 in [-3.382932, -0.149398]
    [DTLR*] above: n_class=2 N=749 - 377/1500
    [DTLR ] trained acc 0.64 N=377
    [DTLRI] change intercept 1.136431 --> 0.564497 in [0.399068, 0.831867]
    [DTLR*] below: n_class=2 N=749 - 372/1500
    [DTLR ] trained acc 0.77 N=372
    [DTLRI] change intercept -2.481437 --> -1.962176 in [-3.275774, -0.156925]
    [DTLR*] below: n_class=2 N=1500 - 751/1500
    [DTLR ] trained acc 0.66 N=751
    [DTLRI] change intercept 4.143107 --> 4.117942 in [2.662598, 6.063896]
    [DTLR*] above: n_class=2 N=751 - 388/1500
    [DTLR ] trained acc 0.64 N=388
    [DTLRI] change intercept -0.412468 --> -0.999464 in [-1.346126, -0.659144]
    [DTLR*] below: n_class=2 N=751 - 363/1500
    [DTLR ] trained acc 0.75 N=363
    [DTLRI] change intercept 5.485085 --> 6.009627 in [5.307328, 7.827812]

.. image:: decision_tree_logreg_19_1.png

Leaf zones
----------

We use the method *decision_path* to understand which leaf is
responsible for which zone; the helper *predict_leaves* returns the
leaf reached by every sample.

.. code:: ipython3

    fig, ax = plt.subplots(1, 2, figsize=(14, 4))
    draw_border(dtlr, X_test, y_test, border=False, ax=ax[0])
    ax[0].set_title("Iris")
    draw_border(dtlr, X, y, border=False, ax=ax[1],
                fct=lambda m, x: predict_leaves(m, x))
    ax[1].set_title("DecisionTreeLogisticRegression");

.. image:: decision_tree_logreg_21_0.png

.. code:: ipython3

    from tqdm import tqdm

    fig, ax = plt.subplots(6, 4, figsize=(12, 16))

    for i, depth in tqdm(enumerate((1, 2, 3, 4, 5, 6))):
        dtl = DecisionTreeLogisticRegression(
            max_depth=depth, fit_improve_algo='intercept_sort_always',
            min_samples_leaf=2)
        dtl.fit(X_train, y_train)
        draw_border(dtl, X_test, y_test, border=False, ax=ax[i, 0], s=4.)
        draw_border(dtl, X, y, border=False, ax=ax[i, 1],
                    fct=lambda m, x: predict_leaves(m, x), s=4.)
        ax[i, 0].set_title("Depth=%d nodes=%d score=%1.2f" % (
            dtl.tree_depth_, dtl.n_nodes_, dtl.score(X_test, y_test)))
        ax[i, 1].set_title("DTLR Leaves zones");

        dtl = DecisionTreeClassifier(max_depth=depth)
        dtl.fit(X_train, y_train)
        draw_border(dtl, X_test, y_test, border=False, ax=ax[i, 2], s=4.)
        draw_border(dtl, X, y, border=False, ax=ax[i, 3],
                    fct=lambda m, x: predict_leaves(m, x), s=4.)
        ax[i, 2].set_title("Depth=%d nodes=%d score=%1.2f" % (
            dtl.max_depth, dtl.tree_.node_count, dtl.score(X_test, y_test)))
        ax[i, 3].set_title("DT Leaves zones");

.. parsed-literal::

    6it [00:02, 2.29it/s]

.. image:: decision_tree_logreg_22_1.png
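
*predict_leaves* is also useful outside of a plot. The short check
below, added for this write-up, counts how many test samples fall into
each leaf of *dtlr8* and measures the accuracy of the model restricted
to each leaf zone.

.. code:: ipython3

    # number of test samples per leaf of dtlr8
    # and accuracy of the model within each leaf zone
    leaves = predict_leaves(dtlr8, X_test)
    pred = dtlr8.predict(X_test)
    rows = []
    for leaf in sorted(set(leaves)):
        mask = leaves == leaf
        rows.append(dict(leaf=int(leaf), n_samples=int(mask.sum()),
                         accuracy=float((pred[mask] == y_test[mask]).mean())))
    DataFrame(rows)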