Caching algorithm for a GridSearchCV#

Ideas #

The goal is to measure the impact of using a cache while optimizing a pipeline:

[
    ('scale', MinMaxScaler()),
    ('pca', PCA(2)),
    ('poly', PolynomialFeatures()),
    ('bins', KBinsDiscretizer()),
    ('lr', LogisticRegression(solver='liblinear'))
]

With the following parameters:

params_grid = {
    'scale__feature_range': [(0, 1), (-1, 1)],
    'pca__n_components': [2, 4],
    'poly__degree': [2, 3],
    'bins__n_bins': [5],
    'bins__encode': ["onehot-dense", "ordinal"],
    'lr__penalty': ['l1', 'l2'],
}

It looks into different ways to speed up the optimization by caching. One option is not implemented in scikit-learn: PipelineCache, which implements a cache in memory, as opposed to joblib, which stores everything on disk. This implementation is faster when the training runs with one process; joblib does a better job when the number of jobs and processes is higher, even if it may store a huge amount of data.

Graphs #

(Source code, png, hires.png, pdf)

(Source code, png, hires.png, pdf)

Machine used to run the test #

<<<

# Prints, as an RST table, the CSV file describing the machine
# used to run the benchmark.
from pyquickhelper.pandashelper import df2rst
import pandas
# NOTE(review): `os` and `__WD__` are not defined in this snippet,
# presumably they are provided by the documentation runner - confirm.
name = os.path.join(
    __WD__, "../../scikit-learn/results/bench_plot_gridsearch_cache.time.csv")
df = pandas.read_csv(name)
print(df2rst(df, number_format=4))

>>>

name	version	value
date	2019-12-11
python		3.7.2 (default, Mar 1 2019, 18:34:21) [GCC 6.3.0 20170516]
platform		linux
OS		Linux-4.9.0-8-amd64-x86_64-with-debian-9.6
machine		x86_64
processor
release		4.9.0-8-amd64
architecture		(‘64bit’, ‘’)
numpy	1.17.4	openblas, language=c
pandas	0.25.3
sklearn	0.22

Raw results #

bench_plot_gridsearch_cache.csv

<<<

# Prints, as an RST table, the CSV file holding the raw measures
# of the benchmark.
from pyquickhelper.pandashelper import df2rst
import pandas
# NOTE(review): `os` and `__WD__` are not defined in this snippet,
# presumably they are provided by the documentation runner - confirm.
name = os.path.join(
    __WD__, "../../scikit-learn/results/bench_plot_gridsearch_cache.csv")
df = pandas.read_csv(name)
print(df2rst(df, number_format=4))

>>>

test	N	n_jobs	dim	repeat	number	min	max	mean	lower	upper	count	median
joblib	1000	1	5	3	1	1.786	2.188	1.972	1.786	2.188	3	1.944
memory	1000	1	5	3	1	1.103	1.26	1.17	1.103	1.26	3	1.148
no	1000	1	5	3	1	1.43	1.552	1.499	1.43	1.552	3	1.515
joblib	1000	2	5	3	1	1.768	1.876	1.812	1.768	1.876	3	1.793
memory	1000	2	5	3	1	1.101	1.369	1.197	1.101	1.369	3	1.121
no	1000	2	5	3	1	1.506	1.562	1.534	1.506	1.562	3	1.532
joblib	1000	3	5	3	1	1.867	1.937	1.894	1.867	1.937	3	1.878
memory	1000	3	5	3	1	1.136	1.206	1.161	1.136	1.206	3	1.14
no	1000	3	5	3	1	1.483	1.67	1.552	1.483	1.67	3	1.501
joblib	10000	1	5	3	1	8.196	11.08	9.748	8.196	11.08	3	9.971
memory	10000	1	5	3	1	8.239	9.818	8.98	8.239	9.818	3	8.881
no	10000	1	5	3	1	7.404	9.878	9.044	7.404	9.878	3	9.849
joblib	10000	2	5	3	1	7.962	9.023	8.493	7.962	9.023	3	8.495
memory	10000	2	5	3	1	7.151	8.844	7.834	7.151	8.844	3	7.507
no	10000	2	5	3	1	7.863	8.447	8.113	7.863	8.447	3	8.028
joblib	10000	3	5	3	1	8.618	11.33	9.953	8.618	11.33	3	9.912
memory	10000	3	5	3	1	7.799	10.2	8.881	7.799	10.2	3	8.65
no	10000	3	5	3	1	8.4	10.19	9.442	8.4	10.19	3	9.734
joblib	20000	1	5	3	1	25.18	29.03	27.56	25.18	29.03	3	28.45
memory	20000	1	5	3	1	24.73	26.98	25.89	24.73	26.98	3	25.95
no	20000	1	5	3	1	29.89	30.82	30.23	29.89	30.82	3	29.99
joblib	20000	2	5	3	1	24.89	31.79	27.85	24.89	31.79	3	26.87
memory	20000	2	5	3	1	22.84	24.54	23.55	22.84	24.54	3	23.26
no	20000	2	5	3	1	24.3	29.11	27.19	24.3	29.11	3	28.16
joblib	20000	3	5	3	1	19.18	26.3	22.04	19.18	26.3	3	20.64
memory	20000	3	5	3	1	21.35	29.79	26.07	21.35	29.79	3	27.07
no	20000	3	5	3	1	22.34	26.51	25	22.34	26.51	3	26.15
joblib	1000	1	10	3	1	2.099	2.566	2.344	2.099	2.566	3	2.367
memory	1000	1	10	3	1	1.395	1.656	1.55	1.395	1.656	3	1.6
no	1000	1	10	3	1	1.892	2.824	2.374	1.892	2.824	3	2.406
joblib	1000	2	10	3	1	2.986	3.915	3.567	2.986	3.915	3	3.799
memory	1000	2	10	3	1	1.818	2.047	1.954	1.818	2.047	3	1.998
no	1000	2	10	3	1	2.193	2.872	2.589	2.193	2.872	3	2.701
joblib	1000	3	10	3	1	2.43	2.837	2.611	2.43	2.837	3	2.566
memory	1000	3	10	3	1	1.355	1.999	1.611	1.355	1.999	3	1.478
no	1000	3	10	3	1	2.121	3.114	2.553	2.121	3.114	3	2.423
joblib	10000	1	10	3	1	11.42	12.6	12.05	11.42	12.6	3	12.14
memory	10000	1	10	3	1	8.256	12.67	10.11	8.256	12.67	3	9.419
no	10000	1	10	3	1	10.9	12.12	11.65	10.9	12.12	3	11.94
joblib	10000	2	10	3	1	9.461	10.88	10.4	9.461	10.88	3	10.85
memory	10000	2	10	3	1	8.861	10.52	9.494	8.861	10.52	3	9.1
no	10000	2	10	3	1	11.74	17.74	14.12	11.74	17.74	3	12.88
joblib	10000	3	10	3	1	14.77	18.09	16.53	14.77	18.09	3	16.71
memory	10000	3	10	3	1	12.91	14.14	13.72	12.91	14.14	3	14.11
no	10000	3	10	3	1	16.75	19.33	18.33	16.75	19.33	3	18.9
joblib	20000	1	10	3	1	21.82	27.33	24.12	21.82	27.33	3	23.2
memory	20000	1	10	3	1	24.41	26.39	25.68	24.41	26.39	3	26.24
no	20000	1	10	3	1	22.65	28.6	26.1	22.65	28.6	3	27.04
joblib	20000	2	10	3	1	22.67	25.3	24.15	22.67	25.3	3	24.46
memory	20000	2	10	3	1	20.16	28.96	23.9	20.16	28.96	3	22.57
no	20000	2	10	3	1	24.36	25.48	24.81	24.36	25.48	3	24.61
joblib	20000	3	10	3	1	19.6	29.83	25.95	19.6	29.83	3	28.42
memory	20000	3	10	3	1	19.58	40.71	27.74	19.58	40.71	3	22.92
no	20000	3	10	3	1	22.91	34.48	29.58	22.91	34.48	3	31.34
joblib	1000	1	20	3	1	2.007	2.417	2.256	2.007	2.417	3	2.343
memory	1000	1	20	3	1	1.396	1.49	1.453	1.396	1.49	3	1.474
no	1000	1	20	3	1	2.056	2.842	2.512	2.056	2.842	3	2.639
joblib	1000	2	20	3	1	2.218	2.34	2.271	2.218	2.34	3	2.256
memory	1000	2	20	3	1	1.196	2.251	1.627	1.196	2.251	3	1.434
no	1000	2	20	3	1	2.237	2.67	2.404	2.237	2.67	3	2.306
joblib	1000	3	20	3	1	2.293	2.39	2.352	2.293	2.39	3	2.372
memory	1000	3	20	3	1	1.304	1.38	1.333	1.304	1.38	3	1.314
no	1000	3	20	3	1	2.309	2.512	2.393	2.309	2.512	3	2.358
joblib	10000	1	20	3	1	9.696	9.876	9.815	9.696	9.876	3	9.873
memory	10000	1	20	3	1	9.165	9.837	9.567	9.165	9.837	3	9.699
no	10000	1	20	3	1	9.676	12.24	11.09	9.676	12.24	3	11.36
joblib	10000	2	20	3	1	9.842	10.89	10.22	9.842	10.89	3	9.922
memory	10000	2	20	3	1	8.27	10.54	9.606	8.27	10.54	3	10.01
no	10000	2	20	3	1	11.54	12.53	11.9	11.54	12.53	3	11.63
joblib	10000	3	20	3	1	9.975	10.16	10.05	9.975	10.16	3	10
memory	10000	3	20	3	1	7.778	9.401	8.725	7.778	9.401	3	8.994
no	10000	3	20	3	1	11.04	12.39	11.57	11.04	12.39	3	11.28
joblib	20000	1	20	3	1	24.89	29.8	26.53	24.89	29.8	3	24.9
memory	20000	1	20	3	1	20.2	29.04	25.39	20.2	29.04	3	26.94
no	20000	1	20	3	1	27.14	31.51	29.55	27.14	31.51	3	30.01
joblib	20000	2	20	3	1	22.52	25.54	24.47	22.52	25.54	3	25.34
memory	20000	2	20	3	1	21.14	24.59	22.77	21.14	24.59	3	22.59
no	20000	2	20	3	1	29.46	39.01	33.61	29.46	39.01	3	32.36
joblib	20000	3	20	3	1	32.22	32.69	32.46	32.22	32.69	3	32.48
memory	20000	3	20	3	1	22.5	160.4	69.74	22.5	160.4	3	26.38
no	20000	3	20	3	1	32.19	43.45	38.42	32.19	43.45	3	39.62

Benchmark code #

# coding: utf-8
"""
Benchmark of grid search using caching.
"""
# Authors: Xavier Dupré (benchmark)
# License: MIT
from time import time
from itertools import combinations, chain
from itertools import combinations_with_replacement as combinations_w_r

import numpy as np
from numpy.random import rand
from numpy.testing import assert_almost_equal
import matplotlib.pyplot as plt
import pandas
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.utils._testing import ignore_warnings
import sklearn.utils
from mlinsights.mlbatch import PipelineCache, MLCache

##############################
# Starts a dask cluster
# +++++++++++++++++++++

# Tells the rest of the script whether a dask cluster is available.
# When it is True, the benchmark adds the 'dask' strategy and the
# script closes the cluster once the benchmark is done.
has_dask = False

# The dask start-up below is disabled: it is kept inside a bare string
# literal, so none of it runs and ``has_dask`` stays False.
"""
# Does not work yet.
from distributed import Client, LocalCluster
try:
    cluster = LocalCluster()
    print(cluster)
    client = Client(cluster)
    print(client)
    has_dask = True
except Exception as e:
    print("Cannot use dask due to {0}.".format(e))
    has_dask = False
"""

##############################
# Implementations to benchmark
# ++++++++++++++++++++++++++++

from pymlbenchmark.benchmark import BenchPerf, BenchPerfTest
from pymlbenchmark.datasets import random_binary_classification


class GridSearchBenchPerfTest(BenchPerfTest):
    """
    Benchmarks ``GridSearchCV.fit`` on one fixed pipeline
    (scale, pca, poly, bins, lr) built with different caching
    strategies: ``'no'``, ``'joblib'``, ``'memory'`` and, when
    ``has_dask`` is True, ``'dask'``.
    """

    def __init__(self, dim=None, n_jobs=1, **opts):
        """
        :param dim: number of features, it must be specified,
            it is only checked here and not stored
        :param n_jobs: number of jobs given to *GridSearchCV*
            and to the parallel backend
        :param opts: forwarded to ``BenchPerfTest.__init__``
        """
        assert dim is not None
        BenchPerfTest.__init__(self, **opts)
        self.n_jobs = n_jobs

    def _make_model(self, dim, cache, n_jobs):
        """
        Builds the grid search for one caching strategy.

        :param dim: number of features, only used to build the cache name
        :param cache: ``None`` or ``'no'`` (no cache), ``'joblib'``,
            ``'dask'`` or ``'memory'``
        :param n_jobs: number of jobs, given to *GridSearchCV* and
            used to build the cache name
        :return: an unfitted *GridSearchCV*
        :raises ValueError: if *cache* is not one of the known values
        """
        if cache in (None, 'no'):
            cl = Pipeline
            ps = dict()
        elif cache == 'joblib':
            # A string given to ``memory`` is the name of the cache folder.
            cl = Pipeline
            ps = dict(memory='jb-%d-%d' % (dim, n_jobs))
        elif cache == 'dask':
            cl = Pipeline
            ps = dict(memory='dk-%d-%d' % (dim, n_jobs))
        elif cache == "memory":
            # PipelineCache identifies its cache with a name.
            cl = PipelineCache
            ps = dict(cache_name='memory-%d-%d' % (dim, n_jobs))
        else:
            raise ValueError("Unknown cache value: '{0}'.".format(cache))

        model = cl([
            ('scale', MinMaxScaler()),
            ('pca', PCA(2)),
            ('poly', PolynomialFeatures()),
            ('bins', KBinsDiscretizer()),
            ('lr', LogisticRegression(solver='liblinear'))],
            **ps)
        params_grid = {
            'scale__feature_range': [(0, 1)],
            'pca__n_components': [2, 4],
            'poly__degree': [2, 3],
            'bins__n_bins': [5],
            'bins__encode': ["onehot-dense", "ordinal"],
            'lr__penalty': ['l1', 'l2'],
        }

        return GridSearchCV(model, params_grid, n_jobs=n_jobs, verbose=0)

    def data(self, N=None, dim=None, **opts):
        """
        Returns a new dataset, the benchmark requires a new one each time.

        :param N: number of observations, it must be specified
        :param dim: number of features, it must be specified
        :param opts: unused
        :return: the result of ``random_binary_classification(N, dim)``
        """
        assert N is not None
        assert dim is not None
        return random_binary_classification(N, dim)

    def fcts(self, dim=None, **kwargs):
        """
        Returns the functions to benchmark, one per caching strategy.

        :param dim: number of features, given to :meth:`_make_model`
        :param kwargs: unused
        :return: list of dictionaries ``{'test': <strategy>, 'fct': <callable>}``
            sorted by strategy name, every callable takes ``(X, y)``
            and fits the grid search built for its strategy
        """
        options = ['no', 'joblib', 'memory']
        if has_dask:
            options.append('dask')
        models = {cache: self._make_model(dim, cache, self.n_jobs)
                  for cache in options}

        # Parallel backend used for a strategy, 'threading' otherwise.
        backends = {'joblib': 'loky', 'dask': 'dask'}

        def fit_model(X, y, cache):
            backend = backends.get(cache, 'threading')
            model = models[cache]
            # parallel_backend is a context manager: using it in a
            # ``with`` statement restores the previous backend after the
            # fit instead of leaving this one as the global default.
            with sklearn.utils.parallel_backend(backend, self.n_jobs):
                model.fit(X, y)
            if cache == 'memory':
                # The in-memory cache is removed after every fit.
                MLCache.remove_cache(model.best_estimator_.cache_name)

        # ``c=cache`` binds the current strategy to each lambda.
        return [{'test': cache,
                 'fct': lambda X, y, c=cache: fit_model(X, y, c)}
                for cache in sorted(models)]


##############################
# Benchmark
# +++++++++


@ignore_warnings(category=(FutureWarning, UserWarning, DeprecationWarning))
def run_bench(repeat=3, verbose=False, number=1):
    """
    Runs the benchmark of the caching strategies and prints
    the total time it took.

    :param repeat: forwarded to ``BenchPerf.enumerate_run_benchs``
    :param verbose: forwarded to ``BenchPerf.enumerate_run_benchs``
    :param number: accepted but not used by this function
    :return: a *pandas.DataFrame* built from the enumerated results
    """
    bench = BenchPerf(
        dict(dim=[10, 15]),
        dict(N=[100, 1000, 10000], n_jobs=[1, 3]),
        GridSearchBenchPerfTest)

    # scikit-learn runs with assume_finite=True while the benchmark runs.
    with sklearn.config_context(assume_finite=True):
        started = time()
        rows = list(bench.enumerate_run_benchs(
            repeat=repeat, verbose=verbose))
        elapsed = time() - started

    measures = pandas.DataFrame(rows)
    print("Total time = %0.3f sec\n" % elapsed)
    return measures


##############################
# Run the benchmark
# +++++++++++++++++

# Base name shared by the CSV files this script writes.
filename = "bench_plot_gridsearch_cache"
# Runs the benchmark in verbose mode and saves the measures
# into <filename>.csv.
df = run_bench(verbose=True)
df.to_csv("%s.csv" % filename, index=False)
print(df.head())

# NOTE(review): `cluster` is only created by the disabled dask start-up
# code, this branch cannot run as long as has_dask stays False.
if has_dask:
    cluster.close()

#########################
# Extract information about the machine used
# ++++++++++++++++++++++++++++++++++++++++++

# Saves the description of the machine into <filename>.time.csv.
from pymlbenchmark.context import machine_information
# Packages given to machine_information.
pkgs = ['numpy', 'pandas', 'sklearn']
dfi = pandas.DataFrame(machine_information(pkgs))
dfi.to_csv("%s.time.csv" % filename, index=False)
print(dfi)


#############################
# Plot the results
# ++++++++++++++++

# Plots the measures stored in `df`.
from pymlbenchmark.plotting import plot_bench_results
print(df.columns)
# NOTE(review): presumably one row of graphs per N, one column per
# n_jobs and one curve per caching strategy - confirm against the
# documentation of plot_bench_results.
plot_bench_results(df, row_cols=['N'],
                   col_cols=['n_jobs'], x_value='dim',
                   hue_cols=['test'],
                   cmp_col_values='test',
                   title="GridSearchCV\nBenchmark caching strategies")
import sys
# The graphs are not displayed when the script receives --quiet.
if "--quiet" not in sys.argv:
    plt.show()

Caching algorithm for a GridSearchCV#

Ideas#

Graphs#

Machine used to run the test#

Raw results#

Benchmark code#

Ideas #

Graphs #

Machine used to run the test #

Raw results #

Benchmark code #