Benchmark of PolynomialFeatures + partial_fit of SGDClassifier

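This benchmark looks into a new implementation of PolynomialFeatures proposed in PR13290. It tests the following configurations:

* SGD-ONLY: partial_fit of an SGDClassifier alone, the polynomial expansion being done in a separate preprocessing step
* SGD-SKL: scikit-learn's PolynomialFeatures followed by partial_fit of an SGDClassifier
* SGD-FAST: ExtendedFeatures(kind='poly') from mlinsights followed by partial_fit of an SGDClassifier
* SGD-SLOW: ExtendedFeatures(kind='poly-slow') from mlinsights followed by partial_fit of an SGDClassifier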

This example is a rewrite of the standalone example "Benchmark of PolynomialFeatures + partial_fit of SGDClassifier" using the module pymlbenchmark.

from pymlbenchmark.plotting import plot_bench_results
from pymlbenchmark.context import machine_information
from time import perf_counter as time

import matplotlib.pyplot as plt
import pandas
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier
try:
    from sklearn.utils._testing import ignore_warnings
except ImportError:
    from sklearn.utils.testing import ignore_warnings
from mlinsights.mlmodel import ExtendedFeatures

Implementation to benchmark

from pymlbenchmark.benchmark import BenchPerf, BenchPerfTest
from pymlbenchmark.datasets import random_binary_classification


class PolyBenchPerfTest(BenchPerfTest):
    """Benchmark scenario comparing several polynomial feature expansions.

    Four models are trained once in the constructor. The functions
    returned by :meth:`fcts` then call ``partial_fit`` of the final
    :class:`SGDClassifier` on the data produced by :meth:`data`.
    """

    def __init__(self, dim=None, **opts):
        """Create and fit the four models on 10000 random observations.

        Anything which must not be part of the measure belongs here.

        :param dim: second argument given to
            ``random_binary_classification`` (presumably the number of
            features, to be confirmed), it cannot be None
        :param opts: forwarded to :class:`BenchPerfTest`
        """
        assert dim is not None
        super().__init__(**opts)

        # A classifier alone and three pipelines, one per expansion.
        self.model1 = SGDClassifier()
        self.model2 = make_pipeline(PolynomialFeatures(), SGDClassifier())
        self.model3 = make_pipeline(
            ExtendedFeatures(kind='poly'), SGDClassifier())
        self.model4 = make_pipeline(
            ExtendedFeatures(kind='poly-slow'), SGDClassifier())

        X, y = random_binary_classification(10000, dim)

        # model1 has no transformer: it is trained on features
        # expanded beforehand.
        expanded = PolynomialFeatures().fit_transform(X)
        self.model1.fit(expanded, y)
        for pipeline in (self.model2, self.model3, self.model4):
            pipeline.fit(X, y)

    def data(self, N=None, dim=None):
        """Return a new random dataset, called for every run.

        :param N: first argument given to
            ``random_binary_classification``, it cannot be None
        :param dim: second argument given to
            ``random_binary_classification``, it cannot be None
        :return: whatever ``random_binary_classification(N, dim)`` returns,
            unpacked as ``X, y`` elsewhere in this class
        """
        assert N is not None
        assert dim is not None
        dataset = random_binary_classification(N, dim)
        return dataset

    def fcts(self, dim=None, **kwargs):
        """Return the functions to benchmark, one dictionary per test.

        Key ``'test'`` holds the label of the test, key ``'fct'`` holds
        either a function ``(X, y)`` or a pair
        ``(preprocessing, function)``.
        """

        def expand(X, y):
            # Polynomial expansion done apart from the partial_fit call.
            return PolynomialFeatures().fit_transform(X), y

        def sgd_only(X, y, model=self.model1):
            return model.partial_fit(X, y)

        def through_pipeline(pipeline):
            # The three pipelines are benchmarked the same way:
            # transform with the first step, partial_fit on the second.
            def run(X, y, model=pipeline):
                features = model.steps[0][1].transform(X)
                return model.steps[1][1].partial_fit(features, y)
            return run

        tests = [{'test': 'SGD-ONLY', 'fct': (expand, sgd_only)}]
        tests.append({'test': 'SGD-SKL',
                      'fct': through_pipeline(self.model2)})
        tests.append({'test': 'SGD-FAST',
                      'fct': through_pipeline(self.model3)})
        tests.append({'test': 'SGD-SLOW',
                      'fct': through_pipeline(self.model4)})
        return tests

    def validate(self, results, **kwargs):
        """Check every result is a triple whose second item is a dictionary
        of options and whose third item is a trained SGDClassifier."""
        for _, options, fitted in results:
            assert isinstance(options, dict)  # test options
            assert isinstance(fitted, SGDClassifier)  # trained model

Benchmark function

@ignore_warnings(category=(FutureWarning, DeprecationWarning))
def run_bench(repeat=100, verbose=False, dims=(5, 10, 50),
              sizes=(10, 100, 1000)):
    """Run the benchmark over a grid of dimensions and dataset sizes.

    :param repeat: forwarded to ``BenchPerf.enumerate_run_benchs``
    :param verbose: forwarded to ``BenchPerf.enumerate_run_benchs``
    :param dims: values of ``dim`` to try, one instance of
        ``PolyBenchPerfTest`` is configured with each of them,
        the default reproduces the historical grid ``[5, 10, 50]``
    :param sizes: values of ``N`` to try, the default reproduces
        the historical grid ``[10, 100, 1000]``
    :return: a :class:`pandas.DataFrame` built from the results
        yielded by the benchmark

    The total elapsed time is printed on the standard output.
    """
    # Lists are built from the parameters so that any iterable is
    # accepted while BenchPerf receives the same types as before.
    pbefore = dict(dim=list(dims))
    pafter = dict(N=list(sizes))
    bp = BenchPerf(pbefore, pafter, PolyBenchPerfTest)

    with sklearn.config_context(assume_finite=True):
        start = time()
        # list(...) forces the whole benchmark to run inside the
        # configuration context and between the two time stamps.
        results = list(bp.enumerate_run_benchs(repeat=repeat, verbose=verbose))
        end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec\n" % (end - start))
    return results_df

Run the benchmark

# Run the benchmark with verbose=True, save the measurements
# as CSV (without the index) and show the first rows.
df = run_bench(verbose=True)
df.to_csv("plot_bench_polynomial_features_partial_fit.perf.csv", index=False)
print(df.head())

Out:

  0%|          | 0/9 [00:00<?, ?it/s]
 11%|#1        | 1/9 [00:01<00:14,  1.78s/it]
 22%|##2       | 2/9 [00:02<00:09,  1.31s/it]
 33%|###3      | 3/9 [00:05<00:10,  1.80s/it]
 44%|####4     | 4/9 [00:06<00:07,  1.43s/it]
 56%|#####5    | 5/9 [00:07<00:06,  1.53s/it]
 67%|######6   | 6/9 [01:12<01:08, 22.87s/it]
 78%|#######7  | 7/9 [01:19<00:35, 17.97s/it]
 89%|########8 | 8/9 [01:39<00:18, 18.42s/it]
 89%|########8 | 8/9 [01:39<00:12, 12.41s/it]
Total time = 99.286 sec

       test    N  dim  repeat  ...     upper  count    median  error_c
0  SGD-ONLY   10    5     100  ...  0.000853    100  0.000767        0
1   SGD-SKL   10    5     100  ...  0.001229    100  0.001110        0
2  SGD-FAST   10    5     100  ...  0.001184    100  0.000954        0
3  SGD-SLOW   10    5     100  ...  0.001873    100  0.001778        0
4  SGD-ONLY  100    5     100  ...  0.000896    100  0.000834        0

[5 rows x 15 columns]

Extract information about the machine used

# Gather information about the machine and the listed packages,
# save it as CSV (without the index) and display it.
pkgs = ['numpy', 'pandas', 'sklearn']
dfi = pandas.DataFrame(machine_information(pkgs))
dfi.to_csv("plot_bench_polynomial_features_partial_fit.time.csv", index=False)
print(dfi)

Out:

                         name  ...                                              value
0                        date  ...                                                NaN
1                      python  ...  3.9.1 (default, Jan 18 2021, 16:35:58) \n[GCC ...
2                    platform  ...                                              linux
3                          OS  ...        Linux-4.19.0-13-amd64-x86_64-with-glibc2.28
4                     machine  ...                                             x86_64
5                   processor  ...
6                     release  ...                                    4.19.0-13-amd64
7                architecture  ...                                       (64bit, ELF)
8                        arch  ...                                             X86_64
9                   brand_raw  ...            Intel(R) Atom(TM) CPU  C2750  @ 2.40GHz
10                      count  ...                                                  8
11                      flags  ...  3dnowprefetch acpi aes aperfmperf apic arat ar...
12              hz_advertised  ...                                    [2400000000, 0]
13         l1_data_cache_size  ...                                              24576
14  l1_instruction_cache_size  ...                                              32768
15     l2_cache_associativity  ...                                                  8
16         l2_cache_line_size  ...                                               1024
17              l2_cache_size  ...                                            1048576
18              l3_cache_size  ...                                            1048576
19                   stepping  ...                                                  8
20                      numpy  ...                               openblas, language=c
21                     pandas  ...                                                NaN
22                    sklearn  ...                                                NaN

[23 rows x 3 columns]

Plot the results

# Show which columns are available, then plot the measurements.
# NOTE(review): presumably one row of graphs per value of N, dim on the
# x axis and one curve per value of 'test' -- confirm against the
# documentation of plot_bench_results.
print(df.columns)
plot_bench_results(df, row_cols='N', col_cols=None,
                   x_value='dim', hue_cols=None,
                   cmp_col_values='test',
                   title="PolynomialFeatures + partial_fit\n"
                   "Benchmark scikit-learn PR13290")
plt.show()
PolynomialFeatures + partial_fit Benchmark scikit-learn PR13290, -

Out:

Index(['test', 'N', 'dim', 'repeat', 'number', 'min', 'max', 'min3', 'max3',
       'mean', 'lower', 'upper', 'count', 'median', 'error_c'],
      dtype='object')

Total running time of the script: ( 1 minutes 43.995 seconds)

Gallery generated by Sphinx-Gallery