{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Faster Polynomial Features"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
run previous cell, wait for 2 seconds
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["%matplotlib inline"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Polynomial Features\n", "\n", "The current implementation of [PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) (0.20.2) implements a term by term product for each pair $X_i, X_j$ of features where $i \\leqslant j$ which is not the most efficient way to do it."]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["import numpy.random\n", "X = numpy.random.random((100, 5))"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"data": {"text/plain": ["['1',\n", " 'x0',\n", " 'x1',\n", " 'x2',\n", " 'x3',\n", " 'x4',\n", " 'x0^2',\n", " 'x0 x1',\n", " 'x0 x2',\n", " 'x0 x3',\n", " 'x0 x4',\n", " 'x1^2',\n", " 'x1 x2',\n", " 'x1 x3',\n", " 'x1 x4',\n", " 'x2^2',\n", " 'x2 x3',\n", " 'x2 x4',\n", " 'x3^2',\n", " 'x3 x4',\n", " 'x4^2']"]}, "execution_count": 5, "metadata": {}, "output_type": "execute_result"}], "source": ["from sklearn.preprocessing import PolynomialFeatures\n", "poly = PolynomialFeatures(degree=2)\n", "Xpoly = poly.fit_transform(X)\n", "poly.get_feature_names()"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["114 \u00b5s \u00b1 12.4 \u00b5s per loop (mean \u00b1 std. dev. 
of 7 runs, 10000 loops each)\n"]}], "source": ["%timeit poly.transform(X)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["The class [ExtendedFeatures](http://www.xavierdupre.fr/app/mlinsights/helpsphinx/mlinsights/mlmodel/extended_features.html) implements a different way to compute the polynomial features as it tries to reduce the number of calls to numpy by using broadcasted vector multiplications."]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/plain": ["['1',\n", " 'x0',\n", " 'x1',\n", " 'x2',\n", " 'x3',\n", " 'x4',\n", " 'x0^2',\n", " 'x0 x1',\n", " 'x0 x2',\n", " 'x0 x3',\n", " 'x0 x4',\n", " 'x1^2',\n", " 'x1 x2',\n", " 'x1 x3',\n", " 'x1 x4',\n", " 'x2^2',\n", " 'x2 x3',\n", " 'x2 x4',\n", " 'x3^2',\n", " 'x3 x4',\n", " 'x4^2']"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["from mlinsights.mlmodel import ExtendedFeatures\n", "ext = ExtendedFeatures(poly_degree=2)\n", "Xpoly = ext.fit_transform(X)\n", "ext.get_feature_names()"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["68.7 \u00b5s \u00b1 10.6 \u00b5s per loop (mean \u00b1 std. dev. of 7 runs, 10000 loops each)\n"]}], "source": ["%timeit ext.transform(X)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Comparison with 5 features"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": ["from cpyquickhelper.numbers import measure_time"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
averagedeviationmin_execmax_execrepeatnumbercontext_sizenamesize
630.0378300.0055770.0312480.044832510240ext+fit100000
640.0726710.0053600.0675590.082539510240poly200000
650.0757120.0182710.0604760.100143510240ext200000
660.1067550.0198610.0798800.139184510240poly+fit200000
670.0740900.0091420.0639250.085899510240ext+fit200000
\n", "
"], "text/plain": [" average deviation min_exec max_exec repeat number context_size \\\n", "63 0.037830 0.005577 0.031248 0.044832 5 10 240 \n", "64 0.072671 0.005360 0.067559 0.082539 5 10 240 \n", "65 0.075712 0.018271 0.060476 0.100143 5 10 240 \n", "66 0.106755 0.019861 0.079880 0.139184 5 10 240 \n", "67 0.074090 0.009142 0.063925 0.085899 5 10 240 \n", "\n", " name size \n", "63 ext+fit 100000 \n", "64 poly 200000 \n", "65 ext 200000 \n", "66 poly+fit 200000 \n", "67 ext+fit 200000 "]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["res = []\n", "for n in [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, \n", " 5000, 10000, 20000, 50000, 100000, 200000]:\n", " X = numpy.random.random((n, 5))\n", " poly.fit(X)\n", " ext.fit(X)\n", " r1 = measure_time(\"poly.transform(X)\", context=dict(X=X, poly=poly), repeat=5, number=10, div_by_number=True)\n", " r2 = measure_time(\"ext.transform(X)\", context=dict(X=X, ext=ext), repeat=5, number=10, div_by_number=True)\n", " r3 = measure_time(\"poly.fit_transform(X)\", context=dict(X=X, poly=poly), repeat=5, number=10, div_by_number=True)\n", " r4 = measure_time(\"ext.fit_transform(X)\", context=dict(X=X, ext=ext), repeat=5, number=10, div_by_number=True)\n", " r1[\"name\"] = \"poly\"\n", " r2[\"name\"] = \"ext\"\n", " r3[\"name\"] = \"poly+fit\"\n", " r4[\"name\"] = \"ext+fit\"\n", " r1[\"size\"] = n\n", " r2[\"size\"] = n\n", " r3[\"size\"] = n\n", " r4[\"size\"] = n\n", " res.append(r1)\n", " res.append(r2)\n", " res.append(r3)\n", " res.append(r4)\n", " \n", "import pandas\n", "df = pandas.DataFrame(res)\n", "df.tail()"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameextext+fitpolypoly+fit
size
10.0000680.0004020.0002380.000275
20.0000660.0001560.0001660.000213
50.0000310.0004270.0001650.000196
100.0000480.0002370.0001340.000306
200.0000700.0001880.0001090.000153
\n", "
"], "text/plain": ["name ext ext+fit poly poly+fit\n", "size \n", "1 0.000068 0.000402 0.000238 0.000275\n", "2 0.000066 0.000156 0.000166 0.000213\n", "5 0.000031 0.000427 0.000165 0.000196\n", "10 0.000048 0.000237 0.000134 0.000306\n", "20 0.000070 0.000188 0.000109 0.000153"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["piv = df.pivot(\"size\", \"name\", \"average\")\n", "piv[:5]"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["
"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["ax = piv.plot(logy=True, logx=True)\n", "ax.set_title(\"Polynomial Features for 5 features\\ndegree=2\")\n", "ax.set_ylabel(\"seconds\")\n", "ax.set_xlabel(\"number of observations\");"]}, {"cell_type": "markdown", "metadata": {}, "source": ["The gain is mostly visible for small dimensions."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Comparison with 1000 observations\n", "\n", "In this experiment, the number of observations is fixed to 1000 but the number of features varies."]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
averagedeviationmin_execmax_execrepeatnumbercontext_sizenamenfeatnumf
370.0093310.0016030.0082800.012519530240ext40861
380.0226190.0028680.0187930.026324530240extslow40861
390.0131880.0003700.0128280.013888530240poly501326
400.0128170.0001020.0127000.012951530240ext501326
410.0303840.0007170.0299550.031813530240extslow501326
\n", "
"], "text/plain": [" average deviation min_exec max_exec repeat number context_size \\\n", "37 0.009331 0.001603 0.008280 0.012519 5 30 240 \n", "38 0.022619 0.002868 0.018793 0.026324 5 30 240 \n", "39 0.013188 0.000370 0.012828 0.013888 5 30 240 \n", "40 0.012817 0.000102 0.012700 0.012951 5 30 240 \n", "41 0.030384 0.000717 0.029955 0.031813 5 30 240 \n", "\n", " name nfeat numf \n", "37 ext 40 861 \n", "38 extslow 40 861 \n", "39 poly 50 1326 \n", "40 ext 50 1326 \n", "41 extslow 50 1326 "]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["poly = PolynomialFeatures(degree=2)\n", "ext = ExtendedFeatures(poly_degree=2)\n", "# implementation of PolynomialFeatures in 0.20.2\n", "extslow = ExtendedFeatures(poly_degree=2, kind=\"poly-slow\") \n", "\n", "\n", "res = []\n", "for n in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 40, 50]:\n", " X = numpy.random.random((1000, n))\n", " poly.fit(X)\n", " ext.fit(X)\n", " extslow.fit(X)\n", " r1 = measure_time(\"poly.transform(X)\", context=dict(X=X, poly=poly), repeat=5, number=30, div_by_number=True)\n", " r2 = measure_time(\"ext.transform(X)\", context=dict(X=X, ext=ext), repeat=5, number=30, div_by_number=True)\n", " r3 = measure_time(\"extslow.transform(X)\", context=dict(X=X, extslow=extslow), repeat=5, number=30, div_by_number=True)\n", " r1[\"name\"] = \"poly\"\n", " r2[\"name\"] = \"ext\"\n", " r3[\"name\"] = \"extslow\"\n", " r1[\"nfeat\"] = n\n", " r2[\"nfeat\"] = n\n", " r3[\"nfeat\"] = n\n", " x1 = poly.transform(X)\n", " x2 = ext.transform(X)\n", " x3 = extslow.transform(X)\n", " r1[\"numf\"] = x1.shape[1]\n", " r2[\"numf\"] = x2.shape[1]\n", " r3[\"numf\"] = x3.shape[1]\n", " res.append(r1)\n", " res.append(r2)\n", " res.append(r3)\n", " \n", "import pandas\n", "df = pandas.DataFrame(res)\n", "df.tail()"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameextextslowpoly
nfeat
10.0000260.0000590.000152
20.0000550.0001000.000113
30.0001610.0003810.000237
40.0001480.0002210.000219
50.0001850.0003400.000236
\n", "
"], "text/plain": ["name ext extslow poly\n", "nfeat \n", "1 0.000026 0.000059 0.000152\n", "2 0.000055 0.000100 0.000113\n", "3 0.000161 0.000381 0.000237\n", "4 0.000148 0.000221 0.000219\n", "5 0.000185 0.000340 0.000236"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["piv = df.pivot(\"nfeat\", \"name\", \"average\")\n", "piv[:5]"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["
"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["ax = piv.plot(logy=True, logx=True)\n", "ax.set_title(\"Polynomial Features for 1000 observations\\ndegree=2\")\n", "ax.set_ylabel(\"seconds\")\n", "ax.set_xlabel(\"number of features\");"]}, {"cell_type": "markdown", "metadata": {}, "source": ["It is twice as fast."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Comparison for different degrees\n", "\n", "In this experiment, the numbers of observations and features are fixed while the degree increases."]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
averagedeviationmin_execmax_execrepeatnumbercontext_sizenamedegreenumf
90.0019600.0000670.0019150.002094530240ext6210
100.0031310.0001180.0030090.003327530240poly7330
110.0030760.0002330.0028450.003393530240ext7330
120.0042990.0000460.0042430.004367530240poly8495
130.0041570.0000350.0041140.004217530240ext8495
\n", "
"], "text/plain": [" average deviation min_exec max_exec repeat number context_size \\\n", "9 0.001960 0.000067 0.001915 0.002094 5 30 240 \n", "10 0.003131 0.000118 0.003009 0.003327 5 30 240 \n", "11 0.003076 0.000233 0.002845 0.003393 5 30 240 \n", "12 0.004299 0.000046 0.004243 0.004367 5 30 240 \n", "13 0.004157 0.000035 0.004114 0.004217 5 30 240 \n", "\n", " name degree numf \n", "9 ext 6 210 \n", "10 poly 7 330 \n", "11 ext 7 330 \n", "12 poly 8 495 \n", "13 ext 8 495 "]}, "execution_count": 16, "metadata": {}, "output_type": "execute_result"}], "source": ["res = []\n", "for n in [2, 3, 4, 5, 6, 7, 8]:\n", " X = numpy.random.random((1000, 4))\n", " poly = PolynomialFeatures(degree=n)\n", " ext = ExtendedFeatures(poly_degree=n)\n", " poly.fit(X)\n", " ext.fit(X)\n", " r1 = measure_time(\"poly.transform(X)\", context=dict(X=X, poly=poly), repeat=5, number=30, div_by_number=True)\n", " r2 = measure_time(\"ext.transform(X)\", context=dict(X=X, ext=ext), repeat=5, number=30, div_by_number=True)\n", " r1[\"name\"] = \"poly\"\n", " r2[\"name\"] = \"ext\"\n", " r1[\"degree\"] = n\n", " r2[\"degree\"] = n\n", " x1 = poly.transform(X)\n", " x2 = ext.transform(X)\n", " r1[\"numf\"] = x1.shape[1]\n", " r2[\"numf\"] = x2.shape[1]\n", " res.append(r1)\n", " res.append(r2)\n", " \n", "import pandas\n", "df = pandas.DataFrame(res)\n", "df.tail()"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameextpoly
degree
20.0001400.000312
30.0003040.000363
40.0005060.000579
50.0007150.000789
60.0019600.002032
\n", "
"], "text/plain": ["name ext poly\n", "degree \n", "2 0.000140 0.000312\n", "3 0.000304 0.000363\n", "4 0.000506 0.000579\n", "5 0.000715 0.000789\n", "6 0.001960 0.002032"]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["piv = df.pivot(\"degree\", \"name\", \"average\")\n", "piv[:5]"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["
"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["ax = piv.plot(logy=True, logx=True)\n", "ax.set_title(\"Polynomial Features for 1000 observations\\nnumber of features is 4\")\n", "ax.set_ylabel(\"seconds\")\n", "ax.set_xlabel(\"degree\");"]}, {"cell_type": "markdown", "metadata": {}, "source": ["It is worth transposing."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Same experiment with interaction_only=True"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
averagedeviationmin_execmax_execrepeatnumbercontext_sizenamesize
290.0106910.0000730.0106180.010764230240ext50000
300.0266120.0007940.0258170.027406230240poly100000
310.0250520.0015830.0234690.026635230240ext100000
320.0587720.0013450.0574270.060118230240poly200000
330.0547710.0045550.0502160.059327230240ext200000
\n", "
"], "text/plain": [" average deviation min_exec max_exec repeat number context_size \\\n", "29 0.010691 0.000073 0.010618 0.010764 2 30 240 \n", "30 0.026612 0.000794 0.025817 0.027406 2 30 240 \n", "31 0.025052 0.001583 0.023469 0.026635 2 30 240 \n", "32 0.058772 0.001345 0.057427 0.060118 2 30 240 \n", "33 0.054771 0.004555 0.050216 0.059327 2 30 240 \n", "\n", " name size \n", "29 ext 50000 \n", "30 poly 100000 \n", "31 ext 100000 \n", "32 poly 200000 \n", "33 ext 200000 "]}, "execution_count": 19, "metadata": {}, "output_type": "execute_result"}], "source": ["res = []\n", "for n in [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, \n", " 5000, 10000, 20000, 50000, 100000, 200000]:\n", " poly = PolynomialFeatures(degree=2, interaction_only=True)\n", " ext = ExtendedFeatures(poly_degree=2, poly_interaction_only=True)\n", " X = numpy.random.random((n, 5))\n", " poly.fit(X)\n", " ext.fit(X)\n", " r1 = measure_time(\"poly.transform(X)\", context=dict(X=X, poly=poly), repeat=2, number=30, div_by_number=True)\n", " r2 = measure_time(\"ext.transform(X)\", context=dict(X=X, ext=ext), repeat=2, number=30, div_by_number=True)\n", " r1[\"name\"] = \"poly\"\n", " r2[\"name\"] = \"ext\"\n", " r1[\"size\"] = n\n", " r2[\"size\"] = n\n", " res.append(r1)\n", " res.append(r2)\n", " \n", "import pandas\n", "df = pandas.DataFrame(res)\n", "df.tail()"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameextpoly
size
10.0000420.000086
20.0000340.000104
50.0000680.000089
100.0000320.000092
200.0000400.000103
\n", "
"], "text/plain": ["name ext poly\n", "size \n", "1 0.000042 0.000086\n", "2 0.000034 0.000104\n", "5 0.000068 0.000089\n", "10 0.000032 0.000092\n", "20 0.000040 0.000103"]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["piv = df.pivot(\"size\", \"name\", \"average\")\n", "piv[:5]"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["
"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["ax = piv.plot(logy=True, logx=True)\n", "ax.set_title(\"Polynomial Features for 5 features\\ndegree is 2 + interaction_only=True\")\n", "ax.set_ylabel(\"seconds\")\n", "ax.set_xlabel(\"N obs\");"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Memory profiler"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [{"data": {"text/plain": ["258.02734375"]}, "execution_count": 22, "metadata": {}, "output_type": "execute_result"}], "source": ["from memory_profiler import memory_usage\n", "poly = PolynomialFeatures(degree=2, interaction_only=True)\n", "poly.fit(X)\n", "memory_usage((poly.transform, (X,)), interval=0.1, max_usage=True)"]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["10000\n", "50000\n", "100000\n", "200000\n"]}, {"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
memorynamesize
3699.679688ext50000
41243.664062poly100000
51205.515625ext100000
61952.316406poly200000
72029.765625ext200000
\n", "
"], "text/plain": [" memory name size\n", "3 699.679688 ext 50000\n", "4 1243.664062 poly 100000\n", "5 1205.515625 ext 100000\n", "6 1952.316406 poly 200000\n", "7 2029.765625 ext 200000"]}, "execution_count": 23, "metadata": {}, "output_type": "execute_result"}], "source": ["def pick_value(v):\n", " try:\n", " return v[0]\n", " except TypeError:\n", " return v\n", "\n", "res = []\n", "for n in [10000, 50000, 100000, 200000]:\n", " X = numpy.random.random((n, 50))\n", " print(n)\n", " poly = PolynomialFeatures(degree=2, interaction_only=True)\n", " ext = ExtendedFeatures(poly_degree=2, poly_interaction_only=True)\n", " poly.fit(X)\n", " ext.fit(X)\n", " r1 = memory_usage((poly.transform, (X,)), interval=0.1, max_usage=True)\n", " r2 = memory_usage((ext.transform, (X,)), interval=0.1, max_usage=True)\n", " r1 = {\"memory\": pick_value(r1)}\n", " r2 = {\"memory\": pick_value(r2)}\n", " r1[\"name\"] = \"poly\"\n", " r2[\"name\"] = \"ext\"\n", " r1[\"size\"] = n\n", " r2[\"size\"] = n\n", " res.append(r1)\n", " res.append(r2)\n", " \n", "import pandas\n", "df = pandas.DataFrame(res)\n", "df.tail()"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameextpoly
size
10000392.445312396.347656
50000699.679688718.839844
1000001205.5156251243.664062
2000002029.7656251952.316406
\n", "
"], "text/plain": ["name ext poly\n", "size \n", "10000 392.445312 396.347656\n", "50000 699.679688 718.839844\n", "100000 1205.515625 1243.664062\n", "200000 2029.765625 1952.316406"]}, "execution_count": 24, "metadata": {}, "output_type": "execute_result"}], "source": ["piv = df.pivot(\"size\", \"name\", \"memory\")\n", "piv[:5]"]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [{"data": {"image/png": "\n", "text/plain": ["
"]}, "metadata": {"needs_background": "light"}, "output_type": "display_data"}], "source": ["ax = piv.plot(logy=True, logx=True)\n", "ax.set_title(\"Polynomial Features for 50 features\\ndegree is 2 - memory\")\n", "ax.set_ylabel(\"Mb\")\n", "ax.set_xlabel(\"N obs\");"]}, {"cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": []}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2"}}, "nbformat": 4, "nbformat_minor": 2}