{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Uncommon operation with dataframes\n", "\n", "Cheat sheet on uncommand operation with pandas such as reading a big file."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
run previous cell, wait for 2 seconds
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Pointer on notebooks\n", "\n", "* [Rappel de ce que vous savez d\u00e9j\u00e0 mais avez peut-\u00eatre oubli\u00e9](http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx/notebooks/td2_eco_rappels_1a.html)\n", "* [Python pour un Data Scientist / Economiste](http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx/td_2a.html)\n", "* [Exercices Pratiques](http://www.xavierdupre.fr/app/actuariat_python/helpsphinx/i_seances_base.html)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## List of strings into binaries features"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featurestarget
0[a, b, c]0
1[a, b]1
2[c, b]2
\n", "
"], "text/plain": [" features target\n", "0 [a, b, c] 0\n", "1 [a, b] 1\n", "2 [c, b] 2"]}, "execution_count": 3, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas\n", "df = pandas.DataFrame([{\"target\":0, \"features\":[\"a\", \"b\", \"c\"]},\n", " {\"target\":1, \"features\":[\"a\", \"b\"]},\n", " {\"target\":2, \"features\":[\"c\", \"b\"]}])\n", "df"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
0111
1110
2011
\n", "
"], "text/plain": [" a b c\n", "0 1 1 1\n", "1 1 1 0\n", "2 0 1 1"]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["df.features.str.join(\"*\").str.get_dummies(\"*\")"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## Big files\n", "\n", "Let's save some data first."]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["from sklearn.datasets import load_breast_cancer\n", "data = load_breast_cancer()\n", "import pandas\n", "df = pandas.DataFrame(data.data, columns=data.feature_names)\n", "df.to_csv(\"cancer.txt\", sep=\"\\t\", encoding=\"utf-8\", index=False)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### first lines : nrows"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mean radius\tmean texture\tmean perimeter\tmean area\tmean smoothness\tmean compactness\tmean concavity\tmean concave points\tmean symmetry\tmean fractal dimension\tradius error\ttexture error\tperimeter error\tarea error\tsmoothness error\tcompactness error\tconcavity error\tconcave points error\tsymmetry error\tfractal dimension error\tworst radius\tworst texture\tworst perimeter\tworst area\tworst smoothness\tworst compactness\tworst concavity\tworst concave points\tworst symmetry\tworst fractal dimension
017.99\\t10.38\\t122.8\\t1001.0\\t0.1184\\t0.2776\\t0...
120.57\\t17.77\\t132.9\\t1326.0\\t0.08474\\t0.07864\\...
219.69\\t21.25\\t130.0\\t1203.0\\t0.1096\\t0.1599\\t0...
\n", "
"], "text/plain": [" mean radius\\tmean texture\\tmean perimeter\\tmean area\\tmean smoothness\\tmean compactness\\tmean concavity\\tmean concave points\\tmean symmetry\\tmean fractal dimension\\tradius error\\ttexture error\\tperimeter error\\tarea error\\tsmoothness error\\tcompactness error\\tconcavity error\\tconcave points error\\tsymmetry error\\tfractal dimension error\\tworst radius\\tworst texture\\tworst perimeter\\tworst area\\tworst smoothness\\tworst compactness\\tworst concavity\\tworst concave points\\tworst symmetry\\tworst fractal dimension\n", "0 17.99\\t10.38\\t122.8\\t1001.0\\t0.1184\\t0.2776\\t0... \n", "1 20.57\\t17.77\\t132.9\\t1326.0\\t0.08474\\t0.07864\\... \n", "2 19.69\\t21.25\\t130.0\\t1203.0\\t0.1096\\t0.1599\\t0... "]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"cancer.txt\", nrows=3)\n", "df"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
017.9910.38122.81001.00.118400.277600.30010.147100.24190.07871...25.3817.33184.62019.00.16220.66560.71190.26540.46010.11890
120.5717.77132.91326.00.084740.078640.08690.070170.18120.05667...24.9923.41158.81956.00.12380.18660.24160.18600.27500.08902
219.6921.25130.01203.00.109600.159900.19740.127900.20690.05999...23.5725.53152.51709.00.14440.42450.45040.24300.36130.08758
\n", "

3 rows \u00d7 30 columns

\n", "
"], "text/plain": [" mean radius mean texture mean perimeter mean area mean smoothness \\\n", "0 17.99 10.38 122.8 1001.0 0.11840 \n", "1 20.57 17.77 132.9 1326.0 0.08474 \n", "2 19.69 21.25 130.0 1203.0 0.10960 \n", "\n", " mean compactness mean concavity mean concave points mean symmetry \\\n", "0 0.27760 0.3001 0.14710 0.2419 \n", "1 0.07864 0.0869 0.07017 0.1812 \n", "2 0.15990 0.1974 0.12790 0.2069 \n", "\n", " mean fractal dimension ... worst radius \\\n", "0 0.07871 ... 25.38 \n", "1 0.05667 ... 24.99 \n", "2 0.05999 ... 23.57 \n", "\n", " worst texture worst perimeter worst area worst smoothness \\\n", "0 17.33 184.6 2019.0 0.1622 \n", "1 23.41 158.8 1956.0 0.1238 \n", "2 25.53 152.5 1709.0 0.1444 \n", "\n", " worst compactness worst concavity worst concave points worst symmetry \\\n", "0 0.6656 0.7119 0.2654 0.4601 \n", "1 0.1866 0.2416 0.1860 0.2750 \n", "2 0.4245 0.4504 0.2430 0.3613 \n", "\n", " worst fractal dimension \n", "0 0.11890 \n", "1 0.08902 \n", "2 0.08758 \n", "\n", "[3 rows x 30 columns]"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"cancer.txt\", nrows=3, sep=\"\\t\")\n", "df"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### middle lines : nrows + skiprows"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...20212223242526272829
014.42019.7794.48642.50.097520.114100.093880.058390.18790.06390...16.3330.86109.50826.40.14310.30260.31940.15650.27180.09353
113.61024.9888.05582.70.094880.085110.086250.044890.16090.05871...16.9935.27108.60906.50.12650.19430.31690.11840.26510.07397
26.98113.4343.79143.50.117000.075680.000000.000000.19300.07818...7.9319.5450.41185.20.15840.12020.00000.00000.29320.09382
\n", "

3 rows \u00d7 30 columns

\n", "
"], "text/plain": [" 0 1 2 3 4 5 6 7 8 \\\n", "0 14.420 19.77 94.48 642.5 0.09752 0.11410 0.09388 0.05839 0.1879 \n", "1 13.610 24.98 88.05 582.7 0.09488 0.08511 0.08625 0.04489 0.1609 \n", "2 6.981 13.43 43.79 143.5 0.11700 0.07568 0.00000 0.00000 0.1930 \n", "\n", " 9 ... 20 21 22 23 24 25 26 \\\n", "0 0.06390 ... 16.33 30.86 109.50 826.4 0.1431 0.3026 0.3194 \n", "1 0.05871 ... 16.99 35.27 108.60 906.5 0.1265 0.1943 0.3169 \n", "2 0.07818 ... 7.93 19.54 50.41 185.2 0.1584 0.1202 0.0000 \n", "\n", " 27 28 29 \n", "0 0.1565 0.2718 0.09353 \n", "1 0.1184 0.2651 0.07397 \n", "2 0.0000 0.2932 0.09382 \n", "\n", "[3 rows x 30 columns]"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"cancer.txt\", nrows=3, skiprows=100, sep=\"\\t\", header=None)\n", "df"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### big files : iterator"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["0 (3, 30)\n", "1 (3, 30)\n", "2 (3, 30)\n", "3 (3, 30)\n"]}], "source": ["for piece, df in enumerate(pandas.read_csv(\"cancer.txt\", iterator=True, sep=\"\\t\", chunksize=3)):\n", " print(piece, df.shape)\n", " if piece > 2:\n", " break"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### sample on big files : iterator + concat"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/plain": ["(57, 30)"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["samples = []\n", "for df in pandas.read_csv(\"cancer.txt\", iterator=True, sep=\"\\t\", chunksize=30):\n", " sample = df.sample(3)\n", " samples.append(sample)\n", "dfsample = pandas.concat(samples)\n", "dfsample.shape"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0"}}, "nbformat": 4, "nbformat_minor": 2}