{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Uncommon operations with dataframes\n", "\n", "Cheat sheet on uncommon operations with pandas, such as reading a big file."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["<div id=\"my_id_menu_nb\">run previous cell, wait for 2 seconds</div>\n", "<script>\n", "function repeat_indent_string(n){\n", "    var a = \"\" ;\n", "    for ( ; n > 0 ; --n)\n", "        a += \"    \";\n", "    return a;\n", "}\n", "// look up into all sections and builds an automated menu //\n", "var update_menu_string = function(begin, lfirst, llast, sformat, send, keep_item, begin_format, end_format) {\n", "    var anchors = document.getElementsByClassName(\"section\");\n", "    if (anchors.length == 0) {\n", "        anchors = document.getElementsByClassName(\"text_cell_render rendered_html\");\n", "    }\n", "    var i,t;\n", "    var text_menu = begin;\n", "    var text_memo = \"<pre>\\nlength:\" + anchors.length + \"\\n\";\n", "    var ind = \"\";\n", "    var memo_level = 1;\n", "    var href;\n", "    var tags = [];\n", "    var main_item = 0;\n", "    var format_open = 0;\n", "    for (i = 0; i <= llast; i++)\n", "        tags.push(\"h\" + i);\n", "\n", "    for (i = 0; i < anchors.length; i++) {\n", "        text_memo += \"**\" + anchors[i].id + \"--\\n\";\n", "\n", "        var child = null;\n", "        for(t = 0; t < tags.length; t++) {\n", "            var r = anchors[i].getElementsByTagName(tags[t]);\n", "            if (r.length > 0) {\n", "child = r[0];\n", "break;\n", "            }\n", "        }\n", "        if (child == null) {\n", "            text_memo += \"null\\n\";\n", "            continue;\n", "        }\n", "        if (anchors[i].hasAttribute(\"id\")) {\n", "            // when converted in RST\n", "            href = anchors[i].id;\n", "            text_memo += \"#1-\" + href;\n", "            // passer \u00e0 child suivant (le chercher)\n", "     
   }\n", "        else if (child.hasAttribute(\"id\")) {\n", "            // in a notebook\n", "            href = child.id;\n", "            text_memo += \"#2-\" + href;\n", "        }\n", "        else {\n", "            text_memo += \"#3-\" + \"*\" + \"\\n\";\n", "            continue;\n", "        }\n", "        var title = child.textContent;\n", "        var level = parseInt(child.tagName.substring(1,2));\n", "\n", "        text_memo += \"--\" + level + \"?\" + lfirst + \"--\" + title + \"\\n\";\n", "\n", "        if ((level < lfirst) || (level > llast)) {\n", "            continue ;\n", "        }\n", "        if (title.endsWith('\u00b6')) {\n", "            title = title.substring(0,title.length-1).replace(\"<\", \"&lt;\")\n", "         .replace(\">\", \"&gt;\").replace(\"&\", \"&amp;\");\n", "        }\n", "        if (title.length == 0) {\n", "            continue;\n", "        }\n", "\n", "        while (level < memo_level) {\n", "            text_menu += end_format + \"</ul>\\n\";\n", "            format_open -= 1;\n", "            memo_level -= 1;\n", "        }\n", "        if (level == lfirst) {\n", "            main_item += 1;\n", "        }\n", "        if (keep_item != -1 && main_item != keep_item + 1) {\n", "            // alert(main_item + \" - \" + level + \" - \" + keep_item);\n", "            continue;\n", "        }\n", "        while (level > memo_level) {\n", "            text_menu += \"<ul>\\n\";\n", "            memo_level += 1;\n", "        }\n", "        text_menu += repeat_indent_string(level-2);\n", "        text_menu += begin_format + sformat.replace(\"__HREF__\", href).replace(\"__TITLE__\", title);\n", "        format_open += 1;\n", "    }\n", "    while (1 < memo_level) {\n", "        text_menu += end_format + \"</ul>\\n\";\n", "        memo_level -= 1;\n", "        format_open -= 1;\n", "    }\n", "    text_menu += send;\n", "    //text_menu += \"\\n\" + text_memo;\n", "\n", "    while (format_open > 0) {\n", "        text_menu 
+= end_format;\n", "        format_open -= 1;\n", "    }\n", "    return text_menu;\n", "};\n", "var update_menu = function() {\n", "    var sbegin = \"\";\n", "    var sformat = '<a href=\"#__HREF__\">__TITLE__</a>';\n", "    var send = \"\";\n", "    var begin_format = '<li>';\n", "    var end_format = '</li>';\n", "    var keep_item = -1;\n", "    var text_menu = update_menu_string(sbegin, 2, 4, sformat, send, keep_item,\n", "       begin_format, end_format);\n", "    var menu = document.getElementById(\"my_id_menu_nb\");\n", "    menu.innerHTML=text_menu;\n", "};\n", "window.setTimeout(update_menu,2000);\n", "            </script>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Pointer on notebooks\n", "\n", "* [Rappel de ce que vous savez d\u00e9j\u00e0 mais avez peut-\u00eatre oubli\u00e9](http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx/notebooks/td2_eco_rappels_1a.html)\n", "* [Python pour un Data Scientist / Economiste](http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx/td_2a.html)\n", "* [Exercices Pratiques](http://www.xavierdupre.fr/app/actuariat_python/helpsphinx/i_seances_base.html)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## List of strings into binaries features"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      
<th>features</th>\n", "      <th>target</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>[a, b, c]</td>\n", "      <td>0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>[a, b]</td>\n", "      <td>1</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>[c, b]</td>\n", "      <td>2</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["    features  target\n", "0  [a, b, c]       0\n", "1     [a, b]       1\n", "2     [c, b]       2"]}, "execution_count": 3, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas\n", "df = pandas.DataFrame([{\"target\":0, \"features\":[\"a\", \"b\", \"c\"]},\n", "                       {\"target\":1, \"features\":[\"a\", \"b\"]},\n", "                       {\"target\":2, \"features\":[\"c\", \"b\"]}])\n", "df"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>a</th>\n", "      <th>b</th>\n", "      <th>c</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>1</td>\n", "      <td>1</td>\n", "      <td>1</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>1</td>\n", "      <td>1</td>\n", "      <td>0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>0</td>\n", "      <td>1</td>\n", "      <td>1</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   a  b  c\n", "0  1  1  1\n", "1  1  1  0\n", "2  0  1  
1"]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["df.features.str.join(\"*\").str.get_dummies(\"*\")"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## Big files\n", "\n", "Let's save some data first."]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["from sklearn.datasets import load_breast_cancer\n", "data = load_breast_cancer()\n", "import pandas\n", "df = pandas.DataFrame(data.data, columns=data.feature_names)\n", "df.to_csv(\"cancer.txt\", sep=\"\\t\", encoding=\"utf-8\", index=False)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### first lines : nrows"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>mean radius\tmean texture\tmean perimeter\tmean area\tmean smoothness\tmean compactness\tmean concavity\tmean concave points\tmean symmetry\tmean fractal dimension\tradius error\ttexture error\tperimeter error\tarea error\tsmoothness error\tcompactness error\tconcavity error\tconcave points error\tsymmetry error\tfractal dimension error\tworst radius\tworst texture\tworst perimeter\tworst area\tworst smoothness\tworst compactness\tworst concavity\tworst concave points\tworst symmetry\tworst fractal dimension</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>17.99\\t10.38\\t122.8\\t1001.0\\t0.1184\\t0.2776\\t0...</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      
<td>20.57\\t17.77\\t132.9\\t1326.0\\t0.08474\\t0.07864\\...</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>19.69\\t21.25\\t130.0\\t1203.0\\t0.1096\\t0.1599\\t0...</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["  mean radius\\tmean texture\\tmean perimeter\\tmean area\\tmean smoothness\\tmean compactness\\tmean concavity\\tmean concave points\\tmean symmetry\\tmean fractal dimension\\tradius error\\ttexture error\\tperimeter error\\tarea error\\tsmoothness error\\tcompactness error\\tconcavity error\\tconcave points error\\tsymmetry error\\tfractal dimension error\\tworst radius\\tworst texture\\tworst perimeter\\tworst area\\tworst smoothness\\tworst compactness\\tworst concavity\\tworst concave points\\tworst symmetry\\tworst fractal dimension\n", "0  17.99\\t10.38\\t122.8\\t1001.0\\t0.1184\\t0.2776\\t0...                                                                                                                                                                                                                                                                                                                                                                                                                                                                               \n", "1  20.57\\t17.77\\t132.9\\t1326.0\\t0.08474\\t0.07864\\...                                                                                                                                                                                                                                                                                                                                                                                                                                                                               \n", "2  19.69\\t21.25\\t130.0\\t1203.0\\t0.1096\\t0.1599\\t0...                                                                      
                                                                                                                                                                                                                                                                                                                                                                                                         "]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"cancer.txt\", nrows=3)\n", "df"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>mean radius</th>\n", "      <th>mean texture</th>\n", "      <th>mean perimeter</th>\n", "      <th>mean area</th>\n", "      <th>mean smoothness</th>\n", "      <th>mean compactness</th>\n", "      <th>mean concavity</th>\n", "      <th>mean concave points</th>\n", "      <th>mean symmetry</th>\n", "      <th>mean fractal dimension</th>\n", "      <th>...</th>\n", "      <th>worst radius</th>\n", "      <th>worst texture</th>\n", "      <th>worst perimeter</th>\n", "      <th>worst area</th>\n", "      <th>worst smoothness</th>\n", "      <th>worst compactness</th>\n", "      <th>worst concavity</th>\n", "      <th>worst concave points</th>\n", "      <th>worst symmetry</th>\n", "      <th>worst fractal dimension</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>17.99</td>\n", "      <td>10.38</td>\n", "      <td>122.8</td>\n", "      <td>1001.0</td>\n", " 
     <td>0.11840</td>\n", "      <td>0.27760</td>\n", "      <td>0.3001</td>\n", "      <td>0.14710</td>\n", "      <td>0.2419</td>\n", "      <td>0.07871</td>\n", "      <td>...</td>\n", "      <td>25.38</td>\n", "      <td>17.33</td>\n", "      <td>184.6</td>\n", "      <td>2019.0</td>\n", "      <td>0.1622</td>\n", "      <td>0.6656</td>\n", "      <td>0.7119</td>\n", "      <td>0.2654</td>\n", "      <td>0.4601</td>\n", "      <td>0.11890</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>20.57</td>\n", "      <td>17.77</td>\n", "      <td>132.9</td>\n", "      <td>1326.0</td>\n", "      <td>0.08474</td>\n", "      <td>0.07864</td>\n", "      <td>0.0869</td>\n", "      <td>0.07017</td>\n", "      <td>0.1812</td>\n", "      <td>0.05667</td>\n", "      <td>...</td>\n", "      <td>24.99</td>\n", "      <td>23.41</td>\n", "      <td>158.8</td>\n", "      <td>1956.0</td>\n", "      <td>0.1238</td>\n", "      <td>0.1866</td>\n", "      <td>0.2416</td>\n", "      <td>0.1860</td>\n", "      <td>0.2750</td>\n", "      <td>0.08902</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>19.69</td>\n", "      <td>21.25</td>\n", "      <td>130.0</td>\n", "      <td>1203.0</td>\n", "      <td>0.10960</td>\n", "      <td>0.15990</td>\n", "      <td>0.1974</td>\n", "      <td>0.12790</td>\n", "      <td>0.2069</td>\n", "      <td>0.05999</td>\n", "      <td>...</td>\n", "      <td>23.57</td>\n", "      <td>25.53</td>\n", "      <td>152.5</td>\n", "      <td>1709.0</td>\n", "      <td>0.1444</td>\n", "      <td>0.4245</td>\n", "      <td>0.4504</td>\n", "      <td>0.2430</td>\n", "      <td>0.3613</td>\n", "      <td>0.08758</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "<p>3 rows \u00d7 30 columns</p>\n", "</div>"], "text/plain": ["   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \\\n", "0        17.99         10.38           122.8     1001.0          0.11840   \n", "1        20.57         17.77           
132.9     1326.0          0.08474   \n", "2        19.69         21.25           130.0     1203.0          0.10960   \n", "\n", "   mean compactness  mean concavity  mean concave points  mean symmetry  \\\n", "0           0.27760          0.3001              0.14710         0.2419   \n", "1           0.07864          0.0869              0.07017         0.1812   \n", "2           0.15990          0.1974              0.12790         0.2069   \n", "\n", "   mean fractal dimension           ...             worst radius  \\\n", "0                 0.07871           ...                    25.38   \n", "1                 0.05667           ...                    24.99   \n", "2                 0.05999           ...                    23.57   \n", "\n", "   worst texture  worst perimeter  worst area  worst smoothness  \\\n", "0          17.33            184.6      2019.0            0.1622   \n", "1          23.41            158.8      1956.0            0.1238   \n", "2          25.53            152.5      1709.0            0.1444   \n", "\n", "   worst compactness  worst concavity  worst concave points  worst symmetry  \\\n", "0             0.6656           0.7119                0.2654          0.4601   \n", "1             0.1866           0.2416                0.1860          0.2750   \n", "2             0.4245           0.4504                0.2430          0.3613   \n", "\n", "   worst fractal dimension  \n", "0                  0.11890  \n", "1                  0.08902  \n", "2                  0.08758  \n", "\n", "[3 rows x 30 columns]"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"cancer.txt\", nrows=3, sep=\"\\t\")\n", "df"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### middle lines : nrows + skiprows"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style scoped>\n", "    .dataframe tbody tr th:only-of-type {\n", "        
vertical-align: middle;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: right;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>0</th>\n", "      <th>1</th>\n", "      <th>2</th>\n", "      <th>3</th>\n", "      <th>4</th>\n", "      <th>5</th>\n", "      <th>6</th>\n", "      <th>7</th>\n", "      <th>8</th>\n", "      <th>9</th>\n", "      <th>...</th>\n", "      <th>20</th>\n", "      <th>21</th>\n", "      <th>22</th>\n", "      <th>23</th>\n", "      <th>24</th>\n", "      <th>25</th>\n", "      <th>26</th>\n", "      <th>27</th>\n", "      <th>28</th>\n", "      <th>29</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>14.420</td>\n", "      <td>19.77</td>\n", "      <td>94.48</td>\n", "      <td>642.5</td>\n", "      <td>0.09752</td>\n", "      <td>0.11410</td>\n", "      <td>0.09388</td>\n", "      <td>0.05839</td>\n", "      <td>0.1879</td>\n", "      <td>0.06390</td>\n", "      <td>...</td>\n", "      <td>16.33</td>\n", "      <td>30.86</td>\n", "      <td>109.50</td>\n", "      <td>826.4</td>\n", "      <td>0.1431</td>\n", "      <td>0.3026</td>\n", "      <td>0.3194</td>\n", "      <td>0.1565</td>\n", "      <td>0.2718</td>\n", "      <td>0.09353</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>13.610</td>\n", "      <td>24.98</td>\n", "      <td>88.05</td>\n", "      <td>582.7</td>\n", "      <td>0.09488</td>\n", "      <td>0.08511</td>\n", "      <td>0.08625</td>\n", "      <td>0.04489</td>\n", "      <td>0.1609</td>\n", "      <td>0.05871</td>\n", "      <td>...</td>\n", "      <td>16.99</td>\n", "      <td>35.27</td>\n", "      <td>108.60</td>\n", "      <td>906.5</td>\n", "      <td>0.1265</td>\n", "      <td>0.1943</td>\n", "      <td>0.3169</td>\n", "   
   <td>0.1184</td>\n", "      <td>0.2651</td>\n", "      <td>0.07397</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>6.981</td>\n", "      <td>13.43</td>\n", "      <td>43.79</td>\n", "      <td>143.5</td>\n", "      <td>0.11700</td>\n", "      <td>0.07568</td>\n", "      <td>0.00000</td>\n", "      <td>0.00000</td>\n", "      <td>0.1930</td>\n", "      <td>0.07818</td>\n", "      <td>...</td>\n", "      <td>7.93</td>\n", "      <td>19.54</td>\n", "      <td>50.41</td>\n", "      <td>185.2</td>\n", "      <td>0.1584</td>\n", "      <td>0.1202</td>\n", "      <td>0.0000</td>\n", "      <td>0.0000</td>\n", "      <td>0.2932</td>\n", "      <td>0.09382</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "<p>3 rows \u00d7 30 columns</p>\n", "</div>"], "text/plain": ["       0      1      2      3        4        5        6        7       8   \\\n", "0  14.420  19.77  94.48  642.5  0.09752  0.11410  0.09388  0.05839  0.1879   \n", "1  13.610  24.98  88.05  582.7  0.09488  0.08511  0.08625  0.04489  0.1609   \n", "2   6.981  13.43  43.79  143.5  0.11700  0.07568  0.00000  0.00000  0.1930   \n", "\n", "        9    ...        20     21      22     23      24      25      26  \\\n", "0  0.06390   ...     16.33  30.86  109.50  826.4  0.1431  0.3026  0.3194   \n", "1  0.05871   ...     16.99  35.27  108.60  906.5  0.1265  0.1943  0.3169   \n", "2  0.07818   ...      
7.93  19.54   50.41  185.2  0.1584  0.1202  0.0000   \n", "\n", "       27      28       29  \n", "0  0.1565  0.2718  0.09353  \n", "1  0.1184  0.2651  0.07397  \n", "2  0.0000  0.2932  0.09382  \n", "\n", "[3 rows x 30 columns]"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["# skiprows=100 also skips the header line, so header=None keeps the first row read as data\n", "df = pandas.read_csv(\"cancer.txt\", nrows=3, skiprows=100, sep=\"\\t\", header=None)\n", "df"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### big files : iterator"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["0 (3, 30)\n", "1 (3, 30)\n", "2 (3, 30)\n", "3 (3, 30)\n"]}], "source": ["# the loop stops before the end of the file, so the reader is closed explicitly\n", "# instead of leaving the file handle open until garbage collection\n", "reader = pandas.read_csv(\"cancer.txt\", iterator=True, sep=\"\\t\", chunksize=3)\n", "try:\n", "    for piece, chunk in enumerate(reader):\n", "        print(piece, chunk.shape)\n", "        if piece > 2:\n", "            break\n", "finally:\n", "    reader.close()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### sample on big files : iterator + concat"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/plain": ["(57, 30)"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["samples = []\n", "reader = pandas.read_csv(\"cancer.txt\", iterator=True, sep=\"\\t\", chunksize=30)\n", "for piece, chunk in enumerate(reader):\n", "    # the last chunk may hold fewer rows than requested, sample(n) would raise;\n", "    # a per-chunk seed makes the sample reproducible across runs\n", "    sample = chunk.sample(min(3, len(chunk)), random_state=piece)\n", "    samples.append(sample)\n", "dfsample = pandas.concat(samples)\n", "dfsample.shape"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0"}}, "nbformat": 4, "nbformat_minor": 2}