{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# First steps with pandas_streaming\n", "\n", "A few differences between [pandas](http://pandas.pydata.org/) and *pandas_streaming*."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["<div id=\"my_id_menu_nb\">run previous cell, wait for 2 seconds</div>\n", "<script>\n", "function repeat_indent_string(n){\n", "    var a = \"\" ;\n", "    for ( ; n > 0 ; --n)\n", "        a += \"    \";\n", "    return a;\n", "}\n", "var update_menu_string = function(begin, lfirst, llast, sformat, send, keep_item, begin_format, end_format) {\n", "    var anchors = document.getElementsByClassName(\"section\");\n", "    if (anchors.length == 0) {\n", "        anchors = document.getElementsByClassName(\"text_cell_render rendered_html\");\n", "    }\n", "    var i,t;\n", "    var text_menu = begin;\n", "    var text_memo = \"<pre>\\nlength:\" + anchors.length + \"\\n\";\n", "    var ind = \"\";\n", "    var memo_level = 1;\n", "    var href;\n", "    var tags = [];\n", "    var main_item = 0;\n", "    var format_open = 0;\n", "    for (i = 0; i <= llast; i++)\n", "        tags.push(\"h\" + i);\n", "\n", "    for (i = 0; i < anchors.length; i++) {\n", "        text_memo += \"**\" + anchors[i].id + \"--\\n\";\n", "\n", "        var child = null;\n", "        for(t = 0; t < tags.length; t++) {\n", "            var r = anchors[i].getElementsByTagName(tags[t]);\n", "            if (r.length > 0) {\n", "child = r[0];\n", "break;\n", "            }\n", "        }\n", "        if (child == null) {\n", "            text_memo += \"null\\n\";\n", "            continue;\n", "        }\n", "        if (anchors[i].hasAttribute(\"id\")) {\n", "            // when converted in RST\n", "            href = anchors[i].id;\n", "            text_memo += \"#1-\" + href;\n", "            // passer \u00e0 child suivant (le chercher)\n", "        }\n", "        else if (child.hasAttribute(\"id\")) 
{\n", "            // in a notebook\n", "            href = child.id;\n", "            text_memo += \"#2-\" + href;\n", "        }\n", "        else {\n", "            text_memo += \"#3-\" + \"*\" + \"\\n\";\n", "            continue;\n", "        }\n", "        var title = child.textContent;\n", "        var level = parseInt(child.tagName.substring(1,2));\n", "\n", "        text_memo += \"--\" + level + \"?\" + lfirst + \"--\" + title + \"\\n\";\n", "\n", "        if ((level < lfirst) || (level > llast)) {\n", "            continue ;\n", "        }\n", "        if (title.endsWith('\u00b6')) {\n", "            title = title.substring(0,title.length-1).replace(\"<\", \"&lt;\")\n", "         .replace(\">\", \"&gt;\").replace(\"&\", \"&amp;\");\n", "        }\n", "        if (title.length == 0) {\n", "            continue;\n", "        }\n", "\n", "        while (level < memo_level) {\n", "            text_menu += end_format + \"</ul>\\n\";\n", "            format_open -= 1;\n", "            memo_level -= 1;\n", "        }\n", "        if (level == lfirst) {\n", "            main_item += 1;\n", "        }\n", "        if (keep_item != -1 && main_item != keep_item + 1) {\n", "            // alert(main_item + \" - \" + level + \" - \" + keep_item);\n", "            continue;\n", "        }\n", "        while (level > memo_level) {\n", "            text_menu += \"<ul>\\n\";\n", "            memo_level += 1;\n", "        }\n", "        text_menu += repeat_indent_string(level-2);\n", "        text_menu += begin_format + sformat.replace(\"__HREF__\", href).replace(\"__TITLE__\", title);\n", "        format_open += 1;\n", "    }\n", "    while (1 < memo_level) {\n", "        text_menu += end_format + \"</ul>\\n\";\n", "        memo_level -= 1;\n", "        format_open -= 1;\n", "    }\n", "    text_menu += send;\n", "    //text_menu += \"\\n\" + text_memo;\n", "\n", "    while (format_open > 0) {\n", "        text_menu += end_format;\n", "        format_open -= 1;\n", "    
}\n", "    return text_menu;\n", "};\n", "var update_menu = function() {\n", "    var sbegin = \"\";\n", "    var sformat = '<a href=\"#__HREF__\">__TITLE__</a>';\n", "    var send = \"\";\n", "    var begin_format = '<li>';\n", "    var end_format = '</li>';\n", "    var keep_item = -1;\n", "    var text_menu = update_menu_string(sbegin, 2, 4, sformat, send, keep_item,\n", "       begin_format, end_format);\n", "    var menu = document.getElementById(\"my_id_menu_nb\");\n", "    menu.innerHTML=text_menu;\n", "};\n", "window.setTimeout(update_menu,2000);\n", "            </script>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## pandas to pandas_streaming"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style>\n", "    .dataframe thead tr:only-child th {\n", "        text-align: right;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: left;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>X</th>\n", "      <th>Y</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["     X  Y\n", "0  4.5  a\n", "1  6.0  b\n", "2  7.0  c"]}, "execution_count": 3, "metadata": {}, "output_type": "execute_result"}], "source": ["from pandas 
import DataFrame\n", "df = DataFrame(data=dict(X=[4.5, 6, 7], Y=[\"a\", \"b\", \"c\"]))\n", "df"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["We create a streaming dataframe:"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"text/plain": ["<pandas_streaming.df.dataframe.StreamingDataFrame at 0x15c2c606160>"]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["from pandas_streaming.df import StreamingDataFrame\n", "sdf = StreamingDataFrame.read_df(df)\n", "sdf"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style>\n", "    .dataframe thead tr:only-child th {\n", "        text-align: right;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: left;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>X</th>\n", "      <th>Y</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["     X  Y\n", "0  4.5  a\n", "1  6.0  b\n", "2  7.0  c"]}, "execution_count": 5, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf.to_dataframe()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Internally, StreamingDataFrame implements an iterator on dataframes and then tries to replicate the same interface as [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) wherever it is possible to 
manipulate data without loading everything into memory."]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style>\n", "    .dataframe thead tr:only-child th {\n", "        text-align: right;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: left;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>X</th>\n", "      <th>Y</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "    </tr>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["     X  Y\n", "0  4.5  a\n", "1  6.0  b\n", "2  7.0  c\n", "0  4.5  a\n", "1  6.0  b\n", "2  7.0  c"]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf2 = sdf.concat(sdf)\n", "sdf2.to_dataframe()"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style>\n", "    .dataframe thead tr:only-child th {\n", "        text-align: right;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: left;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr 
style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>Y</th>\n", "      <th>Z</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>a</td>\n", "      <td>10</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>b</td>\n", "      <td>20</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   Y   Z\n", "0  a  10\n", "1  b  20"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["m = DataFrame(dict(Y=[\"a\", \"b\"], Z=[10, 20]))\n", "m"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style>\n", "    .dataframe thead tr:only-child th {\n", "        text-align: right;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: left;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>X</th>\n", "      <th>Y</th>\n", "      <th>Z</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "      <td>10.0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "      <td>20.0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "      <td>NaN</td>\n", "    </tr>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "      <td>10.0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "      <td>20.0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "      <td>NaN</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["     X  
Y     Z\n", "0  4.5  a  10.0\n", "1  6.0  b  20.0\n", "2  7.0  c   NaN\n", "0  4.5  a  10.0\n", "1  6.0  b  20.0\n", "2  7.0  c   NaN"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf3 = sdf2.merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")\n", "sdf3.to_dataframe()"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style>\n", "    .dataframe thead tr:only-child th {\n", "        text-align: right;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: left;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>X</th>\n", "      <th>Y</th>\n", "      <th>Z</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "      <td>10.0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "      <td>10.0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "      <td>20.0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "      <td>20.0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "      <td>NaN</td>\n", "    </tr>\n", "    <tr>\n", "      <th>5</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "      <td>NaN</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["     X  Y     Z\n", "0  4.5  a  10.0\n", "1  4.5  a  10.0\n", "2  6.0  b  20.0\n", "3  6.0  b  20.0\n", "4  7.0  c   NaN\n", "5  7.0  c   NaN"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf2.to_dataframe().merge(m, 
left_on=\"Y\", right_on=\"Y\", how=\"outer\")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["The order of the rows (and the index) might differ between the streaming merge and the pandas merge."]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style>\n", "    .dataframe thead tr:only-child th {\n", "        text-align: right;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: left;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>X</th>\n", "      <th>Y</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>4.5</td>\n", "      <td>a</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["     X  Y\n", "0  4.5  a\n", "1  4.5  a"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["sdftr, sdfte = sdf2.train_test_split(test_size=0.5)\n", "sdfte.head()"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<style>\n", "    .dataframe thead tr:only-child th {\n", "        text-align: right;\n", "    }\n", "\n", "    .dataframe thead th {\n", "        text-align: left;\n", "    }\n", "\n", "    .dataframe tbody tr th {\n", "        vertical-align: top;\n", "    }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>X</th>\n", "      <th>Y</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "    </tr>\n", "    
<tr>\n", "      <th>2</th>\n", "      <td>6.0</td>\n", "      <td>b</td>\n", "    </tr>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>7.0</td>\n", "      <td>c</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["     X  Y\n", "0  6.0  b\n", "1  7.0  c\n", "2  6.0  b\n", "0  7.0  c"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["sdftr.head()"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## split a big file"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [{"data": {"text/plain": ["'example.txt'"]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf2.to_csv(\"example.txt\")"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/plain": ["['example.train.txt', 'example.test.txt']"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["new_sdf = StreamingDataFrame.read_csv(\"example.txt\")\n", "new_sdf.train_test_split(\"example.{}.txt\", streaming=False)"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/plain": ["['example.test.txt', 'example.train.txt', 'example.txt']"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["import glob\n", "glob.glob(\"ex*.txt\")"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1"}}, "nbformat": 4, "nbformat_minor": 2}