{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# First steps with pandas_streaming\n", "\n", "A few difference between [pandas](http://pandas.pydata.org/) and *pandas_streaming*."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## pandas to pandas_streaming"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " X | \n", " Y | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 4.5 | \n", " a | \n", "
\n", " \n", " 1 | \n", " 6.0 | \n", " b | \n", "
\n", " \n", " 2 | \n", " 7.0 | \n", " c | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" X Y\n", "0 4.5 a\n", "1 6.0 b\n", "2 7.0 c"]}, "execution_count": 3, "metadata": {}, "output_type": "execute_result"}], "source": ["from pandas import DataFrame\n", "df = DataFrame(data=dict(X=[4.5, 6, 7], Y=[\"a\", \"b\", \"c\"]))\n", "df"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["We create a streaming dataframe:"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"text/plain": [""]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["from pandas_streaming.df import StreamingDataFrame\n", "sdf = StreamingDataFrame.read_df(df)\n", "sdf"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " X | \n", " Y | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 4.5 | \n", " a | \n", "
\n", " \n", " 1 | \n", " 6.0 | \n", " b | \n", "
\n", " \n", " 2 | \n", " 7.0 | \n", " c | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" X Y\n", "0 4.5 a\n", "1 6.0 b\n", "2 7.0 c"]}, "execution_count": 5, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf.to_dataframe()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Internally, StreamingDataFrame implements an iterator on dataframes and then tries to replicate the same interface as [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) possibly wherever it is possible to manipulate data without loading everything into memory."]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " X | \n", " Y | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 4.5 | \n", " a | \n", "
\n", " \n", " 1 | \n", " 6.0 | \n", " b | \n", "
\n", " \n", " 2 | \n", " 7.0 | \n", " c | \n", "
\n", " \n", " 0 | \n", " 4.5 | \n", " a | \n", "
\n", " \n", " 1 | \n", " 6.0 | \n", " b | \n", "
\n", " \n", " 2 | \n", " 7.0 | \n", " c | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" X Y\n", "0 4.5 a\n", "1 6.0 b\n", "2 7.0 c\n", "0 4.5 a\n", "1 6.0 b\n", "2 7.0 c"]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf2 = sdf.concat(sdf)\n", "sdf2.to_dataframe()"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " Y | \n", " Z | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " a | \n", " 10 | \n", "
\n", " \n", " 1 | \n", " b | \n", " 20 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" Y Z\n", "0 a 10\n", "1 b 20"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["m = DataFrame(dict(Y=[\"a\", \"b\"], Z=[10, 20]))\n", "m"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " X | \n", " Y | \n", " Z | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 4.5 | \n", " a | \n", " 10.0 | \n", "
\n", " \n", " 1 | \n", " 6.0 | \n", " b | \n", " 20.0 | \n", "
\n", " \n", " 2 | \n", " 7.0 | \n", " c | \n", " NaN | \n", "
\n", " \n", " 0 | \n", " 4.5 | \n", " a | \n", " 10.0 | \n", "
\n", " \n", " 1 | \n", " 6.0 | \n", " b | \n", " 20.0 | \n", "
\n", " \n", " 2 | \n", " 7.0 | \n", " c | \n", " NaN | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" X Y Z\n", "0 4.5 a 10.0\n", "1 6.0 b 20.0\n", "2 7.0 c NaN\n", "0 4.5 a 10.0\n", "1 6.0 b 20.0\n", "2 7.0 c NaN"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf3 = sdf2.merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")\n", "sdf3.to_dataframe()"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " X | \n", " Y | \n", " Z | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 4.5 | \n", " a | \n", " 10.0 | \n", "
\n", " \n", " 1 | \n", " 4.5 | \n", " a | \n", " 10.0 | \n", "
\n", " \n", " 2 | \n", " 6.0 | \n", " b | \n", " 20.0 | \n", "
\n", " \n", " 3 | \n", " 6.0 | \n", " b | \n", " 20.0 | \n", "
\n", " \n", " 4 | \n", " 7.0 | \n", " c | \n", " NaN | \n", "
\n", " \n", " 5 | \n", " 7.0 | \n", " c | \n", " NaN | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" X Y Z\n", "0 4.5 a 10.0\n", "1 4.5 a 10.0\n", "2 6.0 b 20.0\n", "3 6.0 b 20.0\n", "4 7.0 c NaN\n", "5 7.0 c NaN"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf2.to_dataframe().merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["The order might be different."]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " X | \n", " Y | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 4.5 | \n", " a | \n", "
\n", " \n", " 1 | \n", " 4.5 | \n", " a | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" X Y\n", "0 4.5 a\n", "1 4.5 a"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["sdftr, sdfte = sdf2.train_test_split(test_size=0.5)\n", "sdfte.head()"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
\n", " \n", " \n", " | \n", " X | \n", " Y | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 6.0 | \n", " b | \n", "
\n", " \n", " 1 | \n", " 7.0 | \n", " c | \n", "
\n", " \n", " 2 | \n", " 6.0 | \n", " b | \n", "
\n", " \n", " 0 | \n", " 7.0 | \n", " c | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" X Y\n", "0 6.0 b\n", "1 7.0 c\n", "2 6.0 b\n", "0 7.0 c"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["sdftr.head()"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## split a big file"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [{"data": {"text/plain": ["'example.txt'"]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["sdf2.to_csv(\"example.txt\")"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/plain": ["['example.train.txt', 'example.test.txt']"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["new_sdf = StreamingDataFrame.read_csv(\"example.txt\")\n", "new_sdf.train_test_split(\"example.{}.txt\", streaming=False)"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/plain": ["['example.test.txt', 'example.train.txt', 'example.txt']"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["import glob\n", "glob.glob(\"ex*.txt\")"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1"}}, "nbformat": 4, "nbformat_minor": 2}