{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Clean, process dates in text files\n", "\n", "Material for the hackathon ENSAE / Red Cross / 2015. Cleaning the data, change encoding."]}, {"cell_type": "code", "execution_count": 1, "metadata": {"collapsed": true}, "outputs": [], "source": ["%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "plt.style.use('ggplot')"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["
run previous cell, wait for 2 seconds
\n", ""], "text/plain": [""]}, "execution_count": 3, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## prepare data"]}, {"cell_type": "code", "execution_count": 3, "metadata": {"collapsed": true}, "outputs": [], "source": ["import ensae_projects"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": [" Volume in drive C has no label.\n", " Volume Serial Number is F074-BCDF\n", "\n", " Directory of C:\\PythonENSAE11\\workspace\n", "\n", "11/25/2015 03:11 PM .\n", "11/25/2015 03:11 PM ..\n", "11/25/2015 03:06 PM .ipynb_checkpoints\n", "11/22/2015 05:23 PM docs\n", "11/25/2015 02:23 PM 16,704 download_data_azure.ipynb\n", "11/25/2015 02:10 PM 5,658 ITMMASTER.schema.txt\n", "11/25/2015 02:22 PM 103,096,479 ITMMASTER.txt\n", "11/25/2015 03:11 PM 8,463 process_file.ipynb\n", "11/25/2015 03:11 PM 51,798,216 SINVOICE_.clean.txt\n", "11/25/2015 02:11 PM 1,362,433,753 SINVOICE_.txt\n", "11/25/2015 02:13 PM 1,252,461,865 SINVOICEV_.txt\n", "11/25/2015 02:21 PM 8,821,375,868 stojou.csv\n", " 8 File(s) 11,591,197,006 bytes\n", " 4 Dir(s) 48,990,134,272 bytes free\n"]}], "source": ["%ls"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["2015-11-25 15:13:11 start\n"]}], "source": ["from ensae_projects.datainc import change_encoding, convert_dates\n", "from pyquickhelper.loghelper import fLOG\n", "fLOG(OutputPrint=True)\n", "fLOG(\"start\")"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["2015-11-25 15:13:24 SINVOICE_.txt - 10000 lines\n", "2015-11-25 15:13:36 SINVOICE_.txt - 20000 lines\n", "2015-11-25 15:13:49 SINVOICE_.txt - 30000 lines\n", "2015-11-25 15:14:02 SINVOICE_.txt - 40000 lines\n", "2015-11-25 15:14:15 SINVOICE_.txt - 50000 lines\n", "2015-11-25 15:14:28 SINVOICE_.txt - 60000 lines\n", "2015-11-25 15:14:41 SINVOICE_.txt - 70000 lines\n", "2015-11-25 15:14:53 SINVOICE_.txt - 80000 lines\n", "2015-11-25 15:15:06 SINVOICE_.txt - 90000 lines\n", "2015-11-25 15:15:19 SINVOICE_.txt - 100000 lines\n", "2015-11-25 15:15:32 SINVOICE_.txt - 110000 lines\n", "2015-11-25 15:15:45 SINVOICE_.txt - 120000 lines\n", "2015-11-25 15:15:57 SINVOICE_.txt - 130000 lines\n", "2015-11-25 15:16:10 SINVOICE_.txt - 140000 lines\n", "2015-11-25 15:16:23 SINVOICE_.txt - 150000 lines\n", "2015-11-25 15:16:36 SINVOICE_.txt - 160000 lines\n", "2015-11-25 15:16:49 SINVOICE_.txt - 170000 lines\n", "2015-11-25 15:17:02 SINVOICE_.txt - 180000 lines\n", "2015-11-25 15:17:15 SINVOICE_.txt - 190000 lines\n", "2015-11-25 15:17:27 SINVOICE_.txt - 200000 lines\n", "2015-11-25 15:17:40 SINVOICE_.txt - 210000 lines\n", "2015-11-25 15:17:53 SINVOICE_.txt - 220000 lines\n", "2015-11-25 15:18:06 SINVOICE_.txt - 230000 lines\n", "2015-11-25 15:18:19 SINVOICE_.txt - 240000 lines\n", "2015-11-25 15:18:32 SINVOICE_.txt - 250000 lines\n", "2015-11-25 15:18:45 SINVOICE_.txt - 260000 lines\n", "2015-11-25 15:18:58 SINVOICE_.txt - 270000 lines\n", "2015-11-25 15:19:11 SINVOICE_.txt - 280000 lines\n", "2015-11-25 15:19:24 SINVOICE_.txt - 290000 lines\n", "2015-11-25 15:19:37 SINVOICE_.txt - 300000 lines\n", "2015-11-25 15:19:49 SINVOICE_.txt - 310000 lines\n", "2015-11-25 15:20:03 SINVOICE_.txt - 320000 lines\n", "2015-11-25 15:20:16 SINVOICE_.txt - 330000 lines\n", "2015-11-25 15:20:29 SINVOICE_.txt - 340000 lines\n", "2015-11-25 15:20:43 SINVOICE_.txt - 350000 lines\n", "2015-11-25 15:20:57 SINVOICE_.txt - 360000 lines\n", "2015-11-25 15:21:10 SINVOICE_.txt - 370000 lines\n", "2015-11-25 15:21:23 SINVOICE_.txt - 380000 lines\n", "2015-11-25 15:21:36 SINVOICE_.txt - 390000 lines\n", "2015-11-25 15:21:49 SINVOICE_.txt - 400000 lines"]}], "source": ["def transform(i, line):\n", " spl = line.split(\"\\t\")\n", " if i == 0:\n", " spl = [_.replace(\"_0\", \"\").strip('\" ') for _ in spl]\n", " else:\n", " spl = [convert_dates(_, 'F').strip('\" ') for _ in spl]\n", " return \"\\t\".join(spl)\n", "\n", "change_encoding(\"SINVOICE_.txt\", \"SINVOICE_.clean.txt\", \"latin-1\", \"latin-1\", process=transform, fLOG=fLOG)"]}, {"cell_type": "code", "execution_count": 7, "metadata": {"collapsed": true}, "outputs": [], "source": ["change_encoding(\"SINVOICEV_.txt\", \"SINVOICEV_.clean.txt\", \"latin-1\", \"latin-1\", process=transform, fLOG=fLOG)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {"collapsed": true}, "outputs": [], "source": ["def transform2(i, line):\n", " spl = line.split(\",\")\n", " spl = [convert_dates(_, 'F').strip('\" ') for _ in spl]\n", " return \"\\t\".join(spl)\n", "\n", "change_encoding(\"ITMMASTER.txt\", \"ITMMASTER.clean.txt\", \"latin-1\", \"latin-1\", process=transform2, fLOG=fLOG)"]}, {"cell_type": "code", "execution_count": 9, "metadata": {"collapsed": true}, "outputs": [], "source": ["def transform3(i, line):\n", " spl = line.split(\",\")\n", " if i == 0:\n", " spl = [_.replace(\"_0\", \"\").strip('\" ') for _ in spl]\n", " else:\n", " spl = [convert_dates(_, 'F').strip('\" ') for _ in spl]\n", " return \"\\t\".join(spl)\n", "\n", "change_encoding(\"stojou.csv\", \"stojou.clean.txt\", \"latin-1\", \"latin-1\", process=transform3, fLOG=fLOG)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Le dernier exemple est plus lent mais il permet de traiter le o\u00f9 les lignes dont les valeurs incluent le s\u00e9parateur de colonnes."]}, {"cell_type": "code", "execution_count": 10, "metadata": {"collapsed": true}, "outputs": [], "source": ["from ensae_projects.datainc import change_encoding_improve, convert_dates\n", "from pyquickhelper.loghelper import fLOG\n", "fLOG(OutputPrint=True)\n", "import re\n", "\n", "reg = re.compile(';\"(.*?)\"')\n", "\n", "def transform4(i, line, hist):\n", " a, b = clean_column_name_sql_dump(i, line, hist)\n", " return a.replace(\"_0\", \"\"), b\n", "\n", "change_encoding_improve(\"export_SINVOICE.csv\", \"export_SINVOICE.clean2.txt\", \"latin-1\", \"latin-1\", \n", " process=transform4, fLOG=fLOG)"]}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1"}}, "nbformat": 4, "nbformat_minor": 2}