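{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Cheat Sheet on files\n", "\n", "Short recipes to manipulate text files: change the encoding, select a subset of columns, look at the head or tail, and select lines with a regular expression."]}, {"cell_type": "code", "execution_count": 1, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["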
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## change the encoding of a file"]}, {"cell_type": "code", "execution_count": 2, "metadata": {"collapsed": true}, "outputs": [], "source": ["with open(\"essai.txt\", \"w\", encoding=\"latin-1\") as f:\n", " f.write(\"\u00e9e\\n\u00e0\u00e0\")"]}, {"cell_type": "code", "execution_count": 3, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["1"]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["from ensae_projects.datainc import change_encoding\n", "change_encoding(\"essai.txt\", \"essai.utf8.txt\", enc1=\"latin-1\", enc2=\"utf-8\")"]}, {"cell_type": "code", "execution_count": 4, "metadata": {"collapsed": false}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\u00e9e\n", "\u00e0\u00e0\n"]}], "source": ["with open(\"essai.utf8.txt\", \"r\", encoding=\"utf8\") as f:\n", " s = f.read()\n", "print(s)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## select a subset of columns from a csv or tsv file"]}, {"cell_type": "code", "execution_count": 5, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["'OnlineNewsPopularity/OnlineNewsPopularity.csv'"]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["import pyensae.datasource\n", "%load_ext pyensae\n", "files = pyensae.datasource.download_data(\"OnlineNewsPopularity.zip\", \n", " website=\"http://archive.ics.uci.edu/ml/machine-learning-databases/00332/\")\n", "files[1]"]}, {"cell_type": "code", "execution_count": 6, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "url, timedelta, n_tokens_title, n_tokens_content, n_unique_tokens, n_non_stop_words, n_non_stop_unique_tokens, num_hrefs, num_self_hrefs, num_imgs, num_videos, 
average_token_length, num_keywords, data_channel_is_lifestyle, data_channel_is_entertainment, data_channel_is_bus, data_channel_is_socmed, data_channel_is_tech, data_channel_is_world, kw_min_min, kw_max_min, kw_avg_min, kw_min_max, kw_max_max, kw_avg_max, kw_min_avg, kw_max_avg, kw_avg_avg, self_reference_min_shares, self_reference_max_shares, self_reference_avg_sharess, weekday_is_monday, weekday_is_tuesday, weekday_is_wednesday, weekday_is_thursday, weekday_is_friday, weekday_is_saturday, weekday_is_sunday, is_weekend, LDA_00, LDA_01, LDA_02, LDA_03, LDA_04, global_subjectivity, global_sentiment_polarity, global_rate_positive_words, global_rate_negative_words, rate_positive_words, rate_negative_words, avg_positive_polarity, min_positive_polarity, max_positive_polarity, avg_negative_polarity, min_negative_polarity, max_negative_polarity, title_subjectivity, title_sentiment_polarity, abs_title_subjectivity, abs_title_sentiment_polarity, shares\n", "http://mashable.com/2013/01/07/amazon-instant-video-browser/, 731.0, 12.0, 219.0, 0.663594466988, 0.999999992308, 0.815384609112, 4.0, 2.0, 1.0, 0.0, 4.6803652968, 5.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 496.0, 496.0, 496.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.500331204081, 0.378278929586, 0.0400046751006, 0.0412626477296, 0.0401225435029, 0.521617145481, 0.0925619834711, 0.0456621004566, 0.013698630137, 0.769230769231, 0.230769230769, 0.378636363636, 0.1, 0.7, -0.35, -0.6, -0.2, 0.5, -0.1875, 0.0, 0.1875, 593\n", "\n", "
"], "text/plain": [""]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["%head OnlineNewsPopularity/OnlineNewsPopularity.csv -n 2"]}, {"cell_type": "code", "execution_count": 7, "metadata": {"collapsed": false}, "outputs": [], "source": ["from ensae_projects.datainc import enumerate_text_lines\n", "def clean_column_name(s):\n", " return s.strip()\n", "bigfile = enumerate_text_lines(\"OnlineNewsPopularity/OnlineNewsPopularity.csv\", \n", " encoding=\"utf-8\", header=True, quotes_as_str=False,\n", " sep=\",\",\n", " clean_column_name=clean_column_name, fLOG=print)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["39644"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["res = list(map(lambda row: {\"LDA_00\": row[\"LDA_00\"], \"title_sentiment_polarity\":row[\"title_sentiment_polarity\"]}, \n", " bigfile))\n", "len(res)"]}, {"cell_type": "code", "execution_count": 9, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " LDA_00 | \n", " title_sentiment_polarity | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 0.500331204081 | \n", " -0.1875 | \n", "
\n", " \n", " 1 | \n", " 0.799755687423 | \n", " 0.0 | \n", "
\n", " \n", " 2 | \n", " 0.217792288518 | \n", " 0.0 | \n", "
\n", " \n", " 3 | \n", " 0.0285732164707 | \n", " 0.0 | \n", "
\n", " \n", " 4 | \n", " 0.0286328101715 | \n", " 0.136363636364 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" LDA_00 title_sentiment_polarity\n", "0 0.500331204081 -0.1875\n", "1 0.799755687423 0.0\n", "2 0.217792288518 0.0\n", "3 0.0285732164707 0.0\n", "4 0.0286328101715 0.136363636364"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas\n", "df = pandas.DataFrame(res)\n", "df.head()"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## look at the head or tail of a file\n", "\n", "We use magic commands [%head](http://www.xavierdupre.fr/app/pyensae/helpsphinx/all_NB.html?highlight=head#head) and [%tail](http://www.xavierdupre.fr/app/pyensae/helpsphinx/all_NB.html?highlight=head#tail)."]}, {"cell_type": "code", "execution_count": 10, "metadata": {"collapsed": false}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["The pyensae extension is already loaded. To reload it, use:\n", " %reload_ext pyensae\n"]}], "source": ["%load_ext pyensae"]}, {"cell_type": "code", "execution_count": 11, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "e\n", "\n", "
"], "text/plain": [""]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["%head essai.txt -n 1 -s ignore"]}, {"cell_type": "code", "execution_count": 12, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "e\n", "\n", "
"], "text/plain": [""]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["%tail essai.txt -n 1 -s ignore"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## select lines of a flat file based on a regular expression"]}, {"cell_type": "code", "execution_count": 13, "metadata": {"collapsed": false}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["The pyensae extension is already loaded. To reload it, use:\n", " %reload_ext pyensae\n"]}], "source": ["%load_ext pyensae"]}, {"cell_type": "code", "execution_count": 14, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "\u00e9e\n", "\n", "
"], "text/plain": [""]}, "execution_count": 15, "metadata": {}, "output_type": "execute_result"}], "source": ["%grep essai.utf8.txt .*\u00e9.*"]}, {"cell_type": "markdown", "metadata": {}, "source": ["A more complex example: we extract all the lines containing a substring and we add the header to the new file so that it can be read as a dataframe.\n", "We usually do that when a file is too big to be loaded into memory with [pandas](http://pandas.pydata.org/). This code relies on the magic command [grep](http://www.xavierdupre.fr/app/pyensae/helpsphinx/all_NB.html#grep) and the function [enumerate_grep](http://www.xavierdupre.fr/app/pyensae/helpsphinx/pyensae/file_helper/content_helper.html?highlight=enumerate_grep#pyensae.file_helper.content_helper.enumerate_grep)."]}, {"cell_type": "code", "execution_count": 15, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "first_name,name\n", "Xavier,Dupr\u00e9\n", "Sloane,Dupr\u00e9\n", "\n", "
"], "text/plain": [""]}, "execution_count": 16, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas\n", "import pyensae\n", "df = pandas.DataFrame([dict(name=\"Dupr\u00e9\", first_name=\"Xavier\"),\n", " dict(name=\"Dupr\u00e9\", first_name=\"Sloane\")])\n", "df.to_csv(\"data.txt\", encoding=\"utf8\", index=False)\n", "%head data.txt"]}, {"cell_type": "code", "execution_count": 16, "metadata": {"collapsed": true}, "outputs": [], "source": ["raw = %grep data.txt Xavier --raw"]}, {"cell_type": "code", "execution_count": 17, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["'Xavier,Dupr\u00e9\\n'"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["raw"]}, {"cell_type": "code", "execution_count": 18, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["'first_name,name\\n'"]}, "execution_count": 19, "metadata": {}, "output_type": "execute_result"}], "source": ["header = %head data.txt -n 1 --raw\n", "header"]}, {"cell_type": "code", "execution_count": 19, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "first_name,name\n", "Xavier,Dupr\u00e9\n", "\n", "
"], "text/plain": [""]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["with open(\"data_xavier.txt\", \"w\", encoding=\"utf8\") as f:\n", " f.write(header)\n", " f.write(raw)\n", " \n", "%head data_xavier.txt"]}, {"cell_type": "code", "execution_count": 20, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " first_name | \n", " name | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " Xavier | \n", " Dupr\u00e9 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" first_name name\n", "0 Xavier Dupr\u00e9"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["pandas.read_csv(\"data_xavier.txt\")"]}, {"cell_type": "code", "execution_count": 21, "metadata": {"collapsed": true}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2"}}, "nbformat": 4, "nbformat_minor": 2}