{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Donn\u00e9es multidimensionnelles SQL - correction\n", "\n", "Correction de la s\u00e9ance sur l'utilisation du SQL depuis un notebook."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Populating the interactive namespace from numpy and matplotlib\n"]}, {"data": {"text/html": ["
run previous cell, wait for 2 seconds
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "plt.style.use('ggplot')\n", "from pyquickhelper.helpgen import NbImage\n", "from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 1 : filtre\n", "\n", "On veut comparer les esp\u00e9rances de vie pour deux pays et deux ann\u00e9es."]}, {"cell_type": "code", "execution_count": 2, "metadata": {"collapsed": true}, "outputs": [], "source": ["from actuariat_python.data import table_mortalite_euro_stat \n", "table_mortalite_euro_stat()\n", "import pandas\n", "df = pandas.read_csv(\"mortalite.txt\", sep=\"\\t\", encoding=\"utf8\", low_memory=False)\n", "# ..."]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["import os\n", "if not os.path.exists('mortalite.db3'):\n", " import sqlite3\n", " from pandas.io import sql\n", " cnx = sqlite3.connect('mortalite.db3')\n", " df.to_sql(name='mortalite', con=cnx)\n", " cnx.close()"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## Exercice 2 : \u00e9chantillon al\u00e9atoire"]}, {"cell_type": "code", "execution_count": 4, "metadata": {"collapsed": true}, "outputs": [], "source": ["import random #loi uniforme\n", "def echantillon(proportion):\n", " return 1 if random.random() < proportion else 0"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["import sqlite3\n", "from pandas.io import sql\n", "cnx = sqlite3.connect('mortalite.db3')"]}, {"cell_type": "code", "execution_count": 6, "metadata": {"collapsed": true}, "outputs": [], "source": ["cnx.create_function('echantillon', 1, echantillon)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Que faut-il \u00e9crire ici pour r\u00e9cup\u00e9rer 1% de la table ?"]}, {"cell_type": "code", "execution_count": 7, "metadata": {"collapsed": true}, "outputs": [], "source": ["import pandas\n", "#example = pandas.read_sql(' ??? ', cnx)\n", "#example"]}, {"cell_type": "code", "execution_count": 8, "metadata": {"collapsed": true}, "outputs": [], "source": ["cnx.close()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Exercice 3 : reducer SQL"]}, {"cell_type": "code", "execution_count": 9, "metadata": {"collapsed": true}, "outputs": [], "source": ["import sqlite3, pandas\n", "from pandas.io import sql\n", "cnx = sqlite3.connect('mortalite.db3')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Il faut compl\u00e9ter le programme suivant."]}, {"cell_type": "code", "execution_count": 10, "metadata": {"collapsed": true}, "outputs": [], "source": ["class ReducerMediane:\n", " def __init__(self):\n", " self.indicateur = []\n", " def step(self, value):\n", " if value >= 0:\n", " self.indicateur.append(value)\n", " def finalize(self):\n", " self.indicateur.sort()\n", " return self.indicateur[len(self.indicateur)//2]\n", " \n", "cnx.create_aggregate(\"ReducerMediane\", 1, ReducerMediane) "]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["query = \"\"\"SELECT annee,age,age_num, ReducerMediane(valeur) AS mediane FROM mortalite \n", " WHERE indicateur==\"LIFEXP\" AND genre==\"F\"\n", " GROUP BY annee,age,age_num\"\"\"\n", "df = pandas.read_sql(query, cnx)"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
anneeageage_nummediane
01960NoneNaN66.7
11960Y01173.7
21960Y02272.8
31960Y03371.9
41960Y04471.0
\n", "
"], "text/plain": [" annee age age_num mediane\n", "0 1960 None NaN 66.7\n", "1 1960 Y01 1 73.7\n", "2 1960 Y02 2 72.8\n", "3 1960 Y03 3 71.9\n", "4 1960 Y04 4 71.0"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["df.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Un reducer \u00e0 deux entr\u00e9es m\u00eame si cela n'a pas beaucoup de sens ici :"]}, {"cell_type": "code", "execution_count": 13, "metadata": {"collapsed": true}, "outputs": [], "source": ["class ReducerMediane2:\n", " def __init__(self):\n", " self.indicateur = []\n", " def step(self, value, value2):\n", " if value >= 0:\n", " self.indicateur.append(value)\n", " if value2 >= 0:\n", " self.indicateur.append(value2)\n", " def finalize(self):\n", " self.indicateur.sort()\n", " return self.indicateur[len(self.indicateur)//2]\n", " \n", "cnx.create_aggregate(\"ReducerMediane2\", 2, ReducerMediane2)"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
anneeageage_nummediane2
01960NoneNaN66.7
11960Y01174.0
21960Y02273.2
31960Y03372.3
41960Y04471.3
\n", "
"], "text/plain": [" annee age age_num mediane2\n", "0 1960 None NaN 66.7\n", "1 1960 Y01 1 74.0\n", "2 1960 Y02 2 73.2\n", "3 1960 Y03 3 72.3\n", "4 1960 Y04 4 71.3"]}, "execution_count": 15, "metadata": {}, "output_type": "execute_result"}], "source": ["query = \"\"\"SELECT annee,age,age_num, ReducerMediane2(valeur, valeur+1) AS mediane2 FROM mortalite \n", " WHERE indicateur==\"LIFEXP\" AND genre==\"F\"\n", " GROUP BY annee,age,age_num\"\"\"\n", "df = pandas.read_sql(query, cnx)\n", "df.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Il n'est apparemment pas possible de retourner deux r\u00e9sultats mais on peut utiliser une ruse qui consise \u00e0 les concat\u00e9ner dans une cha\u00eene de carac\u00e8res."]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": ["class ReducerQuantile:\n", " def __init__(self):\n", " self.indicateur = []\n", " def step(self, value):\n", " if value >= 0:\n", " self.indicateur.append(value)\n", " def finalize(self):\n", " self.indicateur.sort()\n", " q1 = self.indicateur[len(self.indicateur)//4]\n", " q2 = self.indicateur[3*len(self.indicateur)//4]\n", " n = len(self.indicateur)\n", " return \"%f;%f;%s\" % (q1,q2,n)\n", " \n", "cnx.create_aggregate(\"ReducerQuantile\", 1, ReducerQuantile)"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
anneeageage_numquantiles
01960NoneNaN4.400000;72.800000;20
11960Y01173.000000;74.000000;10
21960Y02272.100000;73.200000;10
31960Y03371.200000;72.300000;10
41960Y04470.300000;71.300000;10
\n", "
"], "text/plain": [" annee age age_num quantiles\n", "0 1960 None NaN 4.400000;72.800000;20\n", "1 1960 Y01 1 73.000000;74.000000;10\n", "2 1960 Y02 2 72.100000;73.200000;10\n", "3 1960 Y03 3 71.200000;72.300000;10\n", "4 1960 Y04 4 70.300000;71.300000;10"]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["query = \"\"\"SELECT annee,age,age_num, ReducerQuantile(valeur) AS quantiles FROM mortalite \n", " WHERE indicateur==\"LIFEXP\" AND genre==\"F\"\n", " GROUP BY annee,age,age_num\"\"\"\n", "df = pandas.read_sql(query, cnx)\n", "df.head()"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["cnx.close()"]}, {"cell_type": "code", "execution_count": 18, "metadata": {"collapsed": true}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1"}}, "nbformat": 4, "nbformat_minor": 2}