{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 3A.mr - Reservoir Sampling distribu\u00e9 - \u00e9nonc\u00e9 - correction\n", "\n", "Correction."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["<b>Plan</b>\n", "<div id=\"my_menu_id\">run previous cell, wait for 2 seconds</div>\n", "<script>\n", "function repeat_indent_string(n){\n", "    var a = \"\" ;\n", "    for ( ; n > 0 ; --n) {\n", "        a += \"    \";\n", "    }\n", "    return a;\n", "}\n", "var update_menu_string = function(begin, lfirst, llast, sformat, send) {\n", "    var anchors = document.getElementsByClassName(\"section\");\n", "    if (anchors.length == 0) {\n", "        anchors = document.getElementsByClassName(\"text_cell_render rendered_html\");\n", "    }\n", "    var i,t;\n", "    var text_menu = begin;\n", "    var text_memo = \"<pre>\\nlength:\" + anchors.length + \"\\n\";\n", "    var ind = \"\";\n", "    var memo_level = 1;\n", "    var href;\n", "    var tags = [];\n", "    for (i = 0; i <= llast; i++) {\n", "        tags.push(\"h\" + i);\n", "    }\n", "\n", "    for (i = 0; i < anchors.length; i++) {\n", "        text_memo += \"**\" + anchors[i].id + \"--\\n\";\n", "\n", "        var child = null;\n", "        for(t = 0; t < tags.length; t++) {\n", "            var r = anchors[i].getElementsByTagName(tags[t]);\n", "            if (r.length > 0) {\n", "child = r[0];\n", "break;\n", "            }\n", "        }\n", "        if (child == null){\n", "            text_memo += \"null\\n\";\n", "            continue;\n", "        }\n", "\n", "        if (anchors[i].hasAttribute(\"id\")) {\n", "            // when converted in RST\n", "            href = anchors[i].id;\n", "            text_memo += \"#1-\" + href;\n", "            // passer \u00e0 child suivant (le chercher)\n", "        }\n", "        else if (child.hasAttribute(\"id\")) {\n", "            // in a notebook\n", "            href = child.id;\n", "            text_memo += \"#2-\" + href;\n", "        }\n", "        else {\n", "            text_memo += \"#3-\" + \"*\" + \"\\n\";\n", "            continue;\n", "        }\n", "        var title = child.textContent;\n", "        var level = parseInt(child.tagName.substring(1,2));\n", "\n", "        text_memo += \"--\" + level + \"?\" + lfirst + \"--\" + title + \"\\n\";\n", "\n", "        if ((level < lfirst) || (level > llast)) {\n", "            continue ;\n", "        }\n", "        if (title.endsWith('\u00b6')) {\n", "            title = title.substring(0,title.length-1).replace(\"<\", \"&lt;\").replace(\">\", \"&gt;\").replace(\"&\", \"&amp;\")\n", "        }\n", "\n", "        if (title.length == 0) {\n", "            continue;\n", "        }\n", "        while (level > memo_level) {\n", "            text_menu += \"<ul>\\n\";\n", "            memo_level += 1;\n", "        }\n", "        while (level < memo_level) {\n", "            text_menu += \"</ul>\\n\";\n", "            memo_level -= 1;\n", "        }\n", "        text_menu += repeat_indent_string(level-2) + sformat.replace(\"__HREF__\", href).replace(\"__TITLE__\", title);\n", "    }\n", "    while (1 < memo_level) {\n", "        text_menu += \"</ul>\\n\";\n", "        memo_level -= 1;\n", "    }\n", "    text_menu += send;\n", "    //text_menu += \"\\n\" + text_memo;\n", "    return text_menu;\n", "};\n", "var update_menu = function() {\n", "    var sbegin = \"\";\n", "    var sformat = '<li><a href=\"#__HREF__\">__TITLE__</a></li>';\n", "    var send = \"\";\n", "    var text_menu = update_menu_string(sbegin, 2, 4, sformat, send);\n", "    var menu = document.getElementById(\"my_menu_id\");\n", "    menu.innerHTML=text_menu;\n", "};\n", "window.setTimeout(update_menu,2000);\n", "            </script>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## cr\u00e9ation d'un fichier \u00e0 sampler"]}, {"cell_type": "code", "execution_count": 2, "metadata": {"collapsed": true}, "outputs": [], "source": ["with open(\"sample4.txt\", \"w\", encoding=\"utf8\") as f:\n", "    for i in range(0,100000):\n", "        f.write(\"{0}\\t{1}{0}\\n\".format(i, chr(i%26 + 65)))\n", "    f.write(\"100001\\tAAAAAA\")"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"text/html": ["<pre>\n", "0\tA0\n", "1\tB1\n", "2\tC2\n", "3\tD3\n", "4\tE4\n", "5\tF5\n", "6\tG6\n", "7\tH7\n", "8\tI8\n", "9\tJ9\n", "\n", "</pre>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["%load_ext pyensae\n", "%head sample4.txt"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## connexion"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"data": {"text/plain": ["dict"]}, "execution_count": 5, "metadata": {}, "output_type": "execute_result"}], "source": ["import os\n", "blobhp = {}\n", "if \"HDCREDENTIALS\" in os.environ:\n", "    blobhp[\"blob_storage\"], blobhp[\"password1\"], blobhp[\"hadoop_server\"], blobhp[\"password2\"], blobhp[\"username\"] = \\\n", "        os.environ[\"HDCREDENTIALS\"].split(\"**\")\n", "    r = type(blobhp)\n", "else:\n", "    from pyquickhelper.ipythonhelper import open_html_form\n", "    params={\"blob_storage\":\"\", \"password1\":\"\", \"hadoop_server\":\"\", \"password2\":\"\", \"username\":\"axavier\"}\n", "    r = open_html_form(params=params,title=\"server + hadoop + credentials\", key_save=\"blobhp\")\n", "r"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["import pyensae\n", "%load_ext pyensae\n", "%load_ext pyenbc\n", "blobstorage = blobhp[\"blob_storage\"]\n", "blobpassword = blobhp[\"password1\"]\n", "hadoop_server = blobhp[\"hadoop_server\"]\n", "hadoop_password = blobhp[\"password2\"]\n", "username = blobhp[\"username\"]"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/plain": ["(<pyensae.remote.azure_connection.AzureClient at 0x942e860>,\n", " <azure.storage.blob.blobservice.BlobService at 0x942e898>)"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["client, bs = %hd_open\n", "client, bs"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## upload du fichier"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/plain": ["'$PSEUDO/sampling/sample4.txt'"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_up sample3.txt /$PSEUDO/sampling/sample4.txt"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>name</th>\n", "      <th>last_modified</th>\n", "      <th>content_type</th>\n", "      <th>content_length</th>\n", "      <th>blob_type</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>axavier/sampling/datafu-1.2.0.jar</td>\n", "      <td>Fri, 13 Nov 2015 00:03:49 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1600826</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>axavier/sampling/out_sampled_rs4_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 01:08:22 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>axavier/sampling/out_sampled_rs4_2015.txt/_SUC...</td>\n", "      <td>Fri, 13 Nov 2015 01:08:22 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>axavier/sampling/out_sampled_rs4_2015.txt/part...</td>\n", "      <td>Fri, 13 Nov 2015 01:08:21 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>12785</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>axavier/sampling/sample.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:02:50 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377780</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>5</th>\n", "      <td>axavier/sampling/sample2.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:35:55 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>6</th>\n", "      <td>axavier/sampling/sample3.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:39:40 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>7</th>\n", "      <td>axavier/sampling/sample4.txt</td>\n", "      <td>Sun, 15 Nov 2015 12:24:22 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>8</th>\n", "      <td>axavier/sampling/sample4_hash.txt</td>\n", "      <td>Fri, 13 Nov 2015 14:50:39 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>9</th>\n", "      <td>axavier/sampling/sample4_hash.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 14:50:39 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>10</th>\n", "      <td>axavier/sampling/sample4_hash.txt/part-r-00000</td>\n", "      <td>Fri, 13 Nov 2015 14:50:38 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>4771358</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>11</th>\n", "      <td>axavier/sampling/sampled4_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:50:20 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>12</th>\n", "      <td>axavier/sampling/sampled4_2015.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 00:50:20 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>13</th>\n", "      <td>axavier/sampling/sampled4_2015.txt/part-m-00000</td>\n", "      <td>Fri, 13 Nov 2015 00:50:19 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1277794</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>14</th>\n", "      <td>axavier/sampling/sampled_rs4_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 01:04:51 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>15</th>\n", "      <td>axavier/sampling/sampled_rs4_2015.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 01:04:51 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>16</th>\n", "      <td>axavier/sampling/sampled_rs4_2015.txt/part-m-0...</td>\n", "      <td>Fri, 13 Nov 2015 01:04:50 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1277794</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>17</th>\n", "      <td>axavier/sampling/sampled_srs4_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:56:09 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>18</th>\n", "      <td>axavier/sampling/sampled_srs4_2015.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 00:56:09 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>19</th>\n", "      <td>axavier/sampling/sampled_srs4_2015.txt/part-m-...</td>\n", "      <td>Fri, 13 Nov 2015 00:56:09 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1277794</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>20</th>\n", "      <td>axavier/sampling/sampled_srs_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:52:34 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>21</th>\n", "      <td>axavier/sampling/sampled_srs_2015.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 00:52:34 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>22</th>\n", "      <td>axavier/sampling/sampled_srs_2015.txt/part-m-0...</td>\n", "      <td>Fri, 13 Nov 2015 00:52:34 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1277794</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["                                                 name  \\\n", "0                   axavier/sampling/datafu-1.2.0.jar   \n", "1           axavier/sampling/out_sampled_rs4_2015.txt   \n", "2   axavier/sampling/out_sampled_rs4_2015.txt/_SUC...   \n", "3   axavier/sampling/out_sampled_rs4_2015.txt/part...   \n", "4                         axavier/sampling/sample.txt   \n", "5                        axavier/sampling/sample2.txt   \n", "6                        axavier/sampling/sample3.txt   \n", "7                        axavier/sampling/sample4.txt   \n", "8                   axavier/sampling/sample4_hash.txt   \n", "9          axavier/sampling/sample4_hash.txt/_SUCCESS   \n", "10     axavier/sampling/sample4_hash.txt/part-r-00000   \n", "11                 axavier/sampling/sampled4_2015.txt   \n", "12        axavier/sampling/sampled4_2015.txt/_SUCCESS   \n", "13    axavier/sampling/sampled4_2015.txt/part-m-00000   \n", "14              axavier/sampling/sampled_rs4_2015.txt   \n", "15     axavier/sampling/sampled_rs4_2015.txt/_SUCCESS   \n", "16  axavier/sampling/sampled_rs4_2015.txt/part-m-0...   \n", "17             axavier/sampling/sampled_srs4_2015.txt   \n", "18    axavier/sampling/sampled_srs4_2015.txt/_SUCCESS   \n", "19  axavier/sampling/sampled_srs4_2015.txt/part-m-...   \n", "20              axavier/sampling/sampled_srs_2015.txt   \n", "21     axavier/sampling/sampled_srs_2015.txt/_SUCCESS   \n", "22  axavier/sampling/sampled_srs_2015.txt/part-m-0...   \n", "\n", "                    last_modified              content_type  content_length  \\\n", "0   Fri, 13 Nov 2015 00:03:49 GMT  application/octet-stream         1600826   \n", "1   Fri, 13 Nov 2015 01:08:22 GMT                                         0   \n", "2   Fri, 13 Nov 2015 01:08:22 GMT  application/octet-stream               0   \n", "3   Fri, 13 Nov 2015 01:08:21 GMT  application/octet-stream           12785   \n", "4   Fri, 13 Nov 2015 00:02:50 GMT  application/octet-stream         1377780   \n", "5   Fri, 13 Nov 2015 00:35:55 GMT  application/octet-stream         1377793   \n", "6   Fri, 13 Nov 2015 00:39:40 GMT  application/octet-stream         1377793   \n", "7   Sun, 15 Nov 2015 12:24:22 GMT  application/octet-stream         1377793   \n", "8   Fri, 13 Nov 2015 14:50:39 GMT                                         0   \n", "9   Fri, 13 Nov 2015 14:50:39 GMT  application/octet-stream               0   \n", "10  Fri, 13 Nov 2015 14:50:38 GMT  application/octet-stream         4771358   \n", "11  Fri, 13 Nov 2015 00:50:20 GMT                                         0   \n", "12  Fri, 13 Nov 2015 00:50:20 GMT  application/octet-stream               0   \n", "13  Fri, 13 Nov 2015 00:50:19 GMT  application/octet-stream         1277794   \n", "14  Fri, 13 Nov 2015 01:04:51 GMT                                         0   \n", "15  Fri, 13 Nov 2015 01:04:51 GMT  application/octet-stream               0   \n", "16  Fri, 13 Nov 2015 01:04:50 GMT  application/octet-stream         1277794   \n", "17  Fri, 13 Nov 2015 00:56:09 GMT                                         0   \n", "18  Fri, 13 Nov 2015 00:56:09 GMT  application/octet-stream               0   \n", "19  Fri, 13 Nov 2015 00:56:09 GMT  application/octet-stream         1277794   \n", "20  Fri, 13 Nov 2015 00:52:34 GMT                                         0   \n", "21  Fri, 13 Nov 2015 00:52:34 GMT  application/octet-stream               0   \n", "22  Fri, 13 Nov 2015 00:52:34 GMT  application/octet-stream         1277794   \n", "\n", "    blob_type  \n", "0   BlockBlob  \n", "1   BlockBlob  \n", "2   BlockBlob  \n", "3   BlockBlob  \n", "4   BlockBlob  \n", "5   BlockBlob  \n", "6   BlockBlob  \n", "7   BlockBlob  \n", "8   BlockBlob  \n", "9   BlockBlob  \n", "10  BlockBlob  \n", "11  BlockBlob  \n", "12  BlockBlob  \n", "13  BlockBlob  \n", "14  BlockBlob  \n", "15  BlockBlob  \n", "16  BlockBlob  \n", "17  BlockBlob  \n", "18  BlockBlob  \n", "19  BlockBlob  \n", "20  BlockBlob  \n", "21  BlockBlob  \n", "22  BlockBlob  "]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Code python pour le reservoir sampling"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/plain": ["['0a', '1b', '2c', '3d', '4e']"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["ensemble = [ \"%d%s\" % (i, chr(i%26 + 97)) for i in range(0,10000)]\n", "ensemble[:5]"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/plain": ["['8681x',\n", " '8356k',\n", " '5490e',\n", " '4405l',\n", " '5890o',\n", " '2689l',\n", " '8672o',\n", " '3603p',\n", " '8599t',\n", " '6086c']"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["import random\n", "def reservoir_sampling(ensemble, k):\n", "    N = len(ensemble)\n", "    echantillon = []\n", "    for i, e in enumerate(ensemble):\n", "        if len(echantillon) < k:\n", "            echantillon.append(e)\n", "        else:\n", "            j = random.randint(0, i)\n", "            if j < k:\n", "                echantillon[j] = e\n", "    return echantillon\n", "\n", "reservoir_sampling(ensemble, 10)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## python \u00e0 jython\n", "\n", "On s'assure que le code pr\u00e9c\u00e9dent fonctionne en jython (python compil\u00e9 en java). On s'inspire pour cela de la documentation [jython-udfs](https://pig.apache.org/docs/r0.12.0/udf.html#jython-udfs)."]}, {"cell_type": "markdown", "metadata": {}, "source": ["### On cr\u00e9\u00e9 d'abord un script PIG pour r\u00e9cup\u00e9rer le schema et les premi\u00e8res lignes"]}, {"cell_type": "code", "execution_count": 11, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore.pig\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", "        USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "ens_group = GROUP ensemble ALL;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE FLATTEN(ensemble);\n", "DESCRIBE sampled;\n", "\n", "--ens10 = LIMIT ensemble 10;\n", "--ens_group10 = LIMIT en_group10 ;\n", "--DUMP ens10;\n", "--DUMP ens_group10;"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Si la fonction suivante provoque une erreur ::\n", "\n", "    AzureException: STATUS: 403, JSON: Expecting value: line 1 column 1 (char 0)\n", "    <Response [403]>\n", "    unable to submit job: sample_explore.pig\n", "    \n", "V\u00e9rifier les identifiants utilis\u00e9s pour se connecter."]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0185'}"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0185', None, None, False, 'RUNNING')"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "markdown", "metadata": {}, "source": ["La sortie standard contient les informations souhait\u00e9es :"]}, {"cell_type": "code", "execution_count": 14, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["<pre>\n", "2015-11-15 12:33:06,608 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 12:33:06,608 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: wasb://clusterensaeazure1-1@hdblobstorage.blob.core.windows.net\n", "2015-11-15 12:33:08,233 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 12:33:09,374 [main] INFO  org.apache.pig.Main - Pig script completed in 4 seconds and 578 milliseconds (4578 ms)\n", "\n", "</pre><br /><b>OUT:</b><br /><pre>\n", "ensemble: {x: int,v: chararray}\n", "ens_group: {group: chararray,ensemble: {(x: int,v: chararray)}}\n", "sampled: {ensemble::x: int,ensemble::v: chararray}\n", "\n", "</pre>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 15, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 5"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Et la sortie du second dump :: \n", "    \n", "    (all,{(100001,AAAAAA),(99999,D99999),(99998,C99998)..."]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Le code Jython"]}, {"cell_type": "code", "execution_count": 15, "metadata": {"collapsed": true}, "outputs": [], "source": ["import pyensae"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["%%PYTHON reservoir_sampling.py\n", "\n", "import random\n", "\n", "@outputSchemaFunction(\"rsSchema\")\n", "def reservoir_sampling(ensemble):\n", "    ensemble = eval(ensemble)\n", "    k = 10\n", "    N = len(ensemble)\n", "    echantillon = []\n", "    for i, e in enumerate(ensemble):\n", "        if len(echantillon) < k:\n", "            echantillon.append(e)\n", "        else:\n", "            j = random.randint(0, i)\n", "            if j < k:\n", "                echantillon[j] = e\n", "    return echantillon\n", "\n", "@schemaFunction(\"rsSchema\")\n", "def rsSchema(input):\n", "    return input"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [{"data": {"text/html": ["<pre>\n", "[(99998, 'C99998'), (99999, 'D99999'), (100001, 'AAAAAA')]\n", "\n", "</pre>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["%%jython reservoir_sampling.py reservoir_sampling\n", "{(100001,\"AAAAAA\"),(99999,\"D99999\"),(99998,\"C99998\")}"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On ajoute le code jython au script pr\u00e9c\u00e9dent :"]}, {"cell_type": "code", "execution_count": 18, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_complete.pig\n", "\n", "REGISTER '$CONTAINER/$SCRIPTPIG/reservoir_sampling.py' using jython as myrs;\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", "        USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "ens_group = GROUP ensemble ALL;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE FLATTEN(myrs.reservoir_sample(ensemble));\n", "DESCRIBE sampled;\n", "\n", "STORE sampled INTO \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_rs.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0229'}"]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_complete.pig -d reservoir_sampling.py\n", "jid"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0229', None, 'done', False, 'RUNNING')"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 21, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["<pre>\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Trying ExecType : MAPREDUCE\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Picked MAPREDUCE as the ExecType\n", "2015-11-15 18:43:49,598 [main] INFO  org.apache.pig.Main - Apache Pig version 0.14.0.2.2.7.1-33 (r: unknown) compiled Oct 13 2015, 04:18:06\n", "2015-11-15 18:43:49,598 [main] INFO  org.apache.pig.Main - Logging error messages to: C:\\apps\\dist\\hadoop-2.6.0.2.2.7.1-33\\logs\\pig_1447613029598.log\n", "2015-11-15 18:43:50,848 [main] INFO  org.apache.pig.impl.util.Utils - Default bootup file D:\\Users\\hdp/.pigbootup not found\n", "2015-11-15 18:43:51,145 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address\n", "2015-11-15 18:43:51,145 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 18:43:51,145 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: wasb://clusterensaeazure1-1@hdblobstorage.blob.core.windows.net\n", "2015-11-15 18:43:51,879 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 18:43:52,192 [main] INFO  org.apache.pig.scripting.jython.JythonScriptEngine - created tmp python.cachedir=D:\\Users\\hdp\\AppData\\Local\\Temp\\pig_jython_3357684506669481882\n", "2015-11-15 18:43:54,817 [main] WARN  org.apache.pig.scripting.jython.JythonScriptEngine - pig.cmd.args.remainders is empty. This is not expected unless on testing.\n", "2015-11-15 18:43:57,645 [main] INFO  org.apache.pig.scripting.jython.JythonScriptEngine - Register scripting UDF: myrs.reservoir_sampling\n", "2015-11-15 18:43:58,535 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 18:43:59,660 [main] <b><font color=\"#DD0000\">ERROR</font></b> org.apache.pig.PigServer - exception during parsing: Error during parsing. Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "Failed to parse: Pig script failed to parse: \n", "<file sample_explore_complete.pig, line 9, column 45> Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: <b><font color=\"#DD0000\">ERROR</font></b> 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:199)\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1735)\n", "\tat org.apache.pig.PigServer$Graph.access$000(PigServer.java:1443)\n", "\tat org.apache.pig.PigServer.parseAndBuild(PigServer.java:387)\n", "\tat org.apache.pig.tools.grunt.GruntParser.processDescribe(GruntParser.java:300)\n", "\tat org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:412)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:230)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:205)\n", "\tat org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:81)\n", "\tat org.apache.pig.Main.run(Main.java:495)\n", "\tat org.apache.pig.Main.main(Main.java:170)\n", "Caused by: \n", "<file sample_explore_complete.pig, line 9, column 45> Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: <b><font color=\"#DD0000\">ERROR</font></b> 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1572)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.func_eval(LogicalPlanGenerator.java:9372)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.projectable_expr(LogicalPlanGenerator.java:11051)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.var_expr(LogicalPlanGenerator.java:10810)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.expr(LogicalPlanGenerator.java:10159)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_clause(LogicalPlanGenerator.java:7629)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_generated_item(LogicalPlanGenerator.java:7452)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.generate_clause(LogicalPlanGenerator.java:17590)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_plan(LogicalPlanGenerator.java:15982)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_clause(LogicalPlanGenerator.java:15849)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.op_clause(LogicalPlanGenerator.java:1933)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.general_statement(LogicalPlanGenerator.java:1102)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.statement(LogicalPlanGenerator.java:560)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.query(LogicalPlanGenerator.java:421)\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:191)\n", "\t... 10 more\n", "Caused by: org.apache.pig.backend.executionengine.ExecException: <b><font color=\"#DD0000\">ERROR</font></b> 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.impl.PigContext.resolveClassName(PigContext.java:677)\n", "\tat org.apache.pig.impl.PigContext.getClassForAlias(PigContext.java:793)\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1569)\n", "\t... 24 more\n", "2015-11-15 18:43:59,707 [main] <b><font color=\"#DD0000\">ERROR</font></b> org.apache.pig.tools.grunt.Grunt - <b><font color=\"#DD0000\">ERROR</font></b> 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "2015-11-15 18:43:59,707 [main] <b><font color=\"#DD0000\">ERROR</font></b> org.apache.pig.tools.grunt.Grunt - org.apache.pig.impl.logicalLayer.FrontendException: <b><font color=\"#DD0000\">ERROR</font></b> 1000: Error during parsing. Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1748)\n", "\tat org.apache.pig.PigServer$Graph.access$000(PigServer.java:1443)\n", "\tat org.apache.pig.PigServer.parseAndBuild(PigServer.java:387)\n", "\tat org.apache.pig.tools.grunt.GruntParser.processDescribe(GruntParser.java:300)\n", "\tat org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:412)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:230)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:205)\n", "\tat org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:81)\n", "\tat org.apache.pig.Main.run(Main.java:495)\n", "\tat org.apache.pig.Main.main(Main.java:170)\n", "Caused by: Failed to parse: Pig script failed to parse: \n", "<file sample_explore_complete.pig, line 9, column 45> Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: <b><font color=\"#DD0000\">ERROR</font></b> 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:199)\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1735)\n", "\t... 9 more\n", "Caused by: \n", "<file sample_explore_complete.pig, line 9, column 45> Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: <b><font color=\"#DD0000\">ERROR</font></b> 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1572)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.func_eval(LogicalPlanGenerator.java:9372)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.projectable_expr(LogicalPlanGenerator.java:11051)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.var_expr(LogicalPlanGenerator.java:10810)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.expr(LogicalPlanGenerator.java:10159)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_clause(LogicalPlanGenerator.java:7629)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_generated_item(LogicalPlanGenerator.java:7452)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.generate_clause(LogicalPlanGenerator.java:17590)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_plan(LogicalPlanGenerator.java:15982)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_clause(LogicalPlanGenerator.java:15849)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.op_clause(LogicalPlanGenerator.java:1933)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.general_statement(LogicalPlanGenerator.java:1102)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.statement(LogicalPlanGenerator.java:560)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.query(LogicalPlanGenerator.java:421)\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:191)\n", "\t... 10 more\n", "Caused by: org.apache.pig.backend.executionengine.ExecException: <b><font color=\"#DD0000\">ERROR</font></b> 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.impl.PigContext.resolveClassName(PigContext.java:677)\n", "\tat org.apache.pig.impl.PigContext.getClassForAlias(PigContext.java:793)\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1569)\n", "\t... 24 more\n", "\n", "Details also at logfile: C:\\apps\\dist\\hadoop-2.6.0.2.2.7.1-33\\logs\\pig_1447613029598.log\n", "2015-11-15 18:43:59,754 [main] INFO  org.apache.pig.Main - Pig script completed in 10 seconds and 453 milliseconds (10453 ms)\n", "\n", "</pre><br /><b>OUT:</b><br /><pre>\n", "ensemble: {x: int,v: chararray}\n", "ens_group: {group: chararray,ensemble: {(x: int,v: chararray)}}\n", "\n", "</pre>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 22, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 100"]}, {"cell_type": "markdown", "metadata": {}, "source": ["A corriger plus tard. Dans l'imm\u00e9diat, on utilisera la librairie [datafu](https://datafu.incubator.apache.org/docs/datafu/guide/sampling.html). Si le cluster ne reconna\u00eet pas la librairie, voir la section java pour comprendre comment l'importer. On la d\u00e9clare dans le script par l'instruction ``REGISTER``."]}, {"cell_type": "code", "execution_count": 22, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_datafu.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", "        USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "ens_group = GROUP ensemble ALL;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE FLATTEN(RS(ensemble));\n", "DESCRIBE sampled;\n", "\n", "STORE sampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_datafu_rs.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0193'}"]}, "execution_count": 24, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_datafu.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0193', '50% complete', None, False, 'RUNNING')"]}, "execution_count": 25, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 25, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["<pre>\n", "\n", "</pre><br />"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 26, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 100"]}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>name</th>\n", "      <th>last_modified</th>\n", "      <th>content_type</th>\n", "      <th>content_length</th>\n", "      <th>blob_type</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>axavier/sampling/sample_datafu_rs.txt</td>\n", "      <td>Sun, 15 Nov 2015 13:23:40 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>axavier/sampling/sample_datafu_rs.txt/_SUCCESS</td>\n", "      <td>Sun, 15 Nov 2015 13:23:40 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>axavier/sampling/sample_datafu_rs.txt/part-r-0...</td>\n", "      <td>Sun, 15 Nov 2015 13:23:38 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>12780</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["                                                name  \\\n", "0              axavier/sampling/sample_datafu_rs.txt   \n", "1     axavier/sampling/sample_datafu_rs.txt/_SUCCESS   \n", "2  axavier/sampling/sample_datafu_rs.txt/part-r-0...   \n", "\n", "                   last_modified              content_type  content_length  \\\n", "0  Sun, 15 Nov 2015 13:23:40 GMT                                         0   \n", "1  Sun, 15 Nov 2015 13:23:40 GMT  application/octet-stream               0   \n", "2  Sun, 15 Nov 2015 13:23:38 GMT  application/octet-stream           12780   \n", "\n", "   blob_type  \n", "0  BlockBlob  \n", "1  BlockBlob  \n", "2  BlockBlob  "]}, "execution_count": 27, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_datafu"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## version distribu\u00e9e\n", "\n", "Astuce : on distribue puis on recombine les \u00e9chantillons en faisant un dernier reservoir sampling mais pond\u00e9r\u00e9. Comment distribuer ? Le second sampling est remplac\u00e9 par une m\u00e9thode d'\u00e9chantillonage classique car le reservoir sampling pond\u00e9r\u00e9 n'est pas disponible dans la librairie datafu version 1.2.0."]}, {"cell_type": "code", "execution_count": 27, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_datafu_dist.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "DEFINE WeightedSample datafu.pig.sampling.WeightedSample();\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", "        USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "keys = FOREACH ensemble GENERATE x, v, x%10 AS key;\n", "DESCRIBE keys;\n", "ens_group = GROUP keys BY key ;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE COUNT(keys) AS weigth, FLATTEN(RS(keys));\n", "DESCRIBE sampled;\n", "wsampled = FOREACH (GROUP sampled ALL) GENERATE FLATTEN(WeightedSample(sampled, 0, 1000));\n", "DESCRIBE wsampled;\n", "\n", "STORE wsampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_datafu_rs_dist2.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0238'}"]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_datafu_dist.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0238', '100% complete', 'done', True, 'SUCCEEDED')"]}, "execution_count": 30, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 30, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["<pre>\n", "2015-11-15 19:22:17,553 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:22:17,553 [main] INFO  org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:22:17,615 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:22:17,803 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2015-11-15 19:22:17,803 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:22:17,803 [main] INFO  org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:22:17,865 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:22:17,943 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "2015-11-15 19:22:17,975 [main] INFO  org.apache.pig.Main - Pig script completed in 1 minute, 42 seconds and 839 milliseconds (102839 ms)\n", "\n", "</pre><br /><b>OUT:</b><br /><pre>\n", "ensemble: {x: int,v: chararray}\n", "keys: {x: int,v: chararray,key: int}\n", "ens_group: {group: int,keys: {(x: int,v: chararray,key: int)}}\n", "sampled: {weigth: long,datafu.pig.sampling.reservoirsample_keys_4::x: int,datafu.pig.sampling.reservoirsample_keys_4::v: chararray,datafu.pig.sampling.reservoirsample_keys_4::key: int}\n", "wsampled: {datafu.pig.sampling.weightedsample_sampled_12::weigth: long,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::x: int,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::v: chararray,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::key: int}\n", "\n", "</pre>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 31, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>name</th>\n", "      <th>last_modified</th>\n", "      <th>content_type</th>\n", "      <th>content_length</th>\n", "      <th>blob_type</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>axavier/sampling/sample_datafu_rs_dist2.txt</td>\n", "      <td>Sun, 15 Nov 2015 19:22:05 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>axavier/sampling/sample_datafu_rs_dist2.txt/_S...</td>\n", "      <td>Sun, 15 Nov 2015 19:22:06 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>axavier/sampling/sample_datafu_rs_dist2.txt/pa...</td>\n", "      <td>Sun, 15 Nov 2015 19:22:05 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>20770</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["                                                name  \\\n", "0        axavier/sampling/sample_datafu_rs_dist2.txt   \n", "1  axavier/sampling/sample_datafu_rs_dist2.txt/_S...   \n", "2  axavier/sampling/sample_datafu_rs_dist2.txt/pa...   \n", "\n", "                   last_modified              content_type  content_length  \\\n", "0  Sun, 15 Nov 2015 19:22:05 GMT                                         0   \n", "1  Sun, 15 Nov 2015 19:22:06 GMT  application/octet-stream               0   \n", "2  Sun, 15 Nov 2015 19:22:05 GMT  application/octet-stream           20770   \n", "\n", "   blob_type  \n", "0  BlockBlob  \n", "1  BlockBlob  \n", "2  BlockBlob  "]}, "execution_count": 32, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_datafu_rs_dist2"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": ["df = %blob_head /$PSEUDO/sampling/sample_datafu_rs_dist2.txt -m"]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>10001</th>\n", "      <th>21260</th>\n", "      <th>S21260</th>\n", "      <th>0</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>10000</td>\n", "      <td>25191</td>\n", "      <td>X25191</td>\n", "      <td>1</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>10000</td>\n", "      <td>73760</td>\n", "      <td>Y73760</td>\n", "      <td>0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>10000</td>\n", "      <td>90105</td>\n", "      <td>P90105</td>\n", "      <td>5</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>10000</td>\n", "      <td>46070</td>\n", "      <td>Y46070</td>\n", "      <td>0</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>10001</td>\n", "      <td>58590</td>\n", "      <td>M58590</td>\n", "      <td>0</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   10001  21260  S21260  0\n", "0  10000  25191  X25191  1\n", "1  10000  73760  Y73760  0\n", "2  10000  90105  P90105  5\n", "3  10000  46070  Y46070  0\n", "4  10001  58590  M58590  0"]}, "execution_count": 34, "metadata": {}, "output_type": "execute_result"}], "source": ["df.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## version distribu\u00e9e am\u00e9lior\u00e9e\n", "\n", "Le probl\u00e8me de la version pr\u00e9c\u00e9dente : chaque sous-ensemble trait\u00e9 d'un seul bloc utilise une s\u00e9quence de nombres al\u00e9atoires sur laquelle on ne conna\u00eet pas grand chose. Si les m\u00eames *seed* sont utilis\u00e9es, il est possible que les s\u00e9quences, m\u00eame si elles simulent le hasard, soient extr\u00eamement corr\u00e9l\u00e9es entre chaque bloc. Il faut rem\u00e9dier \u00e0 cela.\n", "\n", "Il faut \u00e9galement s'assurer que chaque bloc n'est pas *skewed*."]}, {"cell_type": "code", "execution_count": 34, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG_azure script_rs.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE MD5 datafu.pig.hash.MD5();\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "DEFINE WeightedSample datafu.pig.sampling.WeightedSample();\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", "        USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "\n", "ens_group = GROUP ensemble BY (x,v);\n", "DESCRIBE ens_group;\n", "\n", "compte_group = FOREACH ens_group \n", "            GENERATE group.x AS x, \n", "                     group.v AS v, \n", "                     COUNT(ensemble) AS nb_ligne ;\n", "DESCRIBE compte_group;\n", "\n", "hash_group = FOREACH compte_group \n", "                GENERATE x, v, nb_ligne,\n", "                        SUBSTRING(MD5(v), 0, 1) AS val;\n", "DESCRIBE hash_group;  \n", "\n", "group_hash = GROUP hash_group BY val ;\n", "DESCRIBE group_hash;\n", "\n", "rs_parall = FOREACH group_hash GENERATE\n", "                    COUNT(hash_group) AS nb_hash,\n", "                    FLATTEN(RS(hash_group)) ;\n", "DESCRIBE rs_parall;\n", "\n", "wsampled = FOREACH (GROUP rs_parall ALL) GENERATE FLATTEN(WeightedSample(rs_parall, 0, 1000));\n", "DESCRIBE wsampled;\n", "\n", "STORE wsampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_distributed_hash.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0244'}"]}, "execution_count": 36, "metadata": {}, "output_type": "execute_result"}], "source": ["jid=%hd_pig_submit script_rs.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0244', '100% complete', None, False, 'RUNNING')"]}, "execution_count": 37, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 37, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["<pre>\n", "2015-11-15 19:52:05,138 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:52:05,138 [main] INFO  org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:52:05,200 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:52:05,435 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2015-11-15 19:52:05,435 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:52:05,435 [main] INFO  org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:52:05,513 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:52:05,560 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "2015-11-15 19:52:05,607 [main] INFO  org.apache.pig.Main - Pig script completed in 2 minutes, 29 seconds and 962 milliseconds (149962 ms)\n", "\n", "</pre><br /><b>OUT:</b><br /><pre>\n", "ensemble: {x: int,v: chararray}\n", "ens_group: {group: (x: int,v: chararray),ensemble: {(x: int,v: chararray)}}\n", "compte_group: {x: int,v: chararray,nb_ligne: long}\n", "hash_group: {x: int,v: chararray,nb_ligne: long,val: chararray}\n", "group_hash: {group: chararray,hash_group: {(x: int,v: chararray,nb_ligne: long,val: chararray)}}\n", "rs_parall: {nb_hash: long,datafu.pig.sampling.reservoirsample_hash_group_4::x: int,datafu.pig.sampling.reservoirsample_hash_group_4::v: chararray,datafu.pig.sampling.reservoirsample_hash_group_4::nb_ligne: long,datafu.pig.sampling.reservoirsample_hash_group_4::val: chararray}\n", "wsampled: {datafu.pig.sampling.weightedsample_rs_parall_12::nb_hash: long,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::x: int,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::v: chararray,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::nb_ligne: long,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::val: chararray}\n", "\n", "</pre>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 38, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>name</th>\n", "      <th>last_modified</th>\n", "      <th>content_type</th>\n", "      <th>content_length</th>\n", "      <th>blob_type</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>axavier/sampling/sample_distributed_hash.txt</td>\n", "      <td>Sun, 15 Nov 2015 19:51:56 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>axavier/sampling/sample_distributed_hash.txt/_...</td>\n", "      <td>Sun, 15 Nov 2015 19:51:56 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>axavier/sampling/sample_distributed_hash.txt/p...</td>\n", "      <td>Sun, 15 Nov 2015 19:51:55 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>21750</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["                                                name  \\\n", "0       axavier/sampling/sample_distributed_hash.txt   \n", "1  axavier/sampling/sample_distributed_hash.txt/_...   \n", "2  axavier/sampling/sample_distributed_hash.txt/p...   \n", "\n", "                   last_modified              content_type  content_length  \\\n", "0  Sun, 15 Nov 2015 19:51:56 GMT                                         0   \n", "1  Sun, 15 Nov 2015 19:51:56 GMT  application/octet-stream               0   \n", "2  Sun, 15 Nov 2015 19:51:55 GMT  application/octet-stream           21750   \n", "\n", "   blob_type  \n", "0  BlockBlob  \n", "1  BlockBlob  \n", "2  BlockBlob  "]}, "execution_count": 39, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>6693</th>\n", "      <th>27244</th>\n", "      <th>W27244</th>\n", "      <th>1</th>\n", "      <th>6</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>6749</td>\n", "      <td>51104</td>\n", "      <td>O51104</td>\n", "      <td>1</td>\n", "      <td>1</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>6605</td>\n", "      <td>91527</td>\n", "      <td>H91527</td>\n", "      <td>1</td>\n", "      <td>6</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>6630</td>\n", "      <td>75027</td>\n", "      <td>R75027</td>\n", "      <td>1</td>\n", "      <td>4</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>6789</td>\n", "      <td>58148</td>\n", "      <td>M58148</td>\n", "      <td>1</td>\n", "      <td>1</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>6659</td>\n", "      <td>71659</td>\n", "      <td>D71659</td>\n", "      <td>1</td>\n", "      <td>5</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   6693  27244  W27244  1  6\n", "0  6749  51104  O51104  1  1\n", "1  6605  91527  H91527  1  6\n", "2  6630  75027  R75027  1  4\n", "3  6789  58148  M58148  1  1\n", "4  6659  71659  D71659  1  5"]}, "execution_count": 40, "metadata": {}, "output_type": "execute_result"}], "source": ["df =%blob_head /$PSEUDO/sampling/sample_distributed_hash.txt -m\n", "df.head()"]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [{"data": {"text/plain": ["'sample_distributed_hash.txt'"]}, "execution_count": 41, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_downmerge /$PSEUDO/sampling/sample_distributed_hash.txt sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [{"data": {"text/html": ["<pre>\n", "6693\t27244\tW27244\t1\t6\n", "6749\t51104\tO51104\t1\t1\n", "6605\t91527\tH91527\t1\t6\n", "6630\t75027\tR75027\t1\t4\n", "6789\t58148\tM58148\t1\t1\n", "6659\t71659\tD71659\t1\t5\n", "6811\t74380\tU74380\t1\t9\n", "6749\t20125\tB20125\t1\t2\n", "6587\t33466\tE33466\t1\t5\n", "6587\t21645\tN21645\t1\t5\n", "\n", "</pre>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 42, "metadata": {}, "output_type": "execute_result"}], "source": ["%head sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 42, "metadata": {"collapsed": true}, "outputs": [], "source": []}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## version java\n", "\n", "On s'inspire de l'exemple suivant [Sampling](http://datafu.incubator.apache.org/docs/datafu/guide/sampling.html).\n", " On t\u00e9l\u00e9charge [datafu 1.2](http://datafu.incubator.apache.org/docs/datafu/) depuis [Maven](http://mvnrepository.com/artifact/com.linkedin.datafu/datafu/1.2.0). Ce n'est pas la derni\u00e8re version mais suivre les  instructions pour *builder* datafu (voir [documentation](http://datafu.incubator.apache.org/docs/datafu/1.2.0/)). En particulier, la version pond\u00e9r\u00e9e du reservoir sampling n'est pas disponible (voir [history](https://github.com/apache/incubator-datafu/commits/master/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java), la version 1.2.0 est sorti en d\u00e9cembre 2013).\n", " \n", "L'impl\u00e9mentation [java](https://github.com/apache/incubator-datafu/blob/master/datafu-pig/src/main/java/datafu/pig/sampling/ReservoirSample.java) n'a pas l'air de r\u00e9soudre un probl\u00e8me qui peut survenir si la taille de l'\u00e9chantillon demand\u00e9e est trop grande. Voir section suivante."]}, {"cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [{"data": {"text/plain": ["'datafu-1.2.0.jar'"]}, "execution_count": 44, "metadata": {}, "output_type": "execute_result"}], "source": ["import pyensae.datasource\n", "pyensae.datasource.download_data(\"datafu-1.2.0.jar\", url=\"http://central.maven.org/maven2/com/linkedin/datafu/datafu/1.2.0/\")"]}, {"cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [{"data": {"text/plain": ["'$PSEUDO/sampling/datafu-1.2.0.jar'"]}, "execution_count": 45, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_up datafu-1.2.0.jar /$PSEUDO/sampling/datafu-1.2.0.jar"]}, {"cell_type": "code", "execution_count": 45, "metadata": {"collapsed": true}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>name</th>\n", "      <th>last_modified</th>\n", "      <th>content_type</th>\n", "      <th>content_length</th>\n", "      <th>blob_type</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>axavier/sampling/datafu-1.2.0.jar</td>\n", "      <td>Fri, 13 Nov 2015 00:03:49 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1600826</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>axavier/sampling/sample.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:02:50 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377780</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>axavier/sampling/sample2.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:35:55 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>axavier/sampling/sample3.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:39:40 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>axavier/sampling/sample4.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:41:49 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["                                name                  last_modified  \\\n", "0  axavier/sampling/datafu-1.2.0.jar  Fri, 13 Nov 2015 00:03:49 GMT   \n", "1        axavier/sampling/sample.txt  Fri, 13 Nov 2015 00:02:50 GMT   \n", "2       axavier/sampling/sample2.txt  Fri, 13 Nov 2015 00:35:55 GMT   \n", "3       axavier/sampling/sample3.txt  Fri, 13 Nov 2015 00:39:40 GMT   \n", "4       axavier/sampling/sample4.txt  Fri, 13 Nov 2015 00:41:49 GMT   \n", "\n", "               content_type  content_length  blob_type  \n", "0  application/octet-stream         1600826  BlockBlob  \n", "1  application/octet-stream         1377780  BlockBlob  \n", "2  application/octet-stream         1377793  BlockBlob  \n", "3  application/octet-stream         1377793  BlockBlob  \n", "4  application/octet-stream         1377793  BlockBlob  "]}, "execution_count": 46, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling"]}, {"cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": ["%%PIG_azure sample.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "\n", "dset = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", "        USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "sampled = FOREACH (GROUP dset ALL) GENERATE FLATTEN(RS(dset));\n", "STORE sampled INTO '$CONTAINER/$PSEUDO/sampling/out_sampled_rs4_2015.txt' USING PigStorage() ;"]}, {"cell_type": "code", "execution_count": 47, "metadata": {"collapsed": true}, "outputs": [], "source": ["jid = %hd_pig_submit sample.pig"]}, {"cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0136', None, None, False, 'RUNNING')"]}, "execution_count": 49, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "st[\"id\"],st[\"percentComplete\"],st[\"completed\"],st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"]"]}, {"cell_type": "code", "execution_count": 49, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["<pre>\n", "\n", "</pre><br />"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>name</th>\n", "      <th>last_modified</th>\n", "      <th>content_type</th>\n", "      <th>content_length</th>\n", "      <th>blob_type</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>axavier/sampling/datafu-1.2.0.jar</td>\n", "      <td>Fri, 13 Nov 2015 00:03:49 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1600826</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>axavier/sampling/out_sampled_rs4_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 01:08:22 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>axavier/sampling/out_sampled_rs4_2015.txt/_SUC...</td>\n", "      <td>Fri, 13 Nov 2015 01:08:22 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>axavier/sampling/out_sampled_rs4_2015.txt/part...</td>\n", "      <td>Fri, 13 Nov 2015 01:08:21 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>12785</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>axavier/sampling/sample.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:02:50 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377780</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>5</th>\n", "      <td>axavier/sampling/sample2.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:35:55 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>6</th>\n", "      <td>axavier/sampling/sample3.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:39:40 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>7</th>\n", "      <td>axavier/sampling/sample4.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:41:49 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1377793</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>8</th>\n", "      <td>axavier/sampling/sampled4_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:50:20 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>9</th>\n", "      <td>axavier/sampling/sampled4_2015.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 00:50:20 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>10</th>\n", "      <td>axavier/sampling/sampled4_2015.txt/part-m-00000</td>\n", "      <td>Fri, 13 Nov 2015 00:50:19 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1277794</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>11</th>\n", "      <td>axavier/sampling/sampled_rs4_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 01:04:51 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>12</th>\n", "      <td>axavier/sampling/sampled_rs4_2015.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 01:04:51 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>13</th>\n", "      <td>axavier/sampling/sampled_rs4_2015.txt/part-m-0...</td>\n", "      <td>Fri, 13 Nov 2015 01:04:50 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1277794</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>14</th>\n", "      <td>axavier/sampling/sampled_srs4_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:56:09 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>15</th>\n", "      <td>axavier/sampling/sampled_srs4_2015.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 00:56:09 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>16</th>\n", "      <td>axavier/sampling/sampled_srs4_2015.txt/part-m-...</td>\n", "      <td>Fri, 13 Nov 2015 00:56:09 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1277794</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>17</th>\n", "      <td>axavier/sampling/sampled_srs_2015.txt</td>\n", "      <td>Fri, 13 Nov 2015 00:52:34 GMT</td>\n", "      <td></td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>18</th>\n", "      <td>axavier/sampling/sampled_srs_2015.txt/_SUCCESS</td>\n", "      <td>Fri, 13 Nov 2015 00:52:34 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>0</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "    <tr>\n", "      <th>19</th>\n", "      <td>axavier/sampling/sampled_srs_2015.txt/part-m-0...</td>\n", "      <td>Fri, 13 Nov 2015 00:52:34 GMT</td>\n", "      <td>application/octet-stream</td>\n", "      <td>1277794</td>\n", "      <td>BlockBlob</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["                                                 name  \\\n", "0                   axavier/sampling/datafu-1.2.0.jar   \n", "1           axavier/sampling/out_sampled_rs4_2015.txt   \n", "2   axavier/sampling/out_sampled_rs4_2015.txt/_SUC...   \n", "3   axavier/sampling/out_sampled_rs4_2015.txt/part...   \n", "4                         axavier/sampling/sample.txt   \n", "5                        axavier/sampling/sample2.txt   \n", "6                        axavier/sampling/sample3.txt   \n", "7                        axavier/sampling/sample4.txt   \n", "8                  axavier/sampling/sampled4_2015.txt   \n", "9         axavier/sampling/sampled4_2015.txt/_SUCCESS   \n", "10    axavier/sampling/sampled4_2015.txt/part-m-00000   \n", "11              axavier/sampling/sampled_rs4_2015.txt   \n", "12     axavier/sampling/sampled_rs4_2015.txt/_SUCCESS   \n", "13  axavier/sampling/sampled_rs4_2015.txt/part-m-0...   \n", "14             axavier/sampling/sampled_srs4_2015.txt   \n", "15    axavier/sampling/sampled_srs4_2015.txt/_SUCCESS   \n", "16  axavier/sampling/sampled_srs4_2015.txt/part-m-...   \n", "17              axavier/sampling/sampled_srs_2015.txt   \n", "18     axavier/sampling/sampled_srs_2015.txt/_SUCCESS   \n", "19  axavier/sampling/sampled_srs_2015.txt/part-m-0...   \n", "\n", "                    last_modified              content_type  content_length  \\\n", "0   Fri, 13 Nov 2015 00:03:49 GMT  application/octet-stream         1600826   \n", "1   Fri, 13 Nov 2015 01:08:22 GMT                                         0   \n", "2   Fri, 13 Nov 2015 01:08:22 GMT  application/octet-stream               0   \n", "3   Fri, 13 Nov 2015 01:08:21 GMT  application/octet-stream           12785   \n", "4   Fri, 13 Nov 2015 00:02:50 GMT  application/octet-stream         1377780   \n", "5   Fri, 13 Nov 2015 00:35:55 GMT  application/octet-stream         1377793   \n", "6   Fri, 13 Nov 2015 00:39:40 GMT  application/octet-stream         1377793   \n", "7   Fri, 13 Nov 2015 00:41:49 GMT  application/octet-stream         1377793   \n", "8   Fri, 13 Nov 2015 00:50:20 GMT                                         0   \n", "9   Fri, 13 Nov 2015 00:50:20 GMT  application/octet-stream               0   \n", "10  Fri, 13 Nov 2015 00:50:19 GMT  application/octet-stream         1277794   \n", "11  Fri, 13 Nov 2015 01:04:51 GMT                                         0   \n", "12  Fri, 13 Nov 2015 01:04:51 GMT  application/octet-stream               0   \n", "13  Fri, 13 Nov 2015 01:04:50 GMT  application/octet-stream         1277794   \n", "14  Fri, 13 Nov 2015 00:56:09 GMT                                         0   \n", "15  Fri, 13 Nov 2015 00:56:09 GMT  application/octet-stream               0   \n", "16  Fri, 13 Nov 2015 00:56:09 GMT  application/octet-stream         1277794   \n", "17  Fri, 13 Nov 2015 00:52:34 GMT                                         0   \n", "18  Fri, 13 Nov 2015 00:52:34 GMT  application/octet-stream               0   \n", "19  Fri, 13 Nov 2015 00:52:34 GMT  application/octet-stream         1277794   \n", "\n", "    blob_type  \n", "0   BlockBlob  \n", "1   BlockBlob  \n", "2   BlockBlob  \n", "3   BlockBlob  \n", "4   BlockBlob  \n", "5   BlockBlob  \n", "6   BlockBlob  \n", "7   BlockBlob  \n", "8   BlockBlob  \n", "9   BlockBlob  \n", "10  BlockBlob  \n", "11  BlockBlob  \n", "12  BlockBlob  \n", "13  BlockBlob  \n", "14  BlockBlob  \n", "15  BlockBlob  \n", "16  BlockBlob  \n", "17  BlockBlob  \n", "18  BlockBlob  \n", "19  BlockBlob  "]}, "execution_count": 51, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling"]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [{"data": {"text/plain": ["'out_sampled_rs4_2015.txt'"]}, "execution_count": 52, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_downmerge /$PSEUDO/sampling/out_sampled_rs4_2015.txt out_sampled_rs4_2015.txt -o"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [{"data": {"text/html": ["<pre>\n", "90648\tM90648\n", "49678\tS49678\n", "41434\tQ41434\n", "30149\tP30149\n", "15836\tC15836\n", "61110\tK61110\n", "3838\tQ3838\n", "81515\tF81515\n", "48052\tE48052\n", "16332\tE16332\n", "\n", "</pre>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["%head out_sampled_rs4_2015.txt"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## fin"]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [{"data": {"text/plain": ["True"]}, "execution_count": 54, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_close"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## version avec it\u00e9rateur"]}, {"cell_type": "code", "execution_count": 54, "metadata": {"collapsed": true}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4"}}, "nbformat": 4, "nbformat_minor": 2}