{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 3A.mr - Reservoir Sampling distribu\u00e9 - \u00e9nonc\u00e9 - correction\n", "\n", "Correction."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["Plan\n", "
run previous cell, wait for 2 seconds
\n", ""], "text/plain": [""]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## cr\u00e9ation d'un fichier \u00e0 sampler"]}, {"cell_type": "code", "execution_count": 2, "metadata": {"collapsed": true}, "outputs": [], "source": ["with open(\"sample4.txt\", \"w\", encoding=\"utf8\") as f:\n", " for i in range(0,100000):\n", " f.write(\"{0}\\t{1}{0}\\n\".format(i, chr(i%26 + 65)))\n", " f.write(\"100001\\tAAAAAA\")"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "0\tA0\n", "1\tB1\n", "2\tC2\n", "3\tD3\n", "4\tE4\n", "5\tF5\n", "6\tG6\n", "7\tH7\n", "8\tI8\n", "9\tJ9\n", "\n", "
"], "text/plain": [""]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["%load_ext pyensae\n", "%head sample4.txt"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## connexion"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"data": {"text/plain": ["dict"]}, "execution_count": 5, "metadata": {}, "output_type": "execute_result"}], "source": ["import os\n", "blobhp = {}\n", "if \"HDCREDENTIALS\" in os.environ:\n", " blobhp[\"blob_storage\"], blobhp[\"password1\"], blobhp[\"hadoop_server\"], blobhp[\"password2\"], blobhp[\"username\"] = \\\n", " os.environ[\"HDCREDENTIALS\"].split(\"**\")\n", " r = type(blobhp)\n", "else:\n", " from pyquickhelper.ipythonhelper import open_html_form\n", " params={\"blob_storage\":\"\", \"password1\":\"\", \"hadoop_server\":\"\", \"password2\":\"\", \"username\":\"axavier\"}\n", " r = open_html_form(params=params,title=\"server + hadoop + credentials\", key_save=\"blobhp\")\n", "r"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["import pyensae\n", "%load_ext pyensae\n", "%load_ext pyenbc\n", "blobstorage = blobhp[\"blob_storage\"]\n", "blobpassword = blobhp[\"password1\"]\n", "hadoop_server = blobhp[\"hadoop_server\"]\n", "hadoop_password = blobhp[\"password2\"]\n", "username = blobhp[\"username\"]"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/plain": ["(,\n", " )"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["client, bs = %hd_open\n", "client, bs"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## upload du fichier"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"data": {"text/plain": ["'$PSEUDO/sampling/sample4.txt'"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_up sample4.txt /$PSEUDO/sampling/sample4.txt"]}, {"cell_type": "code", "execution_count": 
8, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelast_modifiedcontent_typecontent_lengthblob_type
0axavier/sampling/datafu-1.2.0.jarFri, 13 Nov 2015 00:03:49 GMTapplication/octet-stream1600826BlockBlob
1axavier/sampling/out_sampled_rs4_2015.txtFri, 13 Nov 2015 01:08:22 GMT0BlockBlob
2axavier/sampling/out_sampled_rs4_2015.txt/_SUC...Fri, 13 Nov 2015 01:08:22 GMTapplication/octet-stream0BlockBlob
3axavier/sampling/out_sampled_rs4_2015.txt/part...Fri, 13 Nov 2015 01:08:21 GMTapplication/octet-stream12785BlockBlob
4axavier/sampling/sample.txtFri, 13 Nov 2015 00:02:50 GMTapplication/octet-stream1377780BlockBlob
5axavier/sampling/sample2.txtFri, 13 Nov 2015 00:35:55 GMTapplication/octet-stream1377793BlockBlob
6axavier/sampling/sample3.txtFri, 13 Nov 2015 00:39:40 GMTapplication/octet-stream1377793BlockBlob
7axavier/sampling/sample4.txtSun, 15 Nov 2015 12:24:22 GMTapplication/octet-stream1377793BlockBlob
8axavier/sampling/sample4_hash.txtFri, 13 Nov 2015 14:50:39 GMT0BlockBlob
9axavier/sampling/sample4_hash.txt/_SUCCESSFri, 13 Nov 2015 14:50:39 GMTapplication/octet-stream0BlockBlob
10axavier/sampling/sample4_hash.txt/part-r-00000Fri, 13 Nov 2015 14:50:38 GMTapplication/octet-stream4771358BlockBlob
11axavier/sampling/sampled4_2015.txtFri, 13 Nov 2015 00:50:20 GMT0BlockBlob
12axavier/sampling/sampled4_2015.txt/_SUCCESSFri, 13 Nov 2015 00:50:20 GMTapplication/octet-stream0BlockBlob
13axavier/sampling/sampled4_2015.txt/part-m-00000Fri, 13 Nov 2015 00:50:19 GMTapplication/octet-stream1277794BlockBlob
14axavier/sampling/sampled_rs4_2015.txtFri, 13 Nov 2015 01:04:51 GMT0BlockBlob
15axavier/sampling/sampled_rs4_2015.txt/_SUCCESSFri, 13 Nov 2015 01:04:51 GMTapplication/octet-stream0BlockBlob
16axavier/sampling/sampled_rs4_2015.txt/part-m-0...Fri, 13 Nov 2015 01:04:50 GMTapplication/octet-stream1277794BlockBlob
17axavier/sampling/sampled_srs4_2015.txtFri, 13 Nov 2015 00:56:09 GMT0BlockBlob
18axavier/sampling/sampled_srs4_2015.txt/_SUCCESSFri, 13 Nov 2015 00:56:09 GMTapplication/octet-stream0BlockBlob
19axavier/sampling/sampled_srs4_2015.txt/part-m-...Fri, 13 Nov 2015 00:56:09 GMTapplication/octet-stream1277794BlockBlob
20axavier/sampling/sampled_srs_2015.txtFri, 13 Nov 2015 00:52:34 GMT0BlockBlob
21axavier/sampling/sampled_srs_2015.txt/_SUCCESSFri, 13 Nov 2015 00:52:34 GMTapplication/octet-stream0BlockBlob
22axavier/sampling/sampled_srs_2015.txt/part-m-0...Fri, 13 Nov 2015 00:52:34 GMTapplication/octet-stream1277794BlockBlob
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/datafu-1.2.0.jar \n", "1 axavier/sampling/out_sampled_rs4_2015.txt \n", "2 axavier/sampling/out_sampled_rs4_2015.txt/_SUC... \n", "3 axavier/sampling/out_sampled_rs4_2015.txt/part... \n", "4 axavier/sampling/sample.txt \n", "5 axavier/sampling/sample2.txt \n", "6 axavier/sampling/sample3.txt \n", "7 axavier/sampling/sample4.txt \n", "8 axavier/sampling/sample4_hash.txt \n", "9 axavier/sampling/sample4_hash.txt/_SUCCESS \n", "10 axavier/sampling/sample4_hash.txt/part-r-00000 \n", "11 axavier/sampling/sampled4_2015.txt \n", "12 axavier/sampling/sampled4_2015.txt/_SUCCESS \n", "13 axavier/sampling/sampled4_2015.txt/part-m-00000 \n", "14 axavier/sampling/sampled_rs4_2015.txt \n", "15 axavier/sampling/sampled_rs4_2015.txt/_SUCCESS \n", "16 axavier/sampling/sampled_rs4_2015.txt/part-m-0... \n", "17 axavier/sampling/sampled_srs4_2015.txt \n", "18 axavier/sampling/sampled_srs4_2015.txt/_SUCCESS \n", "19 axavier/sampling/sampled_srs4_2015.txt/part-m-... \n", "20 axavier/sampling/sampled_srs_2015.txt \n", "21 axavier/sampling/sampled_srs_2015.txt/_SUCCESS \n", "22 axavier/sampling/sampled_srs_2015.txt/part-m-0... 
\n", "\n", " last_modified content_type content_length \\\n", "0 Fri, 13 Nov 2015 00:03:49 GMT application/octet-stream 1600826 \n", "1 Fri, 13 Nov 2015 01:08:22 GMT 0 \n", "2 Fri, 13 Nov 2015 01:08:22 GMT application/octet-stream 0 \n", "3 Fri, 13 Nov 2015 01:08:21 GMT application/octet-stream 12785 \n", "4 Fri, 13 Nov 2015 00:02:50 GMT application/octet-stream 1377780 \n", "5 Fri, 13 Nov 2015 00:35:55 GMT application/octet-stream 1377793 \n", "6 Fri, 13 Nov 2015 00:39:40 GMT application/octet-stream 1377793 \n", "7 Sun, 15 Nov 2015 12:24:22 GMT application/octet-stream 1377793 \n", "8 Fri, 13 Nov 2015 14:50:39 GMT 0 \n", "9 Fri, 13 Nov 2015 14:50:39 GMT application/octet-stream 0 \n", "10 Fri, 13 Nov 2015 14:50:38 GMT application/octet-stream 4771358 \n", "11 Fri, 13 Nov 2015 00:50:20 GMT 0 \n", "12 Fri, 13 Nov 2015 00:50:20 GMT application/octet-stream 0 \n", "13 Fri, 13 Nov 2015 00:50:19 GMT application/octet-stream 1277794 \n", "14 Fri, 13 Nov 2015 01:04:51 GMT 0 \n", "15 Fri, 13 Nov 2015 01:04:51 GMT application/octet-stream 0 \n", "16 Fri, 13 Nov 2015 01:04:50 GMT application/octet-stream 1277794 \n", "17 Fri, 13 Nov 2015 00:56:09 GMT 0 \n", "18 Fri, 13 Nov 2015 00:56:09 GMT application/octet-stream 0 \n", "19 Fri, 13 Nov 2015 00:56:09 GMT application/octet-stream 1277794 \n", "20 Fri, 13 Nov 2015 00:52:34 GMT 0 \n", "21 Fri, 13 Nov 2015 00:52:34 GMT application/octet-stream 0 \n", "22 Fri, 13 Nov 2015 00:52:34 GMT application/octet-stream 1277794 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob \n", "3 BlockBlob \n", "4 BlockBlob \n", "5 BlockBlob \n", "6 BlockBlob \n", "7 BlockBlob \n", "8 BlockBlob \n", "9 BlockBlob \n", "10 BlockBlob \n", "11 BlockBlob \n", "12 BlockBlob \n", "13 BlockBlob \n", "14 BlockBlob \n", "15 BlockBlob \n", "16 BlockBlob \n", "17 BlockBlob \n", "18 BlockBlob \n", "19 BlockBlob \n", "20 BlockBlob \n", "21 BlockBlob \n", "22 BlockBlob "]}, "execution_count": 9, "metadata": {}, "output_type": 
"execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Code python pour le reservoir sampling"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/plain": ["['0a', '1b', '2c', '3d', '4e']"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["ensemble = [ \"%d%s\" % (i, chr(i%26 + 97)) for i in range(0,10000)]\n", "ensemble[:5]"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/plain": ["['8681x',\n", " '8356k',\n", " '5490e',\n", " '4405l',\n", " '5890o',\n", " '2689l',\n", " '8672o',\n", " '3603p',\n", " '8599t',\n", " '6086c']"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["import random\n", "def reservoir_sampling(ensemble, k):\n", " N = len(ensemble)\n", " echantillon = []\n", " for i, e in enumerate(ensemble):\n", " if len(echantillon) < k:\n", " echantillon.append(e)\n", " else:\n", " j = random.randint(0, i)\n", " if j < k:\n", " echantillon[j] = e\n", " return echantillon\n", "\n", "reservoir_sampling(ensemble, 10)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## python \u00e0 jython\n", "\n", "On s'assure que le code pr\u00e9c\u00e9dent fonctionne en jython (python compil\u00e9 en java). 
On s'inspire pour cela de la documentation [jython-udfs](https://pig.apache.org/docs/r0.12.0/udf.html#jython-udfs)."]}, {"cell_type": "markdown", "metadata": {}, "source": ["### On cr\u00e9e d'abord un script PIG pour r\u00e9cup\u00e9rer le schema et les premi\u00e8res lignes"]}, {"cell_type": "code", "execution_count": 11, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore.pig\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "ens_group = GROUP ensemble ALL;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE FLATTEN(ensemble);\n", "DESCRIBE sampled;\n", "\n", "--ens10 = LIMIT ensemble 10;\n", "--ens_group10 = LIMIT ens_group 10 ;\n", "--DUMP ens10;\n", "--DUMP ens_group10;"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Si la fonction suivante provoque une erreur ::\n", "\n", " AzureException: STATUS: 403, JSON: Expecting value: line 1 column 1 (char 0)\n", " \n", " unable to submit job: sample_explore.pig\n", " \n", "V\u00e9rifier les identifiants utilis\u00e9s pour se connecter."]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0185'}"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0185', None, None, False, 'RUNNING')"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "markdown", "metadata": {}, "source": ["La sortie standard contient les informations souhait\u00e9es :"]}, {"cell_type": "code", 
"execution_count": 14, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["
\n", "2015-11-15 12:33:06,608 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 12:33:06,608 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: wasb://clusterensaeazure1-1@hdblobstorage.blob.core.windows.net\n", "2015-11-15 12:33:08,233 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 12:33:09,374 [main] INFO  org.apache.pig.Main - Pig script completed in 4 seconds and 578 milliseconds (4578 ms)\n", "\n", "

OUT:
\n", "ensemble: {x: int,v: chararray}\n", "ens_group: {group: chararray,ensemble: {(x: int,v: chararray)}}\n", "sampled: {ensemble::x: int,ensemble::v: chararray}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 15, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 5"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Et la sortie du second dump :: \n", " \n", " (all,{(100001,AAAAAA),(99999,D99999),(99998,C99998)..."]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Le code Jython"]}, {"cell_type": "code", "execution_count": 15, "metadata": {"collapsed": true}, "outputs": [], "source": ["import pyensae"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": ["%%PYTHON reservoir_sampling.py\n", "\n", "import random\n", "\n", "@outputSchemaFunction(\"rsSchema\")\n", "def reservoir_sampling(ensemble):\n", " ensemble = eval(ensemble)\n", " k = 10\n", " N = len(ensemble)\n", " echantillon = []\n", " for i, e in enumerate(ensemble):\n", " if len(echantillon) < k:\n", " echantillon.append(e)\n", " else:\n", " j = random.randint(0, i)\n", " if j < k:\n", " echantillon[j] = e\n", " return echantillon\n", "\n", "@schemaFunction(\"rsSchema\")\n", "def rsSchema(input):\n", " return input"]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "[(99998, 'C99998'), (99999, 'D99999'), (100001, 'AAAAAA')]\n", "\n", "
"], "text/plain": [""]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["%%jython reservoir_sampling.py reservoir_sampling\n", "{(100001,\"AAAAAA\"),(99999,\"D99999\"),(99998,\"C99998\")}"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On ajoute le code jython au script pr\u00e9c\u00e9dent :"]}, {"cell_type": "code", "execution_count": 18, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_complete.pig\n", "\n", "REGISTER '$CONTAINER/$SCRIPTPIG/reservoir_sampling.py' using jython as myrs;\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "ens_group = GROUP ensemble ALL;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE FLATTEN(myrs.reservoir_sampling(ensemble));\n", "DESCRIBE sampled;\n", "\n", "STORE sampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_rs.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0229'}"]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_complete.pig -d reservoir_sampling.py\n", "jid"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0229', None, 'done', False, 'RUNNING')"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 21, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["<pre>
\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Trying ExecType : MAPREDUCE\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Picked MAPREDUCE as the ExecType\n", "2015-11-15 18:43:49,598 [main] INFO  org.apache.pig.Main - Apache Pig version 0.14.0.2.2.7.1-33 (r: unknown) compiled Oct 13 2015, 04:18:06\n", "2015-11-15 18:43:49,598 [main] INFO  org.apache.pig.Main - Logging error messages to: C:\\apps\\dist\\hadoop-2.6.0.2.2.7.1-33\\logs\\pig_1447613029598.log\n", "2015-11-15 18:43:50,848 [main] INFO  org.apache.pig.impl.util.Utils - Default bootup file D:\\Users\\hdp/.pigbootup not found\n", "2015-11-15 18:43:51,145 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address\n", "2015-11-15 18:43:51,145 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 18:43:51,145 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: wasb://clusterensaeazure1-1@hdblobstorage.blob.core.windows.net\n", "2015-11-15 18:43:51,879 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 18:43:52,192 [main] INFO  org.apache.pig.scripting.jython.JythonScriptEngine - created tmp python.cachedir=D:\\Users\\hdp\\AppData\\Local\\Temp\\pig_jython_3357684506669481882\n", "2015-11-15 18:43:54,817 [main] WARN  org.apache.pig.scripting.jython.JythonScriptEngine - pig.cmd.args.remainders is empty. This is not expected unless on testing.\n", "2015-11-15 18:43:57,645 [main] INFO  org.apache.pig.scripting.jython.JythonScriptEngine - Register scripting UDF: myrs.reservoir_sampling\n", "2015-11-15 18:43:58,535 [main] INFO  org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. 
Instead, use fs.defaultFS\n", "2015-11-15 18:43:59,660 [main] ERROR org.apache.pig.PigServer - exception during parsing: Error during parsing. Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "Failed to parse: Pig script failed to parse: \n", " Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:199)\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1735)\n", "\tat org.apache.pig.PigServer$Graph.access$000(PigServer.java:1443)\n", "\tat org.apache.pig.PigServer.parseAndBuild(PigServer.java:387)\n", "\tat org.apache.pig.tools.grunt.GruntParser.processDescribe(GruntParser.java:300)\n", "\tat org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:412)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:230)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:205)\n", "\tat org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:81)\n", "\tat org.apache.pig.Main.run(Main.java:495)\n", "\tat org.apache.pig.Main.main(Main.java:170)\n", "Caused by: \n", " Failed to generate logical plan. 
Nested exception: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1572)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.func_eval(LogicalPlanGenerator.java:9372)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.projectable_expr(LogicalPlanGenerator.java:11051)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.var_expr(LogicalPlanGenerator.java:10810)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.expr(LogicalPlanGenerator.java:10159)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_clause(LogicalPlanGenerator.java:7629)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_generated_item(LogicalPlanGenerator.java:7452)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.generate_clause(LogicalPlanGenerator.java:17590)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_plan(LogicalPlanGenerator.java:15982)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_clause(LogicalPlanGenerator.java:15849)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.op_clause(LogicalPlanGenerator.java:1933)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.general_statement(LogicalPlanGenerator.java:1102)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.statement(LogicalPlanGenerator.java:560)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.query(LogicalPlanGenerator.java:421)\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:191)\n", "\t... 
10 more\n", "Caused by: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.impl.PigContext.resolveClassName(PigContext.java:677)\n", "\tat org.apache.pig.impl.PigContext.getClassForAlias(PigContext.java:793)\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1569)\n", "\t... 24 more\n", "2015-11-15 18:43:59,707 [main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "2015-11-15 18:43:59,707 [main] ERROR org.apache.pig.tools.grunt.Grunt - org.apache.pig.impl.logicalLayer.FrontendException: ERROR 1000: Error during parsing. Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1748)\n", "\tat org.apache.pig.PigServer$Graph.access$000(PigServer.java:1443)\n", "\tat org.apache.pig.PigServer.parseAndBuild(PigServer.java:387)\n", "\tat org.apache.pig.tools.grunt.GruntParser.processDescribe(GruntParser.java:300)\n", "\tat org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:412)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:230)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:205)\n", "\tat org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:81)\n", "\tat org.apache.pig.Main.run(Main.java:495)\n", "\tat org.apache.pig.Main.main(Main.java:170)\n", "Caused by: Failed to parse: Pig script failed to parse: \n", " Failed to generate logical plan. 
Nested exception: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:199)\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1735)\n", "\t... 9 more\n", "Caused by: \n", " Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1572)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.func_eval(LogicalPlanGenerator.java:9372)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.projectable_expr(LogicalPlanGenerator.java:11051)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.var_expr(LogicalPlanGenerator.java:10810)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.expr(LogicalPlanGenerator.java:10159)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_clause(LogicalPlanGenerator.java:7629)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_generated_item(LogicalPlanGenerator.java:7452)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.generate_clause(LogicalPlanGenerator.java:17590)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_plan(LogicalPlanGenerator.java:15982)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_clause(LogicalPlanGenerator.java:15849)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.op_clause(LogicalPlanGenerator.java:1933)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.general_statement(LogicalPlanGenerator.java:1102)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.statement(LogicalPlanGenerator.java:560)\n", "\tat 
org.apache.pig.parser.LogicalPlanGenerator.query(LogicalPlanGenerator.java:421)\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:191)\n", "\t... 10 more\n", "Caused by: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.impl.PigContext.resolveClassName(PigContext.java:677)\n", "\tat org.apache.pig.impl.PigContext.getClassForAlias(PigContext.java:793)\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1569)\n", "\t... 24 more\n", "\n", "Details also at logfile: C:\\apps\\dist\\hadoop-2.6.0.2.2.7.1-33\\logs\\pig_1447613029598.log\n", "2015-11-15 18:43:59,754 [main] INFO  org.apache.pig.Main - Pig script completed in 10 seconds and 453 milliseconds (10453 ms)\n", "\n", "

OUT:
\n", "ensemble: {x: int,v: chararray}\n", "ens_group: {group: chararray,ensemble: {(x: int,v: chararray)}}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 22, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 100"]}, {"cell_type": "markdown", "metadata": {}, "source": ["A corriger plus tard. Dans l'imm\u00e9diat, on utilisera la librairie [datafu](https://datafu.incubator.apache.org/docs/datafu/guide/sampling.html). Si le cluster ne reconna\u00eet pas la librairie, voir la section java pour comprendre comment l'importer. On la d\u00e9clare dans le script par l'instruction ``REGISTER``."]}, {"cell_type": "code", "execution_count": 22, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_datafu.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "ens_group = GROUP ensemble ALL;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE FLATTEN(RS(ensemble));\n", "DESCRIBE sampled;\n", "\n", "STORE sampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_datafu_rs.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0193'}"]}, "execution_count": 24, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_datafu.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0193', '50% complete', None, False, 'RUNNING')"]}, "execution_count": 25, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 25, "metadata": {"scrolled": false}, "outputs": 
[{"data": {"text/html": ["
\n", "\n", "

"], "text/plain": [""]}, "execution_count": 26, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 100"]}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelast_modifiedcontent_typecontent_lengthblob_type
0axavier/sampling/sample_datafu_rs.txtSun, 15 Nov 2015 13:23:40 GMT0BlockBlob
1axavier/sampling/sample_datafu_rs.txt/_SUCCESSSun, 15 Nov 2015 13:23:40 GMTapplication/octet-stream0BlockBlob
2axavier/sampling/sample_datafu_rs.txt/part-r-0...Sun, 15 Nov 2015 13:23:38 GMTapplication/octet-stream12780BlockBlob
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/sample_datafu_rs.txt \n", "1 axavier/sampling/sample_datafu_rs.txt/_SUCCESS \n", "2 axavier/sampling/sample_datafu_rs.txt/part-r-0... \n", "\n", " last_modified content_type content_length \\\n", "0 Sun, 15 Nov 2015 13:23:40 GMT 0 \n", "1 Sun, 15 Nov 2015 13:23:40 GMT application/octet-stream 0 \n", "2 Sun, 15 Nov 2015 13:23:38 GMT application/octet-stream 12780 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob "]}, "execution_count": 27, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_datafu"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## version distribu\u00e9e\n", "\n", "Astuce : on distribue puis on recombine les \u00e9chantillons en faisant un dernier reservoir sampling mais pond\u00e9r\u00e9. Comment distribuer ? Le second sampling est remplac\u00e9 par une m\u00e9thode d'\u00e9chantillonnage classique car le reservoir sampling pond\u00e9r\u00e9 n'est pas disponible dans la librairie datafu version 1.2.0."]}, {"cell_type": "code", "execution_count": 27, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_datafu_dist.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "DEFINE WeightedSample datafu.pig.sampling.WeightedSample();\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "keys = FOREACH ensemble GENERATE x, v, x%10 AS key;\n", "DESCRIBE keys;\n", "ens_group = GROUP keys BY key ;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE COUNT(keys) AS weight, FLATTEN(RS(keys));\n", "DESCRIBE sampled;\n", "wsampled = FOREACH (GROUP sampled ALL) GENERATE FLATTEN(WeightedSample(sampled, 0, 1000));\n", "DESCRIBE wsampled;\n", "\n", "STORE wsampled \n", "INTO 
'$CONTAINER/$PSEUDO/sampling/sample_datafu_rs_dist2.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0238'}"]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_datafu_dist.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0238', '100% complete', 'done', True, 'SUCCEEDED')"]}, "execution_count": 30, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 30, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["
\n", "2015-11-15 19:22:17,553 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:22:17,553 [main] INFO  org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:22:17,615 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:22:17,803 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2015-11-15 19:22:17,803 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:22:17,803 [main] INFO  org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:22:17,865 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:22:17,943 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "2015-11-15 19:22:17,975 [main] INFO  org.apache.pig.Main - Pig script completed in 1 minute, 42 seconds and 839 milliseconds (102839 ms)\n", "\n", "

OUT:
\n", "ensemble: {x: int,v: chararray}\n", "keys: {x: int,v: chararray,key: int}\n", "ens_group: {group: int,keys: {(x: int,v: chararray,key: int)}}\n", "sampled: {weigth: long,datafu.pig.sampling.reservoirsample_keys_4::x: int,datafu.pig.sampling.reservoirsample_keys_4::v: chararray,datafu.pig.sampling.reservoirsample_keys_4::key: int}\n", "wsampled: {datafu.pig.sampling.weightedsample_sampled_12::weigth: long,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::x: int,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::v: chararray,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::key: int}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 31, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelast_modifiedcontent_typecontent_lengthblob_type
0axavier/sampling/sample_datafu_rs_dist2.txtSun, 15 Nov 2015 19:22:05 GMT0BlockBlob
1axavier/sampling/sample_datafu_rs_dist2.txt/_S...Sun, 15 Nov 2015 19:22:06 GMTapplication/octet-stream0BlockBlob
2axavier/sampling/sample_datafu_rs_dist2.txt/pa...Sun, 15 Nov 2015 19:22:05 GMTapplication/octet-stream20770BlockBlob
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/sample_datafu_rs_dist2.txt \n", "1 axavier/sampling/sample_datafu_rs_dist2.txt/_S... \n", "2 axavier/sampling/sample_datafu_rs_dist2.txt/pa... \n", "\n", " last_modified content_type content_length \\\n", "0 Sun, 15 Nov 2015 19:22:05 GMT 0 \n", "1 Sun, 15 Nov 2015 19:22:06 GMT application/octet-stream 0 \n", "2 Sun, 15 Nov 2015 19:22:05 GMT application/octet-stream 20770 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob "]}, "execution_count": 32, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_datafu_rs_dist2"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": ["df = %blob_head /$PSEUDO/sampling/sample_datafu_rs_dist2.txt -m"]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1000121260S212600
01000025191X251911
11000073760Y737600
21000090105P901055
31000046070Y460700
41000158590M585900
\n", "
"], "text/plain": [" 10001 21260 S21260 0\n", "0 10000 25191 X25191 1\n", "1 10000 73760 Y73760 0\n", "2 10000 90105 P90105 5\n", "3 10000 46070 Y46070 0\n", "4 10001 58590 M58590 0"]}, "execution_count": 34, "metadata": {}, "output_type": "execute_result"}], "source": ["df.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## version distribu\u00e9e am\u00e9lior\u00e9e\n", "\n", "Le probl\u00e8me de la version pr\u00e9c\u00e9dente : chaque sous-ensemble trait\u00e9 d'un seul bloc utilise une s\u00e9quence de nombres al\u00e9atoires sur laquelle on ne conna\u00eet pas grand chose. Si les m\u00eames *seed* sont utilis\u00e9es, il est possible que les s\u00e9quences, m\u00eame si elles simulent le hasard, soient extr\u00eamement corr\u00e9l\u00e9es entre chaque bloc. Il faut rem\u00e9dier \u00e0 cela.\n", "\n", "Il faut \u00e9galement s'assurer que chaque bloc n'est pas *skewed*."]}, {"cell_type": "code", "execution_count": 34, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG_azure script_rs.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE MD5 datafu.pig.hash.MD5();\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "DEFINE WeightedSample datafu.pig.sampling.WeightedSample();\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "\n", "ens_group = GROUP ensemble BY (x,v);\n", "DESCRIBE ens_group;\n", "\n", "compte_group = FOREACH ens_group \n", " GENERATE group.x AS x, \n", " group.v AS v, \n", " COUNT(ensemble) AS nb_ligne ;\n", "DESCRIBE compte_group;\n", "\n", "hash_group = FOREACH compte_group \n", " GENERATE x, v, nb_ligne,\n", " SUBSTRING(MD5(v), 0, 1) AS val;\n", "DESCRIBE hash_group; \n", "\n", "group_hash = GROUP hash_group BY val ;\n", "DESCRIBE group_hash;\n", "\n", "rs_parall = FOREACH group_hash GENERATE\n", " COUNT(hash_group) AS nb_hash,\n", " FLATTEN(RS(hash_group)) ;\n", 
"DESCRIBE rs_parall;\n", "\n", "wsampled = FOREACH (GROUP rs_parall ALL) GENERATE FLATTEN(WeightedSample(rs_parall, 0, 1000));\n", "DESCRIBE wsampled;\n", "\n", "STORE wsampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_distributed_hash.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0244'}"]}, "execution_count": 36, "metadata": {}, "output_type": "execute_result"}], "source": ["jid=%hd_pig_submit script_rs.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0244', '100% complete', None, False, 'RUNNING')"]}, "execution_count": 37, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 37, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["
\n", "2015-11-15 19:52:05,138 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:52:05,138 [main] INFO  org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:52:05,200 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:52:05,435 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2015-11-15 19:52:05,435 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:52:05,435 [main] INFO  org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:52:05,513 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:52:05,560 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "2015-11-15 19:52:05,607 [main] INFO  org.apache.pig.Main - Pig script completed in 2 minutes, 29 seconds and 962 milliseconds (149962 ms)\n", "\n", "

OUT:
\n", "ensemble: {x: int,v: chararray}\n", "ens_group: {group: (x: int,v: chararray),ensemble: {(x: int,v: chararray)}}\n", "compte_group: {x: int,v: chararray,nb_ligne: long}\n", "hash_group: {x: int,v: chararray,nb_ligne: long,val: chararray}\n", "group_hash: {group: chararray,hash_group: {(x: int,v: chararray,nb_ligne: long,val: chararray)}}\n", "rs_parall: {nb_hash: long,datafu.pig.sampling.reservoirsample_hash_group_4::x: int,datafu.pig.sampling.reservoirsample_hash_group_4::v: chararray,datafu.pig.sampling.reservoirsample_hash_group_4::nb_ligne: long,datafu.pig.sampling.reservoirsample_hash_group_4::val: chararray}\n", "wsampled: {datafu.pig.sampling.weightedsample_rs_parall_12::nb_hash: long,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::x: int,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::v: chararray,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::nb_ligne: long,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::val: chararray}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 38, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelast_modifiedcontent_typecontent_lengthblob_type
0axavier/sampling/sample_distributed_hash.txtSun, 15 Nov 2015 19:51:56 GMT0BlockBlob
1axavier/sampling/sample_distributed_hash.txt/_...Sun, 15 Nov 2015 19:51:56 GMTapplication/octet-stream0BlockBlob
2axavier/sampling/sample_distributed_hash.txt/p...Sun, 15 Nov 2015 19:51:55 GMTapplication/octet-stream21750BlockBlob
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/sample_distributed_hash.txt \n", "1 axavier/sampling/sample_distributed_hash.txt/_... \n", "2 axavier/sampling/sample_distributed_hash.txt/p... \n", "\n", " last_modified content_type content_length \\\n", "0 Sun, 15 Nov 2015 19:51:56 GMT 0 \n", "1 Sun, 15 Nov 2015 19:51:56 GMT application/octet-stream 0 \n", "2 Sun, 15 Nov 2015 19:51:55 GMT application/octet-stream 21750 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob "]}, "execution_count": 39, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
669327244W2724416
0674951104O5110411
1660591527H9152716
2663075027R7502714
3678958148M5814811
4665971659D7165915
\n", "
"], "text/plain": [" 6693 27244 W27244 1 6\n", "0 6749 51104 O51104 1 1\n", "1 6605 91527 H91527 1 6\n", "2 6630 75027 R75027 1 4\n", "3 6789 58148 M58148 1 1\n", "4 6659 71659 D71659 1 5"]}, "execution_count": 40, "metadata": {}, "output_type": "execute_result"}], "source": ["df =%blob_head /$PSEUDO/sampling/sample_distributed_hash.txt -m\n", "df.head()"]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [{"data": {"text/plain": ["'sample_distributed_hash.txt'"]}, "execution_count": 41, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_downmerge /$PSEUDO/sampling/sample_distributed_hash.txt sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "6693\t27244\tW27244\t1\t6\n", "6749\t51104\tO51104\t1\t1\n", "6605\t91527\tH91527\t1\t6\n", "6630\t75027\tR75027\t1\t4\n", "6789\t58148\tM58148\t1\t1\n", "6659\t71659\tD71659\t1\t5\n", "6811\t74380\tU74380\t1\t9\n", "6749\t20125\tB20125\t1\t2\n", "6587\t33466\tE33466\t1\t5\n", "6587\t21645\tN21645\t1\t5\n", "\n", "
"], "text/plain": [""]}, "execution_count": 42, "metadata": {}, "output_type": "execute_result"}], "source": ["%head sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 42, "metadata": {"collapsed": true}, "outputs": [], "source": []}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## version java\n", "\n", "On s'inspire de l'exemple suivant [Sampling](http://datafu.incubator.apache.org/docs/datafu/guide/sampling.html).\n", " On t\u00e9l\u00e9charge [datafu 1.2](http://datafu.incubator.apache.org/docs/datafu/) depuis [Maven](http://mvnrepository.com/artifact/com.linkedin.datafu/datafu/1.2.0). Ce n'est pas la derni\u00e8re version mais suivre les instructions pour *builder* datafu (voir [documentation](http://datafu.incubator.apache.org/docs/datafu/1.2.0/)). En particulier, la version pond\u00e9r\u00e9e du reservoir sampling n'est pas disponible (voir [history](https://github.com/apache/incubator-datafu/commits/master/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java), la version 1.2.0 est sorti en d\u00e9cembre 2013).\n", " \n", "L'impl\u00e9mentation [java](https://github.com/apache/incubator-datafu/blob/master/datafu-pig/src/main/java/datafu/pig/sampling/ReservoirSample.java) n'a pas l'air de r\u00e9soudre un probl\u00e8me qui peut survenir si la taille de l'\u00e9chantillon demand\u00e9e est trop grande. 
Voir section suivante."]}, {"cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [{"data": {"text/plain": ["'datafu-1.2.0.jar'"]}, "execution_count": 44, "metadata": {}, "output_type": "execute_result"}], "source": ["import pyensae.datasource\n", "pyensae.datasource.download_data(\"datafu-1.2.0.jar\", url=\"http://central.maven.org/maven2/com/linkedin/datafu/datafu/1.2.0/\")"]}, {"cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [{"data": {"text/plain": ["'$PSEUDO/sampling/datafu-1.2.0.jar'"]}, "execution_count": 45, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_up datafu-1.2.0.jar /$PSEUDO/sampling/datafu-1.2.0.jar"]}, {"cell_type": "code", "execution_count": 45, "metadata": {"collapsed": true}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelast_modifiedcontent_typecontent_lengthblob_type
0axavier/sampling/datafu-1.2.0.jarFri, 13 Nov 2015 00:03:49 GMTapplication/octet-stream1600826BlockBlob
1axavier/sampling/sample.txtFri, 13 Nov 2015 00:02:50 GMTapplication/octet-stream1377780BlockBlob
2axavier/sampling/sample2.txtFri, 13 Nov 2015 00:35:55 GMTapplication/octet-stream1377793BlockBlob
3axavier/sampling/sample3.txtFri, 13 Nov 2015 00:39:40 GMTapplication/octet-stream1377793BlockBlob
4axavier/sampling/sample4.txtFri, 13 Nov 2015 00:41:49 GMTapplication/octet-stream1377793BlockBlob
\n", "
"], "text/plain": [" name last_modified \\\n", "0 axavier/sampling/datafu-1.2.0.jar Fri, 13 Nov 2015 00:03:49 GMT \n", "1 axavier/sampling/sample.txt Fri, 13 Nov 2015 00:02:50 GMT \n", "2 axavier/sampling/sample2.txt Fri, 13 Nov 2015 00:35:55 GMT \n", "3 axavier/sampling/sample3.txt Fri, 13 Nov 2015 00:39:40 GMT \n", "4 axavier/sampling/sample4.txt Fri, 13 Nov 2015 00:41:49 GMT \n", "\n", " content_type content_length blob_type \n", "0 application/octet-stream 1600826 BlockBlob \n", "1 application/octet-stream 1377780 BlockBlob \n", "2 application/octet-stream 1377793 BlockBlob \n", "3 application/octet-stream 1377793 BlockBlob \n", "4 application/octet-stream 1377793 BlockBlob "]}, "execution_count": 46, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling"]}, {"cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": ["%%PIG_azure sample.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "\n", "dset = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "sampled = FOREACH (GROUP dset ALL) GENERATE FLATTEN(RS(dset));\n", "STORE sampled INTO '$CONTAINER/$PSEUDO/sampling/out_sampled_rs4_2015.txt' USING PigStorage() ;"]}, {"cell_type": "code", "execution_count": 47, "metadata": {"collapsed": true}, "outputs": [], "source": ["jid = %hd_pig_submit sample.pig"]}, {"cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0136', None, None, False, 'RUNNING')"]}, "execution_count": 49, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "st[\"id\"],st[\"percentComplete\"],st[\"completed\"],st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"]"]}, {"cell_type": "code", "execution_count": 49, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": 
["
\n", "\n", "

"], "text/plain": [""]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namelast_modifiedcontent_typecontent_lengthblob_type
0axavier/sampling/datafu-1.2.0.jarFri, 13 Nov 2015 00:03:49 GMTapplication/octet-stream1600826BlockBlob
1axavier/sampling/out_sampled_rs4_2015.txtFri, 13 Nov 2015 01:08:22 GMT0BlockBlob
2axavier/sampling/out_sampled_rs4_2015.txt/_SUC...Fri, 13 Nov 2015 01:08:22 GMTapplication/octet-stream0BlockBlob
3axavier/sampling/out_sampled_rs4_2015.txt/part...Fri, 13 Nov 2015 01:08:21 GMTapplication/octet-stream12785BlockBlob
4axavier/sampling/sample.txtFri, 13 Nov 2015 00:02:50 GMTapplication/octet-stream1377780BlockBlob
5axavier/sampling/sample2.txtFri, 13 Nov 2015 00:35:55 GMTapplication/octet-stream1377793BlockBlob
6axavier/sampling/sample3.txtFri, 13 Nov 2015 00:39:40 GMTapplication/octet-stream1377793BlockBlob
7axavier/sampling/sample4.txtFri, 13 Nov 2015 00:41:49 GMTapplication/octet-stream1377793BlockBlob
8axavier/sampling/sampled4_2015.txtFri, 13 Nov 2015 00:50:20 GMT0BlockBlob
9axavier/sampling/sampled4_2015.txt/_SUCCESSFri, 13 Nov 2015 00:50:20 GMTapplication/octet-stream0BlockBlob
10axavier/sampling/sampled4_2015.txt/part-m-00000Fri, 13 Nov 2015 00:50:19 GMTapplication/octet-stream1277794BlockBlob
11axavier/sampling/sampled_rs4_2015.txtFri, 13 Nov 2015 01:04:51 GMT0BlockBlob
12axavier/sampling/sampled_rs4_2015.txt/_SUCCESSFri, 13 Nov 2015 01:04:51 GMTapplication/octet-stream0BlockBlob
13axavier/sampling/sampled_rs4_2015.txt/part-m-0...Fri, 13 Nov 2015 01:04:50 GMTapplication/octet-stream1277794BlockBlob
14axavier/sampling/sampled_srs4_2015.txtFri, 13 Nov 2015 00:56:09 GMT0BlockBlob
15axavier/sampling/sampled_srs4_2015.txt/_SUCCESSFri, 13 Nov 2015 00:56:09 GMTapplication/octet-stream0BlockBlob
16axavier/sampling/sampled_srs4_2015.txt/part-m-...Fri, 13 Nov 2015 00:56:09 GMTapplication/octet-stream1277794BlockBlob
17axavier/sampling/sampled_srs_2015.txtFri, 13 Nov 2015 00:52:34 GMT0BlockBlob
18axavier/sampling/sampled_srs_2015.txt/_SUCCESSFri, 13 Nov 2015 00:52:34 GMTapplication/octet-stream0BlockBlob
19axavier/sampling/sampled_srs_2015.txt/part-m-0...Fri, 13 Nov 2015 00:52:34 GMTapplication/octet-stream1277794BlockBlob
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/datafu-1.2.0.jar \n", "1 axavier/sampling/out_sampled_rs4_2015.txt \n", "2 axavier/sampling/out_sampled_rs4_2015.txt/_SUC... \n", "3 axavier/sampling/out_sampled_rs4_2015.txt/part... \n", "4 axavier/sampling/sample.txt \n", "5 axavier/sampling/sample2.txt \n", "6 axavier/sampling/sample3.txt \n", "7 axavier/sampling/sample4.txt \n", "8 axavier/sampling/sampled4_2015.txt \n", "9 axavier/sampling/sampled4_2015.txt/_SUCCESS \n", "10 axavier/sampling/sampled4_2015.txt/part-m-00000 \n", "11 axavier/sampling/sampled_rs4_2015.txt \n", "12 axavier/sampling/sampled_rs4_2015.txt/_SUCCESS \n", "13 axavier/sampling/sampled_rs4_2015.txt/part-m-0... \n", "14 axavier/sampling/sampled_srs4_2015.txt \n", "15 axavier/sampling/sampled_srs4_2015.txt/_SUCCESS \n", "16 axavier/sampling/sampled_srs4_2015.txt/part-m-... \n", "17 axavier/sampling/sampled_srs_2015.txt \n", "18 axavier/sampling/sampled_srs_2015.txt/_SUCCESS \n", "19 axavier/sampling/sampled_srs_2015.txt/part-m-0... 
\n", "\n", " last_modified content_type content_length \\\n", "0 Fri, 13 Nov 2015 00:03:49 GMT application/octet-stream 1600826 \n", "1 Fri, 13 Nov 2015 01:08:22 GMT 0 \n", "2 Fri, 13 Nov 2015 01:08:22 GMT application/octet-stream 0 \n", "3 Fri, 13 Nov 2015 01:08:21 GMT application/octet-stream 12785 \n", "4 Fri, 13 Nov 2015 00:02:50 GMT application/octet-stream 1377780 \n", "5 Fri, 13 Nov 2015 00:35:55 GMT application/octet-stream 1377793 \n", "6 Fri, 13 Nov 2015 00:39:40 GMT application/octet-stream 1377793 \n", "7 Fri, 13 Nov 2015 00:41:49 GMT application/octet-stream 1377793 \n", "8 Fri, 13 Nov 2015 00:50:20 GMT 0 \n", "9 Fri, 13 Nov 2015 00:50:20 GMT application/octet-stream 0 \n", "10 Fri, 13 Nov 2015 00:50:19 GMT application/octet-stream 1277794 \n", "11 Fri, 13 Nov 2015 01:04:51 GMT 0 \n", "12 Fri, 13 Nov 2015 01:04:51 GMT application/octet-stream 0 \n", "13 Fri, 13 Nov 2015 01:04:50 GMT application/octet-stream 1277794 \n", "14 Fri, 13 Nov 2015 00:56:09 GMT 0 \n", "15 Fri, 13 Nov 2015 00:56:09 GMT application/octet-stream 0 \n", "16 Fri, 13 Nov 2015 00:56:09 GMT application/octet-stream 1277794 \n", "17 Fri, 13 Nov 2015 00:52:34 GMT 0 \n", "18 Fri, 13 Nov 2015 00:52:34 GMT application/octet-stream 0 \n", "19 Fri, 13 Nov 2015 00:52:34 GMT application/octet-stream 1277794 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob \n", "3 BlockBlob \n", "4 BlockBlob \n", "5 BlockBlob \n", "6 BlockBlob \n", "7 BlockBlob \n", "8 BlockBlob \n", "9 BlockBlob \n", "10 BlockBlob \n", "11 BlockBlob \n", "12 BlockBlob \n", "13 BlockBlob \n", "14 BlockBlob \n", "15 BlockBlob \n", "16 BlockBlob \n", "17 BlockBlob \n", "18 BlockBlob \n", "19 BlockBlob "]}, "execution_count": 51, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling"]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [{"data": {"text/plain": ["'out_sampled_rs4_2015.txt'"]}, "execution_count": 52, "metadata": {}, "output_type": 
"execute_result"}], "source": ["%blob_downmerge /$PSEUDO/sampling/out_sampled_rs4_2015.txt out_sampled_rs4_2015.txt -o"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", "90648\tM90648\n", "49678\tS49678\n", "41434\tQ41434\n", "30149\tP30149\n", "15836\tC15836\n", "61110\tK61110\n", "3838\tQ3838\n", "81515\tF81515\n", "48052\tE48052\n", "16332\tE16332\n", "\n", "
"], "text/plain": [""]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["%head out_sampled_rs4_2015.txt"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## fin"]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [{"data": {"text/plain": ["True"]}, "execution_count": 54, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_close"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## version avec it\u00e9rateur"]}, {"cell_type": "code", "execution_count": 54, "metadata": {"collapsed": true}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4"}}, "nbformat": 4, "nbformat_minor": 2}