"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["%%jython reservoir_sampling.py reservoir_sampling\n", "{(100001,\"AAAAAA\"),(99999,\"D99999\"),(99998,\"C99998\")}"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On ajoute le code jython au script pr\u00e9c\u00e9dent :"]}, {"cell_type": "code", "execution_count": 18, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_complete.pig\n", "\n", "REGISTER '$CONTAINER/$SCRIPTPIG/reservoir_sampling.py' using jython as myrs;\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "ens_group = GROUP ensemble ALL;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE FLATTEN(myrs.reservoir_sample(ensemble));\n", "DESCRIBE sampled;\n", "\n", "STORE sampled INTO \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_rs.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0229'}"]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_complete.pig -d reservoir_sampling.py\n", "jid"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0229', None, 'done', False, 'RUNNING')"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 21, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Trying ExecType : MAPREDUCE\n", "15/11/15 18:43:49 INFO pig.ExecTypeProvider: Picked MAPREDUCE as the ExecType\n", "2015-11-15 18:43:49,598 [main] INFO org.apache.pig.Main - Apache Pig version 0.14.0.2.2.7.1-33 (r: unknown) compiled Oct 13 2015, 04:18:06\n", "2015-11-15 18:43:49,598 [main] INFO org.apache.pig.Main - Logging error messages to: C:\\apps\\dist\\hadoop-2.6.0.2.2.7.1-33\\logs\\pig_1447613029598.log\n", "2015-11-15 18:43:50,848 [main] INFO org.apache.pig.impl.util.Utils - Default bootup file D:\\Users\\hdp/.pigbootup not found\n", "2015-11-15 18:43:51,145 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address\n", "2015-11-15 18:43:51,145 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 18:43:51,145 [main] INFO org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: wasb://clusterensaeazure1-1@hdblobstorage.blob.core.windows.net\n", "2015-11-15 18:43:51,879 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 18:43:52,192 [main] INFO org.apache.pig.scripting.jython.JythonScriptEngine - created tmp python.cachedir=D:\\Users\\hdp\\AppData\\Local\\Temp\\pig_jython_3357684506669481882\n", "2015-11-15 18:43:54,817 [main] WARN org.apache.pig.scripting.jython.JythonScriptEngine - pig.cmd.args.remainders is empty. 
This is not expected unless on testing.\n", "2015-11-15 18:43:57,645 [main] INFO org.apache.pig.scripting.jython.JythonScriptEngine - Register scripting UDF: myrs.reservoir_sampling\n", "2015-11-15 18:43:58,535 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-11-15 18:43:59,660 [main] ERROR org.apache.pig.PigServer - exception during parsing: Error during parsing. Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "Failed to parse: Pig script failed to parse: \n", " Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:199)\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1735)\n", "\tat org.apache.pig.PigServer$Graph.access$000(PigServer.java:1443)\n", "\tat org.apache.pig.PigServer.parseAndBuild(PigServer.java:387)\n", "\tat org.apache.pig.tools.grunt.GruntParser.processDescribe(GruntParser.java:300)\n", "\tat org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:412)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:230)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:205)\n", "\tat org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:81)\n", "\tat org.apache.pig.Main.run(Main.java:495)\n", "\tat org.apache.pig.Main.main(Main.java:170)\n", "Caused by: \n", " Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1572)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.func_eval(LogicalPlanGenerator.java:9372)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.projectable_expr(LogicalPlanGenerator.java:11051)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.var_expr(LogicalPlanGenerator.java:10810)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.expr(LogicalPlanGenerator.java:10159)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_clause(LogicalPlanGenerator.java:7629)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_generated_item(LogicalPlanGenerator.java:7452)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.generate_clause(LogicalPlanGenerator.java:17590)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_plan(LogicalPlanGenerator.java:15982)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_clause(LogicalPlanGenerator.java:15849)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.op_clause(LogicalPlanGenerator.java:1933)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.general_statement(LogicalPlanGenerator.java:1102)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.statement(LogicalPlanGenerator.java:560)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.query(LogicalPlanGenerator.java:421)\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:191)\n", "\t... 
10 more\n", "Caused by: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.impl.PigContext.resolveClassName(PigContext.java:677)\n", "\tat org.apache.pig.impl.PigContext.getClassForAlias(PigContext.java:793)\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1569)\n", "\t... 24 more\n", "2015-11-15 18:43:59,707 [main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "2015-11-15 18:43:59,707 [main] ERROR org.apache.pig.tools.grunt.Grunt - org.apache.pig.impl.logicalLayer.FrontendException: ERROR 1000: Error during parsing. Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1748)\n", "\tat org.apache.pig.PigServer$Graph.access$000(PigServer.java:1443)\n", "\tat org.apache.pig.PigServer.parseAndBuild(PigServer.java:387)\n", "\tat org.apache.pig.tools.grunt.GruntParser.processDescribe(GruntParser.java:300)\n", "\tat org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:412)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:230)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:205)\n", "\tat org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:81)\n", "\tat org.apache.pig.Main.run(Main.java:495)\n", "\tat org.apache.pig.Main.main(Main.java:170)\n", "Caused by: Failed to parse: Pig script failed to parse: \n", " Failed to generate logical plan. Nested exception: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:199)\n", "\tat org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1735)\n", "\t... 9 more\n", "Caused by: \n", " Failed to generate logical plan. 
Nested exception: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1572)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.func_eval(LogicalPlanGenerator.java:9372)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.projectable_expr(LogicalPlanGenerator.java:11051)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.var_expr(LogicalPlanGenerator.java:10810)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.expr(LogicalPlanGenerator.java:10159)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_clause(LogicalPlanGenerator.java:7629)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.flatten_generated_item(LogicalPlanGenerator.java:7452)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.generate_clause(LogicalPlanGenerator.java:17590)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_plan(LogicalPlanGenerator.java:15982)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.foreach_clause(LogicalPlanGenerator.java:15849)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.op_clause(LogicalPlanGenerator.java:1933)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.general_statement(LogicalPlanGenerator.java:1102)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.statement(LogicalPlanGenerator.java:560)\n", "\tat org.apache.pig.parser.LogicalPlanGenerator.query(LogicalPlanGenerator.java:421)\n", "\tat org.apache.pig.parser.QueryParserDriver.parse(QueryParserDriver.java:191)\n", "\t... 10 more\n", "Caused by: org.apache.pig.backend.executionengine.ExecException: ERROR 1070: Could not resolve myrs.reservoir_sample using imports: [, java.lang., org.apache.pig.builtin., org.apache.pig.impl.builtin.]\n", "\tat org.apache.pig.impl.PigContext.resolveClassName(PigContext.java:677)\n", "\tat org.apache.pig.impl.PigContext.getClassForAlias(PigContext.java:793)\n", "\tat org.apache.pig.parser.LogicalPlanBuilder.buildUDF(LogicalPlanBuilder.java:1569)\n", "\t... 24 more\n", "\n", "Details also at logfile: C:\\apps\\dist\\hadoop-2.6.0.2.2.7.1-33\\logs\\pig_1447613029598.log\n", "2015-11-15 18:43:59,754 [main] INFO org.apache.pig.Main - Pig script completed in 10 seconds and 453 milliseconds (10453 ms)\n", "\n", "
OUT:
\n", "ensemble: {x: int,v: chararray}\n", "ens_group: {group: chararray,ensemble: {(x: int,v: chararray)}}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 22, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 100"]}, {"cell_type": "markdown", "metadata": {}, "source": ["A corriger plus tard. Dans l'imm\u00e9diat, on utilisera la librairie [datafu](https://datafu.incubator.apache.org/docs/datafu/guide/sampling.html). Si le cluster ne reconna\u00eet pas la librairie, voir la section java pour comprendre comment l'importer. On la d\u00e9clare dans le script par l'instruction ``REGISTER``."]}, {"cell_type": "code", "execution_count": 22, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_datafu.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "ens_group = GROUP ensemble ALL;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE FLATTEN(RS(ensemble));\n", "DESCRIBE sampled;\n", "\n", "STORE sampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_datafu_rs.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0193'}"]}, "execution_count": 24, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_datafu.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0193', '50% complete', None, False, 'RUNNING')"]}, "execution_count": 25, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 25, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["\n", "\n", "
"], "text/plain": [""]}, "execution_count": 26, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 100"]}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " name | \n", " last_modified | \n", " content_type | \n", " content_length | \n", " blob_type | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " axavier/sampling/sample_datafu_rs.txt | \n", " Sun, 15 Nov 2015 13:23:40 GMT | \n", " | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 1 | \n", " axavier/sampling/sample_datafu_rs.txt/_SUCCESS | \n", " Sun, 15 Nov 2015 13:23:40 GMT | \n", " application/octet-stream | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 2 | \n", " axavier/sampling/sample_datafu_rs.txt/part-r-0... | \n", " Sun, 15 Nov 2015 13:23:38 GMT | \n", " application/octet-stream | \n", " 12780 | \n", " BlockBlob | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/sample_datafu_rs.txt \n", "1 axavier/sampling/sample_datafu_rs.txt/_SUCCESS \n", "2 axavier/sampling/sample_datafu_rs.txt/part-r-0... \n", "\n", " last_modified content_type content_length \\\n", "0 Sun, 15 Nov 2015 13:23:40 GMT 0 \n", "1 Sun, 15 Nov 2015 13:23:40 GMT application/octet-stream 0 \n", "2 Sun, 15 Nov 2015 13:23:38 GMT application/octet-stream 12780 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob "]}, "execution_count": 27, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_datafu"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## version distribu\u00e9e\n", "\n", "Astuce : on distribue puis on recombine les \u00e9chantillons en faisant un dernier reservoir sampling mais pond\u00e9r\u00e9. Comment distribuer ? Le second sampling est remplac\u00e9 par une m\u00e9thode d'\u00e9chantillonage classique car le reservoir sampling pond\u00e9r\u00e9 n'est pas disponible dans la librairie datafu version 1.2.0."]}, {"cell_type": "code", "execution_count": 27, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG sample_explore_datafu_dist.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "DEFINE WeightedSample datafu.pig.sampling.WeightedSample();\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "keys = FOREACH ensemble GENERATE x, v, x%10 AS key;\n", "DESCRIBE keys;\n", "ens_group = GROUP keys BY key ;\n", "DESCRIBE ens_group;\n", "sampled = FOREACH ens_group GENERATE COUNT(keys) AS weigth, FLATTEN(RS(keys));\n", "DESCRIBE sampled;\n", "wsampled = FOREACH (GROUP sampled ALL) GENERATE FLATTEN(WeightedSample(sampled, 0, 1000));\n", "DESCRIBE wsampled;\n", "\n", "STORE wsampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_datafu_rs_dist2.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0238'}"]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["jid = %hd_pig_submit sample_explore_datafu_dist.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0238', '100% complete', 'done', True, 'SUCCEEDED')"]}, "execution_count": 30, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 30, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["\n", "2015-11-15 19:22:17,553 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:22:17,553 [main] INFO org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:22:17,615 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. 
Redirecting to job history server\n", "2015-11-15 19:22:17,803 [main] INFO org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2015-11-15 19:22:17,803 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:22:17,803 [main] INFO org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:22:17,865 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:22:17,943 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "2015-11-15 19:22:17,975 [main] INFO org.apache.pig.Main - Pig script completed in 1 minute, 42 seconds and 839 milliseconds (102839 ms)\n", "\n", "
OUT:
\n", "ensemble: {x: int,v: chararray}\n", "keys: {x: int,v: chararray,key: int}\n", "ens_group: {group: int,keys: {(x: int,v: chararray,key: int)}}\n", "sampled: {weigth: long,datafu.pig.sampling.reservoirsample_keys_4::x: int,datafu.pig.sampling.reservoirsample_keys_4::v: chararray,datafu.pig.sampling.reservoirsample_keys_4::key: int}\n", "wsampled: {datafu.pig.sampling.weightedsample_sampled_12::weigth: long,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::x: int,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::v: chararray,datafu.pig.sampling.weightedsample_sampled_12::datafu.pig.sampling.reservoirsample_keys_11::key: int}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 31, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " name | \n", " last_modified | \n", " content_type | \n", " content_length | \n", " blob_type | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " axavier/sampling/sample_datafu_rs_dist2.txt | \n", " Sun, 15 Nov 2015 19:22:05 GMT | \n", " | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 1 | \n", " axavier/sampling/sample_datafu_rs_dist2.txt/_S... | \n", " Sun, 15 Nov 2015 19:22:06 GMT | \n", " application/octet-stream | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 2 | \n", " axavier/sampling/sample_datafu_rs_dist2.txt/pa... | \n", " Sun, 15 Nov 2015 19:22:05 GMT | \n", " application/octet-stream | \n", " 20770 | \n", " BlockBlob | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/sample_datafu_rs_dist2.txt \n", "1 axavier/sampling/sample_datafu_rs_dist2.txt/_S... \n", "2 axavier/sampling/sample_datafu_rs_dist2.txt/pa... \n", "\n", " last_modified content_type content_length \\\n", "0 Sun, 15 Nov 2015 19:22:05 GMT 0 \n", "1 Sun, 15 Nov 2015 19:22:06 GMT application/octet-stream 0 \n", "2 Sun, 15 Nov 2015 19:22:05 GMT application/octet-stream 20770 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob "]}, "execution_count": 32, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_datafu_rs_dist2"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": ["df = %blob_head /$PSEUDO/sampling/sample_datafu_rs_dist2.txt -m"]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " 10001 | \n", " 21260 | \n", " S21260 | \n", " 0 | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 10000 | \n", " 25191 | \n", " X25191 | \n", " 1 | \n", "
\n", " \n", " 1 | \n", " 10000 | \n", " 73760 | \n", " Y73760 | \n", " 0 | \n", "
\n", " \n", " 2 | \n", " 10000 | \n", " 90105 | \n", " P90105 | \n", " 5 | \n", "
\n", " \n", " 3 | \n", " 10000 | \n", " 46070 | \n", " Y46070 | \n", " 0 | \n", "
\n", " \n", " 4 | \n", " 10001 | \n", " 58590 | \n", " M58590 | \n", " 0 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" 10001 21260 S21260 0\n", "0 10000 25191 X25191 1\n", "1 10000 73760 Y73760 0\n", "2 10000 90105 P90105 5\n", "3 10000 46070 Y46070 0\n", "4 10001 58590 M58590 0"]}, "execution_count": 34, "metadata": {}, "output_type": "execute_result"}], "source": ["df.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## version distribu\u00e9e am\u00e9lior\u00e9e\n", "\n", "Le probl\u00e8me de la version pr\u00e9c\u00e9dente : chaque sous-ensemble trait\u00e9 d'un seul bloc utilise une s\u00e9quence de nombres al\u00e9atoires sur laquelle on ne conna\u00eet pas grand chose. Si les m\u00eames *seed* sont utilis\u00e9es, il est possible que les s\u00e9quences, m\u00eame si elles simulent le hasard, soient extr\u00eamement corr\u00e9l\u00e9es entre chaque bloc. Il faut rem\u00e9dier \u00e0 cela.\n", "\n", "Il faut \u00e9galement s'assurer que chaque bloc n'est pas *skewed*."]}, {"cell_type": "code", "execution_count": 34, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG_azure script_rs.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "DEFINE MD5 datafu.pig.hash.MD5();\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "DEFINE WeightedSample datafu.pig.sampling.WeightedSample();\n", "\n", "ensemble = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "DESCRIBE ensemble;\n", "\n", "ens_group = GROUP ensemble BY (x,v);\n", "DESCRIBE ens_group;\n", "\n", "compte_group = FOREACH ens_group \n", " GENERATE group.x AS x, \n", " group.v AS v, \n", " COUNT(ensemble) AS nb_ligne ;\n", "DESCRIBE compte_group;\n", "\n", "hash_group = FOREACH compte_group \n", " GENERATE x, v, nb_ligne,\n", " SUBSTRING(MD5(v), 0, 1) AS val;\n", "DESCRIBE hash_group; \n", "\n", "group_hash = GROUP hash_group BY val ;\n", "DESCRIBE group_hash;\n", "\n", "rs_parall = FOREACH group_hash GENERATE\n", " COUNT(hash_group) AS nb_hash,\n", " FLATTEN(RS(hash_group)) ;\n", "DESCRIBE rs_parall;\n", "\n", "wsampled = FOREACH (GROUP rs_parall ALL) GENERATE FLATTEN(WeightedSample(rs_parall, 0, 1000));\n", "DESCRIBE wsampled;\n", "\n", "STORE wsampled \n", "INTO '$CONTAINER/$PSEUDO/sampling/sample_distributed_hash.txt' USING PigStorage();"]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1446540516812_0244'}"]}, "execution_count": 36, "metadata": {}, "output_type": "execute_result"}], "source": ["jid=%hd_pig_submit script_rs.pig\n", "jid"]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0244', '100% complete', None, False, 'RUNNING')"]}, "execution_count": 37, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "(st[\"id\"],st[\"percentComplete\"],st[\"completed\"],\n", "st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"])"]}, {"cell_type": "code", "execution_count": 37, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["\n", "2015-11-15 19:52:05,138 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:52:05,138 [main] INFO org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:52:05,200 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. 
Redirecting to job history server\n", "2015-11-15 19:52:05,435 [main] INFO org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2015-11-15 19:52:05,435 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.89.128.19:9010\n", "2015-11-15 19:52:05,435 [main] INFO org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.89.128.19:10200\n", "2015-11-15 19:52:05,513 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-11-15 19:52:05,560 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "2015-11-15 19:52:05,607 [main] INFO org.apache.pig.Main - Pig script completed in 2 minutes, 29 seconds and 962 milliseconds (149962 ms)\n", "\n", "
OUT:
\n", "ensemble: {x: int,v: chararray}\n", "ens_group: {group: (x: int,v: chararray),ensemble: {(x: int,v: chararray)}}\n", "compte_group: {x: int,v: chararray,nb_ligne: long}\n", "hash_group: {x: int,v: chararray,nb_ligne: long,val: chararray}\n", "group_hash: {group: chararray,hash_group: {(x: int,v: chararray,nb_ligne: long,val: chararray)}}\n", "rs_parall: {nb_hash: long,datafu.pig.sampling.reservoirsample_hash_group_4::x: int,datafu.pig.sampling.reservoirsample_hash_group_4::v: chararray,datafu.pig.sampling.reservoirsample_hash_group_4::nb_ligne: long,datafu.pig.sampling.reservoirsample_hash_group_4::val: chararray}\n", "wsampled: {datafu.pig.sampling.weightedsample_rs_parall_12::nb_hash: long,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::x: int,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::v: chararray,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::nb_ligne: long,datafu.pig.sampling.weightedsample_rs_parall_12::datafu.pig.sampling.reservoirsample_hash_group_11::val: chararray}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 38, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " name | \n", " last_modified | \n", " content_type | \n", " content_length | \n", " blob_type | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " axavier/sampling/sample_distributed_hash.txt | \n", " Sun, 15 Nov 2015 19:51:56 GMT | \n", " | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 1 | \n", " axavier/sampling/sample_distributed_hash.txt/_... | \n", " Sun, 15 Nov 2015 19:51:56 GMT | \n", " application/octet-stream | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 2 | \n", " axavier/sampling/sample_distributed_hash.txt/p... | \n", " Sun, 15 Nov 2015 19:51:55 GMT | \n", " application/octet-stream | \n", " 21750 | \n", " BlockBlob | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/sample_distributed_hash.txt \n", "1 axavier/sampling/sample_distributed_hash.txt/_... \n", "2 axavier/sampling/sample_distributed_hash.txt/p... \n", "\n", " last_modified content_type content_length \\\n", "0 Sun, 15 Nov 2015 19:51:56 GMT 0 \n", "1 Sun, 15 Nov 2015 19:51:56 GMT application/octet-stream 0 \n", "2 Sun, 15 Nov 2015 19:51:55 GMT application/octet-stream 21750 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob "]}, "execution_count": 39, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling/sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " 6693 | \n", " 27244 | \n", " W27244 | \n", " 1 | \n", " 6 | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 6749 | \n", " 51104 | \n", " O51104 | \n", " 1 | \n", " 1 | \n", "
\n", " \n", " 1 | \n", " 6605 | \n", " 91527 | \n", " H91527 | \n", " 1 | \n", " 6 | \n", "
\n", " \n", " 2 | \n", " 6630 | \n", " 75027 | \n", " R75027 | \n", " 1 | \n", " 4 | \n", "
\n", " \n", " 3 | \n", " 6789 | \n", " 58148 | \n", " M58148 | \n", " 1 | \n", " 1 | \n", "
\n", " \n", " 4 | \n", " 6659 | \n", " 71659 | \n", " D71659 | \n", " 1 | \n", " 5 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" 6693 27244 W27244 1 6\n", "0 6749 51104 O51104 1 1\n", "1 6605 91527 H91527 1 6\n", "2 6630 75027 R75027 1 4\n", "3 6789 58148 M58148 1 1\n", "4 6659 71659 D71659 1 5"]}, "execution_count": 40, "metadata": {}, "output_type": "execute_result"}], "source": ["df =%blob_head /$PSEUDO/sampling/sample_distributed_hash.txt -m\n", "df.head()"]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [{"data": {"text/plain": ["'sample_distributed_hash.txt'"]}, "execution_count": 41, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_downmerge /$PSEUDO/sampling/sample_distributed_hash.txt sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "6693\t27244\tW27244\t1\t6\n", "6749\t51104\tO51104\t1\t1\n", "6605\t91527\tH91527\t1\t6\n", "6630\t75027\tR75027\t1\t4\n", "6789\t58148\tM58148\t1\t1\n", "6659\t71659\tD71659\t1\t5\n", "6811\t74380\tU74380\t1\t9\n", "6749\t20125\tB20125\t1\t2\n", "6587\t33466\tE33466\t1\t5\n", "6587\t21645\tN21645\t1\t5\n", "\n", "
"], "text/plain": [""]}, "execution_count": 42, "metadata": {}, "output_type": "execute_result"}], "source": ["%head sample_distributed_hash.txt"]}, {"cell_type": "code", "execution_count": 42, "metadata": {"collapsed": true}, "outputs": [], "source": []}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## version java\n", "\n", "On s'inspire de l'exemple suivant [Sampling](http://datafu.incubator.apache.org/docs/datafu/guide/sampling.html).\n", " On t\u00e9l\u00e9charge [datafu 1.2](http://datafu.incubator.apache.org/docs/datafu/) depuis [Maven](http://mvnrepository.com/artifact/com.linkedin.datafu/datafu/1.2.0). Ce n'est pas la derni\u00e8re version mais suivre les instructions pour *builder* datafu (voir [documentation](http://datafu.incubator.apache.org/docs/datafu/1.2.0/)). En particulier, la version pond\u00e9r\u00e9e du reservoir sampling n'est pas disponible (voir [history](https://github.com/apache/incubator-datafu/commits/master/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java), la version 1.2.0 est sorti en d\u00e9cembre 2013).\n", " \n", "L'impl\u00e9mentation [java](https://github.com/apache/incubator-datafu/blob/master/datafu-pig/src/main/java/datafu/pig/sampling/ReservoirSample.java) n'a pas l'air de r\u00e9soudre un probl\u00e8me qui peut survenir si la taille de l'\u00e9chantillon demand\u00e9e est trop grande. Voir section suivante."]}, {"cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [{"data": {"text/plain": ["'datafu-1.2.0.jar'"]}, "execution_count": 44, "metadata": {}, "output_type": "execute_result"}], "source": ["import pyensae.datasource\n", "pyensae.datasource.download_data(\"datafu-1.2.0.jar\", url=\"http://central.maven.org/maven2/com/linkedin/datafu/datafu/1.2.0/\")"]}, {"cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [{"data": {"text/plain": ["'$PSEUDO/sampling/datafu-1.2.0.jar'"]}, "execution_count": 45, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_up datafu-1.2.0.jar /$PSEUDO/sampling/datafu-1.2.0.jar"]}, {"cell_type": "code", "execution_count": 45, "metadata": {"collapsed": true}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " name | \n", " last_modified | \n", " content_type | \n", " content_length | \n", " blob_type | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " axavier/sampling/datafu-1.2.0.jar | \n", " Fri, 13 Nov 2015 00:03:49 GMT | \n", " application/octet-stream | \n", " 1600826 | \n", " BlockBlob | \n", "
\n", " \n", " 1 | \n", " axavier/sampling/sample.txt | \n", " Fri, 13 Nov 2015 00:02:50 GMT | \n", " application/octet-stream | \n", " 1377780 | \n", " BlockBlob | \n", "
\n", " \n", " 2 | \n", " axavier/sampling/sample2.txt | \n", " Fri, 13 Nov 2015 00:35:55 GMT | \n", " application/octet-stream | \n", " 1377793 | \n", " BlockBlob | \n", "
\n", " \n", " 3 | \n", " axavier/sampling/sample3.txt | \n", " Fri, 13 Nov 2015 00:39:40 GMT | \n", " application/octet-stream | \n", " 1377793 | \n", " BlockBlob | \n", "
\n", " \n", " 4 | \n", " axavier/sampling/sample4.txt | \n", " Fri, 13 Nov 2015 00:41:49 GMT | \n", " application/octet-stream | \n", " 1377793 | \n", " BlockBlob | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" name last_modified \\\n", "0 axavier/sampling/datafu-1.2.0.jar Fri, 13 Nov 2015 00:03:49 GMT \n", "1 axavier/sampling/sample.txt Fri, 13 Nov 2015 00:02:50 GMT \n", "2 axavier/sampling/sample2.txt Fri, 13 Nov 2015 00:35:55 GMT \n", "3 axavier/sampling/sample3.txt Fri, 13 Nov 2015 00:39:40 GMT \n", "4 axavier/sampling/sample4.txt Fri, 13 Nov 2015 00:41:49 GMT \n", "\n", " content_type content_length blob_type \n", "0 application/octet-stream 1600826 BlockBlob \n", "1 application/octet-stream 1377780 BlockBlob \n", "2 application/octet-stream 1377793 BlockBlob \n", "3 application/octet-stream 1377793 BlockBlob \n", "4 application/octet-stream 1377793 BlockBlob "]}, "execution_count": 46, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling"]}, {"cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": ["%%PIG_azure sample.pig\n", "\n", "REGISTER '$CONTAINER/$PSEUDO/sampling/datafu-1.2.0.jar';\n", "\n", "DEFINE RS datafu.pig.sampling.ReservoirSample('1000');\n", "\n", "dset = LOAD '$CONTAINER/$PSEUDO/sampling/sample4.txt' \n", " USING PigStorage('\\t') AS (x:int, v:chararray) ;\n", "sampled = FOREACH (GROUP dset ALL) GENERATE FLATTEN(RS(dset));\n", "STORE sampled INTO '$CONTAINER/$PSEUDO/sampling/out_sampled_rs4_2015.txt' USING PigStorage() ;"]}, {"cell_type": "code", "execution_count": 47, "metadata": {"collapsed": true}, "outputs": [], "source": ["jid = %hd_pig_submit sample.pig"]}, {"cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [{"data": {"text/plain": ["('job_1446540516812_0136', None, None, False, 'RUNNING')"]}, "execution_count": 49, "metadata": {}, "output_type": "execute_result"}], "source": ["st = %hd_job_status jid[\"id\"]\n", "st[\"id\"],st[\"percentComplete\"],st[\"completed\"],st[\"status\"][\"jobComplete\"],st[\"status\"][\"state\"]"]}, {"cell_type": "code", "execution_count": 49, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["\n", "\n", "
"], "text/plain": [""]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr jid[\"id\"] -n 10"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " name | \n", " last_modified | \n", " content_type | \n", " content_length | \n", " blob_type | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " axavier/sampling/datafu-1.2.0.jar | \n", " Fri, 13 Nov 2015 00:03:49 GMT | \n", " application/octet-stream | \n", " 1600826 | \n", " BlockBlob | \n", "
\n", " \n", " 1 | \n", " axavier/sampling/out_sampled_rs4_2015.txt | \n", " Fri, 13 Nov 2015 01:08:22 GMT | \n", " | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 2 | \n", " axavier/sampling/out_sampled_rs4_2015.txt/_SUC... | \n", " Fri, 13 Nov 2015 01:08:22 GMT | \n", " application/octet-stream | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 3 | \n", " axavier/sampling/out_sampled_rs4_2015.txt/part... | \n", " Fri, 13 Nov 2015 01:08:21 GMT | \n", " application/octet-stream | \n", " 12785 | \n", " BlockBlob | \n", "
\n", " \n", " 4 | \n", " axavier/sampling/sample.txt | \n", " Fri, 13 Nov 2015 00:02:50 GMT | \n", " application/octet-stream | \n", " 1377780 | \n", " BlockBlob | \n", "
\n", " \n", " 5 | \n", " axavier/sampling/sample2.txt | \n", " Fri, 13 Nov 2015 00:35:55 GMT | \n", " application/octet-stream | \n", " 1377793 | \n", " BlockBlob | \n", "
\n", " \n", " 6 | \n", " axavier/sampling/sample3.txt | \n", " Fri, 13 Nov 2015 00:39:40 GMT | \n", " application/octet-stream | \n", " 1377793 | \n", " BlockBlob | \n", "
\n", " \n", " 7 | \n", " axavier/sampling/sample4.txt | \n", " Fri, 13 Nov 2015 00:41:49 GMT | \n", " application/octet-stream | \n", " 1377793 | \n", " BlockBlob | \n", "
\n", " \n", " 8 | \n", " axavier/sampling/sampled4_2015.txt | \n", " Fri, 13 Nov 2015 00:50:20 GMT | \n", " | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 9 | \n", " axavier/sampling/sampled4_2015.txt/_SUCCESS | \n", " Fri, 13 Nov 2015 00:50:20 GMT | \n", " application/octet-stream | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 10 | \n", " axavier/sampling/sampled4_2015.txt/part-m-00000 | \n", " Fri, 13 Nov 2015 00:50:19 GMT | \n", " application/octet-stream | \n", " 1277794 | \n", " BlockBlob | \n", "
\n", " \n", " 11 | \n", " axavier/sampling/sampled_rs4_2015.txt | \n", " Fri, 13 Nov 2015 01:04:51 GMT | \n", " | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 12 | \n", " axavier/sampling/sampled_rs4_2015.txt/_SUCCESS | \n", " Fri, 13 Nov 2015 01:04:51 GMT | \n", " application/octet-stream | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 13 | \n", " axavier/sampling/sampled_rs4_2015.txt/part-m-0... | \n", " Fri, 13 Nov 2015 01:04:50 GMT | \n", " application/octet-stream | \n", " 1277794 | \n", " BlockBlob | \n", "
\n", " \n", " 14 | \n", " axavier/sampling/sampled_srs4_2015.txt | \n", " Fri, 13 Nov 2015 00:56:09 GMT | \n", " | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 15 | \n", " axavier/sampling/sampled_srs4_2015.txt/_SUCCESS | \n", " Fri, 13 Nov 2015 00:56:09 GMT | \n", " application/octet-stream | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 16 | \n", " axavier/sampling/sampled_srs4_2015.txt/part-m-... | \n", " Fri, 13 Nov 2015 00:56:09 GMT | \n", " application/octet-stream | \n", " 1277794 | \n", " BlockBlob | \n", "
\n", " \n", " 17 | \n", " axavier/sampling/sampled_srs_2015.txt | \n", " Fri, 13 Nov 2015 00:52:34 GMT | \n", " | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 18 | \n", " axavier/sampling/sampled_srs_2015.txt/_SUCCESS | \n", " Fri, 13 Nov 2015 00:52:34 GMT | \n", " application/octet-stream | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 19 | \n", " axavier/sampling/sampled_srs_2015.txt/part-m-0... | \n", " Fri, 13 Nov 2015 00:52:34 GMT | \n", " application/octet-stream | \n", " 1277794 | \n", " BlockBlob | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" name \\\n", "0 axavier/sampling/datafu-1.2.0.jar \n", "1 axavier/sampling/out_sampled_rs4_2015.txt \n", "2 axavier/sampling/out_sampled_rs4_2015.txt/_SUC... \n", "3 axavier/sampling/out_sampled_rs4_2015.txt/part... \n", "4 axavier/sampling/sample.txt \n", "5 axavier/sampling/sample2.txt \n", "6 axavier/sampling/sample3.txt \n", "7 axavier/sampling/sample4.txt \n", "8 axavier/sampling/sampled4_2015.txt \n", "9 axavier/sampling/sampled4_2015.txt/_SUCCESS \n", "10 axavier/sampling/sampled4_2015.txt/part-m-00000 \n", "11 axavier/sampling/sampled_rs4_2015.txt \n", "12 axavier/sampling/sampled_rs4_2015.txt/_SUCCESS \n", "13 axavier/sampling/sampled_rs4_2015.txt/part-m-0... \n", "14 axavier/sampling/sampled_srs4_2015.txt \n", "15 axavier/sampling/sampled_srs4_2015.txt/_SUCCESS \n", "16 axavier/sampling/sampled_srs4_2015.txt/part-m-... \n", "17 axavier/sampling/sampled_srs_2015.txt \n", "18 axavier/sampling/sampled_srs_2015.txt/_SUCCESS \n", "19 axavier/sampling/sampled_srs_2015.txt/part-m-0... \n", "\n", " last_modified content_type content_length \\\n", "0 Fri, 13 Nov 2015 00:03:49 GMT application/octet-stream 1600826 \n", "1 Fri, 13 Nov 2015 01:08:22 GMT 0 \n", "2 Fri, 13 Nov 2015 01:08:22 GMT application/octet-stream 0 \n", "3 Fri, 13 Nov 2015 01:08:21 GMT application/octet-stream 12785 \n", "4 Fri, 13 Nov 2015 00:02:50 GMT application/octet-stream 1377780 \n", "5 Fri, 13 Nov 2015 00:35:55 GMT application/octet-stream 1377793 \n", "6 Fri, 13 Nov 2015 00:39:40 GMT application/octet-stream 1377793 \n", "7 Fri, 13 Nov 2015 00:41:49 GMT application/octet-stream 1377793 \n", "8 Fri, 13 Nov 2015 00:50:20 GMT 0 \n", "9 Fri, 13 Nov 2015 00:50:20 GMT application/octet-stream 0 \n", "10 Fri, 13 Nov 2015 00:50:19 GMT application/octet-stream 1277794 \n", "11 Fri, 13 Nov 2015 01:04:51 GMT 0 \n", "12 Fri, 13 Nov 2015 01:04:51 GMT application/octet-stream 0 \n", "13 Fri, 13 Nov 2015 01:04:50 GMT application/octet-stream 1277794 \n", "14 Fri, 13 Nov 2015 00:56:09 GMT 0 \n", "15 Fri, 13 Nov 2015 00:56:09 GMT application/octet-stream 0 \n", "16 Fri, 13 Nov 2015 00:56:09 GMT application/octet-stream 1277794 \n", "17 Fri, 13 Nov 2015 00:52:34 GMT 0 \n", "18 Fri, 13 Nov 2015 00:52:34 GMT application/octet-stream 0 \n", "19 Fri, 13 Nov 2015 00:52:34 GMT application/octet-stream 1277794 \n", "\n", " blob_type \n", "0 BlockBlob \n", "1 BlockBlob \n", "2 BlockBlob \n", "3 BlockBlob \n", "4 BlockBlob \n", "5 BlockBlob \n", "6 BlockBlob \n", "7 BlockBlob \n", "8 BlockBlob \n", "9 BlockBlob \n", "10 BlockBlob \n", "11 BlockBlob \n", "12 BlockBlob \n", "13 BlockBlob \n", "14 BlockBlob \n", "15 BlockBlob \n", "16 BlockBlob \n", "17 BlockBlob \n", "18 BlockBlob \n", "19 BlockBlob "]}, "execution_count": 51, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /$PSEUDO/sampling"]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [{"data": {"text/plain": ["'out_sampled_rs4_2015.txt'"]}, "execution_count": 52, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_downmerge /$PSEUDO/sampling/out_sampled_rs4_2015.txt out_sampled_rs4_2015.txt -o"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "90648\tM90648\n", "49678\tS49678\n", "41434\tQ41434\n", "30149\tP30149\n", "15836\tC15836\n", "61110\tK61110\n", "3838\tQ3838\n", "81515\tF81515\n", "48052\tE48052\n", "16332\tE16332\n", "\n", "
"], "text/plain": [""]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["%head out_sampled_rs4_2015.txt"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## fin"]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [{"data": {"text/plain": ["True"]}, "execution_count": 54, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_close"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## version avec it\u00e9rateur"]}, {"cell_type": "code", "execution_count": 54, "metadata": {"collapsed": true}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4"}}, "nbformat": 4, "nbformat_minor": 2}