)"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["import pyensae\n", "blobstorage = \n", "blobpassword = \n", "hadoop_server = \n", "hadoop_password = \n", "username = \"centrale\"\n", "client, bs = %hd_open\n", "client, bs"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Upload the data"]}, {"cell_type": "code", "execution_count": 7, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["'centrale2/bank_full_tab_no.txt'"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_up bank_full_tab_no.txt hdblobstorage/centrale2/bank_full_tab_no.txt"]}, {"cell_type": "code", "execution_count": 8, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " name | \n", " last_modified | \n", " content_type | \n", " content_length | \n", " blob_type | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " centrale2/bank_full_tab_no.txt | \n", " 2016-06-16 10:18:58+00:00 | \n", " None | \n", " 3751188 | \n", " BlockBlob | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" name last_modified content_type \\\n", "0 centrale2/bank_full_tab_no.txt 2016-06-16 10:18:58+00:00 None \n", "\n", " content_length blob_type \n", "0 3751188 BlockBlob "]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls hdblobstorage/centrale2"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Submit a PIG query"]}, {"cell_type": "code", "execution_count": 9, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["'age:double, job:chararray, marital:chararray, education:chararray, default:chararray, balance:double, housing:chararray, loan:chararray, contact:chararray, day:double, month:chararray, duration:double, campaign:double, pdays:double, previous:double, poutcome:chararray, y:chararray'"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["mapping = {'int64': 'double', 'float': 'double', 'object': 'chararray'}\n", "schema = [\"%s:%s\" % (_[0], mapping.get(str(_[1]), _[1])) for _ in zip(df.columns, df.dtypes)]\n", "schema = \", \".join(schema)\n", "schema"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On ajoute l'instruction [DESCRIBE](http://pig.apache.org/docs/r0.16.0/test.html#describe)."]}, {"cell_type": "code", "execution_count": 10, "metadata": {"collapsed": false}, "outputs": [], "source": ["%%PIG_azure aggage3.pig\n", "values = LOAD '$CONTAINER/centrale/bank_full_tab_no.txt' USING PigStorage('\\t') AS (age:double, \n", " job:chararray, marital:chararray, education:chararray, \n", " default:chararray, balance:double, housing:chararray, loan:chararray, \n", " contact:chararray, day:double, month:chararray, duration:double, \n", " campaign:double, pdays:double, previous:double, poutcome:chararray, y:chararray);\n", "DESCRIBE values;\n", "gr = GROUP values BY loan ;\n", "DESCRIBE gr;\n", "agg = FOREACH gr GENERATE group, AVG(age) AS avg_age ;\n", "DESCRIBE agg;\n", "STORE agg INTO '$CONTAINER/centrale/bank_full_tab_no_agg.txt' USING PigStorage('\\t') ;"]}, {"cell_type": "code", "execution_count": 11, "metadata": {"collapsed": false}, "outputs": [], "source": ["jid = %hd_pig_submit aggage3.pig"]}, {"cell_type": "code", "execution_count": 12, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1466069083851_0005'}"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["jid"]}, {"cell_type": "code", "execution_count": 13, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["[{'detail': None, 'id': 'job_1466069083851_0005'},\n", " {'detail': None, 'id': 'job_1466069083851_0004'},\n", " {'detail': None, 'id': 'job_1466069083851_0003'},\n", " {'detail': None, 'id': 'job_1466069083851_0002'},\n", " {'detail': None, 'id': 'job_1466069083851_0001'}]"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_queue"]}, {"cell_type": "code", "execution_count": 14, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["'RUNNING'"]}, "execution_count": 15, "metadata": {}, "output_type": "execute_result"}], "source": ["df = %hd_job_status jid['id']\n", "df[\"status\"][\"state\"]"]}, {"cell_type": "code", "execution_count": 15, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "16/06/16 21:05:43 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL\n", "16/06/16 21:05:43 INFO pig.ExecTypeProvider: Trying ExecType : MAPREDUCE\n", "16/06/16 21:05:43 INFO pig.ExecTypeProvider: Picked MAPREDUCE as the ExecType\n", "2016-06-16 21:05:43,576 [main] INFO org.apache.pig.Main - Apache Pig version 0.15.0.2.3.3.1-21 (r: unknown) compiled May 04 2016, 20:06:44\n", "2016-06-16 21:05:43,576 [main] INFO org.apache.pig.Main - Logging error messages to: C:\\apps\\dist\\hadoop-2.7.1.2.3.3.1-21\\logs\\pig_1466111143557.log\n", "2016-06-16 21:05:45,088 [main] INFO org.apache.pig.impl.util.Utils - Default bootup file D:\\Users\\hdp/.pigbootup not found\n", "2016-06-16 21:05:45,498 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address\n", "2016-06-16 21:05:45,498 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2016-06-16 21:05:45,498 [main] INFO org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: wasb://clusterensaeazure1-3@hdblobstorage.blob.core.windows.net\n", "2016-06-16 21:05:47,452 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2016-06-16 21:05:49,057 [main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1025: \n", " Invalid field projection. Projected field [age] does not exist in schema: group:chararray,values:bag{:tuple(age:double,job:chararray,marital:chararray,education:chararray,default:chararray,balance:double,housing:chararray,loan:chararray,contact:chararray,day:double,month:chararray,duration:double,campaign:double,pdays:double,previous:double,poutcome:chararray,y:chararray)}.\n", "2016-06-16 21:05:49,057 [main] ERROR org.apache.pig.tools.grunt.Grunt - org.apache.pig.impl.logicalLayer.FrontendException: ERROR 1001: Unable to describe schema for alias agg\n", "\tat org.apache.pig.PigServer.dumpSchema(PigServer.java:823)\n", "\tat org.apache.pig.tools.grunt.GruntParser.processDescribe(GruntParser.java:321)\n", "\tat org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:416)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:230)\n", "\tat org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:205)\n", "\tat org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:81)\n", "\tat org.apache.pig.Main.run(Main.java:502)\n", "\tat org.apache.pig.Main.main(Main.java:177)\n", "Caused by: org.apache.pig.impl.plan.PlanValidationException: ERROR 1025: \n", " Invalid field projection. Projected field [age] does not exist in schema: group:chararray,values:bag{:tuple(age:double,job:chararray,marital:chararray,education:chararray,default:chararray,balance:double,housing:chararray,loan:chararray,contact:chararray,day:double,month:chararray,duration:double,campaign:double,pdays:double,previous:double,poutcome:chararray,y:chararray)}.\n", "\tat org.apache.pig.newplan.logical.expression.ProjectExpression.findColNum(ProjectExpression.java:191)\n", "\tat org.apache.pig.newplan.logical.expression.ProjectExpression.setColumnNumberFromAlias(ProjectExpression.java:174)\n", "\tat org.apache.pig.newplan.logical.visitor.ColumnAliasConversionVisitor$1.visit(ColumnAliasConversionVisitor.java:53)\n", "\tat org.apache.pig.newplan.logical.expression.ProjectExpression.accept(ProjectExpression.java:215)\n", "\tat org.apache.pig.newplan.DependencyOrderWalker.walk(DependencyOrderWalker.java:75)\n", "\tat org.apache.pig.newplan.PlanVisitor.visit(PlanVisitor.java:52)\n", "\tat org.apache.pig.newplan.logical.optimizer.AllExpressionVisitor.visit(AllExpressionVisitor.java:142)\n", "\tat org.apache.pig.newplan.logical.relational.LOInnerLoad.accept(LOInnerLoad.java:128)\n", "\tat org.apache.pig.newplan.DependencyOrderWalker.walk(DependencyOrderWalker.java:75)\n", "\tat org.apache.pig.newplan.logical.optimizer.AllExpressionVisitor.visit(AllExpressionVisitor.java:124)\n", "\tat org.apache.pig.newplan.logical.relational.LOForEach.accept(LOForEach.java:87)\n", "\tat org.apache.pig.newplan.DependencyOrderWalker.walk(DependencyOrderWalker.java:75)\n", "\tat org.apache.pig.newplan.PlanVisitor.visit(PlanVisitor.java:52)\n", "\tat org.apache.pig.newplan.logical.relational.LogicalPlan.validate(LogicalPlan.java:175)\n", "\tat org.apache.pig.PigServer$Graph.compile(PigServer.java:1767)\n", "\tat org.apache.pig.PigServer$Graph.access$300(PigServer.java:1443)\n", "\tat org.apache.pig.PigServer.buildStorePlan(PigServer.java:1339)\n", "\tat org.apache.pig.PigServer.getOperatorForAlias(PigServer.java:1418)\n", "\tat org.apache.pig.PigServer.dumpSchema(PigServer.java:806)\n", "\t... 7 more\n", "\n", "Details also at logfile: C:\\apps\\dist\\hadoop-2.7.1.2.3.3.1-21\\logs\\pig_1466111143557.log\n", "2016-06-16 21:05:49,119 [main] INFO org.apache.pig.Main - Pig script completed in 5 seconds and 954 milliseconds (5954 ms)\n", "\n", "
OUT:
\n", "values: {age: double,job: chararray,marital: chararray,education: chararray,default: chararray,balance: double,housing: chararray,loan: chararray,contact: chararray,day: double,month: chararray,duration: double,campaign: double,pdays: double,previous: double,poutcome: chararray,y: chararray}\n", "gr: {group: chararray,values: {(age: double,job: chararray,marital: chararray,education: chararray,default: chararray,balance: double,housing: chararray,loan: chararray,contact: chararray,day: double,month: chararray,duration: double,campaign: double,pdays: double,previous: double,poutcome: chararray,y: chararray)}}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 16, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_tail_stderr -n 100 jid['id']"]}, {"cell_type": "code", "execution_count": 16, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG_azure aggage4.pig\n", "values = LOAD '$CONTAINER/centrale/bank_full_tab_no.txt' USING PigStorage('\\t') AS (age:double, \n", " job:chararray, marital:chararray, education:chararray, \n", " default:chararray, balance:double, housing:chararray, loan:chararray, \n", " contact:chararray, day:double, month:chararray, duration:double, \n", " campaign:double, \n", " pdays:double, previous:double, poutcome:chararray, y:chararray);\n", "DESCRIBE values;\n", "gr = GROUP values BY loan ;\n", "DESCRIBE gr;\n", "agg = FOREACH gr GENERATE group, AVG(values.age) AS avg_age ;\n", "DESCRIBE agg;\n", "STORE agg INTO '$CONTAINER/centrale/bank_full_tab_no_agg2.txt' USING PigStorage('\\t') ;"]}, {"cell_type": "code", "execution_count": 17, "metadata": {"collapsed": true}, "outputs": [], "source": ["jid = %hd_pig_submit aggage4.pig"]}, {"cell_type": "code", "execution_count": 18, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["{'id': 'job_1466069083851_0008'}"]}, "execution_count": 19, "metadata": {}, "output_type": "execute_result"}], "source": ["jid"]}, {"cell_type": "code", "execution_count": 19, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["[{'detail': None, 'id': 'job_1466069083851_0009'},\n", " {'detail': None, 'id': 'job_1466069083851_0008'},\n", " {'detail': None, 'id': 'job_1466069083851_0007'},\n", " {'detail': None, 'id': 'job_1466069083851_0006'},\n", " {'detail': None, 'id': 'job_1466069083851_0005'},\n", " {'detail': None, 'id': 'job_1466069083851_0004'},\n", " {'detail': None, 'id': 'job_1466069083851_0003'},\n", " {'detail': None, 'id': 'job_1466069083851_0002'},\n", " {'detail': None, 'id': 'job_1466069083851_0001'}]"]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["%hd_queue"]}, {"cell_type": "code", "execution_count": 20, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["'RUNNING'"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["df = %hd_job_status jid['id']\n", "df[\"status\"][\"state\"]"]}, {"cell_type": "code", "execution_count": 21, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "2016-06-16 21:13:19,066 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2016-06-16 21:13:19,410 [main] INFO org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2016-06-16 21:13:19,410 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.106.128.41:9010\n", "2016-06-16 21:13:19,410 [main] INFO org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.106.128.41:10200\n", "2016-06-16 21:13:19,504 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2016-06-16 21:13:19,629 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 100% complete\n", "2016-06-16 21:13:19,629 [main] INFO org.apache.pig.tools.pigstats.mapreduce.SimplePigStats - Script Statistics: \n", "\n", "HadoopVersion\tPigVersion\tUserId\tStartedAt\tFinishedAt\tFeatures\n", "2.7.1.2.3.3.1-21\t0.15.0.2.3.3.1-21\thdp\t2016-06-16 21:12:27\t2016-06-16 21:13:19\tGROUP_BY\n", "\n", "Success!\n", "\n", "Job Stats (time in seconds):\n", "JobId\tMaps\tReduces\tMaxMapTime\tMinMapTime\tAvgMapTime\tMedianMapTime\tMaxReduceTime\tMinReduceTime\tAvgReduceTime\tMedianReducetime\tAlias\tFeature\tOutputs\n", "job_1466069083851_0009\t1\t1\t12\t12\t12\t12\t9\t9\t9\t9\tagg,gr,values\tGROUP_BY,COMBINER\twasb://hdblobstorage@hdblobstorage.blob.core.windows.net//centrale/bank_full_tab_no_agg2.txt,\n", "\n", "Input(s):\n", "Successfully read 45212 records from: \"wasb://hdblobstorage@hdblobstorage.blob.core.windows.net//centrale/bank_full_tab_no.txt\"\n", "\n", "Output(s):\n", "Successfully stored 3 records in: \"wasb://hdblobstorage@hdblobstorage.blob.core.windows.net//centrale/bank_full_tab_no_agg2.txt\"\n", "\n", "Counters:\n", "Total records written : 3\n", "Total bytes written : 0\n", "Spillable Memory Manager spill count : 0\n", "Total bags proactively spilled: 0\n", "Total records proactively spilled: 0\n", "\n", "Job DAG:\n", "job_1466069083851_0009\n", "\n", "\n", "2016-06-16 21:13:19,848 [main] INFO org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2016-06-16 21:13:19,848 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.106.128.41:9010\n", "2016-06-16 21:13:19,848 [main] INFO org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.106.128.41:10200\n", "2016-06-16 21:13:19,926 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2016-06-16 21:13:20,160 [main] INFO org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2016-06-16 21:13:20,160 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.106.128.41:9010\n", "2016-06-16 21:13:20,160 [main] INFO org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.106.128.41:10200\n", "2016-06-16 21:13:20,238 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2016-06-16 21:13:20,506 [main] INFO org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://headnodehost:8188/ws/v1/timeline/\n", "2016-06-16 21:13:20,506 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at headnodehost/100.106.128.41:9010\n", "2016-06-16 21:13:20,506 [main] INFO org.apache.hadoop.yarn.client.AHSProxy - Connecting to Application History server at headnodehost/100.106.128.41:10200\n", "2016-06-16 21:13:20,582 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2016-06-16 21:13:20,646 [main] WARN org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Encountered Warning FIELD_DISCARDED_TYPE_CONVERSION_FAILED 7 time(s).\n", "2016-06-16 21:13:20,646 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "2016-06-16 21:13:20,725 [main] INFO org.apache.pig.Main - Pig script completed in 1 minute, 2 seconds and 364 milliseconds (62364 ms)\n", "\n", "
OUT:
\n", "values: {age: double,job: chararray,marital: chararray,education: chararray,default: chararray,balance: double,housing: chararray,loan: chararray,contact: chararray,day: double,month: chararray,duration: double,campaign: double,pdays: double,previous: double,poutcome: chararray,y: chararray}\n", "gr: {group: chararray,values: {(age: double,job: chararray,marital: chararray,education: chararray,default: chararray,balance: double,housing: chararray,loan: chararray,contact: chararray,day: double,month: chararray,duration: double,campaign: double,pdays: double,previous: double,poutcome: chararray,y: chararray)}}\n", "agg: {group: chararray,avg_age: double}\n", "\n", "
"], "text/plain": [""]}, "execution_count": 22, "metadata": {}, "output_type": "execute_result"}], "source": ["hd_tail_stderr -n 50 jid['id']"]}, {"cell_type": "code", "execution_count": 22, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " name | \n", " last_modified | \n", " content_type | \n", " content_length | \n", " blob_type | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " centrale/bank_full.csv | \n", " 2016-06-15 22:17:59+00:00 | \n", " None | \n", " 4610348 | \n", " BlockBlob | \n", "
\n", " \n", " 1 | \n", " centrale/bank_full_tab.txt | \n", " 2016-06-15 22:19:46+00:00 | \n", " None | \n", " 3751306 | \n", " BlockBlob | \n", "
\n", " \n", " 2 | \n", " centrale/bank_full_tab_no.txt | \n", " 2016-06-15 23:00:52+00:00 | \n", " None | \n", " 3751306 | \n", " BlockBlob | \n", "
\n", " \n", " 3 | \n", " centrale/bank_full_tab_no_agg.txt | \n", " 2016-06-16 10:32:11+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 4 | \n", " centrale/bank_full_tab_no_agg.txt/_SUCCESS | \n", " 2016-06-16 10:32:11+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 5 | \n", " centrale/bank_full_tab_no_agg.txt/part-r-00000 | \n", " 2016-06-16 10:32:11+00:00 | \n", " None | \n", " 49 | \n", " BlockBlob | \n", "
\n", " \n", " 6 | \n", " centrale/bank_full_tab_no_agg2.txt | \n", " 2016-06-16 21:13:14+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 7 | \n", " centrale/bank_full_tab_no_agg2.txt/_SUCCESS | \n", " 2016-06-16 21:13:14+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 8 | \n", " centrale/bank_full_tab_no_agg2.txt/part-r-00000 | \n", " 2016-06-16 21:13:13+00:00 | \n", " None | \n", " 49 | \n", " BlockBlob | \n", "
\n", " \n", " 9 | \n", " centrale/scripts/pig/aggage.pig | \n", " 2016-06-15 23:15:54+00:00 | \n", " None | \n", " 782 | \n", " BlockBlob | \n", "
\n", " \n", " 10 | \n", " centrale/scripts/pig/aggage.pig.log | \n", " 2016-06-15 23:16:40+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 11 | \n", " centrale/scripts/pig/aggage.pig.log/exit | \n", " 2016-06-15 23:16:40+00:00 | \n", " None | \n", " 3 | \n", " BlockBlob | \n", "
\n", " \n", " 12 | \n", " centrale/scripts/pig/aggage.pig.log/stderr | \n", " 2016-06-15 23:16:30+00:00 | \n", " None | \n", " 4060 | \n", " BlockBlob | \n", "
\n", " \n", " 13 | \n", " centrale/scripts/pig/aggage.pig.log/stdout | \n", " 2016-06-15 23:16:30+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 14 | \n", " centrale/scripts/pig/aggage2.pig | \n", " 2016-06-16 10:28:16+00:00 | \n", " None | \n", " 853 | \n", " BlockBlob | \n", "
\n", " \n", " 15 | \n", " centrale/scripts/pig/aggage2.pig.log | \n", " 2016-06-16 10:29:04+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 16 | \n", " centrale/scripts/pig/aggage2.pig.log/exit | \n", " 2016-06-16 10:29:04+00:00 | \n", " None | \n", " 3 | \n", " BlockBlob | \n", "
\n", " \n", " 17 | \n", " centrale/scripts/pig/aggage2.pig.log/stderr | \n", " 2016-06-16 10:28:54+00:00 | \n", " None | \n", " 4883 | \n", " BlockBlob | \n", "
\n", " \n", " 18 | \n", " centrale/scripts/pig/aggage2.pig.log/stdout | \n", " 2016-06-16 10:28:54+00:00 | \n", " None | \n", " 613 | \n", " BlockBlob | \n", "
\n", " \n", " 19 | \n", " centrale/scripts/pig/aggage3.pig | \n", " 2016-06-16 21:05:11+00:00 | \n", " None | \n", " 853 | \n", " BlockBlob | \n", "
\n", " \n", " 20 | \n", " centrale/scripts/pig/aggage3.pig.log | \n", " 2016-06-16 21:05:59+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 21 | \n", " centrale/scripts/pig/aggage3.pig.log/exit | \n", " 2016-06-16 21:05:59+00:00 | \n", " None | \n", " 3 | \n", " BlockBlob | \n", "
\n", " \n", " 22 | \n", " centrale/scripts/pig/aggage3.pig.log/stderr | \n", " 2016-06-16 21:05:49+00:00 | \n", " None | \n", " 4883 | \n", " BlockBlob | \n", "
\n", " \n", " 23 | \n", " centrale/scripts/pig/aggage3.pig.log/stdout | \n", " 2016-06-16 21:05:49+00:00 | \n", " None | \n", " 613 | \n", " BlockBlob | \n", "
\n", " \n", " 24 | \n", " centrale/scripts/pig/aggage4.pig | \n", " 2016-06-16 21:11:47+00:00 | \n", " None | \n", " 861 | \n", " BlockBlob | \n", "
\n", " \n", " 25 | \n", " centrale/scripts/pig/aggage4.pig.log | \n", " 2016-06-16 21:13:31+00:00 | \n", " None | \n", " 0 | \n", " BlockBlob | \n", "
\n", " \n", " 26 | \n", " centrale/scripts/pig/aggage4.pig.log/exit | \n", " 2016-06-16 21:13:31+00:00 | \n", " None | \n", " 3 | \n", " BlockBlob | \n", "
\n", " \n", " 27 | \n", " centrale/scripts/pig/aggage4.pig.log/stderr | \n", " 2016-06-16 21:13:21+00:00 | \n", " None | \n", " 16643 | \n", " BlockBlob | \n", "
\n", " \n", " 28 | \n", " centrale/scripts/pig/aggage4.pig.log/stdout | \n", " 2016-06-16 21:13:21+00:00 | \n", " None | \n", " 654 | \n", " BlockBlob | \n", "
\n", " \n", " 29 | \n", " centrale2/bank_full_tab_no.txt | \n", " 2016-06-16 10:18:58+00:00 | \n", " None | \n", " 3751188 | \n", " BlockBlob | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" name last_modified \\\n", "0 centrale/bank_full.csv 2016-06-15 22:17:59+00:00 \n", "1 centrale/bank_full_tab.txt 2016-06-15 22:19:46+00:00 \n", "2 centrale/bank_full_tab_no.txt 2016-06-15 23:00:52+00:00 \n", "3 centrale/bank_full_tab_no_agg.txt 2016-06-16 10:32:11+00:00 \n", "4 centrale/bank_full_tab_no_agg.txt/_SUCCESS 2016-06-16 10:32:11+00:00 \n", "5 centrale/bank_full_tab_no_agg.txt/part-r-00000 2016-06-16 10:32:11+00:00 \n", "6 centrale/bank_full_tab_no_agg2.txt 2016-06-16 21:13:14+00:00 \n", "7 centrale/bank_full_tab_no_agg2.txt/_SUCCESS 2016-06-16 21:13:14+00:00 \n", "8 centrale/bank_full_tab_no_agg2.txt/part-r-00000 2016-06-16 21:13:13+00:00 \n", "9 centrale/scripts/pig/aggage.pig 2016-06-15 23:15:54+00:00 \n", "10 centrale/scripts/pig/aggage.pig.log 2016-06-15 23:16:40+00:00 \n", "11 centrale/scripts/pig/aggage.pig.log/exit 2016-06-15 23:16:40+00:00 \n", "12 centrale/scripts/pig/aggage.pig.log/stderr 2016-06-15 23:16:30+00:00 \n", "13 centrale/scripts/pig/aggage.pig.log/stdout 2016-06-15 23:16:30+00:00 \n", "14 centrale/scripts/pig/aggage2.pig 2016-06-16 10:28:16+00:00 \n", "15 centrale/scripts/pig/aggage2.pig.log 2016-06-16 10:29:04+00:00 \n", "16 centrale/scripts/pig/aggage2.pig.log/exit 2016-06-16 10:29:04+00:00 \n", "17 centrale/scripts/pig/aggage2.pig.log/stderr 2016-06-16 10:28:54+00:00 \n", "18 centrale/scripts/pig/aggage2.pig.log/stdout 2016-06-16 10:28:54+00:00 \n", "19 centrale/scripts/pig/aggage3.pig 2016-06-16 21:05:11+00:00 \n", "20 centrale/scripts/pig/aggage3.pig.log 2016-06-16 21:05:59+00:00 \n", "21 centrale/scripts/pig/aggage3.pig.log/exit 2016-06-16 21:05:59+00:00 \n", "22 centrale/scripts/pig/aggage3.pig.log/stderr 2016-06-16 21:05:49+00:00 \n", "23 centrale/scripts/pig/aggage3.pig.log/stdout 2016-06-16 21:05:49+00:00 \n", "24 centrale/scripts/pig/aggage4.pig 2016-06-16 21:11:47+00:00 \n", "25 centrale/scripts/pig/aggage4.pig.log 2016-06-16 21:13:31+00:00 \n", "26 centrale/scripts/pig/aggage4.pig.log/exit 2016-06-16 21:13:31+00:00 \n", "27 centrale/scripts/pig/aggage4.pig.log/stderr 2016-06-16 21:13:21+00:00 \n", "28 centrale/scripts/pig/aggage4.pig.log/stdout 2016-06-16 21:13:21+00:00 \n", "29 centrale2/bank_full_tab_no.txt 2016-06-16 10:18:58+00:00 \n", "\n", " content_type content_length blob_type \n", "0 None 4610348 BlockBlob \n", "1 None 3751306 BlockBlob \n", "2 None 3751306 BlockBlob \n", "3 None 0 BlockBlob \n", "4 None 0 BlockBlob \n", "5 None 49 BlockBlob \n", "6 None 0 BlockBlob \n", "7 None 0 BlockBlob \n", "8 None 49 BlockBlob \n", "9 None 782 BlockBlob \n", "10 None 0 BlockBlob \n", "11 None 3 BlockBlob \n", "12 None 4060 BlockBlob \n", "13 None 0 BlockBlob \n", "14 None 853 BlockBlob \n", "15 None 0 BlockBlob \n", "16 None 3 BlockBlob \n", "17 None 4883 BlockBlob \n", "18 None 613 BlockBlob \n", "19 None 853 BlockBlob \n", "20 None 0 BlockBlob \n", "21 None 3 BlockBlob \n", "22 None 4883 BlockBlob \n", "23 None 613 BlockBlob \n", "24 None 861 BlockBlob \n", "25 None 0 BlockBlob \n", "26 None 3 BlockBlob \n", "27 None 16643 BlockBlob \n", "28 None 654 BlockBlob \n", "29 None 3751188 BlockBlob "]}, "execution_count": 23, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_ls /centrale"]}, {"cell_type": "code", "execution_count": 23, "metadata": {"collapsed": false}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["usage: blob_downmerge [-h] [-o] remotepath localfile\n", "\n", "download a set of files from a blob storage folder, files will be merged, we\n", "assume the container is the first element to the remote path\n", "\n", "positional arguments:\n", " remotepath remote path of the folder to download\n", " localfile local name for the downloaded merged file\n", "\n", "optional arguments:\n", " -h, --help show this help message and exit\n", " -o, --overwrite overwrite the local file\n", "usage: blob_downmerge [-h] [-o] remotepath localfile\n", "\n"]}], "source": ["%blob_downmerge --help"]}, {"cell_type": "code", "execution_count": 24, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/plain": ["'agg_hadoop3.txt'"]}, "execution_count": 25, "metadata": {}, "output_type": "execute_result"}], "source": ["%blob_down /centrale/bank_full_tab_no_agg2.txt/part-r-00000 agg_hadoop3.txt"]}, {"cell_type": "code", "execution_count": 25, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " 0 | \n", " 1 | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " no | \n", " 41.008823 | \n", "
\n", " \n", " 1 | \n", " yes | \n", " 40.555632 | \n", "
\n", " \n", " 2 | \n", " loan | \n", " NaN | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" 0 1\n", "0 no 41.008823\n", "1 yes 40.555632\n", "2 loan NaN"]}, "execution_count": 26, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas\n", "df = pandas.read_csv(\"agg_hadoop3.txt\", sep=\"\\t\", header=-1)\n", "df"]}, {"cell_type": "markdown", "metadata": {}, "source": ["J'ai oubli\u00e9 d'enlever le header. On v\u00e9rifie que les calcus sont bons en les faisant en local."]}, {"cell_type": "code", "execution_count": 26, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " age | \n", " job | \n", " marital | \n", " education | \n", " default | \n", " balance | \n", " housing | \n", " loan | \n", " contact | \n", " day | \n", " month | \n", " duration | \n", " campaign | \n", " pdays | \n", " previous | \n", " poutcome | \n", " y | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 58 | \n", " management | \n", " married | \n", " tertiary | \n", " no | \n", " 2143 | \n", " yes | \n", " no | \n", " unknown | \n", " 5 | \n", " may | \n", " 261 | \n", " 1 | \n", " -1 | \n", " 0 | \n", " unknown | \n", " no | \n", "
\n", " \n", " 1 | \n", " 44 | \n", " technician | \n", " single | \n", " secondary | \n", " no | \n", " 29 | \n", " yes | \n", " no | \n", " unknown | \n", " 5 | \n", " may | \n", " 151 | \n", " 1 | \n", " -1 | \n", " 0 | \n", " unknown | \n", " no | \n", "
\n", " \n", " 2 | \n", " 33 | \n", " entrepreneur | \n", " married | \n", " secondary | \n", " no | \n", " 2 | \n", " yes | \n", " yes | \n", " unknown | \n", " 5 | \n", " may | \n", " 76 | \n", " 1 | \n", " -1 | \n", " 0 | \n", " unknown | \n", " no | \n", "
\n", " \n", " 3 | \n", " 47 | \n", " blue-collar | \n", " married | \n", " unknown | \n", " no | \n", " 1506 | \n", " yes | \n", " no | \n", " unknown | \n", " 5 | \n", " may | \n", " 92 | \n", " 1 | \n", " -1 | \n", " 0 | \n", " unknown | \n", " no | \n", "
\n", " \n", " 4 | \n", " 33 | \n", " unknown | \n", " single | \n", " unknown | \n", " no | \n", " 1 | \n", " no | \n", " no | \n", " unknown | \n", " 5 | \n", " may | \n", " 198 | \n", " 1 | \n", " -1 | \n", " 0 | \n", " unknown | \n", " no | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" age job marital education default balance housing loan \\\n", "0 58 management married tertiary no 2143 yes no \n", "1 44 technician single secondary no 29 yes no \n", "2 33 entrepreneur married secondary no 2 yes yes \n", "3 47 blue-collar married unknown no 1506 yes no \n", "4 33 unknown single unknown no 1 no no \n", "\n", " contact day month duration campaign pdays previous poutcome y \n", "0 unknown 5 may 261 1 -1 0 unknown no \n", "1 unknown 5 may 151 1 -1 0 unknown no \n", "2 unknown 5 may 76 1 -1 0 unknown no \n", "3 unknown 5 may 92 1 -1 0 unknown no \n", "4 unknown 5 may 198 1 -1 0 unknown no "]}, "execution_count": 27, "metadata": {}, "output_type": "execute_result"}], "source": ["df = pandas.read_csv(\"bank-full.csv\", sep=\";\")\n", "df.head()"]}, {"cell_type": "code", "execution_count": 27, "metadata": {"collapsed": false}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " age | \n", "
\n", " \n", " loan | \n", " | \n", "
\n", " \n", " \n", " \n", " no | \n", " 41.008823 | \n", "
\n", " \n", " yes | \n", " 40.555632 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" age\n", "loan \n", "no 41.008823\n", "yes 40.555632"]}, "execution_count": 28, "metadata": {}, "output_type": "execute_result"}], "source": ["df[[\"loan\", \"age\"]].groupby(\"loan\").mean()"]}, {"cell_type": "code", "execution_count": 28, "metadata": {"collapsed": true}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2"}}, "nbformat": 4, "nbformat_minor": 2}