``. S'il n'y a pas d'erreur, il rejoint la file d'attente."]}, {"cell_type": "code", "execution_count": 41, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/html": ["\n", "2015-10-29 00:59:45,391 [main] INFO org.apache.pig.Main - Apache Pig version 0.12.0-cdh5.0.2 (rexported) compiled Jun 09 2014, 11:14:51\n", "2015-10-29 00:59:45,392 [main] INFO org.apache.pig.Main - Logging error messages to: /home/xavierdupre/pig_1446076785387.log\n", "2015-10-29 00:59:46,194 [main] INFO org.apache.pig.impl.util.Utils - Default bootup file /home/xavierdupre/.pigbootup not found\n", "2015-10-29 00:59:46,430 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address\n", "2015-10-29 00:59:46,430 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-10-29 00:59:46,431 [main] INFO org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: hdfs://nameservice1\n", "2015-10-29 00:59:48,188 [main] WARN org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_CHARARRAY 1 time(s).\n", "2015-10-29 00:59:48,211 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig features used in the script: FILTER\n", "2015-10-29 00:59:48,285 [main] INFO org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer - {RULES_ENABLED=[AddForEach, ColumnMapKeyPrune, DuplicateForEachColumnRewrite, GroupByConstParallelSetter, ImplicitSplitInserter, LimitOptimizer, LoadTypeCastInserter, MergeFilter, MergeForEach, NewPartitionFilterOptimizer, PartitionFilterOptimizer, PushDownForEachFlatten, PushUpFilter, SplitFilter, StreamTypeCastInserter], RULES_DISABLED=[FilterLogicExpressionSimplifier]}\n", "2015-10-29 00:59:48,333 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.textoutputformat.separator is deprecated. Instead, use mapreduce.output.textoutputformat.separator\n", "2015-10-29 00:59:48,511 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler - File concatenation threshold: 100 optimistic? false\n", "2015-10-29 00:59:48,564 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size before optimization: 1\n", "2015-10-29 00:59:48,565 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size after optimization: 1\n", "2015-10-29 00:59:48,728 [main] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at ws09-sr1.tl.teralab-datascience.fr/10.200.209.11:8032\n", "2015-10-29 00:59:48,926 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job\n", "2015-10-29 00:59:49,021 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.job.reduce.markreset.buffer.percent is deprecated. Instead, use mapreduce.reduce.markreset.buffer.percent\n", "2015-10-29 00:59:49,021 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3\n", "2015-10-29 00:59:49,022 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.output.compress is deprecated. 
Instead, use mapreduce.output.fileoutputformat.compress\n", "2015-10-29 00:59:49,025 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job6897107947228914959.jar\n", "2015-10-29 00:59:53,488 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job6897107947228914959.jar created\n", "2015-10-29 00:59:53,489 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.jar is deprecated. Instead, use mapreduce.job.jar\n", "2015-10-29 00:59:53,538 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job\n", "2015-10-29 00:59:53,556 [main] INFO org.apache.pig.data.SchemaTupleFrontend - Key [pig.schematuple] is false, will not generate code.\n", "2015-10-29 00:59:53,557 [main] INFO org.apache.pig.data.SchemaTupleFrontend - Starting process to move generated code to distributed cache\n", "2015-10-29 00:59:53,559 [main] INFO org.apache.pig.data.SchemaTupleFrontend - Setting key [pig.schematuple.classes] with classes to deserialize []\n", "2015-10-29 00:59:53,654 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission.\n", "2015-10-29 00:59:53,656 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.job.tracker.http.address is deprecated. Instead, use mapreduce.jobtracker.http.address\n", "2015-10-29 00:59:53,668 [JobControl] INFO org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at ws09-sr1.tl.teralab-datascience.fr/10.200.209.11:8032\n", "2015-10-29 00:59:53,752 [JobControl] INFO org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. Instead, use fs.defaultFS\n", "2015-10-29 00:59:55,086 [JobControl] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1\n", "2015-10-29 00:59:55,086 [JobControl] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1\n", "2015-10-29 00:59:55,127 [JobControl] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1\n", "2015-10-29 00:59:55,999 [JobControl] INFO org.apache.hadoop.mapreduce.JobSubmitter - number of splits:1\n", "2015-10-29 00:59:56,108 [JobControl] INFO org.apache.hadoop.conf.Configuration.deprecation - fs.default.name is deprecated. 
Instead, use fs.defaultFS\n", "2015-10-29 00:59:56,748 [JobControl] INFO org.apache.hadoop.mapreduce.JobSubmitter - Submitting tokens for job: job_1444669880271_0036\n", "2015-10-29 00:59:57,100 [JobControl] INFO org.apache.hadoop.yarn.client.api.impl.YarnClientImpl - Submitted application application_1444669880271_0036\n", "2015-10-29 00:59:57,172 [JobControl] INFO org.apache.hadoop.mapreduce.Job - The url to track the job: http://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0036/\n", "2015-10-29 00:59:57,173 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_1444669880271_0036\n", "2015-10-29 00:59:57,173 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Processing aliases filt,myinput\n", "2015-10-29 00:59:57,174 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - detailed locations: M: myinput[2,10],filt[6,7],myinput[-1,-1] C: R: \n", "2015-10-29 00:59:57,233 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 0% complete\n", "2015-10-29 01:00:15,280 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 50% complete\n", "2015-10-29 01:00:22,643 [main] INFO org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server\n", "2015-10-29 01:00:22,911 [main] INFO org.apache.hadoop.conf.Configuration.deprecation - mapred.reduce.tasks is deprecated. Instead, use mapreduce.job.reduces\n", "2015-10-29 01:00:22,984 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 100% complete\n", "2015-10-29 01:00:22,988 [main] INFO org.apache.pig.tools.pigstats.SimplePigStats - Script Statistics: \n", "\n", "HadoopVersion\tPigVersion\tUserId\tStartedAt\tFinishedAt\tFeatures\n", "2.3.0-cdh5.0.2\t0.12.0-cdh5.0.2\txavierdupre\t2015-10-29 00:59:48\t2015-10-29 01:00:22\tFILTER\n", "\n", "Success!\n", "\n", "Job Stats (time in seconds):\n", "JobId\tMaps\tReduces\tMaxMapTime\tMinMapTIme\tAvgMapTime\tMedianMapTime\tMaxReduceTime\tMinReduceTime\tAvgReduceTime\tMedianReducetime\tAlias\tFeature\tOutputs\n", "job_1444669880271_0036\t1\t0\t4\t4\t4\t4\tn/a\tn/a\tn/a\tn/a\tfilt,myinput\tMAP_ONLY\thdfs://nameservice1/user/xavierdupre/ConfLongDemo_JSI.small.example2.walking_test20.txt,\n", "\n", "Input(s):\n", "Successfully read 1000 records (133117 bytes) from: \"hdfs://nameservice1/user/xavierdupre/ConfLongDemo_JSI.small.example.txt\"\n", "\n", "Output(s):\n", "Successfully stored 170 records (22166 bytes) in: \"hdfs://nameservice1/user/xavierdupre/ConfLongDemo_JSI.small.example2.walking_test20.txt\"\n", "\n", "Counters:\n", "Total records written : 170\n", "Total bytes written : 22166\n", "Spillable Memory Manager spill count : 0\n", "Total bags proactively spilled: 0\n", "Total records proactively spilled: 0\n", "\n", "Job DAG:\n", "job_1444669880271_0036\n", "\n", "\n", "2015-10-29 01:00:23,159 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "\n", "
"], "text/plain": [""]}, "execution_count": 42, "metadata": {}, "output_type": "execute_result"}], "source": ["%pig_submit select2.pig -r None"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Derri\u00e8re cette command magique, il y a la fonction [pig_submit](http://www.xavierdupre.fr/app/pyensae/helpsphinx/pyensae/remote/ssh_remote_connection.html?highlight=pig_submit#pyensae.remote.ssh_remote_connection.ASSHClient.pig_submit)."]}, {"cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Help on method pig_submit in module pyensae.remote.ssh_remote_connection:\n", "\n", "pig_submit(pig_file, dependencies=None, params=None, redirection='redirection.pig', local=False, stop_on_failure=False, check=False, no_exception=True, fLOG=) method of pyensae.remote.ssh_remote_connection.ASSHClient instance\n", " submits a PIG script, it first upload the script\n", " to the default folder and submit it\n", " \n", " @param pig_file pig script (local)\n", " @param dependencies others files to upload (still in the default folder)\n", " @param params parameters to send to the job\n", " @param redirection string empty or not\n", " @param local local run or not (option `-x local `_) (in that case, redirection will be empty)\n", " @param stop_on_failure if True, add option ``-stop_on_failure`` on the command line\n", " @param check if True, add option ``-check`` (in that case, redirection will be empty)\n", " @param no_exception sent to @see me execute_command\n", " @param fLOG logging function\n", " @return out, err from @see me execute_command\n", " \n", " If *redirection* is not empty, the job is submitted but\n", " the function returns after the standard output and error were\n", " redirected to ``redirection.out`` and ``redirection.err``.\n", " \n", " The first file will contain the results of commands\n", " `DESCRIBE `_\n", " `DUMP `_,\n", " `EXPLAIN `_.\n", " The standard error receives logs and exceptions.\n", " \n", " The function executes the command line::\n", " \n", " pig -execute -f \n", " \n", " With redirection::\n", " \n", " pig -execute -f 2> redirection.pig.err 1> redirection.pig.out &\n", " \n", " .. versionadded:: 1.1\n", "\n"]}], "source": ["help(ssh.pig_submit)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On retrouve bien les m\u00eames r\u00e9sultats. Cependant, on n'a pas envie d'attendre la fin d'un job pour reprendre la main sur le notebook. Pour ce faire, on cr\u00e9e un second job."]}, {"cell_type": "code", "execution_count": 43, "metadata": {"collapsed": true}, "outputs": [], "source": ["%%PIG select3.pig\n", "\n", "myinput = LOAD 'ConfLongDemo_JSI.small.example.txt' \n", " using PigStorage(',') \n", " AS (index:long, sequence, tag, timestamp:long, dateformat, x:double,y:double, z:double, activity) ;\n", "\n", "filt = FILTER myinput BY activity == 'walking' ;\n", "\n", "STORE filt INTO 'ConfLongDemo_JSI.small.example2.walking_test30_nowait.txt' USING PigStorage() ;"]}, {"cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
"], "text/plain": [""]}, "execution_count": 45, "metadata": {}, "output_type": "execute_result"}], "source": ["%pig_submit select3.pig"]}, {"cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "Output(s):\n", "Successfully stored 170 records (22166 bytes) in: \"hdfs://nameservice1/user/xavierdupre/ConfLongDemo_JSI.small.example2.walking_test30_nowait.txt\"\n", "\n", "Counters:\n", "Total records written : 170\n", "Total bytes written : 22166\n", "Spillable Memory Manager spill count : 0\n", "Total bags proactively spilled: 0\n", "Total records proactively spilled: 0\n", "\n", "Job DAG:\n", "job_1444669880271_0037\n", "\n", "\n", "2015-10-29 01:03:14,257 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!\n", "\n", "
"], "text/plain": [""]}, "execution_count": 46, "metadata": {}, "output_type": "execute_result"}], "source": ["%remote_cmd tail redirection.pig.err -n 15"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On v\u00e9rifie le contenu du cluster :"]}, {"cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "Found 9 items\n", "drwx------ - xavierdupre xavierdupre 0 2015-10-29 01:00 .Trash\n", "drwx------ - xavierdupre xavierdupre 0 2015-10-29 01:03 .staging\n", "-rw-r--r-- 3 xavierdupre xavierdupre 132727 2015-10-29 00:38 ConfLongDemo_JSI.small.example.txt\n", "drwxr-xr-x - xavierdupre xavierdupre 0 2015-10-29 00:42 ConfLongDemo_JSI.small.example2.walking.txt\n", "drwxr-xr-x - xavierdupre xavierdupre 0 2015-10-29 00:52 ConfLongDemo_JSI.small.example2.walking_test.txt\n", "drwxr-xr-x - xavierdupre xavierdupre 0 2015-10-29 00:53 ConfLongDemo_JSI.small.example2.walking_test2.txt\n", "drwxr-xr-x - xavierdupre xavierdupre 0 2015-10-29 01:00 ConfLongDemo_JSI.small.example2.walking_test20.txt\n", "drwxr-xr-x - xavierdupre xavierdupre 0 2015-10-29 01:03 ConfLongDemo_JSI.small.example2.walking_test30_nowait.txt\n", "drwxr-xr-x - xavierdupre xavierdupre 0 2014-11-20 23:43 unitest2\n", "\n", "
"], "text/plain": [""]}, "execution_count": 47, "metadata": {}, "output_type": "execute_result"}], "source": ["%remote_cmd hdfs dfs -ls "]}, {"cell_type": "markdown", "metadata": {}, "source": ["La sortie n'est pas un fichier mais un r\u00e9pertoire. Chaque partie provient d'une machine diff\u00e9rente. Dans notre cas, les donn\u00e9es \u00e9tant de petite taille le calcul n'a pas \u00e9t\u00e9 distribu\u00e9."]}, {"cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "6618983\t27.05.2009 14:03:47:663\t3.066070079803467\t1.6573837995529177\t1.0677484273910522\twalking\n", "696\tA01\t020-000-032-221\t633790226276889279\t27.05.2009 14:03:47:690\t4.3041510581970215\t1.6276369094848633\t1.2260531187057495\twalking\n", "697\tA01\t010-000-024-033\t633790226277159576\t27.05.2009 14:03:47:717\t3.1501431465148926\t1.8083082437515257\t-0.015722407028079033\twalking\n", "698\tA01\t010-000-030-096\t633790226277429870\t27.05.2009 14:03:47:743\t2.9654340744018555\t1.7824335098266602\t0.2285633087158203\twalking\n", "699\tA01\t020-000-032-221\t633790226277970452\t27.05.2009 14:03:47:797\t4.3224053382873535\t1.5714523792266846\t1.4004991054534912\twalking\n", "700\tA01\t010-000-024-033\t633790226278240749\t27.05.2009 14:03:47:823\t3.1330645084381104\t1.7693294286727903\t-0.022590305656194687\twalking\n", "701\tA01\t020-000-033-111\t633790226278781331\t27.05.2009 14:03:47:877\t3.121254205703736\t1.5498417615890503\t1.0481393337249756\twalking\n", "702\tA01\t020-000-032-221\t633790226279051629\t27.05.2009 14:03:47:907\t3.281498432159424\t1.4987335205078125\t0.6204121708869934\twalking\n", "\n", "
"], "text/plain": [""]}, "execution_count": 48, "metadata": {}, "output_type": "execute_result"}], "source": ["%remote_cmd hdfs dfs -tail ConfLongDemo_JSI.small.example2.walking.txt/part-m-00000"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On retrouve bien les m\u00eames r\u00e9sultats. Cependant, on n'a pas envie d'attendre la fin d'un job pour reprendre la main sur le notebook. Pour ce faire, on cr\u00e9e un second job."]}, {"cell_type": "markdown", "metadata": {}, "source": ["On regarde la liste des jobs en cours avec [hadoop queue](http://hadoop.apache.org/docs/r1.0.4/commands_manual.html#queue) :"]}, {"cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "======================\n", "Queue Name : root.xavierdupre \n", "Queue State : running \n", "Scheduling Info : Capacity: 0.0, MaximumCapacity: UNDEFINED, CurrentCapacity: 0.0 \n", "Total jobs:0\n", " JobId\t State\t StartTime\t UserName\t Queue\t Priority\t UsedContainers\t RsvdContainers\t UsedMem\t RsvdMem\t NeededMem\t AM info\n", "\n", "
"], "text/plain": [""]}, "execution_count": 49, "metadata": {}, "output_type": "execute_result"}], "source": ["%remote_cmd hadoop queue -info root.xavierdupre -showJobs"]}, {"cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "Found 2 items\n", "-rw-r--r-- 3 xavierdupre xavierdupre 0 2015-10-29 01:03 ConfLongDemo_JSI.small.example2.walking_test30_nowait.txt/_SUCCESS\n", "-rw-r--r-- 3 xavierdupre xavierdupre 22166 2015-10-29 01:03 ConfLongDemo_JSI.small.example2.walking_test30_nowait.txt/part-m-00000\n", "\n", "
"], "text/plain": [""]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["%remote_cmd hdfs dfs -ls ConfLongDemo_JSI.small.example2.walking_test30_nowait.txt"]}, {"cell_type": "markdown", "metadata": {}, "source": ["C'est plus pratique mais la correction des erreurs quand elles se produisent n'est plus aussi pratique. On termine par une instruction qui permet de r\u00e9cup\u00e9rer tous les fichiers d'un m\u00eame rep\u00e9rtoire en une seule fois :"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "\n", "
"], "text/plain": [""]}, "execution_count": 51, "metadata": {}, "output_type": "execute_result"}], "source": ["%remote_cmd hdfs dfs -getmerge ConfLongDemo_JSI.small.example2.walking_test30_nowait.txt toutenun.txt"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Le fichier est maintenant sur la passerelle."]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "-rw-r--r-- 1 xavierdupre xavierdupre 22166 Oct 29 01:06 toutenun.txt\n", "\n", "
"], "text/plain": [""]}, "execution_count": 52, "metadata": {}, "output_type": "execute_result"}], "source": ["%remote_cmd ls -l tout*"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Lorsqu'on lance des jobs cons\u00e9quent, il est important de savoir comment les arr\u00eater avec [hadoop job](http://hadoop.apache.org/docs/r1.0.4/commands_manual.html#job) ``-kill jobid`` :"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "Total jobs:28\n", " JobId\t State\t StartTime\t UserName\t Queue\t Priority\t UsedContainers\t RsvdContainers\t UsedMem\t RsvdMem\t NeededMem\t AM info\n", " job_1444669880271_0009\t SUCCEEDED\t 1444779082040\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0009/jobhistory/job/job_1444669880271_0009\n", " job_1444669880271_0010\t SUCCEEDED\t 1444779114402\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0010/jobhistory/job/job_1444669880271_0010\n", " job_1444669880271_0018\t SUCCEEDED\t 1444781859339\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0018/jobhistory/job/job_1444669880271_0018\n", " job_1444669880271_0033\t SUCCEEDED\t 1446076296449\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0033/jobhistory/job/job_1444669880271_0033\n", " job_1444669880271_0007\t SUCCEEDED\t 1444778960311\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0007/jobhistory/job/job_1444669880271_0007\n", " job_1444669880271_0025\t SUCCEEDED\t 1444812309930\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0025/jobhistory/job/job_1444669880271_0025\n", " job_1444669880271_0036\t SUCCEEDED\t 1446076797029\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0036/jobhistory/job/job_1444669880271_0036\n", " job_1444669880271_0022\t SUCCEEDED\t 1444782235940\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0022/jobhistory/job/job_1444669880271_0022\n", " job_1444669880271_0013\t FAILED\t 1444780493283\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0013/jobhistory/job/job_1444669880271_0013\n", " job_1444669880271_0008\t SUCCEEDED\t 1444778993134\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0008/jobhistory/job/job_1444669880271_0008\n", " job_1444669880271_0035\t SUCCEEDED\t 1446076394207\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0035/jobhistory/job/job_1444669880271_0035\n", " job_1444669880271_0001\t SUCCEEDED\t 1444778245903\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 
0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0001/jobhistory/job/job_1444669880271_0001\n", " job_1444669880271_0005\t SUCCEEDED\t 1444778822775\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0005/jobhistory/job/job_1444669880271_0005\n", " job_1444669880271_0028\t SUCCEEDED\t 1444948433254\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0028/jobhistory/job/job_1444669880271_0028\n", " job_1444669880271_0002\t SUCCEEDED\t 1444778285584\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0002/jobhistory/job/job_1444669880271_0002\n", " job_1444669880271_0030\t SUCCEEDED\t 1445990873471\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0030/jobhistory/job/job_1444669880271_0030\n", " job_1444669880271_0027\t SUCCEEDED\t 1444948400340\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0027/jobhistory/job/job_1444669880271_0027\n", " job_1444669880271_0024\t SUCCEEDED\t 1444812276836\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0024/jobhistory/job/job_1444669880271_0024\n", " job_1444669880271_0019\t SUCCEEDED\t 1444781890983\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0019/jobhistory/job/job_1444669880271_0019\n", " job_1444669880271_0003\t SUCCEEDED\t 1444778547755\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0003/jobhistory/job/job_1444669880271_0003\n", " job_1444669880271_0006\t SUCCEEDED\t 1444778856950\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0006/jobhistory/job/job_1444669880271_0006\n", " job_1444669880271_0032\t SUCCEEDED\t 1446075704284\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0032/jobhistory/job/job_1444669880271_0032\n", " job_1444669880271_0021\t SUCCEEDED\t 1444782202256\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0021/jobhistory/job/job_1444669880271_0021\n", " job_1444669880271_0037\t SUCCEEDED\t 1446076967758\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0037/jobhistory/job/job_1444669880271_0037\n", " job_1444669880271_0031\t SUCCEEDED\t 1445990906117\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0031/jobhistory/job/job_1444669880271_0031\n", " job_1444669880271_0004\t SUCCEEDED\t 1444778575799\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0004/jobhistory/job/job_1444669880271_0004\n", " 
job_1444669880271_0034\t SUCCEEDED\t 1446076302576\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0034/jobhistory/job/job_1444669880271_0034\n", " job_1444669880271_0012\t SUCCEEDED\t 1444780465372\t xavierdupre\troot.xavierdupre\t NORMAL\t 0\t 0\t 0M\t 0M\t 0M\thttp://ws09-sr1.tl.teralab-datascience.fr:8088/proxy/application_1444669880271_0012/jobhistory/job/job_1444669880271_0012\n", "\n", "
"], "text/plain": [""]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["%remote_cmd hadoop job -list all"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On peut tuer un job lorsqu'il est dans la file d'attente ou en train de s'ex\u00e9cuter."]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": ["#%remote_cmd hadoop job -kill job_1414491244634_0002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Partie 3 : syntaxe PIG et exercices\n", "\n", "Dans cette partie, l'objectif est de transcrire un ``GROUP BY`` en PIG, un ``JOIN`` et de combiner toutes ces op\u00e9rations en un seul job au cours du second exercice. Ces exemples utilisent de petits fichiers. Utiliser un job Map/Reduce n'a pas beaucoup d'int\u00e9r\u00eat \u00e0 moins que la taille de ces fichiers n'atteigne un giga-octets. Les instructions sont \u00e0 chercher dans cette page : [Pig Latin Basics](http://pig.apache.org/docs/r0.12.0/basic.html)."]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Exercice 1 : GROUP BY"]}, {"cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " activity | \n", " nb | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " falling | \n", " 2973 | \n", "
\n", " \n", " 1 | \n", " lying | \n", " 54480 | \n", "
\n", " \n", " 2 | \n", " lying down | \n", " 6168 | \n", "
\n", " \n", " 3 | \n", " on all fours | \n", " 5210 | \n", "
\n", " \n", " 4 | \n", " sitting | \n", " 27244 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" activity nb\n", "0 falling 2973\n", "1 lying 54480\n", "2 lying down 6168\n", "3 on all fours 5210\n", "4 sitting 27244"]}, "execution_count": 55, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas, sqlite3\n", "con = sqlite3.connect(\"ConfLongDemo_JSI.db3\")\n", "df = pandas.read_sql(\"\"\"SELECT activity, count(*) as nb FROM person GROUP BY activity\"\"\", con)\n", "con.close()\n", "df.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Il faut maintenant le faire avec PIG."]}, {"cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": []}, {"cell_type": "markdown", "metadata": {}, "source": ["### Exercice 2 : JOIN"]}, {"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [{"data": {"text/html": ["\n", "
\n", " \n", " \n", " | \n", " index | \n", " sequence | \n", " tag | \n", " timestamp | \n", " dateformat | \n", " x | \n", " y | \n", " z | \n", " activity | \n", " nb | \n", "
\n", " \n", " \n", " \n", " 0 | \n", " 0 | \n", " A01 | \n", " 010-000-024-033 | \n", " 633790226051280329 | \n", " 27.05.2009 14:03:25:127 | \n", " 4.062931 | \n", " 1.892434 | \n", " 0.507425 | \n", " walking | \n", " 32710 | \n", "
\n", " \n", " 1 | \n", " 1 | \n", " A01 | \n", " 020-000-033-111 | \n", " 633790226051820913 | \n", " 27.05.2009 14:03:25:183 | \n", " 4.291954 | \n", " 1.781140 | \n", " 1.344495 | \n", " walking | \n", " 32710 | \n", "
\n", " \n", " 2 | \n", " 2 | \n", " A01 | \n", " 020-000-032-221 | \n", " 633790226052091205 | \n", " 27.05.2009 14:03:25:210 | \n", " 4.359101 | \n", " 1.826456 | \n", " 0.968821 | \n", " walking | \n", " 32710 | \n", "
\n", " \n", " 3 | \n", " 3 | \n", " A01 | \n", " 010-000-024-033 | \n", " 633790226052361498 | \n", " 27.05.2009 14:03:25:237 | \n", " 4.087835 | \n", " 1.879999 | \n", " 0.466983 | \n", " walking | \n", " 32710 | \n", "
\n", " \n", " 4 | \n", " 4 | \n", " A01 | \n", " 010-000-030-096 | \n", " 633790226052631792 | \n", " 27.05.2009 14:03:25:263 | \n", " 4.324462 | \n", " 2.072460 | \n", " 0.488065 | \n", " walking | \n", " 32710 | \n", "
\n", " \n", "
\n", "
"], "text/plain": [" index sequence tag timestamp \\\n", "0 0 A01 010-000-024-033 633790226051280329 \n", "1 1 A01 020-000-033-111 633790226051820913 \n", "2 2 A01 020-000-032-221 633790226052091205 \n", "3 3 A01 010-000-024-033 633790226052361498 \n", "4 4 A01 010-000-030-096 633790226052631792 \n", "\n", " dateformat x y z activity nb \n", "0 27.05.2009 14:03:25:127 4.062931 1.892434 0.507425 walking 32710 \n", "1 27.05.2009 14:03:25:183 4.291954 1.781140 1.344495 walking 32710 \n", "2 27.05.2009 14:03:25:210 4.359101 1.826456 0.968821 walking 32710 \n", "3 27.05.2009 14:03:25:237 4.087835 1.879999 0.466983 walking 32710 \n", "4 27.05.2009 14:03:25:263 4.324462 2.072460 0.488065 walking 32710 "]}, "execution_count": 57, "metadata": {}, "output_type": "execute_result"}], "source": ["con = sqlite3.connect(\"ConfLongDemo_JSI.db3\")\n", "df = pandas.read_sql(\"\"\"SELECT person.*, A.nb FROM person INNER JOIN (\n", " SELECT activity, count(*) as nb FROM person GROUP BY activity) AS A\n", " ON person.activity == A.activity\"\"\", con)\n", "con.close()\n", "df.head()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Idem, maintenant il faut le faire avec PIG."]}, {"cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4"}}, "nbformat": 4, "nbformat_minor": 2}