from jyquickhelper import add_notebook_menu
# NOTE(review): presumably inserts a navigation menu / table of contents into
# the notebook -- confirm against the jyquickhelper documentation.
add_notebook_menu()


import pyensae, pyquickhelper
from pyquickhelper.ipythonhelper import open_html_form
# Connection settings for the cluster gateway. The username and password are
# left empty on purpose so that no credential is stored in the notebook.
# NOTE(review): open_html_form presumably displays an HTML form and stores the
# submitted values back into the variable named by key_save ("params") --
# confirm against the pyquickhelper documentation.
params={"server":"df...fr", "username":"", "password":""}
open_html_form(params=params,title="server + credentials", key_save="params")


import pyensae
# Load the IPython extensions.
# NOTE(review): presumably these register the %remote_*, %dfs_* and %%PIG
# magics used in the following cells -- confirm which extension provides which.
%load_ext pyensae
%load_ext pyenbc
# Copy the credentials out of the form dictionary into plain variables.
# NOTE(review): %remote_open presumably reads password / server / username
# from the notebook namespace -- confirm.
password = params["password"]
server = params["server"]
username = params["username"]
# Open the remote connection; the returned client object is reused by the
# later cells (dfs_exists, dfs_rm, pig_submit, download_cluster).
client = %remote_open
# Display the client object as the cell output.
client

<pyensae.remote.ssh_remote_connection.ASSHClient at 0x79ad290>


import random

# Build the local sample file: 10000 draws from random.random(), one value
# per line, each line terminated by a newline.
with open("random.sample.txt", "w") as f:
    f.writelines(str(random.random()) + "\n" for _ in range(10000))


# NOTE(review): presumably creates the directory "random" on the cluster's
# distributed file system; the later uploads and job outputs live under it.
%dfs_mkdir random

('', '')


# NOTE(review): presumably uploads the local file (first argument) to the
# cluster path given as second argument -- confirm the argument order.
%remote_up_cluster random.sample.txt random/random.sample.txt

'random/random.sample.txt'


%%PIG histogram.pig

-- Load the sample file as a single column x of type double.
values = LOAD 'random/random.sample.txt' USING PigStorage('\t') AS (x:double);

-- Attach to each value its bin h: x is divided by the bin width, truncated
-- to an int, then multiplied back by the width. The width is the bins
-- parameter substituted into the script when it is submitted.
values_h = FOREACH values GENERATE x, ((int)(x / $bins)) * $bins AS h ;

-- Gather the rows that fall into the same bin.
hist_group = GROUP values_h BY h ;

-- One output row per bin: the bin value and the number of samples in it.
hist = FOREACH hist_group GENERATE group, COUNT(values_h) AS nb ;

-- Store the histogram as tab-separated text. The bin width is part of the
-- output name, so each width gets its own output location.
STORE hist INTO 'random/histo_$bins.txt' USING PigStorage('\t') ;


# Remove the output of a previous run, if any, before submitting the job.
# recursive=True because the job output is listed as a directory on the
# distributed file system.
# NOTE(review): presumably required because the Pig STORE fails when its
# target already exists -- confirm.
if client.dfs_exists("random/histo_0.1.txt"):
    client.dfs_rm("random/histo_0.1.txt", recursive=True)


# Submit the Pig script with the bins parameter set to "0.1".
# NOTE(review): redirection="redirection" presumably names the remote files
# that receive the job's standard output and error (redirection.err is read
# in the next cell) -- confirm against the pyensae documentation.
client.pig_submit("histogram.pig", redirection="redirection", params =dict(bins="0.1") )

('', '')


# Run "tail" on redirection.err to check how the job ended.
# NOTE(review): presumably executed on the remote gateway -- confirm.
%remote_cmd tail redirection.err

Total bytes written : 131
Spillable Memory Manager spill count : 0
Total bags proactively spilled: 0
Total records proactively spilled: 0

Job DAG:
job_1414491244634_0168


2014-11-28 00:11:44,435 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!


# NOTE(review): presumably lists the content of the "random" directory on the
# distributed file system, to check that the job output is present.
%dfs_ls random


# os is needed for the local clean-up below; it is not imported by any
# earlier cell, so import it here to keep this cell runnable on its own.
import os

# Drop any stale local copy so the download starts from a clean file.
if os.path.exists("histo.txt"):
    os.remove("histo.txt")

# Fetch the job output from the cluster into the local file histo.txt.
# NOTE(review): merge=True presumably concatenates the part files of the
# output directory into a single local file -- confirm.
client.download_cluster("random/histo_0.1.txt","histo.txt", merge=True)

'random/histo_0.1.txt'


import pandas
import matplotlib.pyplot as plt

# Use the ggplot look for the figure.
plt.style.use('ggplot')

# Read the downloaded histogram: two tab-separated columns, named here
# "bin" and "nb".
df = pandas.read_csv("histo.txt", sep="\t", names=["bin", "nb"])

# Bar chart of the count (nb) for each bin.
df.plot(x="bin", y="nb", kind="bar")

<matplotlib.axes._subplots.AxesSubplot at 0xaf7c330>

	attributes	code	alias	folder	size	date	time	name	isdir
0	drwxr-xr-x	-	xavierdupre	xavierdupre	0	2014-11-28	00:11	random/histo_0.1.txt	True
1	-rw-r--r--	3	xavierdupre	xavierdupre	202586	2014-11-27	23:38	random/random.sample.txt	False

PIG et Paramètres sur Cloudera - énoncé

Paramètres

Connexion au cluster

Upload version

PIG et paramètres

Exercice 1 : min, max