2018-09-18 - Rappels sur pandas et maplotlib#
Links: notebook
, html, python
, slides, GitHub
Manipulation de données autour du jeu des passagers du Titanic qu’on peut récupérer sur opendatasoft ou awesome-public-datasets.
import pandas
df = pandas.read_csv("titanic.csv/titanic.csv")
type(df)
pandas.core.frame.DataFrame
df.head(n=2)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
subset = df[ ["PassengerId", "Survived", "Pclass"] ]
subset.head(n=2)
PassengerId | Survived | Pclass | |
---|---|---|---|
0 | 1 | 0 | 3 |
1 | 2 | 1 | 1 |
survived = subset[ ["Survived", "Pclass"] ].groupby(["Pclass"]).sum()
compte = subset[ ["Survived", "Pclass"] ].groupby(["Pclass"]).count()
compte.columns = ['total']
survived
Survived | |
---|---|
Pclass | |
1 | 136 |
2 | 87 |
3 | 119 |
compte
total | |
---|---|
Pclass | |
1 | 216 |
2 | 184 |
3 | 491 |
jointure = survived.join(compte)
jointure
Survived | total | |
---|---|---|
Pclass | ||
1 | 136 | 216 |
2 | 87 | 184 |
3 | 119 | 491 |
jointure["survie"] = jointure['Survived'] / jointure.total
jointure
Survived | total | survie | |
---|---|---|---|
Pclass | |||
1 | 136 | 216 | 0.629630 |
2 | 87 | 184 | 0.472826 |
3 | 119 | 491 | 0.242363 |
%matplotlib inline
ax = jointure[['survie']].plot(kind="bar", figsize=(2, 2))
ax.set_title("Titanic")
ax.set_ylabel("%");
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 2, figsize=(8,3))
jointure[['survie']].plot(kind="bar", ax=ax[0])
ax[0].set_title("Titanic")
ax[0].set_ylabel("%");
jointure.drop('survie', axis=1).plot(kind="bar", ax=ax[1])
ax[1].set_title("Titanic")
ax[1].set_ylabel("%");
jointure.to_excel("titanic.xlsx")
df.head(n=2)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
mat = df[['Survived', 'Age']].values
mat.T
array([[ 0., 1., 1., ..., 0., 1., 0.],
[22., 38., 26., ..., nan, 26., 32.]])
mat.T @ mat
array([[342., nan],
[ nan, nan]])
mat @ mat.T
array([[ 484., 836., 572., ..., nan, 572., 704.],
[ 836., 1445., 989., ..., nan, 989., 1216.],
[ 572., 989., 677., ..., nan, 677., 832.],
...,
[ nan, nan, nan, ..., nan, nan, nan],
[ 572., 989., 677., ..., nan, 677., 832.],
[ 704., 1216., 832., ..., nan, 832., 1024.]])
df.tail(n=2)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.00 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.75 | NaN | Q |
names = list(df['Name'])
names
nom = names[0]
nom
'Braund, Mr. Owen Harris'
nom.split(',')[1].split('.')[0].strip().lower()
'mr'
mr = []
for nom in names:
mr.append(nom.split(',')[1].split('.')[0].strip().lower())
df['mr'] = mr
gr = df[ ['Sex', "mr", "PassengerId"] ].groupby(['Sex', "mr"], as_index=False).count()
gr.head()
Sex | mr | PassengerId | |
---|---|---|---|
0 | female | dr | 1 |
1 | female | lady | 1 |
2 | female | miss | 182 |
3 | female | mlle | 2 |
4 | female | mme | 1 |
gr.pivot("mr", "Sex", "PassengerId")
Sex | female | male |
---|---|---|
mr | ||
capt | NaN | 1.0 |
col | NaN | 2.0 |
don | NaN | 1.0 |
dr | 1.0 | 6.0 |
jonkheer | NaN | 1.0 |
lady | 1.0 | NaN |
major | NaN | 2.0 |
master | NaN | 40.0 |
miss | 182.0 | NaN |
mlle | 2.0 | NaN |
mme | 1.0 | NaN |
mr | NaN | 517.0 |
mrs | 125.0 | NaN |
ms | 1.0 | NaN |
rev | NaN | 6.0 |
sir | NaN | 1.0 |
the countess | 1.0 | NaN |