import pandas as pds
# Load the Titanic passenger table from the local CSV file, then preview
# its first five rows.
datas = pds.read_csv(filepath_or_buffer='titanic.csv')
datas.head(n=5)
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
# Dimensions of the table as a (number of rows, number of columns) tuple.
datas.shape
(891, 15)
# Display only the class, survival, sex and age columns (all rows).
datas.loc[:, ['pclass', 'survived', 'sex', 'age']]
pclass | survived | sex | age | |
---|---|---|---|---|
0 | 3 | 0 | male | 22.0 |
1 | 1 | 1 | female | 38.0 |
2 | 3 | 1 | female | 26.0 |
3 | 1 | 1 | female | 35.0 |
4 | 3 | 0 | male | 35.0 |
... | ... | ... | ... | ... |
886 | 2 | 0 | male | 27.0 |
887 | 1 | 1 | female | 19.0 |
888 | 3 | 0 | female | NaN |
889 | 1 | 1 | male | 26.0 |
890 | 3 | 0 | male | 32.0 |
891 rows × 4 columns
# Mean of 'survived' (i.e. the survival rate) for every (sex, pclass) pair.
res = (
    datas[['pclass', 'survived', 'sex']]
    .groupby(['sex', 'pclass'])
    .mean()
)
print(res)
survived sex pclass female 1 0.968085 2 0.921053 3 0.500000 male 1 0.368852 2 0.157407 3 0.135447
# Same survival rates laid out as a pivot table: one row per passenger
# class, one column per sex. 'mean' is pivot_table's default aggregation;
# it is spelled out here for readability.
res = (
    datas[['pclass', 'survived', 'sex']]
    .pivot_table(index='pclass', columns='sex', aggfunc='mean')
)
res
survived | ||
---|---|---|
sex | female | male |
pclass | ||
1 | 0.968085 | 0.368852 |
2 | 0.921053 | 0.157407 |
3 | 0.500000 | 0.135447 |
# List the attributes (column labels) of the table.
datas.columns
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], dtype='object')
# Summary statistics (count, mean, std, min, quartiles, max) of the
# numeric attributes.
datas.describe()
survived | pclass | age | sibsp | parch | fare | |
---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Mean of the numeric attributes for each (sex, passenger class) group.
# numeric_only=True is required on pandas >= 2.0: mean() no longer silently
# drops non-numeric columns (e.g. 'embarked', 'who', 'alive') and raises a
# TypeError instead. Boolean columns ('adult_male', 'alone') are still
# averaged, so the result matches what older pandas versions displayed.
datas.groupby(['sex', 'pclass']).mean(numeric_only=True)
survived | age | sibsp | parch | fare | adult_male | alone | ||
---|---|---|---|---|---|---|---|---|
sex | pclass | |||||||
female | 1 | 0.968085 | 34.611765 | 0.553191 | 0.457447 | 106.125798 | 0.000000 | 0.361702 |
2 | 0.921053 | 28.722973 | 0.486842 | 0.605263 | 21.970121 | 0.000000 | 0.421053 | |
3 | 0.500000 | 21.750000 | 0.895833 | 0.798611 | 16.118810 | 0.000000 | 0.416667 | |
male | 1 | 0.368852 | 41.281386 | 0.311475 | 0.278689 | 67.226127 | 0.975410 | 0.614754 |
2 | 0.157407 | 30.740707 | 0.342593 | 0.222222 | 19.741782 | 0.916667 | 0.666667 | |
3 | 0.135447 | 26.507589 | 0.498559 | 0.224784 | 12.661633 | 0.919308 | 0.760807 |
# Keep only a few columns: build a new table restricted to the class,
# survival, sex and age attributes, then list its column labels.
datas2 = datas.loc[:, ['pclass', 'survived', 'sex', 'age']]
datas2.columns
Index(['pclass', 'survived', 'sex', 'age'], dtype='object')
# Preview the first rows of the reduced table.
datas2.head()
pclass | survived | sex | age | |
---|---|---|---|---|
0 | 3 | 0 | male | 22.0 |
1 | 1 | 1 | female | 38.0 |
2 | 3 | 1 | female | 26.0 |
3 | 1 | 1 | female | 35.0 |
4 | 3 | 0 | male | 35.0 |
# Remove a few columns: `columns=` is the keyword form of
# `drop(labels, axis=1)`. The original table `datas` is left unchanged.
datas2 = datas.drop(
    columns=[
        'sibsp', 'parch', 'fare', 'embarked', 'class', 'who',
        'adult_male', 'deck', 'embark_town', 'alive', 'alone',
    ]
)
datas2.columns
Index(['survived', 'pclass', 'sex', 'age'], dtype='object')
# Remove a few rows: `index=` is the keyword form of `drop(labels, axis=0)`.
# The rows labelled 0, 1 and 2 are dropped; `datas` itself is left unchanged.
datas2 = datas.drop(index=[0, 1, 2])
datas2.head()
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
5 | 0 | 3 | male | NaN | 0 | 0 | 8.4583 | Q | Third | man | True | NaN | Queenstown | no | True |
6 | 0 | 1 | male | 54.0 | 0 | 0 | 51.8625 | S | First | man | True | E | Southampton | no | True |
7 | 0 | 3 | male | 2.0 | 3 | 1 | 21.0750 | S | Third | child | False | NaN | Southampton | no | False |
import numpy as np
import pandas as pds
import seaborn as sns
# Reload the Titanic table, this time through seaborn's example-dataset
# loader (the CSV is fetched from GitHub).
datas = sns.load_dataset(name='titanic')
datas.columns
datas.head(n=5)
# Survival rate per travel class (rows) and sex (columns).
# - aggfunc is given as the string 'mean' rather than the np.mean callable:
#   pandas >= 2.1 emits a FutureWarning for the callable form and recommends
#   the string to keep the current behaviour.
# - 'class' is a categorical column in the seaborn dataset; observed=False
#   states the (currently default) handling of categories explicitly, which
#   avoids the deprecation warning about the changing default.
res = datas.pivot_table(
    'survived',
    aggfunc='mean',
    index='class',
    columns='sex',
    observed=False,
)
res
sex | female | male |
---|---|---|
class | ||
First | 0.968085 | 0.368852 |
Second | 0.921053 | 0.157407 |
Third | 0.500000 | 0.135447 |