import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv('titanic.csv')
df.head()


df.Survived

# note that df['Survived'] is equivalent syntax for accessing the Survived column

0      0
1      1
2      1
3      1
4      0
      ..
882    0
883    1
884    0
885    1
886    0
Name: Survived, Length: 887, dtype: int64


# Here we query the survived column values for equivalence to 1. (meaning survived)
df.Survived==1

0      False
1       True
2       True
3       True
4      False
       ...  
882    False
883     True
884    False
885     True
886    False
Name: Survived, Length: 887, dtype: bool


# find how many people survived by summing truth indices
sum(df.Survived==1)

342


# similarly we find the number of people who died 
sum(df.Survived==0)

545


# Here we groupby fare class and use the count function on all aggregating columns
# the count function counts the number of instances/rows a column has.
df.groupby('Pclass').count()


# This shows the number of passengers in each class
df.groupby('Pclass')['Survived'].count()

Pclass
1    216
2    184
3    487
Name: Survived, dtype: int64


# by grouping by fare class and apply the mean function to the fare price aggregate column
# we can find the mean fare price for each fare class
df.groupby('Pclass')['Fare'].mean()

Pclass
1    84.154687
2    20.662183
3    13.707707
Name: Fare, dtype: float64


# note that you can find these values with truth indices isntead of groupby
print(f"Mean fare price for class 1: {df.loc[df.Pclass==1,'Fare'].mean()}")
print(f"Mean fare price for class 2: {df.loc[df.Pclass==2,'Fare'].mean()}")
print(f"Mean fare price for class 3: {df.loc[df.Pclass==3,'Fare'].mean()}")

Mean fare price for class 1: 84.1546875
Mean fare price for class 2: 20.662183152173913
Mean fare price for class 3: 13.707707392197124


# Find total number of survived/died for both males/females
df.groupby(['Sex', 'Survived']).count()


# We can extract one aggregate column to get the counts into a series
# Pclass is a random choice, and could have been any other column
df.groupby(['Sex', 'Survived'])['Pclass'].count()

Sex     Survived
female  0            81
        1           233
male    0           464
        1           109
Name: Pclass, dtype: int64


# Removing 'Survived' from our groupby, we can get total number of females and males.
df.groupby('Sex')['Pclass'].count()

Sex
female    314
male      573
Name: Pclass, dtype: int64


# By dividing these two series, we can find the proportion survived for both m/f.


ratios = df.groupby(['Sex', 'Survived'])['Pclass'].count() / df.groupby('Sex')['Pclass'].count()
ratios

Sex     Survived
female  0           0.257962
        1           0.742038
male    0           0.809773
        1           0.190227
Name: Pclass, dtype: float64


# Using .loc to index, we can specify both sexes and survived values of 1
# to get the proportion of females and males that survived.
ratios.loc[:,1]

Sex
female    0.742038
male      0.190227
Name: Pclass, dtype: float64


# number of survived females / total number of females
sum((df.Survived==1) & (df.Sex=='female')) / sum(df.Sex=='female')

0.7420382165605095


# number of survived males / total number of males
sum((df.Survived==1) & (df.Sex=='male')) / sum(df.Sex=='male')

0.19022687609075042


# first we get counts of males/females in each class
mf_counts_class = df.groupby(['Pclass','Sex'])['Pclass'].count()
mf_counts_class

Pclass  Sex   
1       female     94
        male      122
2       female     76
        male      108
3       female    144
        male      343
Name: Pclass, dtype: int64


# then we get counts of survived/died, males/females in each class
smf_counts_class = df.groupby(['Pclass','Sex', 'Survived'])['Pclass'].count() 
smf_counts_class

Pclass  Sex     Survived
1       female  0             3
                1            91
        male    0            77
                1            45
2       female  0             6
                1            70
        male    0            91
                1            17
3       female  0            72
                1            72
        male    0           296
                1            47
Name: Pclass, dtype: int64


# dividing the surival counts by the total counts, we get survival proportions for each category
class_sex_survival_rates = smf_counts_class/mf_counts_class
class_sex_survival_rates

Pclass  Sex     Survived
1       female  0           0.031915
                1           0.968085
        male    0           0.631148
                1           0.368852
2       female  0           0.078947
                1           0.921053
        male    0           0.842593
                1           0.157407
3       female  0           0.500000
                1           0.500000
        male    0           0.862974
                1           0.137026
Name: Pclass, dtype: float64


# we can now index our survival rates series to get only survied passengers
# loc[class, sex, survived]
class_sex_lived_percent = class_sex_survival_rates.loc[:,:,1] *100 # convert to percentage
class_sex_lived_percent

Pclass  Sex   
1       female    96.808511
        male      36.885246
2       female    92.105263
        male      15.740741
3       female    50.000000
        male      13.702624
Name: Pclass, dtype: float64


# visualizing survival percentages based on our groups
_ = class_sex_lived_rates.plot(kind='bar')
_ = plt.xticks(rotation=45)
_ = plt.title('Survival rates for titanic passengers by Class and Sex')
_ = plt.ylabel('Percent Survived')


# Here we get the total counts of males and females broken down by class
num_mf_by_class = df.groupby(['Pclass','Sex'])['Pclass'].count()
num_mf_by_class

Pclass  Sex   
1       female     94
        male      122
2       female     76
        male      108
3       female    144
        male      343
Name: Pclass, dtype: int64


# Here we get the ratios of males to females broken down by class

# num of males in each class / num of females in each class
num_mf_by_class.loc[:,'male']/ num_mf_by_class.loc[:,'female']

Pclass
1    1.297872
2    1.421053
3    2.381944
Name: Pclass, dtype: float64

	Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Fare
0	0	3	Mr. Owen Harris Braund	male	22.0	1	7.2500
1	1	1	Mrs. John Bradley (Florence Briggs Thayer) Cum...	female	38.0	1	71.2833
2	1	3	Miss. Laina Heikkinen	female	26.0	0	7.9250
3	1	1	Mrs. Jacques Heath (Lily May Peel) Futrelle	female	35.0	1	53.1000
4	0	3	Mr. William Henry Allen	male	35.0	0	8.0500

Pandas Data Wrangling with the Titanic Dataset¶

dataset description:¶

We will show different techniques to arrive at solutions for the following questions.¶

read in titanic.csv¶

How many people survived? How many died?¶

Truth indexing is useful for querys¶

Use sum() on a truth array to find how many true instances there are.¶

What is the mean Fare for each Fare Class?¶

Using the Groupby method¶

Aggregation functions¶

Side quest: Number of passengers for each Pclass¶

Survival Rates for Males and Females¶

Equivalent solution using truth indices¶

Finding survival rates by Class and Sex.¶

What are the ratios of males to females by class?¶