Analyzing the Titanic Data Set

Prepared by Mahsa Sadi on 2020-06-23


In this notebook, we perform five steps on the Titanic data set:

  1. Reading Data
  2. Visualizing Data
  3. Analyzing Data
  4. Cleaning Data
  5. Modeling Data: to model the data set, we apply logistic regression.


In [1]:
import pandas
In [2]:
import sklearn
In [3]:
from sklearn import model_selection
In [4]:
from sklearn.linear_model import LogisticRegression
In [5]:
from sklearn.metrics import classification_report
In [6]:
from sklearn.metrics import confusion_matrix
In [7]:
from sklearn.metrics import accuracy_score
In [8]:
import numpy
In [9]:
import seaborn
In [10]:
import matplotlib.pyplot
In [11]:
%matplotlib inline
In [12]:
import math

Reading the Data Set

The data set is in a CSV file named titanic.csv, located in the same directory as the notebook.
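Since read_csv fails with a long traceback when the file is absent, a quick existence check can give a clearer error. This is a minimal sketch using only the standard library; the file name matches the read_csv call below.

from pathlib import Path

# Fail early with a readable message if the CSV is not next to the notebook.
csv_path = Path('titanic.csv')
assert csv_path.exists(), f'{csv_path} not found; place it beside this notebook'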

In [13]:
data_set = pandas.read_csv('titanic.csv')
In [14]:
data_set.head(20)
Out[14]:
Survived Pclass Name Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare
0 0 3 Mr. Owen Harris Braund male 22.0 1 0 7.2500
1 1 1 Mrs. John Bradley (Florence Briggs Thayer) Cum... female 38.0 1 0 71.2833
2 1 3 Miss. Laina Heikkinen female 26.0 0 0 7.9250
3 1 1 Mrs. Jacques Heath (Lily May Peel) Futrelle female 35.0 1 0 53.1000
4 0 3 Mr. William Henry Allen male 35.0 0 0 8.0500
5 0 3 Mr. James Moran male 27.0 0 0 8.4583
6 0 1 Mr. Timothy J McCarthy male 54.0 0 0 51.8625
7 0 3 Master. Gosta Leonard Palsson male 2.0 3 1 21.0750
8 1 3 Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson female 27.0 0 2 11.1333
9 1 2 Mrs. Nicholas (Adele Achem) Nasser female 14.0 1 0 30.0708
10 1 3 Miss. Marguerite Rut Sandstrom female 4.0 1 1 16.7000
11 1 1 Miss. Elizabeth Bonnell female 58.0 0 0 26.5500
12 0 3 Mr. William Henry Saundercock male 20.0 0 0 8.0500
13 0 3 Mr. Anders Johan Andersson male 39.0 1 5 31.2750
14 0 3 Miss. Hulda Amanda Adolfina Vestrom female 14.0 0 0 7.8542
15 1 2 Mrs. (Mary D Kingcome) Hewlett female 55.0 0 0 16.0000
16 0 3 Master. Eugene Rice male 2.0 4 1 29.1250
17 1 2 Mr. Charles Eugene Williams male 23.0 0 0 13.0000
18 0 3 Mrs. Julius (Emelia Maria Vandemoortele) Vande... female 31.0 1 0 18.0000
19 1 3 Mrs. Fatima Masselmani female 22.0 0 0 7.2250
In [15]:
data_set.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
Survived                   887 non-null int64
Pclass                     887 non-null int64
Name                       887 non-null object
Sex                        887 non-null object
Age                        887 non-null float64
Siblings/Spouses Aboard    887 non-null int64
Parents/Children Aboard    887 non-null int64
Fare                       887 non-null float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.5+ KB

Summarizing the Data Set

In [16]:
# How many rows and columns?

data_set.shape
Out[16]:
(887, 8)
In [17]:
# How many survived and how many died?

data_set.groupby('Survived').size()
Out[17]:
Survived
0    545
1    342
dtype: int64
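As a quick sanity check, these counts give an overall survival rate of 342 / 887 ≈ 0.386, i.e. roughly 39% of the passengers survived. Because Survived is a 0/1 column, the same number can be computed as its mean:

# Mean of a 0/1 column = fraction of ones = survival rate (~0.386).
data_set['Survived'].mean()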

Visualizing Data

In [18]:
seaborn.countplot (x ="Survived", data = data_set)
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe343ca8150>

=> More people died than survived.

In [19]:
seaborn.countplot (x ="Survived", hue = "Sex", data = data_set)
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe341c23a10>

=> More men died than women.
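To put exact numbers behind the plot, the two variables can be cross-tabulated; a small sketch (output omitted here):

# Counts of died (0) and survived (1) per sex.
pandas.crosstab(data_set['Sex'], data_set['Survived'])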

In [20]:
seaborn.countplot (x ="Survived", hue = "Pclass", data = data_set)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe341bad050>

=> More third-class passengers died than passengers of the other classes.

In [21]:
data_set ["Age"]. hist ()
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe341abda10>

=> The majority of the passengers were 18 to 40 years old.

In [22]:
data_set ["Fare"].hist (bins = 20, figsize = (10,5))
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe341a64a90>
In [23]:
seaborn.countplot (x ="Siblings/Spouses Aboard", data = data_set)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe3419f3750>
In [24]:
seaborn.countplot (x ="Parents/Children Aboard", data = data_set)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe34196a950>

=> The majority of passengers were alone or with one family member.

In [25]:
seaborn.boxplot(x='Pclass', y='Age', data=data_set)
[Figure: box plot of Age by Pclass]

=> First- and second-class passengers were older on average.

In [26]:
seaborn.heatmap(data_set.isnull(), yticklabels=False, cmap='YlGnBu')
[Figure: heatmap of missing values]

=> No missing values are visible in the data set.

Cleaning the Data Set

In [27]:
# Drop the Name column; free-text names carry no signal for the model.
cleaned_data_set = data_set.drop(columns=['Name'])
In [28]:
cleaned_data_set.head(5)
Out[28]:
Survived Pclass Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare
0 0 3 male 22.0 1 0 7.2500
1 1 1 female 38.0 1 0 71.2833
2 1 3 female 26.0 0 0 7.9250
3 1 1 female 35.0 1 0 53.1000
4 0 3 male 35.0 0 0 8.0500
In [29]:
# Identify the cells with missing data

cleaned_data_set.isnull().head(5)
Out[29]:
Survived Pclass Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare
0 False False False False False False False
1 False False False False False False False
2 False False False False False False False
3 False False False False False False False
4 False False False False False False False
In [30]:
cleaned_data_set.isnull().sum()
Out[30]:
Survived                   0
Pclass                     0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64


The isnull() sums above confirm that no values are missing, so no imputation is needed. To apply logistic regression, every column must hold numerical values. The Sex column is text, and Pclass, although stored as a number, is a category rather than a quantity, so both are converted into binary dummy variables below (a one-call alternative is sketched after the step-by-step encoding).

In [31]:
pandas.get_dummies(cleaned_data_set['Sex']).head(5)
Out[31]:
female male
0 0 1
1 1 0
2 1 0
3 1 0
4 0 1
In [32]:
# drop_first avoids a redundant column: 'male' alone encodes the sex.
binary_sex = pandas.get_dummies(cleaned_data_set['Sex'], drop_first=True)
In [33]:
binary_sex.head(5)
Out[33]:
male
0 1
1 0
2 0
3 0
4 1
In [34]:
pandas.get_dummies(cleaned_data_set['Pclass']).head(5)
Out[34]:
1 2 3
0 0 0 1
1 1 0 0
2 0 0 1
3 1 0 0
4 0 0 1
In [35]:
binary_Pclass = pandas.get_dummies(cleaned_data_set['Pclass'], drop_first=True)
In [36]:
binary_Pclass.head(5)
Out[36]:
2 3
0 0 1
1 0 0
2 0 1
3 0 0
4 0 1
In [37]:
modified_data_set = pandas.concat([cleaned_data_set, binary_sex, binary_Pclass], axis=1)
In [38]:
modified_data_set.head(5)
Out[38]:
Survived Pclass Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare male 2 3
0 0 3 male 22.0 1 0 7.2500 1 0 1
1 1 1 female 38.0 1 0 71.2833 0 0 0
2 1 3 female 26.0 0 0 7.9250 0 0 1
3 1 1 female 35.0 1 0 53.1000 0 0 0
4 0 3 male 35.0 0 0 8.0500 1 0 1
In [39]:
final_data_set = modified_data_set.drop(columns=['Sex', 'Pclass'])
final_data_set.head(5)
Out[39]:
Survived Age Siblings/Spouses Aboard Parents/Children Aboard Fare male 2 3
0 0 22.0 1 0 7.2500 1 0 1
1 1 38.0 1 0 71.2833 0 0 0
2 1 26.0 0 0 7.9250 0 0 1
3 1 35.0 1 0 53.1000 0 0 0
4 0 35.0 0 0 8.0500 1 0 1
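The same encoding can also be done in a single call, which both creates the dummy columns and drops the source columns. This sketch is equivalent in content, although the dummies come out with prefixed names such as Sex_male and Pclass_2 instead of male, 2, and 3:

# One-step alternative: encode Sex and Pclass and drop the originals.
final_alt = pandas.get_dummies(cleaned_data_set, columns=['Sex', 'Pclass'], drop_first=True)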
In [40]:
final_data_set.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
Survived                   887 non-null int64
Age                        887 non-null float64
Siblings/Spouses Aboard    887 non-null int64
Parents/Children Aboard    887 non-null int64
Fare                       887 non-null float64
male                       887 non-null uint8
2                          887 non-null uint8
3                          887 non-null uint8
dtypes: float64(2), int64(3), uint8(3)
memory usage: 37.3 KB
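Note that two of the columns are literally named 2 and 3, which is easy to misread. If desired, they could be renamed; this optional sketch is not applied below, where the original names are kept:

# Optional: clearer names for the Pclass dummy columns (not used below).
renamed = final_data_set.rename(columns={2: 'class_2', 3: 'class_3'})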

Modeling the Data Set

In [41]:
Y = final_data_set['Survived']
X = final_data_set.drop(columns=['Survived'])
In [42]:
X.head(5)
Out[42]:
Age Siblings/Spouses Aboard Parents/Children Aboard Fare male 2 3
0 22.0 1 0 7.2500 1 0 1
1 38.0 1 0 71.2833 0 0 0
2 26.0 0 0 7.9250 0 0 1
3 35.0 1 0 53.1000 0 0 0
4 35.0 0 0 8.0500 1 0 1
In [43]:
Y.head(5)
Out[43]:
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64
In [44]:
# Hold out 20% of the rows for testing; fix the seed for reproducibility.
test_set_size = 0.2
seed = 1
In [45]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_set_size, random_state=seed)
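Since survivors are the minority class (342 of 887), one can additionally pass stratify=Y so that both splits keep the same survived/died ratio. This variant is shown only as an option and is not used below, so the outputs that follow come from the plain split above:

# Optional: preserve the class ratio in both splits (not used below).
X_train_s, X_test_s, Y_train_s, Y_test_s = model_selection.train_test_split(
    X, Y, test_size=test_set_size, random_state=seed, stratify=Y)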
In [46]:
# liblinear is a solid solver choice for small, binary data sets.
model = LogisticRegression(solver='liblinear')
In [47]:
model.fit(X_train, Y_train)
Out[47]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
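To see which features drive the prediction, the fitted coefficients can be paired with the feature names. A quick inspection sketch; the exact values depend on the split above, and positive coefficients push the prediction toward Survived = 1:

# One coefficient per feature; model.coef_ has shape (1, n_features).
pandas.Series(model.coef_[0], index=X.columns)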
In [48]:
predictions = model.predict(X_test)
In [49]:
report = classification_report(Y_test, predictions)
In [50]:
print(report)
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       106
           1       0.75      0.68      0.72        72

    accuracy                           0.78       178
   macro avg       0.78      0.76      0.77       178
weighted avg       0.78      0.78      0.78       178

In [51]:
print(confusion_matrix(Y_test, predictions))
[[90 16]
 [23 49]]
In [52]:
accuracy_score(Y_test, predictions)
Out[52]:
0.7808988764044944
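This accuracy agrees with the confusion matrix above: the diagonal holds the correct predictions, so (90 + 49) / 178 = 139 / 178 ≈ 0.781. The same check in code:

# Accuracy recomputed from the confusion matrix: correct / total.
cm = confusion_matrix(Y_test, predictions)
cm.trace() / cm.sum()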