Prepared by Mahsa Sadi on 2020 - 06 - 23
In this notebook, we perform five steps on the Titanic data set:
import pandas
import sklearn
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import numpy
import seaborn
import matplotlib.pyplot
%matplotlib inline
import math
The data set is stored in a CSV file in the same directory as the code.
# Read the passenger table from the CSV file located next to this notebook.
data_set = pandas.read_csv(filepath_or_buffer='titanic.csv')
# Preview the first 20 rows.
data_set.head(n=20)
# Summarise the columns: dtypes and non-null counts.
data_set.info()
# Dimensions of the table as (rows, columns).
data_set.shape
# Number of passengers for each value of the 'Survived' column.
data_set.groupby(by='Survived').size()
# Bar chart of the same counts.
seaborn.countplot(data=data_set, x="Survived")
=> More people died than survived
# Survival counts, with each bar split by the 'Sex' column.
seaborn.countplot(data=data_set, x="Survived", hue="Sex")
=> More men died than women
# Survival counts, with each bar split by the 'Pclass' column.
seaborn.countplot(data=data_set, x="Survived", hue="Pclass")
=> More 3rd class passengers died than others.
# Histogram of the 'Age' column using the default binning.
data_set["Age"].hist()
=> The majority of the passengers were 18 to 40 years old.
# Histogram of the 'Fare' column: 20 bins on a 10x5 figure.
data_set["Fare"].hist(figsize=(10, 5), bins=20)
# Number of passengers per count of siblings/spouses aboard.
seaborn.countplot(data=data_set, x="Siblings/Spouses Aboard")
# Number of passengers per count of parents/children aboard.
seaborn.countplot(data=data_set, x="Parents/Children Aboard")
=> The majority of passengers were alone or with one family member.
# Box plot of the 'Age' distribution within each 'Pclass' value.
seaborn.boxplot(data=data_set, x='Pclass', y='Age')
=> First- and second-class passengers tended to be older than third-class passengers.
# Plot the boolean missing-value mask as a heat map (row labels hidden)
# so that any gaps in the data stand out per column.
seaborn.heatmap(data=data_set.isnull(), cmap='YlGnBu', yticklabels=False)

# Remove the 'Name' column; the result is a new frame and data_set is left as is.
cleaned_data_set = data_set.drop(columns=['Name'])
cleaned_data_set.head(n=5)

# Identify the cells with missing data.
missing_mask_preview = cleaned_data_set.isnull().head(n=5)
missing_mask_preview
# Count of missing cells in each column.
cleaned_data_set.isnull().sum()
To apply logistic regression, all the columns should have categorical or numerical values; the Sex and Pclass columns are therefore converted to dummy (indicator) variables.
# Encode the 'Sex' and 'Pclass' columns as dummy (indicator) variables.

# Preview the full set of indicator columns for 'Sex'.
pandas.get_dummies(cleaned_data_set['Sex']).head(5)
# drop_first=True removes the first level's column; that level is implied
# whenever all remaining indicator columns are 0.
binary_sex = pandas.get_dummies(cleaned_data_set['Sex'], drop_first=True)
binary_sex.head(5)

# Without a prefix, get_dummies labels each new column with the raw category
# value. If 'Pclass' is stored as integers (as the class numbers suggest), the
# labels would be ints, and the concatenated frame would then mix int and str
# column names, which recent scikit-learn versions reject when fitting.
# prefix='Pclass' makes every label a string such as 'Pclass_2'.
pandas.get_dummies(cleaned_data_set['Pclass'], prefix='Pclass').head(5)
binary_Pclass = pandas.get_dummies(
    cleaned_data_set['Pclass'], prefix='Pclass', drop_first=True
)
binary_Pclass.head(5)

# Append the indicator columns to the cleaned table (column-wise).
modified_data_set = pandas.concat(
    [cleaned_data_set, binary_sex, binary_Pclass], axis=1
)
modified_data_set.head(5)

# The original categorical columns are now redundant, so drop them.
final_data_set = modified_data_set.drop(columns=['Sex', 'Pclass'])
final_data_set.head(5)
final_data_set.info()
# Feature matrix: every column except the 'Survived' target.
X = final_data_set.drop(columns=['Survived'])
# Target vector: the 'Survived' column.
Y = final_data_set['Survived']
X.head(n=5)
Y.head(n=5)
# Fixed seed so the split is reproducible between runs.
seed = 1
# Fraction of the rows held out for testing.
test_set_size = 0.2
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    random_state=seed,
    test_size=test_set_size,
)
# Build a logistic-regression classifier that uses the 'liblinear' solver.
model = LogisticRegression(solver='liblinear')
# Train on the training split.
model.fit(X=X_train, y=Y_train)
# Predict labels for the held-out test rows.
predictions = model.predict(X=X_test)
# Text report comparing the test labels with the predictions.
report = classification_report(y_true=Y_test, y_pred=predictions)
print(report)
# Confusion matrix of test labels against predicted labels.
print(confusion_matrix(y_true=Y_test, y_pred=predictions))
# Overall accuracy on the test split.
accuracy_score(y_true=Y_test, y_pred=predictions)