Prepared by Mahsa Sadi on 2020-06-24
In this notebook, we perform two steps: (1) reading, cleaning, and visualizing the SUV purchase data, and (2) building and evaluating logistic regression models that predict whether a customer purchases an SUV.
import sklearn
import pandas
import seaborn
import matplotlib
%matplotlib inline
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
Link to dataset : https://www.kaggle.com/iamaniket/suv-data
data_set = pandas.read_csv ('suv_data.csv')
data_set.head (20)
data_set.shape
data_set.info()
data_set.groupby ('Purchased').size()
cleaned_data_set = data_set.drop(columns=['User ID'])  # 'User ID' is an identifier, not a feature; axis is implied by columns=
cleaned_data_set.head ()
cleaned_data_set.describe ()
seaborn.countplot (x = 'Purchased', data = cleaned_data_set)
seaborn.countplot ( x = 'Purchased', hue = 'Gender', data = cleaned_data_set)
=> Women buy SUVs more often than men.
data_set ['Age'].hist(bins = 20)
# Bin ages into six ordered categories, A (youngest) through F (oldest).
age_category = []
for i in range(len(cleaned_data_set['Age'])):
    age = cleaned_data_set['Age'][i]
    if age <= 20:
        age_category.append('A')
    elif age <= 26:
        age_category.append('B')
    elif age <= 30:
        age_category.append('C')
    elif age <= 40:
        age_category.append('D')
    elif age <= 50:
        age_category.append('E')
    else:
        age_category.append('F')
age_data_frame = pandas.DataFrame (data = age_category, columns = ['AgeCategory'])
augmented_data_set = pandas.concat([cleaned_data_set, age_data_frame], axis = 1)
augmented_data_set.head()
seaborn.countplot ( x = 'Purchased', hue = 'AgeCategory', data = augmented_data_set)
=> People younger than 20 years old don't buy SUVs.
=> People between 20 and 26 years old don't buy SUVs.
=> In short, people younger than 26 don't buy SUVs.
=> Most SUV buyers are between 40 and 60 years old.
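As a side note, the manual binning loop above can be written more compactly with pandas.cut; a minimal sketch, assuming the same right-inclusive boundaries and that no age exceeds 200 (the upper sentinel stands in for the "older than 50" else branch):

# Equivalent binning with pandas.cut (intervals are right-inclusive, as in the loop above)
age_bins = [0, 20, 26, 30, 40, 50, 200]
age_labels = ['A', 'B', 'C', 'D', 'E', 'F']
age_category_alt = pandas.cut(cleaned_data_set['Age'], bins=age_bins, labels=age_labels)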
data_set ['EstimatedSalary'].hist(bins = 20)
# Bin estimated salaries into ordered income categories.
income_category = []
for i in range(len(cleaned_data_set['EstimatedSalary'])):
    salary = cleaned_data_set['EstimatedSalary'][i]
    if salary <= 19500:
        income_category.append('Very Low')
    elif salary <= 40000:
        income_category.append('Low')
    elif salary <= 60000:
        income_category.append('Moderately Low')
    elif salary <= 80000:
        income_category.append('Medium')
    elif salary <= 100000:
        income_category.append('Moderately High')
    elif salary <= 145000:
        # the original 100000-130000 and 130000-145000 branches both mapped
        # to 'Very High', so they are merged here
        income_category.append('Very High')
    else:
        income_category.append('Extremely High')
income_data_frame = pandas.DataFrame (data = income_category, columns = ['IncomeCategory'])
augmented_data_set_2 = pandas.concat([augmented_data_set, income_data_frame], axis = 1)
augmented_data_set_2.head()
seaborn.countplot ( x = 'Purchased', hue = 'IncomeCategory', data = augmented_data_set_2)
=> People with salaries higher than 145000 buy SUVs.
=> People with salaries lower than 19500 don't buy SUVs.
=> The majority of SUV buyers have salaries between 100000 and 130000.
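The counts behind these observations can be verified numerically as well; a quick sketch using pandas.crosstab:

# Purchase counts per income category (rows: category, columns: Purchased 0/1)
pandas.crosstab(augmented_data_set_2['IncomeCategory'], augmented_data_set_2['Purchased'])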
binary_gender = pandas.get_dummies (augmented_data_set_2 ['Gender'],drop_first = True)
binary_gender.head ()
binary_age = pandas.get_dummies (augmented_data_set_2 ['AgeCategory'])
binary_age.head ()
binary_income = pandas.get_dummies (augmented_data_set_2 ['IncomeCategory'])
binary_income.head ()
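Note that drop_first = True is applied only to Gender: the age and income dummies keep every level, so each group of dummy columns sums to one and the design matrix is perfectly collinear. liblinear's L2 regularization keeps the fit stable regardless, but a cleaner variant (a sketch, not part of the original pipeline) would drop one level per category as well:

# Hypothetical variant: drop one level per category to avoid the dummy-variable trap
binary_age_alt = pandas.get_dummies(augmented_data_set_2['AgeCategory'], drop_first=True)
binary_income_alt = pandas.get_dummies(augmented_data_set_2['IncomeCategory'], drop_first=True)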
final_data_set = pandas.concat ([augmented_data_set_2, binary_age, binary_gender, binary_income], axis = 1)
final_data_set_1 = final_data_set.drop(columns=['Age', 'Gender', 'EstimatedSalary', 'IncomeCategory', 'AgeCategory'])
final_data_set_1.head ()
Y = final_data_set_1 ['Purchased']
X = final_data_set_1.drop(columns=['Purchased'])
X.head()
Y.head()
test_set_size = 0.2
seed = 0
X_train, X_test, Y_train, Y_test = model_selection.train_test_split (X,Y, test_size = test_set_size, random_state = seed)
model = LogisticRegression (solver = 'liblinear')
model.fit (X_train, Y_train)
predictions = model.predict (X_test)
report = classification_report (Y_test, predictions)
print (report)
print (confusion_matrix (Y_test, predictions))
accuracy_score (Y_test, predictions)
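A single 80/20 split can give a noisy accuracy estimate; as an optional sanity check (not in the original notebook), 5-fold cross-validation averages the score over several splits:

# Optional: 5-fold cross-validated accuracy on the full engineered feature matrix
scores = model_selection.cross_val_score(LogisticRegression(solver='liblinear'), X, Y, cv=5, scoring='accuracy')
print(scores.mean(), scores.std())

The notebook then repeats the modelling step using only the raw Age and EstimatedSalary columns.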
Y = data_set.iloc[:, 4]    # column 4: Purchased
X = data_set.iloc[:, 2:4]  # columns 2-3: Age, EstimatedSalary
print (X.head())
print (Y.head())
test_set_size = 0.2
seed = 0
X_train, X_test, Y_train, Y_test = model_selection.train_test_split (X,Y, test_size = test_set_size, random_state = seed)
Since the values of the estimated salary feature span a much wider range than the age values, we need to scale the features before fitting.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit the scaler on the training data only
X_test = scaler.transform(X_test)        # reuse the training-set fit to avoid test-set leakage
classifier = LogisticRegression (random_state = seed, solver = 'liblinear')
classifier.fit (X_train, Y_train)
predictions = classifier.predict (X_test)
report = classification_report (Y_test, predictions)
print (report)
accuracy_score (Y_test, predictions)
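The fit_transform/transform pattern above can also be packaged in a pipeline so the scaler is always fit on training data only; a minimal sketch, re-creating the unscaled split from train_test_split:

from sklearn.pipeline import make_pipeline

# Re-create the unscaled split, then let the pipeline handle scaling internally
X_tr, X_te, Y_tr, Y_te = model_selection.train_test_split(X, Y, test_size=test_set_size, random_state=seed)
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=seed, solver='liblinear'))
pipe.fit(X_tr, Y_tr)
print(accuracy_score(Y_te, pipe.predict(X_te)))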
=> Considering all the engineered features provides better accuracy and a better prediction model than using only age and estimated salary.