Analyzing SUV Dataset¶

Prepared by Mahsa Sadi on 2020 - 06 - 24

In this notebook, we perform two steps:

Reading and visualizng SUV Data
Modeling SUV data using logistic Regression

SUV dataset conatins information about customers and whether they purchase an SUV or not.

import sklearn
import pandas
import seaborn
import matplotlib
%matplotlib inline

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

Reading and Summerizing the SUV Dataset¶

Link to dataset : https://www.kaggle.com/iamaniket/suv-data

data_set = pandas.read_csv ('suv_data.csv')

data_set.head (20)

data_set.shape

(400, 5)

data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.7+ KB

data_set.groupby ('Purchased').size()

Purchased
0    257
1    143
dtype: int64

cleaned_data_set = data_set.drop (columns = ['User ID'], axis = '1')
cleaned_data_set.head ()

cleaned_data_set.describe ()

Visualizing Data¶

seaborn.countplot (x = 'Purchased', data = cleaned_data_set)

<matplotlib.axes._subplots.AxesSubplot at 0x7f44cafa5090>

seaborn.countplot ( x = 'Purchased', hue = 'Gender', data = cleaned_data_set)

<matplotlib.axes._subplots.AxesSubplot at 0x7f44c8e9c6d0>

=> Women buy SUV more than men.

data_set ['Age'].hist(bins = 20)

<matplotlib.axes._subplots.AxesSubplot at 0x7f44c8e20c50>

age_category = []
for i in range (0, len  (data_set ['Age'])):
    if cleaned_data_set ['Age'][i] <= 20:
        age_category.append ('A');
    elif 20 < cleaned_data_set ['Age'][i] <= 26:
        age_category.append ('B');
    elif 26 < cleaned_data_set ['Age'][i] <= 30:
        age_category.append ('C');
    elif 30 < cleaned_data_set ['Age'][i] <= 40:
        age_category.append ('D');
    elif 40 < cleaned_data_set ['Age'][i] <= 50:
        age_category.append ('E');
    else:
        age_category.append ('F');

age_data_frame = pandas.DataFrame (data = age_category, columns = ['AgeCategory'])
augmented_data_set = pandas.concat([cleaned_data_set, age_data_frame], axis = 1)
augmented_data_set.head()

seaborn.countplot ( x = 'Purchased', hue = 'AgeCategory', data = augmented_data_set)

<matplotlib.axes._subplots.AxesSubplot at 0x7f44c8db4f10>

=> People yonger than 20 years old don't buy a SUV.
=> People between 20 and 26 years old don't buy SUV.
=> People younger than 26 don't buy SUVs.
=> Most of the people buying SUV are between 40 to 60 years old.

data_set ['EstimatedSalary'].hist(bins = 20)

<matplotlib.axes._subplots.AxesSubplot at 0x7f44c8cded50>

income_category = []
for i in range (0, len  (data_set ['EstimatedSalary'])):
    if cleaned_data_set ['EstimatedSalary'][i] <= 19500:
        income_category.append ('Very Low');
    elif 19500 < cleaned_data_set ['EstimatedSalary'][i] <= 40000:
        income_category.append ('Low');
    elif 40000 < cleaned_data_set ['EstimatedSalary'][i] <= 60000:
        income_category.append ('Moderately Low');
    elif 60000 < cleaned_data_set ['EstimatedSalary'][i] <= 80000:
        income_category.append ('Medium');
    elif 80000 < cleaned_data_set ['EstimatedSalary'][i] <= 100000:
        income_category.append ('Moderately high');
    elif 100000 < cleaned_data_set ['EstimatedSalary'][i] <= 130000:
        income_category.append ('Very High');
    elif 130000 < cleaned_data_set ['EstimatedSalary'][i] <= 145000:
        income_category.append ('Very High');
    else:
        income_category.append ('Extremely High');

income_data_frame = pandas.DataFrame (data = income_category, columns = ['IncomeCategory'])
augmented_data_set_2 = pandas.concat([augmented_data_set, income_data_frame], axis = 1)
augmented_data_set_2.head()

seaborn.countplot ( x = 'Purchased', hue = 'IncomeCategory', data = augmented_data_set_2)

<matplotlib.axes._subplots.AxesSubplot at 0x7f44c8c7e850>

=> People with salaries higher that 145000 will buy SUVs.
=> People with salaries lower than 19500 won't buy SUVs.
=> Majority of people buying SUVs have slaries between 100000 to 130000.

binary_gender = pandas.get_dummies (augmented_data_set_2 ['Gender'],drop_first = True)
binary_gender.head ()

binary_age = pandas.get_dummies (augmented_data_set_2 ['AgeCategory'])
binary_age.head ()

binary_income = pandas.get_dummies (augmented_data_set_2 ['IncomeCategory'])
binary_income.head ()

final_data_set  = pandas.concat ([augmented_data_set_2, binary_age, binary_gender, binary_income], axis = 1)
final_data_set_1 = final_data_set.drop (columns = ['Age', 'Gender', 'EstimatedSalary', 'IncomeCategory', 'AgeCategory'], axis = 1)
final_data_set_1.head ()

Modeling Data¶

Modeling the data considering all the features available in the dataset¶

Y = final_data_set_1 ['Purchased']
X = final_data_set_1.drop (columns = ['Purchased'], axis = 1)

X.head()

Y.head()

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

test_set_size = 0.2
seed = 0
X_train, X_test, Y_train, Y_test = model_selection.train_test_split (X,Y, test_size = test_set_size, random_state = seed)
model = LogisticRegression (solver = 'liblinear')
model.fit (X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

predictions = model.predict (X_test)

report = classification_report (Y_test, predictions)
print (report)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97        58
           1       0.91      0.91      0.91        22

    accuracy                           0.95        80
   macro avg       0.94      0.94      0.94        80
weighted avg       0.95      0.95      0.95        80

print (confusion_matrix (Y_test, predictions))

[[56  2]
 [ 2 20]]

accuracy_score (Y_test, predictions)

0.95

Modeling the data considering some of the features available in the dataset¶

Y = data_set.iloc [:, 4]
X = data_set.iloc [:, 2:4]

print (X.head())

   Age  EstimatedSalary
0   19            19000
1   35            20000
2   26            43000
3   27            57000
4   19            76000

print (Y.head())

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

test_set_size = 0.2
seed = 0
X_train, X_test, Y_train, Y_test = model_selection.train_test_split (X,Y, test_size = test_set_size, random_state = seed)

Since the values of the estimated salary feature are in a wide range we need to scale the values.

scaler = StandardScaler ()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

classifier = LogisticRegression (random_state = seed, solver = 'liblinear')
classifier.fit (X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

predictions = classifier.predict (X_test)

report = classification_report (Y_test, predictions)
print (report)

              precision    recall  f1-score   support

           0       0.95      0.90      0.92        58
           1       0.76      0.86      0.81        22

    accuracy                           0.89        80
   macro avg       0.85      0.88      0.86        80
weighted avg       0.89      0.89      0.89        80

accuracy_score (Y_test, predictions)

0.8875

=> Considering all features provides a better accuracy and a better prediction model of the data.

	User ID	Gender	Age	EstimatedSalary	Purchased
0	15624510	Male	19	19000	0
1	15810944	Male	35	20000	0
2	15668575	Female	26	43000	0
3	15603246	Female	27	57000	0
4	15804002	Male	19	76000	0
5	15728773	Male	27	58000	0
6	15598044	Female	27	84000	0
7	15694829	Female	32	150000	1
8	15600575	Male	25	33000	0
9	15727311	Female	35	65000	0
10	15570769	Female	26	80000	0
11	15606274	Female	26	52000	0
12	15746139	Male	20	86000	0
13	15704987	Male	32	18000	0
14	15628972	Male	18	82000	0
15	15697686	Male	29	80000	0
16	15733883	Male	47	25000	1
17	15617482	Male	45	26000	1
18	15704583	Male	46	28000	1
19	15621083	Female	48	29000	1

	Age	EstimatedSalary	Purchased
count	400.000000	400.000000	400.000000
mean	37.655000	69742.500000	0.357500
std	10.482877	34096.960282	0.479864
min	18.000000	15000.000000	0.000000
25%	29.750000	43000.000000	0.000000
50%	37.000000	70000.000000	0.000000
75%	46.000000	88000.000000	1.000000
max	60.000000	150000.000000	1.000000

	Gender	Age	EstimatedSalary	AgeCategory	IncomeCategory
0	Male	19	19000	A	Very Low
1	Male	35	20000	D	Low
2	Female	26	43000	B	Moderately Low
3	Female	27	57000	C	Moderately Low
4	Male	19	76000	A	Medium

	A	B	C	D	Male	Low	Medium	Moderately Low	Very Low
0	1	0	0	0	1	0	0	0	1
1	0	0	0	1	1	1	0	0	0
2	0	1	0	0	0	0	0	1	0
3	0	0	1	0	0	0	0	1	0
4	1	0	0	0	1	0	1	0	0

	A	B	C	D	Male	Low	Medium	Moderately Low	Very Low
0	1	0	0	0	1	0	0	0	1
1	0	0	0	1	1	1	0	0	0
2	0	1	0	0	0	0	0	1	0
3	0	0	1	0	0	0	0	1	0
4	1	0	0	0	1	0	1	0	0

	A	B	C	D	Male	Low	Medium	Moderately Low	Very Low
0	1	0	0	0	1	0	0	0	1
1	0	0	0	1	1	1	0	0	0
2	0	1	0	0	0	0	0	1	0
3	0	0	1	0	0	0	0	1	0
4	1	0	0	0	1	0	1	0	0

	A	B	C	D	Male	Low	Medium	Moderately Low	Very Low
0	1	0	0	0	1	0	0	0	1
1	0	0	0	1	1	1	0	0	0
2	0	1	0	0	0	0	0	1	0
3	0	0	1	0	0	0	0	1	0
4	1	0	0	0	1	0	1	0	0

	A	B	C	D	Male	Low	Medium	Moderately Low	Very Low
0	1	0	0	0	1	0	0	0	1
1	0	0	0	1	1	1	0	0	0
2	0	1	0	0	0	0	0	1	0
3	0	0	1	0	0	0	0	1	0
4	1	0	0	0	1	0	1	0	0

	A	B	C	D	Male	Low	Medium	Moderately Low	Very Low
0	1	0	0	0	1	0	0	0	1
1	0	0	0	1	1	1	0	0	0
2	0	1	0	0	0	0	0	1	0
3	0	0	1	0	0	0	0	1	0
4	1	0	0	0	1	0	1	0	0