# Prepared by Mahsa Sadi on 2020-06-22.
# In this notebook we perform three steps: load and summarize the iris
# dataset, visualize it, and evaluate several classification models.
# pandas is a python library for manipulating and analyzing numerical tables and time-series
import pandas
print ("pandas {}".format (pandas.__version__))
from pandas.plotting import scatter_matrix
# matplotlib is a Python plotting library for numerical data
import matplotlib
print ("matplotlib {}".format (matplotlib.__version__))
import matplotlib.pyplot
# sklearn is a Python library for machine learning.
# sklearn contains various datasets and ready-to-use implementations of various machine learning algorithms.
import sklearn
print ("sklearn {}".format (sklearn.__version__))
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Location of the iris dataset in the UCI machine-learning repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column labels: four numeric feature columns plus the target class column.
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
# Download the dataset and attach the column labels defined above.
data_set = pandas.read_csv(url, names=names)
# Report how many rows and columns the dataset has.
print(data_set.shape)
# Peek at the first twenty records.
print(data_set.head(20))
# Summary statistics (count, mean, std, min/max, quartiles) per feature.
print(data_set.describe())
# Count how many samples belong to each iris species.
print(data_set.groupby('class').size())
# ---- Visualize the data ----
# Box plot of all four features on one set of axes: a quick side-by-side
# view of each feature's median, quartiles, and outliers.
data_set.plot(kind='box', subplots=False, sharex=False, sharey=False)
# Box plots again, one subplot per feature in a 2x2 grid, so each feature's
# distribution can be inspected on its own scale.
data_set.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)
# Histograms of every feature: useful for judging whether a distribution is
# roughly Gaussian (petal length and sepal width look close to Gaussian here).
data_set.hist()
# Scatter plots of every pair of features: reveals pairwise relationships
# and hints at how separable the classes might be.
scatter_matrix(data_set)
# Render all figures created above. Without this call the plots are never
# displayed when the script runs outside an interactive environment
# (matplotlib.pyplot was imported at the top of the file but never used).
matplotlib.pyplot.show()
# Convert the DataFrame to a plain array of raw values.
data_table = data_set.values
# Show the first twenty rows of the raw array.
print(data_table[0:20])
# Feature matrix: the four measurement columns.
X = data_table[:, 0:4]
# Target vector: the iris species column.
Y = data_table[:, 4]
# Hold out 20% of the samples for testing; train on the remaining 80%.
test_set_size = 0.2
# Fixed seed so the random split is reproducible between runs.
seed = 6
# Randomly partition the data into training and test sets.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=test_set_size, random_state=seed)
# Evaluate models by classification accuracy.
scoring = 'accuracy'
# ---- Build and compare candidate models ----
# Each entry pairs a display name with an untrained classifier.
models = []
models.append(('Logistic Regression', LogisticRegression(solver='lbfgs', multi_class='ovr')))
models.append(('Linear Discriminant Analysis', LinearDiscriminantAnalysis()))
models.append(('K Nearest Neighbors', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('Support Vector Machine', SVC(gamma='scale')))
models.append(('Gaussian Naive Bayes', GaussianNB()))
# Cross-validation scores per model, and the matching model names.
results = []
names = []
# 10-fold cross validation: each fold trains on 9 slices and validates on the
# remaining one. shuffle=True is required for random_state to take effect;
# without it, scikit-learn >= 0.24 raises a ValueError. The splitter is
# loop-invariant, so it is built once outside the loop.
k_fold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
for name, model in models:
    names.append(name)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=k_fold, scoring=scoring)
    results.append(cv_results)
    # Report mean accuracy and its standard deviation across the folds.
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))