In this post we explore the wine dataset. First, we perform descriptive and exploratory data analysis. Next, we apply dimensionality reduction with the PCA and t-SNE algorithms to see how they behave on this data. Finally, a random forest classifier is implemented, comparing different parameter values to see how each one impacts the classifier's results.
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
%matplotlib inline
Load the dataset¶
#Let's import the data from sklearn
from sklearn.datasets import load_wine
wine=load_wine()
#Convert to a pandas dataframe
data=pd.DataFrame(data=np.c_[wine['data'],wine['target']],columns=wine['feature_names']+['target'])
#Check data with info function
data.info()
# Check for missing, NA and null values (isnull and isna are equivalent in pandas)
data.isna().sum()
Data analysis¶
Basic statistical analysis¶
#Let's see the frequency of the variable target.
#Convert variable to categorical.
data.target=data.target.astype('int64').astype('category')
#Frequency.
freq=data['target'].value_counts()
freq
#Let's check graphically.
freq.plot(kind='bar')
#Let's show a summary of the dataset with the basic descriptive statistics.
data.describe()
#Let's show the histograms of the variables alcohol, magnesium and color_intensity.
#Histograms
data[['alcohol','magnesium','color_intensity']].hist()
Exploratory analysis¶
feats_to_explore = ['alcohol', 'magnesium', 'color_intensity']
# Alcohol variable histograms.
x1 = data.loc[data.target==0, 'alcohol']
x2 = data.loc[data.target==1, 'alcohol']
x3 = data.loc[data.target==2, 'alcohol']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0')
plt.hist(x2, **kwargs, color='b', label='Type 1')
plt.hist(x3, **kwargs, color='r', label='Type 2')
plt.gca().set(title='Alcohol frequency by wine type', ylabel='Frequency')
plt.legend();
#Color_intensity histograms
x1 = data.loc[data.target==0, 'color_intensity']
x2 = data.loc[data.target==1, 'color_intensity']
x3 = data.loc[data.target==2, 'color_intensity']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0')
plt.hist(x2, **kwargs, color='b', label='Type 1')
plt.hist(x3, **kwargs, color='r', label='Type 2')
plt.gca().set(title='Color intensity frequency by wine type', ylabel='Frequency')
plt.legend();
#Magnesium histograms
x1 = data.loc[data.target==0, 'magnesium']
x2 = data.loc[data.target==1, 'magnesium']
x3 = data.loc[data.target==2, 'magnesium']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0')
plt.hist(x2, **kwargs, color='b', label='Type 1')
plt.hist(x3, **kwargs, color='r', label='Type 2')
plt.gca().set(title='Magnesium frequency by wine type', ylabel='Frequency')
plt.legend();
#Alcohol histograms with the mean and the standard deviation.
x1 = data.loc[data.target==0, 'alcohol']
x2 = data.loc[data.target==1, 'alcohol']
x3 = data.loc[data.target==2, 'alcohol']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0, std = {:.2f}'.format(x1.std()))
plt.hist(x2, **kwargs, color='b', label='Type 1, std = {:.2f}'.format(x2.std()))
plt.hist(x3, **kwargs, color='r', label='Type 2, std = {:.2f}'.format(x3.std()))
plt.gca().set(title='Alcohol frequency by wine type', ylabel='Frequency')
plt.axvline(x1.mean(), color='g', linestyle='dashed', linewidth=1)
plt.axvline(x2.mean(), color='b', linestyle='dashed', linewidth=1)
plt.axvline(x3.mean(), color='r', linestyle='dashed', linewidth=1)
plt.legend();
#Color_intensity histograms with the mean and the standard deviation.
x1 = data.loc[data.target==0, 'color_intensity']
x2 = data.loc[data.target==1, 'color_intensity']
x3 = data.loc[data.target==2, 'color_intensity']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0, std = {:.2f}'.format(x1.std()))
plt.hist(x2, **kwargs, color='b', label='Type 1, std = {:.2f}'.format(x2.std()))
plt.hist(x3, **kwargs, color='r', label='Type 2, std = {:.2f}'.format(x3.std()))
plt.gca().set(title='Color intensity frequency by wine type', ylabel='Frequency')
plt.axvline(x1.mean(), color='g', linestyle='dashed', linewidth=1)
plt.axvline(x2.mean(), color='b', linestyle='dashed', linewidth=1)
plt.axvline(x3.mean(), color='r', linestyle='dashed', linewidth=1)
plt.legend();
#Magnesium histograms with the mean and the standard deviation.
x1 = data.loc[data.target==0, 'magnesium']
x2 = data.loc[data.target==1, 'magnesium']
x3 = data.loc[data.target==2, 'magnesium']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0, std = {:.2f}'.format(x1.std()))
plt.hist(x2, **kwargs, color='b', label='Type 1, std = {:.2f}'.format(x2.std()))
plt.hist(x3, **kwargs, color='r', label='Type 2, std = {:.2f}'.format(x3.std()))
plt.gca().set(title='Magnesium frequency by wine type', ylabel='Frequency')
plt.axvline(x1.mean(), color='g', linestyle='dashed', linewidth=1)
plt.axvline(x2.mean(), color='b', linestyle='dashed', linewidth=1)
plt.axvline(x3.mean(), color='r', linestyle='dashed', linewidth=1)
plt.legend();
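The dashed means and the standard deviations shown in the legends can also be read off a table. A minimal sketch using the existing data frame (summary is a new, illustrative name):
#Numeric summary of the three variables per wine type
summary = data.groupby('target')[['alcohol', 'magnesium', 'color_intensity']].agg(['mean', 'std'])
summary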
#Correlation table
df=data[['alcohol','magnesium','color_intensity']]
df.corr()
#scatter plots
df=data[['alcohol','magnesium','color_intensity','target']]
sns.pairplot(df,hue='target')
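Beyond the three selected variables, the full correlation structure can be inspected at a glance. A minimal sketch with the seaborn package already imported above (corr is a new, illustrative name):
#Heatmap of the correlation matrix for all numeric features
corr = data.drop(columns='target').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()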
Dimensionality reduction¶
#Import standardscaler
from sklearn.preprocessing import StandardScaler
#Remove the target column.
x = data.loc[:,data.columns != 'target'].values
y = data.loc[:,['target']].values
#Scale the data
x= pd.DataFrame(StandardScaler().fit_transform(x))
y=pd.DataFrame(y)
# Create PCA object.
pca = PCA(n_components=2)
#Run PCA.
pComp=pca.fit_transform(x)
principalDf = pd.DataFrame(data = pComp
, columns = ['PC 1', 'PC 2'])
principalDf.head()
# Join again the target variable
finalDf = pd.concat([principalDf, data[['target']]], axis = 1)
finalDf.head()
# Show the graphics.
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('PCA', fontsize = 20)
targets = [0, 1, 2]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['target'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'PC 1'],
               finalDf.loc[indicesToKeep, 'PC 2'],
               c=color, s=50)
ax.legend(targets)
ax.grid()
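To judge how much information the two-dimensional projection keeps, the fitted pca object exposes the fraction of variance captured by each component. A quick, optional check:
#Variance explained by the two retained components
print(pca.explained_variance_ratio_)
print("Total explained variance: {:.2%}".format(pca.explained_variance_ratio_.sum()))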
#Use the same variables as in the previous point, they are already standardized
# Create TSNE object.
X_embedded = TSNE(n_components=2,perplexity=15,random_state=42).fit_transform(x)
tsneDf = pd.DataFrame(data = X_embedded
, columns = ['PC 1', 'PC 2'])
tsneDf.head()
# Join the target variable
ftnseDf = pd.concat([tsneDf, data[['target']]], axis = 1)
ftnseDf.head()
# Show the graphic.
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('t-SNE dimension 1', fontsize = 15)
ax.set_ylabel('t-SNE dimension 2', fontsize = 15)
ax.set_title('TSNE', fontsize = 25)
targets = [0, 1, 2]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = ftnseDf['target'] == target
    ax.scatter(ftnseDf.loc[indicesToKeep, 'PC 1'],
               ftnseDf.loc[indicesToKeep, 'PC 2'],
               c=color, s=50)
ax.legend(targets)
ax.grid()
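t-SNE is sensitive to the perplexity parameter (set to 15 above). A minimal sketch comparing a few illustrative values (5, 15 and 30) on the same standardized data; emb is a new, illustrative name:
#Compare several perplexity values side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, perp in zip(axes, [5, 15, 30]):
    emb = TSNE(n_components=2, perplexity=perp, random_state=42).fit_transform(x)
    ax.scatter(emb[:, 0], emb[:, 1], c=data['target'].astype(int), s=30)
    ax.set_title('perplexity = {}'.format(perp))
plt.show()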
Predictions¶
In this last exercise we apply a supervised learning method, specifically the Random Forest classifier, to predict the class each wine belongs to and to evaluate the accuracy obtained with the model. To do so we will use the scaled data from the previous section.
#Let's split the dataset using the scaled data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33,random_state=42)
X_train.shape
X_test.shape
#Create the classifier.
clf=RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train,y_train.values.ravel())
#Apply cross validation to evaluate the results.
scores=cross_val_score(clf,X_train,y_train.values.ravel(),cv=5)
scores
#Calculate the mean and the standard deviation of the validation
print("Mean: %0.2f ; Standard Dev.: %0.2f)" % (scores.mean(), scores.std()))
#Apply PCA.
# Create PCA object.
pca = PCA(n_components=2)
#Apply PCA on training data
pComp=pca.fit_transform(X_train)
#Store the principal components in a DataFrame
principalDf = pd.DataFrame(data = pComp
, columns = ['PC 1', 'PC 2'])
principalDf.head()
#Create the classifier
pcaclf=RandomForestClassifier(n_estimators=10,random_state=42)
pcaclf.fit(principalDf,y_train.values.ravel())
#Apply cross validation
scores=cross_val_score(pcaclf,principalDf,y_train.values.ravel(),cv=5)
scores
#Mean and standard deviation of the validation.
print("Mean: %0.2f ; Standard dev.: %0.2f)" % (scores.mean(), scores.std()))
#Run TSNE.
X_embedded = TSNE(n_components=2,perplexity=15,random_state=42).fit_transform(X_train)
tsneDf = pd.DataFrame(data = X_embedded
, columns = ['PC 1', 'PC 2'])
tsneDf.head()
#Create the classifier
tclf=RandomForestClassifier(n_estimators=10, random_state=42 )
tclf.fit(tsneDf,y_train.values.ravel())
#Apply cross validation
scores=cross_val_score(tclf,tsneDf,y_train.values.ravel(),cv=5)
scores
#Calculate mean and standard deviation of the validation
print("Mean: %0.2f ; Standard dev.: %0.2f)" % (scores.mean(), scores.std()))
#Transform the test data with the PCA fitted on the training set (t-SNE has no transform method for new data, so only the PCA-based model is evaluated on the test set)
PCA_test=pca.transform(X_test)
pcaTestDf = pd.DataFrame(data = PCA_test
, columns = ['PC 1', 'PC 2'])
pcaTestDf.shape
prediction=pcaclf.predict(pcaTestDf)
prediction
#Cross validation and metrics.
acc_score=accuracy_score(y_test,prediction)
acc_score
#We get 98% accuracy; let's look at the confusion matrix.
conf_matrix=confusion_matrix(y_test,prediction)
conf_matrix
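The same confusion matrix can be drawn as a labelled heatmap, which makes the per-class errors easier to read. An optional sketch using the target_names stored in the wine bunch:
#Confusion matrix as a heatmap
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=wine['target_names'], yticklabels=wine['target_names'])
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()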
Next we are going to test the n_estimators, max_depth and min_samples_split parameters with different values in order to see their purpose and their effect on the results. For each value we save the prediction accuracy on the train and test data and plot the two curves. To make the improvement easier to see, we test on the dataset without dimensionality reduction, since it is not the best model, so we can check how much the model improves with each parameter.
n_estimators: this parameter is the number of trees in the forest. The first graph shows its effect: the accuracy of the model on new cases increases up to 16 trees, where it reaches its maximum, and a larger number of trees does not improve the model further.
max_depth: this is the maximum depth of the trees in the model, i.e. the number of levels of each tree. In the example we show its effect with 4 trees (n_estimators); we can see that beyond a certain depth the model overfits and no longer generalizes to new data.
min_samples_split: this parameter defines the minimum number of samples required to split a node. A larger value restricts the trees further by forcing them to use more samples before splitting. Here we pass the parameter as a fraction; with values up to 40% the model stays above 90% accuracy, and beyond that its accuracy drops considerably. A joint search over the three parameters is sketched right below, before the individual curves.
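The curves below vary one parameter at a time. As a complement, here is a hedged sketch of a joint search with GridSearchCV over illustrative grids (param_grid and search are new names, and the grid values are assumptions, not the ones tested below):
#Joint search over the three parameters (sketch)
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [4, 8, 16, 32],
    'max_depth': [2, 4, 8, None],
    'min_samples_split': [0.1, 0.2, 0.4],
}
search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
search.fit(X_train, y_train.values.ravel())
print(search.best_params_, search.best_score_)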
#Let's start with n_estimators
from matplotlib.legend_handler import HandlerLine2D
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
train_results = []
test_results = []
#Save precision data in arrays in order to show the graphic.
for estimator in n_estimators:
    clf = RandomForestClassifier(n_estimators=estimator, random_state=42)
    clf.fit(X_train, y_train.values.ravel())
    pred_train = clf.predict(X_train)
    acc_score_train = accuracy_score(y_train, pred_train)
    train_results.append(acc_score_train)
    pred_test = clf.predict(X_test)
    acc_score_test = accuracy_score(y_test, pred_test)
    test_results.append(acc_score_test)
line1, = plt.plot(n_estimators, train_results, 'b', label='Train accuracy')
line2, = plt.plot(n_estimators, test_results, 'r', label='Test accuracy')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Accuracy')
plt.xlabel('n_estimators')
plt.show()
#Continue with max_depth
max_depths = np.arange(1, 33)  # integer depths from 1 to 32
train_results = []
test_results = []
#Save precision data in arrays in order to show the graphic
for max_depth in max_depths:
    clf = RandomForestClassifier(n_estimators=4, max_depth=max_depth, random_state=42)
    clf.fit(X_train, y_train.values.ravel())
    pred_train = clf.predict(X_train)
    acc_score_train = accuracy_score(y_train, pred_train)
    train_results.append(acc_score_train)
    pred_test = clf.predict(X_test)
    acc_score_test = accuracy_score(y_test, pred_test)
    test_results.append(acc_score_test)
line1, = plt.plot(max_depths, train_results, 'b', label='Train accuracy')
line2, = plt.plot(max_depths, test_results, 'r', label='Test accuracy')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Accuracy')
plt.xlabel('max_depths')
plt.show()
#Finally, min_samples_split
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
test_results = []
train_results = []
#Save precision data in arrays in order to show the graphic
for min_samples_split in min_samples_splits:
    clf = RandomForestClassifier(n_estimators=4, max_depth=2, min_samples_split=min_samples_split, random_state=42)
    clf.fit(X_train, y_train.values.ravel())
    pred_train = clf.predict(X_train)
    acc_score_train = accuracy_score(y_train, pred_train)
    train_results.append(acc_score_train)
    pred_test = clf.predict(X_test)
    acc_score_test = accuracy_score(y_test, pred_test)
    test_results.append(acc_score_test)
line1, = plt.plot(min_samples_splits, train_results, 'b', label='Train accuracy')
line2, = plt.plot(min_samples_splits, test_results, 'r', label='Test accuracy')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Accuracy')
plt.xlabel('min_samples_splits')
plt.show()