In this post we explore the wine dataset. First, we perform descriptive and exploratory data analysis. Next, we apply dimensionality reduction with the PCA and t-SNE algorithms to see how they behave on this data. Finally, a random forest classifier is implemented, comparing different parameter values to see how each one impacts the classifier's results.
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
%matplotlib inline
Load the dataset¶
#Let's import the data from sklearn
from sklearn.datasets import load_wine
wine=load_wine()
#Convert to a pandas dataframe
data=pd.DataFrame(data=np.c_[wine['data'],wine['target']],columns=wine['feature_names']+['target'])
#Check data with info function
data.info()
# Check for missing, NA and null values (isnull and isna are equivalent in pandas)
data.isna().sum()
Data analysis¶
Basic statistical analysis¶
#Let's see the frequency of the variable target.
#Convert variable to categorical.
data.target=data.target.astype('int64').astype('category')
#Frequency.
freq=data['target'].value_counts()
freq
#Let's check graphically.
freq.plot(kind='bar')
#Let's show a summary of the dataset with the basic descriptive statistics.
data.describe()
#Let's show the histograms of the variables alcohol, magnesium and color_intensity.
#Histograms
data[['alcohol','magnesium','color_intensity']].hist()
Exploratory analysis¶
feats_to_explore = ['alcohol', 'magnesium', 'color_intensity']
# Alcohol variable histograms.
x1 = data.loc[data.target==0, 'alcohol']
x2 = data.loc[data.target==1, 'alcohol']
x3 = data.loc[data.target==2, 'alcohol']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0')
plt.hist(x2, **kwargs, color='b', label='Type 1')
plt.hist(x3, **kwargs, color='r', label='Type 2')
plt.gca().set(title='Alcohol frequency by wine type', ylabel='Frequency')
plt.legend();
#Color_intensity histograms
x1 = data.loc[data.target==0, 'color_intensity']
x2 = data.loc[data.target==1, 'color_intensity']
x3 = data.loc[data.target==2, 'color_intensity']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0')
plt.hist(x2, **kwargs, color='b', label='Type 1')
plt.hist(x3, **kwargs, color='r', label='Type 2')
plt.gca().set(title='Color intensity frequency by wine type', ylabel='Frequency')
plt.legend();
#Magnesium histograms
x1 = data.loc[data.target==0, 'magnesium']
x2 = data.loc[data.target==1, 'magnesium']
x3 = data.loc[data.target==2, 'magnesium']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0')
plt.hist(x2, **kwargs, color='b', label='Type 1')
plt.hist(x3, **kwargs, color='r', label='Type 2')
plt.gca().set(title='Magnesium frequency by wine type', ylabel='Frequency')
plt.legend();
#Alcohol histograms with the mean and the standard deviation.
x1 = data.loc[data.target==0, 'alcohol']
x2 = data.loc[data.target==1, 'alcohol']
x3 = data.loc[data.target==2, 'alcohol']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0, std = {:.2f}'.format(x1.std()))
plt.hist(x2, **kwargs, color='b', label='Type 1, std = {:.2f}'.format(x2.std()))
plt.hist(x3, **kwargs, color='r', label='Type 2, std = {:.2f}'.format(x3.std()))
plt.gca().set(title='Alcohol frequency by wine type', ylabel='Frequency')
plt.axvline(x1.mean(), color='g', linestyle='dashed', linewidth=1)
plt.axvline(x2.mean(), color='b', linestyle='dashed', linewidth=1)
plt.axvline(x3.mean(), color='r', linestyle='dashed', linewidth=1)
plt.legend();
#Color_intensity histograms with the mean and the standard deviation.
x1 = data.loc[data.target==0, 'color_intensity']
x2 = data.loc[data.target==1, 'color_intensity']
x3 = data.loc[data.target==2, 'color_intensity']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0, std = {:.2f}'.format(x1.std()))
plt.hist(x2, **kwargs, color='b', label='Type 1, std = {:.2f}'.format(x2.std()))
plt.hist(x3, **kwargs, color='r', label='Type 2, std = {:.2f}'.format(x3.std()))
plt.gca().set(title='Color intensity frequency by wine type', ylabel='Frequency')
plt.axvline(x1.mean(), color='g', linestyle='dashed', linewidth=1)
plt.axvline(x2.mean(), color='b', linestyle='dashed', linewidth=1)
plt.axvline(x3.mean(), color='r', linestyle='dashed', linewidth=1)
plt.legend();
#Magnesium histograms with the mean and the standard deviation.
x1 = data.loc[data.target==0, 'magnesium']
x2 = data.loc[data.target==1, 'magnesium']
x3 = data.loc[data.target==2, 'magnesium']
kwargs = dict(alpha=0.3,bins=25)
plt.hist(x1, **kwargs, color='g', label='Type 0, std = {:.2f}'.format(x1.std()))
plt.hist(x2, **kwargs, color='b', label='Type 1, std = {:.2f}'.format(x2.std()))
plt.hist(x3, **kwargs, color='r', label='Type 2, std = {:.2f}'.format(x3.std()))
plt.gca().set(title='Magnesium frequency by wine type', ylabel='Frequency')
plt.axvline(x1.mean(), color='g', linestyle='dashed', linewidth=1)
plt.axvline(x2.mean(), color='b', linestyle='dashed', linewidth=1)
plt.axvline(x3.mean(), color='r', linestyle='dashed', linewidth=1)
plt.legend();
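The dashed means and the standard deviations shown in the legends can also be read off a table. A minimal sketch using the existing data frame (summary is a new, illustrative name):
#Numeric summary of the three variables per wine type
summary = data.groupby('target')[['alcohol', 'magnesium', 'color_intensity']].agg(['mean', 'std'])
summary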
#Correlation table
df=data[['alcohol','magnesium','color_intensity']]
df.corr()
#scatter plots
df=data[['alcohol','magnesium','color_intensity','target']]
sns.pairplot(df,hue='target')
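Beyond the three selected variables, the full correlation structure can be inspected at a glance. A minimal sketch with the seaborn package already imported above (corr is a new, illustrative name):
#Heatmap of the correlation matrix for all numeric features
corr = data.drop(columns='target').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()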
Dimensionality reduction¶
#Import standardscaler
from sklearn.preprocessing import StandardScaler
#Remove the target column.
x = data.loc[:,data.columns != 'target'].values
y = data.loc[:,['target']].values
#Scale the data
x= pd.DataFrame(StandardScaler().fit_transform(x))
y=pd.DataFrame(y)
# Create PCA object.
pca = PCA(n_components=2)
#Run PCA.
pComp=pca.fit_transform(x)
principalDf = pd.DataFrame(data = pComp
, columns = ['PC 1', 'PC 2'])
principalDf.head()
# Join again the target variable
finalDf = pd.concat([principalDf, data[['target']]], axis = 1)
finalDf.head()
# Show the graphics.
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('PCA', fontsize = 20)
targets = [0, 1, 2]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['target'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'PC 1'],
               finalDf.loc[indicesToKeep, 'PC 2'],
               c=color, s=50)
ax.legend(targets)
ax.grid()
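To judge how much information the two-dimensional projection keeps, the fitted pca object exposes the fraction of variance captured by each component. A quick, optional check:
#Variance explained by the two retained components
print(pca.explained_variance_ratio_)
print("Total explained variance: {:.2%}".format(pca.explained_variance_ratio_.sum()))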
#Use the same variables as in the previous point, they are already standardized
# Create TSNE object.
X_embedded = TSNE(n_components=2,perplexity=15,random_state=42).fit_transform(x)
tsneDf = pd.DataFrame(data = X_embedded
, columns = ['PC 1', 'PC 2'])
tsneDf.head()
# Join the target variable
ftnseDf = pd.concat([tsneDf, data[['target']]], axis = 1)
ftnseDf.head()
# Show the graphic.
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('t-SNE dimension 1', fontsize = 15)
ax.set_ylabel('t-SNE dimension 2', fontsize = 15)
ax.set_title('TSNE', fontsize = 25)
targets = [0, 1, 2]
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = ftnseDf['target'] == target
    ax.scatter(ftnseDf.loc[indicesToKeep, 'PC 1'],
               ftnseDf.loc[indicesToKeep, 'PC 2'],
               c=color, s=50)
ax.legend(targets)
ax.grid()
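t-SNE is sensitive to the perplexity parameter (set to 15 above). A minimal sketch comparing a few illustrative values (5, 15 and 30) on the same standardized data; emb is a new, illustrative name:
#Compare several perplexity values side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, perp in zip(axes, [5, 15, 30]):
    emb = TSNE(n_components=2, perplexity=perp, random_state=42).fit_transform(x)
    ax.scatter(emb[:, 0], emb[:, 1], c=data['target'].astype(int), s=30)
    ax.set_title('perplexity = {}'.format(perp))
plt.show()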
Predictions¶
In this last exercise we apply a supervised learning method, specifically the Random Forest classifier, to predict the class each wine belongs to and to evaluate the accuracy obtained with the model. To do so we will use the scaled data from the previous section.
#Let's split the dataset using the scaled data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33,random_state=42)
X_train.shape
X_test.shape
#Create the classifier.
clf=RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train,y_train.values.ravel())
#Apply cross validation to evaluate the results.
scores=cross_val_score(clf,X_train,y_train.values.ravel(),cv=5)
scores
#Calculate the mean and the standard deviation of the validation
print("Mean: %0.2f ; Standard Dev.: %0.2f)" % (scores.mean(), scores.std()))
#Apply PCA.
# Create PCA object.
pca = PCA(n_components=2)
#Apply PCA on training data
pComp=pca.fit_transform(X_train)
#Store the principal components in a DataFrame
principalDf = pd.DataFrame(data = pComp
, columns = ['PC 1', 'PC 2'])
principalDf.head()
#Create the classifier
pcaclf=RandomForestClassifier(n_estimators=10,random_state=42)
pcaclf.fit(principalDf,y_train.values.ravel())
#Apply cross validation
scores=cross_val_score(pcaclf,principalDf,y_train.values.ravel(),cv=5)
scores
#Mean and standard deviation of the validation.
print("Mean: %0.2f ; Standard dev.: %0.2f)" % (scores.mean(), scores.std()))
#Run TSNE.
X_embedded = TSNE(n_components=2,perplexity=15,random_state=42).fit_transform(X_train)
tsneDf = pd.DataFrame(data = X_embedded
, columns = ['PC 1', 'PC 2'])
tsneDf.head()
#Create the classifier
tclf=RandomForestClassifier(n_estimators=10, random_state=42 )
tclf.fit(tsneDf,y_train.values.ravel())
#Apply cross validation
scores=cross_val_score(tclf,tsneDf,y_train.values.ravel(),cv=5)
scores
#Calculate mean and standard deviation of the validation
print("Mean: %0.2f ; Standard dev.: %0.2f)" % (scores.mean(), scores.std()))
#Transform the test data with the PCA fitted on the training set (t-SNE has no transform method for new data, so only the PCA-based model is evaluated on the test set)
PCA_test=pca.transform(X_test)
pcaTestDf = pd.DataFrame(data = PCA_test
, columns = ['PC 1', 'PC 2'])
pcaTestDf.shape
prediction=pcaclf.predict(pcaTestDf)
prediction
#Cross validation and metrics.
acc_score=accuracy_score(y_test,prediction)
acc_score
#We get 98% accuracy; let's look at the confusion matrix.
conf_matrix=confusion_matrix(y_test,prediction)
conf_matrix
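The same confusion matrix can be drawn as a labelled heatmap, which makes the per-class errors easier to read. An optional sketch using the target_names stored in the wine bunch:
#Confusion matrix as a heatmap
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=wine['target_names'], yticklabels=wine['target_names'])
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()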
Next we are going to test the n_estimators, max_depth and min_samples_split parameters with different values in order to see their purpose and their effect on the results. For each value we save the prediction accuracy on the train and test data and plot the two curves. To make the improvement easier to see, we test on the dataset without dimensionality reduction, since it is not the best model, so we can check how much the model improves with each parameter.
n_estimators: this parameter is the number of trees in the forest. The first graph shows its effect: the accuracy of the model on new cases increases up to 16 trees, where it reaches its maximum, and a larger number of trees does not improve the model further.
max_depth: this is the maximum depth of the trees in the model, i.e. the number of levels of each tree. In the example we show its effect with 4 trees (n_estimators); we can see that beyond a certain depth the model overfits and no longer generalizes to new data.
min_samples_split: this parameter defines the minimum number of samples required to split a node. A larger value restricts the trees further by forcing them to use more samples before splitting. Here we pass the parameter as a fraction; with values up to 40% the model stays above 90% accuracy, and beyond that its accuracy drops considerably. A joint search over the three parameters is sketched right below, before the individual curves.
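The curves below vary one parameter at a time. As a complement, here is a hedged sketch of a joint search with GridSearchCV over illustrative grids (param_grid and search are new names, and the grid values are assumptions, not the ones tested below):
#Joint search over the three parameters (sketch)
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [4, 8, 16, 32],
    'max_depth': [2, 4, 8, None],
    'min_samples_split': [0.1, 0.2, 0.4],
}
search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
search.fit(X_train, y_train.values.ravel())
print(search.best_params_, search.best_score_)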
#Let's start with n_estimators
from matplotlib.legend_handler import HandlerLine2D
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
train_results = []
test_results = []
#Save precision data in arrays in order to show the graphic.
for estimator in n_estimators:
    clf = RandomForestClassifier(n_estimators=estimator, random_state=42)
    clf.fit(X_train, y_train.values.ravel())
    pred_train = clf.predict(X_train)
    acc_score_train = accuracy_score(y_train, pred_train)
    train_results.append(acc_score_train)
    pred_test = clf.predict(X_test)
    acc_score_test = accuracy_score(y_test, pred_test)
    test_results.append(acc_score_test)
line1, = plt.plot(n_estimators, train_results, 'b', label='Train accuracy')
line2, = plt.plot(n_estimators, test_results, 'r', label='Test accuracy')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Accuracy')
plt.xlabel('n_estimators')
plt.show()
#Continue with max_depth
max_depths = np.arange(1, 33)  # integer depths from 1 to 32
train_results = []
test_results = []
#Save precision data in arrays in order to show the graphic
for max_depth in max_depths:
    clf = RandomForestClassifier(n_estimators=4, max_depth=max_depth, random_state=42)
    clf.fit(X_train, y_train.values.ravel())
    pred_train = clf.predict(X_train)
    acc_score_train = accuracy_score(y_train, pred_train)
    train_results.append(acc_score_train)
    pred_test = clf.predict(X_test)
    acc_score_test = accuracy_score(y_test, pred_test)
    test_results.append(acc_score_test)
line1, = plt.plot(max_depths, train_results, 'b', label='Train accuracy')
line2, = plt.plot(max_depths, test_results, 'r', label='Test accuracy')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Accuracy')
plt.xlabel('max_depths')
plt.show()
#Finally, min_samples_split
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
test_results = []
train_results = []
#Save precision data in arrays in order to show the graphic
for min_samples_split in min_samples_splits:
    clf = RandomForestClassifier(n_estimators=4, max_depth=2, min_samples_split=min_samples_split, random_state=42)
    clf.fit(X_train, y_train.values.ravel())
    pred_train = clf.predict(X_train)
    acc_score_train = accuracy_score(y_train, pred_train)
    train_results.append(acc_score_train)
    pred_test = clf.predict(X_test)
    acc_score_test = accuracy_score(y_test, pred_test)
    test_results.append(acc_score_test)
line1, = plt.plot(min_samples_splits, train_results, 'b', label='Train accuracy')
line2, = plt.plot(min_samples_splits, test_results, 'r', label='Test accuracy')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Accuracy')
plt.xlabel('min_samples_splits')
plt.show()