Let's apply some classification methods to the same TripAdvisor data used in the post https://www.alldatascience.com/nlp/nlp-target-and-aspect-detection-with-python.
We will read and preprocess the data again and then vectorize it in two different ways:
1. With a TF-IDF vectorizer, which builds vectors from the frequency of each word within a document and across all documents, lowering the weight of words that appear too often (usually uninformative words such as articles or prepositions).
2. With Word2Vec, which builds word embeddings from the words that surround each word, so the context in which a word appears is taken into account (a toy illustration of both follows below).
Then, using these vectorizations, we test two classifiers:
- Logistic Regression, which fits a model of the probability that a document belongs to each class.
- SVC: a Support Vector Classifier, which looks for the hyperplane that best separates the classes.
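As a quick illustration of the two vectorizations (a minimal sketch with made-up sentences; embeddings trained on two sentences are not meaningful, this only shows the shapes and the API, and it assumes scikit-learn and gensim are installed):
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
toy = ["the room was clean and quiet",
       "the staff was rude and the room was dirty"]
# TF-IDF: one vector per document; words that appear in every document
# (such as "the") get a lower idf weight than rarer words.
tfidf = TfidfVectorizer()
print(tfidf.fit_transform(toy).toarray().round(2))
# Word2Vec: one dense vector per word, learned from the surrounding words.
# (gensim >= 4.0 uses vector_size; older versions call the parameter size.)
w2v = Word2Vec([s.split() for s in toy], vector_size=20, window=2, min_count=1, seed=1)
# A document can then be represented, for example, by the mean of its word vectors.
doc_vec = np.mean(w2v.wv[toy[0].split()], axis=0)
print(doc_vec.shape)  # (20,)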
The Opinion field of the dataset is used to measure the accuracy, recall and F1 score of the different classification approaches.
Let's load the TripAdvisor data again and build a classifier of positive and negative reviews, using the Opinion field as the class label.
import pandas as pd

tripadvisor_data = pd.read_csv('tripadvisor_data.csv')
tripadvisor_data[0:5]
#Join short and long descriptions in order to use all the words available.
trdata=tripadvisor_data['Short']+" "+tripadvisor_data['Long']
trdata[1]
import re
import nltk

#Build the list of English stopwords provided in NLTK
#(see: https://sangeetablog.wordpress.com/2015/07/28/download-stopwords-from-nltk-library/#).
#It may be necessary to run nltk.download('stopwords') first.
stopwords = nltk.corpus.stopwords.words('english')
#Remove stopwords and punctuation marks.
def transform_sentence(sentence):
    #Strip punctuation marks, lowercase, then drop stopwords
    #(lowercasing first so that capitalized stopwords are also removed).
    clean = re.sub("['.,“”']", "", sentence).lower()
    terms = ' '.join([word for word in clean.split() if word not in stopwords])
    return terms
terms_stream = [transform_sentence(dt) for dt in trdata]
terms_stream[0]
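If lemmatization is also wanted, one option (a sketch, an assumption rather than what the original post does, using NLTK's WordNetLemmatizer; it may require nltk.download('wordnet') the first time) is:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def transform_sentence_lemmatized(sentence):
    #Same cleaning as transform_sentence, plus lemmatization of each token.
    clean = re.sub("['.,“”']", "", sentence).lower()
    terms = [lemmatizer.lemmatize(word) for word in clean.split() if word not in stopwords]
    return ' '.join(terms)

terms_stream_lemmatized = [transform_sentence_lemmatized(dt) for dt in trdata]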
#############################################
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
vectorizerNYT = TfidfVectorizer(
    analyzer='word',
)
X = vectorizerNYT.fit_transform(terms_stream)
#Dense matrix with the TF-IDF vectors.
MNYT = X.toarray()
print(MNYT)
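To get a feel for what the vectorizer produced, we can inspect the matrix shape and a few of the learned terms (get_feature_names_out requires scikit-learn >= 1.0; older versions use get_feature_names instead):
#Number of documents x vocabulary size.
print(X.shape)
#A few of the terms in the learned vocabulary.
print(vectorizerNYT.get_feature_names_out()[:10])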
#Let's save the Opinion labels.
data_labels=tripadvisor_data['Opinion']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    MNYT,
    data_labels,      # Opinion labels
    train_size=0.80,  # We use 80% of the corpus as training data.
    random_state=1234)
from sklearn.linear_model import LogisticRegression
#Create the logistic regression classifier.
classifier = LogisticRegression()
#Let's fit the model
log_model = classifier.fit(X=X_train, y=y_train)
#Let's predict using test data.
y_pred = log_model.predict(X_test)
print("Prediction results")
print(y_pred)
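The next cells build a document representation from Word2Vec embeddings. They rely on a word-embedding model named model, which is not defined in this excerpt. A minimal sketch of one way to obtain it with gensim (an assumption, not necessarily what the original post used):
from gensim.models import Word2Vec

#Tokenize the preprocessed reviews: one list of words per document.
tokenized_reviews = [doc.split() for doc in terms_stream]

#Train a small Word2Vec model on the corpus itself.
#(gensim >= 4.0 uses vector_size; older versions call the parameter size.)
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=100,
                     window=5, min_count=2, seed=1234)

#Keep only the word vectors; `word in model` and model[list_of_words]
#work on gensim KeyedVectors in both 3.x and 4.x.
model = w2v_model.wv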
#Mean Word2Vec vector of a sentence (uses the embedding model `model` defined above).
def vectormean(sentence):
    #Keep only the tokens the embedding model knows about.
    words = [word for word in sentence.split() if word in model]
    if len(words) >= 1:
        #Mean of the word vectors: one embedding per document.
        return np.mean(model[words], axis=0)
    #Fallback when no token is in the vocabulary.
    return np.zeros(model.vector_size)

vector = []
for td in tripadvisor_data['Short']:
    #Collapse each document's mean embedding into a single scalar.
    vector.append(np.mean(vectormean(td)))
len(vector)
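Note that the loop above reduces each document's mean embedding to a single number, which discards most of the information in the embedding. An alternative worth trying (a sketch, not what this post does further on) keeps the whole mean vector per document:
#One row per document, one column per embedding dimension.
vector_full = np.vstack([vectormean(td) for td in tripadvisor_data['Short']])
print(vector_full.shape)
Such a matrix could be passed to train_test_split directly, without the reshape used below.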
#Split train and test data
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    vector,
    data_labels,      # Opinion labels
    train_size=0.80,  # Use 80% of the corpus as training data.
    random_state=1234)
#Reshape the 1-D list of scalar features into a 2-D column vector, as scikit-learn expects.
X_train2=np.array(X_train2).reshape(-1,1)
X_test2=np.array(X_test2).reshape(-1,1)
from sklearn.linear_model import LogisticRegression
#Create the logistic regression model
classifier = LogisticRegression()
#Fit the model
log_model2 = classifier.fit(X=X_train2, y=y_train2)
#Predict with test data
y_pred2 = log_model2.predict(X_test2)
print("Prediction results")
print(y_pred2)
Model evaluation and comparison
In this section we evaluate and compare the four combinations:
- Logistic Regression with TfidfVectorizer.
- Logistic Regression with mean Word2Vec vectors.
- SVC with TfidfVectorizer.
- SVC with mean Word2Vec vectors.
from sklearn import metrics
metrics.confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)
accuracy1=(tp+tn)/(tp+fp+tn+fn)
recall1=tp/(tp+fn)
#F1 is the harmonic mean of precision and recall.
precision1=tp/(tp+fp)
F1=2*precision1*recall1/(precision1+recall1)
print('Accuracy:'+str(accuracy1)+" ; "+"Recall:"+str(recall1)+" ; "+"F1:"+str(F1))
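As a cross-check of the figures computed by hand, scikit-learn can report precision, recall and F1 per class directly:
print(metrics.classification_report(y_test, y_pred))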
tn, fp, fn, tp = metrics.confusion_matrix(y_test2, y_pred2).ravel()
(tn, fp, fn, tp)
accuracy2=(tp+tn)/(tp+fp+tn+fn)
recall2=tp/(tp+fn)
precision2=tp/(tp+fp)
F12=2*precision2*recall2/(precision2+recall2)
print('Accuracy:'+str(accuracy2)+" ; "+"Recall:"+str(recall2)+" ; "+"F1:"+str(F12))
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred3 = svclassifier.predict(X_test)
y_pred3
#############################################
#Now let's apply SVC to the mean word2vec features.
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train2, y_train2)
#Results with the mean word2vec features.
y_pred4 = svclassifier.predict(X_test2)
y_pred4
from sklearn import metrics
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred3).ravel()
(tn, fp, fn, tp)
accuracy3=(tp+tn)/(tp+fp+tn+fn)
recall3=tp/(tp+fn)
precision3=tp/(tp+fp)
F13=2*precision3*recall3/(precision3+recall3)
print('Accuracy:'+str(accuracy3)+" ; "+"Recall:"+str(recall3)+" ; "+"F1:"+str(F13))
from sklearn import metrics
tn, fp, fn, tp = metrics.confusion_matrix(y_test2, y_pred4).ravel()
(tn, fp, fn, tp)
accuracy4=(tp+tn)/(tp+fp+tn+fn)
recall4=tp/(tp+fn)
precision4=tp/(tp+fp)
F14=2*precision4*recall4/(precision4+recall4)
print('Accuracy:'+str(accuracy4)+" ; "+"Recall:"+str(recall4)+" ; "+"F1:"+str(F14))
#Summary of the metrics for the four approaches.
print('Logistic regression - TfidfVectorizer:')
print('Accuracy:'+str(accuracy1)+" ; "+"Recall:"+str(recall1)+" ; "+"F1:"+str(F1))
print('Logistic regression - word2vec:')
print('Accuracy:'+str(accuracy2)+" ; "+"Recall:"+str(recall2)+" ; "+"F1:"+str(F12))
print('SVM - TfidfVectorizer:')
print('Accuracy:'+str(accuracy3)+" ; "+"Recall:"+str(recall3)+" ; "+"F1:"+str(F13))
print('SVM - word2vec:')
print('Accuracy:'+str(accuracy4)+" ; "+"Recall:"+str(recall4)+" ; "+"F1:"+str(F14))