In this post we perform target and aspect detection on a dataset of TripAdvisor opinions.
The target (or topic) is what the opinions are about; aspects are parts or features of the target.
We explore target detection with word embeddings (Word2Vec), which find words that appear in similar contexts, and we try to extract aspects of the target by searching for close words using the WordNet synsets.
First, we preprocess the data: we remove stopwords and punctuation marks, convert the text to lowercase and lemmatize (reduce every word to its base form). Then we perform target and aspect detection. Finally, we apply some clustering to group the aspects found.
import pandas as pd
import gensim
import nltk
from nltk import word_tokenize
from nltk.collocations import *
from nltk.stem.wordnet import WordNetLemmatizer
import re
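Note: the NLTK resources used below (the stopword list, the tokenizer models, WordNet and the POS tagger) may need to be downloaded once. This assumes a standard NLTK installation and can be skipped if the corpora are already available.
#One-time download of the NLTK resources used in this post
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')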
tripadvisor_data = pd.read_csv('tripadvisor_data.csv')
#tripadvisor_data
Let's preprocess the data. Each record has a short field (the headline) and a long description.
We remove stopwords (using the NLTK list) and punctuation marks, convert all words to lowercase and lemmatize them (reduce them to their base form) with WordNetLemmatizer.
tripadvisor_data.head()
#Remove punctuation marks and convert to lowercase
#(a regex replace removes punctuation anywhere in the text, not only at the ends)
short = tripadvisor_data['Short'].str.replace(r'[^\w\s]', ' ', regex=True).str.lower()
long = tripadvisor_data['Long'].str.replace(r'[^\w\s]', ' ', regex=True).str.lower()
#Remove stopwords using the NLTK English stopword list
stopwords = nltk.corpus.stopwords.words('english')
short = short.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
long = long.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
short
Let's proceed with lemmatization of the short and long texts.
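Before applying it to the whole corpus, here is a tiny illustration of what the lemmatizer does (the example words are only illustrative):
#Quick illustration of WordNet lemmatization (example words only)
from nltk.stem.wordnet import WordNetLemmatizer
print(WordNetLemmatizer().lemmatize('rooms'))   #-> room
print(WordNetLemmatizer().lemmatize('hotels'))  #-> hotel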
#Import NLTK methods to lemmatize with Wordnet
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
#Lemmatize every word (reassigning the loop variable would not modify the Series,
#so we use apply on the whole column instead)
short = short.apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))
long = long.apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))
print(short[0:5])
print(long[0:5])
#Join short and long texts into a single Series
texto = pd.concat([short, long], ignore_index=True)
texto[0:5]
Load the Word2Vec model.
from gensim.models import Word2Vec, KeyedVectors
model = KeyedVectors.load_word2vec_format('model_w2v')
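The file model_w2v is assumed to be a pre-trained model in word2vec format. If no pre-trained file is at hand, a comparable model could be trained on the preprocessed texts themselves; this is just a minimal sketch with illustrative hyperparameters (gensim 4 API), not the model actually used in this post.
#Minimal sketch: train a Word2Vec model on the corpus itself (illustrative parameters)
sentences = [word_tokenize(t) for t in texto]
w2v_local = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
#w2v_local.wv exposes the same KeyedVectors interface (similarity, most_similar) used below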
#Let's calculate the similarity of every word with 'hotel'
words = []
wordsim = []
similarity = []
for t in texto:
    for w in word_tokenize(t):
        #Keep only words that are in the Word2Vec vocabulary
        if w in model:
            #Avoid repeating words
            if w not in words:
                words.append(w)
                wordsim.append(w)
                wordsim.append(model.similarity(w, 'hotel'))
                similarity.append(wordsim)
                wordsim = []
#Sort and get the 10 most similar words
similarity.sort(key = lambda x: x[1],reverse=True)
print(similarity[0:10])
#Let's add some restaurant- and shopping-related words as negative examples
most_similars = model.most_similar(positive=['hotel','inn','apartment','resort'], negative=['restaurant','cafe','shopping'])
most_similars
#Now let's try with some location-related words as negative examples
most_similars = model.most_similar(positive=['hotel','inn','apartment','resort'], negative=['downtown','oceanfront'])
most_similars
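For comparison, the plain neighbours of 'hotel' without any negative terms show what the negative words in the two queries above are pushing away:
#Baseline: nearest neighbours of 'hotel' with no negative terms
model.most_similar(positive=['hotel'], topn=10)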
from nltk.util import ngrams
#Search for nouns and adjective-noun patterns.
def is_np(candidate):
    #Accept a single token (string) or a bigram (tuple of tokens)
    tokens = list(candidate) if isinstance(candidate, (list, tuple)) else [candidate]
    tagged_tokens = nltk.pos_tag(tokens)
    if len(tagged_tokens) > 1:
        #Bigram: keep adjective + noun
        PoS_initial = tagged_tokens[0][1][:2]
        PoS_final = tagged_tokens[-1][1][:2]
        return PoS_initial == 'JJ' and PoS_final == 'NN'
    #Single token: keep nouns (NN, NNS, NNP, ... all start with 'NN')
    return tagged_tokens[0][1][:2] == 'NN'
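A quick sanity check of the filter on a couple of hand-picked tokens (purely illustrative; POS tagging of isolated words can occasionally differ):
#is_np keeps single nouns and adjective-noun bigrams
print(is_np('pool'))           #single noun -> expected True
print(is_np(('nice', 'room'))) #adjective + noun -> expected True
print(is_np(('walk', 'to')))   #not an adjective-noun pair -> expected False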
#Filter nouns and adjective-noun bigrams
bigrams = []
aspect_candidates = []
for t in texto:
    #Tokenize the (already lowercased) text
    tokens = word_tokenize(t)
    #Search for single-word candidates
    for tok in tokens:
        if is_np(tok):
            aspect_candidates.append(tok)
    #Search for bigram candidates
    bigrams = list(ngrams(tokens, 2))
    for b in bigrams:
        if is_np(b):
            aspect_candidates.append(b[0] + '_' + b[1])
aspect_candidates[1:5]
#Let's rank the candidates against a target synset with WordNet and keep the 50 best.
from nltk.corpus import wordnet as wn
def get_aspects(syn):
    target = wn.synset(syn)
    simvector = []
    cands = []
    for ac in aspect_candidates:
        candidates_synsets = wn.synsets(lemmatizer.lemmatize(ac), pos=wn.NOUN)
        for cs in candidates_synsets:
            #Keep candidates whose noun synset name matches the candidate itself
            if "'" + ac.replace(' ', '_') + ".n" in str(cs):
                wup = target.wup_similarity(wn.synset(lemmatizer.lemmatize(ac) + '.n.01'))
                simvector.append(lemmatizer.lemmatize(ac))
                simvector.append(wup)
                if simvector not in cands:
                    cands.append(simvector)
                simvector = []
    #Sort by Wu and Palmer similarity, highest first
    cands.sort(key=lambda x: x[1], reverse=True)
    return cands
candidates=get_aspects('hotel.n.01')
#Show the first 50 candidates and their Wu and Palmer similarity.
print(candidates[0:50])
#Repeat aspect selection with the word mansion.
candidates2=get_aspects('mansion.n.01')
print(candidates2[0:50])
#Let's repeat with the word oceanfront
candidates3=get_aspects('oceanfront.n.01')
print(candidates3[0:50])
Now let's cluster the candidates. We build two distance matrices:
- Aspect candidates (WordNet synsets), using the Wu and Palmer similarity.
- Target candidates (the Word2Vec terms), using cosine similarity.
#Create the distance matrix with the Wu and Palmer similarity
from numpy import matrix
def create_vector(synset, synsets_vocabulary):
    #Wu and Palmer score of the synset against every synset in the vocabulary
    vector = [synset.wup_similarity(s) for s in synsets_vocabulary]
    return vector
#Create a vector with the synsets
#of the 50 best aspect candidates obtained before
vcandidates = []
for c in candidates[0:50]:
    vcandidates.append(wn.synset(c[0] + '.n.01'))
#Create distance vector
vectors = [create_vector(v,vcandidates) for v in vcandidates]
X = matrix(vectors)
print(X)
from sklearn.cluster import KMeans
#Clustering. Create a helper function
def mykmeans(X, vcandidates):
    num_clusters = [3, 4, 5, 6, 7]
    labels = []
    clusters = []
    nclusters = []
    #Save the labels to draw the graphs later
    for nc in num_clusters:
        km = KMeans(n_clusters=nc, n_init=10)  #n_init to keep the results consistent
        km.fit(X)
        listlabels = km.labels_.tolist()
        labels.append(listlabels)
        #Save the clusters to build the list
        cluster = []
        for ind in range(0, len(vcandidates)):
            c = []
            c.append(listlabels[ind])
            c.append(vcandidates[ind])
            cluster.append(c)
        #Sort by cluster number
        cluster.sort(key=lambda x: x[0])
        nclusters.append(nc)
        nclusters.append(cluster)
        clusters.append(nclusters)
        nclusters = []
    return clusters, labels
labels_color_map = {
0: '#20b2aa', 1: '#ff7373', 2: '#ffe4e1', 3: '#005073', 4: '#4d0404', 5: '#E342D0', 6: '#35C20C', 7: '#F0FA16'
}
#Cluster list
clusters, labels = mykmeans(X, vcandidates)
for cl in clusters:
    print()
    print('Number of clusters: ' + str(cl[0]))
    print()
    print(cl[1])
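The choice between 3 and 7 clusters below is mostly visual; a more quantitative check is the silhouette score for each number of clusters tried in mykmeans (a sketch using scikit-learn; on such a small matrix it should be read only as a rough guide):
#Compare the cluster counts with the silhouette score
from sklearn.metrics import silhouette_score
import numpy as np
for nc, listlabels in zip([3, 4, 5, 6, 7], labels):
    score = silhouette_score(np.asarray(X), listlabels)
    print(str(nc) + ' clusters -> silhouette: ' + str(round(score, 3)))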
#3 clusters graph
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
reduced_data = PCA(n_components=2).fit_transform(X)
fig, ax = plt.subplots(figsize=(16, 10))
for index, instance in enumerate(reduced_data):
    pca_comp_1, pca_comp_2 = instance
    color = labels_color_map[labels[0][index]]
    ax.scatter(pca_comp_1, pca_comp_2, c=color)
    ax.annotate(str(vcandidates[index]), xy=(pca_comp_1, pca_comp_2))
plt.show()
#7 clusters graph
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
reduced_data = PCA(n_components=2).fit_transform(X)
fig, ax = plt.subplots(figsize=(16, 10))
for index, instance in enumerate(reduced_data):
    pca_comp_1, pca_comp_2 = instance
    color = labels_color_map[labels[4][index]]
    ax.scatter(pca_comp_1, pca_comp_2, c=color)
    ax.annotate(str(vcandidates[index]), xy=(pca_comp_1, pca_comp_2))
plt.show()
#Let's create the distance matrix using the cosine similarity
#of the target candidates.
from numpy import matrix
def create_vector(word, vocabulary):
    #Cosine similarity of the word against every word in the vocabulary
    vector = [model.similarity(word, s) for s in vocabulary]
    return vector
#Create a list with the 50 words
#most similar to 'hotel' obtained before
scandidates = []
for c in similarity[0:50]:
    scandidates.append(c[0])
#Create the distance vector
vectors2 = [create_vector(v, scandidates) for v in scandidates]
Xs = matrix(vectors2)
print(Xs)
#Let's use k-means to perform the clustering.
clusters2, labels2 = mykmeans(Xs, scandidates)
for cl in clusters2:
    print()
    print('Number of clusters: ' + str(cl[0]))
    print()
    print(cl[1])
#3 clusters graph
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
reduced_data = PCA(n_components=2).fit_transform(Xs)
fig, ax = plt.subplots(figsize=(16, 10))
for index, instance in enumerate(reduced_data):
    pca_comp_1, pca_comp_2 = instance
    color = labels_color_map[labels2[0][index]]
    ax.scatter(pca_comp_1, pca_comp_2, c=color)
    ax.annotate(str(scandidates[index]), xy=(pca_comp_1, pca_comp_2))
plt.show()
#7 clusters graph
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
reduced_data = PCA(n_components=2).fit_transform(Xs)
fig, ax = plt.subplots(figsize=(16, 10))
for index, instance in enumerate(reduced_data):
    pca_comp_1, pca_comp_2 = instance
    color = labels_color_map[labels2[4][index]]
    ax.scatter(pca_comp_1, pca_comp_2, c=color)
    ax.annotate(str(scandidates[index]), xy=(pca_comp_1, pca_comp_2))
plt.show()