Sentiment Analysis
Sentiment Analysis of British Airways
Customer Feedback
Data Overview
In [3]: # import the libraries and data
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
# set plot style
sns.set_theme()
In [4]: # import the dataset
# the relative path means reviewdata.csv must sit in the notebook's working directory
df = pd.read_csv("reviewdata.csv")
# print column names, non-null counts and dtypes to spot missing data
df.info()
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
#
Column
Non-Null Count
--- ------------------0
verify
2092 non-null
1
review
2475 non-null
2
date
2500 non-null
3
country
2500 non-null
4
seat_type
2500 non-null
5
recommended
2500 non-null
6
stars
2500 non-null
7
departure
2500 non-null
8
arrival
2500 non-null
9
type_of_traveller 2500 non-null
dtypes: int64(1), object(9)
memory usage: 195.4+ KB
In [5]: # drop empty rows in the review column
# subset=["review"] limits the null check to the review text;
# rows with nulls only in other columns (e.g. verify) are kept
df = df.dropna(subset = ["review"])
# re-print the summary to confirm the remaining row count
df.info()
Dtype
----object
object
object
object
object
object
int64
object
object
object
Index: 2475 entries, 0 to 2499
Data columns (total 10 columns):
#
Column
Non-Null Count
--- ------------------0
verify
2067 non-null
1
review
2475 non-null
2
date
2475 non-null
3
country
2475 non-null
4
seat_type
2475 non-null
5
recommended
2475 non-null
6
stars
2475 non-null
7
departure
2475 non-null
8
arrival
2475 non-null
9
type_of_traveller 2475 non-null
dtypes: int64(1), object(9)
memory usage: 212.7+ KB
Dtype
----object
object
object
object
object
object
int64
object
object
object
In [6]: df.head()
Out[6]:
0
1
2
3
4
verify
review
date
country
seat_type
recommended
stars
departure
Trip
Verified
I had the
most
fantastic
BA Flight
today. The
...
1st
August
2023
Hong
Kong
Business
CLas
Vegass
yes
5
Heathrow
La
Trip
Verified
Couldn't
book in
online.
Arrived at
check in
t...
31st
July
2023
United
Kingdom
Economy
CLas
Vegass
no
3
Rome
He
Trip
Verified
London
Heathrow
to
Mumbai
in a
Boeing
787-8 in...
31st
July
2023
Iceland
Business
CLas
Vegass
yes
3
Gatwick
Trip
Verified
Keflavík,
Iceland to
London
Heathrow
on an
A32...
31st
July
2023
Iceland
Business
CLas
Vegass
yes
5
London
Trip
Verified
Terrible
Experience
with
British
Airways. I
bo...
29th
July
2023
Canada
Economy
CLas
Vegass
no
5
Denver
He
In [7]: # correct the garbled class names in the seat_type column
df["seat_type"] = df["seat_type"].replace({"Business CLas Vegass": "Business Cla
df["seat_type"].unique()
# make the corrections in the review column also
df["review"] = df["review"].str.replace(r"(?i)clas vegass", "Class", regex=True)
Basic Data Exploration
In [9]: # Explore review by seat type
df["seat_type"].value_counts()
Out[9]:
seat_type
Economy Class
1351
Business Class
745
Premium Economy
245
First Class
134
Name: count, dtype: int64
In [10]: # a plot showing the seat types
plt.figure(figsize=(10, 6))
sns.countplot(y=df["seat_type"], order=df["seat_type"].value_counts().index, pal
plt.xlabel("Count")
plt.ylabel("Seat Class")
plt.title("Distribution of Seat Classes")
plt.show()
In [11]: #Explore data by traveller type
df["type_of_traveller"].value_counts()
Out[11]:
type_of_traveller
Couple Leisure
991
Family Leisure
499
Business
493
Solo Leisure
492
Name: count, dtype: int64
In [12]: # a plot showing the different types of travellers
plt.figure(figsize=(10, 6))
sns.countplot(y=df["type_of_traveller"], order=df["type_of_traveller"].value_cou
plt.xlabel("Count")
plt.ylabel("Type of Traveller")
plt.title("Distribution of Traveller Types")
plt.show()
In [13]: # count reviews per arrival destination (most frequent first)
df["arrival"].value_counts()
Out[13]:
arrival
Heathrow
LHR
Las Vegas
Luanda
Tampa
Sydney
Venice
CPT
Name: count,
-
dtype: int64
In [14]: # a plot showing the different destinations
plt.figure(figsize=(10, 6))
sns.countplot(y=df["arrival"], order=df["arrival"].value_counts().index, palette
plt.xlabel("Count")
plt.ylabel("Different Destinations")
plt.title("Distribution of The Most Frequent Destinations")
plt.show()
Data Preparation for Key Topic Extraction and
Sentiment Analysis
In [16]: ## Preprocess text for analysis
from nltk.corpus import stopwords
# fetch the NLTK stopword corpus; NLTK only prints a status message when it is already present
nltk.download("stopwords")
# keep the English stopwords in a set; preprocess_text tests each word against it
stop_words = set(stopwords.words("english"))
[nltk_data] Downloading package stopwords to
[nltk_data]
C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]
Package stopwords is already up-to-date!
In [17]: # Create a preprocess function
def preprocess_text(text, stopword_set=None):
    """Normalise one review for bag-of-words modelling.

    Every non-word character is replaced by a space, the text is
    lowercased, and stopwords are removed. The surviving tokens are
    joined with single spaces.

    Parameters
    ----------
    text : object
        Raw review. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN) yield an empty string.
    stopword_set : collection of str, optional
        Words to discard. When omitted, the notebook-level ``stop_words``
        set is used, which matches the original behaviour.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    # Missing reviews would otherwise be stringified to "none"/"nan" and
    # enter the corpus as if they were real tokens.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words  # notebook-level set of NLTK English stopwords
    text = re.sub(r'\W', ' ', str(text))  # replace special characters with spaces
    text = text.lower()  # lowercase so tokens match the lowercase stopword list
    return " ".join(word for word in text.split() if word not in stopword_set)
In [18]: # add a cleaned_review column holding the preprocessed review text (the original review column is kept)
df["cleaned_review"] = df["review"].apply(preprocess_text)
Key Topic Extraction
In [20]: # import vectorizer and LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
In [21]: # converting text to document-term matrix
# max_df=0.9 ignores terms that appear in more than 90% of the reviews;
# min_df=2 ignores terms that appear in fewer than 2 reviews
# NOTE(review): stop_words='english' applies scikit-learn's built-in list on top of
# the NLTK stopwords already removed in preprocess_text — confirm both are wanted
vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(df["cleaned_review"])
In [22]: # LDA for topic extraction
# 5 topics; random_state is fixed so re-running the cell reproduces the same topics
lda = LatentDirichletAllocation(n_components=5, random_state=42)
# fit() returns the estimator, so the cell displays its repr
lda.fit(doc_term_matrix)
Out[22]: ▾
LatentDirichletAllocation
LatentDirichletAllocation(n_components=5, random_state=42)
In [23]: # Display top words for each topic
# Print the ten highest-weighted terms of every fitted topic.
# argsort() is ascending, so each list runs from the 10th-ranked term
# up to the top-ranked term, which appears last.
words = vectorizer.get_feature_names_out()
for topic_no, weights in enumerate(lda.components_, start=1):
    top_terms = [words[idx] for idx in weights.argsort()[-10:]]
    print(f"Topic {topic_no}: ", top_terms)
Topic 1: ['london', 'cabin', 'class', 'good', 'seat', 'crew', 'service', 'food',
'ba', 'flight']
Topic 2: ['booked', 'hours', 'told', 'service', 'customer', 'airways', 'britis
h', 'london', 'ba', 'flight']
Topic 3: ['london', 'customer', 'pay', 'airways', 'british', 'heathrow', 'servic
e', 'seat', 'ba', 'airline']
Topic 4: ['baggage', 'london', 'seats', 'ba', 'staff', 'bag', 'flight', 'boardin
g', 'luggage', 'check']
Topic 5: ['got', 'said', 'plane', 'flight', 'seats', 'luggage', 'asked', 'londo
n', 'ba', 'staff']
In [24]: ## Display Topics in a Word Cloud
In [25]: from wordcloud import WordCloud
In [26]: # Loop through each topic
for i, topic in enumerate(lda.components_):
# Create a dictionary of words and their importance
word_freq = {words[j]: topic[j] for j in topic.argsort()[-20:]}
# Top 20 wo
# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').gener
# Plot the word cloud
plt.figure(figsize=(8, 4))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(f"Topic {i+1}")
plt.show()
Topics by Arrival Destination
In [28]: # Convert reviews into a document-term matrix using the fitted vectorizer
review_matrix = vectorizer.transform(df['cleaned_review'])
# Predict topics for each review
topic_probabilities = lda.transform(review_matrix)
df['dominant_topic'] = topic_probabilities.argmax(axis=1)
# Count topics per destination
topic_counts = df.groupby(['arrival', 'dominant_topic']).size().reset_index(name
# Pivot table for heatmap
pivot_table = topic_counts.pivot(index='arrival', columns='dominant_topic', valu
# Plot heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap="Blues", annot=True, fmt=".0f")
plt.title("Topic Distribution by Arrival Destination")
plt.xlabel("Topic")
plt.ylabel("Arrival Destination")
plt.show()
Topics by Seat Class
In [30]: # Count topics per seat class
topic_counts = df.groupby(['seat_type', 'dominant_topic']).size().reset_index(na
# Pivot table for heatmap
pivot_table = topic_counts.pivot(index='seat_type', columns='dominant_topic', va
# Plot heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap="Blues", annot=True, fmt=".0f")
plt.title("Topic Distribution by Seat Class")
plt.xlabel("Topic")
plt.ylabel("Seat Class")
plt.show()
In [ ]: