Data Visualization | Storytelling
Data Analysis of Supermarket Data
Importing the dependencies
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Hard-coded absolute Windows path to the supermarket sales CSV.
csv_path = "C:/Users/Collins PC/Downloads/projects/supermarket_sales/supermarket_sales.csv"

# Read the CSV into the DataFrame used by the rest of the notebook.
data = pd.read_csv(csv_path)

# Trailing expression so the notebook renders the frame as the cell output.
data
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Total
Date
Time Payment
-
-
1/5/2019 13:08
3.8200
80.2200
3/8/2019 10:29
cogs
0
-
A
Yangon
Member
Female
1
-
C Naypyitaw
Normal
Female
2
-
A
Yangon
Normal
Male
Home and
46.33
lifestyle
-
-
3/3/2019 13:23
Credit
324.31
card
3
-
A
Yangon
Member
Male
Health and
58.22
beauty
-
-/27/2019 20:33
Ewallet 465.76
4
-
A
Yangon
Normal
Male
Sports and
86.31
travel
-
-
Ewallet 604.17
...
...
...
...
...
...
995
-
C Naypyitaw
Normal
996
-
B
Mandalay
997
-
A
998
-
999
-
Electronic
accessories
...
15.28
5
...
2/8/2019 10:37
Cash
...
...
-/29/2019 13:46
Ewallet
76.40
...
...
Male
Health and
40.35
beauty
1
2.0175
Normal
Female
Home and
97.38
lifestyle
Yangon
Member
Male
Food and
31.84
beverages
1
1.5920
33.4320
2/9/2019 13:22
Cash
31.84
A
Yangon
Normal
Male
Home and
65.82
lifestyle
1
3.2910
-/22/2019 15:33
Cash
65.82
A
Yangon
Member
Female
Fashion
88.34
accessories
-
-/18/2019 13:28
-
...
Ewallet 522.83
...
1000 rows × 17 columns
# Print the frame's structural summary (index range, columns, non-null counts, dtypes).
data.info()
Health and
74.69
beauty
Tax 5%
3/2/2019 17:16
40.35
Ewallet 973.80
Cash 618.38
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
#
Column
Non-Null Count
--- ------------------0
Invoice ID
1000 non-null
1
Branch
1000 non-null
2
City
1000 non-null
3
Customer type
1000 non-null
4
Gender
1000 non-null
5
Product line
1000 non-null
6
Unit price
1000 non-null
7
Quantity
1000 non-null
8
Tax 5%
1000 non-null
9
Total
1000 non-null
10 Date
1000 non-null
11 Time
1000 non-null
12 Payment
1000 non-null
13 cogs
1000 non-null
14 gross margin percentage 1000 non-null
15 gross income
1000 non-null
16 Rating
1000 non-null
dtypes: float64(7), int64(1), object(9)
memory usage: 132.9+ KB
Dtype
----object
object
object
object
object
object
float64
int64
float64
float64
object
object
object
float64
float64
float64
float64
# Count the missing (null) values in every column.
data.isnull().sum()
Invoice ID
Branch
City
Customer type
Gender
Product line
Unit price
Quantity
Tax 5%
Total
Date
Time
Payment
cogs
gross margin percentage
gross income
Rating
dtype: int64
-
Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's white-grid theme for the charts that follow.
sns.set(style="whitegrid")

# Tally how many rows belong to each distinct 'City' value.
city_category_counts = data['City'].value_counts()

# One bar per city on a 10x6-inch canvas; keep the Axes for labelling.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=city_category_counts.index,
    y=city_category_counts.values,
    palette='coolwarm',
)

# Title and axis captions.
ax.set_title('City Distribution', fontsize=16)
ax.set_xlabel('City', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Slant the category labels, then tighten the layout so nothing is clipped.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Render the figure.
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's white-grid theme for the charts that follow.
sns.set(style="whitegrid")

# Tally how many rows belong to each distinct 'Customer type' value.
customer_category_counts = data['Customer type'].value_counts()

# One bar per customer type on a 10x6-inch canvas; keep the Axes for labelling.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=customer_category_counts.index,
    y=customer_category_counts.values,
    palette='coolwarm',
)

# Title and axis captions.
ax.set_title('Customer Distribution', fontsize=16)
ax.set_xlabel('Customer Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Slant the category labels, then tighten the layout so nothing is clipped.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Render the figure.
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's white-grid theme for the charts that follow.
sns.set(style="whitegrid")

# Tally how many rows belong to each distinct 'Payment' value.
payment_category_counts = data['Payment'].value_counts()

# One bar per payment method on a 10x6-inch canvas; keep the Axes for labelling.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=payment_category_counts.index,
    y=payment_category_counts.values,
    palette='coolwarm',
)

# Title and axis captions.
ax.set_title('Payment Distribution', fontsize=16)
ax.set_xlabel('Payment Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Slant the category labels, then tighten the layout so nothing is clipped.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Render the figure.
plt.show()
sns.set(style="whitegrid")

# Reversed 'Blues' palette, reused by the later charts in this notebook.
blue_palette = sns.color_palette('Blues_r')

# Visualization 1: number of rows per 'Gender' value.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=data, x='Gender', palette=blue_palette)
ax.set(title='Count of Customers by Gender', xlabel='Gender', ylabel='Count')
plt.show()
# Visualization 2: 'Total' summed within each branch (estimator=sum).
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=data,
    x='Branch',
    y='Total',
    estimator=sum,
    palette=blue_palette,
)
ax.set(title='Total Sales by Branch', xlabel='Branch', ylabel='Total Sales')
plt.show()
# Visualization 3: box plot of 'Total' for every product line.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(
    data=data,
    x='Product line',
    y='Total',
    palette=blue_palette,
)
ax.set(
    title='Distribution of Product Line Sales',
    xlabel='Product Line',
    ylabel='Total Sales',
)
# Tilt the product-line names on the x-axis.
plt.xticks(rotation=45)
plt.show()
# Visualization 4: 20-bin histogram of 'Total' with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
ax = sns.histplot(data['Total'], bins=20, kde=True, color='blue')
ax.set(title='Total Sales Distribution', xlabel='Total Sales', ylabel='Frequency')
plt.show()
# Visualization: 'gross income' summed within each product line (estimator=sum).
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=data,
    x='Product line',
    y='gross income',
    estimator=sum,
    palette=blue_palette,
)
ax.set(title='Gross Income Per Product', xlabel='Product', ylabel='Gross Income')
# Slant the product-line names and tighten the layout before rendering.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Display the full frame again as the cell output.
data
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Health and
74.69
beauty
Tax 5%
Total
Date
Time Payment
-
-
1/5/2019 13:08
3.8200
80.2200
3/8/2019 10:29
cogs
0
-
A
Yangon
Member
Female
1
-
C Naypyitaw
Normal
Female
2
-
A
Yangon
Normal
Male
Home and
46.33
lifestyle
-
-
3/3/2019 13:23
Credit
324.31
card
3
-
A
Yangon
Member
Male
Health and
58.22
beauty
-
-/27/2019 20:33
Ewallet 465.76
4
-
A
Yangon
Normal
Male
Sports and
86.31
travel
-
-
Ewallet 604.17
...
...
...
...
...
...
995
-
C Naypyitaw
Normal
996
-
B
Mandalay
997
-
A
998
-
999
-
Electronic
accessories
5
...
...
...
Male
Health and
40.35
beauty
1
2.0175
-/29/2019 13:46
Ewallet
Normal
Female
Home and
97.38
lifestyle
Yangon
Member
Male
Food and
31.84
beverages
1
1.5920
33.4320
2/9/2019 13:22
Cash
31.84
A
Yangon
Normal
Male
Home and
65.82
lifestyle
1
3.2910
-/22/2019 15:33
Cash
65.82
A
Yangon
Member
Female
Fashion
88.34
accessories
-
-/18/2019 13:28
-
...
76.40
...
# Count how many rows share each distinct 'Time' value.
data['Time'].value_counts()
...
2/8/2019 10:37
Cash
...
1000 rows × 17 columns
...
15.28
Ewallet 522.83
3/2/2019 17:16
40.35
Ewallet 973.80
Cash 618.38
Time
19:48
14:42
17:38
17:16
11:40
13:26
11:17
14:57
17:53
19:12
Name:
7
7
6
5
5
..
1
1
1
1
1
count, Length: 506, dtype: int64
# Count how many rows fall under each 'Product line' value.
data['Product line'].value_counts()
Product line
Fashion accessories
Food and beverages
Electronic accessories
Sports and travel
Home and lifestyle
Health and beauty
Name: count, dtype: int64
-
import matplotlib.pyplot as plt
# Row count per product line, kept in `product_line_counts`.
product_line_counts = data['Product line'].value_counts()

# Horizontal bar chart of those counts on a 10x8-inch canvas.
plt.figure(figsize=(10, 8))
ax = product_line_counts.plot.barh(color='skyblue')

# Axis captions and title.
ax.set_xlabel('Number of Products')
ax.set_ylabel('Product Line')
ax.set_title('Frequency of Product Lines')

# Tighten the layout and render.
plt.tight_layout()
plt.show()
Product Line Analysis
Health and beauty
# Keep only the rows whose 'Product line' is "Health and beauty".
is_health = data['Product line'] == "Health and beauty"
data_health = data[is_health]
data_health
Invoice
Branch
ID
City
Customer
Product
Gender
type
line
Unit
Quantity
price
Tax 5%
Total
Date
Time Payment
cogs
0
-
A
Yangon
Member
Female
Health
and 74.69
beauty
-
1/5/2019 13:08
Ewallet 522.83
3
-
A
Yangon
Member
Male
Health
and 58.22
beauty
-/27/2019 20:33
Ewallet 465.76
8
-
A
Yangon
Member
Female
Health
and 36.26
beauty
2
14
-
A
Yangon
Normal
Female
Health
and 71.38
beauty
-/29/2019 19:21
Cash 713.80
16
-
A
Yangon
Member
Female
Health
and 68.93
beauty
-/11/2019 11:03
Credit
482.51
card
...
...
...
...
...
...
983
-
C Naypyitaw
Normal
Male
Health
and 99.96
beauty
-/23/2019 10:33
986
-
B
Mandalay
Normal
Female
Health
and 14.76
beauty
2
987
-
B
Mandalay
Member
Male
Health
and 62.00
beauty
-
1/3/2019 19:08
Credit
496.00
card
989
-
B
Mandalay
Member
Male
Health
and 75.37
beauty
-/28/2019 15:46
Credit
602.96
card
995
-
C Naypyitaw
Normal
Male
Health
and 40.35
beauty
1
...
...
3.6260
...
...
1.4760
2.0175
152 rows × 17 columns
import plotly.offline as pyo
import plotly.express as px
# Put plotly's offline module into notebook mode before drawing.
pyo.init_notebook_mode()

# Box plot of the 'Rating' values in the Health and beauty subset.
health_ratings = data_health['Rating']
fig = px.box(
    x=health_ratings,
    title="5-Number-Summary Plot of Ratings (Health & Beauty)",
    labels={"x": "Rating"},
)

# Render the figure inline.
pyo.iplot(fig)
-/10/2019 17:15
...
...
...
-/18/2019 14:42
-/29/2019 13:46
Credit
card
...
72.52
...
Cash 699.72
Ewallet
Ewallet
29.52
40.35
5-Number-Summary Plot of Ratings (Health & Beauty)
4
5
6
7
8
9
Rating
# Visualization: rows per 'Gender' within the Health and beauty subset.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=data_health, x='Gender', palette=blue_palette)
ax.set(
    title='Number of Health and beauty Customers by Gender',
    xlabel='Gender',
    ylabel='Count',
)
plt.show()
Fashion accessories
# Keep only the rows whose 'Product line' is "Fashion accessories".
is_fashion = data['Product line'] == "Fashion accessories"
data_fashion = data[is_fashion]
data_fashion
10
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Tax 5%
Total
Date
Time Payment
cogs
10
-
B
Mandalay
Member
Female
Fashion
14.48
accessories
4
2.8960
60.8160
2/6/2019 18:07
Ewallet
57.92
26
-
B
Mandalay
Normal
Male
Fashion
33.52
accessories
1
1.6760
35.1960
2/8/2019 15:31
Cash
33.52
27
-
A
Yangon
Normal
Female
Fashion
87.67
accessories
2
-/10/2019 12:17
Credit
175.34
card
30
-
B
Mandalay
Normal
Male
Fashion
94.13
accessories
-/25/2019 19:39
Credit
470.65
card
49
-
C Naypyitaw
Member
Female
Fashion
82.63
accessories
-/19/2019 17:08
Ewallet 826.30
...
...
...
...
...
974
-
C Naypyitaw
Normal
975
-
B
Mandalay
985
-
B
993
-
999
-
...
...
...
...
Male
Fashion
86.13
accessories
2
-
2/7/2019 17:59
Member
Male
Fashion
49.92
accessories
2
-
3/6/2019 11:55
Mandalay
Normal
Female
Fashion
63.71
accessories
-
2/7/2019 19:30
Ewallet 318.55
B
Mandalay
Normal
Male
Fashion
17.49
accessories
-/22/2019 18:35
Ewallet 174.90
A
Yangon
Member
Female
Fashion
88.34
accessories
-/18/2019 13:28
Cash 618.38
10
...
178 rows × 17 columns
import plotly.offline as pyo
import plotly.express as px
# Put plotly's offline module into notebook mode before drawing.
pyo.init_notebook_mode()

# Box plot of the 'Rating' values in the Fashion accessories subset.
fashion_ratings = data_fashion['Rating']
fig = px.box(
    x=fashion_ratings,
    title="5-Number-Summary Plot of Ratings (Fashion accessories)",
    labels={"x": "Rating"},
)

# Render the figure inline.
pyo.iplot(fig)
# Visualization: Count of customers by gender
plt.figure(figsize=(10, 6))
...
...
...
...
...
Cash 172.26
Credit
card
99.84
# Rows per 'Gender' within the Fashion accessories subset.
ax = sns.countplot(data=data_fashion, x='Gender', palette=blue_palette)
ax.set(
    title='Number of Fashion accessories Customers by Gender',
    xlabel='Gender',
    ylabel='Count',
)
plt.show()
Sports and travel
# Keep only the rows whose 'Product line' is "Sports and travel".
is_sports = data['Product line'] == "Sports and travel"
data_sports = data[is_sports]
data_sports
Invoice
Branch
ID
City
Customer
Product
Gender
type
line
Unit
Quantity
price
Tax 5%
Total
Date
Time Payment
cogs
perc
4
-
A
Yangon
Normal
Male
Sports
and 86.31
travel
-
2/8/2019 10:37
Ewallet 604.17
15
-
B Mandalay
Member
Female
Sports
and 93.72
travel
-/15/2019 16:19
Cash 562.32
17
-
A
Yangon
Normal
Male
Sports
and 72.61
travel
-
1/1/2019 10:39
Credit
435.66
card
24
-
A
Yangon
Member
Male
Sports
and 88.63
travel
-
3/2/2019 17:36
Ewallet 265.89
31
-
B Mandalay
Member
Male
Sports
and 78.07
travel
-/28/2019 12:43
Cash 702.63
...
...
...
...
...
926
-
B Mandalay
Member
929
-
B Mandalay
937
-
A
982
-
A
991
-
...
...
...
...
...
...
...
...
Male
Sports
and 88.31
travel
1
4.4155
-/15/2019 17:38
Credit
card
88.31
Normal
Male
Sports
and 25.31
travel
2
2.5310
53.1510
Ewallet
50.62
Yangon
Normal
Female
Sports
and 89.48
travel
-/30/2019 10:18
Cash 447.40
Yangon
Member
Female
Sports
and 97.48
travel
-/14/2019 14:19
Ewallet 877.32
B Mandalay
Normal
Female
Sports
and 76.60
travel
-/24/2019 18:10
Ewallet 766.00
166 rows × 17 columns
import plotly.offline as pyo
import plotly.express as px
# Put plotly's offline module into notebook mode before drawing.
pyo.init_notebook_mode()

# Box plot of the 'Rating' values in the Sports and travel subset.
sports_ratings = data_sports['Rating']
fig = px.box(
    x=sports_ratings,
    title="5-Number-Summary Plot of Ratings (Sports and travel)",
    labels={"x": "Rating"},
)

# Render the figure inline.
pyo.iplot(fig)
...
...
3/2/2019 19:26
# Visualization: rows per 'Gender' within the Sports and travel subset.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=data_sports, x='Gender', palette=blue_palette)
ax.set(
    title='Number of Sports and travel Customers by Gender',
    xlabel='Gender',
    ylabel='Count',
)
plt.show()
Food and beverages
# Keep only the rows whose 'Product line' is "Food and beverages".
is_food = data['Product line'] == "Food and beverages"
data_food = data[is_food]
data_food
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
9
-
B
Mandalay
Member
Female
Food and
54.84
beverages
13
-
A
Yangon
Normal
Male
Food and
43.19
beverages
18
-
A
Yangon
Normal
Male
Food and
54.67
beverages
3
28
-
B
Mandalay
Normal
Female
34
-
C Naypyitaw
Member
Female
...
...
...
...
...
...
977
-
B
Mandalay
Member
979
-
B
Mandalay
980
-
C Naypyitaw
990
-
A
997
-
A
3
Tax 5%
Total
Date
Time Payment
-/20/2019 13:27
Credit
164.52
card
2/7/2019 16:48
Ewallet 431.90
-/21/2019 18:00
Credit
164.01
card
Food and
88.36
beverages
-/25/2019 19:48
Cash 441.80
Food and
99.42
beverages
-
...
-
cogs
...
...
Male
Food and
26.60
beverages
6
-/26/2019 15:10
Normal
Female
Food and
67.77
beverages
1
3.3885
Member
Male
Food and
59.59
beverages
-/19/2019 12:46
Cash 238.36
Yangon
Normal
Female
Food and
56.56
beverages
-/22/2019 19:06
Credit
282.80
card
Yangon
Member
Male
Food and
31.84
beverages
1
174 rows × 17 columns
import plotly.offline as pyo
import plotly.express as px
# Put plotly's offline module into notebook mode before drawing.
pyo.init_notebook_mode()

# Box plot of the 'Rating' values in the Food and beverages subset.
food_ratings = data_food['Rating']
fig = px.box(
    x=food_ratings,
    title="5-Number-Summary Plot of Ratings (Food and beverages)",
    labels={"x": "Rating"},
)

# Render the figure inline.
pyo.iplot(fig)
# Visualization: Count of customers by gender
plt.figure(figsize=(10, 6))
71.1585
33.4320
...
...
Ewallet 397.68
...
1.5920
...
2/6/2019 10:42
2/4/2019 20:43
2/9/2019 13:22
...
...
Ewallet 159.60
Credit
card
Cash
67.77
31.84
# Rows per 'Gender' within the Food and beverages subset.
ax = sns.countplot(data=data_food, x='Gender', palette=blue_palette)
ax.set(
    title='Number of Food and beverages Customers by Gender',
    xlabel='Gender',
    ylabel='Count',
)
plt.show()
Home and lifestyle
# Keep only the rows whose 'Product line' is "Home and lifestyle".
is_home = data['Product line'] == "Home and lifestyle"
data_home = data[is_home]
data_home
Invoice
Branch
ID
City
Customer
Product
Gender
type
line
Unit
Quantity
price
Tax 5%
Total
-
Date
Time Payment
cogs
2
-
A
Yangon
Normal
Male
Home
and 46.33
lifestyle
-
7
-
C Naypyitaw
Normal
Female
Home
and 73.56
lifestyle
-
19
-
B
Mandalay
Normal
Female
Home
and 40.30
lifestyle
2
4.0300
-/11/2019 15:30
Ewallet
80.60
22
-
B
Mandalay
Normal
Male
Home
and 33.20
lifestyle
2
3.3200
-/15/2019 12:20
Credit
card
66.40
25
-
A
Yangon
Member
Female
Home
and 52.59
lifestyle
-
-/22/2019 19:20
...
...
...
...
...
...
967
-
A
Yangon
Member
Male
Home
and 81.01
lifestyle
-
-/13/2019 12:55
Credit
243.03
card
971
-
B
Mandalay
Member
Male
Home
and 36.91
lifestyle
-
-/10/2019 13:51
Ewallet 258.37
973
-
A
Yangon
Normal
Male
Home
and 80.08
lifestyle
-
-/11/2019 15:29
Cash 240.24
996
-
B
Mandalay
Normal
Female
Home
and 97.38
lifestyle
998
-
A
Yangon
Normal
Male
Home
and 65.82
lifestyle
...
...
...
...
Credit
324.31
card
-/24/2019 11:38
Ewallet 735.60
...
-
1
3.2910
160 rows × 17 columns
import plotly.offline as pyo
import plotly.express as px
# Put plotly's offline module into notebook mode before drawing.
pyo.init_notebook_mode()

# Box plot of the 'Rating' values in the Home and lifestyle subset.
home_ratings = data_home['Rating']
fig = px.box(
    x=home_ratings,
    title="5-Number-Summary Plot of Ratings (Home and lifestyle)",
    labels={"x": "Rating"},
)

# Render the figure inline.
pyo.iplot(fig)
3/3/2019 13:23
...
...
3/2/2019 17:16
-/22/2019 15:33
Credit
420.72
card
...
...
Ewallet 973.80
Cash
65.82
# Visualization: rows per 'Gender' within the Home and lifestyle subset.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=data_home, x='Gender', palette=blue_palette)
ax.set(
    title='Number of Home and lifestyle Customers by Gender',
    xlabel='Gender',
    ylabel='Count',
)
plt.show()
Electronic accessories
# Keep only the rows whose 'Product line' is "Electronic accessories".
is_electronics = data['Product line'] == "Electronic accessories"
data_electronics = data[is_electronics]
data_electronics
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Tax 5%
Total
3.8200
80.2200
Date
Time Payment
cogs
1
-
C Naypyitaw
Normal
Female
Electronic
accessories
15.28
5
5
-
C Naypyitaw
Normal
Male
Electronic
accessories
85.39
-/25/2019 18:30
Ewallet 597.73
6
-
A
Yangon
Member
Female
Electronic
accessories
68.84
-/25/2019 14:36
Ewallet 413.04
11
-
B
Mandalay
Member
Male
Electronic
accessories
25.51
4
3/9/2019 17:03
Cash 102.04
12
-
A
Yangon
Normal
Female
Electronic
accessories
46.95
-/12/2019 10:25
Ewallet 234.75
...
...
...
...
...
...
...
...
...
...
978
-
B
Mandalay
Normal
Female
Electronic
accessories
25.45
1
1.2725
984
-
C Naypyitaw
Normal
Male
Electronic
accessories
96.37
-
988
-
C Naypyitaw
Member
Male
Electronic
accessories
82.34
992
-
A
Yangon
Normal
Male
Electronic
accessories
58.03
2
-/10/2019 20:46
Ewallet 116.06
994
-
C Naypyitaw
Member
Female
Electronic
accessories
60.95
1
3.0475
Ewallet
-
...
76.40
...
...
-/10/2019 18:10
Credit
card
25.45
1/9/2019 11:40
Cash 674.59
-/29/2019 19:12
Ewallet 823.40
-/18/2019 11:40
import plotly.offline as pyo
import plotly.express as px
# Put plotly's offline module into notebook mode before drawing.
pyo.init_notebook_mode()

# Box plot of the 'Rating' values in the Electronic accessories subset.
electronics_ratings = data_electronics['Rating']
fig = px.box(
    x=electronics_ratings,
    title="5-Number-Summary Plot of Ratings (Electronic accessories)",
    labels={"x": "Rating"},
)
# Visualization: Count of customers by gender
plt.figure(figsize=(10, 6))
...
Cash
...
170 rows × 17 columns
# Display the plot in the notebook
pyo.iplot(fig)
3/8/2019 10:29
60.95
# Rows per 'Gender' within the Electronic accessories subset.
ax = sns.countplot(data=data_electronics, x='Gender', palette=blue_palette)
ax.set(
    title='Number of Electronic accessories Customers by Gender',
    xlabel='Gender',
    ylabel='Count',
)
plt.show()
Branches
A
# Keep only the rows recorded for branch "A".
in_branch_a = data['Branch'] == "A"
branch_A = data[in_branch_a]
branch_A
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Tax 5%
Total
Date
Time Payment
cogs
0
-
A Yangon
Member
Female
Health and
74.69
beauty
-
1/5/2019 13:08
Ewallet 522.83
2
-
A Yangon
Normal
Male
Home and
46.33
lifestyle
-
3/3/2019 13:23
Credit
324.31
card
3
-
A Yangon
Member
Male
Health and
58.22
beauty
-/27/2019 20:33
Ewallet 465.76
4
-
A Yangon
Normal
Male
Sports and
86.31
travel
-
2/8/2019 10:37
Ewallet 604.17
6
-
A Yangon
Member
Female
Electronic
68.84
accessories
-/25/2019 14:36
Ewallet 413.04
...
...
...
...
...
990
-
A Yangon
Normal
Female
Food and
56.56
beverages
992
-
A Yangon
Normal
Male
Electronic
58.03
accessories
2
-/10/2019 20:46
997
-
A Yangon
Member
Male
Food and
31.84
beverages
1
1.5920
33.4320
2/9/2019 13:22
Cash
31.84
998
-
A Yangon
Normal
Male
Home and
65.82
lifestyle
1
3.2910
-/22/2019 15:33
Cash
65.82
999
-
A Yangon
Member
Female
Fashion
88.34
accessories
...
...
...
...
...
...
...
...
-/22/2019 19:06
-/18/2019 13:28
...
...
Credit
282.80
card
Ewallet 116.06
Cash 618.38
340 rows × 17 columns
# Visualization: 'gross income' summed per product line, Branch A rows only.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=branch_A,
    x='Product line',
    y='gross income',
    estimator=sum,
    palette=blue_palette,
)
ax.set(
    title='Gross Income Per Product Branch A',
    xlabel='Product',
    ylabel='Gross Income',
)
# Slant the product-line names and tighten the layout before rendering.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Bar chart of 'Rating' by 'Product line' for Branch A, coloured by 'Gender';
# 'gross income' is added to the hover data.
# NOTE(review): the source line was cut off after `height = 400`, leaving the
# call unclosed. It is closed here as-is — confirm no trailing arguments
# (e.g. a title) were lost.
fig = px.bar(
    branch_A,
    x='Product line',
    y='Rating',
    hover_data=['gross income'],
    color='Gender',
    height=400,
)
fig.show()
import matplotlib.pyplot as plt
# Row count per product line within Branch A, kept in `product_line_counts`.
product_line_counts = branch_A['Product line'].value_counts()

# Horizontal bar chart of those counts on a 10x8-inch canvas.
plt.figure(figsize=(10, 8))
ax = product_line_counts.plot.barh(color='skyblue')

# Axis captions and title.
ax.set_xlabel('Number of Products Branch A')
ax.set_ylabel('Product Line')
ax.set_title('Frequency of Product Lines')

# Tighten the layout and render.
plt.tight_layout()
plt.show()
B
# Keep only the rows recorded for branch "B".
in_branch_b = data['Branch'] == "B"
branch_B = data[in_branch_b]
branch_B
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Tax
5%
Total
Date
Time Payment
cogs
9
-
B Mandalay
Member
Female
Food and
54.84
beverages
3
8.226
10
-
B Mandalay
Member
Female
Fashion
14.48
accessories
4
2.896
60.816
2/6/2019 18:07
11
-
B Mandalay
Member
Male
Electronic
accessories
4
5.102
107.142
3/9/2019 17:03
Cash 102.04
15
-
B Mandalay
Member
Female
Sports and
93.72
travel
6 28.116
-/15/2019 16:19
Cash 562.32
19
-
B Mandalay
Normal
Female
Home and
40.30
lifestyle
2
4.030
...
...
...
...
...
...
...
...
...
987
-
B Mandalay
Member
Male
Health and
62.00
beauty
8 24.800
520.800
1/3/2019 19:08
Credit
496.00
card
989
-
B Mandalay
Member
Male
Health and
75.37
beauty
8 30.148
-/28/2019 15:46
Credit
602.96
card
991
-
B Mandalay
Normal
Female
Sports and
76.60
travel
-
-/24/2019 18:10
Ewallet 766.00
993
-
B Mandalay
Normal
Male
Fashion
17.49
accessories
10
-/22/2019 18:35
Ewallet 174.90
996
-
B Mandalay
Normal
Female
Home and
97.38
lifestyle
...
332 rows × 17 columns
25.51
...
8.745
-/20/2019 13:27
84.630 3/11/2019 15:30
-
...
...
3/2/2019 17:16
Credit
164.52
card
Ewallet
57.92
Ewallet
80.60
...
...
Ewallet 973.80
# Visualization: 'gross income' summed per product line, Branch B rows only.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=branch_B,
    x='Product line',
    y='gross income',
    estimator=sum,
    palette=blue_palette,
)
ax.set(
    title='Gross Income Per Product Branch B',
    xlabel='Product',
    ylabel='Gross Income',
)
# Slant the product-line names and tighten the layout before rendering.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Bar chart of 'Rating' by 'Product line' for Branch B, coloured by 'Gender';
# 'gross income' is added to the hover data.
# NOTE(review): the source line was cut off after `height = 400`, leaving the
# call unclosed. It is closed here as-is — confirm no trailing arguments
# (e.g. a title) were lost.
fig = px.bar(
    branch_B,
    x='Product line',
    y='Rating',
    hover_data=['gross income'],
    color='Gender',
    height=400,
)
fig.show()
import matplotlib.pyplot as plt
# Row count per product line within Branch B, kept in `product_line_counts`.
product_line_counts = branch_B['Product line'].value_counts()

# Horizontal bar chart of those counts on a 10x8-inch canvas.
plt.figure(figsize=(10, 8))
ax = product_line_counts.plot.barh(color='skyblue')

# Axis captions and title.
ax.set_xlabel('Number of Products Branch B')
ax.set_ylabel('Product Line')
ax.set_title('Frequency of Product Lines')

# Tighten the layout and render.
plt.tight_layout()
plt.show()
C
# Keep only the rows recorded for branch "C".
in_branch_c = data['Branch'] == "C"
branch_C = data[in_branch_c]
branch_C
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Tax 5%
Total
3.8200
80.2200
Date
Time Payment
cogs
1
-
C Naypyitaw
Normal
Female
Electronic
accessories
15.28
5
5
-
C Naypyitaw
Normal
Male
Electronic
accessories
85.39
-/25/2019 18:30
Ewallet 597.73
7
-
C Naypyitaw
Normal
Female
Home and
73.56
lifestyle
-/24/2019 11:38
Ewallet 735.60
20
-
C Naypyitaw
Member
Male
-/25/2019 11:24
Ewallet 430.20
34
-
C Naypyitaw
Member
Female
-
Ewallet 397.68
...
...
...
...
...
983
-
C Naypyitaw
Normal
Male
984
-
C Naypyitaw
Normal
Male
Electronic
accessories
96.37
988
-
C Naypyitaw
Member
Male
Electronic
accessories
82.34
994
-
C Naypyitaw
Member
Female
Electronic
accessories
60.95
1
3.0475
-/18/2019 11:40
Ewallet
60.95
995
-
C Naypyitaw
Normal
Male
Health and
40.35
beauty
1
2.0175
-/29/2019 13:46
Ewallet
40.35
...
Electronic
accessories
86.04
Food and
99.42
beverages
...
...
Health and
99.96
beauty
...
...
...
3/8/2019 10:29
2/6/2019 10:42
...
...
Cash
...
76.40
...
-/23/2019 10:33
Cash 699.72
-
1/9/2019 11:40
Cash 674.59
-/29/2019 19:12
Ewallet 823.40
328 rows × 17 columns
# Visualization: 'gross income' summed per product line, Branch C rows only.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=branch_C,
    x='Product line',
    y='gross income',
    estimator=sum,
    palette=blue_palette,
)
ax.set(
    title='Gross Income Per Product Branch C',
    xlabel='Product',
    ylabel='Gross Income',
)
# Slant the product-line names and tighten the layout before rendering.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Bar chart of 'Rating' by 'Product line' for Branch C, coloured by 'Gender';
# 'gross income' is added to the hover data.
# This cell sits in the Branch C section but previously plotted `branch_B`
# (copy-paste slip); it now reads `branch_C`.
# NOTE(review): the source line was cut off after `height = 400`, leaving the
# call unclosed. It is closed here as-is — confirm no trailing arguments
# (e.g. a title) were lost.
fig = px.bar(
    branch_C,
    x='Product line',
    y='Rating',
    hover_data=['gross income'],
    color='Gender',
    height=400,
)
fig.show()
import matplotlib.pyplot as plt
# Row count per product line within Branch C, kept in `product_line_counts`.
product_line_counts = branch_C['Product line'].value_counts()

# Horizontal bar chart of those counts on a 10x8-inch canvas.
plt.figure(figsize=(10, 8))
ax = product_line_counts.plot.barh(color='skyblue')

# Axis captions and title.
ax.set_xlabel('Number of Products Branch C')
ax.set_ylabel('Product Line')
ax.set_title('Frequency of Product Lines')

# Tighten the layout and render.
plt.tight_layout()
plt.show()
Date Analysis
import pandas as pd
# Function to handle multiple date formats
def convert_date(date_str):
    """Parse one raw 'Date' value, trying each known layout in turn.

    Args:
        date_str: A single value from the 'Date' column (normally a string).

    Returns:
        The result of ``pd.to_datetime`` for the first layout that fits.
        If no layout fits, ``date_str`` is returned unchanged so the caller
        can still inspect the raw value.
    """
    formats = [
        '%Y-%m-%d %H:%M:%S',
        '%d/%m/%y %H:%M',
        '%Y-%m-%d',
        '%d/%m/%y',
        # Month-first layouts with a four-digit year, e.g. "1/5/2019" or
        # "3/27/2019". None of the layouts above match these, so the function
        # used to hand such values back as plain strings. They are appended
        # last so anything that already matched an earlier layout still
        # parses exactly as before.
        '%m/%d/%Y %H:%M',
        '%m/%d/%Y',
    ]
    for fmt in formats:
        try:
            # Try to convert the date using one of the formats
            return pd.to_datetime(date_str, format=fmt)
        except (ValueError, TypeError):
            continue
    return date_str  # Return original value if no format matches
# Run every raw 'Date' value through convert_date and store the results back.
raw_dates = data['Date']
data['Date'] = raw_dates.apply(convert_date)
data
# Add other formats if needed
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Health and
74.69
beauty
Tax 5%
Total
Date
Time Payment
-
-
1/5/2019 13:08
3.8200
80.2200
3/8/2019 10:29
cogs
0
-
A
Yangon
Member
Female
1
-
C Naypyitaw
Normal
Female
2
-
A
Yangon
Normal
Male
Home and
46.33
lifestyle
-
-
3/3/2019 13:23
Credit
324.31
card
3
-
A
Yangon
Member
Male
Health and
58.22
beauty
-
-/27/2019 20:33
Ewallet 465.76
4
-
A
Yangon
Normal
Male
Sports and
86.31
travel
-
-
Ewallet 604.17
...
...
...
...
...
...
995
-
C Naypyitaw
Normal
996
-
B
Mandalay
997
-
A
998
-
999
-
Electronic
accessories
...
15.28
5
Ewallet 522.83
Cash
2/8/2019 10:37
...
...
...
...
...
...
Male
Health and
40.35
beauty
1
2.0175
-/29/2019 13:46
Ewallet
Normal
Female
Home and
97.38
lifestyle
Yangon
Member
Male
Food and
31.84
beverages
1
1.5920
33.4320
2/9/2019 13:22
Cash
31.84
A
Yangon
Normal
Male
Home and
65.82
lifestyle
1
3.2910
-/22/2019 15:33
Cash
65.82
A
Yangon
Member
Female
Fashion
88.34
accessories
-
-/18/2019 13:28
-
...
76.40
3/2/2019 17:16
40.35
Ewallet 973.80
Cash 618.38
1000 rows × 17 columns
# Make sure 'Date' holds datetime values before using the .dt accessor.
data['Date'] = pd.to_datetime(data['Date'])

# Add one column per calendar component, in this order:
# year, month, day, weekday.
date_parts = data['Date'].dt
for part in ('year', 'month', 'day', 'weekday'):
    data[part] = getattr(date_parts, part)
data
Invoice
Branch
ID
City
Customer
Gender
type
0
-
A
Yangon
Member
Female
1
-
C Naypyitaw
Normal
Female
2
-
A
Yangon
Normal
Male
3
-
A
Yangon
Member
4
-
A
Yangon
...
...
...
995
Product
line
Unit
Quantity
price
Health and
74.69
beauty
Electronic
accessories
Time Payment
cogs
percen
-
Total ...
- ... 13:08
3.8200
80.2200 ... 10:29
Home and
46.33
lifestyle
-
Male
Health and
58.22
beauty
Normal
Male
Sports and
86.31
travel
...
...
...
-
C Naypyitaw
Normal
996
-
B
Mandalay
997
-
A
998
-
999
-
Ewallet-
4.76
- ... 13:23
Credit
324.31
card
4.76
-
- ... 20:33
Ewallet 465.76
4.76
-
- ... 10:37
Ewallet 604.17
4.76
...
...
Male
Health and
40.35
beauty
1
2.0175
Normal
Female
Home and
97.38
lifestyle
Yangon
Member
Male
Food and
31.84
beverages
1
1.5920
33.4320 ... 13:22
Cash
31.84
4.76
A
Yangon
Normal
Male
Home and
65.82
lifestyle
1
3.2910
69.1110 ... 15:33
Cash
65.82
4.76
A
Yangon
Member
Female
Fashion
88.34
accessories
-
- ... 13:28
Cash 618.38
4.76
import calendar
# Convert numerical month to month name
... ...
Cash
4.76
...
1000 rows × 21 columns
...
15.28
Tax 5%
...
...
...
42.3675 ... 13:46
Ewallet
40.35
4.76
Ewallet 973.80
4.76
- ... 17:16
# Look up each month number in calendar.month_name to get its name.
data['month_name'] = data['month'].map(
    lambda month_number: calendar.month_name[month_number]
)
data
Invoice
Branch
ID
City
Customer
Gender
type
0
-
A
Yangon
Member
Female
1
-
C Naypyitaw
Normal
Female
2
-
A
Yangon
Normal
Male
3
-
A
Yangon
Member
4
-
A
Yangon
...
...
...
995
Product
line
Unit
Quantity
price
Health and
74.69
beauty
Electronic
accessories
-
cogs
gross
margin
percentage
Ewallet 522.83
-
Total ... Payment
- ...
3.8200
80.2200 ...
Home and
46.33
lifestyle
-
Male
Health and
58.22
beauty
Normal
Male
Sports and
86.31
travel
...
...
...
-
C Naypyitaw
Normal
996
-
B
Mandalay
997
-
A
998
-
999
-
...
15.28
Tax 5%
Cash
76.40
-
- ...
Credit
324.31
card
-
-
- ...
Ewallet 465.76
-
-
- ...
Ewallet 604.17
-
...
...
...
... ...
...
...
...
Male
Health and
40.35
beauty
1
2.0175
42.3675 ...
Ewallet
40.35
-
Normal
Female
Home and
97.38
lifestyle
Ewallet 973.80
-
Yangon
Member
Male
Food and
31.84
beverages
1
1.5920
33.4320 ...
Cash
31.84
-
A
Yangon
Normal
Male
Home and
65.82
lifestyle
1
3.2910
69.1110 ...
Cash
65.82
-
A
Yangon
Member
Female
Fashion
88.34
accessories
-
- ...
Cash 618.38
-
- ...
1000 rows × 22 columns
# Visualization: Total sales by Month
# Put the bars in calendar order (sorted by the numeric 'month' column)
# instead of the order in which months first appear in the data.
month_order = list(data.sort_values('month')['month_name'].unique())
plt.figure(figsize=(10, 6))
sns.barplot(
    x='month_name',
    y='Total',
    data=data,
    estimator=sum,
    order=month_order,
    palette=blue_palette,
)
plt.title('Total Sales Per Month')
# The axis captions previously read 'Product' / 'Gross Income', carried over
# from an earlier chart; this one plots summed 'Total' per month.
plt.xlabel('Month')
plt.ylabel('Total Sales')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Count how many rows fall in each month.
data['month_name'].value_counts()
month_name
January
352
March
345
February
303
Name: count, dtype: int64
import calendar
# Look up each weekday number in calendar.day_name to get the day's name.
data['day_name'] = data['weekday'].map(
    lambda weekday_number: calendar.day_name[weekday_number]
)
data
Invoice
Branch
ID
City
Customer
Gender
type
Product
line
Unit
Quantity
price
Health and
74.69
beauty
Tax 5%
Total ...
cogs
gross
margin
percentage
gross
income
0
-
A
Yangon
Member
Female
1
-
C Naypyitaw
Normal
Female
2
-
A
Yangon
Normal
Male
Home and
46.33
lifestyle
-
- ... 324.31
-
3
-
A
Yangon
Member
Male
Health and
58.22
beauty
-
- ... 465.76
-
4
-
A
Yangon
Normal
Male
Sports and
86.31
travel
-
- ... 604.17
-
...
...
...
...
...
...
995
-
C Naypyitaw
Normal
996
-
B
Mandalay
997
-
A
998
-
999
-
Electronic
accessories
...
15.28
-
3.8200
- ..- ...
76.40
-
3.8200
...
...
...
... ...
...
...
...
Male
Health and
40.35
beauty
1
2.0175
42.3675 ...
40.35
-
2.0175
Normal
Female
Home and
97.38
lifestyle
Yangon
Member
Male
Food and
31.84
beverages
1
1.5920
33.4320 ...
31.84
-
1.5920
A
Yangon
Normal
Male
Home and
65.82
lifestyle
1
3.2910
69.1110 ...
65.82
-
3.2910
A
Yangon
Member
Female
Fashion
88.34
accessories
- ... 973.80
-
- ... 618.38
1000 rows × 23 columns
# Visualization: Total sales by day of the week
plt.figure(figsize=(10, 6))
sns.barplot(
    x='day_name',
    y='Total',
    data=data,
    estimator=sum,
    # Follow calendar.day_name, the same sequence 'day_name' was derived
    # from, so the bars appear in weekday order rather than in order of
    # first appearance in the data.
    order=list(calendar.day_name),
    palette=blue_palette,
)
plt.title('Total Quarterly Sales Per Day')
plt.xlabel('Day')
plt.ylabel('Total Sales')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
-
-
# Visualization: Gross income by day of the week
plt.figure(figsize=(10, 6))
sns.barplot(
    x='day_name',
    y='gross income',
    data=data,
    estimator=sum,
    # Follow calendar.day_name, the same sequence 'day_name' was derived
    # from, so the bars appear in weekday order rather than in order of
    # first appearance in the data.
    order=list(calendar.day_name),
    palette=blue_palette,
)
plt.title('Total Quarterly Income Per Day')
plt.xlabel('Day')
plt.ylabel('Total Income')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
def generate_rating_df(df):
    """Count 'Invoice ID' entries for every (day_name, Rating) pair in *df*.

    Args:
        df: DataFrame with 'day_name', 'Rating' and 'Invoice ID' columns.

    Returns:
        DataFrame with columns ['day_name', 'Rating', 'counts'], sorted by
        'Rating', with zero-count rows removed.
    """
    # Group the frame passed in. The earlier version grouped the global
    # `data` here and silently ignored its `df` argument.
    rating_df = df.groupby(['day_name', 'Rating']).agg({'Invoice ID': 'count'}).reset_index()
    rating_df = rating_df[rating_df['Invoice ID'] != 0]
    rating_df.columns = ['day_name', 'Rating', 'counts']
    rating_df = rating_df.sort_values('Rating')
    return rating_df
# Aggregate the rating counts per weekday for the whole data set.
rating_df = generate_rating_df(data)

# Bar chart of the counts per weekday, coloured by 'Rating'.
fig = px.bar(rating_df, color='Rating', x='day_name', y='counts')
fig.update_traces(textfont_size=20, textposition='auto')

# Trailing expression: stack the bars; the notebook renders the returned figure.
fig.update_layout(barmode='stack')
import plotly.express as px
import plotly.io as pio
# Function to generate a rating dataframe
def generate_rating_df(df):
    """Return invoice counts per (day_name, Rating) pair, sorted by Rating.

    Args:
        df: DataFrame with 'day_name', 'Rating' and 'Invoice ID' columns.

    Returns:
        DataFrame with columns 'day_name', 'Rating' and 'counts'; rows whose
        count is zero are removed.
    """
    grouped = (
        df.groupby(['day_name', 'Rating'])
        .agg({'Invoice ID': 'count'})
        .reset_index()
    )
    # Keep only the pairs that have at least one counted invoice.
    non_empty = grouped[grouped['Invoice ID'] != 0]
    non_empty.columns = ['day_name', 'Rating', 'counts']
    return non_empty.sort_values('Rating')
# Per-day rating counts that feed the chart.
rating_df = generate_rating_df(data)

# Stacked bar chart: one bar per day, segments coloured by rating value.
fig = px.bar(data_frame=rating_df, x='day_name', y='counts', color='Rating')
fig.update_layout(barmode='stack')
fig.update_traces(textfont_size=20, textposition='auto')

# Interactive view.
fig.show()

# Static copies for GitHub or PDF.
# Ensure you have kaleido installed: `pip install -U kaleido`
fig.write_image("stacked_bar_ratings_per_day.png")  # Save as PNG
fig.write_image("stacked_bar_ratings_per_day.pdf")  # Optional: save as PDF
import plotly.express as px
import plotly.io as pio
import pandas as pd
# Function to generate rating DataFrame
def generate_rating_df(df):
    """Tally invoices for each day/rating combination found in ``df``.

    Args:
        df: DataFrame with 'day_name', 'Rating' and 'Invoice ID' columns.

    Returns:
        DataFrame with columns 'day_name', 'Rating' and 'counts', sorted by
        ascending 'Rating', without zero-count rows.
    """
    # One row per (day_name, Rating) with the number of non-null invoice IDs.
    per_pair = df.groupby(['day_name', 'Rating']).agg({'Invoice ID': 'count'})
    per_pair = per_pair.reset_index()
    has_invoices = per_pair['Invoice ID'] != 0
    result = per_pair[has_invoices]
    result.columns = ['day_name', 'Rating', 'counts']
    return result.sort_values('Rating')
# Generate rating_df using your data
rating_df = generate_rating_df(data)

# Stacked bar chart of invoice counts per day, coloured by rating value.
fig = px.bar(rating_df, x='day_name', y='counts', color='Rating')

# Update chart layout and traces
fig.update_traces(textposition='auto', textfont_size=20)
fig.update_layout(barmode='stack', title="Ratings per Day",
                  xaxis_title="Day of the Week",
                  yaxis_title="Counts",
                  font=dict(size=14))

# Render the figure as an HTML fragment (full_html=False) that loads
# plotly.js from the CDN instead of embedding it.
html_content = pio.to_html(fig, full_html=False, include_plotlyjs='cdn')

# Write with an explicit UTF-8 encoding: the default for open() is
# platform-dependent and can fail on characters outside the locale code page.
with open("rating_chart.html", "w", encoding="utf-8") as f:
    f.write(html_content)
print("HTML file successfully generated for printing.")
HTML file successfully generated for printing.
# Scatter of 'gross income' against 'day_name': colour encodes 'Gender',
# marker size encodes 'Rating', and 'Total' is added to the hover box.
# A histogram sits on the x margin and a box plot on the y margin.
fig = px.scatter(data_frame=data,
                 x="day_name",
                 y="gross income",
                 color="Gender",
                 size='Rating',
                 hover_data=['Total'],
                 marginal_x="histogram",
                 marginal_y="box")
# `title_font` (layout.title.font) replaces the deprecated `titlefont`
# attribute, which newer plotly releases no longer accept.
fig.update_layout(title_text=" Gross Income Per Day ",
                  title_font={'size': 24, 'family': 'Serif'},
                  width=1000,
                  height=550)
fig.show()
import plotly.express as px
import plotly.io as pio
# Create the scatter plot: 'gross income' against 'day_name', coloured by
# 'Gender', sized by 'Rating', with 'Total' added to the hover box.
# A histogram sits on the x margin and a box plot on the y margin.
fig = px.scatter(data_frame=data,
                 x="day_name",
                 y="gross income",
                 color="Gender",
                 size='Rating',
                 hover_data=['Total'],
                 marginal_x="histogram",
                 marginal_y="box")

# Update layout. `title_font` (layout.title.font) replaces the deprecated
# `titlefont` attribute, which newer plotly releases no longer accept.
fig.update_layout(title_text=" Gross Income Per Day ",
                  title_font={'size': 24, 'family': 'Serif'},
                  width=1000,
                  height=550)

# Show interactive plot
fig.show()

# Export the plot as a static image for GitHub or PDF
# Ensure you have kaleido installed: `pip install -U kaleido`
fig.write_image("gross_income_per_day.png")  # Save as PNG
fig.write_image("gross_income_per_day.pdf")  # Optional: save as PDF
# Histogram of 'Rating', with one colour per 'Product line' category.
fig = px.histogram(data, x='Rating', height=500, width=900, template='simple_white',
                   color='Product line',  # adding categorical column
                   # Six explicit colours; the list literal and the call are
                   # closed here so the statement parses.
                   color_discrete_sequence=['purple', 'pink', 'aliceblue', 'blanchedalmond',
                                            'darkolivegreen', 'darkslategray'])
# Title text, size, typeface, colour and horizontal position.
fig.update_layout(title={'text': 'Histogram of Ratings Per Product Line', 'font': {'size': 25}},
                  title_font_family="Times New Roman",
                  title_font_color="darkgrey",
                  title_x=0.2)
# Global font settings and axis titles.
fig.update_layout(
    font_family='classic-roman',
    font_color='grey',
    yaxis_title={'text': " count", 'font': {'size': 18}},
    xaxis_title={'text': " Rating", 'font': {'size': 18}}
)
fig.show()
import plotly.express as px
import plotly.io as pio
# Create the histogram plot: distribution of 'Rating', coloured by 'Product line'.
fig = px.histogram(
    data_frame=data,
    x='Rating',
    color='Product line',  # Adding categorical column
    color_discrete_sequence=[
        'purple', 'pink', 'aliceblue',
        'blanchedalmond', 'darkolivegreen', 'darkslategray',
    ],
    template='simple_white',
    width=900,
    height=500,
)

# Title text, size, typeface, colour and horizontal position.
fig.update_layout(
    title=dict(text='Histogram of Ratings Per Product Line', font=dict(size=25)),
    title_font_family="Times New Roman",
    title_font_color="darkgrey",
    title_x=0.2,
)
# Global font settings and axis titles.
fig.update_layout(
    font_family='classic-roman',
    font_color='grey',
    yaxis_title=dict(text="Count", font=dict(size=18)),
    xaxis_title=dict(text="Rating", font=dict(size=18)),
)

# Show interactive plot
fig.show()

# Export the plot as a static image for GitHub or PDF
# Ensure you have kaleido installed: `pip install -U kaleido`
fig.write_image("histogram_of_ratings_per_product_line.png")  # Save as PNG
fig.write_image("histogram_of_ratings_per_product_line.pdf")  # Optional: save as PDF