Superstore Sals Analysis
8/6/2024
Superstore_Sales Analysis
In [1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]: super_sales=pd.read_csv("C:/Users/ty/Downloads/supermarket_sales.csv")
super_sales
Out[2]:
Invoice
ID
Branch
City
Customer
type
Gender
Product
line
Unit
price
Quantity
Tax 5%
Total
0
-
A
Yangon
Member
Female
Health and
beauty
74.69
7
26.1415
-
1
-
C
Naypyitaw
Normal
Female
Electronic
accessories
15.28
5
3.8200
80.2200
2
-
A
Yangon
Normal
Male
Home and
lifestyle
46.33
7
16.2155
-
3
-
A
Yangon
Member
Male
Health and
beauty
58.22
8
23.2880
-
4
-
A
Yangon
Normal
Male
Sports and
travel
86.31
7
30.2085
-
...
...
...
...
...
...
...
...
...
...
...
995
-
C
Naypyitaw
Normal
Male
Health and
beauty
40.35
1
2.0175
42.3675
996
-
B
Mandalay
Normal
Female
Home and
lifestyle
97.38
10
48.6900
-
997
-
A
Yangon
Member
Male
Food and
beverages
31.84
1
1.5920
33.4320
998
-
A
Yangon
Normal
Male
Home and
lifestyle
65.82
1
3.2910
69.1110
999
-
A
Yangon
Member
Female
Fashion
accessories
88.34
7
30.9190
-
1000 rows × 17 columns
In [3]: super_sales.shape
Out[3]:
(1000, 17)
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
1/26
8/6/2024
Superstore_Sales Analysis
In [4]: super_sales.info()
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
#
Column
Non-Null Count
--- ------------------0
Invoice ID
1000 non-null
1
Branch
1000 non-null
2
City
1000 non-null
3
Customer type
1000 non-null
4
Gender
1000 non-null
5
Product line
1000 non-null
6
Unit price
1000 non-null
7
Quantity
1000 non-null
8
Tax 5%
1000 non-null
9
Total
1000 non-null
10 Date
1000 non-null
11 Time
1000 non-null
12 Payment
1000 non-null
13 cogs
1000 non-null
14 gross margin percentage 1000 non-null
15 gross income
1000 non-null
16 Rating
1000 non-null
dtypes: float64(7), int64(1), object(9)
memory usage: 132.9+ KB
Dtype
----object
object
object
object
object
object
float64
int64
float64
float64
object
object
object
float64
float64
float64
float64
In [5]: super_sales.dtypes
Out[5]:
Invoice ID
Branch
City
Customer type
Gender
Product line
Unit price
Quantity
Tax 5%
Total
Date
Time
Payment
cogs
gross margin percentage
gross income
Rating
dtype: object
object
object
object
object
object
object
float64
int64
float64
float64
object
object
object
float64
float64
float64
float64
In [6]: #Converting Date to Date format
super_sales["Date"]=super_sales['Date'].apply(pd.to_datetime)
In [7]: super_sales.dtypes
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
2/26
8/6/2024
Superstore_Sales Analysis
Out[7]:
Invoice ID
Branch
City
Customer type
Gender
Product line
Unit price
Quantity
Tax 5%
Total
Date
Time
Payment
cogs
gross margin percentage
gross income
Rating
dtype: object
object
object
object
object
object
object
float64
int64
float64
float64
datetime64[ns]
object
object
float64
float64
float64
float64
In [8]: #Converting time to time format
super_sales["Time"]=pd.to_datetime(super_sales['Time'],format='%H:%M').dt.hour
In [9]: super_sales[['Date','Time']]
Date
Time
0
-
13
1
-
10
2
-
13
3
-
20
4
-
10
...
...
...
995
-
13
996
-
17
997
-
13
998
-
15
999
-
13
Out[9]:
1000 rows × 2 columns
Branch and City Analysis
In [10]: #Which branch has the highest total sales
super_sales.groupby(['Branch','City'])[['Total','City']].sum().sort_values('Total',asc
## Branch C had the highest Total sale, Totalling 110569 in the city Naypyitaw
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
3/26
8/6/2024
Superstore_Sales Analysis
C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: The d
efault value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future versi
on, numeric_only will default to False. Either specify numeric_only or select only co
lumns which should be valid for the function.
super_sales.groupby(['Branch','City'])[['Total','City']].sum().sort_values('Total',
ascending=False)
Total
Out[10]:
Branch
City
C
Naypyitaw
-
A
Yangon
-
B
Mandalay
-
In [11]: branch_totals=super_sales.groupby('Branch')['Total'].sum()
branch_totals.reset_index()
branch_total=pd.DataFrame(branch_totals)
In [12]: #How do sale's figures vary between branch A, B & C?
branch_total.reset_index(inplace=True)
plt.bar(branch_total['Branch'],branch_total['Total'],color=['blue','orange','green'])
plt.xlabel('Branch')
plt.ylabel('Total Sales')
plt.title("Total Sales by Branch")
plt.show
Out[12]:
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
4/26
8/6/2024
Superstore_Sales Analysis
In [13]: #Which city has the highest number of transactions?
city_transact=super_sales.groupby('City')['Quantity'].sum().sort_values(ascending=Fals
city_transact
#Yangon sold 1859 items in total, making them the city with the highest transaction
Out[13]:
City
Yangon
1859
Naypyitaw
1831
Mandalay
1820
Name: Quantity, dtype: int64
In [14]: #represented in a horizontal barchart
city_transaction=pd.DataFrame(city_transact)
city_transaction.reset_index(inplace=True)
In [15]: plt.barh(city_transaction['City'],city_transaction['Quantity'],height=0.5,color=['blue
plt.title('City with Highest Transaction')
plt.xlabel('Total QTY')
plt.ylabel('City')
plt.show()
Product Line Analysis
In [16]: import math as mth
In [17]: #What is the Total sales revenue for each product line
prod_rev= super_sales.groupby('Product line')['Total'].sum().sort_values(ascending=Fal
prod_revs=pd.DataFrame(prod_rev)
prod_revs.reset_index(inplace=True)
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
5/26
8/6/2024
Superstore_Sales Analysis
prod_revs['Total']=prod_revs['Total'].apply(mth.ceil)
prod_revs
Product line
Total
0
Food and beverages
56145
1
Sports and travel
55123
2
Electronic accessories
54338
3
Fashion accessories
54306
4
Home and lifestyle
53862
5
Health and beauty
49194
Out[17]:
In [18]: #Which product line generates the most gross income?
prod_gross= super_sales.groupby('Product line')['gross income'].sum().sort_values(asce
product_gross=pd.DataFrame(prod_gross)
product_gross.reset_index(inplace=True)
product_gross.max()
#Sports and travel generated the highest gross income
Out[18]:
Product line
gross income
dtype: object
Sports and travel-
In [19]: #What is the average unit price and quantity sold for each product line
Avg= super_sales.groupby('Product line')['Quantity','Unit price'].mean()
Avg_QtyUnit=pd.DataFrame(Avg)
Avg_QtyUnit.reset_index(inplace=True)
Avg_QtyUnit['Quantity']=Avg_QtyUnit['Quantity'].apply(mth.ceil)
Avg_QtyUnit
C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index
ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated,
use a list instead.
Avg= super_sales.groupby('Product line')['Quantity','Unit price'].mean()
Product line
Quantity
Unit price
0
Electronic accessories
6
-
1
Fashion accessories
6
-
2
Food and beverages
6
-
3
Health and beauty
6
-
4
Home and lifestyle
6
-
5
Sports and travel
6
-
Out[19]:
Customer Analysis
In [20]: #What is the distribution of customer types across branches
Member=super_sales.loc[super_sales['Customer type']=='Member',['Customer type','Branch
Member.value_counts()
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
6/26
8/6/2024
Out[20]:
Superstore_Sales Analysis
Customer type
Member
Branch
C
A
B
-
dtype: int64
In [21]: Normal=super_sales.loc[super_sales['Customer type']=='Normal',['Customer type','Branch
Normal.value_counts()
Out[21]:
Customer type
Normal
Branch
A
B
C
-
dtype: int64
N/B; in Branch A: Normal customers surpass members (Normal 173, Member 167) in Branch B:
Normal customers are slightly higher than Member (Normal 167, Member 165) in Branch C:
Members are higher than Normal customers(Normal 159, Member 169)
In [22]: #What customer type contribute more to total sales and gross income?
C_type=super_sales.groupby('Customer type')['Total','gross income'].sum()
C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index
ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated,
use a list instead.
C_type=super_sales.groupby('Customer type')['Total','gross income'].sum()
In [23]: C_types=pd.DataFrame(C_type).reset_index()
C_types
Customer type
Total
gross income
0
Member
-
-
1
Normal
-
-
Out[23]:
In [24]: C_types=C_types.rename(columns={'Customer type':'Customer_type'})
C_types
#matplot recognize the stand alone name type so I had to rename and join it with Custo
Customer_type
Total
gross income
0
Member
-
-
1
Normal
-
-
Out[24]:
In [25]: Member=C_types.loc[C_types['Customer_type']=='Member',['Total','gross income']]
Normal=C_types.loc[C_types['Customer_type']=='Normal',['Total','gross income']]
member=pd.DataFrame(Member)
normal=pd.DataFrame(Normal)
#getting values for each customer type
In [26]: normal1=normal.values
member1=member.values
r_normal=normal1.reshape(-1)
r_member=member1.reshape(-1)
#Getting only the numerical values which converts to numpy
#then reshaping because matplot only recognizes one dimensional arrays
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
7/26
8/6/2024
Superstore_Sales Analysis
In [27]: Values=['TotalRev','GrossIncome']
index=np.arange(len(Values))
bar_width=0.4
mem=plt.bar(index-0.2,r_member,bar_width,label='member')
norm=plt.bar(index+0.2,r_normal,bar_width,label='Normal')
plt.xticks(index,Values)
plt.title('Customer Revenue X GrossIncome')
plt.legend()
plt.bar_label(mem,padding=1)
plt.bar_label(norm,padding=1)
Out[27]:
[Text(0, 1, '158743'), Text(0, 1, '7559.2')]
In [28]: #How does average rating vary between members and normal customer
Avg_rate=super_sales.groupby('Customer type')['Rating'].mean()
Avg_rating=pd.DataFrame(Avg_rate).reset_index()
Avg_rating
Customer type
Rating
0
Member
-
1
Normal
-
Out[28]:
In [29]: Rating=Avg_rating['Rating']
Rating_df=pd.DataFrame(Rating)
Rating_np=Rating_df.values
Rate=Rating_np.reshape(-1)
Rate
Out[29]:
array([-,-])
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
8/26
8/6/2024
Superstore_Sales Analysis
In [30]: #represent in a pie chart
P_values=Rate
P_labels=['Member','Normal']
plt.pie(P_values,labels=P_labels,autopct='%1.1f%%',startangle=180)
plt.legend()
plt.show()
In [31]: super_sales.columns
Out[31]:
Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender',
'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date',
'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income',
'Rating'],
dtype='object')
Gender Analysis
In [32]: #How do sales figure compare between male and female customers?
Gender_sales=super_sales.groupby('Gender')['Quantity','Total'].sum().sort_values('Tota
Gender_sale=pd.DataFrame(Gender_sales).reset_index()
Gender_sale
C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index
ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated,
use a list instead.
Gender_sales=super_sales.groupby('Gender')['Quantity','Total'].sum().sort_values('T
otal',ascending=False)
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
9/26
8/6/2024
Superstore_Sales Analysis
Gender
Quantity
Total
0
Female
2869
-
1
Male
2641
-
Out[32]:
In [33]: FemQty_Tot=Gender_sale.loc[Gender_sale['Gender']=='Female',['Quantity','Total']]
MalQTy_Tot=Gender_sale.loc[Gender_sale['Gender']=='Male',['Quantity','Total']]
female_QtyTot=pd.DataFrame(FemQty_Tot)
male_QtyTot=pd.DataFrame(MalQTy_Tot)
GsMale=male_QtyTot.values.reshape(-1)
GsFemale=female_QtyTot.values.reshape(-1)
In [34]: Gs_values=('QTY','TotalRev')
Gs_index=np.arange(len(Gs_values))
Gs_bar_width=0.4
Gs_barMale=plt.bar(Gs_index-0.2,GsMale,Gs_bar_width,label='Male')
Gs_barFemale=plt.bar(Gs_index+0.2,GsFemale,Gs_bar_width,label='Female')
plt.xticks(Gs_index,Gs_values)
plt.xlabel('Totals')
plt.ylabel('Amount')
plt.bar_label(Gs_barMale,padding=1)
plt.bar_label(Gs_barFemale,padding=1)
plt.legend()
plt.show()
Females obviously made more impact in sales by buying more quantity and getting a higher
sales revenue
In [35]: #What is the average rating given by male and female customers?
mean_F= super_sales.query("Gender=='Female'")
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
10/26
8/6/2024
Superstore_Sales Analysis
mean_F['Rating'].mean()
#Average Rating for Female is 7
Out[35]:
-
In [36]: mean_M= super_sales.query("Gender=='Male'")
mean_M['Rating'].mean()
#Average mean Rating for Male is 7
Out[36]:
-
In [37]: #Are there any significant differences in purchasing patterns between genders?
super_sales.head()
super_sales.groupby(['Product line','Gender'])[['Quantity']].sum()
#Females made more purchases across all product line except health and beauty
Quantity
Out[37]:
Product line
Gender
Electronic accessories
Female
488
Male
483
Female
530
Male
372
Female
514
Male
438
Female
343
Male
511
Female
498
Male
413
Female
496
Male
424
Fashion accessories
Food and beverages
Health and beauty
Home and lifestyle
Sports and travel
Time Based Analysis
In [38]: #What are the peak sales hours for each branch
In [39]: #Find the totals for each hour, then find the max
peak_hr=super_sales.groupby(['Branch','Time'])['Total'].sum()
peak_hour=pd.DataFrame(peak_hr).reset_index()
peak=peak_hour.loc[peak_hour.groupby('Branch')['Total'].idxmax()]
peak
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
11/26
8/6/2024
Superstore_Sales Analysis
Branch
Time
Total
1
A
11
-
20
B
19
-
31
C
19
-
Out[39]:
Branch A Sales peaked within 11 am Branch B Sales peaked within 7pm Branch C sales peaked
within 7pm
In [40]: patt=plt.bar(peak['Branch'],peak['Time'],color=['blue','orange','green'])
plt.bar_label(patt,padding=1)
plt.xlabel('Branch')
plt.ylabel('Hour')
Out[40]:
Text(0, 0.5, 'Hour')
In [41]: #How do sales trend vary by day of the week and month
super_sales.head()
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
12/26
8/6/2024
Superstore_Sales Analysis
Out[41]:
Invoice
ID
Branch
City
Customer
type
Gender
Product
line
Unit
price
Quantity
Tax 5%
Total
D
0
-
A
Yangon
Member
Female
Health and
beauty
74.69
7
26.1415
-
20
01
1
-
C
Naypyitaw
Normal
Female
Electronic
accessories
15.28
5
3.8200
80.2200
20
03
2
-
A
Yangon
Normal
Male
Home and
lifestyle
46.33
7
16.2155
-
20
03
3
-
A
Yangon
Member
Male
Health and
beauty
58.22
8
23.2880
-
20
01
4
-
A
Yangon
Normal
Male
Sports and
travel
86.31
7
30.2085
-
20
02
In [42]: super_sales['Month']=pd.to_datetime(super_sales['Date']).dt.month_name()
super_sales['Day']=pd.to_datetime(super_sales['Date']).dt.day_name()
#Assigning mew columns for day and month
In [43]: day=super_sales.groupby('Day')['Total'].sum()
days=pd.DataFrame(day).reindex(['Monday','Tuesday','Wednesday','Thursday','Friday','Sa
plt.plot(days['Day'],days['Total'],marker='o')
plt.title("Daily/Weekly Patterns")
#Patterns for days
Out[43]:
Text(0.5, 1.0, 'Daily/Weekly Patterns')
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
13/26
8/6/2024
Superstore_Sales Analysis
In [44]: month=super_sales.groupby('Month')['Total'].sum()
months=pd.DataFrame(month).reindex(['January','February','March','April','May','Jun','
plt.plot(months['Month'],months['Total'],marker='o')
plt.xlabel('Month')
plt.ylabel('TotalSales')
plt.title('Monthly Sales Pattern')
#Pattern for month
Out[44]:
Text(0.5, 1.0, 'Monthly Sales Pattern')
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
14/26
8/6/2024
Superstore_Sales Analysis
In [45]: #Are there any seasonal patterns in trend?
In [46]: super_sales.loc[super_sales['Month'].isin(['December','January','February']),'Season']
super_sales.loc[super_sales['Month'].isin(['March','April','May']),'Season']='Spring'
super_sales.loc[super_sales['Month'].isin(['June','July','August']),'Season']='Summer'
super_sales.loc[super_sales['Month'].isin(['September','October','November']),'Season'
#Creating new column for season
In [47]: season=super_sales.groupby('Season')['Total'].sum()
seasons=pd.DataFrame(season).reset_index()
seasons_value=['Spring','Winter']
seasons_rating=seasons['Total'].values
plt.pie(seasons_rating,labels=seasons_value,autopct='%1.1f%%')
plt.legend()
plt.title("Seasonal Pattern")
plt.show()
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
15/26
8/6/2024
Superstore_Sales Analysis
Payment Method Analysis
In [48]: #Which Payment Method is most commonly used?
super_sales['Payment'].value_counts()
#Ewallet is the most commonly used payment method
Out[48]:
Ewallet
Cash
Credit card
Name: Payment,
-
dtype: int64
In [49]: #How do sales compare across the different payment method
pay=super_sales.groupby('Payment')['gross income','Total'].sum()
pays=pd.DataFrame(pay).reset_index()
pays
C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index
ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated,
use a list instead.
pay=super_sales.groupby('Payment')['gross income','Total'].sum()
Payment
gross income
Total
0
Cash
-
-
1
Credit card
-
-
2
Ewallet
-
-
Out[49]:
In [50]: p1=plt.bar(pays['Payment'],pays["Total"],label='TotalSales',width=0.3)
p2=plt.bar(pays['Payment'],pays["gross income"],label='gross income',width=0.3)
plt.legend(loc='upper left')
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
16/26
8/6/2024
Superstore_Sales Analysis
plt.bar_label(p2,padding=1)
plt.bar_label(p1,padding=1)
Out[50]:
[Text(0, 1, '112207'), Text(0, 1, '100767'), Text(0, 1, '109993')]
In [51]: #Which payment generates the highest gross income
pays['gross income'].max()
#Cash generates the most gross income
Out[51]:
5343.17
Profitablility Analysis
In [52]: #What is the gross margin percentage for each product line
super_sales.groupby('Product line')['gross margin percentage'].mean()
Out[52]:
Product line
Electronic accessories-
Fashion accessories-
Food and beverages-
Health and beauty-
Home and lifestyle-
Sports and travel-
Name: gross margin percentage, dtype: float64
In [53]: #Which procuct line has the highest gross income
#Ho does cost of goods compare across the product line
cogis=super_sales.groupby('Product line')['gross income','cogs'].sum()
cog_gi=pd.DataFrame(cogis).reset_index()
cog_gi
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
17/26
8/6/2024
Superstore_Sales Analysis
C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:3: FutureWarning: Index
ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated,
use a list instead.
cogis=super_sales.groupby('Product line')['gross income','cogs'].sum()
Product line
gross income
cogs
0
Electronic accessories
-
-
1
Fashion accessories
-
-
2
Food and beverages
-
-
3
Health and beauty
-
-
4
Home and lifestyle
-
-
5
Sports and travel
-
-
Out[53]:
In [54]: cog_gi.plot(x='Product line',kind='barh',stacked=False,width=0.8)
plt.title('GrossIncome vs Cogs')
Out[54]:
Text(0.5, 1.0, 'GrossIncome vs Cogs')
Food and beverages have the highest gross income Sports and travel has the highest cost of
goods
Customer Satisfaction Analysis
In [55]: #What is the average customer rating for each product line
avg_cust=super_sales.groupby('Product line')['Rating'].mean()
In [56]: #How do customer rating vary by branch and city
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
18/26
8/6/2024
Superstore_Sales Analysis
cb_rating=super_sales.groupby(['Branch','City'])['Rating'].mean()
cb_rate=pd.DataFrame(cb_rating).reset_index()
cb_rate
Branch
City
Rating
0
A
Yangon
-
1
B
Mandalay
-
2
C
Naypyitaw
-
Out[56]:
In [57]: cbr=plt.bar(cb_rate['Branch'],cb_rate['Rating'],color=['blue','orange','green'])
plt.bar_label(cbr,padding=1)
Out[57]:
[Text(0, 1, '7.02706'), Text(0, 1, '6.81807'), Text(0, 1, '7.07287')]
In [58]: #Are there any noticeable trends in customer rating over time
rate_tot=super_sales.groupby('Month')['Rating'].sum()
rate_ovt=pd.DataFrame(rate_tot).reindex(['January','February','March','April','May','J
rate_ovt
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
19/26
8/6/2024
Superstore_Sales Analysis
Month
Rating
0
January
2470.2
1
February
2142.6
2
March
2359.9
3
April
NaN
4
May
NaN
5
June
NaN
6
July
NaN
7
August
NaN
8
September
NaN
9
October
NaN
10
November
NaN
11
December
NaN
Out[58]:
In [59]: plt.plot(rate_ovt['Month'],rate_ovt['Rating'])
plt.title('Customer Rating Over Time')
Out[59]:
Text(0.5, 1.0, 'Customer Rating Over Time')
Segmentation Analysis
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
20/26
8/6/2024
Superstore_Sales Analysis
In [60]: #Segment customers based on their purchase behavior and identify key customer segments
super_sales.head()
Out[60]:
Invoice
ID
Branch
City
Customer
type
Gender
Product
line
Unit
price
Quantity
Tax 5%
Total
D
0
-
A
Yangon
Member
Female
Health and
beauty
74.69
7
26.1415
-
20
01
1
-
C
Naypyitaw
Normal
Female
Electronic
accessories
15.28
5
3.8200
80.2200
20
03
2
-
A
Yangon
Normal
Male
Home and
lifestyle
46.33
7
16.2155
-
20
03
3
-
A
Yangon
Member
Male
Health and
beauty
58.22
8
23.2880
-
20
01
4
-
A
Yangon
Normal
Male
Sports and
travel
86.31
7
30.2085
-
20
02
In [61]: super_sales.loc[super_sales['Total']>=500,'Customer Segment']='High'
super_sales.loc[super_sales['Total']<500,'Customer Segment']='Low'
In [62]: high_value=super_sales.loc[super_sales['Customer Segment']=='High',['Customer type','G
high_value
Customer type
Gender
Product line
Total
gross income
0
Member
Female
Health and beauty
-
26.1415
4
Normal
Male
Sports and travel
-
30.2085
5
Normal
Male
Electronic accessories
-
29.8865
7
Normal
Female
Home and lifestyle
-
36.7800
14
Normal
Female
Health and beauty
-
35.6900
...
...
...
...
...
...
988
Member
Male
Electronic accessories
-
41.1700
989
Member
Male
Health and beauty
-
30.1480
991
Normal
Female
Sports and travel
-
38.3000
996
Normal
Female
Home and lifestyle
-
48.6900
999
Member
Female
Fashion accessories
-
30.9190
Out[62]:
227 rows × 5 columns
In [63]: Low_value=super_sales.loc[super_sales['Customer Segment']=='Low',['Customer type','Gen
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
21/26
8/6/2024
Superstore_Sales Analysis
Low_value
Customer type
Gender
Product line
Total
gross income
1
Normal
Female
Electronic accessories
80.2200
3.8200
2
Normal
Male
Home and lifestyle
-
16.2155
3
Member
Male
Health and beauty
-
23.2880
6
Member
Female
Electronic accessories
-
20.6520
8
Member
Female
Health and beauty
76.1460
3.6260
...
...
...
...
...
...
993
Normal
Male
Fashion accessories
-
8.7450
994
Member
Female
Electronic accessories
63.9975
3.0475
995
Normal
Male
Health and beauty
42.3675
2.0175
997
Member
Male
Food and beverages
33.4320
1.5920
998
Normal
Male
Home and lifestyle
69.1110
3.2910
Out[63]:
773 rows × 5 columns
#What are the characteristics of high value customer Total of 227 customer High value customers have more
members than regular customer type(119/108) They are dominated by more female customers (121/106) They
made more sales in the product line Sports and travel Made total sales revenue of 159.555K Made total gross
income of 7.6K
#What are the characteristics of Low value customer Total of 773 customer Low value customers have more
regular than member customer type(391/382) They are dominated by more male customers (393/380) They made
more sales in the product line Fashion accesories Made total sales revenue of 163.412K Made total gross income
of 7.7K
In [64]: #How does the purchasing behavior of high value customers differ from low value custom
high=high_value.groupby('Customer type')['Total'].sum()
highh=pd.DataFrame(high).reset_index()
low=Low_value.groupby('Customer type')['Total'].sum()
loww=pd.DataFrame(low).reset_index()
In [65]: loww_v=loww['Total'].values.reshape(-1)
highh_v=highh['Total'].values.reshape(-1)
In [66]: loww
Customer type
Total
0
Member
-
1
Normal
-
Out[66]:
In [67]: v_values=['Member','Normal']
v_index=np.arange(len(v_values))
v_width=0.5
low1=plt.bar(v_index-0.2,loww_v,v_width,label="Low")
high1=plt.bar(v_index+0.2,highh_v,v_width,label="High")
plt.xticks(v_index,v_values)
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
22/26
8/6/2024
Superstore_Sales Analysis
plt.legend()
plt.title('High vs Low customer rating')
plt.bar_label(low1,padding=1)
plt.bar_label(high1,padding=1)
Out[67]:
[Text(0, 1, '83328.3'), Text(0, 1, '76226.3')]
Geographical Analysis
In [68]: #How do sales figure compare across the different cities?
citie=super_sales.groupby('City')['Total'].sum()
cities=pd.DataFrame(citie).reset_index()
city_c=plt.bar(cities['City'],cities['Total'],color=['blue','orange','green'])
plt.title('Total Sales by City')
plt.bar_label(city_c,padding=1)
Out[68]:
[Text(0, 1, '106198'), Text(0, 1, '110569'), Text(0, 1, '106200')]
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
23/26
8/6/2024
Superstore_Sales Analysis
In [69]: #Which city has the highest average unit price and quantity sold
super_sales.groupby('City')['Unit price','Quantity'].mean().sort_values('Unit price',a
C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index
ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated,
use a list instead.
super_sales.groupby('City')['Unit price','Quantity'].mean().sort_values('Unit pric
e',ascending=False)
Unit price
Quantity
Naypyitaw
-
-
Mandalay
-
-
Yangon
-
-
Out[69]:
City
In [70]: #Are there any significant differences in customer rating across the cities
super_sales.groupby('City')['Rating'].mean()
Out[70]:
City
Mandalay-
Naypyitaw-
Yangon-
Name: Rating, dtype: float64
Advanced analysis
In [71]: import scipy.stats
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
24/26
8/6/2024
Superstore_Sales Analysis
from scipy.stats import pearsonr
In [72]: #Is there a correaltion between unit price and quantity sold
corr,p_value=pearsonr(super_sales['Unit price'],super_sales['Quantity'])
print(f'Pearson correlation coefficient:{corr}')
print(f'p-value:{p_value}')
Pearson correlation coefficient:-
p-value:-
In [73]: plt.scatter(super_sales['Unit price'],super_sales['Quantity'])
plt.xlabel('Unit')
plt.ylabel('Quantity')
plt.title('Unit vs Quantity sold Correlation')
Out[73]:
Text(0.5, 1.0, 'Unit vs Quantity sold Correlation')
In [74]: #How does the total sales amount correlate with customer rating
corr,p_value=pearsonr(super_sales['Total'],super_sales['Rating'])
print(f'Pearson correlation coefficient:{corr}')
print(f'p-value:{p_value}')
Pearson correlation coefficient:-
p-value:-
In [75]: plt.scatter(super_sales['Total'],super_sales['Rating'])
plt.xlabel('Total Sales')
plt.ylabel('Rating')
plt.title('Total sales vs Rating Correlation')
Out[75]:
Text(0.5, 1.0, 'Total sales vs Rating Correlation')
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
25/26
8/6/2024
Superstore_Sales Analysis
Crossselling; Low value customers made more revenue in sales even though they bought more less costly items,
we can do more of cross selling to this segment with value offers Upselling; High value customers have been
noticed to buy the more costly things it can be assumed that they place more value on the cost of goods. We can
upsell to this segment, suggesting more of new high quality products
In [ ]:
file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html
26/26