Kenneth | Freelancer Superstore Sals Analysis

Superstore Sals Analysis

8/6/2024 Superstore_Sales Analysis In [1]: import pandas as pd import numpy as np import matplotlib.pyplot as plt In [2]: super_sales=pd.read_csv("C:/Users/ty/Downloads/supermarket_sales.csv") super_sales Out[2]: Invoice ID Branch City Customer type Gender Product line Unit price Quantity Tax 5% Total 0 - A Yangon Member Female Health and beauty 74.69 7 26.1415 - 1 - C Naypyitaw Normal Female Electronic accessories 15.28 5 3.8200 80.2200 2 - A Yangon Normal Male Home and lifestyle 46.33 7 16.2155 - 3 - A Yangon Member Male Health and beauty 58.22 8 23.2880 - 4 - A Yangon Normal Male Sports and travel 86.31 7 30.2085 - ... ... ... ... ... ... ... ... ... ... ... 995 - C Naypyitaw Normal Male Health and beauty 40.35 1 2.0175 42.3675 996 - B Mandalay Normal Female Home and lifestyle 97.38 10 48.6900 - 997 - A Yangon Member Male Food and beverages 31.84 1 1.5920 33.4320 998 - A Yangon Normal Male Home and lifestyle 65.82 1 3.2910 69.1110 999 - A Yangon Member Female Fashion accessories 88.34 7 30.9190 - 1000 rows × 17 columns In [3]: super_sales.shape Out[3]: (1000, 17) file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 1/26 8/6/2024 Superstore_Sales Analysis In [4]: super_sales.info() RangeIndex: 1000 entries, 0 to 999 Data columns (total 17 columns): # Column Non-Null Count --- ------------------0 Invoice ID 1000 non-null 1 Branch 1000 non-null 2 City 1000 non-null 3 Customer type 1000 non-null 4 Gender 1000 non-null 5 Product line 1000 non-null 6 Unit price 1000 non-null 7 Quantity 1000 non-null 8 Tax 5% 1000 non-null 9 Total 1000 non-null 10 Date 1000 non-null 11 Time 1000 non-null 12 Payment 1000 non-null 13 cogs 1000 non-null 14 gross margin percentage 1000 non-null 15 gross income 1000 non-null 16 Rating 1000 non-null dtypes: float64(7), int64(1), object(9) memory usage: 132.9+ KB Dtype ----object object object object object object float64 int64 float64 float64 object object object float64 float64 float64 float64 In [5]: super_sales.dtypes Out[5]: Invoice ID Branch City Customer type Gender Product line Unit price Quantity Tax 5% Total Date Time Payment cogs gross margin percentage gross income Rating dtype: object object object object object object object float64 int64 float64 float64 object object object float64 float64 float64 float64 In [6]: #Converting Date to Date format super_sales["Date"]=super_sales['Date'].apply(pd.to_datetime) In [7]: super_sales.dtypes file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 2/26 8/6/2024 Superstore_Sales Analysis Out[7]: Invoice ID Branch City Customer type Gender Product line Unit price Quantity Tax 5% Total Date Time Payment cogs gross margin percentage gross income Rating dtype: object object object object object object object float64 int64 float64 float64 datetime64[ns] object object float64 float64 float64 float64 In [8]: #Converting time to time format super_sales["Time"]=pd.to_datetime(super_sales['Time'],format='%H:%M').dt.hour In [9]: super_sales[['Date','Time']] Date Time 0 - 13 1 - 10 2 - 13 3 - 20 4 - 10 ... ... ... 995 - 13 996 - 17 997 - 13 998 - 15 999 - 13 Out[9]: 1000 rows × 2 columns Branch and City Analysis In [10]: #Which branch has the highest total sales super_sales.groupby(['Branch','City'])[['Total','City']].sum().sort_values('Total',asc ## Branch C had the highest Total sale, Totalling 110569 in the city Naypyitaw file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 3/26 8/6/2024 Superstore_Sales Analysis C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: The d efault value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future versi on, numeric_only will default to False. Either specify numeric_only or select only co lumns which should be valid for the function. super_sales.groupby(['Branch','City'])[['Total','City']].sum().sort_values('Total', ascending=False) Total Out[10]: Branch City C Naypyitaw - A Yangon - B Mandalay - In [11]: branch_totals=super_sales.groupby('Branch')['Total'].sum() branch_totals.reset_index() branch_total=pd.DataFrame(branch_totals) In [12]: #How do sale's figures vary between branch A, B & C? branch_total.reset_index(inplace=True) plt.bar(branch_total['Branch'],branch_total['Total'],color=['blue','orange','green']) plt.xlabel('Branch') plt.ylabel('Total Sales') plt.title("Total Sales by Branch") plt.show Out[12]: file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 4/26 8/6/2024 Superstore_Sales Analysis In [13]: #Which city has the highest number of transactions? city_transact=super_sales.groupby('City')['Quantity'].sum().sort_values(ascending=Fals city_transact #Yangon sold 1859 items in total, making them the city with the highest transaction Out[13]: City Yangon 1859 Naypyitaw 1831 Mandalay 1820 Name: Quantity, dtype: int64 In [14]: #represented in a horizontal barchart city_transaction=pd.DataFrame(city_transact) city_transaction.reset_index(inplace=True) In [15]: plt.barh(city_transaction['City'],city_transaction['Quantity'],height=0.5,color=['blue plt.title('City with Highest Transaction') plt.xlabel('Total QTY') plt.ylabel('City') plt.show() Product Line Analysis In [16]: import math as mth In [17]: #What is the Total sales revenue for each product line prod_rev= super_sales.groupby('Product line')['Total'].sum().sort_values(ascending=Fal prod_revs=pd.DataFrame(prod_rev) prod_revs.reset_index(inplace=True) file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 5/26 8/6/2024 Superstore_Sales Analysis prod_revs['Total']=prod_revs['Total'].apply(mth.ceil) prod_revs Product line Total 0 Food and beverages 56145 1 Sports and travel 55123 2 Electronic accessories 54338 3 Fashion accessories 54306 4 Home and lifestyle 53862 5 Health and beauty 49194 Out[17]: In [18]: #Which product line generates the most gross income? prod_gross= super_sales.groupby('Product line')['gross income'].sum().sort_values(asce product_gross=pd.DataFrame(prod_gross) product_gross.reset_index(inplace=True) product_gross.max() #Sports and travel generated the highest gross income Out[18]: Product line gross income dtype: object Sports and travel- In [19]: #What is the average unit price and quantity sold for each product line Avg= super_sales.groupby('Product line')['Quantity','Unit price'].mean() Avg_QtyUnit=pd.DataFrame(Avg) Avg_QtyUnit.reset_index(inplace=True) Avg_QtyUnit['Quantity']=Avg_QtyUnit['Quantity'].apply(mth.ceil) Avg_QtyUnit C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. Avg= super_sales.groupby('Product line')['Quantity','Unit price'].mean() Product line Quantity Unit price 0 Electronic accessories 6 - 1 Fashion accessories 6 - 2 Food and beverages 6 - 3 Health and beauty 6 - 4 Home and lifestyle 6 - 5 Sports and travel 6 - Out[19]: Customer Analysis In [20]: #What is the distribution of customer types across branches Member=super_sales.loc[super_sales['Customer type']=='Member',['Customer type','Branch Member.value_counts() file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 6/26 8/6/2024 Out[20]: Superstore_Sales Analysis Customer type Member Branch C A B - dtype: int64 In [21]: Normal=super_sales.loc[super_sales['Customer type']=='Normal',['Customer type','Branch Normal.value_counts() Out[21]: Customer type Normal Branch A B C - dtype: int64 N/B; in Branch A: Normal customers surpass members (Normal 173, Member 167) in Branch B: Normal customers are slightly higher than Member (Normal 167, Member 165) in Branch C: Members are higher than Normal customers(Normal 159, Member 169) In [22]: #What customer type contribute more to total sales and gross income? C_type=super_sales.groupby('Customer type')['Total','gross income'].sum() C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. C_type=super_sales.groupby('Customer type')['Total','gross income'].sum() In [23]: C_types=pd.DataFrame(C_type).reset_index() C_types Customer type Total gross income 0 Member - - 1 Normal - - Out[23]: In [24]: C_types=C_types.rename(columns={'Customer type':'Customer_type'}) C_types #matplot recognize the stand alone name type so I had to rename and join it with Custo Customer_type Total gross income 0 Member - - 1 Normal - - Out[24]: In [25]: Member=C_types.loc[C_types['Customer_type']=='Member',['Total','gross income']] Normal=C_types.loc[C_types['Customer_type']=='Normal',['Total','gross income']] member=pd.DataFrame(Member) normal=pd.DataFrame(Normal) #getting values for each customer type In [26]: normal1=normal.values member1=member.values r_normal=normal1.reshape(-1) r_member=member1.reshape(-1) #Getting only the numerical values which converts to numpy #then reshaping because matplot only recognizes one dimensional arrays file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 7/26 8/6/2024 Superstore_Sales Analysis In [27]: Values=['TotalRev','GrossIncome'] index=np.arange(len(Values)) bar_width=0.4 mem=plt.bar(index-0.2,r_member,bar_width,label='member') norm=plt.bar(index+0.2,r_normal,bar_width,label='Normal') plt.xticks(index,Values) plt.title('Customer Revenue X GrossIncome') plt.legend() plt.bar_label(mem,padding=1) plt.bar_label(norm,padding=1) Out[27]: [Text(0, 1, '158743'), Text(0, 1, '7559.2')] In [28]: #How does average rating vary between members and normal customer Avg_rate=super_sales.groupby('Customer type')['Rating'].mean() Avg_rating=pd.DataFrame(Avg_rate).reset_index() Avg_rating Customer type Rating 0 Member - 1 Normal - Out[28]: In [29]: Rating=Avg_rating['Rating'] Rating_df=pd.DataFrame(Rating) Rating_np=Rating_df.values Rate=Rating_np.reshape(-1) Rate Out[29]: array([-,-]) file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 8/26 8/6/2024 Superstore_Sales Analysis In [30]: #represent in a pie chart P_values=Rate P_labels=['Member','Normal'] plt.pie(P_values,labels=P_labels,autopct='%1.1f%%',startangle=180) plt.legend() plt.show() In [31]: super_sales.columns Out[31]: Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date', 'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income', 'Rating'], dtype='object') Gender Analysis In [32]: #How do sales figure compare between male and female customers? Gender_sales=super_sales.groupby('Gender')['Quantity','Total'].sum().sort_values('Tota Gender_sale=pd.DataFrame(Gender_sales).reset_index() Gender_sale C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. Gender_sales=super_sales.groupby('Gender')['Quantity','Total'].sum().sort_values('T otal',ascending=False) file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 9/26 8/6/2024 Superstore_Sales Analysis Gender Quantity Total 0 Female 2869 - 1 Male 2641 - Out[32]: In [33]: FemQty_Tot=Gender_sale.loc[Gender_sale['Gender']=='Female',['Quantity','Total']] MalQTy_Tot=Gender_sale.loc[Gender_sale['Gender']=='Male',['Quantity','Total']] female_QtyTot=pd.DataFrame(FemQty_Tot) male_QtyTot=pd.DataFrame(MalQTy_Tot) GsMale=male_QtyTot.values.reshape(-1) GsFemale=female_QtyTot.values.reshape(-1) In [34]: Gs_values=('QTY','TotalRev') Gs_index=np.arange(len(Gs_values)) Gs_bar_width=0.4 Gs_barMale=plt.bar(Gs_index-0.2,GsMale,Gs_bar_width,label='Male') Gs_barFemale=plt.bar(Gs_index+0.2,GsFemale,Gs_bar_width,label='Female') plt.xticks(Gs_index,Gs_values) plt.xlabel('Totals') plt.ylabel('Amount') plt.bar_label(Gs_barMale,padding=1) plt.bar_label(Gs_barFemale,padding=1) plt.legend() plt.show() Females obviously made more impact in sales by buying more quantity and getting a higher sales revenue In [35]: #What is the average rating given by male and female customers? mean_F= super_sales.query("Gender=='Female'") file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 10/26 8/6/2024 Superstore_Sales Analysis mean_F['Rating'].mean() #Average Rating for Female is 7 Out[35]: - In [36]: mean_M= super_sales.query("Gender=='Male'") mean_M['Rating'].mean() #Average mean Rating for Male is 7 Out[36]: - In [37]: #Are there any significant differences in purchasing patterns between genders? super_sales.head() super_sales.groupby(['Product line','Gender'])[['Quantity']].sum() #Females made more purchases across all product line except health and beauty Quantity Out[37]: Product line Gender Electronic accessories Female 488 Male 483 Female 530 Male 372 Female 514 Male 438 Female 343 Male 511 Female 498 Male 413 Female 496 Male 424 Fashion accessories Food and beverages Health and beauty Home and lifestyle Sports and travel Time Based Analysis In [38]: #What are the peak sales hours for each branch In [39]: #Find the totals for each hour, then find the max peak_hr=super_sales.groupby(['Branch','Time'])['Total'].sum() peak_hour=pd.DataFrame(peak_hr).reset_index() peak=peak_hour.loc[peak_hour.groupby('Branch')['Total'].idxmax()] peak file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 11/26 8/6/2024 Superstore_Sales Analysis Branch Time Total 1 A 11 - 20 B 19 - 31 C 19 - Out[39]: Branch A Sales peaked within 11 am Branch B Sales peaked within 7pm Branch C sales peaked within 7pm In [40]: patt=plt.bar(peak['Branch'],peak['Time'],color=['blue','orange','green']) plt.bar_label(patt,padding=1) plt.xlabel('Branch') plt.ylabel('Hour') Out[40]: Text(0, 0.5, 'Hour') In [41]: #How do sales trend vary by day of the week and month super_sales.head() file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 12/26 8/6/2024 Superstore_Sales Analysis Out[41]: Invoice ID Branch City Customer type Gender Product line Unit price Quantity Tax 5% Total D 0 - A Yangon Member Female Health and beauty 74.69 7 26.1415 - 20 01 1 - C Naypyitaw Normal Female Electronic accessories 15.28 5 3.8200 80.2200 20 03 2 - A Yangon Normal Male Home and lifestyle 46.33 7 16.2155 - 20 03 3 - A Yangon Member Male Health and beauty 58.22 8 23.2880 - 20 01 4 - A Yangon Normal Male Sports and travel 86.31 7 30.2085 - 20 02 In [42]: super_sales['Month']=pd.to_datetime(super_sales['Date']).dt.month_name() super_sales['Day']=pd.to_datetime(super_sales['Date']).dt.day_name() #Assigning mew columns for day and month In [43]: day=super_sales.groupby('Day')['Total'].sum() days=pd.DataFrame(day).reindex(['Monday','Tuesday','Wednesday','Thursday','Friday','Sa plt.plot(days['Day'],days['Total'],marker='o') plt.title("Daily/Weekly Patterns") #Patterns for days Out[43]: Text(0.5, 1.0, 'Daily/Weekly Patterns') file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 13/26 8/6/2024 Superstore_Sales Analysis In [44]: month=super_sales.groupby('Month')['Total'].sum() months=pd.DataFrame(month).reindex(['January','February','March','April','May','Jun',' plt.plot(months['Month'],months['Total'],marker='o') plt.xlabel('Month') plt.ylabel('TotalSales') plt.title('Monthly Sales Pattern') #Pattern for month Out[44]: Text(0.5, 1.0, 'Monthly Sales Pattern') file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 14/26 8/6/2024 Superstore_Sales Analysis In [45]: #Are there any seasonal patterns in trend? In [46]: super_sales.loc[super_sales['Month'].isin(['December','January','February']),'Season'] super_sales.loc[super_sales['Month'].isin(['March','April','May']),'Season']='Spring' super_sales.loc[super_sales['Month'].isin(['June','July','August']),'Season']='Summer' super_sales.loc[super_sales['Month'].isin(['September','October','November']),'Season' #Creating new column for season In [47]: season=super_sales.groupby('Season')['Total'].sum() seasons=pd.DataFrame(season).reset_index() seasons_value=['Spring','Winter'] seasons_rating=seasons['Total'].values plt.pie(seasons_rating,labels=seasons_value,autopct='%1.1f%%') plt.legend() plt.title("Seasonal Pattern") plt.show() file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 15/26 8/6/2024 Superstore_Sales Analysis Payment Method Analysis In [48]: #Which Payment Method is most commonly used? super_sales['Payment'].value_counts() #Ewallet is the most commonly used payment method Out[48]: Ewallet Cash Credit card Name: Payment, - dtype: int64 In [49]: #How do sales compare across the different payment method pay=super_sales.groupby('Payment')['gross income','Total'].sum() pays=pd.DataFrame(pay).reset_index() pays C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. pay=super_sales.groupby('Payment')['gross income','Total'].sum() Payment gross income Total 0 Cash - - 1 Credit card - - 2 Ewallet - - Out[49]: In [50]: p1=plt.bar(pays['Payment'],pays["Total"],label='TotalSales',width=0.3) p2=plt.bar(pays['Payment'],pays["gross income"],label='gross income',width=0.3) plt.legend(loc='upper left') file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 16/26 8/6/2024 Superstore_Sales Analysis plt.bar_label(p2,padding=1) plt.bar_label(p1,padding=1) Out[50]: [Text(0, 1, '112207'), Text(0, 1, '100767'), Text(0, 1, '109993')] In [51]: #Which payment generates the highest gross income pays['gross income'].max() #Cash generates the most gross income Out[51]: 5343.17 Profitablility Analysis In [52]: #What is the gross margin percentage for each product line super_sales.groupby('Product line')['gross margin percentage'].mean() Out[52]: Product line Electronic accessories- Fashion accessories- Food and beverages- Health and beauty- Home and lifestyle- Sports and travel- Name: gross margin percentage, dtype: float64 In [53]: #Which procuct line has the highest gross income #Ho does cost of goods compare across the product line cogis=super_sales.groupby('Product line')['gross income','cogs'].sum() cog_gi=pd.DataFrame(cogis).reset_index() cog_gi file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 17/26 8/6/2024 Superstore_Sales Analysis C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:3: FutureWarning: Index ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. cogis=super_sales.groupby('Product line')['gross income','cogs'].sum() Product line gross income cogs 0 Electronic accessories - - 1 Fashion accessories - - 2 Food and beverages - - 3 Health and beauty - - 4 Home and lifestyle - - 5 Sports and travel - - Out[53]: In [54]: cog_gi.plot(x='Product line',kind='barh',stacked=False,width=0.8) plt.title('GrossIncome vs Cogs') Out[54]: Text(0.5, 1.0, 'GrossIncome vs Cogs') Food and beverages have the highest gross income Sports and travel has the highest cost of goods Customer Satisfaction Analysis In [55]: #What is the average customer rating for each product line avg_cust=super_sales.groupby('Product line')['Rating'].mean() In [56]: #How do customer rating vary by branch and city file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 18/26 8/6/2024 Superstore_Sales Analysis cb_rating=super_sales.groupby(['Branch','City'])['Rating'].mean() cb_rate=pd.DataFrame(cb_rating).reset_index() cb_rate Branch City Rating 0 A Yangon - 1 B Mandalay - 2 C Naypyitaw - Out[56]: In [57]: cbr=plt.bar(cb_rate['Branch'],cb_rate['Rating'],color=['blue','orange','green']) plt.bar_label(cbr,padding=1) Out[57]: [Text(0, 1, '7.02706'), Text(0, 1, '6.81807'), Text(0, 1, '7.07287')] In [58]: #Are there any noticeable trends in customer rating over time rate_tot=super_sales.groupby('Month')['Rating'].sum() rate_ovt=pd.DataFrame(rate_tot).reindex(['January','February','March','April','May','J rate_ovt file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 19/26 8/6/2024 Superstore_Sales Analysis Month Rating 0 January 2470.2 1 February 2142.6 2 March 2359.9 3 April NaN 4 May NaN 5 June NaN 6 July NaN 7 August NaN 8 September NaN 9 October NaN 10 November NaN 11 December NaN Out[58]: In [59]: plt.plot(rate_ovt['Month'],rate_ovt['Rating']) plt.title('Customer Rating Over Time') Out[59]: Text(0.5, 1.0, 'Customer Rating Over Time') Segmentation Analysis file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 20/26 8/6/2024 Superstore_Sales Analysis In [60]: #Segment customers based on their purchase behavior and identify key customer segments super_sales.head() Out[60]: Invoice ID Branch City Customer type Gender Product line Unit price Quantity Tax 5% Total D 0 - A Yangon Member Female Health and beauty 74.69 7 26.1415 - 20 01 1 - C Naypyitaw Normal Female Electronic accessories 15.28 5 3.8200 80.2200 20 03 2 - A Yangon Normal Male Home and lifestyle 46.33 7 16.2155 - 20 03 3 - A Yangon Member Male Health and beauty 58.22 8 23.2880 - 20 01 4 - A Yangon Normal Male Sports and travel 86.31 7 30.2085 - 20 02 In [61]: super_sales.loc[super_sales['Total']>=500,'Customer Segment']='High' super_sales.loc[super_sales['Total']<500,'Customer Segment']='Low' In [62]: high_value=super_sales.loc[super_sales['Customer Segment']=='High',['Customer type','G high_value Customer type Gender Product line Total gross income 0 Member Female Health and beauty - 26.1415 4 Normal Male Sports and travel - 30.2085 5 Normal Male Electronic accessories - 29.8865 7 Normal Female Home and lifestyle - 36.7800 14 Normal Female Health and beauty - 35.6900 ... ... ... ... ... ... 988 Member Male Electronic accessories - 41.1700 989 Member Male Health and beauty - 30.1480 991 Normal Female Sports and travel - 38.3000 996 Normal Female Home and lifestyle - 48.6900 999 Member Female Fashion accessories - 30.9190 Out[62]: 227 rows × 5 columns In [63]: Low_value=super_sales.loc[super_sales['Customer Segment']=='Low',['Customer type','Gen file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 21/26 8/6/2024 Superstore_Sales Analysis Low_value Customer type Gender Product line Total gross income 1 Normal Female Electronic accessories 80.2200 3.8200 2 Normal Male Home and lifestyle - 16.2155 3 Member Male Health and beauty - 23.2880 6 Member Female Electronic accessories - 20.6520 8 Member Female Health and beauty 76.1460 3.6260 ... ... ... ... ... ... 993 Normal Male Fashion accessories - 8.7450 994 Member Female Electronic accessories 63.9975 3.0475 995 Normal Male Health and beauty 42.3675 2.0175 997 Member Male Food and beverages 33.4320 1.5920 998 Normal Male Home and lifestyle 69.1110 3.2910 Out[63]: 773 rows × 5 columns #What are the characteristics of high value customer Total of 227 customer High value customers have more members than regular customer type(119/108) They are dominated by more female customers (121/106) They made more sales in the product line Sports and travel Made total sales revenue of 159.555K Made total gross income of 7.6K #What are the characteristics of Low value customer Total of 773 customer Low value customers have more regular than member customer type(391/382) They are dominated by more male customers (393/380) They made more sales in the product line Fashion accesories Made total sales revenue of 163.412K Made total gross income of 7.7K In [64]: #How does the purchasing behavior of high value customers differ from low value custom high=high_value.groupby('Customer type')['Total'].sum() highh=pd.DataFrame(high).reset_index() low=Low_value.groupby('Customer type')['Total'].sum() loww=pd.DataFrame(low).reset_index() In [65]: loww_v=loww['Total'].values.reshape(-1) highh_v=highh['Total'].values.reshape(-1) In [66]: loww Customer type Total 0 Member - 1 Normal - Out[66]: In [67]: v_values=['Member','Normal'] v_index=np.arange(len(v_values)) v_width=0.5 low1=plt.bar(v_index-0.2,loww_v,v_width,label="Low") high1=plt.bar(v_index+0.2,highh_v,v_width,label="High") plt.xticks(v_index,v_values) file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 22/26 8/6/2024 Superstore_Sales Analysis plt.legend() plt.title('High vs Low customer rating') plt.bar_label(low1,padding=1) plt.bar_label(high1,padding=1) Out[67]: [Text(0, 1, '83328.3'), Text(0, 1, '76226.3')] Geographical Analysis In [68]: #How do sales figure compare across the different cities? citie=super_sales.groupby('City')['Total'].sum() cities=pd.DataFrame(citie).reset_index() city_c=plt.bar(cities['City'],cities['Total'],color=['blue','orange','green']) plt.title('Total Sales by City') plt.bar_label(city_c,padding=1) Out[68]: [Text(0, 1, '106198'), Text(0, 1, '110569'), Text(0, 1, '106200')] file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 23/26 8/6/2024 Superstore_Sales Analysis In [69]: #Which city has the highest average unit price and quantity sold super_sales.groupby('City')['Unit price','Quantity'].mean().sort_values('Unit price',a C:\Users\ty\AppData\Local\Temp\ipykernel_123664\-.py:2: FutureWarning: Index ing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. super_sales.groupby('City')['Unit price','Quantity'].mean().sort_values('Unit pric e',ascending=False) Unit price Quantity Naypyitaw - - Mandalay - - Yangon - - Out[69]: City In [70]: #Are there any significant differences in customer rating across the cities super_sales.groupby('City')['Rating'].mean() Out[70]: City Mandalay- Naypyitaw- Yangon- Name: Rating, dtype: float64 Advanced analysis In [71]: import scipy.stats file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 24/26 8/6/2024 Superstore_Sales Analysis from scipy.stats import pearsonr In [72]: #Is there a correaltion between unit price and quantity sold corr,p_value=pearsonr(super_sales['Unit price'],super_sales['Quantity']) print(f'Pearson correlation coefficient:{corr}') print(f'p-value:{p_value}') Pearson correlation coefficient:- p-value:- In [73]: plt.scatter(super_sales['Unit price'],super_sales['Quantity']) plt.xlabel('Unit') plt.ylabel('Quantity') plt.title('Unit vs Quantity sold Correlation') Out[73]: Text(0.5, 1.0, 'Unit vs Quantity sold Correlation') In [74]: #How does the total sales amount correlate with customer rating corr,p_value=pearsonr(super_sales['Total'],super_sales['Rating']) print(f'Pearson correlation coefficient:{corr}') print(f'p-value:{p_value}') Pearson correlation coefficient:- p-value:- In [75]: plt.scatter(super_sales['Total'],super_sales['Rating']) plt.xlabel('Total Sales') plt.ylabel('Rating') plt.title('Total sales vs Rating Correlation') Out[75]: Text(0.5, 1.0, 'Total sales vs Rating Correlation') file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 25/26 8/6/2024 Superstore_Sales Analysis Crossselling; Low value customers made more revenue in sales even though they bought more less costly items, we can do more of cross selling to this segment with value offers Upselling; High value customers have been noticed to buy the more costly things it can be assumed that they place more value on the cost of goods. We can upsell to this segment, suggesting more of new high quality products In [ ]: file:///C:/Users/ty/Downloads/Superstore_Sales Analysis (1).html 26/26